diff --git a/drivers/cpp/models/cuda-driver.cu b/drivers/cpp/models/cuda-driver.cu index 4fb7233..27fb64e 100644 --- a/drivers/cpp/models/cuda-driver.cu +++ b/drivers/cpp/models/cuda-driver.cu @@ -83,6 +83,14 @@ int main(int argc, char **argv) { /* initialize */ Context *ctx = init(); + /* validate */ + const bool isValid = validate(ctx); + printf("Validation: %s\n", isValid ? "PASS" : "FAIL"); + if (!isValid) { + destroy(ctx); + return 0; + } + /* benchmark */ double totalTime = 0.0; CudaTimer timer; @@ -110,10 +118,6 @@ int main(int argc, char **argv) { } printf("BestSequential: %.*f\n", DBL_DIG-1, totalTime / NITER); - /* validate */ - const bool isValid = validate(ctx); - printf("Validation: %s\n", isValid ? "PASS" : "FAIL"); - /* cleanup */ destroy(ctx); diff --git a/drivers/cpp/models/hip-driver.cu b/drivers/cpp/models/hip-driver.cu index 0da73ea..e09042a 100644 --- a/drivers/cpp/models/hip-driver.cu +++ b/drivers/cpp/models/hip-driver.cu @@ -83,6 +83,14 @@ int main(int argc, char **argv) { /* initialize */ Context *ctx = init(); + /* validate */ + const bool isValid = validate(ctx); + printf("Validation: %s\n", isValid ? "PASS" : "FAIL"); + if (!isValid) { + destroy(ctx); + return 0; + } + /* benchmark */ double totalTime = 0.0; HipTimer timer; @@ -110,10 +118,6 @@ int main(int argc, char **argv) { } printf("BestSequential: %.*f\n", DBL_DIG-1, totalTime / NITER); - /* validate */ - const bool isValid = validate(ctx); - printf("Validation: %s\n", isValid ? "PASS" : "FAIL"); - /* cleanup */ destroy(ctx); diff --git a/drivers/cpp/models/kokkos-driver.cc b/drivers/cpp/models/kokkos-driver.cc index 1d881f3..85ed295 100644 --- a/drivers/cpp/models/kokkos-driver.cc +++ b/drivers/cpp/models/kokkos-driver.cc @@ -44,6 +44,15 @@ int main(int argc, char **argv) { /* initialize */ Context *ctx = init(); + /* validate */ + const bool isValid = validate(ctx); + printf("Validation: %s\n", isValid ? "PASS" : "FAIL"); + if (!isValid) { + destroy(ctx); + Kokkos::finalize(); + return 0; + } + /* benchmark */ double totalTime = 0.0; Kokkos::Timer timer; @@ -67,10 +76,6 @@ int main(int argc, char **argv) { } printf("BestSequential: %.*f\n", DBL_DIG-1, totalTime / NITER); - /* validate */ - const bool isValid = validate(ctx); - printf("Validation: %s\n", isValid ? "PASS" : "FAIL"); - /* cleanup */ destroy(ctx); diff --git a/drivers/cpp/models/mpi-driver.cc b/drivers/cpp/models/mpi-driver.cc index e242dba..441f0ff 100644 --- a/drivers/cpp/models/mpi-driver.cc +++ b/drivers/cpp/models/mpi-driver.cc @@ -52,6 +52,18 @@ int main(int argc, char **argv) { /* initialize */ Context *ctx = init(); + /* validate */ + const bool isValid = validate(ctx); + if (rank == 0) { + printf("Validation: %s\n", isValid ? "PASS" : "FAIL"); + + if (!isValid) { + destroy(ctx); + MPI_Abort(MPI_COMM_WORLD, 0); + return 0; + } + } + /* benchmark */ double totalTime = 0.0; for (int i = 0; i < NITER; i += 1) { @@ -85,12 +97,6 @@ int main(int argc, char **argv) { } MPI_Barrier(MPI_COMM_WORLD); - /* validate */ - const bool isValid = validate(ctx); - if (rank == 0) { - printf("Validation: %s\n", isValid ? 
"PASS" : "FAIL"); - } - /* cleanup */ destroy(ctx); diff --git a/drivers/cpp/models/mpi-omp-driver.cc b/drivers/cpp/models/mpi-omp-driver.cc index e20558d..7ad24a1 100644 --- a/drivers/cpp/models/mpi-omp-driver.cc +++ b/drivers/cpp/models/mpi-omp-driver.cc @@ -55,6 +55,18 @@ int main(int argc, char **argv) { /* initialize */ Context *ctx = init(); + /* validate */ + const bool isValid = validate(ctx); + if (rank == 0) { + printf("Validation: %s\n", isValid ? "PASS" : "FAIL"); + + if (!isValid) { + destroy(ctx); + MPI_Abort(MPI_COMM_WORLD, 0); + return 0; + } + } + /* benchmark */ double totalTime = 0.0; for (int i = 0; i < NITER; i += 1) { @@ -88,12 +100,6 @@ int main(int argc, char **argv) { } MPI_Barrier(MPI_COMM_WORLD); - /* validate */ - const bool isValid = validate(ctx); - if (rank == 0) { - printf("Validation: %s\n", isValid ? "PASS" : "FAIL"); - } - /* cleanup */ destroy(ctx); diff --git a/drivers/cpp/models/omp-driver.cc b/drivers/cpp/models/omp-driver.cc index 2896cf3..e5d0041 100644 --- a/drivers/cpp/models/omp-driver.cc +++ b/drivers/cpp/models/omp-driver.cc @@ -47,6 +47,14 @@ int main(int argc, char **argv) { /* initialize */ Context *ctx = init(); + /* validate */ + const bool isValid = validate(ctx); + printf("Validation: %s\n", isValid ? "PASS" : "FAIL"); + if (!isValid) { + destroy(ctx); + return 0; + } + /* benchmark */ double totalTime = 0.0; for (int i = 0; i < NITER; i += 1) { @@ -69,10 +77,6 @@ int main(int argc, char **argv) { } printf("BestSequential: %.*f\n", DBL_DIG-1, totalTime / NITER); - /* validate */ - const bool isValid = validate(ctx); - printf("Validation: %s\n", isValid ? "PASS" : "FAIL"); - /* cleanup */ destroy(ctx); diff --git a/drivers/cpp/models/serial-driver.cc b/drivers/cpp/models/serial-driver.cc index fda6dcd..80b5ff6 100644 --- a/drivers/cpp/models/serial-driver.cc +++ b/drivers/cpp/models/serial-driver.cc @@ -44,6 +44,14 @@ int main(int argc, char **argv) { /* initialize */ Context *ctx = init(); + /* validate */ + const bool isValid = validate(ctx); + printf("Validation: %s\n", isValid ? "PASS" : "FAIL"); + if (!isValid) { + destroy(ctx); + return 0; + } + /* benchmark */ double totalTime = 0.0; for (int i = 0; i < NITER; i += 1) { @@ -68,10 +76,6 @@ int main(int argc, char **argv) { } printf("BestSequential: %.*f\n", DBL_DIG-1, totalTime / NITER); - /* validate */ - const bool isValid = validate(ctx); - printf("Validation: %s\n", isValid ? "PASS" : "FAIL"); - /* cleanup */ destroy(ctx); diff --git a/drivers/driver_wrapper.py b/drivers/driver_wrapper.py index 2a86110..6c79611 100644 --- a/drivers/driver_wrapper.py +++ b/drivers/driver_wrapper.py @@ -113,7 +113,7 @@ def are_all_valid(self) -> bool: def best_sequential_runtime(self) -> Optional[float]: """ Return the min value for sequential runtime. 
""" if self.did_build() and self.did_any_run(): - return min(r.best_sequential_runtime for r in self.run_outputs if r.best_sequential_runtime is not None) + return min((r.best_sequential_runtime for r in self.run_outputs if r.best_sequential_runtime is not None), default=None) else: return None diff --git a/drivers/run-cuda.sbatch b/drivers/run-cuda.sbatch index 51300a1..ee777a8 100644 --- a/drivers/run-cuda.sbatch +++ b/drivers/run-cuda.sbatch @@ -4,12 +4,12 @@ #SBATCH --gpus=a100:1 #SBATCH -t 05:00:00 #SBATCH -A bhatele-lab-cmsc -#SBATCH -J cuda-codellama-34b-hf_prompted -#SBATCH -o run-outputs/codellama-34b-hf_prompted-cuda-%A.out +#SBATCH -J cuda-gemini-pro +#SBATCH -o run-outputs/gemini-pro-cuda-%A.out # settings -HASH="a8724ee8" -MODEL="codellama-34b-hf_prompted" +HASH="015cff6f" +MODEL="gemini-pro" TEMP="0.2" GENERATED_PROMPTS="../results/${HASH}/${MODEL}_temp${TEMP}/all.json" @@ -20,7 +20,7 @@ module purge ml python gcc/11.3.0 cuda/12.1.1/gcc/11.3.0/ python run-all.py \ - $OUTPUT \ + $GENERATED_PROMPTS \ --output $OUTPUT \ --scratch-dir $SCRATCH_DIR \ --launch-configs launch-configs.json \ @@ -28,6 +28,4 @@ python run-all.py \ --yes-to-all \ --include-models cuda \ --early-exit-runs \ - --problem-type geometry \ - --overwrite \ --log info \ No newline at end of file diff --git a/drivers/run-hip.sbatch b/drivers/run-hip.sbatch index 1379a63..6057072 100644 --- a/drivers/run-hip.sbatch +++ b/drivers/run-hip.sbatch @@ -1,12 +1,12 @@ #!/bin/bash #SBATCH -N 1 -#SBATCH -t 05:00:00 -#SBATCH -J hip-codellama-34b-hf_prompted -#SBATCH -o run-outputs/codellama-34b-hf_prompted-hip-{{id}}.out +#SBATCH -t 08:00:00 +#SBATCH -J hip-gemini-pro +#SBATCH -o run-outputs/gemini-pro-hip-{{id}}.out # settings -HASH="a8724ee8" -MODEL="codellama-34b-hf_prompted" +HASH="015cff6f" +MODEL="gemini-pro" TEMP="0.2" GENERATED_PROMPTS="../results/${HASH}/${MODEL}_temp${TEMP}/all.json" @@ -17,7 +17,7 @@ module purge ml python rocm/5.7.0 flux_wrappers/0.1 python run-all.py \ - $OUTPUT \ + $GENERATED_PROMPTS \ --output $OUTPUT \ --scratch-dir $SCRATCH_DIR \ --launch-configs launch-configs.json \ @@ -25,6 +25,4 @@ python run-all.py \ --yes-to-all \ --include-models hip \ --early-exit-runs \ - --problem-type geometry \ - --overwrite \ --log info \ No newline at end of file diff --git a/drivers/run-kokkos.sbatch b/drivers/run-kokkos.sbatch index e26f06f..ce001e0 100644 --- a/drivers/run-kokkos.sbatch +++ b/drivers/run-kokkos.sbatch @@ -2,14 +2,14 @@ #SBATCH -N 1 #SBATCH --exclusive #SBATCH -p serial -#SBATCH -t 08:00:00 +#SBATCH -t 05:00:00 #SBATCH -A bhatele-lab-cmsc -#SBATCH -J kokkos-codellama-34b-hf_prompted -#SBATCH -o run-outputs/codellama-34b-hf_prompted-kokkos-%A.out +#SBATCH -J kokkos-gemini-pro +#SBATCH -o run-outputs/gemini-pro-kokkos-%A.out # settings -HASH="a8724ee8" -MODEL="codellama-34b-hf_prompted" +HASH="015cff6f" +MODEL="gemini-pro" TEMP="0.2" GENERATED_PROMPTS="../results/${HASH}/${MODEL}_temp${TEMP}/all.json" @@ -22,7 +22,7 @@ export OMP_PROC_BIND=spread export OMP_PLACES=cores python run-all.py \ - $OUTPUT \ + $GENERATED_PROMPTS \ --output $OUTPUT \ --scratch-dir $SCRATCH_DIR \ --launch-configs launch-configs.json \ @@ -30,6 +30,4 @@ python run-all.py \ --yes-to-all \ --include-models kokkos \ --early-exit-runs \ - --problem-type geometry \ - --overwrite \ --log info \ No newline at end of file diff --git a/drivers/run-mpi+omp.sbatch b/drivers/run-mpi+omp.sbatch index f8f2f1d..9565de7 100644 --- a/drivers/run-mpi+omp.sbatch +++ b/drivers/run-mpi+omp.sbatch @@ -1,14 +1,14 @@ #!/bin/bash #SBATCH -N 4 
#SBATCH --exclusive -#SBATCH -t 06:00:00 +#SBATCH -t 05:00:00 #SBATCH -A bhatele-lab-cmsc -#SBATCH -J mpi+omp-codellama-34b-hf_prompted -#SBATCH -o run-outputs/codellama-34b-hf_prompted-mpi+omp-%A.out +#SBATCH -J mpi+omp-gemini-pro +#SBATCH -o run-outputs/gemini-pro-mpi+omp-%A.out # settings -HASH="a8724ee8" -MODEL="codellama-34b-hf_prompted" +HASH="015cff6f" +MODEL="gemini-pro" TEMP="0.2" GENERATED_PROMPTS="../results/${HASH}/${MODEL}_temp${TEMP}/all.json" @@ -22,7 +22,7 @@ export OMP_PLACES=cores export OMPI_MCA_opal_warn_on_missing_libcuda=0 python run-all.py \ - $OUTPUT \ + $GENERATED_PROMPTS \ --output $OUTPUT \ --scratch-dir $SCRATCH_DIR \ --launch-configs launch-configs.json \ @@ -30,7 +30,5 @@ python run-all.py \ --yes-to-all \ --include-models mpi+omp \ --early-exit-runs \ - --problem-type geometry \ - --overwrite \ --run-timeout 60 \ --log info \ No newline at end of file diff --git a/drivers/run-mpi.sbatch b/drivers/run-mpi.sbatch index 91a9f1b..de4b5e8 100644 --- a/drivers/run-mpi.sbatch +++ b/drivers/run-mpi.sbatch @@ -1,13 +1,13 @@ #!/bin/bash #SBATCH -n 512 -#SBATCH -t 08:00:00 +#SBATCH -t 04:00:00 #SBATCH -A bhatele-lab-cmsc -#SBATCH -J mpi-codellama-34b-hf_prompted -#SBATCH -o run-outputs/codellama-34b-hf_prompted-mpi-%A.out +#SBATCH -J mpi-gemini-pro +#SBATCH -o run-outputs/gemini-pro-mpi-%A.out # settings -HASH="a8724ee8" -MODEL="codellama-34b-hf_prompted" +HASH="015cff6f" +MODEL="gemini-pro" TEMP="0.2" GENERATED_PROMPTS="../results/${HASH}/${MODEL}_temp${TEMP}/all.json" @@ -27,7 +27,5 @@ python run-all.py \ --yes-to-all \ --include-models mpi \ --early-exit-runs \ - --problem-type geometry \ - --overwrite \ --run-timeout 60 \ --log info \ No newline at end of file diff --git a/drivers/run-omp.sbatch b/drivers/run-omp.sbatch index 703c9b2..50b28f7 100644 --- a/drivers/run-omp.sbatch +++ b/drivers/run-omp.sbatch @@ -2,14 +2,14 @@ #SBATCH -N 1 #SBATCH --exclusive #SBATCH -p serial -#SBATCH -t 08:00:00 +#SBATCH -t 05:00:00 #SBATCH -A bhatele-lab-cmsc -#SBATCH -J omp-phind-v2_prompted -#SBATCH -o run-outputs/phind-v2_prompted-omp-%A.out +#SBATCH -J omp-gemini-pro +#SBATCH -o run-outputs/gemini-pro-omp-%A.out # settings -HASH="a8724ee8" -MODEL="phind-v2_prompted" +HASH="015cff6f" +MODEL="gemini-pro" TEMP="0.2" GENERATED_PROMPTS="../results/${HASH}/${MODEL}_temp${TEMP}/all.json" @@ -22,7 +22,7 @@ export OMP_PROC_BIND=spread export OMP_PLACES=cores python run-all.py \ - $OUTPUT \ + $GENERATED_PROMPTS \ --output $OUTPUT \ --scratch-dir $SCRATCH_DIR \ --launch-configs launch-configs.json \ @@ -30,6 +30,4 @@ python run-all.py \ --yes-to-all \ --include-models omp \ --early-exit-runs \ - --problem-type geometry \ - --overwrite \ - --log debug \ No newline at end of file + --log info \ No newline at end of file diff --git a/drivers/run-serial.sbatch b/drivers/run-serial.sbatch index 9d8d2ec..a4e83ff 100644 --- a/drivers/run-serial.sbatch +++ b/drivers/run-serial.sbatch @@ -2,14 +2,14 @@ #SBATCH -N 1 #SBATCH --exclusive #SBATCH -p serial -#SBATCH -t 05:00:00 +#SBATCH -t 04:00:00 #SBATCH -A bhatele-lab-cmsc -#SBATCH -J serial-phind-v2_prompted -#SBATCH -o run-outputs/phind-v2_prompted-serial-%A.out +#SBATCH -J serial-gemini-pro +#SBATCH -o run-outputs/gemini-pro-serial-%A.out # settings -HASH="a8724ee8" -MODEL="phind-v2_prompted" +HASH="015cff6f" +MODEL="gemini-pro" TEMP="0.2" GENERATED_PROMPTS="../results/${HASH}/${MODEL}_temp${TEMP}/all.json" diff --git a/generate/generate-gemini.py b/generate/generate-gemini.py new file mode 100644 index 0000000..27e80f0 --- /dev/null +++ 
b/generate/generate-gemini.py @@ -0,0 +1,246 @@ +""" Get the model outputs from Google's AI api. + author: Daniel Nichols + date: February 2024 +""" +# std imports +from argparse import ArgumentParser +import json +import os +import re +import time +from typing import Optional + +# tpl imports +from alive_progress import alive_bar +import google.generativeai as genai + +""" Prompt template: """ +SYSTEM_TEMPLATE = """You are a helpful coding assistant. +You are helping a programmer write a C++ function. Write the body of the function and put it in a markdown code block. +Do not write any other code or explanations. +""" + +PROMPT_TEMPLATE = """Complete the C++ function {function_name}. Only write the body of the function {function_name}. + +```cpp +{prompt} +``` +""" + + +def get_args(): + parser = ArgumentParser(description=__doc__) + parser.add_argument("-m", "--model", choices=["gemini-1.0-pro"], required=True, help="The model to use.") + parser.add_argument("-p", "--prompts", type=str, required=True, help="Path to prompts json") + parser.add_argument("-o", "--output", type=str, required=True, help="Path to output json") + parser.add_argument("--api-key", type=str, help="Google AI API key. " + + "If not provided, then uses environment variable GOOGLE_API_KEY.") + parser.add_argument("--max-requests", type=int, help="If provided, then only makes this many requests.") + parser.add_argument("--max-tokens-per-second", help="Limit the rate of token generation.") + parser.add_argument("--max-requests-per-second", help="Limit the rate of request generation.") + parser.add_argument("--dry", action="store_true", help="If provided, then don't make any requests.") + parser.add_argument("--overwrite", action="store_true", help="If provided, then overwrite outputs already in file.") + parser.add_argument("--temperature", type=float, default=0.2, help="The temperature to use for sampling.") + parser.add_argument("--top-p", type=float, default=0.95, help="The top p to use for sampling.") + parser.add_argument("--max-new-tokens", type=int, default=1024, help="The maximum number of tokens to generate.") + parser.add_argument("--num-samples-per-prompt", type=int, default=20, help="The number of samples to generate " + + "per prompt.") + return parser.parse_args() + + +def get_env_var(name: str) -> str: + """ Get an environment variable. 
""" + if name not in os.environ: + raise ValueError(f"Environment variable {name} not set.") + return os.environ[name] + +GPU_FUNCTION_NAME_PATTERN = re.compile(r"__global__ void ([a-zA-Z0-9_]+)\(") +CPU_FUNCTION_NAME_PATTERN = re.compile(r"\s*[a-zA-Z_]+ ([a-zA-Z0-9_]+)\(") +def get_function_name(prompt: str, execution_model: str) -> str: + if execution_model in ['cuda', 'hip']: + match = GPU_FUNCTION_NAME_PATTERN.match(prompt.splitlines()[-1]) + else: + match = CPU_FUNCTION_NAME_PATTERN.match(prompt.splitlines()[-1]) + if match is None: + raise ValueError(f"Could not find function name in prompt: {prompt}") + return match.group(1) + +def get_max_tokens_per_second(model: str) -> Optional[int]: + """ rates limites as of January 2024 """ + if model == "gemini-1.0-pro": + tokens_per_minute = 2048 * 60 + return tokens_per_minute / 60 + else: + return None + +def get_max_requests_per_second(model: str) -> Optional[int]: + """ rates limites as of January 2024 """ + if model == "gemini-1.0-pro": + requests_per_minute = 60 + return requests_per_minute / 60 + else: + return None + +def get_max_requests_per_day(model: str) -> Optional[int]: + """ rates limites as of January 2024 """ + if model == "gemini-1.0-pro": + return 60 * 60 * 24 + else: + return None + +def postprocess(prompt: str, output: str) -> str: + """ Postprocess the output. """ + # remove leading ```, ```cpp, and trailing ``` + output = output.strip().removeprefix("```cpp").removeprefix("```").removesuffix("```") + + # remove prompt if it included it + if output.startswith(prompt): + output = output[len(prompt):] + + return output + +def main(): + args = get_args() + + # get the prompts + with open(args.prompts, 'r') as prompts_json: + prompts = json.load(prompts_json) + + # read in outputs + if not args.overwrite and os.path.exists(args.output): + with open(args.output, 'r') as output_json: + outputs = json.load(output_json) + + # copy existing outputs into prompts + copy_count = 0 + for prompt in prompts: + for o in outputs: + if o["prompt"] == prompt["prompt"] and \ + o["name"] == prompt["name"] and \ + o["parallelism_model"] == prompt["parallelism_model"] and \ + "outputs" in o and \ + len(o["outputs"]) == args.num_samples_per_prompt and \ + o["temperature"] == args.temperature and \ + o["top_p"] == args.top_p: + for col in ["temperature", "top_p", "do_sample", "max_new_tokens", "outputs"]: + prompt[col] = o[col] + copy_count += 1 + break + print(f"Copied {copy_count} existing outputs.") + + # get the keys + api_key = args.api_key or get_env_var("GOOGLE_API_KEY") + genai.configure(api_key=api_key) + + # create the client + config = genai.types.GenerationConfig( + candidate_count=1, + max_output_tokens=args.max_new_tokens, + temperature=args.temperature, + top_p=args.top_p + ) + safety_settings = [ + { + "category": "HARM_CATEGORY_DANGEROUS", + "threshold": "BLOCK_NONE", + }, + { + "category": "HARM_CATEGORY_HARASSMENT", + "threshold": "BLOCK_NONE", + }, + { + "category": "HARM_CATEGORY_HATE_SPEECH", + "threshold": "BLOCK_NONE", + }, + { + "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", + "threshold": "BLOCK_NONE", + }, + { + "category": "HARM_CATEGORY_DANGEROUS_CONTENT", + "threshold": "BLOCK_NONE", + }, + ] + model = genai.GenerativeModel(args.model, generation_config=config, safety_settings=safety_settings) + + # generation metadata + MAX_TOKENS_PER_SECOND = args.max_tokens_per_second or get_max_tokens_per_second(args.model) + MAX_REQUESTS_PER_SECOND = args.max_requests_per_second or get_max_requests_per_second(args.model) + 
MAX_REQUESTS = args.max_requests or get_max_requests_per_day(args.model) + + # generate outputs + request_counter = 0 + request_rate_counter = 0 + request_timer = time.time() + with alive_bar(len(prompts), title="Generating outputs", dual_line=True) as bar: + for prompt in prompts: + # see if we can skip this + if not args.overwrite and "outputs" in prompt: + bar(skipped=True) + continue + + # get the prompt + original_prompt = prompt["prompt"] + function_name = get_function_name(original_prompt, prompt["parallelism_model"]) + prompt_text = PROMPT_TEMPLATE.format(prompt=original_prompt, function_name=function_name) + + # generate the outputs + if args.dry: + print("system", SYSTEM_TEMPLATE) + print("prompt", prompt_text) + continue + + # set metadata + prompt["temperature"] = args.temperature + prompt["top_p"] = args.top_p + prompt["do_sample"] = True + prompt["max_new_tokens"] = args.max_new_tokens + + # generate the outputs + completions = [] + while len(completions) < args.num_samples_per_prompt: + completion = model.generate_content(SYSTEM_TEMPLATE + "\n" + prompt_text) + if completion.candidates[0].finish_reason == 1: # STOP + completions.append(completion) + bar.text(f"~> Received output {len(completions)} of {args.num_samples_per_prompt}.") + else: + print(f"Got a completion with finish_reason={completion.candidates[0].finish_reason}.") + time.sleep(5) + + outputs = [c.text for c in completions] + outputs = [postprocess(original_prompt, o) for o in outputs] + prompt["outputs"] = outputs + bar() + + # update counters + request_counter += 1 + request_rate_counter += 1 + + # check if we should stop + if MAX_REQUESTS is not None and request_counter >= MAX_REQUESTS: + print(f"Stopping after {request_counter} requests.") + break + + # check if we should sleep + requests_per_second = request_rate_counter / (time.time() - request_timer) + if MAX_REQUESTS_PER_SECOND is not None and requests_per_second > (MAX_REQUESTS_PER_SECOND*0.95): + sleep_time = 5 + print(f"Sleeping for {sleep_time} seconds.") + time.sleep(sleep_time) + request_timer = time.time() + request_rate_counter = 0 + + # write intermediate outputs + with open(args.output, 'w') as output_json: + json.dump(prompts, output_json, indent=2) + + # summary stats + print(f"Submitted {request_counter} requests.") + + # write outputs + with open(args.output, 'w') as output_json: + json.dump(prompts, output_json, indent=2) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/outputs/output_015cff6f_gemini-pro.json b/outputs/output_015cff6f_gemini-pro.json new file mode 100644 index 0000000..c75989d --- /dev/null +++ b/outputs/output_015cff6f_gemini-pro.json @@ -0,0 +1,13862 @@ +[ + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); j++) {\n for (int k = j+1; k < points.extent(0); k++) {\n double area = triangleArea(points(i), points(j), points(k));\n minArea = std::min(minArea, area);\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy>(\n 0, points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const Kokkos::IndexType i, double &minArea) {\n for (Kokkos::IndexType j = i + 1; j < points.extent(0); ++j) {\n for (Kokkos::IndexType k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n },\n minArea);\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "0.0)", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n double area = triangleArea(points(i), points(j), points(k));\n minArea = std::min(minArea, area);\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(\"smallestArea\", points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); j++) {\n for (int k = j+1; k < points.extent(0); k++) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using exec_space = Kokkos::DefaultExecutionSpace;\n using view_type = Kokkos::View;\n\n const int num_points = points.extent(0);\n view_type areas(\"areas\", num_points * num_points * num_points);\n\n parallel_for(policy_type(0, num_points * num_points * num_points), KOKKOS_LAMBDA(const int i) {\n const int i1 = i / (num_points * num_points);\n const int i2 = (i / num_points) % num_points;\n const int i3 = i % num_points;\n if (i1 != i2 && i1 != i3 && i2 != i3) {\n areas(i) = triangleArea(points(i1), points(i2), points(i3));\n } else {\n areas(i) = std::numeric_limits::max();\n }\n });\n\n return *min_element(areas.data(), areas.data() + areas.extent(0));\n}\n", + "\n\n double area = 
0;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return area;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(\"smallestArea\", policy, KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n \"smallestArea\",\n policy_type(0, points.extent(0) * points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const int& i) {\n const int i1 = i / (points.extent(0) * points.extent(0));\n const int i2 = (i / points.extent(0)) % points.extent(0);\n const int i3 = i % points.extent(0);\n if (i1 != i2 && i2 != i3 && i1 != i3) {\n minArea = std::min(minArea, triangleArea(points(i1), points(i2), points(i3)));\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n policy_type(0, points.extent(0) - 2),\n KOKKOS_LAMBDA(const int i) {\n for (int j = i + 1; j < points.extent(0) - 1; ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(\"smallestArea\", points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n 
double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const int& i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n \"smallestArea\",\n policy_type(0, points.extent(0) * points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const int i) {\n const int i1 = i / (points.extent(0) * points.extent(0));\n const int i2 = (i / points.extent(0)) % points.extent(0);\n const int i3 = i % points.extent(0);\n if (i1 != i2 && i1 != i3 && i2 != i3) {\n minArea = std::min(minArea, triangleArea(points(i1), points(i2), points(i3)));\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma 
omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n#pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], 
points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double total_area = 0;\n #pragma omp parallel for reduction(+: total_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n total_area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return total_area;\n}\n", + "\n\n double area = 0;\n#pragma omp parallel for reduction(+:area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return area;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min:minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for 
(size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "<", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n if (n < 3) {\n return 0;\n }\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n minArea = std::min(minArea, 
triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double area = 0;\n for (size_t i = 0; i < points.size(); ++i) {\n area += ::Area(points[i], points[(i + 1) % points.size()], points[(i + 2) % points.size()]);\n }\n return area;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * fabs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Compute the area of the smallest triangle that can be formed by any 3 points.\n Return the result in area.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\n__global__ void smallestArea(const Point *points, size_t N, double *area) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid >= N) return;\n\n double minArea = DBL_MAX;\n\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double area = triangleArea(points[gid], points[i], points[j]);\n if (area < minArea) minArea = area;\n }\n }\n\n if (tid == 0) *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double a = triangleArea(points[i], points[j], points[k]);\n minArea = min(minArea, a);\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for 
(size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = 1e100;\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n for (int k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid >= N) {\n return;\n }\n double minArea = triangleArea(points[tid], points[(tid + 1) % N], points[(tid + 2) % N]);\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n minArea = fmin(minArea, triangleArea(points[tid], points[i], points[j]));\n }\n }\n area[tid] = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = triangleArea(points[0], points[1], points[2]);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n int tid = get_global_id(0);\n if (tid < N) {\n area[tid] = ::Area(points[tid], points[(tid + 1) % N], points[(tid + 2) % N]);\n }\n}\n", + "\n\n Point A = points[threadIdx.x];\n Point B = points[threadIdx.y];\n Point C = points[threadIdx.z];\n double t = triangleArea(A, B, C);\n atomicMin(area, t);\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Each thread computes the area of the triangle formed by itself and the next two points.\n // The smallest area is stored in the shared memory.\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n int tid = threadIdx.x;\n if (tid < N - 2) {\n double area = triangleArea(points[tid], points[tid + 1], points[tid + 2]);\n if (area 
< minArea) {\n minArea = area;\n }\n }\n\n // Reduce the shared memory to find the smallest area.\n __syncthreads();\n if (tid == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n // Return the result in area.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n __shared__ Point minA, minB, minC;\n\n double localArea = triangleArea(points[threadIdx.x], points[threadIdx.y], points[threadIdx.z]);\n if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {\n minArea = localArea;\n minA = points[threadIdx.x];\n minB = points[threadIdx.y];\n minC = points[threadIdx.z];\n }\n\n __syncthreads();\n\n if (localArea < minArea) {\n minArea = localArea;\n minA = points[threadIdx.x];\n minB = points[threadIdx.y];\n minC = points[threadIdx.z];\n }\n\n __syncthreads();\n\n if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Store the result in the output array.\n *area = minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use MPI to compute in parallel. 
Assume MPI has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the areas to find the minimum\n double minArea;\n MPI_Reduce(&areas[0], &minArea, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the minimum area from process 0\n if (rank == 0) {\n return minArea;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n minArea = std::min(minArea, area);\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int myRank, numProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numProcs);\n\n // Each process computes the area of all possible triangles\n std::vector localAreas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n localAreas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all areas to the root process\n std::vector globalAreas;\n MPI_Gather(localAreas.data(), localAreas.size(), MPI_DOUBLE,\n globalAreas.data(), localAreas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double smallestArea = std::numeric_limits::max();\n if (myRank == 0) {\n for (double area : globalAreas) {\n smallestArea = std::min(smallestArea, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&smallestArea, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return 
smallestArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all areas to the root process\n std::vector global_areas;\n MPI_Gather(local_areas.data(), local_areas.size(), MPI_DOUBLE,\n global_areas.data(), local_areas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double smallest_area = std::numeric_limits::max();\n if (rank == 0) {\n for (double area : global_areas) {\n smallest_area = std::min(smallest_area, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&smallest_area, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return smallest_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of points per process\n int points_per_process = points.size() / size;\n\n // Compute the starting and ending indices for this process\n int start = rank * points_per_process;\n int end = (rank + 1) * points_per_process;\n\n // Compute the smallest area for this process\n double min_area = std::numeric_limits::max();\n for (int i = start; i < end; ++i) {\n for (int j = i + 1; j < end; ++j) {\n for (int k = j + 1; k < end; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < min_area) {\n min_area = area;\n }\n }\n }\n }\n\n // Reduce the minimum areas from all processes\n double global_min_area;\n MPI_Allreduce(&min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN);\n\n return global_min_area;\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the areas to find the smallest one\n double smallest;\n MPI_Allreduce(areas.data(), &smallest, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return smallest;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the global minimum\n double global_min;\n 
MPI_Allreduce(local_areas.data(), &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n // Find the minimum area triangle\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Return the minimum area\n return minArea;\n}\n\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(&local_areas[0], &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area from process 0\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1; // Other processes return -1 to avoid printing multiple times\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the areas to find the smallest one\n double smallestArea;\n MPI_Allreduce(areas.data(), &smallestArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return smallestArea;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each process computes the area of the smallest triangle formed by any 3 of its points\n double min_area = std::numeric_limits::max();\n for (size_t i = my_rank; i < points.size(); i += num_procs) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n min_area = std::min(min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the minimum area across all processes\n double global_min_area;\n MPI_Allreduce(&min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all local areas into a single vector on process 0\n std::vector global_areas;\n if (rank == 0) {\n global_areas.resize(local_areas.size() * size);\n }\n MPI_Gather(local_areas.data(), local_areas.size(), MPI_DOUBLE,\n global_areas.data(), local_areas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double min_area = std::numeric_limits::max();\n if (rank == 0) {\n for (double area : global_areas) {\n min_area = std::min(min_area, 
area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&min_area, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n for (size_t i = rank; i < points.size(); i += size) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "&" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * fabs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Compute the area of the smallest triangle that can be formed by any 3 points.\n Return the result in area.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\n__global__ void smallestArea(const Point *points, size_t N, double *area) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Each thread computes the area of a triangle formed by itself and the next two points.\n // The minimum area is stored in the shared memory.\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N - 2) {\n double a = triangleArea(points[tid], points[tid + 1], points[tid + 2]);\n if (a < minArea) {\n minArea = a;\n }\n }\n __syncthreads();\n\n // The thread with the minimum area writes the result to the output pointer.\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid >= N) return;\n\n double minArea = DBL_MAX;\n\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double area = triangleArea(points[gid], points[i], points[j]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[gid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n double minArea = triangleArea(points[0], points[1], points[2]);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) 
{\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Each thread computes the area of a triangle formed by 3 points.\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[tid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n // Compute the area of the smallest triangle formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Store the result in the output array.\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n __shared__ Point minA, minB, minC;\n\n // Compute the area of all triangles\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n minA = points[i];\n minB = points[j];\n minC = points[k];\n }\n }\n }\n }\n\n // Reduce the minimum area across all threads\n __syncthreads();\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n // Return the result in area.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[tid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < minArea) {\n minArea = a;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Initialize the area to a large value\n *area = DBL_MAX;\n\n // Loop over all possible combinations of 3 points\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n // Compute the area of the triangle formed by the 3 points\n double triangle_area = triangleArea(points[i], points[j], points[k]);\n\n // Update the area if the triangle area is smaller than the current minimum\n if (triangle_area < *area) {\n *area = triangle_area;\n }\n }\n }\n }\n}\n", + "\n\n *area = DBL_MAX;\n for 
(size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < minArea) {\n minArea = a;\n }\n }\n }\n }\n *area = minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int num_points = points.size();\n int points_per_proc = num_points / num_procs;\n int start_index = my_rank * points_per_proc;\n int end_index = (my_rank + 1) * points_per_proc;\n if (my_rank == num_procs - 1) {\n end_index = num_points;\n }\n\n double local_area = 0.0;\n for (int i = start_index; i < end_index; i++) {\n for (int j = i + 1; j < num_points; j++) {\n for (int k = j + 1; k < num_points; k++) {\n local_area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n\n double global_area = 0.0;\n MPI_Reduce(&local_area, &global_area, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_area;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n // Use OpenMP to parallelize the loop over all possible triples of points\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], 
points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(&local_areas[0], &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1;\n }\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Allreduce(local_areas.data(), &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector localAreas;\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n localAreas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double globalMinArea;\n MPI_Allreduce(localAreas.data(), &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = points.size();\n double local_min = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: 
local_min)\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = points.size();\n double local_min = std::numeric_limits::max();\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n local_min = std::min(local_min, area);\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = points.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each process calculates the area of all possible triangles\n std::vector local_areas(n * (n - 1) * (n - 2) / 6);\n int count = 0;\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n local_areas[count++] = triangleArea(points[i], points[j], points[k]);\n }\n }\n }\n\n // Reduce the local areas to find the global minimum\n double global_min;\n MPI_Allreduce(local_areas.data(), &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = rank; i < points.size(); i += size) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(local_areas.data(), &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n int n = points.size();\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n minArea = std::min(minArea, area);\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN);\n return globalMinArea;\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int num_points = points.size();\n double min_area = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: min_area)\n for (int i = 0; i < num_points; i++) {\n for (int j = i + 1; j < num_points; j++) {\n for (int k = j + 1; k < num_points; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n min_area = std::min(min_area, area);\n }\n }\n }\n\n return min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &minArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "kokkos", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Create a team policy for parallel execution\n TeamPolicy policy(x.extent(0));\n\n // Define the kernel lambda\n auto kernel = KOKKOS_LAMBDA(const TeamPolicy<> &policy, int i) {\n double min_dist = std::numeric_limits::max();\n for (int j = i + 1; j < x.extent(0); ++j) {\n min_dist = std::min(min_dist, distance(x(i), x(j)));\n }\n return min_dist;\n };\n\n // Execute the kernel in parallel\n auto min_dists = parallel_reduce(policy, kernel);\n\n // Find the minimum distance among all pairs\n double min_dist = std::numeric_limits::max();\n for (int i = 0; i < min_dists.extent(0); ++i) {\n min_dist = std::min(min_dist, min_dists(i));\n }\n\n return min_dist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist[0] = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist[0]) {\n\t\t\t\tmin_dist[0] = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist[0];\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double &min_dist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist(0);\n}\n", + "\n\n\tconst int n = x.extent(0);\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(\"closestPair\", Kokkos::RangePolicy(0, n - 1), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing policy = Kok\u043a\u043e\u0441::Range\u653f\u7b56;\n\tusing view_type = Kok\u043a\u043e\u0441::View;\n\n\tview_type min_dist(\"min_dist\", std::numeric_limits::max());\n\n\tauto lambda = KOKKOS_LAMBDA(const double x1, const double x2) {\n\t\tdouble dist = ::abs(x1 - x2);\n\t\tif (dist < min_dist()) {\n\t\t\tmin_dist() = dist;\n\t\t}\n\t};\n\n\t// Iterate over all possible un-sorted pair of elements\n\t// and find the minimum distance between them.\n\t// Note that we are iterating over all the elements in the vector\n\t// multiple times. 
This is not efficient.\n\t// A more efficient approach would be to sort the vector first\n\t// and then only consider the minimum distance between each element\n\t// and its immediate neighbor.\n\t// However, sorting is not supported by Kok\u043a\u043e\u0441.\n\t// So, we are using this simple approach for now.\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < x.size(); ++j) {\n\t\t\tlambda(x[i], x[j]);\n\t\t}\n\t}\n\n\treturn min_dist();\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int i, double &min_dist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist(0);\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\tKok\u30b3\u30af::TaskGraph tg;\n\tKok\u30b3\u30af::Task task([&](Kok\u30b3\u30af::TaskContext ctx) -> double {\n\t\tdouble min = std::abs(x[0] - x[1]);\n\t\tfor (size_t i = 0; i < x.size(); i++) {\n\t\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\t\tmin = std::min(min, distance(x[i], x[j]))\n\t\t\t}\n\t\t}\n\t\treturn min;\n\t});\n\ttg.run(task);\n\treturn task.get();\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::RangePolicy;\n\tusing tag_type = Kokkos::pair;\n\n\tauto min_distance = Kokkos::parallel_reduce(\n\t\tpolicy_type(0, x.size() * x.size()),\n\t\tKOKKOS_LAMBDA(const tag_type &idx, double &min_distance) {\n\t\t\tint i = idx.first / x.size();\n\t\t\tint j = idx.first % x.size();\n\t\t\tif (i != j) {\n\t\t\t\tmin_distance = std::min(min_distance, distance(x(i), x(j)));\n\t\t\t}\n\t\t},\n\t\tmin_distance);\n\n\treturn min_distance;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in x.\n\tKokkos::View distances(\"distances\", x.size() * x.size());\n\tKokkos::parallel_for(Kokkos::RangePolicy(0, x.size() * x.size()), KOKKOS_LAMBDA(const int i) {\n\t\tdistances(i) = distance(x(i / x.size()), x(i % x.size()));\n\t});\n\n\t// Find the minimum distance.\n\tdouble minDistance = Kokkos::min_reduce(distances, Kokkos::Min());\n\n\treturn minDistance;\n}\n", + "\n\n\tKokkos::View min_dist(\"min_dist\", 1);\n\tmin_dist[0] = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) 
{\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist[0]) {\n\t\t\t\tmin_dist[0] = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist[0];\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist(0) = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tdouble d = distance(x(i), x(j));\n\t\t\tif (d < min_dist(0)) {\n\t\t\t\tmin_dist(0) = d;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist(0);\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int &i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\n\treturn minDist;\n}\n", + "\n\n\tconst int n = x.extent(0);\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tauto min_distance = Kokkos::parallel_reduce(\n\t\tx.extent(0), KOKKOS_LAMBDA(const int i, double min) {\n\t\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\t\tmin = std::min(min, distance(x(i), x(j)));\n\t\t\t}\n\t\t\treturn min;\n\t\t},\n\t\tstd::numeric_limits::max());\n\n\treturn min_distance;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist(0) = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tdouble dist = distance(x(i), x(j));\n\t\t\tif (dist < min_dist(0)) {\n\t\t\t\tmin_dist(0) = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist(0);\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "omp", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use OpenMP to compute in parallel.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor 
(int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble min_distance = std::numeric_limits::max();\n\tint n = x.size();\n\t#pragma omp parallel for reduction(min: min_distance)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_distance = std::min(min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "&", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_distance = std::abs(x[0] - x[1]);\n#pragma openmp parallel for reduction(min:min_distance)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_distance = std::min(min_distance, distance(x[i], x[j]);\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n#pragma omp parallel for reduction(min: min_dist)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = 
std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n - 1; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "serial", + "prompt": "double distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 1; i < x.size(); ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i - 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn -1;\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = distance(x[0], x[1]);\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn 
std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tminDist = std::min(minDist, distance(x[i], x[i + 1]));\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = distance(x[0], x[1]);\n\tfor (size_t i = 1; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn -1;\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble minDist = distance(x[0], x[1]);\n\tfor (int i = 1; i < x.size() - 1; ++i) {\n\t\tminDist = std::min(minDist, distance(x[i], x[i + 1]));\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 1; i < x.size(); ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i - 1]));\n\t}\n\treturn min_dist;\n}\n", + "(", + "\n\n double min_dist = std::numeric_limits::max();\n for (size_t i = 0; i < 
x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n min_dist = std::min(min_dist, distance(x[i], x[j]));\n }\n }\n return min_dist;\n}\n", + "\n\n double min_distance = std::numeric_limits::max();\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n min_distance = std::min(min_distance, distance(x[i], x[j]));\n }\n }\n return min_distance;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "hip", + "prompt": "__device__ double distanceBetweenPoints(double x1, double x2) {\n\treturn fabs(x1 - x2);\n}\n\n/* Compute the distance between the closest two elements in the vector x.\n Store the result in distance.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\n__global__ void closestPair(const double *x, size_t N, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tdouble minDist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tfor (int i = tid + 2; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\tdistance[tid] = minDist;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= N) {\n\t\treturn;\n\t}\n\n\tdouble minDist = distanceBetweenPoints(x[gid], x[gid + 1]);\n\tfor (int i = gid + 2; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[gid], x[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "losest", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tdouble minDistance = DBL_MAX;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDistance) {\n\t\t\t\tminDistance = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDistance;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint grid_size = gridDim.x;\n\tint i = bid * bid_size + tid;\n\tif (i >= N) {\n\t\treturn;\n\t}\n\tdouble min_dist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t}\n\tdistance[tid] = 
min_dist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\nfor (size_t i = 0; i < N; ++i) {\n\t\tdistance[i] = distance(x[i], x[i + 1]);\n\t}\n}\n", + "\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint bid_size = hipBlockDim_x;\n\n\tdouble min_dist = DBL_MAX;\n\n\tfor (int i = bid * bid_size + tid; i < N; i += bid_size) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tif (tid == 0 || dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble min_dist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = min_dist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "losest", + "\n\n\tint tid = threadIdx.x;\n\tint blockDim = blockDim.x;\n\tint blockIdx = blockIdx.x;\n\tint i = blockIdx * blockDim + tid;\n\n\tdouble min_dist = DBL_MAX;\n\twhile (i < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[i + 1]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t\ti += blockDim * gridDim.x;\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble min_dist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = min_dist;\n}\n" + ] + 
}, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "mpi", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process finds the closest pair in its local part of the vector.\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j += size) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between its elements\n\tfor (int i = my_rank; i < n; i += num_procs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = n / size;\n\tint start = rank * chunk_size;\n\tint end = start + chunk_size;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn globalMinDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = (n + size - 1) / size;\n\tint start = rank * chunk_size;\n\tint end = std::min(start + chunk_size, n);\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tstd::vector local_x(x.begin() + start, x.begin() + end);\n\tstd::sort(local_x.begin(), local_x.end());\n\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < local_n - 1; i++) {\n\t\tlocal_min_dist = std::min(local_min_dist, distance(local_x[i], local_x[i + 1]));\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tif (rank == size - 1) {\n\t\tend = x.size();\n\t}\n\tdouble local_min_dist = 
std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\tint n = x.size();\n\tint local_n = n / num_procs;\n\tint start = my_rank * local_n;\n\tint end = (my_rank + 1) * local_n;\n\tif (my_rank == num_procs - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between its elements\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn globalMinDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the distance between its elements and the elements of the next process.\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = (rank + 1) * local_size;\n\tif (rank == size - 1) {\n\t\tend = x.size();\n\t}\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble 
global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = n / size;\n\tint start = rank * chunk_size;\n\tint end = (rank + 1) * chunk_size;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the distance between its elements\n\tstd::vector local_distances;\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tfor (size_t j = i + 1; j < x.size(); j += size) {\n\t\t\tlocal_distances.push_back(distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Gather all distances to the root process\n\tstd::vector global_distances;\n\tMPI_Gather(local_distances.data(), local_distances.size(), MPI_DOUBLE,\n\t\tglobal_distances.data(), local_distances.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the minimum distance\n\tdouble min_distance = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double d : global_distances) {\n\t\t\tmin_distance = std::min(min_distance, d);\n\t\t}\n\t}\n\n\t// Broadcast the minimum distance to all processes\n\tMPI_Bcast(&min_distance, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn min_distance;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "cuda", + "prompt": "__device__ double distanceBetweenPoints(double x1, double x2) {\n\treturn fabs(x1 - x2);\n}\n\n/* Compute the distance between the closest two elements in the vector x.\n Store the result in distance.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\n__global__ void closestPair(const double *x, size_t N, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= N) return;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = bid * blockDim.x + tid;\n\n\tif (i >= N) {\n\t\treturn;\n\t}\n\n\tdouble min_dist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint blockId = blockIdx.x;\n\tint blockDim = blockDim.x;\n\tint gridDim = gridDim.x;\n\n\t// Each thread computes the distance between its element and the next element.\n\tif (tid < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tif (tid == 0 || dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n\n\t// Each block reduces the distances computed by its threads.\n\t__shared__ double blockDistance;\n\tif (tid == 0) {\n\t\tblockDistance = *distance;\n\t}\n\t__syncthreads();\n\tif (tid < blockDim - 1) {\n\t\tdouble dist = distanceBetweenPoints(blockDistance, x[blockId * blockDim + tid + 1]);\n\t\tif (dist < blockDistance) {\n\t\t\tblockDistance = dist;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*distance = blockDistance;\n\t}\n\n\t// The block with the smallest distance broadcasts it to all other blocks.\n\t__shared__ double gridDistance;\n\tif (blockId == 0) {\n\t\tgridDistance = *distance;\n\t}\n\t__syncthreads();\n\tif (blockId < gridDim - 1) {\n\t\tdouble dist = distanceBetweenPoints(gridDistance, x[(blockId + 1) * blockDim]);\n\t\tif (dist < gridDistance) {\n\t\t\tgridDistance = dist;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (blockId == 0) {\n\t\t*distance = gridDistance;\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\t__shared__ double minDistance;\n\tminDistance = DBL_MAX;\n\n\tfor (size_t i = threadIdx.x; i < N - 1; i += blockDim.x) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDistance) {\n\t\t\t\tminDistance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (threadIdx.x == 0) {\n\t\t*distance = minDistance;\n\t}\n}\n", + "\n\n\t// Each thread computes the distance between its element and all subsequent elements.\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\t// Compute the distance between the current element and all subsequent elements.\n\tfor (int i = tid + bid * stride; i < N - 1; i += stride) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[i + 1]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = 
DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\tdouble min_dist = DBL_MAX;\n\twhile (i < N) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t\ti += blockDim.x * gridDim.x;\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = min_dist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x * gridDim.x;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tdouble minDist = DBL_MAX;\n\twhile (i < N) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\ti += blockDim.x * gridDim.x;\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (i == tid + 1 || dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= N) {\n\t\treturn;\n\t}\n\tdouble minDist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) 
{\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\tdistance[tid] = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i < N - 1) {\n\t\tdistance[i] = distance(x[i], x[i + 1]);\n\t}\n}\n", + "\n\n\t// Each thread computes the distance between its element and all elements to its right.\n\tint tid = threadIdx.x;\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min : local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tint size;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = my_rank * local_n;\n\tint end = (my_rank + 1) * local_n;\n\tif (my_rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min : local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between any two elements in its local subvector.\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all processes to find the global minimum distance.\n\tMPI_Allreduce(&min_dist, 
&min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\t// Each process computes the distance between all pairs of elements in its local subvector.\n\tstd::vector local_min_distances(n / num_procs);\n\t#pragma omp parallel for\n\tfor (int i = my_rank * (n / num_procs); i < (my_rank + 1) * (n / num_procs); i++) {\n\t\tfor (int j = i + 1; j < (my_rank + 1) * (n / num_procs); j++) {\n\t\t\tlocal_min_distances[i - my_rank * (n / num_procs)] = std::min(local_min_distances[i - my_rank * (n / num_procs)], distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Each process sends its local minimum distance to the root process.\n\tstd::vector global_min_distances(num_procs);\n\tMPI_Gather(local_min_distances.data(), n / num_procs, MPI_DOUBLE, global_min_distances.data(), n / num_procs, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// The root process finds the minimum distance among all the local minimum distances.\n\tdouble global_min_distance = std::numeric_limits::max();\n\tif (my_rank == 0) {\n\t\tfor (int i = 0; i < num_procs; i++) {\n\t\t\tglobal_min_distance = std::min(global_min_distance, global_min_distances[i]);\n\t\t}\n\t}\n\n\t// Return the global minimum distance.\n\treturn global_min_distance;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tint n = x.size();\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint my_rank, nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the closest pair in its local subvector\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = my_rank; i < n; i += nprocs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, 
&min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min_dist)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\t// Get the number of processes and the rank of the current process\n\tint num_procs, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\t// Divide the vector into chunks and assign each chunk to a process\n\tint chunk_size = x.size() / num_procs;\n\tstd::vector local_x(x.begin() + rank * chunk_size, x.begin() + (rank + 1) * chunk_size);\n\n\t// Find the closest pair in the local chunk\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min_dist)\n\tfor (int i = 0; i < local_x.size(); i++) {\n\t\tfor (int j = i + 1; j < local_x.size(); j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(local_x[i], local_x[j]));\n\t\t}\n\t}\n\n\t// Gather the local minimum distances from all processes\n\tstd::vector global_min_dists(num_procs);\n\tMPI_Gather(&local_min_dist, 1, MPI_DOUBLE, global_min_dists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the smallest global minimum distance\n\tdouble global_min_dist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double dist : global_min_dists) {\n\t\t\tglobal_min_dist = std::min(global_min_dist, dist);\n\t\t}\n\t}\n\n\t// Broadcast the global minimum distance to all processes\n\tMPI_Bcast(&global_min_dist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tif (n <= 1) {\n\t\treturn -1;\n\t}\n\n\t// Sort the vector in ascending order\n\tstd::sort(x.begin(), x.end());\n\n\t// Find the closest pair in each half of the vector\n\tdouble minDist = distance(x[0], x[1]);\n\t#pragma omp parallel\n\t{\n\t\tint tid = omp_get_thread_num();\n\t\tint numThreads = omp_get_num_threads();\n\t\tint start = tid * n / numThreads;\n\t\tint end = (tid + 1) * n / numThreads;\n\t\tdouble localMinDist = minDist;\n\t\tfor (int i = start; i < end; i++) {\n\t\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\t\tif (dist < localMinDist) {\n\t\t\t\t\tlocalMinDist = dist;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t#pragma omp critical\n\t\t{\n\t\t\tif (localMinDist < minDist) {\n\t\t\t\tminDist = localMinDist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint my_rank, 
nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\t// Each process computes the closest pair in its local subvector.\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = my_rank; i < n; i += nprocs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum.\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the closest pair in its local subvector\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint n = x.size();\n\tint n_local = n / omp_get_num_threads();\n\tint tid = omp_get_thread_num();\n\tdouble min_local = std::numeric_limits::max();\n\t\n\t#pragma omp parallel reduction(min:min_local)\n\tfor (int i = tid * n_local; i < (tid + 1) * n_local; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_local = std::min(min_local, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble min_global;\n\tMPI_Allreduce(&min_local, &min_global, 1, MPI_DOUBLE, MPI_MIN);\n\treturn min_global;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Use OpenMP to parallelize the computation of distances between pairs of elements\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Reduce(&local_min_dist, 1, MPI_DOUBLE, &global_min_dist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance.\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t}\n\treturn -1;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n double min_dist = std::numeric_limits::max();\n\n // Compute the distance between each pair of points\n parallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int& i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n double dist = distance(points(i), points(j));\n if (dist < min_dist) {\n min_dist = dist;\n }\n }\n });\n\n return min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View distances(\"Distances\", points.size());\n\tKokkos::parallel_for(\"Compute distances\", points.size(), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tdistances[i] = std::min(distances[i], distance(points[i], points[j]));\n\t\t}\n\t});\n\treturn *Kokkos::min_element(distances);\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(\"Closest Pair\", points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tparallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tparallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int j) {\n\t\t\tif (i != j) {\n\t\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\t\tif (dist < minDist) {\n\t\t\t\t\tminDist = dist;\n\t\t\t\t}\n\t\t\t}\n\t\t});\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn 
minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tminDist = std::min(minDist, dist);\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing TeamPolicy = Kokkos::TeamPolicy>;\n\tusing TeamMember = Kokkos::TeamMember;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const TeamMember& team) {\n\t\tfor (int i = team.league_rank(); i < points.extent(0); i += team.league_size()) {\n\t\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\t\tminDist = std::min(minDist, distance(points(i), points(j)));\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing namespace std;\n\n\t// Initialize the minimum distance to a large value\n\tdouble minDist = numeric_limits::max();\n\n\t// Create a team policy for parallel execution\n\tTeamPolicy policy(Kokkos::DefaultExecutionSpace(), points.extent(0));\n\n\t// Define the kernel function to compute the distance between each pair of points\n\tKokkos::parallel_for(policy, KOKKOS_LAMBDA(const int& i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::RangePolicy;\n\tusing view_type = Kokkos::View;\n\n\tconst int num_points = points.extent(0);\n\tview_type distances(\"Distances\", num_points * num_points);\n\n\tKokkos::parallel_for(policy_type(0, num_points * num_points), KOKKOS_LAMBDA(const int i) {\n\t\tconst int p1_idx = i / num_points;\n\t\tconst int p2_idx = i % num_points;\n\t\tif (p1_idx != p2_idx) {\n\t\t\tdistances(i) = distance(points(p1_idx), points(p2_idx));\n\t\t}\n\t\telse {\n\t\t\tdistances(i) = std::numeric_limits::max();\n\t\t}\n\t});\n\n\tdouble min_distance = std::numeric_limits::max();\n\tfor (int i = 0; i < num_points * num_points; i++) {\n\t\tmin_distance = std::min(min_distance, distances(i));\n\t}\n\n\treturn min_distance;\n}\n", + "\n\n\tKok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::make_span(points.begin(), points.end()), 0.0,\n\t\t[](double d, const Point* p) {\n\t\t\treturn d + distance(*p, {0, 0});\n\t\t},\n\t\t[](double a, double b) { return a + b; });\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Initialize the minimum distance to a large value\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\t// Calculate the distance between the two points\n\t\t\tdouble dist = distance(points(i), points(j));\n\n\t\t\t// Update the minimum distance if necessary\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\t// Return the minimum distance\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all points\n\tfor (size_t i 
= 0; i < points.size(); ++i) {\n\t\t// Iterate over all points after i\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\t// Calculate the distance between the two points\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\t// Check if the distance is less than the current minimum distance\n\t\t\tif (dist < minDist) {\n\t\t\t\t// Update the minimum distance\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Return the minimum distance\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View distances(\"Distances\", points.extent(0));\n\tKokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdistances(i) = std::min(distances(i), distance(points(i), points(j)));\n\t\t}\n\t});\n\treturn Kokkos::min_reduce(Kokkos::RangePolicy(0, points.extent(0)), distances, distances(0));\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use OpenMP to compute in parallel.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j 
< points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = points.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) 
{\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits<double>::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits<double>::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_distance = std::numeric_limits<double>::max();\n#pragma omp parallel for reduction(min:min_distance)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tmin_distance = std::min(min_distance, dist);\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "\n\n\t// Check if the vector is empty or has only one point\n\tif (points.empty() || points.size() == 1) {\n\t\treturn std::numeric_limits<double>::max();\n\t}\n\n\t// Sort the points by x-coordinate\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Initialize the minimum distance to the maximum possible value\n\tdouble minDist = std::numeric_limits<double>::max();\n\n\t// Iterate over all pairs of points\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits<double>::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits<double>::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector<Point> const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits<double>::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble 
min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn -1;\n\t}\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = 
distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], 
points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tauto compareX = [](Point const& p1, Point const& p2) { return p1.x < p2.x; };\n\tauto compareY = [](Point const& p1, Point const& p2) { return p1.y < p2.y; };\n\n\tstd::sort(points.begin(), points.end(), compareX);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tauto compareX = [](Point const& p1, Point const& p2) { return p1.x < p2.x; };\n\tauto compareY = [](Point const& p1, Point const& p2) { return p1.y < p2.y; };\n\tstd::sort(points.begin(), points.end(), compareX);\n\tdouble minDist = distance(points[0], points[1]);\n\tstd::sort(points.begin(), points.end(), compareY);\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tif (points[j].y - points[i].y >= minDist) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn 0.0;\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = distance(points[0], points[1]);\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distanceBetweenPoints(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the distance between the closest two points in the vector points.\n Store the result in distance.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\n__global__ void closestPair(const Point *points, size_t numPoints, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Initialize the distance to a large value\n\t*distance = DBL_MAX;\n\n\t// Iterate over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid < numPoints) {\n\t\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < num) {\n\t\tdistance[i] = distance(points[i], points[(i + 1) % num]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < num) {\n\t\tdistance[idx] = distance(points[idx], points[(idx + 1) % num]);\n\t}\n}\n", + "\n\n\t// Initialize the distance to a large value\n\t*distance = DBL_MAX;\n\n\t// Loop over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) return;\n\tdouble minDist = distanceBetweenPoints(points[tid], points[(tid+1)%numPoints]);\n\tfor (int i = tid+1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\tatomicMin(distance, minDist);\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid < numPoints) {\n\t\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = gid; i < numPoints; i += hipBlockDim_x) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = 
distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint idx = hipGetGlobalIndex();\n\tif (idx < num) {\n\t distance[idx] = distance(points[idx], points[(idx + 1) % num]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\t// Initialize the distance to a large value\n\t*distance = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "(1.21,12.12.1222.12.12.", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = distanceBetweenPoints(points[gid], points[(gid + 1) % numPoints]);\n\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Compute the distance between each pair of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tdouble minDist = distanceBetweenPoints(points[tid], points[(tid + 1) % numPoints]);\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\t__shared__ double minDist;\n\t__shared__ Point p1, p2;\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t\tp1 = points[tid];\n\t\t\t\tp2 = points[i];\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, 
y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local copy of points.\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0.\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\t// Find the closest pair of points on this rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_distance = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); ++i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_distance) {\n\t\t\t\tmin_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_distance;\n\tMPI_Allreduce(&min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_distance;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t 
j = i + 1; j < points.size(); ++j) {\n\t\t\tmin_dist = std::min(min_dist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t}\n\n\treturn -1;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\tif (rank == 0) {\n\t\treturn min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the closest pair on this rank\n\tdouble minDist = 
std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_distance) {\n\t\t\t\tlocal_min_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_distance;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its subset of points\n\tstd::vector> closestPairs;\n\tfor (size_t i = rank; i < points.size(); i += size) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tclosestPairs.push_back({points[i], points[j]});\n\t\t}\n\t}\n\n\t// Each rank sends its closest pairs to rank 0\n\tstd::vector> allClosestPairs;\n\tMPI_Gather(&closestPairs, closestPairs.size(), MPI_DOUBLE, &allClosestPairs, allClosestPairs.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 finds the closest pair of points among all the closest pairs\n\tdouble minDist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (auto const& pair : allClosestPairs) {\n\t\t\tdouble dist = distance(pair.first, pair.second);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Rank 0 broadcasts the minimum distance to all other ranks\n\tMPI_Bcast(&minDist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// 
Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its subset of points.\n\tstd::vector local_points = points;\n\tstd::sort(local_points.begin(), local_points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (size_t i = 0; i < local_points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < local_points.size(); ++j) {\n\t\t\tdouble dist = distance(local_points[i], local_points[j]);\n\t\t\tif (dist < local_min_distance) {\n\t\t\t\tlocal_min_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Each rank sends its local minimum distance to rank 0.\n\tMPI_Gather(&local_min_distance, 1, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 computes the global minimum distance.\n\tdouble global_min_distance = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (int i = 0; i < size; ++i) {\n\t\t\tdouble recv_min_distance;\n\t\t\tMPI_Gather(&recv_min_distance, 1, MPI_DOUBLE, &recv_min_distance, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\t\t\tif (recv_min_distance < global_min_distance) {\n\t\t\t\tglobal_min_distance = recv_min_distance;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the closest pair on each rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Reduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn 
-1;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distanceBetweenPoints(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the distance between the closest two points in the vector points.\n Store the result in distance.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\n__global__ void closestPair(const Point *points, size_t numPoints, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) return;\n\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint num_blocks = gridDim.x;\n\n\tdouble min_dist = DBL_MAX;\n\tfor (int i = tid; i < numPoints; i += bid_size * num_blocks) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = min_dist;\n}\n", + "\n\n\tint idx = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (idx >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDistance = DBL_MAX;\n\tfor (int i = idx + 1; i < numPoints; i++) {\n\t\tdouble distance = distanceBetweenPoints(points[idx], points[i]);\n\t\tif (distance < minDistance) {\n\t\t\tminDistance = distance;\n\t\t}\n\t}\n\n\tif (idx == 0) {\n\t\t*distance = minDistance;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], 
points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tdouble minDist = DBL_MAX;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int j = i + 1; j < numPoints; j++) {\n\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tdouble minDist = DBL_MAX;\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\tdistance[tid] = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int j = i + 1; j < numPoints; j++) {\n\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble 
dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all ranks\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_distance = DBL_MAX;\n\t#pragma omp parallel for reduction(min: min_distance)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t for (size_t j = i + 1; j < points.size(); ++j) {\n\t min_distance = std::min(min_distance, distance(points[i], points[j]));\n\t }\n\t}\n\n\tif (rank == 0) {\n\t return min_distance;\n\t}\n\treturn 0.0;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Gather all the local minimum distances on rank 0\n\tstd::vector allMinDists(size);\n\tMPI_Gather(&minDist, 1, MPI_DOUBLE, allMinDists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the global minimum distance\n\tif (rank == 0) {\n\t\tfor 
(double dist : allMinDists) {\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocal_min_dist = std::min(local_min_dist, dist);\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all ranks\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local subvector\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (int i = 0; i < points.size(); ++i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tlocalMinDist = std::min(localMinDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t} else {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the closest pair on this rank\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist 
< min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN);\n\n\t// Return the global minimum distance\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma opm parallel for reduction(min:min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i+1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\treturn min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\telse {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t} else {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocalMinDist = std::min(localMinDist, dist);\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, 
size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min_dist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocal_min_dist = std::min(local_min_dist, dist);\n\t\t}\n\t}\n\n\t// Gather all local minima on rank 0\n\tstd::vector all_min_dists(size);\n\tMPI_Gather(&local_min_dist, 1, MPI_DOUBLE, all_min_dists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 finds the global minimum\n\tdouble global_min_dist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double dist : all_min_dists) {\n\t\t\tglobal_min_dist = std::min(global_min_dist, dist);\n\t\t}\n\t}\n\n\t// Return the global minimum on rank 0\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "(1.41421, 1.41421)", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tlocalMinDist = std::min(localMinDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn 
minDist;\n\t}\n\treturn -1;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(Kokkos::View &points, Kokkos::View &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Sort points by x-coordinate\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return points_sorted(i)->x < points_sorted(j)->x;\n });\n\n // Find the upper and lower hulls\n auto upper_hull = hull.slice(0, hull.extent(0) / 2);\n auto lower_hull = hull.slice(hull.extent(0) / 2, hull.extent(0));\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n while (upper_hull_size >= 2 &&\n (upper_hull(upper_hull_size - 1)->y - upper_hull(upper_hull_size - 2)->y) *\n (points_sorted(i)->x - upper_hull(upper_hull_size - 1)->x) <\n (upper_hull(upper_hull_size - 1)->x - upper_hull(upper_hull_size - 2)->x) *\n (points_sorted(i)->y - upper_hull(upper_hull_size - 1)->y)) {\n upper_hull_size--;\n }\n upper_hull(upper_hull_size++) = points_sorted(i);\n }\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = points_sorted.extent(0) - 1; i >= 0; i--) {\n while (lower_hull_size >= 2 &&\n (lower_hull(lower_hull_size - 1)->y - lower_hull(lower_hull_size - 2)->y) *\n (points_sorted(i)->x - lower_hull(lower_hull_size - 1)->x) <\n (lower_hull(lower_hull_size - 1)->x - lower_hull(lower_hull_size - 2)->x) *\n (points_sorted(i)->y - lower_hull(lower_hull_size - 1)->y)) {\n lower_hull_size--;\n }\n lower_hull(lower_hull_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_hull_size + lower_hull_size - 1;\n for (int i = 0; i < lower_hull_size - 1; i++) {\n hull(hull_size--) = lower_hull(i);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int &i) {\n points_sorted(i) = points(i);\n });\n\n // Find the upper and lower hulls\n std::vector upper_hull;\n std::vector lower_hull;\n\n for (int i = 0; i < points_sorted.extent(0); i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upper_hull.size() >= 2 &&\n (upper_hull[upper_hull.size() - 2]->y - upper_hull[upper_hull.size() - 1]->y) *\n (points_sorted(i)->x - upper_hull[upper_hull.size() - 1]->x) <=\n (upper_hull[upper_hull.size() - 1]->y - points_sorted(i)->y) *\n (upper_hull[upper_hull.size() - 2]->x - upper_hull[upper_hull.size() - 1]->x)) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lower_hull.size() >= 2 &&\n (lower_hull[lower_hull.size() - 2]->y - lower_hull[lower_hull.size() - 1]->y) *\n (points_sorted(i)->x - lower_hull[lower_hull.size() - 1]->x) >=\n (lower_hull[lower_hull.size() - 1]->y - 
points_sorted(i)->y) *\n (lower_hull[lower_hull.size() - 2]->x - lower_hull[lower_hull.size() - 1]->x)) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 2);\n int j = 0;\n for (int i = 0; i < upper_hull.size() - 1; i++) {\n hull(j++) = upper_hull[i];\n }\n for (int i = lower_hull.size() - 2; i >= 0; i--) {\n hull(j++) = lower_hull[i];\n }\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_dimensions = points.extent(1);\n\n // Find the point with the minimum y-coordinate.\n int min_y_index = 0;\n for (int i = 1; i < num_points; ++i) {\n if (points(i, 1) < points(min_y_index, 1)) {\n min_y_index = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points(0, 0);\n points(0, 0) = points(min_y_index, 0);\n points(min_y_index, 0) = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double angle_a = atan2(a.y - points(0, 1), a.x - points(0, 0));\n double angle_b = atan2(b.y - points(0, 1), b.x - points(0, 0));\n return angle_a < angle_b;\n };\n parallel_sort(Kokkos::RangePolicy(1, num_points), points, compare_polar_angle);\n\n // Find the convex hull using Graham's scan.\n int hull_size = 0;\n for (int i = 0; i < num_points; ++i) {\n while (hull_size >= 2 && (points(hull_size - 1, 0) - points(hull_size - 2, 0)) * (points(i, 1) - points(hull_size - 2, 1)) - (points(hull_size - 1, 1) - points(hull_size - 2, 1)) * (points(i, 0) - points(hull_size - 2, 0)) < 0) {\n hull_size--;\n }\n hull(hull_size, 0) = points(i, 0);\n hull_size++;\n }\n\n // Resize the hull to its final size.\n hull.resize(hull_size, num_dimensions);\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the upper and lower hulls\n Kokkos::View upperHull(\"Upper Hull\", points.extent(0));\n Kokkos::View lowerHull(\"Lower Hull\", points.extent(0));\n int upperHullSize = 0;\n int lowerHullSize = 0;\n for (int i = 0; i < points.extent(0); i++) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHullSize >= 2 &&\n (upperHull(upperHullSize - 1)->y - upperHull(upperHullSize - 2)->y) *\n (points(i)->x - upperHull(upperHullSize - 2)->x) <\n (upperHull(upperHullSize - 1)->x - upperHull(upperHullSize - 2)->x) *\n (points(i)->y - upperHull(upperHullSize - 2)->y)) {\n upperHullSize--;\n }\n upperHull(upperHullSize++) = points(i);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while (lowerHullSize >= 2 &&\n (lowerHull(lowerHullSize - 1)->y - lowerHull(lowerHullSize - 2)->y) *\n (points(i)->x - lowerHull(lowerHullSize - 2)->x) >\n (lowerHull(lowerHullSize - 1)->x - lowerHull(lowerHullSize - 2)->x) *\n (points(i)->y - lowerHull(lowerHullSize - 2)->y)) {\n lowerHullSize--;\n }\n lowerHull(lowerHullSize++) = points(i);\n }\n\n // Merge the upper and lower hulls into the convex hull\n int hullSize = upperHullSize + lowerHullSize - 2;\n for (int i = 0; i < upperHullSize; i++) {\n hull(i) = upperHull(i);\n }\n for (int i = upperHullSize; i < hullSize; i++) {\n hull(i) = lowerHull(i - upperHullSize + 1);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n 
Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the upper and lower hulls\n Kokkos::View upperHull(\"upperHull\", points.extent(0));\n Kokkos::View lowerHull(\"lowerHull\", points.extent(0));\n\n int upperHullSize = 0;\n int lowerHullSize = 0;\n\n for (int i = 0; i < points.extent(0); i++) {\n // Add the point to the upper hull if it is to the right of the current rightmost point\n while (upperHullSize >= 2 &&\n (upperHull(upperHullSize - 1)->y - upperHull(upperHullSize - 2)->y) *\n (points(i)->x - upperHull(upperHullSize - 2)->x) <=\n (upperHull(upperHullSize - 1)->x - upperHull(upperHullSize - 2)->x) *\n (points(i)->y - upperHull(upperHullSize - 2)->y)) {\n upperHullSize--;\n }\n upperHull(upperHullSize++) = points(i);\n\n // Add the point to the lower hull if it is to the left of the current leftmost point\n while (lowerHullSize >= 2 &&\n (lowerHull(lowerHullSize - 1)->y - lowerHull(lowerHullSize - 2)->y) *\n (points(i)->x - lowerHull(lowerHullSize - 2)->x) >=\n (lowerHull(lowerHullSize - 1)->x - lowerHull(lowerHullSize - 2)->x) *\n (points(i)->y - lowerHull(lowerHullSize - 2)->y)) {\n lowerHullSize--;\n }\n lowerHull(lowerHullSize++) = points(i);\n }\n\n // Merge the upper and lower hulls\n int hullSize = upperHullSize + lowerHullSize - 2;\n for (int i = 0; i < upperHullSize; i++) {\n hull(i) = upperHull(i);\n }\n for (int i = upperHullSize; i < hullSize; i++) {\n hull(i) = lowerHull(i - upperHullSize + 1);\n }\n\n // Return the hull\n hull.resize(hullSize);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points_sorted.extent(0)), points_sorted, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n while (upper_hull_size >= 2 && cross(upper_hull[upper_hull_size - 2], upper_hull[upper_hull_size - 1], points_sorted[i]) <= 0) {\n upper_hull_size--;\n }\n upper_hull[upper_hull_size++] = points_sorted[i];\n }\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = points_sorted.extent(0) - 1; i >= 0; i--) {\n while (lower_hull_size >= 2 && cross(lower_hull[lower_hull_size - 2], lower_hull[lower_hull_size - 1], points_sorted[i]) <= 0) {\n lower_hull_size--;\n }\n lower_hull[lower_hull_size++] = points_sorted[i];\n }\n\n // Merge the upper and lower hulls\n int hull_size = 0;\n for (int i = 0; i < upper_hull_size; i++) {\n hull[hull_size++] = upper_hull[i];\n }\n for (int i = lower_hull_size - 2; i >= 0; i--) {\n hull[hull_size++] = lower_hull[i];\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_size);\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_hull_points = 4;\n\n // Find the leftmost point\n int leftmost_index = 0;\n for (int i = 1; i < num_points; ++i) {\n if (points(i).x < points(leftmost_index).x) {\n leftmost_index = i;\n }\n }\n\n // Sort the points by their polar angle with respect to the leftmost point\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double dx1 = a.x - points(leftmost_index).x;\n double dy1 = a.y - points(leftmost_index).y;\n double dx2 = b.x - points(leftmost_index).x;\n double dy2 = b.y - points(leftmost_index).y;\n return (atan2(dy1, dx1) < 
atan2(dy2, dx2));\n };\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, num_points), points_sorted, compare_polar_angle);\n\n // Find the convex hull using Graham's scan\n int hull_index = 0;\n for (int i = 0; i < num_points; ++i) {\n while (hull_index >= 2 && (points_sorted(i).x - hull(hull_index - 1).x) * (hull(hull_index - 2).y - hull(hull_index - 1).y) - (points_sorted(i).y - hull(hull_index - 1).y) * (hull(hull_index - 2).x - hull(hull_index - 1).x) < 0) {\n hull_index--;\n }\n hull(hull_index++) = points_sorted(i);\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_index);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, points_sorted.extent(0)),\n KOKKOS_LAMBDA(const int &i, const auto &upper_hull) {\n auto &hull = upper_hull.access();\n if (i == 0) {\n hull[0] = points_sorted(i);\n return hull;\n }\n while (hull.extent(0) >= 2 &&\n (hull(hull.extent(0) - 2).y - hull(hull.extent(0) - 1).y) *\n (points_sorted(i).x - hull(hull.extent(0) - 1).x) <\n (hull(hull.extent(0) - 1).y - points_sorted(i).y) *\n (hull(hull.extent(0) - 2).x - hull(hull.extent(0) - 1).x)) {\n hull.resize(hull.extent(0) - 1);\n }\n hull.resize(hull.extent(0) + 1);\n hull(hull.extent(0) - 1) = points_sorted(i);\n return hull;\n },\n Kokkos::View(nullptr, 0));\n\n auto lower_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1),\n KOKKOS_LAMBDA(const int &i, const auto &lower_hull) {\n auto &hull = lower_hull.access();\n if (i == points_sorted.extent(0) - 1) {\n hull[0] = points_sorted(i);\n return hull;\n }\n while (hull.extent(0) >= 2 &&\n (hull(hull.extent(0) - 2).y - hull(hull.extent(0) - 1).y) *\n (points_sorted(i).x - hull(hull.extent(0) - 1).x) <\n (hull(hull.extent(0) - 1).y - points_sorted(i).y) *\n (hull(hull.extent(0) - 2).x - hull(hull.extent(0) - 1).x)) {\n hull.resize(hull.extent(0) - 1);\n }\n hull.resize(hull.extent(0) + 1);\n hull(hull.extent(0) - 1) = points_sorted(i);\n return hull;\n },\n Kokkos::View(nullptr, 0));\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.extent(0) + lower_hull.extent(0) - 2);\n Kokkos::parallel_for(\n Kokkos::RangePolicy(0, upper_hull.extent(0)),\n KOKKOS_LAMBDA(const int &i) { hull(i) = upper_hull(i); });\n Kokkos::parallel_for(\n Kokkos::RangePolicy(0, lower_hull.extent(0) - 1),\n KOKKOS_LAMBDA(const int &i) { hull(upper_hull.extent(0) + i) = lower_hull(i); });\n}\n", + "\n\n using Kok\u30b3\u30af::parallel_for;\n using Kok\u30b3\u30af::sort;\n\n // Sort the points by their x-coordinates.\n sort(Kok\u30b3\u30af::begin(points), Kok\u30b3\u30af::end(points), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper hull.\n int upperHullSize = 0;\n for (int i = 0; i < points.size(); ++i) {\n while (upperHullSize >= 2 && Kok\u30b3\u30af::cross(hull[upperHullSize - 2], hull[upperHullSize - 1], points[i]) <= 0) {\n --upperHullSize;\n }\n hull[upperHullSize++] = points[i];\n }\n\n // Find the lower hull.\n int lowerHullSize = 0;\n for (int i = points.size() - 1; i >= 0; --i) {\n while (lowerHullSize >= 2 && Kok\u30b3\u30af::cross(hull[lowerHullSize - 2], hull[lowerHullSize - 1], points[i]) <= 0) {\n --lowerHullSize;\n }\n hull[lowerHullSize++] = points[i];\n }\n\n // Remove the duplicate points from the lower hull.\n while 
(lowerHullSize > 0 && hull[lowerHullSize - 1] == hull[lowerHullSize - 2]) {\n --lowerHullSize;\n }\n\n // Combine the upper and lower hulls.\n hull.resize(upperHullSize + lowerHullSize - 1);\n std::copy(hull.begin(), hull.begin() + upperHullSize, hull.begin());\n std::copy(hull.begin() + upperHullSize, hull.begin() + upperHullSize + lowerHullSize - 1, hull.begin() + upperHullSize);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points_sorted.extent(0)), KOKKOS_LAMBDA(const int &i, auto &upper_hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n upper_hull.push_back(points_sorted(i));\n } else {\n while (upper_hull.size() >= 2 && cross(upper_hull[upper_hull.size() - 2], upper_hull[upper_hull.size() - 1], points_sorted(i)) <= 0) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n }\n });\n\n auto lower_hull = Kokkos::parallel_reduce(Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1), KOKKOS_LAMBDA(const int &i, auto &lower_hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n lower_hull.push_back(points_sorted(i));\n } else {\n while (lower_hull.size() >= 2 && cross(lower_hull[lower_hull.size() - 2], lower_hull[lower_hull.size() - 1], points_sorted(i)) <= 0) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n });\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 2);\n std::copy(upper_hull.begin(), upper_hull.end() - 1, hull.begin());\n std::copy(lower_hull.begin() + 1, lower_hull.end(), hull.begin() + upper_hull.size() - 1);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points_sorted(i)->x < points_sorted(j)->x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n int upper_hull_size = 0;\n int lower_hull_size = 0;\n\n // Add the first point to the upper hull\n upper_hull(upper_hull_size++) = points_sorted(0);\n\n // Add the last point to the lower hull\n lower_hull(lower_hull_size++) = points_sorted(points.extent(0) - 1);\n\n // Iterate over the remaining points\n for (int i = 1; i < points.extent(0) - 1; i++) {\n // Add the point to the upper hull\n while (upper_hull_size >= 2 &&\n (points_sorted(i)->y - upper_hull(upper_hull_size - 2)->y) *\n (upper_hull(upper_hull_size - 1)->x - upper_hull(upper_hull_size - 2)->x) <=\n (upper_hull(upper_hull_size - 1)->y - upper_hull(upper_hull_size - 2)->y) *\n (points_sorted(i)->x - upper_hull(upper_hull_size - 2)->x)) {\n upper_hull_size--;\n }\n upper_hull(upper_hull_size++) = points_sorted(i);\n\n // Add the point to the lower hull\n while (lower_hull_size >= 2 &&\n (points_sorted(i)->y - lower_hull(lower_hull_size - 2)->y) *\n (lower_hull(lower_hull_size - 1)->x - lower_hull(lower_hull_size - 2)->x) >=\n (lower_hull(lower_hull_size - 1)->y - lower_hull(lower_hull_size - 2)->y) *\n (points_sorted(i)->x - lower_hull(lower_hull_size - 2)->x)) {\n lower_hull_size--;\n }\n lower_hull(lower_hull_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_hull_size + lower_hull_size - 2;\n for (int i = 0; i < upper_hull_size; i++) {\n hull(i) = 
upper_hull(i);\n }\n for (int i = upper_hull_size; i < hull_size; i++) {\n hull(i) = lower_hull(i - upper_hull_size + 1);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinate\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int &i) {\n points_sorted(i) = points(i);\n });\n\n // Find the upper and lower hulls\n std::vector upper_hull, lower_hull;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upper_hull.size() >= 2 && cross(upper_hull[upper_hull.size() - 2], upper_hull[upper_hull.size() - 1], points_sorted(i)) <= 0) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lower_hull.size() >= 2 && cross(lower_hull[lower_hull.size() - 2], lower_hull[lower_hull.size() - 1], points_sorted(i)) >= 0) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n\n // Merge the upper and lower hulls\n for (int i = lower_hull.size() - 2; i >= 0; i--) {\n upper_hull.push_back(lower_hull[i]);\n }\n\n // Copy the convex hull to the output view\n Kokkos::parallel_for(Kokkos::RangePolicy(0, upper_hull.size()), KOKKOS_LAMBDA(const int &i) {\n hull(i) = upper_hull[i];\n });\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, points_sorted.extent(0)),\n KOKKOS_LAMBDA(const int &i, auto &hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n hull.push_back(points_sorted(i));\n } else {\n while (hull.size() >= 2 &&\n (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points_sorted(i).x - hull[hull.size() - 1].x) <\n (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points_sorted(i).y - hull[hull.size() - 1].y)) {\n hull.pop_back();\n }\n hull.push_back(points_sorted(i));\n }\n },\n hull);\n\n auto lower_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1),\n KOKKOS_LAMBDA(const int &i, auto &hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n hull.push_back(points_sorted(i));\n } else {\n while (hull.size() >= 2 &&\n (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points_sorted(i).x - hull[hull.size() - 1].x) <\n (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points_sorted(i).y - hull[hull.size() - 1].y)) {\n hull.pop_back();\n }\n hull.push_back(points_sorted(i));\n }\n },\n hull);\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 1);\n std::copy(upper_hull.begin(), upper_hull.end(), hull.begin());\n std::copy(lower_hull.begin() + 1, lower_hull.end(), hull.begin() + upper_hull.size());\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_dimensions = points.extent(1);\n const int num_hull_points = 0;\n\n // Find the point with the minimum y-coordinate.\n int min_y_index = 0;\n for (int i = 1; i < num_points; i++) {\n if (points(i, 1) < points(min_y_index, 1)) {\n min_y_index = i;\n }\n }\n\n // Swap the point with the minimum y-coordinate to the first position.\n Point temp = points(0, 0);\n points(0, 0) = points(min_y_index, 0);\n points(min_y_index, 0) = temp;\n\n // Sort the points by their polar 
angle with respect to the first point.\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double angle_a = atan2(a.y - points(0, 1), a.x - points(0, 0));\n double angle_b = atan2(b.y - points(0, 1), b.x - points(0, 0));\n return angle_a < angle_b;\n };\n Kokkos::sort(Kokkos::RangePolicy(1, num_points), points, compare_polar_angle);\n\n // Find the convex hull using Graham's scan.\n int hull_size = 0;\n for (int i = 0; i < num_points; i++) {\n while (hull_size >= 2 && (points(hull[hull_size - 2], 0) - points(hull[hull_size - 1], 0)) * (points(i, 1) - points(hull[hull_size - 1], 1)) - (points(i, 0) - points(hull[hull_size - 1], 0)) * (points(hull[hull_size - 2], 1) - points(hull[hull_size - 1], 1)) < 0) {\n hull_size--;\n }\n hull[hull_size++] = i;\n }\n\n // Resize the hull to the correct size.\n hull = Kokkos::View(hull, hull_size);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)), points_sorted, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n int upper_size = 0;\n int lower_size = 0;\n\n // Add the first point to the upper hull\n upper_hull(upper_size++) = points_sorted(0);\n\n // Add the last point to the lower hull\n lower_hull(lower_size++) = points_sorted(points_sorted.extent(0) - 1);\n\n // Iterate over the remaining points\n for (int i = 1; i < points_sorted.extent(0) - 1; i++) {\n // Add the point to the upper hull\n while (upper_size >= 2 && (upper_hull(upper_size - 2).y - upper_hull(upper_size - 1).y) * (points_sorted(i).x - upper_hull(upper_size - 1).x) < (upper_hull(upper_size - 1).y - points_sorted(i).y) * (upper_hull(upper_size - 2).x - upper_hull(upper_size - 1).x)) {\n upper_size--;\n }\n upper_hull(upper_size++) = points_sorted(i);\n\n // Add the point to the lower hull\n while (lower_size >= 2 && (lower_hull(lower_size - 2).y - lower_hull(lower_size - 1).y) * (points_sorted(i).x - lower_hull(lower_size - 1).x) < (lower_hull(lower_size - 1).y - points_sorted(i).y) * (lower_hull(lower_size - 2).x - lower_hull(lower_size - 1).x)) {\n lower_size--;\n }\n lower_hull(lower_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_size + lower_size - 2;\n for (int i = 0; i < upper_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_size; i < hull_size; i++) {\n hull(i) = lower_hull(lower_size - (i - upper_size + 1));\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_size);\n}\n", + "\n\n // Find the leftmost point\n auto leftmost = points.reduce(\n KOKKOS_LAMBDA(const Point &a, const Point &b) { return a.x < b.x ? 
a : b; },\n points.extent(0));\n\n // Sort the points by the angle they make with the leftmost point\n Kokkos::View sorted_points(\"sorted_points\", points.extent(0));\n Kokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i) {\n sorted_points[i] = &points[i];\n });\n\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n sorted_points,\n KOKKOS_LAMBDA(const Point *&a, const Point *&b) {\n return std::atan2(a->y - leftmost->y, a->x - leftmost->x) <\n std::atan2(b->y - leftmost->y, b->x - leftmost->x);\n });\n\n // Find the convex hull using Graham scan\n Kokkos::View stack(\"stack\", points.extent(0));\n int top = 0;\n for (int i = 0; i < points.extent(0); i++) {\n Point *p = sorted_points[i];\n while (top >= 2 &&\n (stack[top - 1]->y - stack[top - 2]->y) * (p->x - stack[top - 2]->x) <\n (stack[top - 1]->x - stack[top - 2]->x) * (p->y - stack[top - 2]->y))\n top--;\n stack[top++] = p;\n }\n\n // Copy the convex hull to the output view\n Kokkos::parallel_for(Kokkos::RangePolicy(0, top),\n KOKKOS_LAMBDA(const int &i) { hull[i] = stack[i]; });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int N = points.extent(0);\n const int HULL_SIZE = 2 * N;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n parallel_sort(Kokkos::Cuda(), points_sorted, [=](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the lower and upper hulls\n auto lower_hull = hull.slice(0, N);\n auto upper_hull = hull.slice(N, HULL_SIZE);\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = 0; i < N; i++) {\n while (lower_hull_size >= 2 && cross(lower_hull[lower_hull_size - 2], lower_hull[lower_hull_size - 1], points_sorted[i]) <= 0) {\n lower_hull_size--;\n }\n lower_hull[lower_hull_size++] = points_sorted[i];\n }\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = N - 1; i >= 0; i--) {\n while (upper_hull_size >= 2 && cross(upper_hull[upper_hull_size - 2], upper_hull[upper_hull_size - 1], points_sorted[i]) <= 0) {\n upper_hull_size--;\n }\n upper_hull[upper_hull_size++] = points_sorted[i];\n }\n\n // Merge the lower and upper hulls\n int hull_size = lower_hull_size + upper_hull_size - 1;\n for (int i = 0; i < hull_size; i++) {\n hull[i] = (i < lower_hull_size) ? 
lower_hull[i] : upper_hull[i - lower_hull_size];\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort points by x-coordinate\n auto points_view = Kokkos::subview(points, Kokkos::ALL(), Kokkos::ALL(), Kokkos::Rank<2>{0, 1});\n auto sorted_points = points_view;\n Kokkos::sort(Kokkos::RangePolicy(0, points_view.extent(0)),\n KOKKOS_LAMBDA(const int i, const int j) {\n return sorted_points(i, 0).x < sorted_points(j, 0).x;\n });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::View(Kokkos::ALL(), \"Upper Hull\");\n auto lower_hull = Kokkos::View(Kokkos::ALL(), \"Lower Hull\");\n int upper_size = 0, lower_size = 0;\n\n // Find the upper hull\n for (int i = 0; i < points_view.extent(0); i++) {\n while (upper_size >= 2 &&\n (upper_hull(upper_size - 1).y - upper_hull(upper_size - 2).y) *\n (sorted_points(i, 0).x - upper_hull(upper_size - 2).x) <\n (upper_hull(upper_size - 1).x - upper_hull(upper_size - 2).x) *\n (sorted_points(i, 0).y - upper_hull(upper_size - 2).y)) {\n upper_size--;\n }\n upper_hull(upper_size++) = sorted_points(i, 0);\n }\n\n // Find the lower hull\n for (int i = points_view.extent(0) - 1; i >= 0; i--) {\n while (lower_size >= 2 &&\n (lower_hull(lower_size - 1).y - lower_hull(lower_size - 2).y) *\n (sorted_points(i, 0).x - lower_hull(lower_size - 2).x) <\n (lower_hull(lower_size - 1).x - lower_hull(lower_size - 2).x) *\n (sorted_points(i, 0).y - lower_hull(lower_size - 2).y)) {\n lower_size--;\n }\n lower_hull(lower_size++) = sorted_points(i, 0);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_size + lower_size - 2;\n hull = Kokkos::View(hull_size, \"Convex Hull\");\n for (int i = 0; i < upper_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_size; i < hull_size; i++) {\n hull(i) = lower_hull(i - upper_size + lower_size - 1);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n const int N = points.extent(0);\n const int M = hull.extent(0);\n\n auto points_d = create_mirror_view(points);\n auto hull_d = create_mirror_view(hull);\n\n parallel_for(N, KOKKOS_LAMBDA(const int i) {\n points_d(i) = points(i);\n });\n\n parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &min_index) {\n if (points_d(i).y < points_d(min_index).y || (points_d(i).y == points_d(min_index).y && points_d(i).x < points_d(min_index).x)) {\n min_index = i;\n }\n }, 0);\n\n int p = 0;\n hull_d(p) = points_d(min_index);\n\n int q;\n do {\n q = (p + 1) % N;\n for (int i = 0; i < N; i++) {\n if (orientation(points_d(p), points_d(i), points_d(q)) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != min_index);\n\n int count = 0;\n do {\n hull_d(count) = points_d(p);\n count++;\n q = (p + 1) % N;\n for (int i = 0; i < N; i++) {\n if (orientation(points_d(p), points_d(i), points_d(q)) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != min_index);\n\n parallel_for(count, KOKKOS_LAMBDA(const int i) {\n hull(i) = hull_d(i);\n });\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the leftmost and rightmost points\n const Point *leftmost = points(0);\n const Point *rightmost = points(points.extent(0) - 1);\n\n // Create a stack to store the points on the hull\n std::stack stack;\n\n // Add the leftmost and rightmost points to the stack\n stack.push(leftmost);\n stack.push(rightmost);\n\n // Iterate over the remaining points\n for (int i = 1; i < points.extent(0) - 1; i++) {\n const Point *p = points(i);\n\n // 
Check if the point is to the left of the line defined by the last two points on the stack\n if (orientation(stack.top(), stack.top()->next, p) == LEFT) {\n // If it is, pop the last point from the stack\n stack.pop();\n // And add the new point to the stack\n stack.push(p);\n }\n }\n\n // Copy the points from the stack to the hull\n int hull_size = stack.size();\n for (int i = 0; i < hull_size; i++) {\n hull(i) = stack.top();\n stack.pop();\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Find the point with the smallest y-coordinate.\n int min_index = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].y < points[min_index].y) {\n min_index = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n std::swap(points[0], points[min_index]);\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points.begin() + 1, points.end(), [&](const Point& a, const Point& b) {\n return (atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x));\n });\n\n // Initialize the convex hull with the first three points.\n hull.push_back(points[0]);\n hull.push_back(points[1]);\n hull.push_back(points[2]);\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < points.size(); i++) {\n while (hull.size() >= 2 && (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y) - (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) >= 0) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Find the first point in the upper hull that is to the right of the current point\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Find the first point in the lower hull that is to the left of the current point\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), 
upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n Point leftmost = points[0];\n for (const Point& p : points) {\n if (p.x < leftmost.x) {\n leftmost = p;\n }\n }\n\n // Sort the points by their angle with the leftmost point\n std::sort(points.begin(), points.end(), [&leftmost](const Point& a, const Point& b) {\n return std::atan2(a.y - leftmost.y, a.x - leftmost.x) < std::atan2(b.y - leftmost.y, b.x - leftmost.x);\n });\n\n // Find the convex hull using Graham's scan\n std::stack convex_hull;\n convex_hull.push(leftmost);\n convex_hull.push(points[1]);\n for (size_t i = 2; i < points.size(); i++) {\n Point top = convex_hull.top();\n while (convex_hull.size() >= 2 && (top.x - convex_hull.at(convex_hull.size() - 2).x) * (points[i].y - top.y) - (top.y - convex_hull.at(convex_hull.size() - 2).y) * (points[i].x - top.x) < 0) {\n convex_hull.pop();\n top = convex_hull.top();\n }\n convex_hull.push(points[i]);\n }\n\n // Copy the convex hull to the output vector\n while (!convex_hull.empty()) {\n hull.push_back(convex_hull.top());\n convex_hull.pop();\n }\n}\n", + ".", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the leftmost and rightmost points\n Point leftmost = points[0];\n Point rightmost = points[points.size() - 1];\n\n // Find the points that are on the upper and lower hulls\n std::vector upperHull;\n std::vector lowerHull;\n\n #pragma omp parallel sections\n {\n #pragma omp section\n {\n upperHull.push_back(leftmost);\n for (int i = 1; i < points.size(); i++) {\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n }\n }\n\n #pragma omp section\n {\n lowerHull.push_back(rightmost);\n for (int i = points.size() - 2; i >= 0; i--) {\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) <= 0) {\n 
lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n }\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size());\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.begin(), lowerHull.end());\n\n // Remove the duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the lower hull\n for (const Point& point : points) {\n while (hull.size() >= 2 && (hull.back() - hull[hull.size() - 2]).cross(point - hull.back()) <= 0) {\n hull.pop_back();\n }\n hull.push_back(point);\n }\n\n // Find the upper hull\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() >= 2 && (hull.back() - hull[hull.size() - 2]).cross(points[i] - hull.back()) <= 0) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n int n = points.size();\n if (n < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n int l = 0, r = 0;\n hull.push_back(points[l]);\n for (int i = 1; i < n; i++) {\n while (l >= 1 && (points[i].y - hull[l - 1].y) * (hull[l].x - hull[l - 1].x) <= (points[i].x - hull[l - 1].x) * (hull[l].y - hull[l - 1].y))\n l--;\n while (r >= 1 && (points[i].y - hull[r - 1].y) * (hull[r].x - hull[r - 1].x) <= (points[i].x - hull[r - 1].x) * (hull[r].y - hull[r - 1].y))\n r--;\n hull.push_back(points[i]);\n l++;\n r++;\n }\n\n hull.push_back(points[n - 1]);\n for (int i = n - 2; i >= 0; i--) {\n while (l >= 1 && (points[i].y - hull[l - 1].y) * (hull[l].x - hull[l - 1].x) <= (points[i].x - hull[l - 1].x) * (hull[l].y - hull[l - 1].y))\n l--;\n while (r >= 1 && (points[i].y - hull[r - 1].y) * (hull[r].x - hull[r - 1].x) <= (points[i].x - hull[r - 1].x) * (hull[r].y - hull[r - 1].y))\n r--;\n hull.push_back(points[i]);\n l++;\n r++;\n }\n\n hull.pop_back();\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n hull.pop_back();\n}\n", + "\n\n // Find the leftmost 
point\n int l = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[l].x) {\n l = i;\n }\n }\n\n int p = l, q;\n do {\n hull.push_back(points[p]);\n q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != l);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) *\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) *\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Combine the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * 
(point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n int l = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[l].x) {\n l = i;\n }\n }\n\n int p = l;\n int q;\n do {\n hull.push_back(points[p]);\n q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == -1) {\n q = i;\n }\n }\n p = q;\n } while (p != l);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n int leftMost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftMost].x) {\n leftMost = i;\n }\n }\n\n // Start from leftmost point, keep moving counterclockwise until we reach the start point again\n int p = leftMost;\n do {\n // Add current point to result\n hull.push_back(points[p]);\n\n // Search for a point 'q' such that the vector 'p-q' has the smallest positive orientation.\n int q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == 2) {\n q = i;\n }\n }\n\n // If 'q' is not 'p', then update 'p' and continue\n p = q;\n\n } while (p != leftMost);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (point.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - point.x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n 
while (lowerHull.size() >= 2 &&\n (point.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - point.x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n Point p = points[leftmost];\n hull.push_back(p);\n points.erase(points.begin() + leftmost);\n\n while (points.size() > 0) {\n int next = -1;\n for (int i = 0; i < points.size(); i++) {\n if (next == -1 || (orientation(p, points[i], points[next]) == 2)) {\n next = i;\n }\n }\n p = points[next];\n hull.push_back(p);\n points.erase(points.begin() + next);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) *\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) *\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower 
hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "serial", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Find the leftmost point\n int leftMost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftMost].x) {\n leftMost = i;\n }\n }\n\n // Start from the leftmost point and keep moving counter-clockwise until we reach the leftmost point again\n Point p = points[leftMost];\n do {\n hull.push_back(p);\n int next = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i] != p && orientation(p, points[i], points[next]) == 2) {\n next = i;\n }\n }\n p = points[next];\n } while (p != points[leftMost]);\n}\n", + "\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n int n = points.size();\n std::vector up(n), down(n);\n int m = 0, k = 0;\n\n for (int i = 0; i < n; i++) {\n if ((i == 0) || ((i == 1) && (points[i].x == points[i - 1].x) && (points[i].y == points[i - 1].y)))\n continue;\n down[m++] = points[i];\n }\n\n for (int i = n - 1; i >= 0; i--) {\n if ((i == n - 1) || ((i == n - 2) && (points[i].x == points[i + 1].x) && (points[i].y == points[i + 1].y)))\n continue;\n up[k++] = points[i];\n }\n\n hull.clear();\n int t[m + k + 1];\n int q = 0;\n for (int i = 0; i < m; i++)\n t[q++] = i;\n for (int i = k - 1; i >= 0; i--)\n t[q++] = i + m;\n t[q] = t[0];\n t[q + 1] = t[1];\n\n for (int i = 0; i <= q; i++) {\n int a = t[i], b = t[i + 1];\n if (a == b)\n continue;\n int c = t[i + 2];\n if (a != c) {\n hull.push_back(down[a]);\n continue;\n }\n int x1 = down[a].x, y1 = down[a].y;\n int x2 = down[b].x, y2 = down[b].y;\n int x3 = down[c].x, y3 = down[c].y;\n int dx1 = x2 - x1, dy1 = y2 - y1;\n int dx2 = x3 - x2, dy2 = y3 - y2;\n int cross = dx1 * dy2 - dx2 * dy1;\n if (cross < 0)\n hull.push_back(down[a]);\n else if (cross > 0)\n hull.push_back(down[b]);\n else {\n hull.push_back(down[a]);\n hull.push_back(down[b]);\n }\n }\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n int n = points.size();\n std::vector upper(n);\n std::vector lower(n);\n\n int m = 0;\n for (int i = 0; i < n; i++) {\n while (m >= 2 && (upper[m - 1].y - upper[m - 2].y) * (points[i].x - upper[m - 2].x) <=\n (upper[m - 1].x - upper[m - 2].x) * (points[i].y - upper[m - 2].y)) {\n m--;\n }\n upper[m++] = points[i];\n }\n\n m = 0;\n for (int i = n - 1; i >= 0; i--) {\n while (m >= 2 && (lower[m - 1].y - lower[m - 2].y) * (points[i].x - lower[m - 2].x) <=\n (lower[m - 1].x - lower[m - 2].x) * (points[i].y - lower[m - 2].y)) {\n m--;\n }\n lower[m++] = points[i];\n }\n\n for (int i = 0; i < m; i++) {\n hull.push_back(upper[i]);\n }\n for (int i = m 
- 2; i >= 0; i--) {\n hull.push_back(lower[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x;\n });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Find the first point in the upper hull that is to the right of the current point\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x) *\n (point.y - upperHull[upperHull.size() - 1].y) -\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Find the first point in the lower hull that is to the left of the current point\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x) *\n (point.y - lowerHull[lowerHull.size() - 1].y) -\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n hull.pop_back();\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n hull.push_back(points[0]);\n hull.push_back(points[1]);\n\n for (size_t i = 2; i < points.size(); i++) {\n while (hull.size() >= 2 &&\n (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <\n (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n int t = hull.size();\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() > t &&\n (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <\n (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n hull.pop_back();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n 
// Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n std::vector upper;\n for (auto const& point : points) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper.back(), point) <= 0) {\n upper.pop_back();\n }\n upper.push_back(point);\n }\n upper.pop_back();\n\n std::vector lower;\n for (auto const& point : points) {\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower.back(), point) >= 0) {\n lower.pop_back();\n }\n lower.push_back(point);\n }\n lower.pop_back();\n\n hull.insert(hull.end(), upper.begin(), upper.end());\n hull.insert(hull.end(), lower.rbegin(), lower.rend());\n}\n", + "\n\n if (points.size() <= 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n std::vector upper;\n for (const Point& p : points) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], p) <= 0)\n upper.pop_back();\n upper.push_back(p);\n }\n upper.pop_back();\n\n std::vector lower;\n for (int i = points.size() - 1; i >= 0; i--) {\n const Point& p = points[i];\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], p) <= 0)\n lower.pop_back();\n lower.push_back(p);\n }\n lower.pop_back();\n\n hull.insert(hull.end(), upper.begin(), upper.end());\n hull.insert(hull.end(), lower.rbegin(), lower.rend());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) == -1) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) == 1) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) 
{ return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Combine the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n int n = points.size();\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n std::vector upper;\n for (int i = 0; i < n; i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], points[i]) <= 0)\n upper.pop_back();\n upper.push_back(points[i]);\n }\n\n std::vector lower;\n for (int i = n - 1; i >= 0; i--) {\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], points[i]) <= 0)\n lower.pop_back();\n lower.push_back(points[i]);\n }\n\n for (int i = 1; i < upper.size() - 1; i++)\n hull.push_back(upper[i]);\n for (int i = lower.size() - 2; i >= 1; i--)\n hull.push_back(lower[i]);\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the 
points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull.back(), point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull.back(), point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper hull\n for (const Point& point : points) {\n while (hull.size() >= 2 && (hull.back().y - hull[hull.size() - 2].y) * (point.x - hull[hull.size() - 2].x) <=\n (hull.back().x - hull[hull.size() - 2].x) * (point.y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(point);\n }\n\n // Find the lower hull\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() >= 2 && (hull.back().y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) <=\n (hull.back().x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinate\n std::sort(points.begin(), points.end(), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper convex hull\n for (const Point &p : points) {\n while (hull.size() >= 2 && cross(hull[hull.size() - 2], hull[hull.size() - 1], p) <= 0) {\n hull.pop_back();\n }\n hull.push_back(p);\n }\n\n // Find the lower convex hull\n std::reverse(points.begin(), points.end());\n for (const Point &p : points) {\n while (hull.size() >= 2 && cross(hull[hull.size() - 2], hull[hull.size() - 1], p) <= 0) {\n hull.pop_back();\n }\n hull.push_back(p);\n }\n\n // Remove the last point from the lower convex hull\n 
hull.pop_back();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x;\n });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point a, Point b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point p : points) {\n // Build the lower hull\n while (lowerHull.size() >= 2 &&\n orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], p) <= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(p);\n\n // Build the upper hull\n while (upperHull.size() >= 2 &&\n orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], p) >= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(p);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), lowerHull.begin(), lowerHull.end());\n for (int i = upperHull.size() - 2; i >= 0; i--) {\n hull.push_back(upperHull[i]);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "hip", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\n__global__ void convexHull(const Point *points, size_t numPoints, Point *hull, size_t hullSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int numThreads = hipGetNumThreads();\n int threadIdx = hipGetThreadIdx();\n int blockIdx = hipGetBlockIdx();\n int blockDim = hipGetBlockDim();\n int gridDim = hipGetGridDim();\n\n // Each thread computes the convex hull of a subset of points\n int start = threadIdx.x + blockIdx.x * blockDim.x;\n int end = fmin(start + numThreads, numPoints);\n\n // Compute the convex hull of the subset of points\n vector subset(end - start);\n for (int i = start; i < end; i++) {\n subset[i - start] = points[i];\n }\n vector hullSubset = convexHull(subset);\n\n // Merge the convex hull of the subset of points with the overall convex hull\n for (int i = 0; i < hullSubset.size(); i++) {\n hull[hullSize + i] = hullSubset[i];\n }\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize > 2 && crossProduct(hull[hullSize - 2], hull[hullSize - 1], points[i]) <= 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size() - 2;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[i] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[upperHull.size() + i - 1] = lowerHull[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by polar angle with respect to the 
leftmost point\n std::sort(points, points + numPoints, [leftmost](const Point &a, const Point &b) {\n return atan2(a.y - points[leftmost].y, a.x - points[leftmost].x) < atan2(b.y - points[leftmost].y, b.x - points[leftmost].x);\n });\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last two points in the hull, add it to the hull\n while (hullSize >= 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < numPoints) {\n hull[gid] = points[gid];\n }\n\n __syncthreads();\n\n if (tid == 0) {\n std::sort(hull, hull + numPoints, [](const Point &a, const Point &b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n\n hullSize = m;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < numPoints) {\n hull[gid] = points[gid];\n }\n __syncthreads();\n\n if (tid == 0) {\n sort(hull, hull + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n for (int i = numPoints - 2, t = m + 1; i >= 0; i--) {\n while (t >= m + 1 && (hull[t - 1].y - hull[t - 2].y) * (hull[i].x - hull[t - 1].x) <\n (hull[t - 1].x - hull[t - 2].x) * (hull[i].y - hull[t - 1].y)) {\n t--;\n }\n hull[t++] = hull[i];\n }\n hullSize = t - m;\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n thrust::convex_hull(thrust::device, points, points + numPoints,\n std::back_inserter(upperHull), std::back_inserter(lowerHull));\n\n // Merge the upper and lower hulls into the final hull\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n\n // Return the size of the hull\n *hullSize = upperHull.size() + lowerHull.size();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while 
(lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Combine the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size() - 2;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[i] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[i + upperHull.size() - 1] = lowerHull[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the minimum point with the first point.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the remaining points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [&](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize > 2 && crossProduct(hull[hullSize - 2], hull[hullSize - 1], points[i]) <= 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by their polar angle with respect to the leftmost point\n auto cmp = [&](const Point &a, const Point &b) {\n return (a.y - points[leftmost].y) * (b.x - points[leftmost].x) - (a.x - points[leftmost].x) * (b.y - points[leftmost].y);\n };\n std::sort(points + leftmost + 1, points + numPoints, cmp);\n\n // Build the convex hull using Graham's scan\n int hullIndex = 0;\n for (int i = leftmost + 1; i < numPoints; i++) {\n while (hullIndex >= 2 && cmp(hull[hullIndex - 1], hull[hullIndex - 2], points[i])) {\n hullIndex--;\n }\n hull[hullIndex++] = points[i];\n }\n\n // Copy the convex hull to the output buffer\n std::copy(hull, hull + hullIndex, hull);\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((hull[hullSize - 1].x - points[i].x) * (hull[hullSize - 1].y - points[leftmost].y) -\n (hull[hullSize - 1].y - points[i].y) * (hull[hullSize - 1].x - points[leftmost].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n\n // If the current point is to the right of the last point in the hull, remove the last point from the hull\n else if ((hull[hullSize - 1].x - points[i].x) * (hull[hullSize - 1].y - points[leftmost].y) -\n (hull[hullSize - 1].y - points[i].y) * (hull[hullSize - 1].x - points[leftmost].x) >\n 0) {\n hullSize--;\n }\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n 
}\n\n // Initialize the hull\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the point is to the left of the line defined by the last two points in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hull[hullSize++] = points[i];\n }\n }\n\n // If the last point is not the same as the first point, add it to the hull\n if (hull[hullSize - 1] != hull[0]) {\n hull[hullSize++] = hull[0];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n\n // Find the convex hull using Graham's scan\n int hullSize = 1;\n for (int i = 1; i < numPoints; i++) {\n // Find the next point that is not on the same line as the previous two points\n while (hullSize > 1 &&\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 2].x - hull[hullSize - 1].x) <=\n (points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 2].y - hull[hullSize - 1].y)) {\n hullSize--;\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n }\n\n // If the last point in the hull is not the leftmost point, add it to the hull\n if (hull[hullSize - 1] != points[leftmost]) {\n hull[hullSize] = points[leftmost];\n hullSize++;\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Find the convex hull using Graham's scan\n int p = leftmost;\n do {\n // Find the next point that is not on the same line as the previous two points\n int q = (p + 1) % numPoints;\n for (int i = 0; i < numPoints; i++) {\n if (i != p && i != q &&\n (points[i].y - points[p].y) * (points[q].x - points[p].x) >\n (points[i].x - points[p].x) * (points[q].y - points[p].y)) {\n q = i;\n }\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[q];\n p = q;\n\n } while (p != leftmost);\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the remaining points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [&](const Point &a, const Point &b) {\n double 
angleA = atan2(a.y - points[0].y, a.x - points[0].x);\n double angleB = atan2(b.y - points[0].y, b.x - points[0].x);\n return angleA < angleB;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // TODO: Implement the convex hull algorithm\n // The kernel is launched with at least as many threads as points.\n // Store the result in `hull`.\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hullSize = 0;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[hullSize++] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[hullSize++] = lowerHull[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the minimum point with the first point.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize > 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n 
points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "mpi", + "prompt": "#include <mpi.h>\n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of points. The final hull is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector<Point> const& points, std::vector<Point> &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector<Point> localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector<Point> globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector<Point> localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector<Point> globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector<Point> localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector<Point> globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n 
convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Send the leftmost point to all other ranks\n MPI_Bcast(&leftmost, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the convex hull on each rank\n std::vector localHull;\n grahamScan(points, localHull, leftmost);\n\n // Gather all the local hulls on rank 0\n if (rank == 0) {\n hull = localHull;\n for (int i = 1; i < size; i++) {\n std::vector recvHull;\n MPI_Recv(recvHull.data(), recvHull.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n hull.insert(hull.end(), recvHull.begin(), recvHull.end());\n }\n } else {\n MPI_Send(localHull.data(), localHull.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n\n // Remove duplicate points from the hull\n std::sort(hull.begin(), hull.end());\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Find the leftmost point\n int 
leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull.push_back(points[leftmost]);\n\n // Find the convex hull using Graham's scan\n int p = leftmost;\n do {\n // Find the next point that is not on the same line as the previous two points\n int q = (p + 1) % points.size();\n while (orientation(points[p], points[q], points[(q + 1) % points.size()]) == 0) {\n q = (q + 1) % points.size();\n }\n\n // Add the next point to the hull\n hull.push_back(points[q]);\n\n // Update the current point\n p = q;\n } while (p != leftmost);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the points by x-coordinate\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n\n // Return the hull on rank 0\n if (rank == 0) {\n return hull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n if (rank == 0) {\n hull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, hull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(hull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Start from the leftmost point, keep moving counter-clockwise\n // until we reach the start point again\n int p = leftmost;\n do {\n hull.push_back(points[p]);\n int q = (p + 1) % points.size();\n\n // Find the most counter-clockwise point from p\n for (int i = 0; i < points.size(); i++) {\n // Skip if i is the same as p or q\n if (i == p || i == q) {\n continue;\n }\n\n // Calculate the cross product of (q - p) x (i - p)\n double cross = (points[q].x - points[p].x) * (points[i].y - points[p].y) -\n (points[q].y - points[p].y) * (points[i].x - points[p].x);\n\n // If the cross product is positive, then i is more counter-clockwise than q\n if (cross > 0) {\n q = i;\n }\n }\n\n // Update p to be q\n p = q;\n } while (p != leftmost);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n 
MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the leftmost point\n Point leftmost = points[0];\n for (size_t i = 1; i < points.size(); i++) {\n if (points[i].x < leftmost.x) {\n leftmost = points[i];\n }\n }\n\n // Broadcast the leftmost point to all ranks\n MPI_Bcast(&leftmost, sizeof(Point), MPI_BYTE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull on each rank\n std::vector localHull;\n convexHull(points, leftmost, localHull);\n\n // Gather the local convex hull from all ranks\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "cuda", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\n__global__ void convexHull(const Point *points, size_t numPoints, Point *hull, size_t hullSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n if (tid == 0) {\n qsort(hull, numPoints, sizeof(Point), [](const void *a, const void *b) {\n Point *pa = (Point *)a;\n Point *pb = (Point *)b;\n if (pa->x != pb->x) {\n return pa->x < pb->x ? -1 : 1;\n }\n return pa->y < pb->y ? 
-1 : 1;\n });\n int m = 0;\n for (int i = 0; i < numPoints; ++i) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m].y - hull[m - 1].y) * (hull[m - 1].x - hull[m - 2].x)) {\n --m;\n }\n hull[m++] = hull[i];\n }\n hullSize = m;\n }\n __syncthreads();\n if (tid < hullSize) {\n hull[tid] = hull[tid];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(b.y - points[0].y, b.x - points[0].x) < atan2(a.y - points[0].y, a.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHull(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.begin(), lowerHull.end(), hull + upperHull.size());\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the point with the 
minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n double angleA = atan2(a.y - points[0].y, a.x - points[0].x);\n double angleB = atan2(b.y - points[0].y, b.x - points[0].x);\n return angleA < angleB;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n\n if (numPoints <= 1) {\n return;\n }\n\n // Sort the points by their x-coordinates\n thrust::sort(hull, hull + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the leftmost and rightmost points\n Point leftmost = hull[0];\n Point rightmost = hull[numPoints - 1];\n\n // Find the convex hull of the points to the left of the leftmost point\n int leftHullSize = 0;\n convexHull(hull, numPoints - 1, hull, leftHullSize);\n\n // Find the convex hull of the points to the right of the rightmost point\n int rightHullSize = 0;\n convexHull(hull + 1, numPoints - 1, hull + leftHullSize, rightHullSize);\n\n // Merge the two convex hulls\n int hullIndex = 0;\n int leftIndex = 0;\n int rightIndex = 0;\n while (leftIndex < leftHullSize && rightIndex < rightHullSize) {\n if (hull[leftIndex].y < hull[rightIndex].y) {\n hull[hullIndex++] = hull[leftIndex++];\n } else {\n hull[hullIndex++] = hull[rightIndex++];\n }\n }\n while (leftIndex < leftHullSize) {\n hull[hullIndex++] = hull[leftIndex++];\n }\n while (rightIndex < rightHullSize) {\n hull[hullIndex++] = hull[rightIndex++];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device_ptr(points), thrust::device_ptr(points + numPoints),\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(thrust::device_ptr(points), numPoints, &upperHull, &lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), thrust::device_ptr(hull));\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), thrust::device_ptr(hull + upperHull.size()));\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first 
point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current convex hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n double angle1 = atan2(a.y - points[0].y, a.x - points[0].x);\n double angle2 = atan2(b.y - points[0].y, b.x - points[0].x);\n return angle1 < angle2;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Start from the leftmost point and keep moving counter-clockwise\n int hullSize = 0;\n int current = leftmost;\n do {\n // Add the current point to the hull\n hull[hullSize] = points[current];\n hullSize++;\n\n // Find the next point that is most counter-clockwise from the current point\n int next = (current + 1) % numPoints;\n for (int i = 0; i < numPoints; i++) {\n if (orientation(points[current], points[next], points[i]) == 2) {\n next = i;\n }\n }\n\n // Set the current point to the next point\n current = next;\n } while (current != leftmost);\n\n // Return the size of the hull\n return hullSize;\n}\n", + "\n\n // Sort the points by their x-coordinate\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, 
points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), hull);\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= numPoints) {\n return;\n }\n\n // Find the point with the smallest y-coordinate\n int minIdx = idx;\n for (int i = idx + 1; i < numPoints; i++) {\n if (points[i].y < points[minIdx].y) {\n minIdx = i;\n }\n }\n\n // Swap the point with the smallest y-coordinate to the front of the array\n Point temp = points[idx];\n points[idx] = points[minIdx];\n points[minIdx] = temp;\n\n // Sort the points by their angle with respect to the point with the smallest y-coordinate\n std::sort(points + idx + 1, points + numPoints, [points, idx](const Point &a, const Point &b) {\n return (a.x - points[idx].x) * (b.y - points[idx].y) > (b.x - points[idx].x) * (a.y - points[idx].y);\n });\n\n // Initialize the hull with the first three points\n hull[0] = points[idx];\n hull[1] = points[idx + 1];\n hull[2] = points[idx + 2];\n int hullSize = 3;\n\n // Iterate over the remaining points\n for (int i = idx + 3; i < numPoints; i++) {\n // Check if the current point is inside the hull\n while (hullSize > 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) <=\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x)) {\n hullSize--;\n }\n\n // Add the current point to the hull\n hull[hullSize] = points[i];\n hullSize++;\n }\n\n // Return the hull\n hullSize = hullSize;\n}\n\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device_ptr(points), thrust::device_ptr(points + numPoints),\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), thrust::device_ptr(hull));\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), thrust::device_ptr(hull + upperHull.size()));\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n\n // Find the convex hull using Graham's scan\n int hullSize = 1;\n for (int i = 1; i < numPoints; i++) {\n // Find the next point that is not on the same line as the previous two points\n while (hullSize >= 2 &&\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 2].x - hull[hullSize - 1].x) 
<=\n (points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 2].y - hull[hullSize - 1].y)) {\n hullSize--;\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates.\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the leftmost point.\n Point leftmostPoint = points[0];\n\n // Find the rightmost point.\n Point rightmostPoint = points[numPoints - 1];\n\n // Find the points that are above the line defined by the leftmost and rightmost points.\n thrust::device_vector pointsAboveLine;\n thrust::copy_if(thrust::device, points, points + numPoints,\n thrust::make_device_vector(pointsAboveLine),\n [leftmostPoint, rightmostPoint](const Point &point) {\n return point.y > leftmostPoint.y + (rightmostPoint.y - leftmostPoint.y) *\n (point.x - leftmostPoint.x) /\n (rightmostPoint.x - leftmostPoint.x);\n });\n\n // Find the points that are below the line defined by the leftmost and rightmost points.\n thrust::device_vector pointsBelowLine;\n thrust::copy_if(thrust::device, points, points + numPoints,\n thrust::make_device_vector(pointsBelowLine),\n [leftmostPoint, rightmostPoint](const Point &point) {\n return point.y < leftmostPoint.y + (rightmostPoint.y - leftmostPoint.y) *\n (point.x - leftmostPoint.x) /\n (rightmostPoint.x - leftmostPoint.x);\n });\n\n // Find the convex hull of the points above the line.\n thrust::device_vector hullAboveLine;\n convexHull(thrust::device_ptr(pointsAboveLine.data()), pointsAboveLine.size(),\n thrust::device_ptr(hullAboveLine.data()), hullAboveLine.size());\n\n // Find the convex hull of the points below the line.\n thrust::device_vector hullBelowLine;\n convexHull(thrust::device_ptr(pointsBelowLine.data()), pointsBelowLine.size(),\n thrust::device_ptr(hullBelowLine.data()), hullBelowLine.size());\n\n // Merge the two convex hulls.\n thrust::copy(thrust::device, hullAboveLine.begin(), hullAboveLine.end(), hull);\n thrust::copy(thrust::device, hullBelowLine.begin(), hullBelowLine.end(), hull + hullAboveLine.size());\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(hull, hull + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <= (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n hullSize = m;\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < 
numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n }\n\n // If the hull has more than 2 points, remove the last point\n if (hullSize > 2) {\n hullSize--;\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "mpi+omp", + "prompt": "#include <mpi.h>\n#include <omp.h>\n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of points. The final hull is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector<Point> const& points, std::vector<Point> &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector<Point> localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector<Point> globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector<Point> localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector<Point> globalHull;\n if (rank == 0) {\n globalHull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector<Point> localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector<Point> globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector<Point> localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector<Point> globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n\n // Return the hull on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return hull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = points.size();\n std::vector localHull;\n // Find the convex hull of the local points\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n std::vector upper, lower;\n for (int i = 0; i < n; i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], points[i]) <= 0)\n upper.pop_back();\n upper.push_back(points[i]);\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], points[i]) >= 0)\n lower.pop_back();\n lower.push_back(points[i]);\n }\n localHull.insert(localHull.end(), upper.begin(), upper.end());\n for (int i = lower.size() - 2; i >= 0; i--)\n localHull.push_back(lower[i]);\n }\n }\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n // Find the convex hull of the global points\n if (rank == 0) {\n std::sort(globalHull.begin(), globalHull.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n std::vector upper, lower;\n for (int i = 0; i < globalHull.size(); i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], globalHull[i]) <= 0)\n upper.pop_back();\n upper.push_back(globalHull[i]);\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], globalHull[i]) >= 0)\n lower.pop_back();\n lower.push_back(globalHull[i]);\n }\n hull.insert(hull.end(), upper.begin(), upper.end());\n for (int i = lower.size() - 2; i >= 0; i--)\n hull.push_back(lower[i]);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of 
the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by polar angle with respect to the leftmost point\n std::sort(points.begin(), points.end(), [leftmost](Point a, Point b) {\n double dx1 = a.x - points[leftmost].x;\n double dy1 = a.y - points[leftmost].y;\n double dx2 = b.x - points[leftmost].x;\n double dy2 = b.y - points[leftmost].y;\n return (atan2(dy1, dx1) < atan2(dy2, dx2));\n });\n\n // Use Graham's scan to find the convex hull\n hull.push_back(points[leftmost]);\n int m = 1;\n for (int i = 1; i < points.size(); i++) {\n while (m >= 2 && (points[i].x - hull[m - 1].x) * (hull[m].y - hull[m - 1].y) - (points[i].y - hull[m - 1].y) * (hull[m].x - hull[m - 1].x) <= 0) {\n hull.pop_back();\n m--;\n }\n hull.push_back(points[i]);\n m++;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local 
hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Reduce the local hulls to a single global hull\n std::vector globalHull;\n MPI_Reduce(&localHull[0], &globalHull[0], localHull.size() * sizeof(Point), MPI_BYTE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // If this is rank 0, copy the global hull to the output hull\n if (rank == 0) {\n hull = globalHull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(points.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Find the convex hull of the points on each rank\n std::vector local_hull;\n convexHull(points, local_hull);\n\n // Gather all the local hulls on rank 0\n std::vector global_hull;\n if (my_rank == 0) {\n global_hull.resize(local_hull.size() * num_procs);\n }\n MPI_Gather(local_hull.data(), local_hull.size(), MPI_DOUBLE, global_hull.data(), local_hull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (my_rank == 0) {\n convexHull(global_hull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n if (rank == 
0) {\n hull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, hull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(hull, hull);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(*convexHull[i], *convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n\t\tKOKKOS_LAMBDA(const int& i, const int& j) {\n\t\t\treturn points(i)->x < points(j)->x;\n\t\t});\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\thull.push_back(points(0));\n\thull.push_back(points(1));\n\tfor (int i = 2; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(hull[hull.size() - 2]->x - hull[hull.size() - 1]->x) * (points(i)->y - hull[hull.size() - 1]->y) -\n\t\t\t(hull[hull.size() - 2]->y - hull[hull.size() - 1]->y) * (points(i)->x - hull[hull.size() - 1]->x) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::TeamPolicy>;\n\tusing team_member_type = typename policy_type::member_type;\n\n\tconst size_t num_points = points.extent(0);\n\tconst size_t num_teams = (num_points + policy_type::team_size - 1) / policy_type::team_size;\n\n\t// Find the leftmost point\n\tteam_member_type team_member;\n\tPoint leftmost_point = points(0);\n\tfor (size_t i = 1; i < num_points; ++i) {\n\t\tif (points(i).x < leftmost_point.x) {\n\t\t\tleftmost_point = points(i);\n\t\t}\n\t}\n\n\t// Sort the points by polar angle with respect to the leftmost point\n\tstd::vector sorted_points(num_points);\n\tfor (size_t i = 0; i < num_points; ++i) {\n\t\tsorted_points[i] = points(i);\n\t}\n\tstd::sort(sorted_points.begin(), sorted_points.end(), 
[leftmost_point](Point const& p1, Point const& p2) {\n\t\tdouble angle1 = std::atan2(p1.y - leftmost_point.y, p1.x - leftmost_point.x);\n\t\tdouble angle2 = std::atan2(p2.y - leftmost_point.y, p2.x - leftmost_point.x);\n\t\treturn angle1 < angle2;\n\t});\n\n\t// Compute the perimeter of the convex hull using Graham's scan\n\tstd::vector convex_hull;\n\tconvex_hull.push_back(sorted_points[0]);\n\tconvex_hull.push_back(sorted_points[1]);\n\tfor (size_t i = 2; i < num_points; ++i) {\n\t\twhile (convex_hull.size() >= 2 &&\n\t\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t (sorted_points[i].y - convex_hull[convex_hull.size() - 1].y) -\n\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) *\n\t\t\t\t (sorted_points[i].x - convex_hull[convex_hull.size() - 1].x) < 0) {\n\t\t\tconvex_hull.pop_back();\n\t\t}\n\t\tconvex_hull.push_back(sorted_points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convex_hull.size(); ++i) {\n\t\tperimeter += distance(convex_hull[i], convex_hull[(i + 1) % convex_hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.data(), points.data() + points.extent(0), [](const Point* p1, const Point* p2) { return p1->x < p2->x; });\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector hull;\n\tfor (int i = 0; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tconst Point* p1 = hull[i];\n\t\t\tconst Point* p2 = hull[i + 1];\n\t\t\tconst Point* p3 = points[i];\n\t\t\tdouble crossProduct = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\tsum += crossProduct;\n\t\t}, Kokkos::Sum(0.0)) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tfor (int i = points.extent(0) - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tconst Point* p1 = hull[i];\n\t\t\tconst Point* p2 = hull[i + 1];\n\t\t\tconst Point* p3 = points[i];\n\t\t\tdouble crossProduct = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\tsum += crossProduct;\n\t\t}, Kokkos::Sum(0.0)) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points\n\tstd::vector hull;\n\tconvex_hull(points.data(), points.size(), std::back_inserter(hull));\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n\t\tKOKKOS_LAMBDA(const int& i, const int& j) {\n\t\t\treturn points(i)->x < points(j)->x;\n\t\t});\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points(0));\n\tconvexHull.push_back(points(points.extent(0) - 1));\n\tfor (int i = 1; i < points.extent(0) - 
1; i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2]->y - convexHull[convexHull.size() - 1]->y) *\n\t\t\t(convexHull[convexHull.size() - 1]->x - points(i)->x) <\n\t\t\t(convexHull[convexHull.size() - 1]->y - points(i)->y) *\n\t\t\t(convexHull[convexHull.size() - 2]->x - convexHull[convexHull.size() - 1]->x)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(*convexHull[i], *convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tgrahamScan(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Find the convex hull of the points\n\tstd::vector hull_points;\n\tconvex_hull(points, hull_points);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull_points.size(); ++i) {\n\t\tsize_t j = (i + 1) % hull_points.size();\n\t\tperimeter += distance(hull_points[i], hull_points[j]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points\n\tstd::vector hull;\n\tconvex_hull(points.data(), points.extent(0), std::back_inserter(hull));\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.extent(0));\n\tif (points.extent(0) < 3) {\n\t\tfor (int i = 0; i < points.extent(0); ++i) {\n\t\t\tconvexHull.push_back(points[i]);\n\t\t}\n\t} else {\n\t\t// Find the leftmost point.\n\t\tint leftmost = 0;\n\t\tfor (int i = 1; i < points.extent(0); ++i) {\n\t\t\tif (points[i].x < points[leftmost].x) {\n\t\t\t\tleftmost = i;\n\t\t\t}\n\t\t}\n\n\t\t// Start from the leftmost point and keep moving clockwise until we reach the leftmost point again.\n\t\tint current = leftmost;\n\t\tdo {\n\t\t\tconvexHull.push_back(points[current]);\n\t\t\tint next = (current + 1) % points.extent(0);\n\t\t\tfor (int i = 0; i < points.extent(0); ++i) {\n\t\t\t\tif (orientation(points[current], points[next], points[i]) == 2) {\n\t\t\t\t\tnext = i;\n\t\t\t\t}\n\t\t\t}\n\t\t\tcurrent = next;\n\t\t} while (current != 
leftmost);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int& i, const int& j) {\n\t\treturn points(i)->x < points(j)->x;\n\t});\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\tfor (int i = 0; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tPoint* p1 = hull[i];\n\t\t\tPoint* p2 = hull[i + 1];\n\t\t\tPoint* p3 = points(i);\n\t\t\tdouble cross = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\treturn sum + cross;\n\t\t}, 0.0) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "omp", + "prompt": 
"#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[points.size() - 1]);\n\n\tfor (int i = 1; i < points.size() - 1; i++) {\n\t\tPoint const& p = points[i];\n\t\tif (convexHull.size() < 2 || (convexHull.back().x - convexHull[convexHull.size() - 2].x) * (p.y - convexHull.back().y) - (convexHull.back().y - convexHull[convexHull.size() - 2].y) * (p.x - convexHull.back().x) > 0) {\n\t\t\tconvexHull.push_back(p);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && crossProduct(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper 
and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && (upperHull.back().y - upperHull[upperHull.size() - 2].y) * (point.x - upperHull.back().x) < (upperHull.back().x - upperHull[upperHull.size() - 2].x) * (point.y - upperHull.back().y)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\t\twhile (lowerHull.size() >= 2 && (lowerHull.back().y - lowerHull[lowerHull.size() - 2].y) * (point.x - lowerHull.back().x) > (lowerHull.back().x - lowerHull[lowerHull.size() - 2].x) * (point.y - lowerHull.back().y)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 2; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\tfor (int i = 2; i < n; i++) {\n\t\twhile (hull.size() >= 2 && crossProduct(hull[hull.size() - 2], hull[hull.size() - 1], points[i]) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t(points[i].y - convexHull[convexHull.size() - 1].y) 
-\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t(points[i].x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tstd::vector hull;\n\tstd::vector visited(points.size(), false);\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\tint current = leftmost;\n\tdo {\n\t\thull.push_back(points[current]);\n\t\tvisited[current] = true;\n\n\t\t// Find the next point on the hull\n\t\tint next = -1;\n\t\tfor (int i = 0; i < points.size(); i++) {\n\t\t\tif (!visited[i] && (next == -1 || points[i].x * hull.back().y - points[i].y * hull.back().x > points[next].x * hull.back().y - points[next].y * hull.back().x)) {\n\t\t\t\tnext = i;\n\t\t\t}\n\t\t}\n\t\tcurrent = next;\n\t} while (current != leftmost);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[i + 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) 
{\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (point.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) >\n\t\t\t\t (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - point.x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (point.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) <\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - point.x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n\t\t\t\t (point.x - upperHull[upperHull.size() - 1].x) <=\n\t\t\t\t (upperHull[upperHull.size() - 1].y - point.y) *\n\t\t\t\t\t (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n\t\t\t\t (point.x - lowerHull[lowerHull.size() - 1].x) >=\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - point.y) *\n\t\t\t\t\t (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n < 3) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::vector visited(n, false);\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\tint p = leftmost;\n\tdo {\n\t\thull.push_back(points[p]);\n\t\tvisited[p] = true;\n\t\tint q = (p + 1) % n;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\tif (!visited[i] && (orientation(points[p], points[i], points[q]) == 2)) {\n\t\t\t\tq = i;\n\t\t\t}\n\t\t}\n\t\tp = q;\n\t} while (p != leftmost);\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % 
hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (Point const& p : points) {\n\t\twhile (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], p) <= 0)\n\t\t\tupperHull.pop_back();\n\t\tupperHull.push_back(p);\n\n\t\twhile (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], p) >= 0)\n\t\t\tlowerHull.pop_back();\n\t\tlowerHull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::vector visited(n, false);\n\tint p = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x < points[p].x) {\n\t\t\tp = i;\n\t\t}\n\t}\n\tint q = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x > points[q].x) {\n\t\t\tq = i;\n\t\t}\n\t}\n\thull.push_back(points[p]);\n\thull.push_back(points[q]);\n\tvisited[p] = true;\n\tvisited[q] = true;\n\twhile (p != q) {\n\t\tint next = -1;\n\t\tdouble max_dist = -1;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\tif (!visited[i]) {\n\t\t\t\tdouble dist = distance(points[i], hull.back());\n\t\t\t\tif (dist > max_dist) {\n\t\t\t\t\tmax_dist = dist;\n\t\t\t\t\tnext = i;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\thull.push_back(points[next]);\n\t\tvisited[next] = true;\n\t\tp = q;\n\t\tq = next;\n\t}\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && crossProduct(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinate\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && 
crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tupperHull.push_back(points[0]);\n\tupperHull.push_back(points[1]);\n\tlowerHull.push_back(points[0]);\n\tlowerHull.push_back(points[n - 1]);\n\n\tfor (int i = 2; i < n; i++) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - upperHull[upperHull.size() - 1].x) <=\n\t\t\t\t (upperHull[upperHull.size() - 1].y - points[i].y) *\n\t\t\t\t\t (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - lowerHull[lowerHull.size() - 1].x) >=\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - points[i].y) *\n\t\t\t\t\t (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (int i = 0; i < lowerHull.size() - 1; i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tperimeter += distance(upperHull[upperHull.size() - 1], lowerHull[lowerHull.size() - 1]);\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(points[hull[i]], points[hull[(i + 1) % hull.size()]]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the lower and upper hulls\n\tstd::vector lowerHull, upperHull;\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) == -1) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) == 1) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < lowerHull.size() - 1; ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tfor (size_t i = upperHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(upperHull[i], 
upperHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (auto& point : points) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t (point.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t\t (hull.back().y - hull[hull.size() - 2].y) * (point.x - hull.back().x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(point);\n\t}\n\n\tint t = hull.size() - 1;\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() > t &&\n\t\t\t (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t\t (hull.back().y - hull[hull.size() - 2].y) * (points[i].x - hull.back().x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <= (points[i].x - hull.back().x) * (hull[hull.size() - 2].y - hull.back().y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (p.x - hull[hull.size() - 2].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) 
{\n\t\treturn 0.0;\n\t}\n\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan algorithm\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && orientation(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.empty()) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t(points[i].y - hull.back().y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() < 3) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n\t\treturn a.x < b.x;\n\t});\n\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n\t\treturn a.y < b.y;\n\t});\n\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) 
{\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (p.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (p.y - hull[hull.size() - 2].y) * (hull.back().x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (p.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (p.y - hull[hull.size() - 2].y) * (hull.back().x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (points[i].x - hull.back().x) * (hull[hull.size() - 2].y - hull.back().y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the lower convex hull\n\tfor (Point const& point : points) {\n\t\twhile (hull.size() >= 2 && (point.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) > (point.y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - point.x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(point);\n\t}\n\n\t// Find the upper convex hull\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) > (points[i].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 
2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\n\tfor (Point const& p : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t(p.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) >\n\t\t\t(upperHull[upperHull.size() - 1].y - upperHull[upperHull.size() - 2].y) * (p.x - upperHull[upperHull.size() - 2].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(p);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t(p.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) <\n\t\t\t(lowerHull[lowerHull.size() - 1].y - lowerHull[lowerHull.size() - 2].y) * (p.x - lowerHull[lowerHull.size() - 2].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(p);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; i--) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y) - (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the leftmost and rightmost points\n\tPoint leftmost = points[0];\n\tPoint rightmost = points[points.size() - 1];\n\n\t// Add the leftmost and 
rightmost points to the hull\n\thull.push_back(leftmost);\n\thull.push_back(rightmost);\n\n\t// Find the upper and lower tangents of the convex hull\n\tstd::vector upperTangent;\n\tstd::vector lowerTangent;\n\n\t// Find the upper tangent\n\tfor (int i = 1; i < points.size(); i++) {\n\t\twhile (upperTangent.size() >= 2 && crossProduct(upperTangent[upperTangent.size() - 2], upperTangent[upperTangent.size() - 1], points[i]) <= 0) {\n\t\t\tupperTangent.pop_back();\n\t\t}\n\t\tupperTangent.push_back(points[i]);\n\t}\n\n\t// Find the lower tangent\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (lowerTangent.size() >= 2 && crossProduct(lowerTangent[lowerTangent.size() - 2], lowerTangent[lowerTangent.size() - 1], points[i]) <= 0) {\n\t\t\tlowerTangent.pop_back();\n\t\t}\n\t\tlowerTangent.push_back(points[i]);\n\t}\n\n\t// Merge the upper and lower tangents into the hull\n\tfor (int i = 0; i < lowerTangent.size(); i++) {\n\t\thull.push_back(lowerTangent[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y) - (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(points[i].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += 
distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the lower hull\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(p.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(hull[hull.size() - 1].y - p.y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Find the upper hull\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(p.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(hull[hull.size() - 1].y - p.y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) -\n\t\t\t(points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distance(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Store the result in 
perimeter.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\n__global__ void convexHullPerimeter(const Point *points, size_t numPoints, double *perimeter) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points.\n\tsize_t numVertices;\n\tPoint *vertices;\n\tconvexHull(points, numPoints, vertices, numVertices);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0.0;\n\tfor (size_t i = 0; i < numVertices; i++) {\n\t\t*perimeter += distance(vertices[i], vertices[i + 1]);\n\t}\n}\n", + "\n\n\tPoint p0 = points[0];\n\tdouble minX = p0.x;\n\tfor (size_t i = 1; i < numPoints; i++) {\n\t\tif (points[i].x < minX) {\n\t\t\tminX = points[i].x;\n\t\t\tp0 = points[i];\n\t\t}\n\t}\n\n\tstd::vector hull;\n\thull.push_back(p0);\n\n\tPoint next = p0;\n\tdo {\n\t\tPoint bestPoint = points[0];\n\t\tdouble maxAngle = -1;\n\t\tfor (size_t i = 1; i < numPoints; i++) {\n\t\t\tif (points[i] == next) continue;\n\t\t\tdouble angle = atan2(points[i].y - next.y, points[i].x - next.x);\n\t\t\tif (angle > maxAngle) {\n\t\t\t\tmaxAngle = angle;\n\t\t\t\tbestPoint = points[i];\n\t\t\t}\n\t\t}\n\t\thull.push_back(bestPoint);\n\t\tnext = bestPoint;\n\t} while (next != p0);\n\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tstd::vector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tPoint p0 = points[0];\n\tdouble minX = p0.x, maxX = p0.x;\n\tdouble minY = p0.y, maxY = p0.y;\n\tfor (size_t i = 1; i < numPoints; i++) {\n\t\tPoint p = points[i];\n\t\tminX = min(minX, p.x);\n\t\tmaxX = max(maxX, p.x);\n\t\tminY = min(minY, p.y);\n\t\tmaxY = max(maxY, p.y);\n\t}\n\n\tvector upper, lower;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tPoint p = points[i];\n\t\tif (p.y == minY) {\n\t\t\tlower.push_back(p);\n\t\t}\n\t\tif (p.y == maxY) {\n\t\t\tupper.push_back(p);\n\t\t}\n\t}\n\n\tsort(lower.begin(), lower.end(), 
[](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tsort(upper.begin(), upper.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\tvector ch;\n\tch.push_back(lower[0]);\n\tfor (size_t i = 1; i < lower.size(); i++) {\n\t\twhile (ch.size() >= 2 && (ch.back().y - ch[ch.size() - 2].y) * (lower[i].x - ch.back().x) < (ch.back().x - ch[ch.size() - 2].x) * (lower[i].y - ch.back().y)) {\n\t\t\tch.pop_back();\n\t\t}\n\t\tch.push_back(lower[i]);\n\t}\n\n\tfor (size_t i = upper.size() - 1; i >= 0; i--) {\n\t\twhile (ch.size() >= 2 && (ch.back().y - ch[ch.size() - 2].y) * (upper[i].x - ch.back().x) < (ch.back().x - ch[ch.size() - 2].x) * (upper[i].y - ch.back().y)) {\n\t\t\tch.pop_back();\n\t\t}\n\t\tch.push_back(upper[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < ch.size(); i++) {\n\t\tperimeter += distance(ch[i], ch[(i + 1) % ch.size()]);\n\t}\n\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint numBlocks = hipGridDim_x;\n\tint numThreads = hipBlockDim_x;\n\n\tPoint p1 = points[tid];\n\tdouble minDist = DBL_MAX;\n\tPoint p2;\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tif (i == tid) continue;\n\t\tPoint p3 = points[i];\n\t\tdouble dist = distance(p1, p3);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tp2 = p3;\n\t\t}\n\t}\n\n\t__shared__ double sharedMinDist[numBlocks];\n\t__shared__ Point sharedP2[numBlocks];\n\n\tsharedMinDist[bid] = minDist;\n\tsharedP2[bid] = p2;\n\n\t__syncthreads();\n\n\tdouble globalMinDist = DBL_MAX;\n\tPoint globalP2;\n\tfor (int i = 0; i < numBlocks; i++) {\n\t\tif (sharedMinDist[i] < globalMinDist) {\n\t\t\tglobalMinDist = sharedMinDist[i];\n\t\t\tglobalP2 = sharedP2[i];\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*perimeter = 2 * globalMinDist + distance(p1, globalP2);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < numPoints; ++i) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\t// Initialize the convex hull with the leftmost point\n\tstd::vector hull;\n\thull.push_back(points[leftmost]);\n\n\t// Find the convex hull using Graham's scan\n\tint p = leftmost;\n\tdo {\n\t\tint q = (p + 1) % numPoints;\n\t\tfor (int i = 0; i < numPoints; ++i) {\n\t\t\tif (i == p || i == q) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tif (orientation(points[p], points[q], points[i]) == -1) {\n\t\t\t\tq = i;\n\t\t\t}\n\t\t}\n\t\thull.push_back(points[q]);\n\t\tp = q;\n\t} while (p != leftmost);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (int i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < numPoints; i++) {\n\t\tif (points[i].x < points[leftmost].x) 
{\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\t// Initialize the convex hull with the leftmost point\n\tvector hull = { points[leftmost] };\n\n\t// Find the next point to add to the convex hull\n\twhile (true) {\n\t\t// Find the point that is farthest from the line connecting the last two points in the convex hull\n\t\tint farthest = -1;\n\t\tdouble maxDistance = 0;\n\t\tfor (int i = 0; i < numPoints; i++) {\n\t\t\t// Skip points that are already in the convex hull\n\t\t\tif (find(hull.begin(), hull.end(), points[i]) != hull.end()) {\n\t\t\t\tcontinue;\n\t\t\t}\n\n\t\t\t// Compute the distance from the point to the line connecting the last two points in the convex hull\n\t\t\tPoint p1 = hull[hull.size() - 2];\n\t\t\tPoint p2 = hull[hull.size() - 1];\n\t\t\tdouble distance = fabs((p2.y - p1.y) * points[i].x - (p2.x - p1.x) * points[i].y + p2.x * p1.y - p2.y * p1.x) / sqrt(pow(p2.y - p1.y, 2) + pow(p2.x - p1.x, 2));\n\n\t\t\t// If the distance is greater than the maximum distance so far, update the farthest point and the maximum distance\n\t\t\tif (distance > maxDistance) {\n\t\t\t\tfarthest = i;\n\t\t\t\tmaxDistance = distance;\n\t\t\t}\n\t\t}\n\n\t\t// If no point is farther from the line connecting the last two points in the convex hull, the convex hull is complete\n\t\tif (farthest == -1) {\n\t\t\tbreak;\n\t\t}\n\n\t\t// Add the farthest point to the convex hull\n\t\thull.push_back(points[farthest]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[i + 1]);\n\t}\n\n\t// Store the perimeter in the output variable\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points, points + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && (upperHull[upperHull.size() - 1].y - upperHull[upperHull.size() - 2].y) * (points[i].x - upperHull[upperHull.size() - 2].x) < (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) * (points[i].y - upperHull[upperHull.size() - 2].y)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && (lowerHull[lowerHull.size() - 1].y - lowerHull[lowerHull.size() - 2].y) * (points[i].x - lowerHull[lowerHull.size() - 2].x) > (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) * (points[i].y - lowerHull[lowerHull.size() - 2].y)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\t*perimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\t*perimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = 
perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tvector hull;\n\tfor (size_t i = 0; i < numPoints; ++i) {\n\t Point p = points[i];\n\t while (hull.size() >= 2 && crossProduct(hull[hull.size() - 2], hull.back(), p) <= 0) {\n\t hull.pop_back();\n\t }\n\t hull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinate\n\tstd::sort(points, points + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\t*perimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\t*perimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points\n\tPoint *hull = new Point[numPoints];\n\tint hullSize = convexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hullSize; i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hullSize]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn 
std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[points.size() - 1]);\n\tfor (int i = 1; i < points.size() - 1; i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::vector hullIndices;\n\thullIndices.reserve(points.size());\n\tgrahamScan(points, convexHull, hullIndices);\n\n\t// Compute the perimeter of the convex 
hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint nextIndex = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[nextIndex]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2) {\n\t\t\tPoint p1 = convexHull[convexHull.size() - 2];\n\t\t\tPoint p2 = convexHull[convexHull.size() - 1];\n\t\t\tPoint p3 = points[i];\n\t\t\tdouble cross = (p2.x - p1.x) * (p3.y - p2.y) - (p2.y - p1.y) * (p3.x - p2.x);\n\t\t\tif (cross > 0) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint left = 0, right = points.size() - 1;\n\twhile (left < right) {\n\t\tconvexHull.push_back(points[left++]);\n\t\twhile (left < right && (convexHull.back().y - points[left].y) * (convexHull.back().x - points[right].x) <= (convexHull.back().y - points[right].y) * (convexHull.back().x - points[left].x)) {\n\t\t\tright--;\n\t\t}\n\t\tconvexHull.push_back(points[right--]);\n\t\twhile (left < right && (convexHull.back().y - points[left].y) * (convexHull.back().x - points[right].x) <= (convexHull.back().y - points[right].y) * (convexHull.back().x - points[left].x)) {\n\t\t\tleft++;\n\t\t}\n\t}\n\tconvexHull.push_back(points[left]);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif 
(convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Send the convex hull to rank 0\n\tstd::vector globalConvexHull;\n\tMPI_Gather(&convexHull[0], convexHull.size(), MPI_DOUBLE, &globalConvexHull[0], globalConvexHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Compute the perimeter of the convex hull on rank 0\n\tif (rank == 0) {\n\t\tdouble perimeter = 0;\n\t\tfor (size_t i = 0; i < globalConvexHull.size(); i++) {\n\t\t\tperimeter += distance(globalConvexHull[i], globalConvexHull[(i + 1) % globalConvexHull.size()]);\n\t\t}\n\t\treturn perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint const rank = MPI::COMM_WORLD.Get_rank();\n\tint const size = MPI::COMM_WORLD.Get_size();\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tconvexHull.push_back(points[0]);\n\tfor (size_t i = 1; i < points.size(); ++i) {\n\t\t// Find the first point in the convex hull that is to the left of the line through the last two points\n\t\tsize_t j = convexHull.size() - 1;\n\t\twhile (j > 0 && (convexHull[j].x - convexHull[j-1].x) * (points[i].y - convexHull[j].y) -\n\t\t\t(convexHull[j].y - convexHull[j-1].y) * (points[i].x - convexHull[j].x) < 0) {\n\t\t\t--j;\n\t\t}\n\t\t// Add the point to the convex hull\n\t\tconvexHull.insert(convexHull.begin() + j + 1, points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI::COMM_WORLD.Reduce(&perimeter, &globalPerimeter, 1, MPI::DOUBLE, MPI::SUM, 0);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeters from all ranks to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the global perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the total perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} 
else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (auto const& point : points) {\n\t\tif (convexHull.empty()) {\n\t\t\tconvexHull.push_back(point);\n\t\t} else {\n\t\t\twhile (convexHull.size() >= 2 &&\n\t\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t\t(point.y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t(point.x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\t\tconvexHull.pop_back();\n\t\t\t}\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2 && (points[next] - convexHull.back()).cross(points[i] - convexHull.back()) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || convexHull.back() != point) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t\twhile (convexHull.size() >= 3 &&\n\t\t\t (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 3].x) *\n\t\t\t\t\t (point.y - convexHull[convexHull.size() - 2].y) -\n\t\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 3].y) *\n\t\t\t\t\t (point.x - convexHull[convexHull.size() - 2].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return 
the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Find the total perimeter of the convex hulls on all ranks\n\tdouble totalPerimeter = 0;\n\tfor (int i = 0; i < size; i++) {\n\t\tif (i == rank) {\n\t\t\tfor (int j = 0; j < convexHull.size(); j++) {\n\t\t\t\ttotalPerimeter += distance(convexHull[j], convexHull[(j + 1) % convexHull.size()]);\n\t\t\t}\n\t\t}\n\t\tMPI_Reduce(&totalPerimeter, &totalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\t}\n\n\t// Return the total perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn totalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (auto const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Reduce the convex hulls from all ranks to rank 0\n\tstd::vector globalConvexHull;\n\tMPI_Reduce(&convexHull[0], &globalConvexHull[0], convexHull.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Compute the perimeter of the global convex hull on rank 0\n\tif (rank == 0) {\n\t\tdouble perimeter = 0;\n\t\tfor (size_t i = 0; i < globalConvexHull.size(); i++) {\n\t\t\tperimeter += distance(globalConvexHull[i], globalConvexHull[(i + 1) % globalConvexHull.size()]);\n\t\t}\n\t\treturn perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (auto it = points.begin(); it != points.end(); ++it) {\n\t\twhile (convexHull.size() >= 2 && (it->y - convexHull[convexHull.size() - 2].y) * (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) < (it->x - convexHull[convexHull.size() - 2].x) * (convexHull[convexHull.size() - 1].y - convexHull[convexHull.size() - 2].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(*it);\n\t}\n\tconvexHull.pop_back();\n\tfor (auto it = points.rbegin(); it != points.rend(); ++it) {\n\t\twhile (convexHull.size() >= 2 && (it->y - convexHull[convexHull.size() - 2].y) * (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) < (it->x - convexHull[convexHull.size() - 2].x) * (convexHull[convexHull.size() - 1].y - convexHull[convexHull.size() - 2].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(*it);\n\t}\n\tconvexHull.pop_back();\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble 
globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2 && (points[next] - convexHull.back()).cross(points[i] - convexHull.back()) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& p : points) {\n\t}\n\n\t// Compute the total perimeter\n\tdouble totalPerimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t}\n\n\t// Reduce the total perimeter to rank 0\n\tdouble finalPerimeter;\n\tMPI_Reduce(&totalPerimeter, &finalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the final perimeter on rank 0\n\tif (rank == 0) {\n\t}\n}\n", + "Hull: 1. 
Synag", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || convexHull.back() != point) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t\twhile (convexHull.size() >= 3 &&\n\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 3].x) *\n\t\t\t\t(point.y - convexHull[convexHull.size() - 2].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 3].y) *\n\t\t\t\t(point.x - convexHull[convexHull.size() - 2].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distance(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Store the result in perimeter.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\n__global__ void convexHullPerimeter(const Point *points, size_t numPoints, double *perimeter) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint numBlocks = gridDim.x;\n\tint numThreads = blockDim.x;\n\tint start = tid + bid * numThreads;\n\tdouble localPerimeter = 0;\n\tfor (int i = start; i < numPoints - 1; i++) {\n\t\tlocalPerimeter += distance(points[i], points[i + 1]);\n\t}\n\tlocalPerimeter += distance(points[numPoints - 1], points[0]);\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tdouble globalPerimeter = 0;\n\t\tfor (int i = 0; i < numBlocks; i++) {\n\t\t\tdouble blockPerimeter;\n\t\t\tcudaMemcpy(&blockPerimeter, &localPerimeter, sizeof(double), cudaMemcpyDeviceToHost);\n\t\t\tglobalPerimeter += blockPerimeter;\n\t\t}\n\t\t*perimeter = globalPerimeter;\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\tPoint *convexHull;\n\tsize_t convexHullSize;\n\tconvexHull2(points, numPoints, &convexHull, &convexHullSize);\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < convexHullSize; i++) {\n\t\t*perimeter += distance(convexHull[i], convexHull[(i + 1) % convexHullSize]);\n\t}\n\tfree(convexHull);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tPoint p = points[tid];\n\tdouble minDist = DBL_MAX;\n\tPoint closestPoint;\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tif (i == tid) {\n\t\t\tcontinue;\n\t\t}\n\t\tPoint q = points[i];\n\t\tdouble dist = distance(p, q);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tclosestPoint = q;\n\t\t}\n\t}\n\t*perimeter += minDist;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % 
hull.size()]);\n\t}\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tPoint *sortedPoints = new Point[numPoints];\n\tthrust::copy(points, points + numPoints, sortedPoints);\n\tthrust::sort(sortedPoints, sortedPoints + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\twhile (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], sortedPoints[i]) <= 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(sortedPoints[i]);\n\t\twhile (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], sortedPoints[i]) >= 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(sortedPoints[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= numPoints) return;\n\n\tPoint p = points[i];\n\tdouble minDist = DBL_MAX;\n\tPoint minPoint;\n\tfor (int j = 0; j < numPoints; j++) {\n\t\tif (i == j) continue;\n\t\tPoint q = points[j];\n\t\tdouble dist = distance(p, q);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tminPoint = q;\n\t\t}\n\t}\n\t*perimeter += minDist;\n}\n", + "Hull.2)", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tPoint *sortedPoints = new Point[numPoints];\n\tmemcpy(sortedPoints, points, numPoints * sizeof(Point));\n\tstd::sort(sortedPoints, sortedPoints + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], sortedPoints[i]) <= 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(sortedPoints[i]);\n\t\twhile (lowerHull.size() >= 2 && 
crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], sortedPoints[i]) >= 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(sortedPoints[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size() - 1; i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tperimeter += distance(upperHull[upperHull.size() - 1], lowerHull[lowerHull.size() - 1]);\n\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tPoint p = points[tid];\n\t\tdouble minDist = DBL_MAX;\n\t\tPoint closestPoint;\n\t\tfor (int i = 0; i < numPoints; i++) {\n\t\t\tif (i == tid) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tPoint q = points[i];\n\t\t\tdouble dist = distance(p, q);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t\tclosestPoint = q;\n\t\t\t}\n\t\t}\n\t\tatomicAdd(perimeter, minDist);\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. 
The final result is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (int i = 0; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\tint next = i + 1;\n\t\tif (next == points.size()) {\n\t\t\tnext = 0;\n\t\t}\n\t\tint prev = i - 1;\n\t\tif (prev == -1) {\n\t\t\tprev = points.size() - 1;\n\t\t}\n\t\tdouble area = (points[next].x - points[prev].x) * (points[i].y - points[prev].y) - (points[next].x - points[prev].x) * (points[i].y - points[next].y);\n\t\tif (area >= 0) {\n\t\t\tconvexHull.push_back(points[i]);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = i + 1;\n\t\tif (next == convexHull.size()) {\n\t\t\tnext = 0;\n\t\t}\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < 
p2.x; });\n\tfor (int i = 0; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint n = points.size();\n\tif (n <= 2) {\n\t\tconvexHull = points;\n\t} else {\n\t\tint m = 0;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\twhile (m >= 2 && (points[i].x - convexHull[m-1].x) * (convexHull[m-2].y - convexHull[m-1].y) - (points[i].y - convexHull[m-1].y) * (convexHull[m-2].x - convexHull[m-1].x) < 0) {\n\t\t\t\tm--;\n\t\t\t}\n\t\t\tconvexHull[m++] = points[i];\n\t\t}\n\t\tfor (int i = n-2, t = m+1; i >= 0; i--) {\n\t\t\twhile (m >= t && (points[i].x - convexHull[m-1].x) * (convexHull[m-2].y - convexHull[m-1].y) - (points[i].y - convexHull[m-1].y) * (convexHull[m-2].x - convexHull[m-1].x) < 0) {\n\t\t\t\tm--;\n\t\t\t}\n\t\t\tconvexHull[m++] = points[i];\n\t\t}\n\t\tconvexHull.resize(m);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham scan algorithm\n\tstd::vector convexHull = grahamScan(points);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else 
{\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t\t (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) *\n\t\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t(convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham scan\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y); });\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size()-2].y - hull[hull.size()-1].y) * (p.x - hull[hull.size()-1].x) < (hull[hull.size()-1].x - hull[hull.size()-2].x) * (p.y - hull[hull.size()-1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\tint n = hull.size();\n\tfor (int i = n-2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size()-2].y - hull[hull.size()-1].y) * (p.x - hull[hull.size()-1].x) < (hull[hull.size()-1].x - hull[hull.size()-2].x) * (p.y - hull[hull.size()-1].y)) 
{\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < n; i++) {\n\t\tperimeter += distance(hull[i], hull[(i+1)%n]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the hull in parallel\n\tdouble perimeter = 0.0;\n\t#pragma omp parallel for reduction(+:perimeter)\n\tfor (int i = 0; i < hull.size(); ++i) {\n\t\tint next = (i + 1) % hull.size();\n\t\tperimeter += distance(hull[i], hull[next]);\n\t}\n\n\t// Return the perimeter\n\treturn perimeter;\n}\n\n", + "\n\n\tint const rank = 0;\n\tint const size = MPI::COMM_WORLD.Get_size();\n\tint const points_per_rank = points.size() / size;\n\tint const remainder = points.size() % size;\n\n\tstd::vector local_points = points;\n\tlocal_points.resize(points_per_rank + (rank < remainder));\n\n\tstd::vector convex_hull;\n\tconvexHull(local_points, convex_hull);\n\n\tdouble local_perimeter = 0;\n\tfor (size_t i = 0; i < convex_hull.size(); ++i) {\n\t\tlocal_perimeter += distance(convex_hull[i], convex_hull[(i+1)%convex_hull.size()]);\n\t}\n\n\tdouble global_perimeter;\n\tMPI::COMM_WORLD.Reduce(&local_perimeter, &global_perimeter, 1, MPI_DOUBLE, MPI_SUM, rank);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint n = points.size();\n\tfor (int i = 0; i < n; i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <= (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = n - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <= (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of 
the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint left = 0, right = 0;\n\twhile (left < points.size()) {\n\t\twhile (right < points.size() && points[right].x == points[left].x) {\n\t\t\tright++;\n\t\t}\n\t\tint n = right - left;\n\t\tif (n > 1) {\n\t\t\tstd::sort(points.begin() + left, points.begin() + right, [](Point const& p1, Point const& p2) { return p1.y < p2.y; });\n\t\t\tfor (int i = 0; i < n; i++) {\n\t\t\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[left + i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) * (points[left + i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\t\t\tconvexHull.pop_back();\n\t\t\t\t}\n\t\t\t\tconvexHull.push_back(points[left + i]);\n\t\t\t}\n\t\t}\n\t\tleft = right;\n\t}\n\tconvexHull.push_back(convexHull[0]);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size() - 1; i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[i + 1]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = points.size();\n\tstd::vector hull;\n\n\t// Find the convex hull of the points using Graham's scan.\n\tfor (int i = 0; i < n; i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points[i].x - hull[hull.size() - 1].x) <\n\t\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tint m = hull.size();\n\tfor (int i = n - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= m &&\n\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points[i].x - hull[hull.size() - 1].x) <\n\t\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\thull.pop_back();\n\n\t// Compute the perimeter of the convex hull in parallel.\n\tdouble perimeter = 0;\n\t#pragma omp parallel for reduction(+ : perimeter)\n\tfor (int i = 0; i < m; i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % m]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble global_perimeter;\n\tMPI_Reduce(&perimeter, &global_perimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 
-1;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (Point const& p : points) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size()-2].y - convexHull[convexHull.size()-1].y) * (p.x - convexHull[convexHull.size()-1].x) < (convexHull[convexHull.size()-1].x - convexHull[convexHull.size()-2].x) * (p.y - convexHull[convexHull.size()-1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(p);\n\t}\n\tconvexHull.pop_back();\n\tfor (int i = points.size()-2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size()-2].y - convexHull[convexHull.size()-1].y) * (points[i].x - convexHull[convexHull.size()-1].x) < (convexHull[convexHull.size()-1].x - convexHull[convexHull.size()-2].x) * (points[i].y - convexHull[convexHull.size()-1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tconvexHull.pop_back();\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t\t (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) *\n\t\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 
0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint const rank = MPI::COMM_WORLD.Get_rank();\n\tint const size = MPI::COMM_WORLD.Get_size();\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convex_hull;\n\tconvex_hull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tconvex_hull.push_back(points[0]);\n\tconvex_hull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (convex_hull.size() >= 2 &&\n\t\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t\t (convex_hull[convex_hull.size() - 1].y - points[i].y) -\n\t\t\t\t\t (convex_hull[convex_hull.size() - 1].x - points[i].x) *\n\t\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) <\n\t\t\t\t 0) {\n\t\t\tconvex_hull.pop_back();\n\t\t}\n\t\tconvex_hull.push_back(points[i]);\n\t}\n\tint const convex_hull_size = convex_hull.size();\n\twhile (convex_hull.size() >= 2 &&\n\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t (convex_hull[convex_hull.size() - 1].y - convex_hull[0].y) -\n\t\t\t\t (convex_hull[convex_hull.size() - 1].x - convex_hull[0].x) *\n\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) <\n\t\t\t 0) {\n\t\tconvex_hull.pop_back();\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convex_hull_size; ++i) {\n\t\tperimeter += distance(convex_hull[i], convex_hull[(i + 1) % convex_hull_size]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble global_perimeter;\n\tMPI::COMM_WORLD.Allreduce(&perimeter, &global_perimeter, 1, MPI::DOUBLE, MPI::SUM);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. 
A is an undirected graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &size) {\n if (visited(i) == 0) {\n size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n });\n }\n }, size);\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices.\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices.\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0.\n int largestComponentSize = 0;\n\n // Iterate over all vertices.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a depth-first search to find the size of its component.\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n\n // Update the largest component size if necessary.\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n // Return the largest component size.\n return largestComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int componentSize = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &componentSize) {\n if (visited[i] == 0) {\n componentSize++;\n visited[i] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[i][j] == 1 && visited[j] == 0) {\n visited[j] = 1;\n }\n }\n }\n });\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n return maxComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n depthFirstSearch(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a 
depth-first search to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n\n // Update the largest component size if necessary\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited(i) == 0) {\n component_size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n });\n }\n }, component_size);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, false);\n int largest = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int size = 0;\n std::stack s;\n s.push(i);\n while (!s.empty()) {\n size_t v = s.top();\n s.pop();\n if (!visited[v]) {\n visited[v] = true;\n size++;\n for (size_t j = 0; j < N; j++) {\n if (A(i, j) && !visited[j]) {\n s.push(j);\n }\n }\n }\n }\n largest = std::max(largest, size);\n }\n }\n return largest;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited[i] == 0) {\n component_size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n visited[j] = 1;\n }\n });\n }\n }, component_size);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n using namespace Kokkos;\n\n // Initialize the visited array to false for all vertices\n View visited(\"visited\", N);\n visited() = false;\n\n // Initialize the component size array to 0 for all vertices\n View componentSize(\"componentSize\", N);\n componentSize() = 0;\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component containing it\n if (!visited(i)) {\n int componentSize = 0;\n DFS(A, i, visited, componentSize);\n\n // Update the largest component size if necessary\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n return largestComponentSize;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n int max_component = 0;\n Kokkos::parallel_reduce(\n \"largestComponent\",\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_component) {\n // Perform a depth-first search to count the number of vertices in the\n // connected component containing vertex i.\n std::vector visited(N, false);\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n int v = stack.top();\n stack.pop();\n if (!visited[v]) {\n visited[v] = true;\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A(i, j) && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n },\n max_component);\n return 
max_component;\n}\n", + "\n\n // Initialize the visited array to false\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n\n // Initialize the component size array to 0\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize() = 0;\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited(i)) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n\n // Update the largest component size\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n return largestComponentSize;\n}\n", + "Kokok", + "\n\n // Create a vector to store the size of each component.\n Kokkos::View component_sizes(\"component_sizes\", N);\n\n // Initialize the component sizes to 1.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n component_sizes[i] = 1;\n });\n\n // Iterate over the adjacency matrix.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) {\n // If there is an edge between i and j, then they are in the same component.\n if (A(i, j) == 1) {\n // Find the smaller component size.\n int smaller_component_size = std::min(component_sizes[i], component_sizes[j]);\n\n // Update the component sizes.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int k) {\n if (component_sizes[k] == smaller_component_size) {\n component_sizes[k] = std::max(component_sizes[i], component_sizes[j]);\n }\n });\n }\n }\n });\n\n // Find the largest component size.\n int largest_component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &largest_component_size) {\n largest_component_size = std::max(largest_component_size, component_sizes[i]);\n });\n\n return largest_component_size;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component_size = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int &j, int &local_component_size) {\n if (A(i, j) == 1) {\n local_component_size += 1;\n }\n }, component_size);\n max_component_size = std::max(max_component_size, component_size);\n visited(i) = 1;\n }\n });\n return max_component_size;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t j, int &component_size) {\n if (visited(j) == 0 && A(i, j) == 1) {\n component_size++;\n visited(j) = 1;\n }\n });\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n int max_component = 0;\n Kokkos::parallel_reduce(\n \"Largest Component\",\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_component) {\n int component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n int current = q.front();\n q.pop();\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A(current, j) && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n },\n max_component);\n\n return max_component;\n}\n", + 
"\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited(i) == 0) {\n component_size += 1;\n visited(i) = 1;\n for (size_t j = 0; j < N; ++j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n }\n }\n });\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::TeamPolicy;\n using Kokkos::Work;\n\n // Initialize the visited array\n Kokkos::View visited(\"visited\", N, N);\n parallel_for(TeamPolicy(Kokkos::DefaultExecutionSpace(), N), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) { visited(i, j) = false; }\n });\n\n // Perform DFS on each unvisited node\n int max_component = 0;\n parallel_for(TeamPolicy(Kokkos::DefaultExecutionSpace(), N), KOKKOS_LAMBDA(const int i) {\n if (!visited(i, i)) {\n int component_size = 0;\n dfs(i, i, visited, A, component_size);\n max_component = std::max(max_component, component_size);\n }\n });\n\n return max_component;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::View componentSize(\"componentSize\", N);\n Kokkos::View componentId(\"componentId\", N);\n Kokkos::View queue(\"queue\", N);\n\n int maxComponentSize = 0;\n int componentCount = 0;\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (visited(i) == 0) {\n int start = 0;\n int end = 0;\n queue(end++) = i;\n visited(i) = 1;\n componentId(i) = componentCount;\n while (start != end) {\n int current = queue(start++);\n componentSize(componentId(current))++;\n for (int j = 0; j < N; j++) {\n if (A(current, j) == 1 && visited(j) == 0) {\n queue(end++) = j;\n visited(j) = 1;\n componentId(j) = componentCount;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize(componentCount));\n componentCount++;\n }\n });\n\n return maxComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n int max_component_size = 0;\n\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(Kokkos::View &A, size_t i, Kokkos::View &visited, int &component_size) {\n visited[i] = true;\n component_size++;\n\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(i, j) && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n\n // If the vertex has not been visited, perform a depth-first search to find the size of its component\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit in the stack\n while (!stack.empty()) {\n // Pop the next vertex from the stack\n size_t vertex = stack.top();\n stack.pop();\n\n // If the vertex has not been visited, mark it as visited and add it to the component\n if (!visited[vertex]) {\n visited[vertex] = true;\n componentSize++;\n\n // Push all of the vertex's neighbors onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the largest component size if the current component is larger\n if (componentSize > largestComponentSize) {\n largestComponentSize = componentSize;\n }\n\n // Update the component size array with the size of the current component\n componentSize[i] = componentSize;\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n int max_component_size = 0;\n int component_size = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n #pragma omp parallel for reduction(max:max_component_size)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n delete[] visited;\n return max_component_size;\n}\n", + "\n\n // Create a vector to store the visited nodes.\n std::vector visited(N, false);\n\n // Create a vector to store the component sizes.\n std::vector componentSizes(N, 0);\n\n // Create a parallel region.\n #pragma omp parallel\n {\n // Loop over the nodes.\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, perform a depth-first search to find all the nodes in its component.\n if (! visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n while (! stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (! 
visited[node]) {\n componentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[node * N + j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n componentSizes[i] = componentSize;\n }\n }\n }\n\n // Find the largest component size.\n int largestComponentSize = *std::max_element(componentSizes.begin(), componentSizes.end());\n\n // Return the largest component size.\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, 0);\n int largest_component_size = 0;\n #pragma omp parallel\n {\n int local_largest_component_size = 0;\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n ++component_size;\n visited[vertex] = 1;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n local_largest_component_size = std::max(local_largest_component_size, component_size);\n }\n }\n #pragma omp critical\n largest_component_size = std::max(largest_component_size, local_largest_component_size);\n }\n return largest_component_size;\n}\n", + "\n\n int* visited = new int[N];\n int* componentSize = new int[N];\n int maxComponentSize = 0;\n int componentCount = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = 0;\n dfs(A, i, visited, componentSize, size);\n componentCount++;\n maxComponentSize = std::max(maxComponentSize, size);\n }\n }\n delete[] visited;\n delete[] componentSize;\n return maxComponentSize;\n}\n", + "\n\n int max_size = 0;\n int max_component = -1;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = dfs(A, i, visited);\n if (size > max_size) {\n max_size = size;\n max_component = i;\n }\n }\n }\n delete[] visited;\n return max_size;\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n", + "\n\n int max_component = 0;\n int visited[N];\n memset(visited, 0, sizeof(visited));\n\n #pragma omp parallel for reduction(max:max_component)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n 
component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n\n return max_component;\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n", + "\n\n int N = A.size();\n std::vector component(N, -1);\n int maxComponent = 0;\n int currentComponent = 0;\n #pragma omp parallel for reduction(max:maxComponent) reduction(+:currentComponent)\n for (int i = 0; i < N; i++) {\n if (component[i] == -1) {\n currentComponent = 0;\n dfs(A, i, component, currentComponent);\n maxComponent = std::max(maxComponent, currentComponent);\n }\n }\n return maxComponent;\n}\n\n", + "\n\n int max_component_size = 0;\n int component_size = 0;\n int visited[N];\n memset(visited, 0, sizeof(visited));\n\n #pragma omp parallel for reduction(max: max_component_size)\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0) {\n component_size = 0;\n dfs(i, A, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(int i, std::vector const& A, int visited[], int& component_size) {\n visited[i] = 1;\n component_size++;\n for (int j = 0; j < A.size(); j++) {\n if (A[i][j] == 1 && visited[j] == 0) {\n dfs(j, A, visited, component_size);\n }\n }\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n int max_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n visited[i] = true;\n component_size++;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a 
depth-first search to find the size of the component containing the vertex\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // While the stack is not empty, pop a vertex from the stack and visit all its unvisited neighbors\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n componentSize++;\n\n // Iterate over all neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is unvisited and there is an edge between the vertex and the neighbor, push the neighbor onto the stack and mark it as visited\n if (!visited[j] && A[vertex][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Update the largest component size if the current component size is larger\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Create a vector to store the size of each component.\n std::vector componentSize(N, 0);\n\n // Create a vector to store the component ID for each vertex.\n std::vector componentID(N, -1);\n\n // Initialize the component ID for each vertex.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n componentID[i] = i;\n }\n\n // Iterate over the adjacency matrix.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n // If there is an edge between vertex i and vertex j, then they are in the same component.\n if (A[i][j] == 1) {\n // Find the component ID for vertex i.\n int iComponentID = componentID[i];\n\n // Find the component ID for vertex j.\n int jComponentID = componentID[j];\n\n // If the component IDs are different, then merge the two components.\n if (iComponentID != jComponentID) {\n // Find the smaller component ID.\n int smallerComponentID = std::min(iComponentID, jComponentID);\n\n // Find the larger component ID.\n int largerComponentID = std::max(iComponentID, jComponentID);\n\n // Update the component ID for each vertex in the smaller component.\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n if (componentID[k] == smallerComponentID) {\n componentID[k] = largerComponentID;\n }\n }\n\n // Update the size of the larger component.\n componentSize[largerComponentID] += componentSize[smallerComponentID];\n }\n }\n }\n }\n\n // Find the largest component.\n int largestComponentSize = 0;\n for (size_t i = 0; i < N; i++) {\n largestComponentSize = std::max(largestComponentSize, componentSize[i]);\n }\n\n // Return the size of the largest component.\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component containing the vertex\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // While there are vertices in the stack, pop a vertex and visit all its unvisited neighbors\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n componentSize++;\n\n // Iterate over all neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is unvisited and 
there is an edge between the vertex and the neighbor, push the neighbor onto the stack and mark it as visited\n if (!visited[j] && A[vertex][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Update the largest component size if the current component size is larger\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n int nComponents = 0;\n std::vector visited(N, 0);\n std::vector componentSize(N, 0);\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int size = dfs(A, i, visited, componentSize);\n nComponents++;\n if (size > componentSize[0]) {\n componentSize[0] = size;\n }\n }\n }\n return componentSize[0];\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "serial", + "prompt": "/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector parent(N);\n std::vector size(N);\n for (size_t i = 0; i < N; ++i) {\n parent[i] = i;\n size[i] = 1;\n }\n\n auto find = [&](int x) {\n if (parent[x] != x) {\n parent[x] = find(parent[x]);\n }\n return parent[x];\n };\n\n auto union_ = [&](int x, int y) {\n int rootX = find(x);\n int rootY = find(y);\n if (rootX != rootY) {\n if (size[rootX] < size[rootY]) {\n parent[rootX] = rootY;\n size[rootY] += size[rootX];\n } else {\n parent[rootY] = rootX;\n size[rootX] += size[rootY];\n }\n }\n };\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n if (A[i][j] == 1) {\n union_(i, j);\n }\n }\n }\n\n int maxSize = 0;\n for (size_t i = 0; i < N; ++i) {\n maxSize = std::max(maxSize, size[find(i)]);\n }\n\n return maxSize;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n std::vector parent(N);\n std::vector size(N);\n for (size_t i = 0; i < N; i++) {\n parent[i] = i;\n size[i] = 1;\n }\n\n auto find = [&](int x) {\n if (parent[x] != x) {\n parent[x] = find(parent[x]);\n }\n return parent[x];\n };\n\n auto union_ = [&](int a, int b) {\n int 
rootA = find(a);\n int rootB = find(b);\n if (rootA != rootB) {\n if (size[rootA] > size[rootB]) {\n parent[rootB] = rootA;\n size[rootA] += size[rootB];\n } else {\n parent[rootA] = rootB;\n size[rootB] += size[rootA];\n }\n }\n };\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i][j] == 1) {\n union_(i, j);\n }\n }\n }\n\n int maxSize = 0;\n for (size_t i = 0; i < N; i++) {\n maxSize = std::max(maxSize, size[find(i)]);\n }\n\n return maxSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int max_component_size = 0;\n\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n\n visited[i] = true;\n component_size++;\n\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; i++) {\n int size = 0;\n std::queue q;\n if (visited[i] == 0) {\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 0) {\n visited[v] = 1;\n size++;\n for (size_t j = 0; j < N; j++) {\n if (A[v][j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int max_component_size = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component containing it\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\n// Perform a DFS to find the size of the component containing vertex i\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n // Mark the vertex as visited\n visited[i] = true;\n component_size++;\n\n // Iterate over all adjacent vertices\n for (size_t j = 0; j < A.size(); j++) {\n // If there is an edge between vertex i and vertex j and vertex j has not been visited, perform a DFS on vertex j\n if (A[i][j] == 1 && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = dfs(i, A, visited);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n\nint dfs(size_t i, std::vector const& A, std::vector& visited) {\n if (visited[i] != 0) {\n return 0;\n }\n visited[i] = 1;\n int component_size = 1;\n for (size_t j = 0; j < A.size(); 
++j) {\n if (A[i][j] == 1) {\n component_size += dfs(j, A, visited);\n }\n }\n return component_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int largest = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int size = dfs(A, i, visited);\n largest = std::max(largest, size);\n }\n }\n\n return largest;\n}\n", + "\n\n // Create a vector to store the size of each component.\n std::vector componentSize(N, 0);\n\n // Create a vector to store the component ID for each vertex.\n std::vector componentID(N, -1);\n\n // Initialize the component ID for each vertex.\n for (size_t i = 0; i < N; ++i) {\n componentID[i] = i;\n }\n\n // Iterate over each vertex.\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, perform a DFS to find all vertices in the same component.\n if (componentID[i] == i) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit in the stack, perform a DFS.\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n\n // If the vertex has not been visited, mark it as visited and add it to the stack.\n if (componentID[vertex] == i) {\n componentSize++;\n componentID[vertex] = i;\n\n // Iterate over the neighbors of the vertex.\n for (size_t j = 0; j < N; ++j) {\n // If the neighbor is connected to the vertex and has not been visited, add it to the stack.\n if (A[i][j] == 1 && componentID[j] == i) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the size of the component.\n componentSize[i] = componentSize;\n }\n }\n\n // Return the size of the largest component.\n return *std::max_element(componentSize.begin(), componentSize.end());\n}\n", + "\n\n std::vector visited(N, 0);\n int largest = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = 0;\n dfs(A, visited, i, size);\n largest = std::max(largest, size);\n }\n }\n return largest;\n}\n\nvoid dfs(std::vector const& A, std::vector& visited, size_t i, int& size) {\n if (visited[i] == 1) {\n return;\n }\n visited[i] = 1;\n size++;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] == 1) {\n dfs(A, visited, j, size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n component_size++;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n component_size++;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n dfs(A, i, visited, size);\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited,\n int& size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i * A.size() + j] == 1) {\n dfs(A, j, visited, size);\n }\n }\n}\n", + "\n\n // Perform DFS on each vertex to find all connected vertices\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if 
(!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++component_size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) {\n continue;\n }\n visited[u] = 1;\n size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (int i = 0; i < N; i++) {\n int size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 1) continue;\n visited[v] = 1;\n size++;\n for (int j = 0; j < N; j++) {\n if (A[v][j] == 1 && visited[j] == 0) q.push(j);\n }\n }\n max_size = std::max(max_size, size);\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n if (visited[current] == 0) {\n visited[current] = 1;\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n ++size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++component_size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "hip", + "prompt": "/* Compute the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n Store the result in 
largestComponentSize.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void largestComponent(const int *A, size_t N, int *largestComponentSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n int count = 0;\n if (tid == 0) {\n count = 1;\n for (int i = 1; i < N; i++) {\n if (A[bid * N + i] == 1) {\n count++;\n }\n }\n }\n __syncthreads();\n if (tid == 0) {\n *largestComponentSize = count;\n }\n}\n", + "\n\n\tint tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint local_max = 0;\n\tint global_max = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[tid_y] == 0) {\n\t\tint count = 0;\n\t\tvisited[tid_y] = 1;\n\t\tcount++;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] != 0 && visited[i] == 0) {\n\t\t\t\tcount += dfs(A, N, i, visited);\n\t\t\t}\n\t\t}\n\t\tlocal_max = count;\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (local_max > global_max) {\n\t\t\t\tglobal_max = local_max;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = global_max;\n\t}\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tidy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tif (tidx >= N || tidy >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[tidx] == 0) {\n\t\tcount++;\n\t\tvisited[tidx] = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tidx * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcount += bfs(A, N, i, visited);\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tidx == 0 && tidy == 0) {\n\t\t*largestComponentSize = count;\n\t}\n}\n", + "\n\n\tint tidx = blockIdx.x + blockIdx.y * blockDim.x;\n\tint tidx_x = tidx % N;\n\tint tidx_y = tidx / N;\n\tint componentSize = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tidx * N + i] == 1) {\n\t\t\tcomponentSize++;\n\t\t}\n\t}\n\tatomicAdd(largestComponentSize, componentSize);\n}\n", + "\n\n int localComponentSize = 0;\n int globalComponentSize = 0;\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int start = tid;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (int i = start; i < N; i += stride) {\n if (A[i * N + i] == 0) {\n continue;\n }\n localComponentSize++;\n for (int j = i + 1; j < N; j++) {\n if (A[i * N + j] == 1) {\n localComponentSize++;\n }\n }\n }\n hipAtomicMax(&globalComponentSize, &localComponentSize);\n *largestComponentSize = globalComponentSize;\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tidy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tint size = 0;\n\tif (tidx < N && tidy < N) {\n\t\tif (A[tidx * N + tidy] == 1) {\n\t\t\tsize = 1;\n\t\t\tint stack[N];\n\t\t\tint top = 0;\n\t\t\tstack[top++] = tidx;\n\t\t\twhile (top > 0) {\n\t\t\t\tint v = stack[--top];\n\t\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\t\tif (A[v * N + i] == 1 && i != v) {\n\t\t\t\t\t\tif (i == tidy) {\n\t\t\t\t\t\t\tsize++;\n\t\t\t\t\t\t} else {\n\t\t\t\t\t\t\tstack[top++] = 
i;\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tint max = 0;\n\thipReduceBlockMax(&max, &size, 1);\n\tif (tidx == 0 && tidy == 0) {\n\t\t*largestComponentSize = max;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint blockId = hipBlockIdx_x;\n\tint numBlocks = hipGridDim_x;\n\tint numThreads = hipBlockDim_x * hipGridDim_x;\n\tint visited[N];\n\tint componentSize[N];\n\tint maxComponentSize = 0;\n\tint maxComponentId = -1;\n\tint componentId = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t\tcomponentSize[i] = 0;\n\t}\n\twhile (componentId < N) {\n\t\tint startVertex = -1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (!visited[i]) {\n\t\t\t\tstartVertex = i;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tif (startVertex == -1) {\n\t\t\tbreak;\n\t\t}\n\t\tint queue[N];\n\t\tint head = 0;\n\t\tint tail = 0;\n\t\tqueue[tail++] = startVertex;\n\t\tvisited[startVertex] = 1;\n\t\tcomponentSize[componentId] = 1;\n\t\twhile (head != tail) {\n\t\t\tint vertex = queue[head++];\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tif (A[vertex * N + i] && !visited[i]) {\n\t\t\t\t\tqueue[tail++] = i;\n\t\t\t\t\tvisited[i] = 1;\n\t\t\t\t\tcomponentSize[componentId]++;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tif (componentSize[componentId] > maxComponentSize) {\n\t\t\tmaxComponentSize = componentSize[componentId];\n\t\t\tmaxComponentId = componentId;\n\t\t}\n\t\tcomponentId++;\n\t}\n\t*largestComponentSize = maxComponentSize;\n}\n", + "\n\n int local_max = 0;\n int global_max = 0;\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (A[i * N + j] == 1) {\n local_max = 1;\n for (int k = 0; k < N; k++) {\n if (A[i * N + k] == 1) {\n local_max++;\n }\n if (A[k * N + j] == 1) {\n local_max++;\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx.x == 0 && threadIdx.y == 0) {\n for (int k = 0; k < N * N; k++) {\n if (local_max > global_max) {\n global_max = local_max;\n }\n local_max = 0;\n }\n *largestComponentSize = global_max;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint numThreads = hipGridDim_x * hipBlockDim_x;\n\tint visited[N];\n\tint componentSize = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[tid] == 0) {\n\t\tvisited[tid] = 1;\n\t\tcomponentSize = 1;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tif (A[tid * N + j] == 1 && visited[j] == 0) {\n\t\t\t\tcomponentSize += largestComponent(A, N, &visited[j]);\n\t\t\t}\n\t\t}\n\t}\n\tint maxComponentSize = 0;\n\thipMemcpy(&maxComponentSize, largestComponentSize, sizeof(int), hipMemcpyDeviceToHost);\n\tif (componentSize > maxComponentSize) {\n\t\t*largestComponentSize = componentSize;\n\t}\n\treturn;\n}\n", + "\n\n int *visited = (int *)malloc(N * N * sizeof(int));\n int *componentSize = (int *)malloc(N * N * sizeof(int));\n int maxComponentSize = 0;\n int componentId = 0;\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n visited[i * N + j] = 0;\n componentSize[i * N + j] = 0;\n }\n }\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (visited[i * N + j] == 0 && A[i * N + j] == 1) {\n componentId++;\n int currentComponentSize = 0;\n __syncthreads();\n dfs(A, i, j, N, visited, componentSize, &currentComponentSize, componentId);\n __syncthreads();\n maxComponentSize = max(maxComponentSize, currentComponentSize);\n }\n }\n }\n\n *largestComponentSize = maxComponentSize;\n free(visited);\n free(componentSize);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + 
hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = max(*largestComponentSize, count);\n\t}\n}\n", + "\n\n int myRow = blockIdx.x * blockDim.x + threadIdx.x;\n int myCol = blockIdx.y * blockDim.y + threadIdx.y;\n int myComponentSize = 0;\n if (myRow < N && myCol < N) {\n if (A[myRow * N + myCol] == 1) {\n myComponentSize = 1;\n for (int i = 0; i < N; i++) {\n if (A[myRow * N + i] == 1) {\n myComponentSize++;\n }\n if (A[i * N + myCol] == 1) {\n myComponentSize++;\n }\n }\n }\n }\n atomicMax(largestComponentSize, myComponentSize);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == 0) {\n *largestComponentSize = 0;\n }\n __shared__ int shared_component_size;\n shared_component_size = 0;\n bool visited = false;\n int component_size = 0;\n while (!visited) {\n visited = true;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (A[bid * N + i] == 1) {\n visited = false;\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n A[i * N + j] = 0;\n A[j * N + i] = 0;\n }\n }\n }\n }\n __syncthreads();\n }\n atomicAdd(largestComponentSize, component_size);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i, j, k, count = 0;\n\tif (tid < N) {\n\t\tbool visited[N] = {false};\n\t\tvisited[tid] = true;\n\t\tfor (i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1 && !visited[i]) {\n\t\t\t\tcount++;\n\t\t\t\tfor (j = 0; j < N; j++) {\n\t\t\t\t\tif (A[i * N + j] == 1) {\n\t\t\t\t\t\tfor (k = 0; k < N; k++) {\n\t\t\t\t\t\t\tif (A[j * N + k] == 1 && !visited[k]) {\n\t\t\t\t\t\t\t\tcount++;\n\t\t\t\t\t\t\t\tvisited[k] = true;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == 0) {\n *largestComponentSize = 0;\n }\n __syncthreads();\n int localComponentSize = 0;\n for (int i = 0; i < N; i++) {\n if (A[tid + i * N] == 1) {\n localComponentSize++;\n }\n }\n __syncthreads();\n if (tid == 0) {\n if (localComponentSize > *largestComponentSize) {\n *largestComponentSize = localComponentSize;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int size = 0;\n if (tid < N && tid_y < N) {\n if (A[tid * N + tid_y] == 1) {\n size = 1;\n int *visited = new int[N];\n for (int i = 0; i < N; i++) {\n visited[i] = 0;\n }\n visited[tid] = 1;\n int *stack = new int[N];\n stack[0] = tid;\n while (stack[0] != -1) {\n int v = stack[0];\n stack[0] = -1;\n for (int i = 0; i < N; i++) {\n if (A[v * N + i] == 1 && visited[i] == 0) {\n visited[i] = 1;\n size++;\n stack[0] = i;\n }\n }\n }\n delete[] visited;\n delete[] stack;\n }\n }\n atomicAdd(largestComponentSize, size);\n}\n", + "\n\n\tint myComponentSize = 0;\n\tint 
myId = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (A[N * myId + myId] == 1) {\n\t\tmyComponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[N * myId + i] == 1 && i != myId) {\n\t\t\t\tmyComponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(&*largestComponentSize, myComponentSize);\n}\n", + "(", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_x;\n\tint componentSize = 0;\n\tif (tid == 0) {\n\t\tcomponentSize = 1;\n\t\tlargestComponentSize[bid] = 0;\n\t}\n\t__syncthreads();\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tif (A[bid * N + i] == 1) {\n\t\t\tatomicAdd(&componentSize, 1);\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tif (componentSize > largestComponentSize[bid]) {\n\t\t\tlargestComponentSize[bid] = componentSize;\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Perform a breadth-first search (BFS) from each vertex to find the connected components.\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the max_component_size across all ranks to find the global maximum.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "_t.", + "\n\n // Perform a breadth-first search from each vertex to find the size of each component.\n std::vector visited(N, false);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++componentSize;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Allreduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank 
computes the size of its own component\n std::vector visited(N, 0);\n std::queue q;\n int componentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n q.push(i);\n visited[i] = 1;\n componentSize = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize++;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest one\n int globalComponentSize;\n MPI_Reduce(&componentSize, &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(my_rank);\n visited[my_rank] = true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n local_component_size++;\n for (size_t v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return global_component_size;\n }\n return -1;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector component_size(num_ranks, 0);\n\n // Perform a DFS from each rank\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int local_component_size = 0;\n std::stack s;\n s.push(i);\n visited[i] = true;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n local_component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n visited[j] = true;\n }\n }\n }\n component_size[my_rank] = std::max(component_size[my_rank], local_component_size);\n }\n }\n\n // Reduce the component size array to find the largest component\n int global_component_size = 0;\n MPI_Reduce(&component_size[0], &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (my_rank == 0) {\n return 
global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_max = 0;\n int global_max = 0;\n\n // Perform a DFS on the graph to find the size of the largest component\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n local_max = std::max(local_max, component_size);\n }\n }\n\n // Reduce the local maximum values to find the global maximum\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = true;\n my_component_size = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n ++my_component_size;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&my_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int max_component = 0;\n int my_component = 0;\n int* visited = new int[N];\n for (size_t i = 0; i < N; i++) {\n visited[i] = 0;\n }\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n my_component = 0;\n dfs(i, A, N, visited, my_component);\n max_component = std::max(max_component, my_component);\n }\n }\n delete[] visited;\n int global_max;\n MPI_Allreduce(&max_component, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n // Perform a breadth-first search from each vertex to find the connected components.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 0) {\n visited[v] = 1;\n componentSize++;\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Allreduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(rank);\n visited[rank] = true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n local_component_size++;\n for (int v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n\n // Reduce the local component sizes 
to find the largest component\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of vertices in its component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n local_component_size = std::max(local_component_size, dfs(A, i, visited));\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n\n// Depth-first search to compute the number of vertices in a component\nint dfs(std::vector const& A, size_t i, std::vector& visited) {\n if (visited[i]) {\n return 0;\n }\n\n visited[i] = true;\n int component_size = 1;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n component_size += dfs(A, j, visited);\n }\n }\n\n return component_size;\n}\n", + "_t.", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n int local_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_component_size = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n ++local_component_size;\n }\n }\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n int local_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_component_size = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n local_component_size++;\n }\n }\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its component\n std::vector visited(N, 0);\n std::queue q;\n int componentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n q.push(i);\n visited[i] = 1;\n componentSize = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize++;\n 
}\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest one\n int globalComponentSize;\n MPI_Reduce(&componentSize, &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, false);\n int largest_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t v = stack.top();\n stack.pop();\n if (!visited[v]) {\n ++component_size;\n visited[v] = true;\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n largest_component_size = std::max(largest_component_size, component_size);\n }\n }\n\n // Reduce the largest component size across all ranks.\n int global_largest_component_size;\n MPI_Allreduce(&largest_component_size, &global_largest_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the largest component size\n int largestComponentSize = 0;\n\n // Loop over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "cuda", + "prompt": "/* Compute the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n Store the result in largestComponentSize.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void largestComponent(const int *A, size_t N, int *largestComponentSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myComponentSize = 0;\n int myId = blockIdx.x * blockDim.x + threadIdx.x;\n if (A[myId * N + myId] == 0) {\n *largestComponentSize = 0;\n return;\n }\n int *visited = new int[N];\n for (int i = 0; i < N; i++) { visited[i] = 0; }\n visited[myId] = 1;\n myComponentSize++;\n for (int i = 0; i < N; i++) {\n if (A[myId * N + i] == 1 && visited[i] == 0) {\n myComponentSize += dfs(A, N, visited, i);\n }\n }\n *largestComponentSize = max(*largestComponentSize, myComponentSize);\n}\n", + "\n\n int localComponentSize = 0;\n int globalComponentSize = 0;\n int threadIdx = threadIdx.x + blockIdx.x * blockDim.x;\n int threadIdy = threadIdx.y + blockIdx.y * blockDim.y;\n if (threadIdx < N && threadIdy < N) {\n if (A[threadIdx * N + threadIdy] == 1) {\n localComponentSize++;\n for (int i = 0; i < N; i++) {\n if (A[threadIdx * N + i] == 1) {\n localComponentSize++;\n }\n if (A[i * N + threadIdy] == 1) {\n localComponentSize++;\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx == 0 && threadIdy == 0) {\n globalComponentSize = localComponentSize;\n }\n for (int i = 1; i < N; i++) {\n if (localComponentSize > globalComponentSize) {\n globalComponentSize = localComponentSize;\n }\n localComponentSize = 0;\n if (threadIdx < N && threadIdy + i < N) {\n if (A[threadIdx * N + threadIdy + i] == 1) {\n localComponentSize++;\n for (int j = 0; j < N; j++) {\n if (A[threadIdx * N + j] == 1) {\n localComponentSize++;\n }\n if (A[(threadIdy + i) * N + j] == 1) {\n localComponentSize++;\n }\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx == 0 && threadIdy == 0) {\n if (localComponentSize > globalComponentSize) {\n globalComponentSize = localComponentSize;\n }\n *largestComponentSize = globalComponentSize;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int tid_row = tid / N;\n int tid_col = tid % N;\n int local_max = 0;\n int global_max = 0;\n\n // Count the number of adjacent vertices\n for (int i = 0; i < N; i++) {\n if (A[tid_row * N + i] == 1) {\n local_max++;\n }\n }\n\n // Find the maximum number of adjacent vertices across all threads\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (local_max > global_max) {\n global_max = local_max;\n }\n }\n\n *largestComponentSize = global_max;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tint count = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n // Each thread computes the size of its component\n int myComponentSize = 0;\n for (int i = 0; i < N; i++) {\n if (A[i * N + blockIdx.x] == 1) {\n myComponentSize++;\n }\n }\n\n // Reduce the component sizes across all threads\n __shared__ int sharedComponentSize[32];\n sharedComponentSize[threadIdx.x] = myComponentSize;\n __syncthreads();\n for (int i = 1; i < 32; i *= 2) {\n if (threadIdx.x % (2 * i) == 0) {\n sharedComponentSize[threadIdx.x] += sharedComponentSize[threadIdx.x + i];\n }\n __syncthreads();\n }\n\n // Store the largest component size in the output variable\n if (threadIdx.x == 0) {\n *largestComponentSize = sharedComponentSize[0];\n 
}\n}\n", + "\n\n\t// Initialize the largest component size to 0\n\t*largestComponentSize = 0;\n\n\t// Iterate over each vertex\n\tfor (int i = 0; i < N; i++) {\n\t\t// If the vertex is not visited, perform a depth-first search to find the size of the component\n\t\tif (A[i * N + i] == 0) {\n\t\t\tint componentSize = 0;\n\t\t\tdepthFirstSearch(A, N, i, &componentSize);\n\n\t\t\t// Update the largest component size if necessary\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\n\tint count = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcount = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*largestComponentSize = max(*largestComponentSize, count);\n\t}\n}\n", + "\n\n // Each thread checks if it is in the largest component.\n int myComponentSize = 0;\n for (int i = blockIdx.x; i < N; i += blockDim.x) {\n for (int j = blockIdx.y; j < N; j += blockDim.y) {\n if (A[i * N + j] == 1) {\n myComponentSize++;\n }\n }\n }\n\n // Each thread writes its component size to shared memory.\n __shared__ int sharedComponentSize[blockDim.x * blockDim.y];\n sharedComponentSize[threadIdx.x * blockDim.y + threadIdx.y] = myComponentSize;\n\n // Each thread block finds the maximum component size in shared memory.\n __syncthreads();\n int maxComponentSize = 0;\n for (int i = 0; i < blockDim.x * blockDim.y; i++) {\n maxComponentSize = max(maxComponentSize, sharedComponentSize[i]);\n }\n\n // The thread with the maximum component size writes it to global memory.\n if (threadIdx.x == 0 && threadIdx.y == 0) {\n *largestComponentSize = maxComponentSize;\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 0) {\n\t\treturn;\n\t}\n\tint componentSize = 0;\n\tint visited[N];\n\tfor (int k = 0; k < N; k++) {\n\t\tvisited[k] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[i] == 0) {\n\t\tcomponentSize = dfs(A, N, i, visited);\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tif (componentSize > *largestComponentSize) {\n\t\t\t*largestComponentSize = componentSize;\n\t\t}\n\t}\n}\n", + "\n\n int myComponentSize = 0;\n int myComponentId = threadIdx.x + blockIdx.x * blockDim.x;\n if (myComponentId < N) {\n bool visited[N];\n for (int i = 0; i < N; i++) { visited[i] = false; }\n visited[myComponentId] = true;\n for (int i = 0; i < N; i++) {\n if (A[myComponentId * N + i] && !visited[i]) {\n myComponentSize += 1;\n dfs(A, N, visited, i);\n }\n }\n }\n __syncthreads();\n int maxComponentSize = 0;\n for (int i = 0; i < N; i++) {\n maxComponentSize = max(maxComponentSize, myComponentSize);\n }\n *largestComponentSize = maxComponentSize;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\n\tint row = tid / N;\n\tint col = tid % N;\n\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\n\tint componentSize = 1;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\n\tvisited[row] = 1;\n\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] == 1 && visited[i] == 0) {\n\t\t\tcomponentSize += dfs(A, N, i, visited);\n\t\t}\n\t}\n\n\tif (componentSize > *largestComponentSize) {\n\t\t*largestComponentSize 
= componentSize;\n\t}\n}\n", + "\n\n\tint myRow = blockIdx.x * blockDim.x + threadIdx.x;\n\tint myCol = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (myRow < N && myCol < N) {\n\t\tif (A[myRow * N + myCol] == 1) {\n\t\t\tint componentSize = 0;\n\t\t\tint visited[N];\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tvisited[i] = 0;\n\t\t\t}\n\t\t\tdfs(myRow, myCol, A, N, visited, &componentSize);\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint componentSize = 0;\n\tint maxComponentSize = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tfor (int i = 0; i < N; i++) {\n\t\tif (visited[i] == 0) {\n\t\t\tcomponentSize = 0;\n\t\t\tdfs(A, i, visited, &componentSize);\n\t\t\tmaxComponentSize = max(maxComponentSize, componentSize);\n\t\t}\n\t}\n\t*largestComponentSize = maxComponentSize;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid_row == tid_col) {\n\t\tint componentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_row * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = max(*largestComponentSize, componentSize);\n\t}\n}\n", + "\n\n\tint myComponentSize = 0;\n\tint myId = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (A[N * myId + myId] == 1) {\n\t\tmyComponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[N * myId + i] == 1 && i != myId) {\n\t\t\t\tmyComponentSize += 1;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, myComponentSize);\n}\n", + "\n\n\tint threadIdx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint threadIdy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (threadIdx < N && threadIdy < N) {\n\t\tif (A[threadIdx * N + threadIdy] == 1) {\n\t\t\t// Found a 1 in the adjacency matrix.\n\t\t\t// Perform a breadth-first search to find the size of the component.\n\t\t\tint componentSize = 0;\n\t\t\tstd::queue q;\n\t\t\tq.push(threadIdx);\n\t\t\twhile (!q.empty()) {\n\t\t\t\tint v = q.front();\n\t\t\t\tq.pop();\n\t\t\t\tcomponentSize++;\n\t\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\t\tif (A[v * N + i] == 1 && i != v) {\n\t\t\t\t\t\tq.push(i);\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t\t// Update the largest component size if necessary.\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, componentSize);\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, componentSize);\n}\n", + "\n\n\tint localLargestComponentSize = 0;\n\tint globalLargestComponentSize = 0;\n\tint myRow = blockIdx.y * blockDim.y + threadIdx.y;\n\tint myCol = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (myRow < N && 
myCol < N) {\n\t\tif (A[myRow * N + myCol] == 1) {\n\t\t\tlocalLargestComponentSize++;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (threadIdx.x == 0 && threadIdx.y == 0) {\n\t\tfor (int i = 1; i < gridDim.x * gridDim.y; i++) {\n\t\t\tif (localLargestComponentSize < globalLargestComponentSize) {\n\t\t\t\tlocalLargestComponentSize = globalLargestComponentSize;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = localLargestComponentSize;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\t*largestComponentSize = max(*largestComponentSize, componentSize);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce to find the largest component size\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform DFS on each rank\n std::vector visited(N, false);\n int max_component_size = 0;\n #pragma omp parallel\n {\n int local_max_component_size = 0;\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n std::stack s;\n s.push(i);\n int component_size = 0;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n if (!visited[u]) {\n visited[u] = true;\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n }\n }\n }\n }\n local_max_component_size = std::max(local_max_component_size, component_size);\n }\n }\n #pragma omp critical\n {\n max_component_size = std::max(max_component_size, local_max_component_size);\n }\n }\n\n // Gather the maximum component size from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int recv_max_component_size;\n 
MPI_Recv(&recv_max_component_size, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n max_component_size = std::max(max_component_size, recv_max_component_size);\n }\n } else {\n MPI_Send(&max_component_size, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n\n return max_component_size;\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int largest_component_size;\n MPI_Reduce(&my_component_size, &largest_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find the connected components.\n std::vector visited(N, 0);\n std::vector component_size(N, 0);\n int max_component_size = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n int component_size = 0;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n component_size[i] = component_size;\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n MPI_Allreduce(&max_component_size, &max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n 
std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int largest_component_size;\n MPI_Reduce(&my_component_size, &largest_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the size of its own component\n int local_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_size = std::max(local_size, size);\n }\n }\n\n // Reduce the local sizes to find the global maximum\n int global_size;\n MPI_Reduce(&local_size, &global_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n // Perform a breadth-first search from each vertex to find all the vertices in the same component.\n int max_component_size = 0;\n #pragma omp parallel for reduction(max : max_component_size)\n for (size_t i = 0; i < N; ++i) {\n if (A[i][i] == 0) {\n continue;\n }\n\n // Initialize the queue with the current vertex.\n std::queue queue;\n queue.push(i);\n\n // Mark the current vertex as visited.\n A[i][i] = 0;\n\n // Perform a breadth-first search to find all the vertices in the same component.\n int component_size = 0;\n while (!queue.empty()) {\n size_t current_vertex = queue.front();\n queue.pop();\n component_size++;\n\n // Add all the unvisited neighbors of the current vertex to the queue.\n for (size_t j = 0; j < N; ++j) {\n if (A[current_vertex][j] == 1 && A[j][j] != 0) {\n queue.push(j);\n A[j][j] = 0;\n }\n }\n }\n\n max_component_size = std::max(max_component_size, component_size);\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n componentSize++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across 
all ranks.\n int globalMaxComponentSize;\n MPI_Reduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (rank == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(size, 0);\n\n // Perform a DFS on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int localComponentSize = 0;\n std::stack s;\n s.push(i);\n visited[i] = true;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n localComponentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n visited[j] = true;\n }\n }\n }\n componentSize[rank] = std::max(componentSize[rank], localComponentSize);\n }\n }\n\n // Reduce the component size array to find the largest component\n int globalComponentSize;\n MPI_Reduce(&componentSize[0], &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform a breadth-first search (BFS) from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n componentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Reduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (rank == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n // Perform a breadth-first search to find the largest component.\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the max_component_size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the result on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the number of vertices in its largest component\n int local_max = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int count = 
0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n count++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_max = std::max(local_max, count);\n }\n }\n\n // Reduce the local maxima to find the global maximum\n int global_max;\n MPI_Reduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the global maximum on rank 0\n if (my_rank == 0) {\n return global_max;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size\n int largestComponentSize = 0;\n\n // Loop over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n\n // Update the largest component size\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Reduce the largest component size to rank 0\n int globalLargestComponentSize;\n MPI_Reduce(&largestComponentSize, &globalLargestComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size\n return globalLargestComponentSize;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(N, 0);\n\n // Initialize the maximum component size\n int maxComponentSize = 0;\n\n // Loop over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited\n if (!visited[i]) {\n // Perform a depth-first search to find the size of the component containing vertex i\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n componentSize++;\n // Push all adjacent vertices onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the maximum component size\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Return the maximum component size\n return maxComponentSize;\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int max_component_size = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int 
my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int global_component_size;\n MPI_Reduce(&my_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n std::vector componentSize(N, 0);\n int largest = 0;\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n componentSize[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize[i]++;\n }\n }\n }\n largest = std::max(largest, componentSize[i]);\n }\n }\n\n int globalLargest = 0;\n MPI_Reduce(&largest, &globalLargest, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalLargest;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector component_size(num_ranks, 0);\n\n // Perform a parallel depth-first search to find the largest component\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int local_component_size = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n ++local_component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex * N + j] && !visited[j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n component_size[my_rank] = std::max(component_size[my_rank], local_component_size);\n }\n }\n\n // Reduce the component size array to find the largest 
component\n int global_component_size = 0;\n MPI_Reduce(component_size.data(), &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n\t Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n if (visited[i] == 0) {\n visited[i] = 1;\n Kokkos::View stack(\"stack\", N);\n size_t stack_size = 0;\n stack[stack_size++] = i;\n while (stack_size > 0) {\n size_t j = stack[--stack_size];\n for (size_t k = 0; k < N; k++) {\n if (A(j, k) == 1 && visited[k] == 0) {\n visited[k] = 1;\n stack[stack_size++] = k;\n }\n }\n }\n }\n });\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n count += (visited[i] == 1);\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n componentCount(A, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n dfs(A, visited, j, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::atomic_fetch_add(&visited(j), 1);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t k) {\n if (A(j, k) == 1 && visited(k) == 0) {\n Kokkos::atomic_fetch_add(&visited(k), 1);\n }\n });\n }\n });\n }\n }\n return count;\n}\n", + "\n\n KokConfigService::getSingleton().setNumThreads(N);\n KokConfigService::getSingleton().setParallelMode(KokConfigService::ParallelMode::OMP);\n KokConfigService::getSingleton().setOmpNumThreads(N);\n\n Kok::View counts(N);\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) { counts[i] = i; });\n\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) {\n for (size_t j = 0; j < N; j++) {\n if (A(i, j) == 1 && i != j) { counts[j] = counts[i]; }\n }\n });\n\n Kok::View unique_counts(N);\n 
Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) { unique_counts[i] = counts[i]; });\n Kok::sort(unique_counts);\n int num_components = 0;\n for (size_t i = 0; i < N; i++) {\n if (i == 0 || unique_counts[i] != unique_counts[i - 1]) { num_components++; }\n }\n return num_components;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n count++;\n dfs(A, i, visited);\n }\n }\n\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n visit(A, i, visited);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited[k] == 0) { visited[k] = 1; }\n });\n visited[j] = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n ++count;\n dfs(A, visited, i, count);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View labels(\"labels\", N);\n int num_components = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (labels(i) == 0) {\n int new_label = ++num_components;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &j) {\n if (A(i, j) == 1 && labels(j) == 0) {\n labels(j) = new_label;\n }\n });\n }\n });\n return num_components;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n dfs(A, visited, j, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n dfs(A, visited, i, N);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited[k] == 0) { visited[k] = 1; }\n });\n visited[j] = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View 
visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::atomic_fetch_add(&visited(j), 1);\n componentCount(A, j);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n componentCount(A, N, j);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1) visited(k) = 1;\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int nComponents = 0;\n std::vector visited(N, 0);\n\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n nComponents++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t curr = q.front();\n q.pop();\n if (visited[curr] == 0) {\n visited[curr] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[curr][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return nComponents;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) continue;\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n count++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n bool visited = false;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n if (!visited) {\n count++;\n visited = true;\n }\n break;\n }\n }\n }\n return 
count;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t curr = q.front();\n q.pop();\n if (visited[curr] == 0) {\n visited[curr] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[curr][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false for all nodes\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all nodes\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all nodes in the component\n dfs(A, i, visited);\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int* visited = new int[N];\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n delete[] visited;\n return count;\n}\n", + "\n\n int nComponents = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n nComponents++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n delete[] visited;\n return nComponents;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n count++;\n }\n }\n return count;\n}\n", + "\n\n int num_components = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n visited[current] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[current * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n delete[] visited;\n return num_components;\n}\n", + "\n\n int num_components = 0;\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n bool is_new_component = true;\n for (size_t j = 0; j < i; j++) {\n if (A[i * N + j] == 1) {\n is_new_component = false;\n break;\n }\n }\n if (is_new_component) { num_components++; }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while 
(!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) { q.push(j); }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t v = stack.top();\n stack.pop();\n if (!visited[v]) {\n visited[v] = true;\n // Push all the unvisited neighbors of v onto the stack\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n num_components++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n num_components++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false for all nodes\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all nodes\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over all neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the number of connected components\n return componentCount;\n}\n", + "_", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * 
N);\n\n#pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t cur = q.front();\n q.pop();\n if (visited[cur] == 0) {\n visited[cur] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[cur][j] == 1) { q.push(j); }\n }\n }\n }\n }\n }\n delete[] visited;\n return num_components;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, false);\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n dfs(A, visited, i);\n count++;\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "serial", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(i, A, visited);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(size_t i, std::vector const& A, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(j, A, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, visited, i, N);\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, visited, i);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, std::vector& visited, size_t i) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, visited, j);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + ":", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); j++) 
{\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(i, A, visited);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(size_t i, std::vector const& A, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(j, A, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, visited, i);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n dfs(A, i, visited);\n count++;\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "hip", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void componentCount(const int *A, size_t N, int *numComponents) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tif (count == 0) {\n\t\tcount = 1;\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n\tint tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\tif (tid >= N || tid_y >= N) {\n\t\treturn;\n\t}\n\tint component_id = tid;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tcomponent_id = min(component_id, i);\n\t\t}\n\t}\n\tatomicAdd(&numComponents[component_id], 1);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tint root = tid;\n\twhile (A[root * N + bid] != root) {\n\t\troot = A[root * N + bid];\n\t}\n\tif (tid == 0) {\n\t\tA[bid * N + bid] = root;\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint numThreads = hipGridDim_x * hipBlockDim_x;\n\tint label = tid;\n\tint numLabels = numThreads;\n\tbool changed = true;\n\twhile (changed) {\n\t\tchanged = false;\n\t\tfor (int i = tid; i < N; i += numThreads) {\n\t\t\tfor (int j = 0; j < N; j++) {\n\t\t\t\tif (A[i * N + j] && label > j) {\n\t\t\t\t\tlabel = j;\n\t\t\t\t\tchanged = true;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t\tif (tid == 0) {\n\t\t\tif (label < numLabels) {\n\t\t\t\tnumLabels = label;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n\t*numComponents = numLabels;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] != 0) {\n count++;\n for (int j = i + 1; j < N; j++) {\n if (A[tid * N + j] != 0) {\n count++;\n }\n }\n break;\n }\n }\n atomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint component = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcomponent = 1;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tatomicAdd(numComponents, component);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\tif (A[i * N + j] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N || tid_y >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tbool visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = false;\n\t}\n\tvisited[tid] = true;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N 
+ i] == 1 && !visited[i]) {\n\t\t\tcount++;\n\t\t\t__syncblocks();\n\t\t\tDFS(A, i, visited, N);\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx.x + hipBlockIdx.x * hipBlockDim.x;\n\tint bid = hipBlockIdx.y + hipBlockIdx.z * hipGridDim.z;\n\tint i = tid + bid * hipBlockDim.x * hipGridDim.x;\n\tint count = 0;\n\twhile (i < N * N) {\n\t\tif (A[i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t\ti += N * N;\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint id = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (id >= N) return;\n\tint count = 0;\n\tbool visited[N];\n\tmemset(visited, 0, sizeof(visited));\n\tfor (int i = 0; i < N; i++) {\n\t if (visited[i]) continue;\n\t visited[i] = true;\n\t count++;\n\t for (int j = 0; j < N; j++) {\n\t if (A[i * N + j] == 0) continue;\n\t visited[j] = true;\n\t }\n\t}\n\tnumComponents[id] = count;\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tint count = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tint n = hipBlockDim_x * hipGridDim_x;\n\tint count = 0;\n\tif (tid == 0) {\n\t\tcount = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*numComponents = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The component count is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n int global_count = 0;\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n local_count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM);\n if (rank == 0) {\n return global_count;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its own subset of rows.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N];\n std::fill(visited, visited + N, false);\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && !visited[j]) {\n ++localCount;\n std::queue q;\n q.push(j);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n for (size_t v = 0; v < N; ++v) {\n if (A[u * N + v] && !visited[v]) {\n q.push(v);\n }\n }\n }\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int component_count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n component_count++;\n }\n }\n\n int global_component_count;\n MPI_Reduce(&component_count, &global_component_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_component_count;\n }\n return 0;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit\n while (!stack.empty()) {\n // Pop the top vertex from the stack\n size_t vertex = stack.top();\n stack.pop();\n\n // Mark the vertex as visited\n visited[vertex] = true;\n\n // Iterate over all adjacent vertices\n for (size_t j = 0; j < N; j++) {\n // If the adjacent vertex is connected and has not been visited, push it onto the stack\n if (A[vertex][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the 
visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n count++;\n std::queue q;\n q.push(i);\n\n // Perform a breadth-first search to visit all nodes in the component\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n\n // Iterate over the neighbors of u\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all the unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of connected components in its portion of the graph.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n localCount++;\n break;\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n 
MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n count++;\n while (!q.empty()) {\n int cur = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[cur * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) return global_count;\n return -1;\n}\n", + "\n\n int n_components = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n n_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n\n int global_n_components;\n MPI_Allreduce(&n_components, &global_n_components, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_n_components;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n\n int globalCount;\n MPI_Reduce(&count, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then push the neighbor onto the stack\n if (!visited[j] && A[i][j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Count the number of connected 
components\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) continue;\n visited[u] = true;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its local rows\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited = false;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] && !visited) {\n localCount++;\n visited = true;\n }\n }\n }\n\n // Reduce the local counts to get the global count\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t u = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[u] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then push the neighbor onto the stack\n if (!visited[j] && A[i][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n // Iterate over the neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its local part of the graph\n int local_count = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N] = {false};\n for (size_t j = 0; j < N; j++) {\n if (!visited[j] && A[i][j] == 1) {\n // Found a new component\n local_count++;\n std::queue q;\n q.push(j);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n for (size_t v = 0; v < N; v++) {\n if (!visited[v] && A[u][v] == 1) {\n q.push(v);\n }\n }\n }\n }\n }\n }\n\n // Reduce the local counts to get the global count\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is connected and has not been visited, then add it to the stack\n if (A[node][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of connected components in its own subset of rows.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N] = {false};\n for (size_t j = 0; j < N; j++) {\n if (!visited[j] && A[i][j] == 1) {\n localCount++;\n dfs(A, visited, j, 
N);\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "cuda", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void componentCount(const int *A, size_t N, int *numComponents) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tint row = tid / N;\n\tint col = tid % N;\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\tatomicAdd(numComponents, 1);\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] != 0) {\n\t\t\tA[row * N + i] = 0;\n\t\t\tA[i * N + row] = 0;\n\t\t\tcomponentCount<<<1, N>>>(A, N, numComponents);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = -1;\n\t\t} else {\n\t\t\tnumComponents[tid] = tid;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && numComponents[tid] != -1) {\n\t\t\tnumComponents[k] = numComponents[tid];\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid < N && tid_y < N) {\n\t if (A[tid * N + tid_y] == 1) {\n\t atomicAdd(numComponents, 1);\n\t }\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tint row = tid / N;\n\tint col = tid % N;\n\tif (A[tid] == 0) {\n\t\tnumComponents[tid] = 0;\n\t\treturn;\n\t}\n\tint component = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] == 1) {\n\t\t\tcomponent = i;\n\t\t\tbreak;\n\t\t}\n\t}\n\tnumComponents[tid] = component;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tbool visited = false;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tvisited = true;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tif (visited) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\t// Each thread checks if the current element is non-zero.\n\t// If so, it increments the component count.\n\tint row = blockIdx.x;\n\tint col = blockIdx.y;\n\tif (A[row * N + col] != 0) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) 
{\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint bid = blockIdx.y * blockDim.y + blockIdx.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[bid] == 0) {\n\t\tvisited[bid] = 1;\n\t\t(*numComponents)++;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcomponentCount(A, N, numComponents);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = 0;\n\t\t} else {\n\t\t\tnumComponents[tid] = 1;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tint numComponents_local = numComponents[tid];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && numComponents_local != -1) {\n\t\t\tnumComponents[k] = numComponents_local;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (i == 0) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (numComponents[k] != -1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t*numComponents = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint bid = blockIdx.y * blockDim.y + blockIdx.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[bid] == 0) {\n\t\tvisited[bid] = 1;\n\t\t*numComponents += 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcomponentCount(A, N, numComponents);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tidx = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tidy = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tidx >= N || tidy >= N) {\n\t\treturn;\n\t}\n\tif (A[tidx * N + tidy] == 0) {\n\t\treturn;\n\t}\n\tatomicAdd(numComponents, 1);\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[tid] == 0) {\n\t\tvisited[tid] = 1;\n\t\t__syncthreads();\n\t\tint count = 1;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tif (A[tid * N + j] == 1 && visited[j] == 0) {\n\t\t\t\tvisited[j] = 1;\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t\tatomicAdd(numComponents, count);\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\tint component = tid;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tcomponent = min(component, i);\n\t\t}\n\t}\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tA[tid_y * N + i] = component;\n\t\t}\n\t}\n\tif (tid == component) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid 
% N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = 0;\n\t\t} else {\n\t\t\tnumComponents[tid] = 1;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tint numComponents_local = numComponents[tid];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1) {\n\t\t\tnumComponents_local = min(numComponents_local, numComponents[k * N + j]);\n\t\t}\n\t}\n\tnumComponents[tid] = numComponents_local;\n\t__syncthreads();\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tcount += (numComponents[k * N + j] == numComponents_local);\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid_row == tid_col) {\n\t\tif (A[tid] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j && A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tidx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tidy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (tidx >= N || tidx >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tidx * N + i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The component count is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n local_count++;\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+: count)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top vertex from the stack\n size_t v = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and is connected to the vertex, add it to the stack\n if (!visited[j] && A[i][j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n count++;\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a visited array to keep track of which nodes have been visited\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over all the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then add the neighbor to the stack\n if (!visited[j] && A[node * N + j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) 
{\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int count = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (visited[u] == 0) {\n count++;\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) { q.push(j); }\n }\n }\n }\n local_count += count > 0;\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) { return global_count; }\n return -1;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If the neighbor has not been visited and is connected to the node, then visit the neighbor\n if (!visited[j] && A[i][j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n count++;\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n 
visited[node] = true;\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Visit all the unvisited neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && !visited[j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n MPI_Reduce(&componentCount, &componentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a DFS to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // Perform a DFS to visit all connected nodes\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the 
neighbor has not been visited, push the neighbor onto the stack and mark it as visited\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n while (!stack.empty()) {\n // Get the current node\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the current node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and is connected to the current node, then add it to the stack\n if (!visited[j] && A[i][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n MPI_Reduce(&componentCount, nullptr, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return componentCount;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n int global_count = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n int my_start = rank * (N / size);\n int my_end = (rank + 1) * (N / size);\n if (rank == size - 1) {\n my_end = N;\n }\n #pragma omp parallel for reduction(+ : local_count)\n for (int i = my_start; i < my_end; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_count++;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n delete[] visited;\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[j] = true;\n\n // Push all the unvisited neighbors of the node onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[i * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not 
been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all the neighbors of the node onto the stack\n for (size_t j = 0; j < N; ++j) {\n if (A[node * N + j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n // Push all the unvisited neighbors of the vertex onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a 
depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n count++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; k++) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count across all ranks\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the connected components for a subset of the rows\n int start_row = rank * (N / size);\n int end_row = (rank + 1) * (N / size);\n if (rank == size - 1) {\n end_row = N;\n }\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Count the number of connected components\n int count = 0;\n for (int i = start_row; i < end_row; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n\n // Reduce the count across all ranks\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "&" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A represents a directed graph.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) { count += A(i, j); });\n });\n return count;\n}\n", + "\n\n int edge_count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { edge_count += A(i, j); }\n });\n return edge_count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(\"edgeCount\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) {\n count += A(i, j);\n }\n });\n return count;\n}\n", + "\n\n Kokkos::View edgeCount(\"edgeCount\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n edgeCount(i) = 0;\n for (size_t j = 0; j < N; ++j) { edgeCount(i) += A(i, j); }\n });\n int totalEdgeCount = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &total) { total += edgeCount(i); }, totalEdgeCount);\n return totalEdgeCount;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, int &edgeCount) {\n for (size_t j = 0; j < N; ++j) {\n edgeCount += A(i, j);\n }\n },\n edgeCount);\n return edgeCount;\n}\n", + "\n\n int edge_count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n if (A(i, j) > 0) {\n ++edge_count;\n }\n }\n });\n return edge_count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n 
Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) { count += A(i, j); });\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n count += A(i, j);\n }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA (const int i) {\n for (int j = 0; j < N; j++) {\n if (A(i, j) != 0) {\n count++;\n }\n }\n });\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) { count += A[i * N + j] > 0; }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int edge_count = 0;\n#pragma omp parallel for reduction(+ : edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n edge_count += A[i * N + j];\n }\n }\n return edge_count;\n}\n", + "\n\n int count = 0;\n#pragma opm parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n 
for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i][j] > 0) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "serial", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n 
for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "hip", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n Store the result in numEdges. A represents a directed graph.\n A is an NxN adjacency matrix stored in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void edgeCount(const int *A, size_t N, int *numEdges) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int count = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n hipAtomicAdd(numEdges, count);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[i * N + j];\n }\n }\n __syncthreads();\n atomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint numEdgesLocal = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tnumEdgesLocal += A[tid * N + i];\n\t}\n\thipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[i * N + j] > 0;\n }\n }\n atomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n int tid = threadIdx.x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += (A[tid * N + i] != 0);\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n for (int j = 0; j < N; j++) {\n if (A[tid * N + j] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[tid * N + j];\n }\n hipAtomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n }\n}\n", + "\n\n int tid = 
hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int row = tid;\n int col = tid;\n int count = 0;\n while (row < N) {\n count += A[row * N + col];\n row += hipBlockDim_x;\n col += hipBlockDim_x;\n }\n hipAtomicAdd(numEdges, count);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) 
{\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + ", \u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2 Saxe\u51a2 Saxe\u51a2\u51a2\u51a2 Saxe\u51a2\u51a2\u51a2\u51a2", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] != 0) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + 
"\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "cuda", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n Store the result in numEdges. A represents a directed graph.\n A is an NxN adjacency matrix stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void edgeCount(const int *A, size_t N, int *numEdges) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x;\n int count = 0;\n for (int col = 0; col < N; col++) {\n count += A[row * N + col];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int rowStart = blockIdx.x * blockDim.x + threadIdx.x;\n if (rowStart >= N) {\n return;\n }\n int rowEnd = rowStart + blockDim.x;\n if (rowEnd > N) {\n rowEnd = N;\n }\n int count = 0;\n for (int i = rowStart; i < rowEnd; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] != 0) {\n count++;\n }\n }\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int j = 0; j < N; j++) {\n if (A[tid * N + j] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\nint rowStart = blockIdx.x * blockDim.x + threadIdx.x;\nif (rowStart >= N) {\n return;\n}\nint rowEnd = rowStart + blockDim.x;\nif (rowEnd > N) {\n rowEnd = N;\n}\nint count = 0;\nfor (int i = rowStart; i < rowEnd; ++i) {\n for (int j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n}\natomicAdd(numEdges, count);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; ++col) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x 
+ threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row < N && col < N) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] == 1) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row >= N) {\n return;\n }\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; ++col) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma parallel for reduction (+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Allgather(&local_count, 1, MPI_INT, &global_count, 1, MPI_INT, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n int nthreads;\n omp_set_num_threads(nthreads);\n int local_count = 0;\n int global_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (myrank == 0) {\n return global_count;\n } else {\n return 0;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n 
local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n int global_count;\n\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "\n\n int local_edge_count = 0;\n int global_edge_count;\n #pragma omp parallel for reduction(+: local_edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_edge_count += A[i * N + j];\n }\n }\n MPI_Reduce(&local_edge_count, &global_edge_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_edge_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_edge_count = 0;\n #pragma omp parallel for reduction(+: local_edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_edge_count += A[i * N + j];\n }\n }\n int global_edge_count;\n MPI_Reduce(&local_edge_count, &global_edge_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_edge_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n 
int global_count = 0;\n #pragma omp parallel for reduction(+: local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max = 0;\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; ++i) {\n // Iterate over the columns of the adjacency matrix\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between node i and node j, increment the degree of node i\n if (A(i, j) == 1) {\n ++max;\n }\n }\n }\n // Return the maximum degree\n return max;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n Kokkos::View maxDegree(\"maxDegree\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n maxDegree(i) = degree;\n });\n return *Kokkos::max_element(maxDegree);\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &max_degree) {\n int degree = 0;\n for (int j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n max_degree 
= Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A(i, j);\n }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &maxDegree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n maxDegree = Kokkos::max(maxDegree, degree);\n });\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::View;\n\n int max_degree = 0;\n Kokkos::parallel_reduce(\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_degree) {\n int degree = 0;\n for (int j = 0; j < N; j++) {\n degree += A(i, j);\n }\n max_degree = std::max(max_degree, degree);\n },\n max_degree);\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n if (degree > max_degree) { max_degree = degree; }\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n Kokkos::View maxDegree(\"maxDegree\", N);\n Kokkos::parallel_for(\"maxDegree\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A(i, j);\n }\n maxDegree(i) = degree;\n });\n int max = 0;\n for (size_t i = 0; i < N; i++) {\n if (maxDegree(i) > max) {\n max = maxDegree(i);\n }\n }\n return max;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "_t>", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n 
max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "serial", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_deg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n max_deg = std::max(max_deg, deg);\n }\n return max_deg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) { deg += A[i * N + j]; }\n maxDeg = std::max(maxDeg, deg);\n }\n return maxDeg;\n}\n", + "\n\n int max_deg = 0;\n for (int i = 0; i < N; i++) {\n int deg = 0;\n for (int j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n max_deg = std::max(max_deg, deg);\n }\n return max_deg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i][j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int 
max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) { deg += A[i * N + j]; }\n maxDeg = std::max(maxDeg, deg);\n }\n return maxDeg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "hip", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Store the result in maxDegree.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void maxDegree(const int *A, size_t N, int *maxDegree) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n localMax = max(localMax, *maxDegree);\n if (threadIdx.x == 0) {\n *maxDegree = localMax;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n atomicMax(&(*maxDegree), max);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n hipAtomicMax(&(*maxDegree), degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n hipAtomicMax(maxDegree, degree);\n}\n", + "\n\n int myDegree = 0;\n for (int i = blockIdx.x * blockDim.x + 
threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; ++i) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint row = bid * hipBlockDim_x + tid;\n\tint degree = 0;\n\tif (row < N) {\n\t\tfor (int col = 0; col < N; col++) {\n\t\t\tdegree += A[row * N + col];\n\t\t}\n\t}\n\thipAtomicMax(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n *maxDegree = hipAtomicMax(*maxDegree, degree);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n }\n int max_degree;\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "\n\n int my_max = 0;\n for (size_t i = 0; i < N; i++) {\n int row_sum = 0;\n for (size_t j = 0; j < N; j++) {\n row_sum += A[i * N + j];\n }\n my_max = std::max(my_max, row_sum);\n }\n\n int global_max;\n MPI_Allreduce(&my_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int rowSum = 0;\n for (size_t j = 0; j < N; j++) { rowSum += A[i * N + j]; }\n myDegree = std::max(myDegree, rowSum);\n }\n\n int globalDegree;\n MPI_Allreduce(&myDegree, &globalDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] != 0) {\n myDegree++;\n }\n }\n }\n int maxDegree = 0;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n maxDeg = std::max(maxDeg, deg);\n }\n int globalMaxDeg;\n MPI_Allreduce(&maxDeg, &globalMaxDeg, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDeg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n myDegree++;\n }\n }\n }\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int local_max = 0;\n for (size_t i = 0; i < N; i++) {\n int local_degree = 0;\n for (size_t j = 0; j < N; j++) {\n local_degree += A[i * N + j];\n }\n local_max = std::max(local_max, local_degree);\n }\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n if (degree > maxDegree) {\n maxDegree = degree;\n 
}\n }\n return maxDegree;\n}\n", + "\n\n int local_max = 0;\n for (size_t i = 0; i < N; i++) {\n int row_sum = 0;\n for (size_t j = 0; j < N; j++) {\n row_sum += A[i * N + j];\n }\n local_max = std::max(local_max, row_sum);\n }\n\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int my_max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n my_max_degree = std::max(my_max_degree, degree);\n }\n\n int max_degree;\n MPI_Allreduce(&my_max_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "\n\n int my_degree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n\n int max_degree;\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "Degree)", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n myDegree += A[i * N + j];\n }\n }\n\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n\n int globalMaxDegree;\n MPI_Allreduce(&myDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n myDegree += A[i * N + j];\n }\n }\n\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n degree++;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, °ree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n }\n return maxDegree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "cuda", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph. \n Store the result in maxDegree.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void maxDegree(const int *A, size_t N, int *maxDegree) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n if (localMax > *maxDegree) {\n *maxDegree = localMax;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n if (tid == 0) {\n *maxDegree = max;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, myDegree);\n}\n", + "\n\n int tid = threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n maxDegree[tid] = max;\n}\n", + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = max(*maxDegree, localMax);\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint max = 0;\n\tif (tid < N) {\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tmax += A[tid * N + i];\n\t\t}\n\t}\n\tmaxDegree[tid] = max;\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tint max_degree = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tmax_degree = max(max_degree, maxDegree[i]);\n\t\t}\n\t\t*maxDegree = max_degree;\n\t}\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, 
myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = myDegree;\n for (size_t i = 1; i < gridDim.x; i++) {\n int otherDegree;\n cudaMemcpy(&otherDegree, &A[i * blockDim.x * N], sizeof(int), cudaMemcpyDeviceToHost);\n *maxDegree = max(*maxDegree, otherDegree);\n }\n }\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = max(*maxDegree, myDegree);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "Degree,", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(&*maxDegree, myDegree);\n}\n", + "\n\n\tint myMaxDegree = 0;\n\tint rowStart = blockIdx.x * blockDim.x + threadIdx.x;\n\tfor (int i = rowStart; i < N; i += blockDim.x * gridDim.x) {\n\t\tint degree = 0;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tdegree += A[i * N + j];\n\t\t}\n\t\tmyMaxDegree = max(myMaxDegree, degree);\n\t}\n\tatomicAdd(maxDegree, myMaxDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n myDegree++;\n }\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n int my_degree = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk_size = N / nprocs;\n int start_row = rank * chunk_size;\n int end_row = start_row + chunk_size;\n if (rank == nprocs - 1) {\n end_row = N;\n }\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_degree;\n }\n return -1;\n}\n", + "\n\n int max_degree = 0;\n int my_max_degree = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel for reduction(max : my_max_degree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_max_degree += A[i * N + j];\n }\n }\n\n MPI_Allreduce(&my_max_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return max_degree;\n } else {\n return -1;\n }\n}\n", + "\n\n int maxDegree = 0;\n int localMaxDegree = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel for reduction(max : localMaxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n localMaxDegree += A[i * N + j];\n }\n }\n\n MPI_Allreduce(&localMaxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int local_max_degree = 0;\n int global_max_degree = 0;\n\n // Compute the local maximum degree\n #pragma omp parallel for reduction(max : local_max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n local_max_degree = std::max(local_max_degree, degree);\n }\n\n // Reduce the local maximum degrees to find the global maximum degree\n MPI_Allreduce(&local_max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max_degree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n\n #pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, my_degree);\n my_degree = 0;\n }\n\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int localMaxDegree = 0;\n #pragma omp parallel for reduction(max:localMaxDegree)\n for (size_t i = 0; i < N; i++) {\n 
int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n localMaxDegree = std::max(localMaxDegree, degree);\n }\n MPI_Reduce(&localMaxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int myDegree = 0;\n #pragma omp parallel for reduction(max:myDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n myDegree = std::max(myDegree, degree);\n }\n\n int globalDegree;\n MPI_Reduce(&myDegree, &globalDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return globalDegree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n #pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int local_max = 0;\n for (int i = 0; i < N; i++) {\n int local_count = 0;\n for (int j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n local_max = std::max(local_max, local_count);\n }\n\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n #pragma omp parallel for reduction(max: maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n #pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, my_degree);\n my_degree = 0;\n }\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n MPI_Reduce(&maxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n int max_degree = 
0;\n #pragma opm parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int num_threads;\n omp_set_num_threads(4);\n #pragma omp parallel\n {\n #pragma omp for reduction(+:my_degree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n }\n MPI_Reduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(Kokkos::View &A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n visited[i] = 0;\n });\n\n dist[source] = 0;\n visited[source] = 1;\n\n int minDist = INT_MAX;\n\n while (visited[dest] == 0) {\n int minDistNode = -1;\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0 && (minDistNode == -1 || dist[i] < dist[minDistNode])) {\n minDistNode = i;\n }\n }\n\n if (minDistNode == -1) {\n break;\n }\n\n visited[minDistNode] = 1;\n\n for (int i = 0; i < N; i++) {\n if (A(minDistNode, i) == 1 && visited[i] == 0) {\n if (dist[i] == -1 || dist[i] > dist[minDistNode] + 1) {\n dist[i] = dist[minDistNode] + 1;\n }\n }\n }\n }\n\n if (dist[dest] == -1) {\n return -1;\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n Kokkos::View visited(\"visited\", N);\n Kokkos::parallel_for(\"bfs\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n visited[i] = false;\n });\n visited[source] = true;\n Kokkos::View queue(\"queue\", N);\n int head = 0;\n int tail = 0;\n queue[tail++] = source;\n while (head < tail) {\n int u = queue[head++];\n for (int v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n queue[tail++] = v;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { distances[i] = std::numeric_limits::max(); });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int current = q.front();\n q.pop();\n\n // Iterate over all neighbors of the current node\n for (size_t i = 0; i < N; i++) {\n if (A(current, i) == 1 && distances[i] == std::numeric_limits::max()) {\n distances[i] = distances[current] + 1;\n q.push(i);\n }\n }\n }\n\n // Return the distance to the destination\n return distances[dest];\n}\n", + "\n\n 
Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n dist() = -1;\n dist(source) = 0;\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n visited(source) = true;\n Kokkos::View queue(\"queue\", N);\n queue() = -1;\n int head = 0;\n int tail = 0;\n queue(tail++) = source;\n while (head != tail) {\n int u = queue(head++);\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited(v)) {\n dist(v) = dist(u) + 1;\n visited(v) = true;\n queue(tail++) = v;\n }\n }\n }\n return dist(dest);\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) { distances[i] = INT_MAX; });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Queue of nodes to visit\n std::queue q;\n q.push(source);\n\n // While there are nodes to visit\n while (!q.empty()) {\n // Get the current node\n int current = q.front();\n q.pop();\n\n // If the current node is the destination, return the distance\n if (current == dest) {\n return distances[current];\n }\n\n // Visit all neighbors of the current node\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A(current, i) == 1) {\n // If the distance to the neighbor is greater than the distance to the current node plus 1\n if (distances[i] > distances[current] + 1) {\n // Update the distance to the neighbor\n distances[i] = distances[current] + 1;\n // Add the neighbor to the queue\n q.push(i);\n }\n }\n }\n }\n\n // If there is no path from the source to the destination, return -1\n return -1;\n}\n", + "\n\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = Kokkos::TeamMember;\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::View distance(\"distance\", N);\n Kokkos::View queue(\"queue\", N);\n\n visited[source] = 1;\n distance[source] = 0;\n queue[0] = source;\n\n int head = 0;\n int tail = 1;\n\n while (head < tail) {\n int u = queue[head++];\n for (int v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n visited[v] = 1;\n distance[v] = distance[u] + 1;\n queue[tail++] = v;\n }\n }\n }\n\n return distance[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) { distances[i] = std::numeric_limits::max(); });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Initialize the queue with the source\n std::queue q;\n q.push(source);\n\n // While the queue is not empty\n while (!q.empty()) {\n // Get the current vertex\n int current = q.front();\n q.pop();\n\n // If the current vertex is the destination, return the distance\n if (current == dest) {\n return distances[current];\n }\n\n // For each neighbor of the current vertex\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current vertex and the neighbor\n if (A(current, i) == 1) {\n // If the distance to the neighbor is greater than the distance to the current vertex plus 1\n if (distances[i] > distances[current] + 1) {\n // Update the distance to the neighbor\n distances[i] = distances[current] + 1;\n\n // Add the neighbor to the queue\n 
q.push(i);\n }\n }\n }\n }\n\n // If the destination is not reachable, return -1\n return -1;\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n size_t head = 0;\n size_t tail = 0;\n queue[tail++] = source;\n\n while (head != tail) {\n int u = queue[head++];\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n queue[tail++] = v;\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist(i) = -1;\n visited(i) = false;\n queue(i) = false;\n });\n\n dist(source) = 0;\n queue(source) = true;\n\n while (true) {\n int min_dist = -1;\n int min_idx = -1;\n for (int i = 0; i < N; i++) {\n if (queue(i) && (min_dist == -1 || dist(i) < min_dist)) {\n min_dist = dist(i);\n min_idx = i;\n }\n }\n\n if (min_idx == -1) {\n break;\n }\n\n queue(min_idx) = false;\n visited(min_idx) = true;\n\n for (int i = 0; i < N; i++) {\n if (A(min_idx, i) > 0 && !visited(i)) {\n if (dist(i) == -1 || dist(i) > dist(min_idx) + 1) {\n dist(i) = dist(min_idx) + 1;\n queue(i) = true;\n }\n }\n }\n }\n\n return dist(dest);\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t i = 0; i < N; i++) {\n if (A(u, i) && dist[i] == -1) {\n dist[i] = dist[u] + 1;\n q.push(i);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View distance = Kokkos::View(\"distance\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) { distance[i] = -1; });\n distance[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && distance[v] == -1) {\n distance[v] = distance[u] + 1;\n q.push(v);\n }\n }\n }\n return distance[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = INT_MAX;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n q.push(v);\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && dist[v] == -1) {\n 
dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n Kokkos::View queue_head(\"queue_head\", 1);\n Kokkos::View queue_tail(\"queue_tail\", 1);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n queue_head[0] = 0;\n queue_tail[0] = 1;\n queue[0] = source;\n\n while (queue_head[0] < queue_tail[0]) {\n int u = queue[queue_head[0]];\n queue_head[0]++;\n for (int v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n visited[v] = true;\n dist[v] = dist[u] + 1;\n queue[queue_tail[0]] = v;\n queue_tail[0]++;\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n dist() = -1;\n dist(source) = 0;\n\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n visited(source) = true;\n\n Kokkos::View queue(\"queue\", N);\n queue() = -1;\n int head = 0;\n int tail = 0;\n queue(tail++) = source;\n\n while (head != tail) {\n int u = queue(head++);\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && !visited(v)) {\n dist(v) = dist(u) + 1;\n visited(v) = true;\n queue(tail++) = v;\n }\n }\n }\n\n return dist(dest);\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n distances() = std::numeric_limits::max();\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n distances(source) = 0;\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t i = 0; i < N; i++) {\n if (A(u, i) == 1 && distances(i) == std::numeric_limits::max()) {\n q.push(i);\n distances(i) = distances(u) + 1;\n }\n }\n }\n\n // Return distance to destination\n return distances(dest);\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n distances() = std::numeric_limits::max();\n\n // Initialize queue with source node\n std::queue queue;\n queue.push(source);\n distances(source) = 0;\n\n // Perform BFS\n while (!queue.empty()) {\n int current = queue.front();\n queue.pop();\n\n // Iterate over neighbors of current node\n for (size_t i = 0; i < N; i++) {\n if (A(current, i) == 1 && distances(i) == std::numeric_limits::max()) {\n distances(i) = distances(current) + 1;\n queue.push(i);\n }\n }\n }\n\n // Return distance to destination\n return distances(dest);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector dist(N, -1);\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[u] + 1 < dist[v]) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n 
// Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] > dist[u] + 1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize the distance to all nodes to infinity\n std::vector distance(N, INT_MAX);\n\n // Set the distance to the source node to 0\n distance[source] = 0;\n\n // Create a queue to store the nodes to be visited\n std::queue queue;\n\n // Enqueue the source node\n queue.push(source);\n\n // While there are still nodes to be visited\n while (!queue.empty()) {\n // Dequeue the next node\n int current = queue.front();\n queue.pop();\n\n // Visit all the neighbors of the current node\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A[current][i] > 0) {\n // If the distance to the neighbor is greater than the distance to the current node plus the weight of the edge\n if (distance[i] > distance[current] + A[current][i]) {\n // Update the distance to the neighbor\n distance[i] = distance[current] + A[current][i];\n\n // Enqueue the neighbor\n queue.push(i);\n }\n }\n }\n }\n\n // Return the distance to the destination node\n return distance[dest];\n}\n", + "\n\n int dist[N];\n std::fill(dist, 
dist + N, -1);\n dist[source] = 0;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n if (u == dest) {\n return dist[u];\n }\n\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int min_dist = INT_MAX;\n std::vector dist(N, INT_MAX);\n std::vector visited(N, false);\n\n dist[source] = 0;\n visited[source] = true;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n if (A[source][i] == 1) {\n dist[i] = 1;\n visited[i] = true;\n }\n }\n\n #pragma omp for\n for (size_t k = 0; k < N; k++) {\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i][k] == 1 && A[k][j] == 1 && dist[i] != INT_MAX && dist[j] > dist[i] + 1) {\n dist[j] = dist[i] + 1;\n visited[j] = true;\n }\n }\n }\n }\n }\n\n min_dist = dist[dest];\n return min_dist == INT_MAX ? -1 : min_dist;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "serial", + "prompt": "/* 
Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, 0);\n std::queue> q;\n q.push({source, 0});\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n if (visited[node] == 1) {\n continue;\n }\n visited[node] = 1;\n for (size_t i = 0; i < N; i++) {\n if (A[node][i] == 1) {\n q.push({i, dist + 1});\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n visited[source] = true;\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n for (int i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n visited[i] = true;\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n visited[source] = 
true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n dist[v] = dist[u] + 1;\n visited[v] = true;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) return dist;\n if (visited[node]) continue;\n visited[node] = true;\n for (size_t i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n if (v == dest) {\n return dist[v];\n }\n }\n }\n }\n return -1;\n}\n", + "=", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n q.push(v);\n dist[v] = dist[u] + 1;\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == -1) {\n q.push(v);\n dist[v] = dist[u] + 1;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push(std::make_pair(source, 0));\n while (!q.empty()) {\n std::pair p = q.front();\n q.pop();\n if (p.first == dest) {\n return p.second;\n }\n if (visited[p.first]) {\n continue;\n }\n visited[p.first] = true;\n for (size_t i = 0; i < N; ++i) {\n if (A[p.first][i] == 1) {\n q.push(std::make_pair(i, p.second + 1));\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n visited[source] = true;\n\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n for (int i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n visited[i] = true;\n }\n }\n }\n\n return -1;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "hip", + "prompt": "/* Compute the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Store the result in pathLength.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\n__global__ void shortestPathLength(const int *A, size_t N, int source, int dest, int *pathLength) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\n if (tid == source) {\n pathLength[bid] = 0;\n } else {\n pathLength[bid] = -1;\n }\n\n __syncthreads();\n\n for (int i = 0; i < N; i++) {\n if (pathLength[bid] >= 0 && A[tid * N + i] == 1) {\n if (i == dest) {\n pathLength[bid] = pathLength[bid] + 1;\n break;\n } else if (pathLength[i * N + dest] == -1 ||\n pathLength[i * N + dest] > pathLength[bid] + 1) {\n pathLength[i * N + dest] = pathLength[bid] + 1;\n }\n }\n }\n}\n", + "\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint distance = INT_MAX;\n\tif (tid == source) {\n\t\tdistance = 0;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[tid * N + k] == 1) {\n\t\t\tif (distance + 1 < pathLength[k]) {\n\t\t\t\tpathLength[k] = distance + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n\tif (tid == dest) {\n\t\t*pathLength = distance;\n\t}\n}\n", + "\n\n int n = hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x;\n int m = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x;\n if (n == source) {\n pathLength[n] = 0;\n __syncthreads();\n for (int i = 0; i < N; ++i) {\n if (A[n * N + i] == 1) {\n pathLength[i] = 1;\n }\n }\n } else {\n __syncthreads();\n for (int i = 0; i < N; ++i) {\n if (A[n * N + i] == 1) {\n if (pathLength[i] != -1) {\n pathLength[n] = pathLength[i] + 1;\n }\n }\n }\n }\n __syncthreads();\n if (n == dest) {\n *pathLength = pathLength[n];\n }\n}\n", + "\n\n int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int k = 0; k < N; k++) {\n if (A[tid * N + k] == 1 && tid != k) {\n if (pathLength[tid] == -1 || pathLength[k] + 1 < pathLength[tid]) {\n pathLength[tid] = pathLength[k] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n if (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n pathLength[i] = pathLength[tid] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n if (pathLength[i] == -1 || pathLength[i] > 
pathLength[tid] + 1) {\n pathLength[i] = pathLength[tid] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == source) {\n pathLength[bid] = 0;\n } else {\n pathLength[bid] = -1;\n }\n __syncthreads();\n int minPath = pathLength[bid];\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] && pathLength[i] != -1) {\n minPath = min(minPath, pathLength[i] + 1);\n }\n }\n pathLength[bid] = minPath;\n __syncthreads();\n if (tid == dest) {\n *pathLength = minPath;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tif (tid == source && bid == 0) {\n\t\t*pathLength = 0;\n\t} else if (tid == dest && bid == N - 1) {\n\t\t*pathLength = -1;\n\t} else {\n\t\t*pathLength = INT_MAX;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tif (bid + 1 < N) {\n\t\t\t\tint next = i + (bid + 1) * N;\n\t\t\t\tatomicMin(&pathLength[next], *pathLength + 1);\n\t\t\t}\n\t\t\tif (bid - 1 >= 0) {\n\t\t\t\tint next = i + (bid - 1) * N;\n\t\t\t\tatomicMin(&pathLength[next], *pathLength + 1);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] == 1 && tid != i) {\n if (pathLength[tid] == -1) {\n pathLength[tid] = pathLength[i] + 1;\n } else {\n pathLength[tid] = min(pathLength[tid], pathLength[i] + 1);\n }\n }\n }\n __syncthreads();\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int i = tid / N;\n int j = tid % N;\n if (i == source) {\n pathLength[j] = 1;\n } else {\n pathLength[j] = -1;\n }\n __syncthreads();\n for (int k = 0; k < N; ++k) {\n if (pathLength[k] == -1) {\n continue;\n }\n if (A[i * N + k] == 1) {\n if (pathLength[j] == -1 || pathLength[j] > pathLength[k] + 1) {\n pathLength[j] = pathLength[k] + 1;\n }\n }\n }\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tridy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tint i = tidx;\n\tint j = tridy;\n\tif (i == source) {\n\t\tpathLength[i] = 0;\n\t} else {\n\t\tpathLength[i] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (tid == 
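Editorial note: the HIP prompt above asks for a kernel launched over an N x N grid that leaves the answer in pathLength, and the stored completions mostly rely on __syncthreads() for cross-block ordering, which it does not provide. One conventional alternative is a Bellman-Ford style relaxation kernel driven from the host. The sketch below is illustrative only; the dist buffer, kernel name, and the host-side launch loop it assumes are not taken from the benchmark.

    // Editorial sketch: one Bellman-Ford relaxation sweep per kernel launch.
    // The host is assumed to initialise dist[] to INF, set dist[source] = 0,
    // launch this on an N x N grid up to N-1 times, and then copy dist[dest]
    // into pathLength.
    #include <hip/hip_runtime.h>

    __global__ void relaxEdgesOnce(const int *A, size_t N, int *dist) {
        const size_t i = blockIdx.y * blockDim.y + threadIdx.y;   // edge tail
        const size_t j = blockIdx.x * blockDim.x + threadIdx.x;   // edge head
        if (i >= N || j >= N) return;

        const int INF = 1 << 29;                 // large, but safe to add 1 to
        if (A[i * N + j] && dist[i] < INF) {
            atomicMin(&dist[j], dist[i] + 1);    // unweighted edge costs exactly 1
        }
    }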
source && bid == dest) {\n *pathLength = 0;\n } else if (tid == dest && bid == source) {\n *pathLength = 0;\n } else if (A[bid * N + tid] == 1) {\n *pathLength = 1;\n } else {\n *pathLength = -1;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int distance = INT_MAX;\n if (tid == source) {\n distance = 0;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] && distance + 1 < pathLength[i]) {\n pathLength[i] = distance + 1;\n }\n }\n __syncthreads();\n if (tid == dest) {\n *pathLength = distance;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int distance = INT_MAX;\n if (tid == source) {\n distance = 0;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] == 1 && tid != i) {\n if (distance + 1 < distance) {\n distance = distance + 1;\n }\n }\n __syncthreads();\n }\n if (tid == dest) {\n *pathLength = distance;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "= Synagoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesogues Synag Synag Synag Synagoguesoguesoguesoguesogues", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The shortest path length is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform relaxation\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector gathered_dist(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, gathered_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, gathered_dist[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return shortest path length on rank 0\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 
1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length from source to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distance to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // If we have not reached the destination, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // If u is the destination, return the distance\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // If no path exists, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int dist[N];\n for (int i = 0; i < N; i++) {\n dist[i] = INT_MAX;\n }\n dist[source] = 0;\n\n int next[N];\n for (int i = 0; i < N; i++) {\n next[i] = -1;\n }\n\n int visited[N];\n for (int i = 0; i < N; i++) {\n visited[i] = false;\n }\n\n int u;\n while (true) {\n u = -1;\n for (int i = 0; i < N; i++) {\n if (!visited[i] && (u == -1 || dist[i] < dist[u])) {\n u = i;\n }\n }\n if (u == -1 || dist[u] == INT_MAX) {\n break;\n }\n visited[u] = true;\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] != 0 && dist[u] + 1 < dist[v]) {\n dist[v] = dist[u] + 1;\n next[v] = u;\n }\n }\n }\n\n if (rank == 0) {\n return dist[dest];\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n 
while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == std::numeric_limits::max()) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * num_ranks);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (my_rank == 0) {\n for (int i = 0; i < N * num_ranks; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector dist(N, -1);\n dist[source] = 0;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int curr = q.front();\n q.pop();\n\n if (curr == dest) {\n return dist[curr];\n }\n\n for (int i = 0; i < N; i++) {\n if (A[curr][i] == 1 && dist[i] == -1) {\n dist[i] = dist[curr] + 1;\n q.push(i);\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) 
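Editorial note: for the MPI variant above, where every rank already holds the full adjacency matrix, a simple correct baseline is to run the same serial BFS redundantly on every rank and agree on rank 0's value. The following is an editorial sketch, not a recorded output; the int element type of A is assumed from the 0/1 matrix in the prompt's example.

    // Editorial sketch: redundant serial BFS on every rank, with a broadcast
    // so all ranks return the value held by rank 0.
    #include <mpi.h>
    #include <cstddef>
    #include <queue>
    #include <vector>

    int shortestPathLength(std::vector<int> const& A, size_t N, int source, int dest) {
        std::vector<int> dist(N, -1);
        std::queue<int> q;
        dist[source] = 0;
        q.push(source);

        while (!q.empty()) {
            const int u = q.front();
            q.pop();
            for (size_t v = 0; v < N; ++v) {
                if (A[u * N + v] && dist[v] == -1) {
                    dist[v] = dist[u] + 1;
                    q.push(static_cast<int>(v));
                }
            }
        }

        // Every rank computed the same answer; broadcasting from rank 0 keeps
        // the returned values consistent across ranks.
        int length = dist[dest];
        MPI_Bcast(&length, 1, MPI_INT, 0, MPI_COMM_WORLD);
        return length;
    }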
{\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return shortest path length if destination is reachable\n if (dist[dest] != INT_MAX) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Iterate over all vertices\n for (int i = 0; i < N; ++i) {\n // Each rank computes the shortest path from source to all other vertices\n for (int j = 0; j < N; ++j) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n\n // Gather the distances from all ranks\n std::vector all_dist(N * num_ranks);\n MPI_Allgather(dist.data(), N, MPI_INT, all_dist.data(), N, MPI_INT, MPI_COMM_WORLD);\n\n // Update the distances on each rank\n for (int j = 0; j < N; ++j) {\n for (int k = 0; k < num_ranks; ++k) {\n if (all_dist[k * N + j] < dist[j]) {\n dist[j] = all_dist[k * N + j];\n }\n }\n }\n }\n\n // Return the shortest path length on rank 0\n if (my_rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == std::numeric_limits::max()) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "cuda", + "prompt": "/* Compute the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Store the result in pathLength.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\n__global__ void shortestPathLength(const int *A, size_t N, int source, int dest, int *pathLength) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\n\tint count = 0;\n\twhile (pathLength[dest] == -1) {\n\t\t__syncthreads();\n\t\tif (pathLength[tid] == count) {\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tif (A[tid * N + i] && pathLength[i] == -1) {\n\t\t\t\t\tpathLength[i] = count + 1;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tcount++;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tif (pathLength[j] == -1) {\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] && pathLength[k] != -1) {\n\t\t\t\tpathLength[j] = pathLength[k] + 1;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "_t = 0, 1, 2, 3", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\n\tif (i == source && j == dest) {\n\t\t*pathLength = 0;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 1;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = (A[i * N + j] == 1) ? 
1 : INT_MAX;\n\t}\n\telse {\n\t\tpathLength[j] = INT_MAX;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t}\n\telse if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t}\n\telse {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] != 0) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid_row == source) {\n\t\tpathLength[tid_col] = 1;\n\t} else {\n\t\tpathLength[tid_col] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid_row] > 0 && A[tid_row * N + i] > 0) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid_row] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_row * N + i] == 1 && tid_row != tid_col) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid_row] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1) {\n\t\t\tif (A[tid * N + i] == 1 && pathLength[i] == -1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = (A[i * N + j] == 1) ? 
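Editorial note: the CUDA prompt mirrors the HIP one, so the same one-sweep relaxation kernel applies; what the stored completions mostly leave out is the host loop that repeats the sweep until distances settle. A hedged sketch of both halves follows; the kernel and the host-side driver are assumptions, not part of the benchmark harness.

    // Editorial sketch: relaxation kernel plus an assumed host driver that
    // launches it N-1 times, enough for an unweighted graph to converge.
    #include <cuda_runtime.h>
    #include <vector>

    __global__ void relaxEdges(const int *A, size_t N, int *dist) {
        const size_t i = blockIdx.y * blockDim.y + threadIdx.y;
        const size_t j = blockIdx.x * blockDim.x + threadIdx.x;
        if (i >= N || j >= N) return;
        const int INF = 1 << 29;
        if (A[i * N + j] && dist[i] < INF) atomicMin(&dist[j], dist[i] + 1);
    }

    int shortestPathLengthHost(const int *dA, size_t N, int source, int dest) {
        std::vector<int> h(N, 1 << 29);
        h[source] = 0;
        int *dDist = nullptr;
        cudaMalloc(&dDist, N * sizeof(int));
        cudaMemcpy(dDist, h.data(), N * sizeof(int), cudaMemcpyHostToDevice);

        const dim3 block(16, 16);
        const dim3 grid((N + 15) / 16, (N + 15) / 16);
        for (size_t iter = 0; iter + 1 < N; ++iter)
            relaxEdges<<<grid, block>>>(dA, N, dDist);

        int out = 0;
        cudaMemcpy(&out, dDist + dest, sizeof(int), cudaMemcpyDeviceToHost);
        cudaFree(dDist);
        return out == (1 << 29) ? -1 : out;    // -1 if dest is unreachable
    }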
1 : INT_MAX;\n\t} else {\n\t\tpathLength[j] = INT_MAX;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] != INT_MAX && A[k * N + j] == 1) {\n\t\t\tpathLength[j] = min(pathLength[j], pathLength[k] + 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\n\t__syncthreads();\n\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_row * N + i] == 1 && pathLength[tid_row] != -1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid_row] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\n\t__syncthreads();\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint distance = 0;\n\tif (tid == source) {\n\t\tdistance = 0;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tif (distance + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = distance + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == dest) {\n\t\t*pathLength = distance;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] == -1) {\n\t\t\tcontinue;\n\t\t}\n\t\tif (A[k * N + j] == 1) {\n\t\t\tif (pathLength[j] == -1 || pathLength[j] > pathLength[k] + 1) {\n\t\t\t\tpathLength[j] = pathLength[k] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid_row == source) {\n\t\tpathLength[tid_col] = 1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[tid_row * N + k] && pathLength[tid_row] > 0) {\n\t\t\tpathLength[tid_col] = min(pathLength[tid_col], pathLength[tid_row] + 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tidx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tidy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (tidx == source && tidx == tidy) {\n\t\tpathLength[tidx] = 0;\n\t} else if (tidx == tidy) {\n\t\tpathLength[tidx] = -1;\n\t} else if (A[tidx * N + tidy] == 1) {\n\t\tpathLength[tidx] = 1;\n\t} else {\n\t\tpathLength[tidx] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[tidx] == -1 || pathLength[tidy] == -1) {\n\t\t\tcontinue;\n\t\t}\n\t\tif (pathLength[tidx] + pathLength[tidy] + 1 < pathLength[k]) {\n\t\t\tpathLength[k] = pathLength[tidx] + pathLength[tidy] + 1;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = A[i * N + j];\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tint minPath = pathLength[j];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] >= 0 && A[j * N + k] > 0) {\n\t\t\tint newPath = pathLength[k] + A[j * N + k];\n\t\t\tif (newPath < minPath) {\n\t\t\t\tminPath = newPath;\n\t\t\t}\n\t\t}\n\t}\n\tpathLength[j] = minPath;\n\t__syncthreads();\n\tif (j == dest) {\n\t\t*pathLength = pathLength[j];\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in 
row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The shortest path length is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the distance array to all infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize the queue with the source node\n std::queue q;\n q.push(source);\n\n // Perform a breadth-first search to find the shortest path\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to the destination\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Use OpenMP for parallel BFS\n #pragma omp parallel\n {\n while (true) {\n bool updated = false;\n\n // Relax all edges in parallel\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n updated = true;\n }\n }\n }\n\n // Check if any rank has updated distances\n int global_updated;\n MPI_Allreduce(&updated, &global_updated, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n // If no rank has updated distances, break\n if (!global_updated) {\n break;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n 
#pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length on rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector recv_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &recv_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return recv_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Return the shortest path length\n return shortest_path_length;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n for (int i = tid; i < N; i += num_threads) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Use OpenMP for parallelization within each rank\n #pragma omp parallel\n {\n // Perform BFS using a queue\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // For each neighbor of u\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n }\n\n // Gather the distances from all ranks to rank 0\n std::vector all_dist(N * MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Allgather(&dist[0], N, MPI::INT, &all_dist[0], N, MPI::INT);\n\n // Find the minimum distance to dest\n int min_dist = INT_MAX;\n for (int i = 0; i < N * MPI::COMM_WORLD.Get_size(); ++i) {\n if (all_dist[i] != INT_MAX && i / N == dest) {\n min_dist = std::min(min_dist, all_dist[i]);\n }\n }\n\n // Return the minimum distance\n return min_dist;\n}\n", + "\n\n // Initialize the distance to the source to 0 and all other distances to infinity\n std::vector distance(N, std::numeric_limits::max());\n distance[source] = 0;\n\n // Initialize the queue with the source\n std::queue queue;\n queue.push(source);\n\n // While the queue is not empty\n while (!queue.empty()) {\n // Dequeue the next node\n int current = queue.front();\n queue.pop();\n\n // If the current node is the destination, return the distance\n if (current == dest) {\n return distance[current];\n }\n\n // For each neighbor of the current node\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A[current * N + i] == 1) {\n // If the distance to the neighbor is greater than the distance to the current node plus 1\n if (distance[i] > distance[current] + 1) {\n // Set the distance to the neighbor to the distance to the current node plus 1\n distance[i] = distance[current] + 1;\n // Enqueue the neighbor\n queue.push(i);\n }\n }\n }\n }\n\n // If the destination is not reachable, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n 
std::vector global_dist(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i++) {\n if (i % N == dest) {\n shortest_path_length = std::min(shortest_path_length, global_dist[i]);\n }\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n for (int i = tid; i < N; i += num_threads) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances from all ranks to rank 0\n std::vector global_dist(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length on rank 0\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, global_dist[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int curr = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (curr == dest) {\n return dist[curr];\n }\n\n // Relax neighbors\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n if (A[curr * N + i] && dist[i] > dist[curr] + 1) {\n dist[i] = dist[curr] + 1;\n q.push(i);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n // Initialize the distance array.\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation.\n for (int i = 0; i < N - 1; i++) {\n#pragma omp parallel for\n for (int j = 0; j < N; j++) {\n for (int k = 0; k < N; k++) {\n if (A[j * N + k] != 0 && dist[j] + A[j * N + 
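Editorial note: several of the hybrid completions above perform a single relaxation pass, which only discovers paths of length one or two. A convergent alternative is repeated OpenMP-parallel Jacobi sweeps over a distance array, run redundantly on each rank because A is replicated, with the result broadcast at the end. This is an editorial sketch, not a recorded output; the INF sentinel, the early-exit flag, and the int element type are assumptions.

    // Editorial sketch: Jacobi-style relaxation, each sweep parallelised with
    // OpenMP, repeated until no distance changes (at most N-1 sweeps).
    #include <mpi.h>
    #include <omp.h>
    #include <cstddef>
    #include <vector>

    int shortestPathLength(std::vector<int> const& A, size_t N, int source, int dest) {
        const int INF = 1 << 29;
        std::vector<int> dist(N, INF), next(N, INF);
        dist[source] = 0;

        for (size_t sweep = 0; sweep + 1 < N; ++sweep) {
            int changed = 0;
            // Read from dist, write to next: no thread reads a cell that
            // another thread writes within the same sweep.
            #pragma omp parallel for reduction(|:changed) schedule(static)
            for (size_t v = 0; v < N; ++v) {
                int best = dist[v];
                for (size_t u = 0; u < N; ++u) {
                    if (A[u * N + v] && dist[u] + 1 < best) {
                        best = dist[u] + 1;
                    }
                }
                next[v] = best;
                if (best != dist[v]) changed = 1;
            }
            dist.swap(next);
            if (!changed) break;
        }

        int length = (dist[dest] == INF) ? -1 : dist[dest];
        MPI_Bcast(&length, 1, MPI_INT, 0, MPI_COMM_WORLD);  // rank 0's value everywhere
        return length;
    }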
k] < dist[k]) {\n        dist[k] = dist[j] + A[j * N + k];\n      }\n    }\n  }\n  }\n\n  // Check for negative-weight cycles.\n  for (int i = 0; i < N - 1; i++) {\n#pragma omp parallel for\n    for (int j = 0; j < N; j++) {\n      for (int k = 0; k < N; k++) {\n        if (A[j * N + k] != 0 && dist[j] + A[j * N + k] < dist[k]) {\n          return -1;\n        }\n      }\n    }\n  }\n\n  // Return the shortest path length.\n  return dist[dest];\n}\n", +        "\n\n    int rank, size;\n    MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n    MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n    // Initialize distances to infinity\n    std::vector<int> dist(N, INT_MAX);\n    dist[source] = 0;\n\n    // Perform BFS using MPI and OpenMP\n    #pragma omp parallel\n    {\n        int tid = omp_get_thread_num();\n        int num_threads = omp_get_num_threads();\n\n        #pragma omp for\n        for (int i = 0; i < N; i++) {\n            if (A[source][i] == 1) {\n                dist[i] = 1;\n            }\n        }\n\n        #pragma omp barrier\n\n        for (int j = 1; j < N; j++) {\n            #pragma omp for\n            for (int i = 0; i < N; i++) {\n                if (dist[i] == j) {\n                    #pragma omp for\n                    for (int k = 0; k < N; k++) {\n                        if (A[i][k] == 1 && dist[k] > j + 1) {\n                            dist[k] = j + 1;\n                        }\n                    }\n                }\n            }\n\n            #pragma omp barrier\n        }\n    }\n\n    // Gather distances on rank 0\n    std::vector<int> global_dist(N);\n    MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n    // Return the shortest path length\n    if (rank == 0) {\n        return global_dist[dest];\n    } else {\n        return -1;\n    }\n}\n", +        "\n\n    int rank, nprocs;\n    MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n    // Initialize distances to infinity\n    std::vector<int> dist(N, INT_MAX);\n    dist[source] = 0;\n\n    // Perform relaxation in parallel\n    #pragma omp parallel for\n    for (int i = 0; i < N; i++) {\n        for (int j = 0; j < N; j++) {\n            if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n                dist[j] = dist[i] + 1;\n            }\n        }\n    }\n\n    // Gather distances to rank 0\n    std::vector<int> dist_gathered(N * nprocs);\n    MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n    // Find the shortest path length on rank 0\n    int shortest_path_length = INT_MAX;\n    if (rank == 0) {\n        for (int i = 0; i < N * nprocs; i += N) {\n            shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n        }\n    }\n\n    // Broadcast the shortest path length to all ranks\n    MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n    return shortest_path_length;\n}\n" +    ] +  }, +  { +    "problem_type": "reduce", +    "language": "cpp", +    "name": "28_reduce_smallest_odd_number", +    "parallelism_model": "kokkos", +    "prompt": "#include <Kokkos_Core.hpp>\n\n/* Return the value of the smallest odd number in the vector x.\n   Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n   Examples:\n\n   input: [7, 9, 5, 2, 8, 16, 4, 1]\n   output: 1\n\n   input: [8, 36, 7, 2, 11]\n   output: 7\n*/\nint smallestOdd(Kokkos::View<const int*> const& x) {", +    "temperature": 0.2, +    "top_p": 0.95, +    "do_sample": true, +    "max_new_tokens": 1024, +    "outputs": [ +      "\n\n  using namespace Kokkos;\n  using policy_t = Kokkos::RangePolicy;\n  int min_odd = INT_MAX;\n  Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n    if (x(i) % 2 == 1 && x(i) < min_odd) {\n      min_odd = x(i);\n    }\n  });\n  return min_odd;\n}\n", +      "\n\n    int min_odd = INT_MAX;\n\n    Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n        KOKKOS_LAMBDA(const int& i, int& min_odd) {\n            if (x(i) % 2 == 1 && x(i) < min_odd) {\n                min_odd = x(i);\n            }\n        });\n\n    return min_odd;\n}\n", +      "&", +      "\n\n    int min_odd = std::numeric_limits<int>::max();\n    Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n        KOKKOS_LAMBDA(const int& i, int& min_odd) {\n            if (x(i) % 2 == 1 && x(i) < min_odd) {\n                min_odd = x(i);\n            }\n        });\n    return min_odd;\n}\n", +      "\n\n  using namespace Kokkos;\n  using policy_type = Kokkos::RangePolicy;\n  int min_odd = std::numeric_limits<int>::max();\n  Kokkos::parallel_reduce(\n      policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n        if (x(i) % 2 == 1 && x(i) < min_odd) {\n          min_odd = x(i);\n        }\n      },\n      min_odd);\n  return min_odd;\n}\n", +      "\n\n  int result = std::numeric_limits<int>::max();\n  Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& result) {\n    if (x(i) % 2 == 1 && x(i) < result) {\n      result = x(i);\n    }\n  });\n  return result;\n}\n", +      "\n\n  using namespace Kokkos;\n  using policy_t = Kokkos::RangePolicy;\n  int min_odd = std::numeric_limits<int>::max();\n  Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n    if (x(i) % 2 == 1 && x(i) < min_odd) {\n      min_odd = x(i);\n    }\n  });\n  return min_odd;\n}\n", +      "\n\n  int min_odd = std::numeric_limits<int>::max();\n  Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n    if (x(i) % 2 == 1 && x(i) < min_odd) {\n      min_odd = x(i);\n    }\n  });\n  return min_odd;\n}\n", +      "\n\n  int min_odd = std::numeric_limits<int>::max();\n  Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n    if (x(i) % 2 == 1 && x(i) < min_odd) {\n      min_odd = x(i);\n    }\n  });\n  return min_odd;\n}\n", +      "\n\n  Kokkos::View y(\"y\", x.extent(0));\n  Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n    y(i) = (x(i) % 2 == 1) ? 
x(i) : std::numeric_limits::max();\n });\n int min = std::numeric_limits::max();\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, int& min) {\n min = std::min(min, y(i));\n });\n return min;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int n = x.extent(0);\n View> y(\"y\", n);\n\n parallel_for(\"smallestOdd\", n, KOKKOS_LAMBDA(const int i) {\n if (i % 2 == 1) {\n y(i) = x(i);\n } else {\n y(i) = std::numeric_limits::max();\n }\n });\n\n int min_odd = std::numeric_limits::max();\n for (int i = 0; i < n; ++i) {\n min_odd = std::min(min_odd, y(i));\n }\n\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n\n int min_odd = INT_MAX;\n\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n\n return min_odd;\n}\n", + "\n\n int min_odd = Kokkos::parallel_reduce(Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1) {\n min_odd = std::min(min_odd, x(i));\n }\n }, min_odd);\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto min_odd = INT_MAX;\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int smallest = Kokkos::Max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& smallest) {\n if (x(i) % 2 == 1 && x(i) < smallest) {\n smallest = x(i);\n }\n });\n return smallest;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Policy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) { min_odd = x(i); }\n }, min_odd);\n return min_odd;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Policy = Kokkos::RangePolicy;\n\n int smallest = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(Policy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& smallest) {\n if (x(i) % 2 == 1 && x(i) < smallest) {\n smallest = x(i);\n }\n });\n\n return smallest;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n int min_odd = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(\n policy_t(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n },\n min_odd);\n\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + 
"do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min:min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n min_odd = std::min(min_odd, x[i]);\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n#pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int minOdd = INT_MAX;\n #pragma omp parallel for reduction(min:minOdd)\n for (int i = 0; i < static_cast(x.size()); ++i) {\n if (x[i] % 2 != 0 && x[i] < minOdd) {\n minOdd = x[i];\n }\n }\n return minOdd;\n}\n", + ": 1", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n min_odd = std::min(min_odd, x[i]);\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n#pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel for reduction(min:smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = 
INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min:min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "serial", + "prompt": "/* Return the value of the smallest odd number in the vector x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 != 0) {\n return x[i];\n }\n }\n return -1;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest_odd) {\n smallest_odd = x[i];\n }\n }\n return smallest_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest 
= INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "smallest&", + "\n\n int smallest = -1;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && (smallest == -1 || x[i] < smallest)) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "hip", + "prompt": "/* Find the value of the smallest odd number in the vector x. Store it in smallest.\n Use AMD HIP to compute in parallel. The kernel is launched with the same number of threads as elements in x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\n__global__ void smallestOdd(const int *x, size_t N, int *smallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < 
smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "(", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n __syncthreads();\n if (smallest_local % 2 == 0) {\n return;\n }\n if (smallest_local < *smallest) {\n atomicMin(smallest, smallest_local);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n\n // Find the smallest odd number in the local workgroup\n __shared__ int smallest_shared;\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n // Reduce the local smallest values to a single global smallest value\n __syncthreads();\n if (tid == 0) {\n smallest_shared = smallest_local;\n for (int i = 1; i < hipBlockDim_x; i++) {\n if (smallest_shared > smallest_local) {\n smallest_shared = smallest_local;\n }\n }\n *smallest = smallest_shared;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < 
smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = std::numeric_limits::max();\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < my_smallest) {\n my_smallest = x[i];\n }\n }\n\n int smallest;\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n for (int 
i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (auto v : x) {\n if (v % 2 == 1) {\n my_smallest = std::min(my_smallest, v);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int min_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int n = x.size();\n int smallest = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "cuda", + "prompt": "/* Find the value of the smallest odd number in the vector x. Store it in smallest.\n Use CUDA to compute in parallel. 
The kernel is launched with the same number of threads as elements in x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\n__global__ void smallestOdd(const int *x, size_t N, int *smallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (threadIdx.x == 0) {\n *smallest = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "(", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (threadIdx.x == 0) {\n *smallest = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n atomicMin(smallest, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = start + block_size;\n int min = INT_MAX;\n for (int i = start; i < end && i < N; i++) {\n if (x[i] % 2 == 1 && x[i] < min) {\n min = x[i];\n }\n }\n smallest[tid] = min;\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < block_size; i++) {\n if (smallest[i] < smallest[0]) {\n smallest[0] = smallest[i];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] < smallest_local && x[i] % 2) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n if (tid == 0 || x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = block.thread.x;\n if (tid >= N) {\n return;\n }\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 
2 == 1) {\n atomicMin(smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1 && (x[tid] < *smallest || *smallest == -1)) {\n *smallest = x[tid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n #pragma omp parallel for reduction(min: my_smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n\n int smallest;\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on this rank\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int result = INT_MAX;\n #pragma omp parallel for reduction(min:result)\n 
for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < result) {\n result = x[i];\n }\n }\n return result;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel\n {\n int local_smallest = INT_MAX;\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_smallest) {\n local_smallest = x[i];\n }\n }\n #pragma omp critical\n if (local_smallest < smallest) {\n smallest = local_smallest;\n }\n }\n return smallest;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on this rank\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Reduce to find the global minimum\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n // Find the smallest odd number on each rank\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int smallest_odd = INT_MAX;\n #pragma omp parallel for reduction(min: smallest_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest_odd) {\n smallest_odd = x[i];\n }\n }\n return smallest_odd;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on each rank\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n #pragma omp parallel for reduction(min:my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = std::numeric_limits::max();\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 != 0) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // 
Find the smallest odd number on this rank\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min : local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n #pragma omp parallel for reduction(min: my_smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n int my_smallest = INT_MAX;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n #pragma omp parallel for reduction(min:my_smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return smallest;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use Kokkos to compute product in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n double product = 1.0;\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (i % 2 == 0) {\n product *= x(i);\n } else {\n product /= x(i);\n }\n });\n return product;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n\n double product = 1.0;\n\n parallel_reduce(\n RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n\n return product;\n}\n", + "\n\n // Create a Kokkos view for the output\n Kokkos::View product = Kokkos::View(1);\n\n // Initialize the product to 1\n Kokkos::parallel_for(\"initialize_product\", Kokkos::RangePolicy(0, 1), KOKKOS_LAMBDA(const int& i) {\n product[i] = 1.0;\n });\n\n // Compute the product in parallel\n Kokkos::parallel_for(\"compute_product\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n if (i % 2 == 1) {\n product[0] *= 1.0 / x[i];\n } else {\n product[0] *= x[i];\n }\n });\n\n // Return the product\n return product[0];\n}\n", + "\n\n using namespace Kokkos;\n\n double product = 1.0;\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double product = 1.0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& result) {\n if (i % 2 == 1) {\n result *= 1.0 / x[i];\n } else {\n result *= x[i];\n }\n });\n\n return result;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, 
double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n },\n product);\n return product;\n}\n", + "\n\n double result = 1.0;\n for (int i = 0; i < x.extent(0); i += 2) {\n result *= x(i);\n }\n for (int i = 1; i < x.extent(0); i += 2) {\n result /= x(i);\n }\n return result;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x(i);\n } else {\n product *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n return product;\n}\n", + "_", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n },\n product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x(i);\n } else {\n product *= x(i);\n }\n }, product);\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. 
x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use OpenMP to compute product in parallel.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n int n = x.size();\n #pragma omp parallel for reduction(*: product)\n for (int i = 0; i < n; i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*: product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n int n = x.size();\n #pragma omp parallel for reduction(*:product)\n for (int i = 0; i < n; i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1.0 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for 
reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i += 2) {\n product *= x[i] / x[i + 1];\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "serial", + "prompt": "/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); 
i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i += 2) {\n product = product * x[i] / (x[i + 1] + 0.000000001);\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n for (size_t i = 1; i < x.size(); i += 2) {\n product /= x[i];\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "&", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "hip", + "prompt": "/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Store the result in product.\n Use AMD HIP to compute product in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\n__global__ void productWithInverses(const double *x, size_t N, double *product) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 1) {\n product[0] *= 1.0 / x[tid];\n } else {\n product[0] *= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (idx % 2 == 0) {\n product[0] *= x[idx];\n } else {\n product[0] /= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n atomicAdd(product, x[gid]);\n } else {\n atomicAdd(product, 1.0 / x[gid]);\n }\n }\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n product[tid] = x[tid];\n } else {\n product[tid] = 1 / x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 0) {\n product[0] *= x[tid];\n } else {\n product[0] /= x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double prod = 1.0;\n if (gid % 2 == 0) {\n prod *= x[gid];\n } else {\n prod /= x[gid];\n }\n\n hipAtomicAdd(product, prod);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n 
product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 1) {\n product[0] *= 1.0 / x[tid];\n } else {\n product[0] *= x[tid];\n }\n }\n}\n", + ".", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 0) {\n atomicAdd(product, x[tid]);\n } else {\n atomicAdd(product, 1.0 / x[tid]);\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use MPI to compute product in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the product on all ranks.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a new vector to store the product\n std::vector product(n);\n\n // Initialize the product to 1\n product[0] = 1;\n\n // Compute the product in parallel\n for (int i = 1; i < n; i++) {\n if (i % 2 == 1) {\n product[i] = product[i - 1] * (1 / x[i]);\n } else {\n product[i] = product[i - 1] * x[i];\n }\n }\n\n // Return the product\n return product[n - 1];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD,\n MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, 
&global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Reduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 0, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "cuda", + "prompt": "/* Compute the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Store the result in product.\n Use CUDA to compute product in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\n__global__ void productWithInverses(const double *x, size_t N, double *product) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n *product *= x[gid];\n } else {\n *product /= x[gid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n product[0] *= x[gid];\n } else {\n product[0] /= x[gid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n product[0] *= x[gid];\n } else {\n product[0] /= x[gid];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n product[idx] = 1.0;\n for (int i = 0; i < N; i += 2) {\n product[idx] *= x[i] / x[i + 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n product[0] *= x[idx];\n } else {\n product[0] /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n if (i % 2 == 1) {\n product[0] *= 1.0 / x[i];\n } else {\n product[0] *= x[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int 
tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n *product *= x[gid];\n } else {\n *product /= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use MPI and OpenMP to compute product in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the product on all ranks.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*: local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n#pragma omp parallel for reduction(* : local_product)\n for (int i = start_index; i < end_index; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Reduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 0, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 
MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*: local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size();\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < local_size; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n #pragma omp parallel for reduction(* : local_product)\n for (int i = start_index; i < end_index; i++) {\n if ((i + 1) % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n double product = 1.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the product of the even indexed elements\n #pragma omp parallel for reduction(*: product)\n for (int i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n\n // Compute the product of the odd indexed elements\n double oddProduct = 1.0;\n #pragma omp parallel for reduction(*: oddProduct)\n for (int i = 1; i < x.size(); i += 2) {\n oddProduct *= 1.0 / x[i];\n }\n\n // Reduce the odd product across all ranks\n double globalOddProduct;\n MPI_Allreduce(&oddProduct, &globalOddProduct, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n // Multiply the even and odd products to get the final result\n product *= globalOddProduct;\n\n return product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for reduction(*:local_product)\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size();\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < local_size; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n double product = 1.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the product of the even indexed elements\n #pragma omp parallel for 
reduction(*:product)\n for (int i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n\n // Compute the product of the odd indexed elements\n double oddProduct = 1.0;\n #pragma omp parallel for reduction(*:oddProduct)\n for (int i = 1; i < x.size(); i += 2) {\n oddProduct *= 1.0 / x[i];\n }\n\n // Reduce the odd product across all ranks\n double globalOddProduct;\n MPI_Allreduce(&oddProduct, &globalOddProduct, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n // Multiply the even and odd products\n product *= globalOddProduct;\n\n return product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for reduction(*:local_product)\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use Kokkos to sum in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(Kokkos::View const& x, Kokkos::View const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Create a Kokkos view to store the minimum values at each index\n Kokkos::View min_values(\"min_values\", x.extent(0));\n\n // Compute the minimum values at each index in parallel\n Kokkos::parallel_for(\"min_values_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n min_values[i] = std::min(x[i], y[i]);\n });\n\n // Reduce the minimum values in parallel to get the sum\n double sum = Kokkos::parallel_reduce(\"sum_min_values_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += min_values[i];\n }, 0.0);\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Min;\n\n auto min_values = Kokkos::parallel_reduce(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, Reducer& red) {\n red.init(std::min(x(i), y(i)));\n });\n\n return min_values.get_value();\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMemberFunction = Kokkos::TeamMemberFunction;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::RangePolicy>(0, x.extent(0))),\n TeamMemberFunction([&](const int& i) -> double { return std::min(x(i), y(i)); }),\n 0.0);\n return sum;\n}\n", + "\n\n using execution_space = Kokkos::DefaultExecutionSpace;\n using l_double = Kokkos::LayoutRight;\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n\n Kokkos::View min_values(\"min_values\", x.extent(0));\n\n Kokkos::parallel_scan(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update = std::min(x(i), y(i));\n }, min_values);\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), 
KOKKOS_LAMBDA(const int& i, double& update) {\n update += min_values(i);\n }, sum);\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_left;\n using Reducer = Kokkos::Min;\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, Reducer& sum) {\n sum += std::min(x(i), y(i));\n });\n\n return sum.template get<0>();\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_left;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = typename TeamPolicy::member_type;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::DefaultExecutionSpace(), x.extent(0)), KOKKOS_LAMBDA(const TeamMember& team, double& sum) {\n const auto i = team.league_rank();\n sum += std::min(x(i), y(i));\n });\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = Kokkos::TeamMember;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::RangePolicy>(0, x.extent(0))),\n KOKKOS_LAMBDA(TeamMember, sum, x, y) {\n sum += std::min(x(TeamMember::team_rank()), y(TeamMember::team_rank()));\n });\n return sum;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n const auto min_xy = parallel_reduce(\n RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& sum) {\n sum += std::min(x(i), y(i));\n },\n 0.0);\n\n return min_xy;\n}\n", + "\n\n Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::make_span(x.data(), x.size()), 0.0,\n [](double a, double b) { return std::min(a, b); },\n [](double a, double b) { return a + b; });\n}\n", + "\n\n using namespace Kokkos;\n double sum = 0;\n auto min_lambda = KOKKOS_LAMBDA(const double& a, const double& b) { return std::min(a, b); };\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += min_lambda(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n // Create a Kokkos view of the sum\n Kokkos::View sum(\"sum\", x.extent(0));\n\n // Parallel loop to compute the sum of the minimum values\n Kokkos::parallel_for(\"sum_of_minimum_elements\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n sum(i) = std::min(x(i), y(i));\n });\n\n // Reduce the sum in parallel\n double sum_value = Kokkos::parallel_reduce(\n \"sum_of_minimum_elements_reduce\", Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& sum) { sum += sum(i); }, 0.0);\n\n return sum_value;\n}\n", + "Kokkoss", + "\n\n const int numElements = x.extent(0);\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::Range(0, numElements), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n auto min_xy = Kokkos::min(x, y);\n auto sum = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, min_xy.extent(0)),\n KOKKOS_LAMBDA(const int& i) -> double { return min_xy(i); },\n (double)0);\n return sum;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_lambda = 
KOKKOS_LAMBDA(const double& x, const double& y) { return std::min(x, y); };\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += min_lambda(x(i), y(i));\n }, sum);\n\n return sum;\n}\n", + "\n\n const int N = x.extent(0);\n double sum = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n auto min_values = min(x, y);\n double sum = 0;\n Kokkos::parallel_reduce(policy_type(0, min_values.extent(0)), KOKKOS_LAMBDA(int i, double& sum) {\n sum += min_values(i);\n }, sum);\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use OpenMP to sum in parallel.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n double sum = 0;\n\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + ".", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 
0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma openmp parallel for reduction (+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "serial", + "prompt": "/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i 
= 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "hip", + "prompt": "/* Compute the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Store the result in sum.\n Use AMD HIP to sum in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\n__global__ void sumOfMinimumElements(const double *x, const double *y, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipGetGlobalIndex();\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n sum[tid] = min(x[gid], y[gid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n sum[gid] = min(x[gid], y[gid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n sum[tid] = min(x[tid], y[tid]);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n 
}\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use MPI to sum in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Return the sum on all ranks.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double sum = 0;\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < 
x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "cuda", + "prompt": "/* Compute the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Store the result in sum.\n Use CUDA to sum in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\n__global__ void sumOfMinimumElements(const double *x, const double *y, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ", Synagogues_", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[0] += fmin(x[tid], y[tid]);\n }\n}\n", + "{", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n sum[tid] = fmin(x[tid], y[tid]);\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n sum[tid] = fmin(x[i], y[i]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n 
i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use MPI and OpenMP to sum in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Return the sum on all ranks.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+ : local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n double sum = 0.0;\n int n = x.size();\n #pragma opm parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n double sum = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int chunk = n / nprocs;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == nprocs - 1) {\n end = n;\n }\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+ : local_sum)\n for (int i = start; i < end; i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma 
omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n int n = x.size();\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < n; i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n double sum = 0.0;\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int num_elements = x.size();\n int elements_per_rank = num_elements / num_ranks;\n int start_index = my_rank * elements_per_rank;\n int end_index = start_index + elements_per_rank;\n\n for (int i = start_index; i < end_index; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n double sum = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double sum;\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 
MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::sum;\n\n const int N = x.extent(0);\n\n double sum_x = 0.0;\n parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) { sum += x(i); }, sum_x);\n\n return sum_x / N;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n double sum = 0;\n parallel_reduce(RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); }, sum);\n\n double count = x.extent(0);\n return sum / count;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n using namespace Kokkos;\n\n double sum = 0;\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n auto sum = Kokkos::parallel_reduce(Policy(0, x.extent(0)), Reducer(0.0),\n KOKKOS_LAMBDA(const int& i, Reducer& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n using namespace Kokkos;\n double sum = 0;\n auto policy = Kokkos::RangePolicy<>(0, x.size());\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.size();\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n using l_double = Kokkos::LDDouble;\n\n auto sum = Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, l_double& sum) {\n sum += x(i);\n },\n l_double(0.0));\n\n return static_cast(sum) / x.extent(0);\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n policy_type policy(x.extent(0));\n\n double sum = 0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n return sum / 
x.extent(0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n const auto sum = Kokkos::parallel_reduce(Policy(0, x.size()), Reducer(0.0),\n KOKKOS_LAMBDA(const int& i, const Reducer& sum) {\n return sum + x(i);\n });\n\n return sum / x.size();\n}\n", + "\n\n typedef Kokkos::RangePolicy policy_type;\n policy_type policy(x.extent(0));\n\n double sum = 0.0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n return Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::Kok\u30b3\u30af_EXEC_SPACE, x, 0.0, Kok\u30b3\u30af::plus(), Kok\u30b3\u30af::divide(x.size()));\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n auto sum = parallel_reduce(\n RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); }, 0.0);\n\n return sum / x.size();\n}\n", + "\n\n using execution_space = typename Kokkos::View::execution_space;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto sum = Kokkos::parallel_reduce(\n policy_type(x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n },\n (double)0.0, tag_type());\n\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.size();\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n double sum = 0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(int i, double& sum) { sum += x(i); }, sum);\n return sum / x.size();\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using reduction_type = Kokkos::Sum;\n\n auto sum = Kokkos::parallel_reduce(policy_type(0, x.size()), reduction_type(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); });\n\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use OpenMP to compute in parallel.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double 
sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "serial", + "prompt": "/* Return the average of the vector x.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (double n : x) {\n sum += n;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n 
double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double num : x) {\n sum += num;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "hip", + "prompt": "/* Compute the average of the vector x. Store the result in average.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\n__global__ void average(const double *x, size_t N, double *average) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (size_t i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double shared_sum;\n shared_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = shared_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (int i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double block_sum;\n block_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = block_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n atomicAdd(average, x[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n double sum = 0;\n size_t idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum += x[idx];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (int i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double sharedSum;\n sharedSum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = sharedSum / N;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n average[0] += x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = block.x;\n if (tid == 0) {\n *x = 0;\n for (int i = 0; i < N; i++) {\n *x += x[i];\n }\n *x = *x / N;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n double sum = 0;\n for (int i = tid; i < N; i += blockDim.x) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n" + ] + }, + { + "problem_type": 
"reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the average on all ranks.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg = sum / x.size();\n\n double global_avg;\n MPI_Allreduce(&avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n\n return global_avg;\n}\n", + "\n\n double sum = 0.0;\n for (double xi : x) {\n sum += xi;\n }\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0.0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / size;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (auto xi : x) {\n sum += xi;\n }\n double avg = sum / x.size();\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (auto v : x) {\n sum += v;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return 
avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg = sum / x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / x.size();\n double global_avg;\n MPI_Allreduce(&avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n return global_avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double local_sum = 0;\n for (double xi : x) {\n local_sum += xi;\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "cuda", + "prompt": "/* Compute the average of the vector x. Store the result in average.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Examples:\n \n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\n__global__ void average(const double *x, size_t N, double *average) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n average[0] += x[idx];\n }\n}\n", + "\n\n double sum = 0;\n for (size_t i = block_idx.x * block_dim.x + thread_idx.x; i < N; i += block_dim.x * grid_dim.x) {\n sum += x[i];\n }\n * average = sum / N;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n __shared__ double sum;\n sum = 0;\n if (i < N) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + 
threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n\n double sum = 0;\n for (int i = tid; i < N; i += bid_size * grid_size) {\n sum += x[i];\n }\n\n __shared__ double shared_sum;\n shared_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = shared_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n double sum = 0;\n for (int i = tid + bid * bid_size; i < N; i += grid_size * bid_size) {\n sum += x[i];\n }\n double avg = sum / N;\n *average = avg;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n __shared__ double sum;\n if (i < N) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n double sum = 0;\n if (tid < N) {\n sum = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n double sum_all = 0;\n for (int i = 0; i < gridDim.x; i++) {\n sum_all += sum;\n }\n *average = sum_all / N;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the average of the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the average on all ranks.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / size;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_avg = 0.0;\n #pragma omp parallel for reduction(+:local_avg)\n for (int i = 0; i < x.size(); ++i) {\n local_avg += x[i];\n }\n local_avg /= x.size();\n\n double global_avg;\n MPI_Allreduce(&local_avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n\n return global_avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a sum variable to store the sum of the elements in the vector\n double sum = 0.0;\n\n // Use OpenMP to parallelize the loop over the elements in the vector\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n // Add the current element to the sum\n sum += x[i];\n }\n\n // Get the average by dividing the sum by the number of elements\n double avg = sum / n;\n\n // Return the average\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n double local_avg = 0;\n int local_size = x.size();\n double sum = 0;\n\n#pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < local_size; i++) {\n sum += x[i];\n }\n\n local_avg = sum / local_size;\n\n double global_avg = 0;\n MPI_Allreduce(&local_avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= MPI_Comm_size(MPI_COMM_WORLD);\n\n return global_avg;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n", + "\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / x.size();\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n double global_sum = 0;\n\n #pragma omp parallel for reduction(+: local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n return sum / n;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use Kokkos to reduce in 
parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n auto xor_reduce = reduce(x, (bool)false, [](bool a, bool b) { return a ^ b; });\n return xor_reduce;\n}\n", + "\n\n // Create a Kokkos::TeamPolicy for the reduction.\n Kokkos::TeamPolicy policy(x.extent(0));\n\n // Define the reduction functor.\n struct XORFunctor {\n bool result = false;\n\n KOKKOS_INLINE_FUNCTION\n void operator()(const bool& x) const { result ^= x; }\n };\n\n // Create a Kokkos::ParallelReduce object to perform the reduction.\n Kokkos::ParallelReduce reduce(result);\n\n // Invoke the reduction.\n Kokkos::parallel_for(policy, reduce);\n\n // Return the result.\n return reduce.result;\n}\n", + "\n\n bool result = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& result) { result ^= x(i); }, result);\n return result;\n}\n", + "\n\n using namespace Kokkos;\n\n bool result = false;\n auto policy = Kokkos::RangePolicy(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n }, result);\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy>;\n using TeamMemberType = typename Policy::member_type;\n\n bool result = false;\n\n Kokkos::parallel_reduce(Policy(1), KOKKOS_LAMBDA(const TeamMemberType& team) {\n bool local_result = false;\n const bool* x_data = x.data();\n for (int i = team.league_rank(); i < x.size(); i += team.league_size()) {\n local_result ^= x_data[i];\n }\n team.team_reduce(KOKKOS_LAMBDA(const bool& a, const bool& b) { return a ^ b; }, local_result);\n }, result);\n\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x[i];\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x[i];\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy;\n using TeamMemberType = typename Policy::member_type;\n\n auto reduce = KOKKOS_LAMBDA(TeamMemberType team) {\n bool local_result = false;\n for (int i = team.league_rank(); i < x.extent(0); i += team.league_size()) {\n local_result ^= x(i);\n }\n return local_result;\n };\n\n bool global_result = Kokkos::parallel_reduce(Policy(x.extent(0)), reduce);\n return global_result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(x), Kokkos::Scan);\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i) {\n result ^= x(i);\n });\n return result;\n}\n", 
+ "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(false), x);\n}\n", + "\n\n using namespace Kokkos;\n bool result = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& result) {\n if (x(i)) {\n result = !result;\n }\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy;\n using Reducer = Kokkos::LogicalXor;\n\n return parallel_reduce(Policy(x.extent(0)), Reducer(false), x);\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(false),\n KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n}\n", + "\n\n using namespace Kokkos;\n\n auto reduce_functor = KOKKOS_LAMBDA(const bool& a, const bool& b) {\n return a ^ b;\n };\n\n auto reduce_result = reduce(x, reduce_functor, false);\n\n return reduce_result;\n}\n", + "\n\n bool result = x(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(1, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x(i);\n });\n return result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use OpenMP to reduce in parallel.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n {\n for (auto const& xi : x) {\n result ^= xi;\n }\n }\n return result;\n}\n", + ")", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(+:result)\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel 
reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel for reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "serial", + "prompt": "/* Return the logical XOR reduction of the vector of bools x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "(", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + ")", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return 
result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "hip", + "prompt": "/* Compute the logical XOR reduction of the vector of bools x. Store the result in output.\n Use AMD HIP to reduce in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\n__global__ void reduceLogicalXOR(const bool *x, size_t N, bool *output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n bool local_result = x[gid];\n __syncthreads();\n while (blockDim.x <= N) {\n if (tid == 0) {\n local_result ^= x[gid + blockDim.x];\n }\n __syncthreads();\n blockDim.x *= 2;\n __syncthreads();\n }\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int size = blockDim.x;\n bool local_result = x[bid * size + tid];\n __syncthreads();\n for (int i = size / 2; i > 0; i /= 2) {\n if (tid < i) {\n local_result ^= x[bid * size + tid + i];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int warp_size = warpSize;\n int num_warps = bid_size / warp_size;\n int warp_id = tid / warp_size;\n int lane_id = tid % warp_size;\n bool warp_result = x[bid * bid_size + tid];\n for (int i = 1; i < warp_size; i++) {\n warp_result ^= x[bid * bid_size + tid + i];\n }\n __shared__ bool shared_result[num_warps];\n shared_result[warp_id] = warp_result;\n __syncthreads();\n if (warp_id == 0) {\n bool result = shared_result[0];\n for (int i = 1; i < num_warps; i++) {\n result ^= shared_result[i];\n }\n *output = result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n bool local_result = x[tid];\n __syncthreads();\n while (blockDim.x <= N) {\n if (tid < N) { local_result ^= x[tid + blockDim.x]; }\n __syncthreads();\n blockDim.x *= 2;\n __syncthreads();\n }\n if (tid == 0) { *output = local_result; }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n bool local_result = x[gid];\n\n __syncthreads();\n\n while (bid > 0) {\n if (tid == 0) {\n local_result ^= x[gid + hipBlockDim_x];\n }\n bid >>= 1;\n gid >>= 1;\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n bool local_result = x[gid];\n\n while (gid < N) {\n local_result ^= x[gid];\n gid += hipBlockDim_x;\n }\n\n output[tid] = local_result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid 
< N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n bool result = false;\n while (gid < N) {\n result ^= x[gid];\n gid += hipBlockDim_x * hipGridDim_x;\n }\n output[tid] = result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n output[0] ^= output[i];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use MPI to reduce in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = false;\n for (bool b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int size, 
rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool result = x[rank];\n MPI_Allreduce(&result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n MPI_Allxor(&result, &result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = x[0];\n for (size_t i = 1; i < x.size(); i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "cuda", + "prompt": "/* Compute the logical XOR reduction of the vector of bools x. Store the result in output.\n Use CUDA to reduce in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\n__global__ void reduceLogicalXOR(const bool *x, size_t N, bool *output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Each thread reduces a single element of the output.\n int tid = threadIdx.x;\n output[tid] = x[tid];\n __syncthreads();\n\n // Reduce the output in place.\n while (tid < N) {\n output[tid] ^= output[tid + blockDim.x];\n tid += blockDim.x * 2;\n __syncthreads();\n }\n}\n", + "\n\n // Each thread reduces a single element of the output\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the output in place\n for (int stride = 1; stride < N; stride *= 2) {\n for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < N; tid += 2 * stride) {\n output[tid] ^= output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n // Each thread reduces a subset of the input.\n int tid = threadIdx.x;\n int stride = blockDim.x;\n bool local_result = false;\n for (int i = tid; i < N; i += stride) {\n local_result ^= x[i];\n }\n\n // Reduce the local results using a shared memory reduction.\n __shared__ bool shared_result[blockDim.x];\n shared_result[tid] = local_result;\n __syncthreads();\n\n // The first thread writes the final result to output.\n if (tid == 0) {\n bool final_result = false;\n for (int i = 0; i < blockDim.x; ++i) {\n final_result ^= shared_result[i];\n }\n *output = final_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n // Each thread reduces a single element of the output\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the output in place\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid] ^= output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncbarriers();\n while (N > 1) {\n if (tid < N / 2) {\n output[tid] ^= output[tid + N / 2];\n }\n __syncbarriers();\n N /= 2;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n bool result = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n result ^= x[i];\n }\n output[bid] = result;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n bool result = x[bid * stride + tid];\n __syncthreads();\n while (stride > 1) {\n if (tid < stride) {\n result ^= x[bid * stride + tid + stride];\n }\n stride /= 2;\n __syncthreads();\n }\n if (tid == 0) {\n *output = result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int num_elements = N;\n int num_blocks = (num_elements + bid_size - 1) / bid_size;\n\n bool local_result = x[tid];\n __syncthreads();\n while (bid_size > 1) {\n if (tid < bid_size / 2) {\n local_result ^= x[tid + bid_size / 2];\n }\n bid_size /= 2;\n __syncthreads();\n }\n if (bid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n 
__syncthreads();\n}\n", + "\n\n // Each thread reduces a single element of the vector\n int tid = threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the vector in parallel\n while (N > 1) {\n if (tid < N / 2) {\n output[tid] ^= output[tid + N / 2];\n }\n N /= 2;\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = blockDim.x; i < N; i += blockDim.x) {\n output[0] ^= output[i];\n }\n }\n}\n", + "\n\n // Each thread reduces a single element of the output.\n int idx = threadIdx.x;\n output[idx] = x[idx];\n\n // Each thread reduces a pair of elements of the output.\n while (idx < N) {\n output[idx] ^= x[idx + 1];\n idx += 2;\n }\n\n // Reduce the remaining elements of the output.\n for (int i = 1; i < N; i *= 2) {\n for (int j = 0; j < N; j += 2 * i) {\n output[j] ^= output[j + i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n bool local_result = x[tid];\n\n for (int i = tid + stride; i < N; i += stride) {\n local_result ^= x[i];\n }\n\n output[bid] = local_result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int num_blocks = gridDim.x;\n\n bool local_result = x[tid];\n\n __syncthreads();\n\n while (bid_size > 1) {\n if (tid < bid_size / 2) {\n local_result = local_result ^ x[tid + bid_size / 2];\n }\n bid_size /= 2;\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n bool result = x[bid * stride + tid];\n\n __syncthreads();\n\n for (int offset = stride; offset < N; offset *= 2) {\n if (tid < offset) {\n result ^= x[bid * stride + tid + offset];\n }\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x * gridDim.x;\n\n bool local_result = x[tid + bid * stride];\n\n for (int i = 2 * stride; i <= N; i *= 2) {\n if (tid < N / i) {\n local_result ^= x[tid + bid * i];\n }\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use MPI and OpenMP to reduce in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on all ranks.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(^: local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int nranks;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n\n bool local_result = false;\n for (auto b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Reduce on each rank\n bool local_result = false;\n #pragma omp parallel for reduction(^:local_result)\n for (size_t i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n // Reduce across all ranks\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n\n int nranks;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n\n bool local_result = false;\n for (bool b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR,\n MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i 
< x.size(); ++i) {\n local_result ^= x[i];\n }\n }\n\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int nranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(+:local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n #pragma omp parallel for reduction(^:result)\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool local_result = x[my_rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + ".", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool local_result = x[my_rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n\n bool result;\n MPI_Reduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, 0, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n bool result = x[0];\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (int i = 1; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = false;\n #pragma omp parallel for reduction(^:local_result)\n for (size_t i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n // Reduce the vector of bools using MPI_Reduce.\n bool result;\n MPI_Allreduce(MPI_IN_PLACE, &x[0], x.size(), MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n // Return the result.\n return result;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the array x into output.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(Kokkos::View const& x, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA (const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int i) {\n output(i) = 0;\n for (int j = i; j >= 0; j--) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n output(i) = 0;\n for (int j = i; j < n; ++j) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::View temp = output.slice(0, output.size() - 1);\n Kok\u30b3\u30af::View temp2 = output.slice(1, output.size());\n Kok\u30b3\u30af::View x_rev = x.reversed();\n Kok\u30b3\u30af::View temp3 = x_rev.slice(1, x_rev.size());\n Kok\u30b3\u30af::View temp4 = x_rev.slice(0, x_rev.size() - 1);\n Kok\u30b3\u30af::View temp5 = temp4 + temp3;\n temp = temp5;\n temp2 = temp5 + temp;\n\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n output(i) = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, i),\n KOKKOS_LAMBDA(const int& j, const int& val) { return val + x(i - j - 1); },\n 0);\n });\n}\n", + "(", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(\"reversePrefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i, int& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = x(i) + output(i - 1);\n }\n },\n output(n - 1));\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i) { output(i) = output(n - 1) - output(i); });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i, int& sum) {\n sum = (i == 0) ? 
x(i) : sum + x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n // Compute the reverse prefix sum\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n Kokkos::ScanSum(Kokkos::Reverse));\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n output(N - 1));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n output(i) = x(x.extent(0) - 1 - i);\n if (i > 0) output(i) += output(i - 1);\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA (const int i) {\n output(i) = 0;\n for (int j = n - 1; j >= i; j--) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n output(i) = (i == 0) ? x(i) : x(i) + output(i - 1);\n });\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Tag = Kokkos::TagType;\n\n Kokkos::parallel_for(Policy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n output(i) = x(x.extent(0) - 1 - i);\n if (i > 0) output(i) += output(i - 1);\n }, Tag());\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n 0);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use OpenMP to compute in parallel.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n 
for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n output[i] = x[i];\n } else {\n output[i] = output[i + 1] + x[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n output[i] = 0;\n for (int j = n - 1; j >= i; --j) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n output[i] = 0;\n for (int j = i; j < n; j++) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "serial", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 
1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output.back() = x.back();\n for (int i = x.size() - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n for (int i = x.size() - 1; i >= 0; --i) {\n if (i + 1 < x.size()) {\n output[i] = x[i] + output[i + 1];\n } else {\n output[i] = x[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output.back() = x.back();\n for (int i = x.size() - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\n__global__ void reversePrefixSum(const int *x, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n if (tid > 0) {\n output[tid] += output[tid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n output[gid] = x[N - 1 - gid];\n if (gid > 0) { output[gid] += output[gid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[N - 1 - gid] = x[gid];\n }\n __syncthreads();\n for (int i = N - 2; i >= 0; i--) {\n if (gid < N) {\n output[i] = output[i + 1] + x[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n int i = tid + bid * blockSize;\n if (i < N) {\n output[i] = 0;\n for (int j = N - 1; j >= i; j--) { output[i] += x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = blockDim.x / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int offset = bid * hipBlockDim_x + tid;\n if (offset < N) {\n output[offset] = x[N - 1 - offset];\n if (offset > 0) {\n output[offset] += output[offset - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n 
__syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum in reverse order\n std::vector prefix_sum(x.size());\n prefix_sum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n prefix_sum[i] = prefix_sum[i + 1] + x[i];\n }\n\n // Gather the prefix sums from all ranks\n std::vector gathered_prefix_sums(x.size() * size);\n MPI_Gather(prefix_sum.data(), x.size(), MPI_INT, gathered_prefix_sums.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gathered_prefix_sums[i * size + size - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_sum = 0;\n for (int i = n - 1; i >= 0; i--) {\n local_sum += x[i];\n if (rank == 0) {\n output[i] = local_sum;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n, 0);\n std::vector global_sums(n, 0);\n\n // Compute the partial sums on each rank\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n if (i > 0) {\n partial_sums[i] += partial_sums[i - 1];\n }\n }\n\n // Gather the partial sums from all ranks\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n global_sums[i] += global_sums[i + 1];\n }\n output[i] = global_sums[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.reserve(n);\n std::iota(std::execution::par, output.begin(), output.end(), 0);\n std::transform(std::execution::par, x.rbegin(), x.rend(), output.rbegin(),\n [](int a, int b) { return a + b; });\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n 
}\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n std::vector prefix_sum(n);\n MPI_Allreduce(x.data(), prefix_sum.data(), n, MPI_INT, MPI_SUM, comm);\n\n if (rank == 0) {\n output.resize(n);\n for (int i = 0; i < n; i++) {\n output[i] = prefix_sum[n - 1 - i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n local_output[i] = x[i];\n } else {\n local_output[i] = x[i] + local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n for (int i = 0; i < n; i++) {\n if (i == 0) {\n output[i] = global_output[i];\n } else {\n output[i] = global_output[i] - global_output[i - 1];\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n for (int i = n - 1; i >= 0; i--) {\n if (rank == 0) {\n output[i] = local_x[i];\n }\n MPI_Allreduce(MPI_IN_PLACE, &local_x[i], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n local_output[i] = output[i];\n }\n }\n if (rank == 0) {\n output = local_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_sum = 0;\n for (int i = x.size() - 1; i >= 0; i--) {\n local_sum += x[i];\n if (rank == 0) {\n output[i] = local_sum;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector global_sums(n);\n\n // Compute the partial sums on each rank\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n if (i > 0) {\n partial_sums[i] += partial_sums[i - 1];\n }\n }\n\n // Gather the partial sums from all ranks\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n global_sums[i] += global_sums[i + 1];\n }\n output[i] = global_sums[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector partial_sums(n);\n std::partial_sum(x.rbegin(), x.rend(), partial_sums.rbegin());\n\n std::vector global_sums(n);\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n local_output[i] = x[i];\n } else {\n local_output[i] = x[i] + local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n 
MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Reverse the global prefix sum\n for (int i = 0; i < n / 2; i++) {\n std::swap(global_output[i], global_output[n - i - 1]);\n }\n\n // Store the result in output on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the reverse prefix sum\n std::vector reverse_prefix_sum(x.size());\n reverse_prefix_sum[x.size() - 1] = prefix_sum[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] - x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n output = reverse_prefix_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefix_sum(n);\n std::vector reverse_prefix_sum(n);\n\n // Compute the prefix sum\n prefix_sum[0] = x[0];\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the reverse prefix sum\n reverse_prefix_sum[n - 1] = prefix_sum[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] + prefix_sum[i];\n }\n\n // Gather the reverse prefix sum on rank 0\n MPI_Gather(&reverse_prefix_sum[0], n, MPI_INT, &output[0], n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n local_output[i] = x[i];\n if (i + 1 < n) {\n local_output[i] += local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i + 1 < n) {\n global_output[i] -= global_output[i + 1];\n }\n }\n\n // Store the result on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector global_sums(n);\n\n // Compute the partial sums on each rank\n partial_sums[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n partial_sums[i] = partial_sums[i + 1] + x[i];\n }\n\n // Gather the partial sums into the global_sums vector on rank 0\n MPI_Gather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n output[0] = global_sums[0];\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + global_sums[n - i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x in reverse order\n std::vector reversePrefixSum(x.size());\n reversePrefixSum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reversePrefixSum[i] = reversePrefixSum[i + 1] + x[i];\n }\n\n // Gather the reverse prefix sums from all ranks\n std::vector 
gatheredReversePrefixSum(x.size() * size);\n MPI_Gather(reversePrefixSum.data(), x.size(), MPI_INT, gatheredReversePrefixSum.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the final reverse prefix sum\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gatheredReversePrefixSum[i * size];\n }\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Compute the prefix sum of the vector x in reverse order.\n std::vector reverse_prefix_sum(x.size());\n reverse_prefix_sum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] + x[i];\n }\n\n // Gather the reverse prefix sums from all ranks.\n std::vector gathered_reverse_prefix_sums(x.size() * num_ranks);\n MPI_Gather(reverse_prefix_sum.data(), x.size(), MPI_INT, gathered_reverse_prefix_sums.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the output vector.\n if (my_rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gathered_reverse_prefix_sums[i * num_ranks + my_rank];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n for (int i = 0; i < n; i++) {\n local_output[i] = x[n - i - 1];\n }\n\n std::vector global_output(n);\n MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector recvbuf(n);\n std::vector sendbuf(n);\n for (int i = 0; i < n; i++) {\n sendbuf[i] = x[n - i - 1];\n }\n MPI_Allgather(sendbuf.data(), n, MPI_INT, recvbuf.data(), n, MPI_INT, MPI_COMM_WORLD);\n int sum = 0;\n for (int i = 0; i < n; i++) {\n sum += recvbuf[i];\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\n__global__ void reversePrefixSum(const int *x, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (int j = block_size / 2; j > 0; j /= 2) {\n if (tid < j) {\n output[tid] += output[tid + j];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n for (int i = tid + 1; i < N; i++) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n int offset = bid * stride;\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i + offset];\n output[i + offset] = sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N - 1; i >= 0; i--) {\n if (tid == i) {\n output[i] += output[i + 1];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid == 0) {\n output[0] = x[0];\n } else {\n output[tid] = output[tid - 1] + x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int i = blockIdx.x * blockDim.x + tid;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (int j = i + blockDim.x; j < N; j += blockDim.x) {\n output[j] = output[j - blockDim.x] + x[j];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int sum = 0;\n for (int i = tid; i < N; i += stride) {\n sum += x[i];\n output[N - 1 - i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n output[i] = 0;\n for (int j = i + 1; j < N; j++) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n int offset = bid * stride;\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i + offset];\n output[i + offset] = sum;\n }\n}\n", + "\n\n 
int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n if (i > 0) { output[i] += output[i - 1]; }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n int offset = block * stride + tid;\n\n if (offset < N) {\n output[offset] = x[offset];\n }\n\n __syncthreads();\n\n for (int i = offset + stride; i < N; i += stride) {\n output[i] = output[i - stride] + x[i];\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n if (tid == 0) {\n output[0] = x[0];\n } else {\n output[tid] = output[tid - 1] + x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_size * block_id + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int stride = blockDim * gridDim;\n int start = tid + blockId * blockDim;\n int end = min(start + stride, N);\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector reverse_partial_sums(n);\n\n // Compute the partial sums in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n partial_sums[i] = x[i];\n for (int j = i + 1; j < n; ++j) {\n partial_sums[j] += partial_sums[j - 1];\n }\n }\n\n // Compute the reverse partial sums in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; --i) {\n reverse_partial_sums[i] = partial_sums[i];\n if (i > 0) {\n reverse_partial_sums[i] += reverse_partial_sums[i - 1];\n }\n }\n\n // Gather the reverse partial sums on rank 0\n MPI_Gather(&reverse_partial_sums[0], n, MPI_INT, &output[0], n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local prefix sum\n std::vector local_prefix_sum(n);\n local_prefix_sum[0] = x[0];\n for (int i = 1; i < n; i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n\n // Compute the global prefix sum\n std::vector global_prefix_sum(n);\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n if (rank == 0) {\n output.resize(n);\n output[n - 1] = global_prefix_sum[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = global_prefix_sum[i] - global_prefix_sum[i + 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n 
std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_output[i] = local_x[n - 1 - i];\n }\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n std::vector local_prefix_sum(x.size());\n local_prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n\n // Compute the global prefix sum\n std::vector global_prefix_sum(x.size());\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), x.size(), MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = global_prefix_sum[x.size() - 1] - global_prefix_sum[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int n = x.size();\n std::vector partial_sums(n);\n\n // Compute the partial sums in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n partial_sums[j] += partial_sums[j - 1];\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sums(n);\n MPI_Reduce(partial_sums.data(), global_sums.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum on rank 0\n if (my_rank == 0) {\n output.resize(n);\n output[n - 1] = global_sums[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = global_sums[i] - global_sums[i + 1];\n }\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local reverse prefix sum\n std::vector local_output(n);\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (my_rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n std::vector global_output(n);\n MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the prefix sum in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_output[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n local_output[i] += x[j];\n }\n }\n\n // Reduce the local prefix sums to the global prefix sum on rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Reverse the prefix sum on rank 0\n if (rank == 0) {\n for (int i = 0; i < n / 2; i++) {\n std::swap(output[i], output[n - i - 1]);\n }\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n prefix_sum[i] += prefix_sum[i + 1];\n }\n }\n\n // Gather the results on rank 0\n if (my_rank == 0) {\n output = prefix_sum;\n }\n}\n", + "\n\n int n = x.size();\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &n_procs);\n int local_n = n / n_procs;\n int remainder = n % n_procs;\n if (my_rank == 0) {\n output.resize(n);\n }\n std::vector local_x(local_n + (my_rank < remainder ? 1 : 0));\n MPI_Scatter(x.data(), local_n, MPI_INT, local_x.data(), local_n, MPI_INT, 0,\n MPI_COMM_WORLD);\n std::vector local_output(local_n + (my_rank < remainder ? 1 : 0));\n local_output[local_n - 1] = local_x[local_n - 1];\n #pragma omp parallel for\n for (int i = local_n - 2; i >= 0; --i) {\n local_output[i] = local_x[i] + local_output[i + 1];\n }\n MPI_Gather(local_output.data(), local_n, MPI_INT, output.data(), local_n, MPI_INT,\n 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Reverse the prefix sum in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n output[i] = prefix_sum[n - 1 - i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nranks; i++) {\n std::vector recv_buf(n);\n MPI_Recv(recv_buf.data(), n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n output[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; 
i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n prefix_sum[i] -= prefix_sum[i + 1];\n }\n output[i] = prefix_sum[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = x.size() - 1; i >= 0; i--) {\n if (i < x.size() - 1) {\n prefix_sum[i] -= prefix_sum[i + 1];\n }\n output[i] = prefix_sum[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n local_output[i] = local_x[i];\n if (i + 1 < n) {\n local_output[i] += local_output[i + 1];\n }\n }\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of the vector x\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum of the vector x\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n output[i] = prefix_sum[x.size() - 1] - prefix_sum[i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(&output[0], x.size(), MPI_INT, &output[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + 
"parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n using namespace Kokkos;\n\n const size_t n = x.extent(0);\n View y(\"y\", n);\n\n parallel_for(n, KOKKOS_LAMBDA(const size_t i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return sum(y);\n}\n", + "\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n using l_d_type = Kokkos::LayoutLeft;\n using l_d_view_type = Kokkos::View;\n\n const size_t n = x.extent(0);\n l_d_view_type x_copy(\"x_copy\", n);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::parallel_scan(policy_type(0, n), KOKKOS_LAMBDA(const size_t& i, l_d_view_type& x_copy) {\n if (i > 0) { x_copy(i) += x_copy(i - 1); }\n }, tag_type());\n double sum = 0;\n for (size_t i = 0; i < n; ++i) { sum += x_copy(i); }\n return sum;\n}\n", + "\n using namespace Kokkos;\n\n const int N = x.extent(0);\n View y(\"y\", N);\n\n parallel_for(N, KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return reduce(N, KOKKOS_LAMBDA(int i, double sum) { return sum + y(i); }, 0.0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_type;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = typename TeamPolicy::member_type;\n\n double sum = 0.0;\n Kokkos::parallel_reduce(TeamPolicy(Kokkos::Experimental::partition_by_coordinates(x)),\n KOKKOS_LAMBDA(TeamMember& team) {\n double local_sum = 0.0;\n for (auto i = team.league_rank(); i < x.extent(0);\n i += team.league_size()) {\n local_sum += x(i);\n }\n team.team_reduce(KOKKOS_LAMBDA(double& tsum,\n double& lsum) { tsum += lsum; });\n if (team.team_rank() == 0) { sum = team.team_reduce(); }\n });\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using execution_space = Kokkos::DefaultExecutionSpace;\n using policy_type = Kokkos::RangePolicy;\n\n // Compute the prefix sum array\n View prefix_sum(\"prefix_sum\", x.size());\n parallel_for(policy_type(0, x.size()), KOKKOS_LAMBDA(const int i) {\n prefix_sum(i) = (i == 0) ? 
x(i) : x(i) + prefix_sum(i - 1);\n });\n\n // Compute the sum of the prefix sum array\n double sum = 0;\n parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int i, double& sum) {\n sum += prefix_sum(i);\n }, sum);\n\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n using execution_space = Kokkos::DefaultExecutionSpace;\n\n const int n = x.extent(0);\n auto y = Kokkos::View(\"y\", n);\n\n Kokkos::parallel_for(\n \"prefix_sum\", policy_t(0, n), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return Kokkos::parallel_reduce(\n \"sum_prefix_sum\", policy_t(0, n), KOKKOS_LAMBDA(const double& a,\n const double& b) {\n return a + b;\n },\n y);\n}\n", + "\n using namespace Kokkos;\n\n using policy_type = Kokkos::TeamPolicy;\n policy_type policy(x.extent(0));\n\n double sum = 0.0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n double prefix_sum = 0.0;\n for (int j = 0; j <= i; ++j) { prefix_sum += x(j); }\n sum += prefix_sum;\n });\n\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n auto prefix_sum = Kokkos::scan(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n },\n Reducer(0));\n\n return Kokkos::parallel_reduce(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += prefix_sum(i);\n },\n Reducer(0));\n}\n", + "\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::TeamPolicy;\n\n double sum = 0;\n Kokkos::parallel_reduce(\n TeamPolicy<>(x.extent(0), Kokkos::AUTO),\n KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n sum += update;\n },\n sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a prefix sum view\n View prefix_sum = create_mirror_view(x);\n\n // Compute the prefix sum\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = prefix_sum(i - 1) + x(i);\n }\n prefix_sum(i) = sum;\n });\n\n // Compute the sum of the prefix sum array\n double sum_of_prefix_sum = 0.0;\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += prefix_sum(i);\n }, sum_of_prefix_sum);\n\n return sum_of_prefix_sum;\n}\n", + "\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using ViewType = Kokkos::View;\n\n const size_t n = x.extent(0);\n ViewType prefix_sum(\"prefix_sum\", n);\n\n parallel_for(\"prefix_sum\", Policy(0, n), KOKKOS_LAMBDA(const size_t& i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n\n return prefix_sum(n - 1);\n}\n", + "\n Kokkos::View prefixSum = Kokkos::View(x.extent(0));\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = prefixSum(i - 1) + x(i);\n }\n }, prefixSum);\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n update += prefixSum(i);\n }, sum);\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.extent(0));\n Kokkos::parallel_for(\"prefix_sum_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n return Kokkos::parallel_reduce(\"sum_prefix_sum\", 
Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double& sum) {\n sum += prefix_sum(i);\n }, 0.0);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto prefix_sum = Kokkos::scan(policy_type(0, x.size()), tag_type(), x,\n [=](const double& a, const double& b) { return a + b; });\n\n return Kokkos::parallel_reduce(policy_type(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += prefix_sum(i); },\n 0.0);\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n prefix_sum(i) = (i == 0) ? x(i) : x(i) + prefix_sum(i - 1);\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n double sum = 0;\n Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n sum += update;\n },\n tag_type());\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = prefix_sum(i - 1) + x(i);\n }\n prefix_sum(i) = sum;\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reduction = Kokkos::Sum;\n\n // Compute the prefix sum array\n View y = Kokkos::scan(Policy(0, x.size()), Reduction(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n // Compute the sum of the prefix sum array\n double sum = Kokkos::reduce(Policy(0, y.size()), Reduction(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += y(i);\n });\n\n return sum;\n}\n", + "\n using Kokkos::parallel_reduce;\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n }, sum);\n\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n std::vector prefixSum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefixSum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for 
reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = (i > 0 ? prefix_sum[i - 1] : 0) + x[i];\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n, 0);\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n sum += x[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) prefix_sum[i] += prefix_sum[i - 1];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n }\n for (double v : x) {\n sum += v;\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n }\n for (double v : x) {\n sum += v;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n #pragma omp parallel for reduction(+: sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n std::vector prefixSum(x.size(), 0.0);\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) { prefixSum[i] += prefixSum[i - 1]; }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 
1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n // Compute the prefix sum array\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sum array\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n return sum;\n}\n", + "\n\n // Create a vector to store the prefix sum\n std::vector prefix_sum(x.size());\n\n // Compute the prefix sum in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sum\n double sum = 0;\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n return sum;\n}\n", + "\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = (i > 0 ? prefix_sum[i - 1] : 0) + x[i];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "serial", + "prompt": "/* Compute the prefix sum array of the vector x and return its sum.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (auto xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "(", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double 
sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "hip", + "prompt": "/* Compute the prefix sum array of the vector x and compute its sum. Store the result in sum.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\n__global__ void sumOfPrefixSum(const double *x, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n sum[tid + stride] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n\n double sum_local = 0;\n for (int i = gid; i < N; i += bid_size) {\n sum_local += x[i];\n }\n\n double sum_global;\n hipReduce(&sum_global, &sum_local, 1, hipReduceSum, 0);\n *sum = sum_global;\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid < N) {\n sum[gid] = x[gid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < hipBlockDim_x; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n double prefix_sum = 0;\n for (int i = 0; i <= gid; i++) {\n prefix_sum += x[i];\n }\n sum[gid] = prefix_sum;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n sum[tid] += sum[tid + i];\n }\n __syncthreads();\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum_local = x[tid];\n __syncthreads();\n for (int i = 1; i < blockDim.x; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n sum_local += x[tid + i];\n }\n __syncthreads();\n }\n sum[tid] = sum_local;\n }\n __syncthreads();\n if (tid == 0) {\n double sum_global = 0;\n for (int i = 0; i < N; i++) {\n sum_global += sum[i];\n }\n *sum = sum_global;\n }\n}\n", + "\n\n int tid = blockIdx.x 
* blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n sum[bid] += x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n size_t stride = blockDim.x * gridDim.x;\n double localSum = 0;\n for (size_t i = tid; i < N; i += stride) {\n localSum += x[i];\n }\n __syncthreads();\n double globalSum = localSum;\n for (size_t i = 1; i < blockDim.x; i++) {\n globalSum += localSum;\n }\n __syncthreads();\n *sum = globalSum;\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n sum[tid + stride] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n if (tid > 0) { sum[tid] += sum[tid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n __syncthreads();\n if (tid > 0) { sum[tid] += sum[tid - 1]; }\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 0; i < N; i++) { sum[0] += sum[i]; }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "1\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sum\n double sum = 0;\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), 
prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = std::accumulate(prefix_sum.begin(), prefix_sum.end(), 0.0);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n\n double prefix_sum = 0;\n MPI_Allreduce(&sum, &prefix_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return prefix_sum;\n } else {\n return 0;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefixSum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum array\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sum array\n double sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n MPI_Allreduce(&sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n }\n return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "cuda", + "prompt": "/* Compute the prefix sum array of the vector x and compute its sum. Store the result in sum.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\n__global__ void sumOfPrefixSum(const double *x, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n double sum_local = 0;\n for (int i = tid; i < N; i += stride) {\n sum_local += x[i];\n }\n __shared__ double sum_shared[blockDim.x];\n sum_shared[tid] = sum_local;\n __syncthreads();\n if (tid == 0) {\n double sum_block = 0;\n for (int i = 0; i < blockDim.x; i++) {\n sum_block += sum_shared[i];\n }\n atomicAdd(sum, sum_block);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n const int bid = blockIdx.x;\n const int bid_size = blockDim.x;\n const int gid = bid * bid_size + tid;\n\n if (gid < N) {\n double partial_sum = x[gid];\n if (gid > 0) {\n partial_sum += x[gid - 1];\n }\n sum[gid] = partial_sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = tid + bid * bid_size;\n if (i < N) {\n sum[i] = x[i];\n }\n __syncthreads();\n for (int s = bid_size / 2; s > 0; s >>= 1) {\n if (i < N && tid < s) {\n sum[i] += sum[i + s];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum_prefix = 0;\n for (int i = 0; i <= tid; i++) {\n sum_prefix += x[i];\n }\n double sum_suffix = 0;\n for (int i = tid + 1; i < N; i++) {\n sum_suffix += x[i];\n }\n sum[tid] = sum_prefix + sum_suffix;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * 
blockDim.x + threadIdx.x;\n if (tid < N) {\n double partial_sum = 0;\n for (int i = 0; i <= tid; i++) { partial_sum += x[i]; }\n sum[tid] = partial_sum;\n }\n __syncthreads();\n if (tid == 0) {\n double total_sum = 0;\n for (int i = 0; i < N; i++) { total_sum += sum[i]; }\n *sum = total_sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = 1; i < N; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double prefixSum = 0;\n for (int i = 0; i <= tid; i++) {\n prefixSum += x[i];\n }\n sum[tid] = prefixSum;\n }\n __syncthreads();\n if (tid == 0) {\n double totalSum = 0;\n for (int i = 0; i < N; i++) {\n totalSum += sum[i];\n }\n *sum = totalSum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = bid * bid_size + tid;\n\n double prefix_sum = 0;\n while (i < N) {\n prefix_sum += x[i];\n x[i] = prefix_sum;\n i += grid_size * bid_size;\n }\n\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) { *sum += x[i]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n double s = 0;\n for (int i = 0; i <= gid; i++) {\n s += x[i];\n }\n sum[gid] = s;\n if (gid > 0) {\n sum[gid] += sum[gid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(sum, x[tid]);\n }\n __syncthreads();\n if (tid == 0) {\n double sum_temp = *sum;\n atomicAdd(sum, sum_temp);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(prefixSum.data(), &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) return sum;\n else return 0.0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[0], &sum, x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefix_sum[x.size() - 1], 1, MPI_DOUBLE, &sum, 1, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n double global_sum = 0.0;\n\n // Compute the prefix sum of the local vector\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 1; i < x.size(); i++) {\n x[i] += x[i - 1];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n 
MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum of the global prefix sum on rank 0\n if (rank == 0) {\n return global_sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n double sum = 0.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); i++) {\n double local_sum = 0.0;\n for (int j = 0; j <= i; j++) {\n local_sum += x[j];\n }\n prefix_sum[i] = local_sum;\n }\n\n MPI_Reduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < prefixSum.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) prefix_sum[i] += prefix_sum[i - 1];\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) return sum;\n else return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefixSum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < n; ++i) {\n sum += prefixSum[i];\n }\n\n if (rank == 0) return sum;\n return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sumOfPrefixSum = 0.0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sumOfPrefixSum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sumOfPrefixSum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(prefix_sum.data(), &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < n; ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of each rank's portion of x\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Reduce the prefix sums across all ranks\n std::vector global_prefix_sum(x.size());\n MPI_Allreduce(prefix_sum.data(), global_prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the sum of the prefix sum array\n double sum = 0.0;\n for (int i = 0; i < x.size(); i++) {\n sum += global_prefix_sum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefixSum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n double sum = 0;\n MPI_Reduce(&prefixSum[n - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n 
std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < prefixSum.size(); i++) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector prefix_sum(n);\n std::vector global_prefix_sum(n);\n\n // Compute the prefix sum on each rank\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Reduce the prefix sums from all ranks\n MPI_Allreduce(prefix_sum.data(), global_prefix_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum of the prefix sum on rank 0\n if (rank == 0) {\n return global_prefix_sum[n - 1];\n } else {\n return 0.0;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace the i-th element of the array x with the minimum value from indices 0 through i.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 0; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n\n Kokkos::parallel_for(\n Kokkos::RangePolicy>(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n x(i) = x(i);\n } else {\n x(i) = Kokkos::min(x(i), x(i - 1));\n }\n },\n Layout());\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n 
}\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = Kokkos::min(x(i), Kokkos::min(x, i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n float min = x[0];\n for (int j = 1; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = 
min;\n }\n}\n", + "", + "\n\n int n = x.size();\n float min = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for shared(min)\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n std::vector mins(n);\n mins[0] = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n mins[i] = std::min(mins[i - 1], x[i]);\n }\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = mins[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n std::vector min(n);\n min[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min[i] = std::min(min[i - 1], x[i]);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = min[i];\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "serial", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], *std::min_element(x.begin(), x.begin() + i));\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "<, <, 
\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:>\u51a2\u51a2\u51a28\u51a2\u51a2:>\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:>:>", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "s\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2cernecernecerne,\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "hip", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\n__global__ void partialMinimums(float *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n float min = x[0];\n for (int j = 1; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n if (x[i] < min) { min = x[i]; }\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int 
bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n x[i] = min;\n }\n if (rank == 0) {\n MPI_Reduce(&x[0], &x[0], x.size(), MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(&x[0], nullptr, x.size(), MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n } else {\n MPI_Send(&x[rank], 1, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = x[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < size; i++) {\n MPI_Send(&x[i - 1], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n float min;\n MPI_Recv(&min, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x[i - 1] = std::min(x[i - 1], min);\n }\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Bcast(&x[i - 1], 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks onto rank 0\n std::vector min_values(x.size());\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &min_values[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Update the values on rank 0\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], min_values[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = 
x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = x[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < size; i++) {\n MPI_Send(&x[i - 1], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n float min;\n MPI_Recv(&min, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x[i - 1] = std::min(x[i - 1], min);\n }\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Bcast(&x[i - 1], 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "cuda", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\n__global__ void partialMinimums(float *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n float min = x[0];\n for (size_t j = 1; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n __shared__ float min;\n\n if (tid == 0) {\n min = x[bid * bid_size];\n }\n\n __syncthreads();\n\n for (int i = bid * bid_size + tid; i < N; i += bid_size) {\n min = fminf(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < size) {\n float min = x[i];\n for (int j = i + 1; j < size; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, 
x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n if (tid == 0) {\n x[bid * bid_size] = x[bid * bid_size];\n } else {\n x[bid * bid_size + tid] = fminf(x[bid * bid_size + tid], x[bid * bid_size + tid - 1]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank], 1, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks to rank 0\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &x[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct partial minimums\n if (rank != 0) {\n x.clear();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks to rank 0\n MPI_Gather(&x[0], n, MPI_FLOAT, &x[0], n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete list of partial minimums\n if (rank == 0) {\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float *x_arr = &x[0];\n float *min_arr = new float[n];\n\n // Compute the minimums for each rank\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n float min = 
x_arr[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x_arr[j]);\n }\n min_arr[i] = min;\n }\n\n // Gather the minimums from all ranks\n MPI_Gather(min_arr, n, MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n x_arr[i] = min_arr[i];\n }\n }\n\n delete[] min_arr;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector min_values(n);\n#pragma omp parallel for\n for (int i = 0; i < n; i++) {\n min_values[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n min_values[i] = std::min(min_values[i], x[j]);\n }\n }\n MPI_Allgather(&min_values[0], n, MPI_FLOAT, &x[0], n, MPI_FLOAT, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min = x[0];\n\n #pragma omp parallel for schedule(static)\n for (int i = 1; i < n; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_FLOAT, x.data(), n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(x.data(), n, MPI_FLOAT, NULL, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min_val = x[0];\n\n #pragma omp parallel for reduction(min : min_val)\n for (int i = 1; i < n; i++) {\n min_val = std::min(min_val, x[i]);\n }\n\n MPI_Bcast(&min_val, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::min(x[i], min_val);\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_FLOAT, x.data(), n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(x.data(), n, MPI_FLOAT, NULL, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n std::vector globalMinimums(x.size());\n MPI_Allgather(x.data(), x.size(), MPI_FLOAT, globalMinimums.data(), x.size(), MPI_FLOAT, MPI_COMM_WORLD);\n\n // Update the minimums on rank 0\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], globalMinimums[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n std::vector global_min(n);\n\n // Compute local minimums\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n // Reduce local minimums to global minimums\n MPI_Allreduce(local_min.data(), global_min.data(), n, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n // Store global minimums on rank 0\n if (rank == 0) {\n x = global_min;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n MPI_Reduce(local_min.data(), x.data(), n, MPI_FLOAT, MPI_MIN, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector localMin(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n localMin[i] = x[i];\n for (int j = 0; j < i; j++) {\n localMin[i] = std::min(localMin[i], x[j]);\n }\n }\n\n MPI_Allgather(localMin.data(), n, MPI_FLOAT, x.data(), n, MPI_FLOAT, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n MPI_Reduce(local_min.data(), x.data(), n, MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &x[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank != 0) {\n x.clear();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float localMin = x[0];\n float globalMin;\n\n #pragma omp parallel for reduction(min : localMin)\n for (int i = 1; i < n; i++) {\n localMin = std::min(localMin, x[i]);\n }\n\n MPI_Allreduce(&localMin, &globalMin, 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n for (int i = 1; i < n; i++) {\n x[i] = globalMin;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of ranks\n int rank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the partial minimums in parallel\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n x[i] = min;\n }\n\n // Gather the partial minimums from all ranks\n std::vector partialMins(x.size() * numRanks);\n MPI_Gather(x.data(), x.size(), MPI_FLOAT, partialMins.data(), x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // If rank 0, store the result in x\n if (rank == 0) {\n x.clear();\n for (int i = 0; i < partialMins.size(); i += numRanks) {\n x.push_back(partialMins[i]);\n }\n }\n}\n", + "<, Synag", + "\n\n int n = x.size();\n int nthreads = omp_get_num_threads();\n int tid = omp_get_thread_num();\n int chunk = n / nthreads;\n int start = tid * chunk;\n int end = start + chunk;\n if (tid == nthreads - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n x[i] = std::min(x[i], *std::min_element(x.begin(), x.begin() + i));\n }\n omp_barrier();\n if (tid == 0) {\n for (int i = chunk; i < n; i += chunk) {\n for (int j = 0; j < chunk; j++) {\n x[i + j] = 
std::min(x[i + j], x[i + j - chunk]);\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector localMin(n);\n std::vector globalMin(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n localMin[i] = x[i];\n for (int j = 0; j <= i; j++) {\n localMin[i] = std::min(localMin[i], x[j]);\n }\n }\n\n MPI_Allreduce(localMin.data(), globalMin.data(), n, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n x = globalMin;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n std::vector min_values(x.size(), 0.0);\n MPI_Allgather(&x[0], x.size(), MPI_FLOAT, &min_values[0], x.size(), MPI_FLOAT, MPI_COMM_WORLD);\n\n // Update the values on rank 0\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::min(x[i], min_values[i]);\n }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n const int n = x.extent(0);\n View max_so_far = \"max_so_far\";\n View max_ending_here = \"max_ending_here\";\n Kokkos::parallel_for(\"max_subarray\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i) {\n if (i == 0) {\n max_ending_here[i] = x[i];\n max_so_far[i] = x[i];\n } else {\n max_ending_here[i] = std::max(x[i], max_ending_here[i - 1] + x[i]);\n max_so_far[i] = std::max(max_so_far[i - 1], max_ending_here[i]);\n }\n });\n return max_so_far[n - 1];\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::View dp = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n dp(i) = x(i);\n } else {\n dp(i) = std::max(dp(i - 1) + x(i), x(i));\n }\n });\n return *Kokkos::max_element(dp);\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n for (int i = 1; i < x.extent(0); ++i) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int max_sum = 0;\n int current_sum = 0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, int& max_sum) {\n current_sum = std::max(x[i], current_sum + x[i]);\n max_sum = std::max(max_sum, current_sum);\n }, max_sum, tag_type());\n return max_sum;\n}\n", + "\n\n using namespace Kokkos;\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n Kokkos::parallel_scan(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far, int& max_ending_here) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n });\n\n return 
max_so_far;\n}\n", + "\n\n // Initialize the maximum sum to the first element of the array.\n int max_sum = x(0);\n\n // Iterate over the remaining elements of the array.\n for (int i = 1; i < x.extent(0); i++) {\n // If the current element is greater than the sum of the previous element and\n // the maximum sum, then set the maximum sum to the current element.\n if (x(i) > max_sum + x(i - 1)) {\n max_sum = x(i);\n } else {\n // Otherwise, add the current element to the maximum sum.\n max_sum += x(i);\n }\n }\n\n // Return the maximum sum.\n return max_sum;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_for(\n \"maximumSubarray\", Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_so_far);\n\n return max_so_far;\n}\n", + "\n\n Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::Kok\u30b3\u30af_VectorRange(x.extent(0)),\n Kok\u30b3\u30af::plus(),\n Kok\u30b3\u30af::make_iota_view(x.extent(0)),\n [&](int i) {\n return Kok\u30b3\u30af::parallel_reduce(\n Kok\u30b3\u30af::Kok\u30b3\u30af_VectorRange(i, x.extent(0)),\n Kok\u30b3\u30af::plus(),\n x[i],\n [&](int j) { return x[j]; });\n });\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Max;\n using reduction_functor_type = Kokkos::Max;\n using reduction_policy_type = Kokkos::Scan;\n\n int max_so_far = 0;\n int max_ending_here = 0;\n\n Kokkos::parallel_reduce(\n reduction_policy_type(x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_so_far) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_so_far);\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_ending_here, int& max_so_far) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_ending_here, 
max_so_far);\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(int i, int& max_so_far, int& curr_max) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n },\n max_so_far, curr_max);\n\n return max_so_far;\n}\n", + "\n\n // Initialize the maximum sum to the first element of the array\n int max_sum = x(0);\n\n // Initialize the current sum to the first element of the array\n int current_sum = x(0);\n\n // Loop through the remaining elements of the array\n for (int i = 1; i < x.extent(0); i++) {\n // Update the current sum by adding the current element\n current_sum = std::max(x(i), current_sum + x(i));\n\n // Update the maximum sum if the current sum is greater\n max_sum = std::max(max_sum, current_sum);\n }\n\n // Return the maximum sum\n return max_sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. 
if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_sum = INT_MIN;\n int sum = 0;\n #pragma omp parallel for reduction(max:max_sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n max_sum = std::max(max_sum, sum);\n if (sum < 0) {\n sum = 0;\n }\n }\n return max_sum;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n max_ending_here = std::max(x[i], max_ending_here + x[i]);\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n int n = x.size();\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n max_ending_here = std::max(x[i], max_ending_here + x[i]);\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < 
max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (size_t i = 0; i < x.size(); ++i) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0, max_ending_here = 0;\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (size_t i = 0; i < x.size(); ++i) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_sum = INT_MIN;\n int current_sum = 0;\n #pragma omp parallel for reduction(max:max_sum)\n for (int i = 0; i < x.size(); ++i) {\n current_sum = std::max(x[i], current_sum + x[i]);\n max_sum = std::max(max_sum, current_sum);\n }\n return max_sum;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n int n = x.size();\n #pragma parallel for reduction(+:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n" + ] 
+ }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "serial", + "prompt": "/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN;\n int 
max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "ontiguous, Synag", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Store the result in sum.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\n__global__ void maximumSubarray(const int *x, size_t N, int *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = hipMax(x[i], max_ending_here + x[i]);\n max_so_far = hipMax(max_so_far, max_ending_here);\n }\n hipAtomicMax(sum, &max_so_far);\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid >= N) return;\n\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = block.thread.x;\n int stride = block.thread.x + block.thread.y * block.dim.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int stride = hipBlockDim_x * hipGridDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int s = 0;\n int max = 0;\n for (size_t i = tid; i < N; i += blockDim.x * gridDim.x) {\n s = max(x[i], s + x[i]);\n max = max(max, s);\n }\n atomicAdd(sum, max);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int stride = bid_size * hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = start; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int tid_offset = tid + bid * bid_size;\n int sum_local = 0;\n int max_sum = 0;\n if (tid_offset < N) {\n sum_local = x[tid_offset];\n }\n __syncthreads();\n for (int i = tid_offset + bid_size; i < N; i += bid_size) {\n sum_local = max(sum_local + x[i], x[i]);\n max_sum = max(max_sum, 
sum_local);\n }\n if (tid == 0) {\n *sum = max_sum;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n __syncthreads();\n if (tid == 0) *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (i < N) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n i += hipBlockDim_x * hipGridDim_x;\n }\n *sum = max_so_far;\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int stride = blockDim.x * gridDim.x;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = tid; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bsize = hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int stride = bsize * hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N && i < start + stride; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n if (tid == 0) *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipBlockDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n hipAtomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int 
max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n hipAtomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int stride = bid_size * hipGridDim_x;\n\n int local_max = 0;\n int global_max = 0;\n\n for (int i = tid; i < N; i += stride) {\n local_max = max(x[i], local_max + x[i]);\n global_max = max(global_max, local_max);\n }\n\n if (tid == 0) {\n *sum = global_max;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n int max_so_far = 0, max_ending_here = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_max;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_sum = x[0];\n int sum = x[0];\n for (int i = 1; i < n; i++) {\n sum = std::max(x[i], sum + x[i]);\n max_sum = std::max(max_sum, sum);\n }\n return max_sum;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n int global_max;\n MPI_Allreduce(&max_so_far, &global_max, 
1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "(4, -1, 2, 1) = 6.", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n int maxSum = INT_MIN;\n int sum = 0;\n for (int i = myRank; i < x.size(); i += numRanks) {\n sum += x[i];\n maxSum = std::max(maxSum, sum);\n if (sum < 0) {\n sum = 0;\n }\n }\n\n MPI_Allreduce(&maxSum, &maxSum, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxSum;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", 
+ "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n\n return max_so_far;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Store the result in sum.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\n__global__ void maximumSubarray(const int *x, size_t N, int *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Each thread computes the sum of its subarray\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int partialSum = 0;\n for (int i = tid; i < N; i += blockDim.x * gridDim.x) {\n partialSum += x[i];\n }\n\n // Each thread writes its partial sum to global memory\n __shared__ int partialSums[blockDim.x];\n partialSums[threadIdx.x] = partialSum;\n __syncthreads();\n\n // Each thread computes the maximum of its partial sum and the partial sums of the previous threads\n for (int i = blockDim.x / 2; i > 0; i >>= 1) {\n if (threadIdx.x < i) {\n partialSums[threadIdx.x] = max(partialSums[threadIdx.x], partialSums[threadIdx.x + i]);\n }\n __syncthreads();\n }\n\n // The first thread writes the maximum sum to global memory\n if (threadIdx.x == 0) {\n *sum = partialSums[0];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n\n int maxSoFar = x[tid];\n int maxEndingHere = x[tid];\n\n for (int i = tid + blockDim; i < N; i += blockDim * gridDim) {\n maxEndingHere = maxEndingHere + x[i];\n if (maxSoFar < maxEndingHere) {\n maxSoFar = maxEndingHere;\n }\n if (maxEndingHere < 0) {\n maxEndingHere = 0;\n }\n }\n\n int maxOverall = maxSoFar;\n\n if (blockId == 0) {\n for (int i = 1; i < gridDim; i++) {\n int temp = maxOverall;\n maxOverall = max(maxOverall, maxSoFar);\n maxSoFar = max(temp, maxSoFar);\n }\n }\n\n if (blockId == 0) {\n *sum = maxOverall;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n int max_sum = INT_MIN;\n int current_sum = 0;\n for (int i = start; i < end; i++) {\n current_sum = max(x[i], current_sum + x[i]);\n max_sum = max(max_sum, current_sum);\n }\n sum[tid] = max_sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int block_start = block_id * block_size;\n int block_end = min(block_start + block_size, N);\n int max_so_far = x[block_start];\n int max_ending_here = x[block_start];\n for (int i = block_start + 1; i < block_end; i++) {\n max_ending_here = max(x[i], 
max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n __shared__ int shared_max[block_size];\n shared_max[tid] = max_so_far;\n __syncthreads();\n if (tid == 0) {\n int max_overall = shared_max[0];\n for (int i = 1; i < block_size; i++) {\n max_overall = max(max_overall, shared_max[i]);\n }\n *sum = max_overall;\n }\n}\n", + "\n\n int max_ending_here = x[0];\n int max_so_far = x[0];\n for (size_t i = 1; i < N; ++i) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n int start = tid + block * stride;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (start < N) {\n max_ending_here = max_ending_here + x[start];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n start += stride;\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int sum = 0;\n int maxSum = -2147483648;\n for (int i = tid; i < N; i += blockIdx.x * blockDim.x) {\n sum += x[i];\n maxSum = max(maxSum, sum);\n if (sum < 0) sum = 0;\n }\n sum = maxSum;\n *sum = maxSum;\n}\n", + "\n\n int my_sum = 0;\n int max_sum = 0;\n int max_sum_so_far = 0;\n int start = 0;\n int end = 0;\n int s = threadIdx.x;\n int e = threadIdx.x + blockDim.x - 1;\n if (e >= N) e = N - 1;\n for (int i = s; i <= e; i++) {\n my_sum += x[i];\n if (my_sum > max_sum_so_far) {\n max_sum_so_far = my_sum;\n start = s;\n end = i;\n }\n if (my_sum < 0) {\n my_sum = 0;\n }\n }\n if (max_sum_so_far > max_sum) {\n max_sum = max_sum_so_far;\n *sum = max_sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = tid; i < N; i += blockDim.x) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; ++i) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int stride = blockDim * gridDim;\n int start = tid + blockId * blockDim;\n int max_so_far = 0;\n int max_ending_here = 0;\n while 
(start < N) {\n max_ending_here = max_ending_here + x[start];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n start += stride;\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n }\n *sum = max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n for (size_t i = 1; i < N; ++i) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int sum_so_far = 0;\n int max_sum = 0;\n if (tid < N) {\n sum_so_far = x[tid];\n max_sum = x[tid];\n }\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n sum_so_far = fmax(x[i], sum_so_far + x[i]);\n max_sum = fmax(max_sum, sum_so_far);\n __syncthreads();\n }\n if (tid == 0) { *sum = max_sum; }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int max_so_far = 0;\n int max_ending_here = 0;\n\n for (int i = tid; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n sum[bid] = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n int chunk_size = n / num_ranks;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == num_ranks - 1) {\n end = n;\n }\n#pragma omp parallel for reduction(max : max_ending_here)\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n MPI_Allreduce(&max_so_far, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_so_far;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk = n / nprocs;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == nprocs - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int max_global;\n MPI_Reduce(&max_so_far, &max_global, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_global;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk_size = n / nprocs;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == nprocs - 1) {\n end = n;\n }\n#pragma omp parallel\n {\n int max_so_far_local = 0, max_ending_here_local = 0;\n#pragma omp for\n for (int i = start; i < end; i++) {\n max_ending_here_local = max_ending_here_local + x[i];\n if (max_so_far_local < max_ending_here_local) {\n max_so_far_local = max_ending_here_local;\n }\n if (max_ending_here_local < 0) {\n max_ending_here_local = 0;\n }\n }\n#pragma omp critical\n {\n if (max_so_far < max_so_far_local) {\n max_so_far = max_so_far_local;\n }\n }\n }\n MPI_Reduce(&max_so_far, &max_ending_here, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_ending_here;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallel for\n#pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // MPI reduction\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel\n {\n int local_max_so_far = INT_MIN, local_max_ending_here = 0;\n #pragma omp for\n for (int i = rank; i < n; i += nprocs) {\n local_max_ending_here = local_max_ending_here + x[i];\n if (local_max_so_far < 
local_max_ending_here)\n local_max_so_far = local_max_ending_here;\n if (local_max_ending_here < 0)\n local_max_ending_here = 0;\n }\n #pragma omp critical\n {\n if (max_so_far < local_max_so_far)\n max_so_far = local_max_so_far;\n }\n }\n return max_so_far;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int max_local = 0;\n int max_global = 0;\n\n #pragma omp parallel for reduction(max : max_local)\n for (int i = 0; i < n; i++) {\n max_local = std::max(x[i], max_local + x[i]);\n }\n\n MPI_Allreduce(&max_local, &max_global, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return max_global;\n } else {\n return 0;\n }\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel num_threads(4)\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n\n int global_max = max_so_far;\n MPI_Allreduce(&global_max, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallelization\n #pragma omp parallel\n {\n // Each thread computes the maximum subarray sum for its portion of the array\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n\n // MPI reduction to find the maximum subarray sum across all ranks\n int global_max;\n MPI_Allreduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n int chunk_size = n / num_threads;\n int start = tid * chunk_size;\n int end = start + chunk_size;\n if (tid == num_threads - 1) {\n end = n;\n }\n\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n int chunk_size = n / num_ranks;\n int start = my_rank * chunk_size;\n int end = start + chunk_size;\n if (my_rank == num_ranks - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if 
(max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // Reduce the maximum subarray sum across all ranks\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallel for loop\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // MPI reduce to get the maximum subarray sum\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_max;\n } else {\n return 0;\n }\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = std::min(start + chunk_size, n);\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n 
}\n\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int nthreads = omp_get_num_threads();\n int chunk = n / nthreads;\n int start = tid * chunk;\n int end = (tid + 1) * chunk;\n if (tid == nthreads - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_max;\n }\n return 0;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(Kokkos::View const& x, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n const int N = x.extent(0);\n parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = sum + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output);\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = output(i - 1) + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i, double& update) {\n update += x(i);\n output(i) = update;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = sum + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output(0));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) 
{\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n Kokkos::parallel_for(\"prefix_sum\", Kokkos::RangePolicy(0, output.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = x(i);\n } else {\n output(i) = output(i - 1) + x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output(0));\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n const size_t n = x.extent(0);\n output(0) = x(0);\n parallel_for(policy_type(1, n), KOKKOS_LAMBDA(const size_t i) {\n output(i) = output(i - 1) + x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"prefixSum\", output.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "Kok", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefix_sum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n const size_t size = x.extent(0);\n output(0) = x(0);\n parallel_for(policy_type(1, size), KOKKOS_LAMBDA(const size_t i) {\n output(i) = output(i - 1) + x(i);\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::Range(0, N), KOKKOS_LAMBDA (const int i) {\n if (i == 0) {\n output(i) = x(i);\n } else {\n output(i) = output(i - 1) + x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use OpenMP to compute in parallel.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp 
parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "<", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n double sum = 0;\n for (int j = 0; j <= i; j++) {\n sum += x[j];\n }\n output[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "serial", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (std::size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "<", + "<", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for 
(size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\n__global__ void prefixSum(const double* x, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n if (tid > 0) { output[tid] += output[tid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n \n if (gid < N) {\n output[gid] = x[gid];\n }\n \n __syncthreads();\n \n for (int stride = 1; stride < N; stride *= 2) {\n if (gid < N && gid + stride < N) {\n output[gid + stride] += output[gid];\n }\n 
__syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bsize = blockDim.x;\n int stride = bsize * bid;\n int i = stride + tid;\n if (i < N) {\n output[i] = x[i];\n if (i > 0) { output[i] += output[i - 1]; }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (gid % (2 * i) == 0 && gid + i < N) {\n output[gid + i] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n output[idx] = x[idx];\n if (idx > 0) {\n output[idx] += output[idx - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n 
Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n double global_sum = 0;\n\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n output.push_back(global_sum);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n MPI_Allreduce(&local_sum, &output[i], 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += 
x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partialSum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n partialSum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partialSums(size);\n MPI_Gather(&partialSum, 1, MPI_DOUBLE, partialSums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partialSums[0];\n for (size_t i = 1; i < size; ++i) {\n output[i] = output[i - 1] + partialSums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum on each rank\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local sums to rank 0\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n for (int i = 1; i < size; ++i) {\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allgather(&local_sum, 1, MPI_DOUBLE, &global_sum, 1, MPI_DOUBLE, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector partial_sum(n);\n MPI_Allreduce(x.data(), partial_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n output = partial_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] - x[i - 1];\n }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\n__global__ void prefixSum(const double *x, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (gid % (2 * i) == 0 && gid + i < N) {\n output[gid] += output[gid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n output[idx] = 0;\n for (int i = 0; i <= idx; i++) {\n output[idx] += x[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 2; stride <= N; stride *= 2) {\n if (tid % stride == 0 && tid + stride / 2 < N) {\n output[tid] += output[tid + stride / 2];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n // Compute the prefix sum of the first bid_size elements in x\n double sum = 0;\n for (int i = tid; i < bid_size; i++) {\n sum += x[i];\n }\n\n // Store the prefix sum in the output array\n output[tid] = sum;\n\n // Synchronize the threads in the block\n __syncthreads();\n\n // Compute the prefix sum of the remaining elements in x\n for (int i = bid_size; i < N; i += bid_size) {\n // Load the prefix sum of the previous block\n sum = output[i - bid_size];\n\n // Compute the prefix sum of the current block\n for (int j = tid; j < bid_size; j++) {\n sum += x[i + j];\n }\n\n // Store the prefix sum in the output array\n output[i + tid] = sum;\n\n // Synchronize the threads in the block\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n // Compute the prefix sum\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n if (gid < N) {\n output[gid] = x[gid];\n }\n\n __syncthreads();\n\n for (int offset = 1; offset < N; offset *= 2) {\n if (gid + offset < N) {\n output[gid + offset] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n 
__syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n // Each thread computes the prefix sum of a subset of the elements in x.\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n // Compute the prefix sum of the current block\n double block_sum = 0;\n for (int i = tid; i < N; i += bid_size) {\n block_sum += x[i];\n output[i] = block_sum;\n }\n\n // Synchronize the threads in the block\n __syncthreads();\n\n // Compute the prefix sum of the previous blocks\n if (tid == 0) {\n for (int i = bid + 1; i < N; i += bid_size) {\n output[i] += output[i - 1];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "(", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int size = blockDim.x;\n int stride = size * gridDim.x;\n int start = tid + bid * stride;\n double sum = 0;\n for (int i = start; i < N; i += stride) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "_t", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the prefix sum of the vector x into output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Store the result in output on rank 0.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+: local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Gather the local prefix sums\n std::vector global_sum(x.size(), 0);\n MPI_Allreduce(output.data(), global_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM);\n\n // Store the global prefix sum on rank 0\n if (rank == 0) {\n output = global_sum;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n // Compute the local prefix sum\n std::vector local_prefix_sum(n);\n local_prefix_sum[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n // Compute the global prefix sum\n std::vector global_prefix_sum(n);\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n // Store the result on rank 0\n if (rank == 0) {\n output = global_prefix_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Reduce(partial_sum.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Bcast(output.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n #pragma omp parallel for\n for (int j = 1; j < n; j++) {\n output[j] += output[j - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes the sum of its local data\n double local_sum = 0;\n #pragma omp parallel for reduction(+: local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n // Gather the local sums into the output vector on rank 0\n MPI_Gather(&local_sum, 1, MPI_DOUBLE, output.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n \n // Rank 0 computes the prefix sum\n if (rank == 0) {\n for (size_t i = 1; i < size; ++i) {\n output[i] += output[i - 1];\n }\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the partial sum on each rank\n double partialSum = 0;\n #pragma omp parallel for reduction(+:partialSum)\n for (int i = 0; i < x.size(); i++) {\n partialSum += x[i];\n }\n\n // Gather the partial sums from all ranks\n std::vector partialSums(numRanks);\n MPI_Gather(&partialSum, 1, MPI_DOUBLE, partialSums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (myRank == 0) {\n output[0] = 0;\n for (int i = 1; i < numRanks; i++) {\n output[i] = output[i - 1] + partialSums[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce local prefix sums to get global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce local prefix sums to global prefix sum\n MPI_Reduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n, 0);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n if (i > 0) {\n partial_sum[i] += partial_sum[i - 1];\n }\n }\n\n std::vector global_sum(n, 0);\n MPI_Allreduce(partial_sum.data(), 
global_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0.0;\n #pragma omp parallel for reduction(+:partial_sum)\n for (int i = 0; i < x.size(); ++i) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; ++i) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n if (i > 0) {\n partial_sum[i] += partial_sum[i - 1];\n }\n }\n\n MPI_Reduce(partial_sum.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": 
"transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) *= -1;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n int &xi = x(i);\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n int &xi = x(i);\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n int val = x(i);\n if (val % 2 == 0) {\n x(i) = val / 2;\n } else {\n x(i) = -val;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) = x(i) / 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + 
"\n\n Kok koste::parallel_for(0, x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "omp", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + ":", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma\u00a0omp\u00a0parallel\u00a0for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] *= -1;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "serial", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "negate, Synag", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (auto &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "negate, Synag", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "hip", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\n__global__ void negateOddsAndHalveEvens(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipGetGlobalThreadIdx();\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0)\n x[tid] /= 2;\n else\n x[tid] = -x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n" + ] + }, + { + "problem_type": 
"transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "E", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0,\n MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0,\n MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[0], n / size, MPI_INT, &x[0], n / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Negate odd values\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n }\n }\n\n // Halve even values\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n }\n }\n\n // Gather results on rank 0\n if (rank == 0) {\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_INT, result.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n x = result;\n } else {\n MPI_Gather(nullptr, 0, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, 
MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), 
MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its portion of x.\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(x.data(), x.size(), MPI_INT, x.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data(), x.size(), MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its portion of the vector.\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "cuda", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\n__global__ void negateOddsAndHalveEvens(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0)\n x[tid] /= 2;\n else\n x[tid] = -x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int n_local = n / nprocs;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], n_local, MPI_INT, &x[0], n_local, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its part of the vector\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], end - start, MPI_INT, &x[0], end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int local_size = x.size() / size;\n int remain = x.size() % size;\n int local_start = rank * local_size;\n int local_end = local_start + local_size;\n if (rank < remain) {\n local_end++;\n }\n\n#pragma omp parallel for\n for (int i = local_start; i < local_end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(MPI_IN_PLACE, local_end - local_start, MPI_INT, x.data(), local_end - local_start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int myRank, nProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &nProcs);\n\n int n = x.size();\n int chunkSize = (n + nProcs - 1) / nProcs;\n int begin = myRank * chunkSize;\n int end = std::min(begin + chunkSize, n);\n\n #pragma omp parallel for\n for (int i = begin; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[begin], end - begin, MPI_INT, &x[0], end - begin, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] = x[i] / 2;\n }\n }\n MPI_Gather(&x[start], chunk, MPI_INT, &x[0], chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its part of the vector\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int local_n = n / nprocs;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if 
(x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, n_local, MPI_INT, x.data(), n_local, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], chunk, MPI_INT, &x[0], chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "kokkos", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(Kokkos::View const& x, Kokkos::View &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKok\u96f9::ParallelFor(Kok\u96f9::Range(x.size()), [&](int i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t});\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const int& i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t});\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", mask.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + ",", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "omp", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t#pragma omp parallel 
for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "serial", + "prompt": "bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t mask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tmask.reserve(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + 
"\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.resize(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tmask.reserve(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t mask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "hip", + "prompt": "__device__ bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\n__global__ void mapPowersOfTwo(const int *x, bool *mask, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = get_global_id(0);\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + ", 0, 9, 7, 15, 64, 3]", + ", 0, 9, 7, 15, 64, 3]", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "mpi", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored in mask on rank 0.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint chunk_size = (n + size - 1) / size;\n\tint start = rank * chunk_size;\n\tint end = std::min(start + chunk_size, n);\n\n\tstd::vector local_mask(end - start);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), end - start, MPI_BOOL, mask.data(), end - start, MPI_BOOL, 0);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tmask.resize(n);\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), n / size, MPI_BOOL, mask.data(), n / size, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_C_BOOL, mask.data(), local_n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tmask.resize(x.size());\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), mask.size(), MPI_C_BOOL, nullptr, 0, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_C_BOOL, mask.data(), local_n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tstd::vector localMask(x.size());\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tif (rank == 0) {\n\t\tmask = localMask;\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tstd::vector recvMask(x.size());\n\t\t\tMPI_Recv(&recvMask[0], x.size(), MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t\tfor (size_t j = 0; j < x.size(); j++) {\n\t\t\t\tmask[j] = mask[j] || recvMask[j];\n\t\t\t}\n\t\t}\n\t} else {\n\t\tMPI_Send(&localMask[0], x.size(), MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the result\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), mask.size(), MPI_BOOL, mask.data(), mask.size(), MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the mask\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t}\n\telse {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the mask\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint 
rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_BOOL, mask.data(), local_n, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "cuda", + "prompt": "__device__ bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\n__global__ void mapPowersOfTwo(const int *x, bool *mask, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\twhile (i < N) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t\ti += blockDim.x * gridDim.x;\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored in mask on rank 0.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own mask\n\tstd::vector local_mask(x.size());\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the local masks into the final mask on rank 0\n\tif (rank == 0) {\n\t\tmask.resize(x.size());\n\t}\n\tMPI_Gather(local_mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < static_cast(x.size()); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) 
{\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own mask\n\tstd::vector localMask(x.size());\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Rank 0 gathers all the local masks\n\tif (rank == 0) {\n\t\tmask.resize(x.size());\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tstd::vector recvMask(x.size());\n\t\t\tMPI_Recv(recvMask.data(), x.size(), MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t\tfor (int j = 0; j < x.size(); j++) {\n\t\t\t\tmask[j] |= recvMask[j];\n\t\t\t}\n\t\t}\n\t} else {\n\t\tMPI_Send(localMask.data(), x.size(), MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint n = x.size();\n\tmask.resize(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), n, MPI_BOOL, mask.data(), n, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its local part of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < 
n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the result\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the result\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * mask.size() / size], mask.size() / size, MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * mask.size() / size], mask.size() / size, MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "serial", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &element : x) {\n element = 1 - 1 / element;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &i : x) {\n i = 1 - 1 / i;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &val : x) {\n val = 1 - 1 / val;\n }\n}\n", + "\n\n for (double &i : x) {\n i = 1 - 1 / i;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": 
"57_transform_inverse_offset", + "parallelism_model": "hip", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\n__global__ void oneMinusInverse(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n x[gid] = 1 - 1 / x[gid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, gathered_x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], x.size() / size, MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i 
< n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = 1 - 1 / x[i];\n }\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE, NULL, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 has the final result\n if (rank == 0) {\n x.resize(x.size() * size);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], 
x.size() / size,\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the inverse of its elements\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], x.size() / size,\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its part of the result\n for (size_t i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "cuda", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\n__global__ void oneMinusInverse(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) { x[tid] = 1.0 - 1.0 / x[tid]; }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int local_size = x.size() / num_procs;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n if (rank == num_procs - 1) {\n end = x.size();\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n MPI_Reduce(x.data(), x.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank == size - 1) ? 
n : start + chunk_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * chunk_size;\n int end = (i == size - 1) ? n : start + chunk_size;\n MPI_Recv(&x[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = global_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n const int n = x.size();\n const int chunk = (n + size - 1) / size;\n const int start = rank * chunk;\n const int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; ++i) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. 
Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "serial", + "prompt": "/* Compute the ReLU function on every element of x. 
Elements less than zero become zero,\n while elements greater than zero stay the same.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "hip", + "prompt": "/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\n__global__ void relu(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = fmax(0.0, x[idx]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? 
x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + ": [-1.8, 24.0, 1.2, 0.0, 5.1, 0.2, 4.5]", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n std::vector recv_buf(x.size());\n for (int i = 1; i < MPI_COMM_WORLD.size(); i++) {\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector temp(x.size());\n MPI_Recv(temp.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += temp[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + 
"\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks to rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / 
size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk_size 
= n / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "cuda", + "prompt": "/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\n__global__ void relu(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = fmax(x[tid], 0.0);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = fmax(0, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? 
x[tid] : 0;\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = fmax(0, x[idx]);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = fmax(0.0, x[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] < 0 ? 0 : x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = fmax(0, x[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int index = blockIdx.x * blockDim.x + threadIdx.x;\n if (index < N) {\n x[index] = fmax(0, x[index]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int tid = threadId.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = fmax(0.0, x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma opm parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Reduce(&x[0], &x[0], x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int chunk_size = n / nprocs;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == nprocs - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, local_size, MPI_DOUBLE, x.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n #pragma omp parallel for\n for (int i = start_index; i < end_index; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start_index], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n if (rank == size - 1) {\n end = x.size();\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < 
x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n MPI_Gather(&x[start], chunk_size, MPI_DOUBLE, &x[0], chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the ReLU function on its portion of x\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each 
rank computes the ReLU function on its own copy of x\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks onto rank 0\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 has the final result\n if (rank == 0) {\n x.resize(x.size() * size);\n MPI_Gatherv(nullptr, 0, MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf;\n MPI_Recv(&recv_buf, x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv_buf.begin(), recv_buf.end());\n }\n } else {\n MPI_Send(&x, x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the ReLU function on every element of x\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < MPI_COMM_WORLD; i++) {\n std::vector recv_buf(n);\n MPI_Recv(recv_buf.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, x.extent(0)), KOK_LAMBDA(int i) {\n x[i] = x[i] * x[i];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n #pragma simd\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "serial", + "prompt": "/* Replace every element of x with the square of its value.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + ":", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &xi : x) {\n xi = xi * xi;\n }\n}\n", + "\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "hip", + "prompt": "/* Replace every element of x with the square of its value.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\n__global__ void squareEach(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data(), n / size, MPI_INT, x.data(), n / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_Comm_world, &rank);\n\n // Get the number of processes\n int size;\n MPI_Comm_size(MPI_Comm_world, &size);\n\n // Each process squares its own elements\n for (int i = rank; i < x.size(); i += size) {\n x[i] = x[i] * x[i];\n }\n\n // Gather all the squared elements on rank 0\n MPI_Allgather(MPI_IN_PLACE, 0, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_Comm_world);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank squares its elements\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the squared elements\n std::vector squared;\n if (rank == 0) {\n squared.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, squared.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Rank 0 stores the final result\n if (rank == 0) {\n x = squared;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Status status;\n MPI_Request request;\n MPI_Irecv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (int i = rank; i < x.size(); i += size) {\n x[i] *= x[i];\n }\n MPI_Gather(x.data(), x.size() / size, MPI_INT, x.data(), x.size() / size,\n MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Square each element of x\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Gather all the squared elements on rank 0\n std::vector squared_x;\n if (rank == 0) {\n squared_x.resize(x.size() * 
MPI::COMM_WORLD.Get_size());\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, squared_x.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Copy the squared elements back to x on rank 0\n if (rank == 0) {\n x = squared_x;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector temp(x.size());\n MPI_Recv(&temp[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += temp[j];\n }\n }\n }\n}\n", + "(", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n MPI_Gather(x.data(), n / size, MPI_INT, x.data(), n / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector tmp(n);\n MPI_Recv(tmp.data(), n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n 
}\n\n MPI_Gather(&x[0], n / size, MPI_INT, &x[0], n / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "cuda", + "prompt": "/* Replace every element of x with the square of its value.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\n__global__ void squareEach(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = thread_idx_1;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= 
x[idx];\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace every element of x with the square of its value.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank squares its own elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(n);\n MPI_Recv(&recv[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, chunk, MPI_INT, x.data(), chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = 
x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n int recv_buf[n];\n MPI_Recv(&recv_buf[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv_buf[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank squares its own elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector temp(n);\n MPI_Recv(&temp[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += temp[j];\n }\n }\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Each rank squares its elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < MPI_COMM_WORLD; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; 
i++) {\n std::vector recv(n);\n MPI_Recv(&recv[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Square each element of x in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Gather the squared elements from all ranks to rank 0\n std::vector squared_x(x.size());\n MPI_Gather(x.data(), x.size(), MPI_INT, squared_x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = squared_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_size, MPI_INT, &x[0], local_size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_size = x.size() / size;\n int start = my_rank * local_size;\n int end = start + local_size;\n if (my_rank == size - 1) {\n end = x.size();\n }\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n MPI_Gather(&x[start], local_size, MPI_INT, &x[0], local_size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use Kokkos to parallelize. 
Assume Kokkos has already been initialized.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, Kokkos::View const& A, Kokkos::View const& x, double beta, Kokkos::View &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(i); j < A.extent(i+1); ++j) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(i); j > 0; --j) {\n y(i) += alpha * A(i, j - 1).value * x(A(i, j - 1).column);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i, double &y_i) {\n y_i = beta * y_i;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n y_i += alpha * A(j).value * x(A(j).column);\n }\n }\n }, y);\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n auto spvmv_lambda = KOKKOS_LAMBDA (const COOElement *a, double *y) {\n y[a->row] += a->value * alpha * x[a->column];\n };\n\n KOKKOS_PARALLEL_EXEC(KOKKOS_RANGE_POLICY(0, A.size), spvmv_lambda, A.data, y);\n\n KOKKOS_PARALLEL_FOR(KOKKOS_RANGE_POLICY(0, M), KOKKOS_LAMBDA(size_t i) {\n y[i] = y[i] * alpha + 
x[i] * alpha;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", M, KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i, double &y_i) {\n y_i = beta * y_i;\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n y_i += alpha * A(j).value * x(A(j).column);\n }\n }\n }, y);\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.row_map(i); j < A.row_map(i + 1); ++j) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, M), [&](size_t i) {\n y[i] = alpha * Kok\u30b3\u30af::reduce(Kok\u30b3\u30af::Range(0, A.size()), 0.0, [&](size_t j, double sum) {\n if (A[j].row == i) {\n return sum + A[j].value * x[A[j].column];\n }\n return sum;\n }) + uyi;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use OpenMP to parallelize.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * 
x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for schedule(static)\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n 
}\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n if (e.row < M && e.column < N) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] = y[i] * alpha + y[i] *beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n if (e.row < M && e.column < N) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : 
A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format with sizeA elements.\n A has dimensions MxN, x has N values, and y has M values.\n Use AMD HIP to parallelize. The kernel will be launched with at least sizeA threads.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\n__global__ void spmv(double alpha, const COOElement *A, size_t sizeA, const double *x, double beta, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n size_t tid = hipThreadIdx_x;\n if (tid < M) {\n atomicMul(&y[tid], beta);\n }\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < M) {\n double dot = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == tid) {\n dot += A[i].value * x[A[i].column];\n }\n }\n y[tid] = alpha * dot + \u3079\u305f * y[tid];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[blockIdx.x], beta);\n}\n", + "\n\n size_t i = hipThreadIdx_x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n atomicAdd(&y[row], alpha * val * x[col]);\n }\n atomicMul(&y[hipThreadIdx_x], beta);\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double 
sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = beta * y[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += alpha * A[i].value * x[A[i].column];\n }\n }\n y[row] = sum;\n }\n}\n", + "\nfor (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {\n double dot = 0.0;\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == i) {\n dot += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * dot + \u3079\u305f * y[i];\n }\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[i], beta);\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double dot = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n dot += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * dot + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t block_size = hipBlockDim_x;\n size_t block_id = hipBlockIdx_x;\n size_t start = block_id * block_size + tid;\n size_t end = min(start + block_size, sizeA);\n\n for (size_t i = start; i < end; i++) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n\n y[tid] *= beta;\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": 
"47_sparse_la_spmv", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use MPI to parallelize. Assume MPI has already been initialized.\n Every rank has a complete copy of A, x, and y. Store the result in y on rank 0.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the result\n for (size_t i = rank; i < M; i += size) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[i * M], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[rank * M], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per rank\n size_t elements_per_rank = A.size() / size;\n\n // Compute the starting index for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = beta * y[i];\n }\n\n // Compute the local portion of A*x\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local contribution to y\n for (size_t i = 0; i < A.size(); ++i) {\n if (A[i].row == rank) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(MPI_IN_PLACE, y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n for (size_t i = rank; i < M; i += size) {\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to the result on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; 
i++) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (M + size - 1) / size;\n\n // Compute the start and end rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, M);\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (auto const& element : A) {\n if (element.row >= start_row && element.row < end_row) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + ":", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the size of the local part of y\n size_t local_M = M / MPI::COMM_WORLD.Get_size();\n size_t start_row = local_M * MPI::COMM_WORLD.Get_rank();\n size_t end_row = start_row + local_M;\n\n // Compute the local part of y\n for (size_t i = start_row; i < end_row; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Gather the local parts of y to rank 0\n MPI::COMM_WORLD.Gather(&y[start_row], local_M, MPI::DOUBLE, &y[0], local_M, MPI::DOUBLE, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own partial sum\n std::vector partial_y(M, 0.0);\n for (auto const& element : A) {\n if (element.row < M / size * rank || element.row >= M / size * (rank + 1)) {\n continue;\n }\n partial_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce partial sums to rank 0\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 scales the result by beta\n if (rank == 0) {\n for (auto& value : y) {\n value *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (COOElement const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n}\n", + "\n\n // Compute the local portion of the result\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the results from all ranks to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Scale the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (auto& value : y) {\n value *= beta;\n }\n }\n}\n", + "\n\n // COO format:\n // A[i] = {row, column, value}\n // where A[i] is the i-th non-zero element in the 
matrix\n\n // y = alpha*A*x + y\n std::iota(y.begin(), y.end(), 0);\n std::transform(y.begin(), y.end(), x.begin(), y.begin(),\n [alpha, &A](double y, double x) {\n double sum = 0;\n for (auto const& a : A) {\n sum += a.value * (alpha * x);\n }\n return y + sum;\n });\n\n // y = y + (beta*y)\n std::transform(y.begin(), y.end(), y.begin(),\n [beta](double y) {\n return y + (beta * y);\n });\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local contribution to y\n std::vector local_y(M, 0.0);\n for (auto const& element : A) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Add beta*y to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] += ampi * y[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements each rank is responsible for\n size_t elements_per_rank = A.size() / size;\n\n // Compute the starting index of the elements for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index of the elements for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n for (size_t i = rank; i < M; i += size) {\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to the result on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Compute the number of elements per rank\n size_t num_elements_per_rank = A.size() / MPI::COMM_WORLD.Get_size();\n\n // Compute the starting index for this rank\n size_t start_index = num_elements_per_rank * MPI::COMM_WORLD.Get_rank();\n\n // Compute the ending index for this rank\n size_t end_index = start_index + num_elements_per_rank;\n\n // Initialize the local portion of y to beta*y\n std::vector local_y(M, beta * y[MPI::COMM_WORLD.Get_rank()]);\n\n // Perform the sparse matrix-vector multiplication for this rank\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI::COMM_WORLD.Reduce(local_y.data(), y.data(), M, MPI::DOUBLE, MPI::SUM, 0);\n}\n", + "\n\n // Broadcast alpha, beta, M, and N to all ranks\n MPI_Bcast(&alpha, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&beta, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&M, 1, MPI_INT, 0, MPI_COMM_WORLD);\n MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the number of elements per rank\n int num_elements_per_rank = A.size() / 
MPI_COMM_SIZE;\n\n // Scatter A to all ranks\n std::vector local_A(num_elements_per_rank);\n MPI_Scatter(A.data(), num_elements_per_rank, MPI_BYTE, local_A.data(), num_elements_per_rank, MPI_BYTE, 0, MPI_COMM_WORLD);\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (const auto& element : local_A) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather the local contributions to y\n std::vector global_y(M);\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, global_y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Apply beta to y\n if (beta != 0.0) {\n for (auto& element : global_y) {\n element += beta * element;\n }\n }\n\n // Replace y with the global result\n y = std::move(global_y);\n}\n", + "\n\n // Initialize y on all ranks to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute the local contribution of each rank to the result\n for (auto const& element : A) {\n if (element.row < M) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n }\n\n // Reduce the local contributions to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format with sizeA elements.\n A has dimensions MxN, x has N values, and y has M values.\n Use CUDA to parallelize. The kernel will be launched with at least sizeA threads.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\n__global__ void spmv(double alpha, const COOElement *A, size_t sizeA, const double *x, double beta, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n y[blockIdx.x] *= beta;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n y[tid] *= beta;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (gid < sizeA) {\n size_t row = A[gid].row;\n size_t column = A[gid].column;\n double value = A[gid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n\n if (gid < M) {\n atomicAdd(&y[gid], beta * y[gid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n 
size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n const COOElement &e = A[tid];\n y[e.row] += alpha * e.value * x[e.column];\n }\n y[threadIdx.x] *= beta;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n atomicMul(&y[i], beta);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[threadIdx.x], beta);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[blockIdx.x], beta);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (gid < sizeA) {\n const COOElement &a = A[gid];\n y[a.row] += alpha * a.value * x[a.column];\n }\n\n if (gid < M) {\n y[gid] *= beta;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n y[row] += alpha * value * x[column];\n }\n y[threadIdx.x] *= beta;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n y[row] += alpha * value * x[column];\n }\n y[blockIdx.x] 
*= beta;\n}\n", + "{" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use MPI and OpenMP to parallelize. Assume MPI has already been initialized.\n Every rank has a complete copy of A, x, and y. Store the result in y on rank 0.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its own partial sum\n std::vector partial_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n partial_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the partial sums to rank 0\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 scales the result by beta\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the result\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (auto const& element : A) {\n if (element.row == i) {\n sum += element.value * x[element.column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&y[0], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[0], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the sparse matrix that each rank is responsible for\n size_t num_elements_per_rank = A.size() / size;\n\n // Compute the starting index of the elements in the sparse matrix that each rank is responsible for\n size_t start_index = rank * num_elements_per_rank;\n\n // Compute the ending index of the elements in the sparse matrix that each rank is responsible for\n size_t end_index = (rank + 1) * num_elements_per_rank;\n\n // Compute the local portion of y that each rank is responsible for\n std::vector local_y(M);\n\n // Compute the local portion of y that each rank is responsible for\n #pragma omp parallel for\n for (size_t i = start_index; i < end_index; i++) {\n const COOElement& element = A[i];\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the local portions of y into the global y vector on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, 
MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n}\n", + "\n\n // Initialize y on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::fill(y.begin(), y.end(), 0.0);\n }\n\n // Parallelize over rows of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n COOElement const& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n\n // Parallelize over elements of y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n\n // Gather y from all ranks to rank 0\n MPI::COMM_WORLD.Gather(y.data(), M, MPI::DOUBLE, y.data(), M, MPI::DOUBLE, 0);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of elements each rank is responsible for\n size_t elements_per_rank = A.size() / nprocs;\n\n // Compute the starting index for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local portion of y\n std::vector local_y(M, 0.0);\n\n // Perform the sparse matrix-vector multiplication in parallel\n #pragma omp parallel for\n for (size_t i = start_index; i < end_index; ++i) {\n COOElement element = A[i];\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Compute the local portion of the result\n std::vector local_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Calculate the number of rows per rank\n size_t num_rows_per_rank = (M + size - 1) / size;\n\n // Calculate the starting and ending rows for the current rank\n size_t start_row = rank * num_rows_per_rank;\n size_t end_row = std::min((rank + 1) * num_rows_per_rank, M);\n\n // Initialize the local portion of y to 
beta*y\n std::vector local_y(num_rows_per_rank, beta * y[rank]);\n\n // OpenMP parallelization\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n local_y[i - start_row] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n\n // Gather the local portions of y into the global y on rank 0\n MPI_Gather(local_y.data(), num_rows_per_rank, MPI_DOUBLE, y.data(), num_rows_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of rows per rank\n size_t num_rows_per_rank = (M + MPI_Comm_size(MPI_COMM_WORLD) - 1) / MPI_Comm_size(MPI_COMM_WORLD);\n\n // Compute the starting row for this rank\n size_t start_row = num_rows_per_rank * MPI_Comm_rank(MPI_COMM_WORLD);\n\n // Compute the ending row for this rank\n size_t end_row = std::min(num_rows_per_rank * (MPI_Comm_rank(MPI_COMM_WORLD) + 1), M);\n\n // Initialize the local portion of y to beta*y\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n y[i] = beta * y[i];\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n\n if (row >= start_row && row < end_row) {\n y[row] += alpha * value * x[column];\n }\n }\n\n // Reduce the results from all ranks to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(MPI_IN_PLACE, y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / nprocs; i < (rank + 1) * M / nprocs; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local contribution to y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double local_y = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n local_y += alpha * element.value * x[element.column];\n }\n }\n y[i] = local_y;\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 
1; i < size; ++i) {\n MPI_Recv(&y[0], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[0], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& element = A[i];\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather y from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_M = M / size;\n size_t start_row = rank * local_M;\n size_t end_row = start_row + local_M;\n\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[i * local_M], local_M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[start_row], local_M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(Kokkos::View &A, Kokkos::View &X, Kokkos::View &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; k++) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::Range(0, M), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.extent(0); ++l) {\n if (A(l).row == i && A(l).column == k) {\n for (size_t m = 0; m < X.extent(0); ++m) {\n if (X(m).row == k && X(m).column == j) {\n sum += A(l).value * X(m).value;\n }\n }\n }\n }\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n // Create a parallel reduction to sum up the sparse matrix multiplication.\n auto policy = KOKKOS_RANGE_POLICY(0, M);\n auto sum_lambda = KOKKOS_LAMBDA(const size_t i, double** sum) {\n for (size_t j = 0; j < N; ++j) {\n sum[i][j] = 0.0;\n }\n };\n auto reduce_lambda = KOKKOS_LAMBDA(const size_t i, const size_t j, const COOElement a, const COOElement b, double** sum) {\n sum[a.row][j] += a.value * b.value;\n };\n auto view = KokKOS::parallel_reduce(\n \"Sparse matrix multiplication\", policy, sum_lambda, reduce_lambda, A, X);\n view.wait();\n\n // Copy the result to the output matrix.\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = view(i, j);\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int &i) {\n for (size_t j = 0; j < N; j++) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; k++) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + ".", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, M), [&](size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i][j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i][j] += A[i][k].value * X[k][j].value;\n }\n }\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n 
#pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; k++) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + ",", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n sum += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n sum += A[l].value * X[m].value;\n break;\n }\n }\n break;\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (auto const& a : A) {\n if (a.row == i && a.column == k) {\n for (auto const& x : X) {\n if (x.row == k && x.column == j) {\n sum += a.value * x.value;\n }\n }\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; k++) {\n for (size_t l = 0; l < A.size(); l++) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); m++) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Y.resize(M * N, 0);\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n for (auto const& e : X) {\n if (A_map.count(e.row) > 0) {\n for (auto const& [column, value] : A_map[e.row]) {\n Y[e.column * M + column] += e.value * value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> row_map;\n for (auto const& element : A) {\n row_map[element.row].push_back(element.value);\n }\n for (auto const& element : X) {\n if (row_map.count(element.column)) {\n for (size_t i = 0; i < row_map[element.column].size(); ++i) {\n Y[element.row * N + i] += row_map[element.column][i] * element.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto& e : A) {\n for (auto& f : X) {\n if (e.column == f.row) {\n Y[e.row * N + f.column] += e.value * f.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M*N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n Y.resize(M * N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot = 0;\n for (size_t k = 0; k < K; ++k) {\n dot += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = dot;\n }\n }\n}\n", + "\n\n Y.resize(M*N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.assign(M * N, 0);\n for (COOElement e : A) {\n for (COOElement f : X) {\n if (e.column == f.row) {\n Y[e.row * N + f.column] += e.value * f.value;\n 
}\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n std::unordered_map> A_row_major;\n for (auto const& e : A) {\n A_row_major[e.row][e.column] = e.value;\n }\n\n std::unordered_map> X_col_major;\n for (auto const& e : X) {\n X_col_major[e.column][e.row] = e.value;\n }\n\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot_product = 0;\n for (size_t k = 0; k < K; ++k) {\n dot_product += A_row_major[i][k] * X_col_major[j][k];\n }\n Y[i * N + j] = dot_product;\n }\n }\n}\n", + "\n\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format with sizeA elements.\n X is a sparse KxN matrix in COO format with sizeX elements. Y is a dense MxN matrix in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\n__global__ void spmm(const COOElement *A, size_t sizeA, const COOElement *X, size_t sizeX, double *Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t k = 0; k < K; k++) {\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == k) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n int i = tid + bid * bdim;\n if (i < sizeA) {\n size_t rowA = A[i].row;\n size_t colA = A[i].column;\n double valA = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == colA) {\n Y[rowA * N + X[j].row] += valA * X[j].value;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n int i = tid + bid * bdim;\n if (i < sizeA) {\n size_t rowA = A[i].row;\n size_t colA = A[i].column;\n double valA = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == colA) {\n Y[rowA * N + X[j].row] += valA * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipGridDim_x;\n int tid_per_block = hipBlockDim_x;\n\n for (int i = tid; i < sizeA; i += tid_per_block * bid_size) {\n int row = A[i].row;\n int col = A[i].column;\n double val = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column < K) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x * bid_size;\n\n for (size_t i = tid; i < sizeA; i += num_threads) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = 
blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n Y[row * N + col] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t a = 0; a < sizeA; ++a) {\n if (A[a].row == row && A[a].column == k) {\n for (size_t x = 0; x < sizeX; ++x) {\n if (X[x].row == k && X[x].column == col) {\n Y[row * N + col] += A[a].value * X[x].value;\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].column) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n for (int i = tid + bid * bdim; i < sizeA; i += gdim * bdim) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n int rowA = A[i].row;\n int colA = A[i].column;\n double valA = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n int rowX = X[j].row;\n int colX = X[j].column;\n double valX = X[j].value;\n if (colA == rowX) {\n atomicAdd(&Y[rowA * N + colX], valA * valX);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x;\n\n for (size_t i = tid; i < sizeA; i += num_threads * bid_size) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == column) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; 
j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and X. Store the result in Y on rank 0.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recv;\n MPI::COMM_WORLD.Recv(recv, M * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < M * N; ++j) {\n Y[j] += recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(Y.data(), M * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0.\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its local contribution to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the local contributions to Y on rank 0.\n if (rank == 0) {\n Y.resize(M * N);\n MPI_Gather(localY.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(localY.data(), M * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions 
to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ",", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the local portion of Y\n std::vector localY(M*N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n \n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M*N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double dot_product = 0;\n for (size_t k = 0; k < K; k++) {\n dot_product += A[i * K + k].value * X[k * N + j].value;\n }\n Y[i * N + j] = dot_product;\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buffer(M * N);\n MPI_Recv(recv_buffer.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n Y[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(Y.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in each row of A and X\n std::vector A_row_counts(M, 0);\n std::vector X_row_counts(K, 0);\n for 
(auto const& element : A) {\n A_row_counts[element.row]++;\n }\n for (auto const& element : X) {\n X_row_counts[element.row]++;\n }\n\n // Compute the offsets for each row of A and X\n std::vector A_offsets(M, 0);\n std::vector X_offsets(K, 0);\n for (size_t i = 1; i < M; i++) {\n A_offsets[i] = A_offsets[i - 1] + A_row_counts[i - 1];\n }\n for (size_t i = 1; i < K; i++) {\n X_offsets[i] = X_offsets[i - 1] + X_row_counts[i - 1];\n }\n\n // Create a buffer to store the local results\n std::vector local_Y(M * N, 0.0);\n\n // Compute the local results\n for (auto const& element : A) {\n for (size_t j = X_offsets[element.column]; j < X_offsets[element.column] + X_row_counts[element.column]; j++) {\n local_Y[element.row * N + X[j].column] += element.value * X[j].value;\n }\n }\n\n // Reduce the local results to the global result\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of elements in the resulting matrix\n size_t num_elements = M * N;\n\n // Initialize the result matrix to zero\n Y.resize(num_elements, 0.0);\n\n // Scatter the result matrix to all ranks\n MPI_Scatter(Y.data(), num_elements, MPI_DOUBLE, Y.data(), num_elements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local part of the result matrix\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local results to the result matrix on rank 0\n MPI_Reduce(Y.data(), num_elements, MPI_DOUBLE, Y.data(), num_elements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double dot_product = 0;\n for (size_t k = 0; k < K; k++) {\n dot_product += A[i * K + k].value * X[k * N + j].value;\n }\n Y[i * N + j] = dot_product;\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buffer(M * N);\n MPI_Recv(recv_buffer.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n Y[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(Y.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.assign(M * N, 0);\n\n // Compute the partial sum on each rank\n std::vector partial_sum(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n partial_sum[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sum(M * N, 0);\n MPI_Allreduce(partial_sum.data(), global_sum.data(), M * N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n Y = global_sum;\n }\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n 
// Compute the matrix multiplication in parallel\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement& a = A[i];\n for (size_t j = 0; j < X.size(); j++) {\n const COOElement& x = X[j];\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0.\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format with sizeA elements.\n X is a sparse KxN matrix in COO format with sizeX elements. Y is a dense MxN matrix in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\n__global__ void spmm(const COOElement *A, size_t sizeA, const COOElement *X, size_t sizeX, double *Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n atomicAdd(&Y[row * N + X[j].column], A[i].value * X[j].value);\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= M) return;\n for (int i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (int j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < K; k++) {\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == row && A[j].column == k) {\n for (size_t l = 0; l < sizeX; l++) {\n if (X[l].row == k && X[l].column == i) {\n Y[row * N + i] += A[j].value * X[l].value;\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int tx = threadIdx.x;\n int bx = blockIdx.x;\n int tid = tx + bx * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t i = 0; i < sizeX; i++) {\n if (X[i].column == column) {\n Y[row * N + X[i].row] += value * X[i].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == A[k].column) {\n Y[row * N + X[j].row] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) 
{\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t k = 0; k < K; ++k) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (X[j].column == k) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t k = 0; k < K; ++k) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row && A[i].column == k) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (X[j].row == k && X[j].column == j) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + k] = sum;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n size_t rowA = A[blockIdx.x].row;\n size_t columnA = A[blockIdx.x].column;\n double valueA = A[blockIdx.x].value;\n\n for (size_t i = 0; i < N; i++) {\n if (columnA == X[threadIdx.x].row) {\n Y[rowA * N + i] += valueA * X[threadIdx.x].value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n atomicAdd(&Y[row * N + X[j].column], A[i].value * X[j].value);\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and X. Store the result in Y on rank 0.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n for (size_t j = 0; j < X.size(); ++j) {\n if (X[j].row == column) {\n localY[row * N + X[j].column] += value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n if (rank == 0) {\n Y.resize(M * N);\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(localY.data(), nullptr, M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n}\n", + ",", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localY[i * N + j] += A[i * K + k].value * X[k * N + j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "_t,", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 
0.0);\n\n // Compute the matrix multiplication in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local row range for this rank\n size_t local_m = M / nprocs;\n size_t start_row = rank * local_m;\n size_t end_row = (rank == nprocs - 1) ? M : start_row + local_m;\n\n // Initialize the local part of Y to zero\n std::vector local_Y(local_m * N, 0.0);\n\n // Perform the matrix multiplication in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_Y[i * N + j] += A[i * K + k].value * X[k * N + j].value;\n }\n }\n }\n\n // Gather the local results into Y on rank 0\n MPI_Gather(local_Y.data(), local_m * N, MPI_DOUBLE, Y.data(), local_m * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector local_Y(M * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n local_Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of rows of A and X owned by each rank\n size_t nrows_A = A.size() / nprocs;\n size_t nrows_X = X.size() / nprocs;\n\n // Compute the starting row index of A and X owned by each rank\n size_t start_row_A = rank * nrows_A;\n size_t start_row_X = rank * nrows_X;\n\n // Compute the local portion of Y owned by each rank\n std::vector local_Y(M * N, 0.0);\n\n // Compute the local portion of Y owned by each rank\n #pragma omp parallel for\n for (size_t i = 0; i < nrows_A; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n if (A[start_row_A + 
i].column == X[start_row_X + k].row) {\n local_Y[A[start_row_A + i].row * N + j] += A[start_row_A + i].value * X[start_row_X + k].value;\n }\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix-matrix product in parallel\n #pragma mpi_for_schedule(static)\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& a = A[i];\n for (size_t j = 0; j < X.size(); ++j) {\n const COOElement& x = X[j];\n if (a.column == x.row) {\n #pragma openmp atomic\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(MPI_IN_PLACE, Y.size(), MPI_DOUBLE, Y.data(), Y.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the partial sum for each row of Y on each rank\n std::vector partial_Y(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n partial_Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_Y(M * N, 0.0);\n MPI_Reduce(partial_Y.data(), global_Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Store the result in Y on rank 0\n if (rank == 0) {\n Y = global_Y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t k = 0; k < K; ++k) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * 
X[m].value;\n }\n }\n }\n }\n }\n }\n }\n\n // Reduce the result to rank 0\n MPI_Reduce(MPI_IN_PLACE, Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n for (size_t j = 0; j < X.size(); j++) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(Kokkos::View &A, Kokkos::View &b, Kokkos::View &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View r(\"r\", N);\n Kokkos::View z(\"z\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n r[i] = b[i];\n z[i] = 0.0;\n });\n\n for (size_t iter = 0; iter < N; ++iter) {\n for (size_t i = 0; i < A.extent(0); ++i) {\n const auto &e = A[i];\n r[e.row] -= e.value * z[e.column];\n }\n for (size_t i = 0; i < A.extent(0); ++i) {\n const auto &e = A[i];\n z[e.column] += e.value * r[e.row];\n }\n }\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) { x[i] = r[i]; });\n}\n", + "\n\n Kokkos::parallel_for(\"solve_linear_system\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::View r(\"r\", N);\n Kokkos::View z(\"z\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n r[i] = b[i];\n z[i] = 0.0;\n });\n for (size_t k = 0; k < A.extent(0); ++k) {\n const COOElement &e = A[k];\n r[e.column] -= e.value * z[e.row];\n }\n for (size_t k = 0; k < A.extent(0); ++k) {\n const COOElement &e = A[k];\n z[e.row] += e.value * r[e.column];\n }\n Kokkos::parallel_for(\"copy\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = z[i];\n });\n}\n", + "\n\n Kok\uf09e::View A_h(A.data(), A.extent(0));\n Kok\uf09e::View b_h(b.data(), b.extent(0));\n Kok\uf09e::View x_h(x.data(), x.extent(0));\n\n Kok\uf09e::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n x_h[i] = b_h[i];\n for (size_t j = A_h(i, 0); j < A_h(i, 1); ++j) {\n x_h[i] -= A_h(i, 2 + j) * x_h[A_h(i, 2 + j)];\n }\n x_h[i] /= A_h(i, 1);\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_coo\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, 
column_indices);\n\n // Solve the linear system\n Kokkos::View x_host(\"x_host\", N);\n Kokkos::parallel_for(\"solve\", N, KOKKOS_LAMBDA(size_t i) {\n x_host(i) = 0.0;\n });\n Kokkos::deep_copy(x, x_host);\n Kokkos::sparse_gmres(A_sparse, x, b, 1e-6, 100);\n\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View A_sparse(\"A_sparse\", N, N);\n Kokkos::parallel_for(\"init_A\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n A_sparse(i, j) = 0.0;\n }\n });\n Kokkos::parallel_for(\"fill_A\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n A_sparse(A(i).row, A(i).column) = A(i).value;\n });\n\n // Solve the linear system\n Kokkos::View x_temp(\"x_temp\", N);\n Kokkos::sparse::cg(A_sparse, b, x_temp, 1e-6, 100);\n\n // Copy the solution to x\n Kokkos::parallel_for(\"copy_x\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x(i) = x_temp(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * b(A(j).column);\n }\n }\n x(i) = sum;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_COO\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::View A_sparse(\"A_sparse\", N, N);\n Kokkos::parallel_for(\"initialize_A\", N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; j++) {\n A_sparse(i, j) = 0.0;\n }\n });\n Kokkos::parallel_for(\"populate_A\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n A_sparse(row_indices(i), column_indices(i)) = values(i);\n });\n\n // Solve the linear system\n Kokkos::View x_temp(\"x_temp\", N);\n Kokkos::sparse::CrsMatrix A_crs(N, N, A_sparse);\n Kokkos::sparse::solve(\"CrsMatrix\", A_crs, b, x_temp);\n\n // Copy the result to x\n Kokkos::parallel_for(\"copy_result\", N, KOKKOS_LAMBDA(size_t i) {\n x(i) = x_temp(i);\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values = Kokkos::View(A.data(), A.extent(0));\n Kokkos::View row_indices = Kokkos::View(A.data() + 1, A.extent(0));\n Kokkos::View column_indices = Kokkos::View(A.data() + 2, A.extent(0));\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::SparseSolveCG(A_sparse, x, b, 1e-6, 1000);\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x(i) = b(i);\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x(A(j).column) -= A(j).value * x(i);\n }\n }\n x(i) /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View col_indices(\"col_indices\", A.extent(0));\n Kokkos::parallel_for(A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values[i] = A(i).value;\n 
row_indices[i] = A(i).row;\n col_indices[i] = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, col_indices);\n\n // Solve the linear system\n Kokkos::View x_copy(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::SparseSolveCG>(A_sparse, x_copy, b, 1e-6, 1000);\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(j).row == i) {\n x[A(j).column] -= A(j).value * x[i];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n using team_member = KokKOS::TeamMember;\n using team_view = KokKOS::TeamView;\n using team_scheduler = KokKOS::TeamScheduler;\n\n team_scheduler team(N);\n\n team.parallel_for(\n \"solve_linear_system\",\n KOKKOS_LAMBDA(const team_member member) {\n const size_t row = member.league_rank();\n double sum = 0;\n for (size_t i = 0; i < A.size(); ++i) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_coo\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::View x_copy(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::sparse_solve(A_sparse, x_copy, b);\n\n // Copy the solution back to x\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. 
x and b are dense vectors with N elements.\n Use OpenMP to compute in parallel.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + ".", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n COOElement const& e = A[i];\n x[e.column] -= e.value * x[e.row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t k = 0; k < A.size(); ++k) {\n const COOElement& e = A[k];\n x[e.column] -= e.value * x[e.row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + 
"\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] -= element.value * x[element.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] -= element.value * x[element.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n // Initialize x to 0\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = 0;\n }\n\n // Iterate over the elements of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n // Add A[i].value * b[A[i].column] to x[A[i].row]\n x[A[i].row] += A[i].value * b[A[i].column];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. 
x and b are dense vectors with N elements.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector y(N, 0);\n for (auto const& e : A) {\n y[e.column] += e.value * b[e.row] / row_sums[e.row];\n }\n x = y;\n}\n", + "\n\n std::vector r(N, 0);\n std::vector row_start(N + 1, 0);\n for (auto const& e : A) {\n row_start[e.row + 1]++;\n }\n for (size_t i = 1; i <= N; i++) {\n row_start[i] += row_start[i - 1];\n }\n for (auto const& e : A) {\n r[row_start[e.row]++] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = row_start[i]; j < row_start[i + 1]; j++) {\n sum += r[j] * x[j];\n }\n x[i] = (b[i] - sum) / r[row_start[i]];\n }\n}\n", + "\n\n std::vector r(N, 0);\n for (auto const& e : A) {\n r[e.row] += e.value * x[e.column];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = (b[i] - r[i]) / A[i].value;\n }\n}\n", + "\n\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / rowSum[i];\n }\n}\n", + "\n\n std::vector r(N, 0);\n for (auto const& e : A) {\n r[e.row] += e.value * b[e.column];\n }\n x.resize(N);\n for (size_t i = 0; i < N; ++i) {\n x[i] = r[i] / A[i].value;\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (const COOElement& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector y(N, 0);\n for (const COOElement& e : A) {\n y[e.row] += e.value * b[e.column] / row_sum[e.row];\n }\n x = y;\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector diag(N, 0);\n std::vector offdiag(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] += e.value;\n } else {\n offdiag[e.row] += e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / (diag[i] - offdiag[i]);\n }\n for (auto const& e : A) {\n if (e.row != e.column) {\n x[e.column] -= e.value * x[e.row] / (diag[e.column] - offdiag[e.column]);\n }\n }\n}\n", + "\n\n std::vector> A_dense(N, std::vector(N, 0));\n for (auto const& e : A) {\n A_dense[e.row][e.column] = e.value;\n }\n std::vector L(N, 0), U(N, 0);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n L[i] += A_dense[i][j] * U[j];\n }\n U[i] = b[i] - L[i];\n }\n x[N - 1] = U[N - 1] / A_dense[N - 1][N - 1];\n for (size_t i = N - 2; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_dense[i][j] * x[j];\n }\n x[i] = (U[i] - sum) / A_dense[i][i];\n }\n}\n", + "\n\n std::vector diag(N, 0);\n std::vector lower(N, 0);\n std::vector upper(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n } else if (e.row < e.column) {\n lower[e.row] += e.value;\n } else {\n upper[e.row] += e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n double d = diag[i];\n if (d == 0) {\n throw std::runtime_error(\"Matrix is singular\");\n }\n x[i] = (b[i] - lower[i] - upper[i]) / d;\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sum[i];\n }\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector diag(N, 0);\n for 
(auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / diag[i];\n for (auto const& e : A) {\n if (e.row == i && e.column != i) {\n x[e.column] -= e.value * x[i] / row_sums[e.column];\n }\n }\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& element : A) {\n row_sum[element.row] += element.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sum[i];\n }\n}\n", + "\n\n std::vector diag(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n }\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n for (size_t j = e.column + 1; j < N; j++) {\n x[j] -= e.value * x[e.column];\n }\n }\n }\n x[i] /= diag[i];\n }\n for (size_t i = N - 1; i > 0; i--) {\n for (auto const& e : A) {\n if (e.column == i) {\n for (size_t j = 0; j < e.row; j++) {\n x[j] -= e.value * x[e.row];\n }\n }\n }\n }\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sums[i];\n }\n}\n", + "\n\n std::vector rowSum(N, 0.0);\n for (COOElement const& e : A) {\n rowSum[e.row] += e.value;\n }\n std::vector y(N, 0.0);\n for (COOElement const& e : A) {\n y[e.row] += e.value * b[e.column] / rowSum[e.row];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] - y[i];\n }\n}\n", + "\n\n std::vector> A_dense(N, std::vector(N, 0));\n for (auto const& e : A) {\n A_dense[e.row][e.column] = e.value;\n }\n std::vector x_dense(N, 0);\n solveLinearSystemDense(A_dense, b, x_dense);\n x = x_dense;\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n\n std::vector y(N, 0);\n for (auto const& e : A) {\n y[e.column] += e.value * b[e.row] / row_sums[e.row];\n }\n\n x = y;\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector diag_inv(N, 0);\n for (size_t i = 0; i < N; ++i) {\n diag_inv[i] = 1.0 / row_sums[i];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] * diag_inv[i];\n for (auto const& e : A) {\n if (e.row == i) continue;\n x[e.row] -= e.value * x[i] * diag_inv[e.row];\n }\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector x_tmp(N, 0);\n for (auto const& e : A) {\n x_tmp[e.column] += e.value * b[e.row] / row_sum[e.row];\n }\n x = x_tmp;\n}\n", + "\n\n std::vector diag(N, 0);\n std::vector offdiag(N, 0);\n std::vector rowptr(N + 1, 0);\n for (auto const& e : A) {\n rowptr[e.row + 1]++;\n if (e.row == e.column)\n diag[e.row] += e.value;\n else\n offdiag[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n rowptr[i + 1] += rowptr[i];\n }\n std::vector L(rowptr[N]);\n std::vector U(rowptr[N]);\n for (auto const& e : A) {\n if (e.row == e.column)\n continue;\n if (e.row < e.column)\n L[rowptr[e.row]++] = {e.row, e.column, e.value};\n else\n U[rowptr[e.column]++] = {e.row, e.column, e.value};\n }\n std::vector y(N, 0);\n for (size_t i = 0; i < N; i++) {\n y[i] = b[i];\n for (size_t j = rowptr[i]; j < rowptr[i + 1]; j++) {\n y[i] -= L[j].value * x[L[j].column];\n }\n y[i] /= diag[i];\n }\n for (size_t i = N - 1; i < N; i--) {\n x[i] = y[i];\n for (size_t j = rowptr[i + 1]; j < rowptr[i + 2]; j++) {\n x[i] -= U[j].value * x[U[j].column];\n }\n x[i] /= diag[i];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", 
+ "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format with sizeA elements. x and b are dense vectors with N elements.\n Use AMD HIP to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\n__global__ void solveLinearSystem(const COOElement *A, size_t sizeA, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t col = A[tid].column;\n double val = A[tid].value;\n atomicAdd(&x[col], val * b[row]);\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n x[i] = b[i];\n }\n __syncthreads();\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].column == i) {\n x[A[j].row] -= A[j].value * x[i];\n }\n }\n __syncthreads();\n if (i < N) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = b[i];\n }\n __syncthreads();\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == i) {\n x[A[k].column] -= A[k].value * x[i];\n }\n }\n __syncthreads();\n if (i < N) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int tid = 
threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&x[column], value * b[row]);\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + ".", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = 0;\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == i) {\n x[i] += A[j].value * b[A[j].column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n \n // Create a CSR matrix from the COO format\n std::vector row_offsets(N + 1, 0);\n std::vector column_indices;\n std::vector values;\n for (auto const& element : A) {\n row_offsets[element.row + 1]++;\n column_indices.push_back(element.column);\n values.push_back(element.value);\n }\n for (size_t i = 1; i < N + 1; i++) {\n row_offsets[i] += row_offsets[i - 1];\n }\n \n // Create a dense vector for the solution\n x.resize(N);\n \n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(values[row_offsets[j] + i]) > std::abs(values[row_offsets[pivot_row] + i])) {\n pivot_row = j;\n }\n }\n \n // Swap the pivot row with the current row\n if (pivot_row != i) {\n std::swap(row_offsets[pivot_row], row_offsets[i]);\n std::swap(column_indices[row_offsets[pivot_row] - 1], column_indices[row_offsets[i] - 1]);\n std::swap(values[row_offsets[pivot_row] - 1], values[row_offsets[i] - 1]);\n }\n \n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = values[row_offsets[j] + i] / values[row_offsets[i] + i];\n for (size_t k = i; k < N; k++) {\n values[row_offsets[j] + k] -= factor * values[row_offsets[i] + k];\n }\n b[j] -= factor * b[i];\n }\n }\n \n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= values[row_offsets[i] + j] * x[j];\n }\n x[i] /= values[row_offsets[i] + i];\n }\n \n // Gather the solution from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, comm);\n }\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a vector object from the dense vector\n Eigen::VectorXd vector_b(N);\n for (size_t i = 0; i < N; ++i) {\n vector_b(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd vector_x = matrix.ldlt().solve(vector_b);\n\n // Copy the result to the output vector\n for (size_t i = 0; i < N; ++i) {\n x[i] = vector_x(i);\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a dense vector from b\n Eigen::VectorXd b_vector(N);\n for (size_t i = 0; i < N; ++i) {\n b_vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_vector = matrix.ldlt().solve(b_vector);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_vector(i);\n }\n }\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n\n // Create a vector object 
from the dense vector\n Eigen::VectorXd vector_b(N);\n for (size_t i = 0; i < N; ++i) {\n vector_b(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd vector_x = matrix.ldlt().solve(vector_b);\n\n // Copy the result to the output vector\n for (size_t i = 0; i < N; ++i) {\n x[i] = vector_x(i);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a sparse matrix object\n MPI_Sparse_matrix A_mpi;\n MPI_Sparse_matrix_create_coo(MPI_COMM_WORLD, N, N, A.size(), &A[0], &A_mpi);\n\n // Create a dense vector object for b\n MPI_Vector b_mpi;\n MPI_Vector_create(MPI_COMM_WORLD, N, &b_mpi);\n MPI_Vector_set_values(b_mpi, N, &b[0]);\n\n // Create a dense vector object for x\n MPI_Vector x_mpi;\n MPI_Vector_create(MPI_COMM_WORLD, N, &x_mpi);\n\n // Solve the linear system\n MPI_Sparse_solve(A_mpi, b_mpi, x_mpi);\n\n // Get the result from rank 0\n if (rank == 0) {\n MPI_Vector_get_values(x_mpi, N, &x[0]);\n }\n\n // Free the MPI objects\n MPI_Sparse_matrix_free(&A_mpi);\n MPI_Vector_free(&b_mpi);\n MPI_Vector_free(&x_mpi);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Scatter b to all ranks\n std::vector local_b(b.begin() + rank * b.size() / size, b.begin() + (rank + 1) * b.size() / size);\n\n // Solve the local system Ax=b for x\n std::vector local_x(N, 0.0);\n for (auto const& e : A) {\n local_x[e.row] += e.value * local_b[e.column];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_x(N, 0.0);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n \n int *rows = new int[A.size()];\n int *cols = new int[A.size()];\n double *vals = new double[A.size()];\n for (size_t i = 0; i < A.size(); i++) {\n rows[i] = A[i].row;\n cols[i] = A[i].column;\n vals[i] = A[i].value;\n }\n \n // Create a sparse matrix object\n int nrows = N;\n int ncols = N;\n int nnz = A.size();\n int *ia = new int[nrows + 1];\n int *ja = new int[nnz];\n double *a = new double[nnz];\n \n // Convert the COO format to CSR format\n ia[0] = 0;\n for (int i = 0; i < nnz; i++) {\n ja[i] = cols[i];\n a[i] = vals[i];\n }\n for (int i = 1; i < nrows + 1; i++) {\n ia[i] = ia[i - 1];\n for (int j = ia[i - 1]; j < nnz; j++) {\n if (ja[j] >= i) {\n ia[i] = j;\n break;\n }\n }\n }\n \n // Create a parallel sparse matrix object\n Mat A_csr(nrows, ncols, nnz, ia, ja, a);\n \n // Create a parallel dense vector for b\n Vec b_vec(nrows);\n for (int i = 0; i < nrows; i++) {\n b_vec[i] = b[i];\n }\n \n // Create a parallel dense vector for x\n Vec x_vec(nrows);\n \n // Solve the linear system\n A_csr.solve(b_vec, x_vec);\n \n // Gather the solution from all ranks to rank 0\n if (rank == 0) {\n for (int i = 0; i < nrows; i++) {\n x[i] = x_vec[i];\n }\n }\n \n // Clean up\n delete[] rows;\n delete[] cols;\n delete[] vals;\n delete[] ia;\n delete[] ja;\n 
delete[] a;\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the number of elements in each row of A\n std::vector rowCounts(N, 0);\n for (auto const& e : A) {\n rowCounts[e.row]++;\n }\n\n // Compute the offsets of each row in the CSR format\n std::vector rowOffsets(N, 0);\n for (size_t i = 1; i < N; i++) {\n rowOffsets[i] = rowOffsets[i - 1] + rowCounts[i - 1];\n }\n\n // Convert A to CSR format\n std::vector colIndices(A.size());\n std::vector values(A.size());\n for (auto const& e : A) {\n colIndices[rowOffsets[e.row] + rowCounts[e.row] - 1] = e.column;\n values[rowOffsets[e.row] + rowCounts[e.row] - 1] = e.value;\n rowCounts[e.row]--;\n }\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivotRow = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(values[rowOffsets[j] + rowCounts[j] - 1]) > std::abs(values[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1])) {\n pivotRow = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivotRow != i) {\n std::swap(rowOffsets[i], rowOffsets[pivotRow]);\n std::swap(rowCounts[i], rowCounts[pivotRow]);\n std::swap(values[rowOffsets[i] + rowCounts[i] - 1], values[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1]);\n std::swap(colIndices[rowOffsets[i] + rowCounts[i] - 1], colIndices[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = values[rowOffsets[j] + rowCounts[j] - 1] / values[rowOffsets[i] + rowCounts[i] - 1];\n for (size_t k = rowOffsets[j] + rowCounts[j] - 1; k >= rowOffsets[j]; k--) {\n values[k] -= factor * values[rowOffsets[i] + rowCounts[i] - 1 + k - rowOffsets[j]];\n colIndices[k] = colIndices[rowOffsets[i] + rowCounts[i] - 1 + k - rowOffsets[j]];\n }\n rowCounts[j] = rowCounts[i] - 1;\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = rowOffsets[i] + rowCounts[i] - 1; j < rowOffsets[i] + rowCounts[i]; j++) {\n x[i] -= values[j] * x[colIndices[j]];\n }\n x[i] /= values[rowOffsets[i] + rowCounts[i] - 1];\n }\n\n // Gather the solution on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N / size], N / size, MPI_DOUBLE, i, 0, comm, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N / size], N / size, MPI_DOUBLE, 0, 0, comm);\n }\n}\n", + "\n\n // Create a new MPI datatype for COOElement\n MPI_Datatype coo_element_type;\n MPI_Type_contiguous(3, MPI_DOUBLE, &coo_element_type);\n MPI_Type_commit(&coo_element_type);\n\n // Scatter the COO matrix and b vector to all ranks\n std::vector local_A(A.size() / MPI::COMM_WORLD.Get_size());\n std::vector local_b(b.size() / MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Scatter(A.data(), A.size(), coo_element_type, local_A.data(), local_A.size(), coo_element_type, 0);\n MPI::COMM_WORLD.Scatter(b.data(), b.size(), MPI_DOUBLE, local_b.data(), local_b.size(), MPI_DOUBLE, 0);\n\n // Solve the local linear system\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (auto& element : local_A) {\n if (element.row == i) {\n sum += element.value * local_b[element.column];\n }\n }\n local_x[i] = sum;\n }\n\n // Gather the local solutions to rank 0\n MPI::COMM_WORLD.Gather(local_x.data(), local_x.size(), MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, 
size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the number of elements per rank\n size_t num_elements_per_rank = A.size() / size;\n // Compute the starting index of the elements for this rank\n size_t start_index = rank * num_elements_per_rank;\n // Compute the ending index of the elements for this rank\n size_t end_index = (rank + 1) * num_elements_per_rank;\n\n // Create a local copy of the elements for this rank\n std::vector local_A(A.begin() + start_index, A.begin() + end_index);\n\n // Create a local copy of the vector b for this rank\n std::vector local_b(b.begin() + start_index, b.begin() + end_index);\n\n // Create a local copy of the vector x for this rank\n std::vector local_x(N);\n\n // Solve the local linear system\n for (size_t i = 0; i < local_A.size(); i++) {\n const COOElement& element = local_A[i];\n local_x[element.row] += element.value * local_b[element.column];\n }\n\n // Reduce the local solutions to the global solution on rank 0\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local row indices and values for this rank\n std::vector local_rows;\n std::vector local_values;\n for (auto const& element : A) {\n if (element.row / N == rank) {\n local_rows.push_back(element.row % N);\n local_values.push_back(element.value);\n }\n }\n\n // Create a sparse matrix in CSR format\n std::vector csr_row_offsets(size + 1);\n std::vector csr_column_indices(A.size());\n std::vector csr_values(A.size());\n MPI_Allgather(&local_rows[0], local_rows.size(), MPI_UNSIGNED_LONG, &csr_column_indices[0], local_rows.size(), MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n MPI_Allgather(&local_values[0], local_values.size(), MPI_DOUBLE, &csr_values[0], local_values.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n for (size_t i = 0; i < size; ++i) {\n csr_row_offsets[i + 1] = csr_row_offsets[i] + local_rows.size();\n }\n\n // Solve the linear system using the CSR matrix\n std::vector local_x(N);\n if (rank == 0) {\n x.resize(N);\n }\n sparse_solve(csr_row_offsets, csr_column_indices, csr_values, b, local_x);\n\n // Gather the solution from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the local number of rows and the local row indices\n size_t local_N = N / size;\n size_t start_row = rank * local_N;\n size_t end_row = (rank + 1) * local_N;\n\n // Compute the local contribution to the solution\n std::vector local_x(local_N);\n for (size_t i = start_row; i < end_row; ++i) {\n local_x[i - start_row] = b[i];\n for (COOElement const& e : A) {\n if (e.row == i) {\n local_x[i - start_row] -= e.value * x[e.column];\n }\n }\n local_x[i - start_row] /= A[i].value;\n }\n\n // Gather the local solutions into the global solution\n MPI_Gather(local_x.data(), local_N, MPI_DOUBLE, x.data(), local_N, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a vector object from the dense vector\n Eigen::VectorXd vector(N);\n for (size_t i = 0; i < N; ++i) {\n vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd result = 
matrix.ldlt().solve(vector);\n\n // Copy the result to the output vector\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = result(i);\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n int nrows = N / size + (rank < N % size);\n std::vector localA(A.begin() + rank * (N / size), A.begin() + rank * (N / size) + nrows);\n std::vector localb(b.begin() + rank * (N / size), b.begin() + rank * (N / size) + nrows);\n std::vector localx(nrows);\n \n // Solve the local system\n for (size_t i = 0; i < nrows; i++) {\n double sum = 0;\n for (size_t j = 0; j < localA.size(); j++) {\n if (localA[j].row == i) {\n sum += localA[j].value * localx[localA[j].column];\n }\n }\n localx[i] = (localb[i] - sum) / localA[i].value;\n }\n \n // Gather the local solutions\n MPI_Gather(localx.data(), nrows, MPI_DOUBLE, x.data(), nrows, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the local part of the matrix\n size_t local_size = A.size() / size;\n\n // Compute the local part of the matrix\n std::vector local_A(A.begin() + rank * local_size, A.begin() + (rank + 1) * local_size);\n\n // Compute the local part of the vector b\n std::vector local_b(b.begin() + rank * local_size, b.begin() + (rank + 1) * local_size);\n\n // Compute the local part of the solution vector\n std::vector local_x(N);\n\n // Solve the local linear system\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (auto& element : local_A) {\n if (element.column == i) {\n sum += element.value * local_x[element.row];\n }\n }\n local_x[i] = (local_b[i] - sum) / local_A[i].value;\n }\n\n // Gather the local solutions into the global solution vector\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the number of elements per rank\n size_t elementsPerRank = A.size() / size;\n \n // Compute the starting index of the elements for this rank\n size_t startIndex = rank * elementsPerRank;\n \n // Compute the ending index of the elements for this rank\n size_t endIndex = (rank + 1) * elementsPerRank;\n \n // Create a local copy of the elements for this rank\n std::vector localA(A.begin() + startIndex, A.begin() + endIndex);\n \n // Create a local copy of the vector b for this rank\n std::vector localb(b.begin() + startIndex, b.begin() + endIndex);\n \n // Create a local copy of the vector x for this rank\n std::vector localx(N);\n \n // Solve the local linear system\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < localA.size(); j++) {\n if (localA[j].column == i) {\n sum += localA[j].value * localb[j];\n }\n }\n localx[i] = sum;\n }\n \n // Gather the local solutions into the global solution\n MPI_Gather(localx.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_x(N, 0.0);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_x.data(), x.data(), N, 
MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the local part of the matrix\n size_t local_size = A.size() / size;\n size_t start_index = rank * local_size;\n size_t end_index = (rank + 1) * local_size;\n\n // Compute the local part of the matrix-vector product\n std::vector local_Ax(N, 0.0);\n for (size_t i = start_index; i < end_index; ++i) {\n local_Ax[A[i].column] += A[i].value * b[A[i].row];\n }\n\n // Reduce the local results to the global result\n MPI_Allreduce(local_Ax.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Rank 0 stores the final result\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Each rank computes its part of the result\n std::vector local_x(N);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n \n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format with sizeA elements. x and b are dense vectors with N elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\n__global__ void solveLinearSystem(const COOElement *A, size_t sizeA, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = tid / N;\n int col = tid % N;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column == col) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[tid].value;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n // Each thread computes one element of x.\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = A[row].column[col]; k < A[row].column[col + 1]; ++k) {\n sum += A[row].value[k] * x[A[row].column[k]];\n }\n b[row] = (b[row] - sum) / A[row].value[A[row].column[col + 1] - 1];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n 
x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n double sum = 0;\n for (int k = 0; k < size; ++k) {\n if (A[k].row == row && A[k].column == col) {\n sum += A[k].value * x[A[k].column];\n }\n }\n x[row] = (b[row] - sum) / A[size - 1].value;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int tid_offset = tid + bid * blockDim.x;\n if (tid_offset < sizeA) {\n size_t row = A[tid_offset].row;\n size_t column = A[tid_offset].column;\n double value = A[tid_offset].value;\n atomicAdd(&x[column], value * b[row]);\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = tid + bid * blockDim.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "&", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the products of A and b for each row\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value * b[A[i].column];\n }\n\n // Gather the row sums to rank 0\n std::vector globalRowSums(N);\n MPI_Allgather(rowSums.data(), N, MPI_DOUBLE, globalRowSums.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = globalRowSums[i];\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a dense vector for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; ++i) {\n b_eigen(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_eigen = matrix.ldlt().solve(b_eigen);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_eigen(i);\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix A_sparse(N, N);\n for (auto const& e : A) {\n A_sparse.insert(e.row, e.column) = e.value;\n }\n A_sparse.makeCompressed();\n\n // Create dense vectors for x and b\n Eigen::VectorXd x_dense(N), b_dense(N);\n for (size_t i = 0; i < N; ++i) {\n x_dense[i] = x[i];\n b_dense[i] = b[i];\n }\n\n // Solve the linear system using Eigen\n x_dense = A_sparse.colPivHouseholderQr().solve(b_dense);\n\n // Copy the result back to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_dense[i];\n }\n }\n}\n", + "\n\n // Get the number of processes and the rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per process\n int elementsPerProcess = N / size;\n\n // Create a buffer to store the local part of A and b\n std::vector localA(elementsPerProcess);\n std::vector localb(elementsPerProcess);\n\n // Scatter A and b to all processes\n MPI_Scatter(A.data(), elementsPerProcess, MPI_DOUBLE, localA.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(b.data(), elementsPerProcess, MPI_DOUBLE, localb.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Create a local copy of x\n std::vector localx(N, 0.0);\n\n // Perform the Gaussian elimination in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n // Find the pivot element\n int pivotRow = i;\n double maxPivot = std::abs(localA[i].value);\n for (int j = i + 1; j < elementsPerProcess; j++) {\n if (std::abs(localA[j].value) > maxPivot) {\n pivotRow = j;\n maxPivot = std::abs(localA[j].value);\n }\n }\n\n // Swap the pivot row with the first row\n std::swap(localA[i], localA[pivotRow]);\n std::swap(localb[i], localb[pivotRow]);\n\n // Eliminate the non-zero elements below the pivot\n for (int j = i + 1; j < elementsPerProcess; j++) {\n double factor = localA[j].value / localA[i].value;\n localA[j].value -= 
factor * localA[i].value;\n localb[j] -= factor * localb[i];\n }\n }\n\n // Gather the local solutions into a global solution\n MPI_Gather(localx.data(), elementsPerProcess, MPI_DOUBLE, x.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform the back substitution to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (int j = i + 1; j < N; j++) {\n sum += localA[j].value * x[j];\n }\n x[i] = (localb[i] - sum) / localA[i].value;\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the local contribution of each rank to the solution\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column];\n }\n\n // Reduce the local contributions to the global solution on rank 0\n MPI_Reduce(x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0);\n\n // Compute the sum of the products of A and x for each row\n std::vector rowSums(N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value * x[A[i].column];\n }\n\n // Gather the row sums to rank 0\n std::vector globalRowSums(N);\n MPI_Allgather(rowSums.data(), N, MPI_DOUBLE, globalRowSums.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Solve the system of equations on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = (b[i] - globalRowSums[i]) / A[i].value;\n }\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Iterate over the elements of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement& element = A[i];\n x[element.row] += element.value * b[element.column];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(&rowSums[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n\n // Gather x from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "<}", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Step 1: Distribute the rows of A and b to each rank\n std::vector localA;\n std::vector localb;\n distributeRows(A, b, localA, localb, rank, size, N);\n\n // Step 2: Solve the local system on each rank\n std::vector localx(N, 0.0);\n for (auto& element : localA) {\n localx[element.row] += element.value * localb[element.column];\n }\n\n // Step 3: Gather the local solutions to rank 0\n MPI_Gather(localx.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(rowSums.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if 
(MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n}\n", + "\n\n // Initialize x to 0\n x.resize(N);\n std::fill(x.begin(), x.end(), 0.0);\n\n // Perform Gauss-Seidel iteration\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n x[i] = (b[i] - sum) / A[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.resize(N);\n std::fill(x.begin(), x.end(), 0.0);\n\n // Perform Gauss-Jordan elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i].value) > std::abs(A[pivot_row * N + i].value)) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the ith row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the ith column\n double pivot = A[i * N + i].value;\n for (size_t j = 0; j < N; ++j) {\n if (j != i) {\n double factor = A[j * N + i].value / pivot;\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k].value -= factor * A[i * N + k].value;\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j].value * x[j];\n }\n x[i] /= A[i * N + i].value;\n }\n\n // Gather the solution on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n MPI::COMM_WORLD.Gather(&x[0], N, MPI::DOUBLE, &x[0], N, MPI::DOUBLE, 0);\n } else {\n MPI::COMM_WORLD.Gather(&x[0], N, MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n\n // Create a dense vector for b\n Eigen::VectorXd b_vector(N);\n for (size_t i = 0; i < N; ++i) {\n b_vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_vector = matrix.ldlt().solve(b_vector);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_vector(i);\n }\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0);\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] += element.value * b[element.column];\n }\n }\n }\n\n // Gather the results to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the local contribution of each rank to the solution\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n x[e.row] += e.value * b[e.column];\n }\n\n // Reduce the local contributions to the global solution on rank 0\n MPI_Reduce(x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix A_sparse(N, N);\n for (auto const& element : A) {\n A_sparse.insert(element.row, 
element.column) = element.value;\n }\n\n // Create dense vectors for x and b\n Eigen::VectorXd x_dense(N), b_dense(N);\n for (size_t i = 0; i < N; ++i) {\n x_dense[i] = x[i];\n b_dense[i] = b[i];\n }\n\n // Solve the linear system\n Eigen::SimplicialLDLT> solver;\n solver.compute(A_sparse);\n x_dense = solver.solve(b_dense);\n\n // Copy the result back to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_dense[i];\n }\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(rowSums.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n\n // Gather x from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Perform Gauss-Jordan elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i].value) > std::abs(A[pivot_row * N + i].value)) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j].value, A[pivot_row * N + j].value);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n double pivot = A[i * N + i].value;\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j].value /= pivot;\n }\n b[i] /= pivot;\n for (size_t j = 0; j < N; ++j) {\n if (j != i) {\n double factor = A[j * N + i].value;\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k].value -= factor * A[i * N + k].value;\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(Kokkos::View &A, Kokkos::View &L, Kokkos::View &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Iterate over the rows of A\n Kokkos::parallel_for(A.extent(0), KOKKOS_LAMBDA(size_t i) {\n // Get the row of A\n const COOElement* row = &A(i);\n\n // Iterate over the columns of the row\n for (size_t j = 0; j < row->column; ++j) {\n // Get the value of the element\n double value = row->value;\n\n // Update L and U\n for (size_t k = 0; k < i; ++k) {\n L(i, k) -= value * U(k, j);\n }\n U(i, j) = value;\n }\n });\n}\n", + "\n\n // Create a parallel for loop to iterate over the rows of A\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n // Initialize the ith row of L and U to the identity matrix\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n\n // Iterate over the non-zero elements in the ith row of A\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n // Update the ith row of L and U\n L(i, A(j).column) = A(j).value;\n U(i, A(j).column) = A(j).value;\n\n // Iterate over the rows below the ith row\n for (size_t k = i + 1; k < N; ++k) {\n // Update the kth row of L and U\n L(k, A(j).column) -= A(j).value * L(k, i);\n U(k, A(j).column) -= A(j).value * U(k, i);\n }\n }\n }\n });\n}\n", + "\n\n // Initialize L and U\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = A(i, i).value;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n L(i, k) = A(i, k).value / U(k, k);\n for (size_t j = k + 1; j < N; ++j) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Iterate over the rows of A\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n // Iterate over the columns of A\n for (size_t j = 0; j < N; j++) {\n // If A(i, j) is non-zero, then update L and U\n if (A(i, j).value != 0.0) {\n // Update L\n L(i, j) = A(i, j).value / U(j, j);\n\n // Update U\n for (size_t k = j + 1; k < N; k++) {\n U(j, k) -= L(i, j) * A(i, k).value;\n }\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 
0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy>(0, N), KOKKOS_LAMBDA(const int &i, const int &j) {\n if (i == j) {\n L(i, j) = 1.0;\n U(i, j) = A(i, j);\n } else if (i > j) {\n L(i, j) = A(i, j) / U(j, j);\n U(i, j) = 0.0;\n } else {\n U(i, j) = A(i, j) / L(i, i);\n L(i, j) = 0.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n L(j, i) -= L(j, k) * U(k, i);\n }\n L(j, i) /= U(i, i);\n }\n });\n}\n", + "\n\n // Create a copy of A\n auto Acopy = KokKOS::make_view(A);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n Acopy(j, i) -= Acopy(j, i) * Acopy(i, i);\n }\n }\n\n // Store L and U\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n L(i, j) = 1.0;\n } else if (i < j) {\n L(i, j) = Acopy(i, j);\n } else {\n U(i, j) = Acopy(i, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0.0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n L(i, j) -= L(i, k) * U(k, j);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n U(i, j) /= L(i, i);\n }\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n U(i, j) += A(i, j).value;\n } else {\n L(i, j) += A(i, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0;\n U(i, j) = 0;\n }\n L(i, i) = 1;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), 
KOKKOS_LAMBDA(const Kokkos::TeamPolicy::member_type &team) {\n const size_t i = team.league_rank();\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < i; k++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n if (i == j) {\n L(i, i) = 1 / U(i, i);\n } else {\n L(i, j) = -U(i, j) * L(j, j);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L(i, k) = A(i, k) / U(k, k);\n for (size_t j = k + 1; j < N; j++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA(const size_t k) {\n const auto &e = A(k);\n const double a_kk = U(e.row, e.row);\n const double l_ik = L(e.column, e.row);\n U(e.row, e.column) = e.value - l_ik * a_kk;\n for (size_t i = e.row + 1; i < N; i++) {\n const double l_ik = L(e.column, e.row);\n const double a_ki = U(e.row, i);\n L(i, e.column) = (e.value - a_ki * a_kk) / l_ik;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t row) {\n double sumL = 0.0;\n double sumU = 0.0;\n for (size_t i = 0; i < row; i++) {\n sumL += L(row, i) * U(i, row);\n sumU += L(i, row) * U(row, i);\n }\n L(row, row) = 1.0;\n U(row, row) = A(row, row) - sumU;\n for (size_t col = row + 1; col < N; col++) {\n L(row, col) = (A(row, col) - sumL) / U(row, row);\n U(row, col) = A(row, col) - sumU;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA(const int& k) {\n const COOElement& element = A(k);\n const size_t i = element.row;\n const size_t j = element.column;\n const double value = element.value;\n U(i, j) = value;\n for (size_t p = 0; p < i; p++) {\n L(i, p) += value * L(j, p);\n U(j, p) -= value * U(i, p);\n }\n L(i, i) -= value * L(j, i);\n U(j, j) += value * U(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; ++j) {\n double sum = 0;\n for (size_t k = 0; k < j; ++k) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n 
Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U using the k-th row of A\n for (size_t i = k + 1; i < N; i++) {\n L(i, k) = A(i, k) / U(k, k);\n for (size_t j = k + 1; j < N; j++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sumL = 0.0, sumU = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization in parallel\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k].value / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * U[i * N + k];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n U[i * N + j] = A[i * N + j] - L[i * N + k] * U[k 
* N + j];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n #pragma opm parallel for\n for (size_t k = 0; k < A.size(); ++k) {\n size_t row = A[k].row;\n size_t column = A[k].column;\n double value = A[k].value;\n #pragma opm atomic\n L[row * N + column] += value;\n #pragma opm atomic\n U[column * N + row] += value;\n }\n #pragma opm parallel for\n for (size_t i = 0; i < N; ++i) {\n #pragma opm atomic\n L[i * N + i] = 1;\n }\n}\n", + "<=====,", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n if (row == column)\n L[row * N + column] = 1;\n if (row < column)\n U[row * N + column] = value;\n if (row > column)\n L[row * N + column] = value;\n }\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = 0; i < k; ++i) {\n U[k * N + j] -= L[k * N + i] * U[i * N + j];\n }\n }\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = 0; j < k; ++j) {\n L[i * N + k] -= U[i * N + j] * L[j * N + k];\n }\n L[i * N + k] /= U[k * N + k];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement &e = A[i];\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1.0;\n U[e.row * N + e.column] = e.value;\n } else if (e.row < e.column) {\n L[e.row * N + e.column] = e.value / U[e.row * N + e.row];\n U[e.column * N + e.row] = 0.0;\n } else {\n U[e.row * N + e.column] = e.value;\n L[e.column * N + e.row] = 0.0;\n }\n }\n\n #pragma omp parallel for\n for (size_t k = 1; k < N; k++) {\n for (size_t i = k; i < N; i++) {\n for (size_t j = 0; j < k; j++) {\n U[i * N + k] -= L[i * N + j] * U[j * N + k];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n L[k * N + j] -= L[k * N + k] * U[k * N + j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sumL = 0.0, sumU = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n U[i * N + j] = A[i * N + j] - L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L[i * N + k] * U[k * N + j];\n }\n if (i == j) {\n L[i * N + i] = 1;\n U[i * N + i] = A[i].value - sum;\n } else {\n L[i * N + j] = (A[i].value - sum) / U[j * N + j];\n U[i * N + j] = 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel 
for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform the factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n }\n\n // Update U\n for (size_t j = k + 1; j < N; j++) {\n for (size_t i = k + 1; i < N; i++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n A[i * N + j] -= U[k * N + j] * A[i * N + k];\n L[i * N + j] = A[i * N + j] / L[k * N + k];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n double sum = 0;\n for (size_t k = 0; k < j; ++k) {\n sum += L[i][k] * U[k][j];\n }\n L[i][j] = (A[i][j] - sum) / U[j][j];\n }\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[i][k] * U[k][i];\n }\n U[i][i] = A[i][i] - sum;\n for (size_t j = i + 1; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[j][k] * U[k][j];\n }\n U[i][j] = (A[i][j] - sum) / L[i][i];\n }\n }\n\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n A[i * N + j] -= U[k * N + j] * A[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + 
"do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.emplace_back(e.row, e.column, 1);\n U.emplace_back(e.row, e.column, e.value);\n } else {\n L.emplace_back(e.row, e.column, e.value / rowSum[e.row]);\n U.emplace_back(e.row, e.column, 0);\n }\n }\n for (size_t i = 1; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n for (size_t k = 0; k < N; k++) {\n U[i * N + k] -= L[i * N + j] * U[j * N + k];\n }\n }\n }\n}\n", + "\n\n std::vector row(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1;\n U[e.row * N + e.column] = e.value;\n } else {\n row[e.column] = e.value / U[e.row * N + e.row];\n for (size_t i = e.column; i < N; ++i) {\n L[e.row * N + i] = row[i];\n U[e.row * N + i] -= row[i] * U[e.column * N + i];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& e : A) {\n L[e.row * N + e.column] = (e.row == e.column) ? 1 : 0;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = 0; i < k; ++i) {\n U[k * N + j] -= L[k * N + i] * U[i * N + j];\n }\n L[j * N + k] = U[j * N + k] / U[k * N + k];\n for (size_t i = k + 1; i < N; ++i) {\n U[j * N + i] -= L[j * N + k] * U[k * N + i];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& element : A) {\n L[element.row * N + element.column] = element.value;\n U[element.row * N + element.column] = element.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (L[i * N + j] != 0) {\n double factor = L[i * N + j] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= factor * L[i * N + k];\n U[i * N + k] -= factor * U[j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n L[i * N + j] = 1;\n } else if (i > j) {\n L[i * N + j] = L_temp[i][j] / L_temp[j][j];\n for (size_t k = 0; k < N; k++) {\n U_temp[i][k] -= L[i * N + j] * U_temp[j][k];\n }\n } else {\n U[i * N + j] = U_temp[i][j];\n }\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.push_back(1);\n U.push_back(e.value / rowSum[e.row]);\n } else if (e.row < e.column) {\n L.push_back(0);\n U.push_back(e.value / rowSum[e.row]);\n } else {\n L.push_back(e.value / rowSum[e.column]);\n U.push_back(0);\n }\n }\n}\n", + "\n\n // Initialize L and U to be identity matrices\n L.resize(N * N);\n U.resize(N * N);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n U[i * N + i] = 1;\n }\n\n // Iterate over the elements of A\n for (auto const& element : A) {\n // Update L and U\n for (size_t i = 0; i < element.row; i++) {\n L[element.row * N + i] -= L[element.column * N + i] * element.value;\n }\n for (size_t j = element.column + 1; j < N; j++) {\n U[element.row * N + j] -= U[element.column * N + j] * element.value;\n }\n }\n}\n", + "\n\n std::vector> row_ptr(N + 1);\n std::vector col_idx(A.size());\n 
std::vector values(A.size());\n for (size_t i = 0; i < A.size(); i++) {\n row_ptr[A[i].row + 1]++;\n col_idx[i] = A[i].column;\n values[i] = A[i].value;\n }\n for (size_t i = 1; i <= N; i++) {\n row_ptr[i] += row_ptr[i - 1];\n }\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < A.size(); i++) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double value = A[i].value;\n U[row_ptr[row] + col - 1] = value;\n for (size_t j = row_ptr[row]; j < row_ptr[row + 1]; j++) {\n if (col_idx[j] < col) {\n L[row_ptr[col] + col_idx[j] - 1] += value * U[row_ptr[row] + col_idx[j] - 1];\n } else if (col_idx[j] > col) {\n U[row_ptr[row] + col_idx[j] - 1] -= value * L[row_ptr[col] + col_idx[j] - 1];\n }\n }\n }\n for (size_t i = 0; i < N; i++) {\n L[row_ptr[i] + i - 1] = 1;\n }\n}\n", + "\n\n std::vector L_row, L_col, U_row, U_col;\n std::vector L_val, U_val;\n L_row.reserve(A.size());\n L_col.reserve(A.size());\n L_val.reserve(A.size());\n U_row.reserve(A.size());\n U_col.reserve(A.size());\n U_val.reserve(A.size());\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < A.size(); k++) {\n if (A[k].row == i && A[k].column == j) {\n sumU = A[k].value;\n U_row.push_back(i);\n U_col.push_back(j);\n U_val.push_back(sumU);\n }\n if (A[k].row == i && A[k].column < j) {\n sumL += A[k].value;\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(sumL);\n }\n if (A[k].row < i && A[k].column == j) {\n sumL += A[k].value;\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(sumL);\n }\n }\n if (sumU == 0) {\n U_row.push_back(i);\n U_col.push_back(j);\n U_val.push_back(0);\n }\n if (sumL == 0) {\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(0);\n }\n }\n }\n L.resize(L_row.size());\n U.resize(U_row.size());\n for (size_t i = 0; i < L_row.size(); i++) {\n L[i].row = L_row[i];\n L[i].column = L_col[i];\n L[i].value = L_val[i];\n }\n for (size_t i = 0; i < U_row.size(); i++) {\n U[i].row = U_row[i];\n U[i].column = U_col[i];\n U[i].value = U_val[i];\n }\n}\n", + "\n\n std::vector> L_matrix(N, std::vector(N, 0.0));\n std::vector> U_matrix(N, std::vector(N, 0.0));\n\n for (auto const& element : A) {\n L_matrix[element.row][element.column] = element.value;\n U_matrix[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n L_matrix[i][j] = 1.0;\n } else if (i > j) {\n L_matrix[i][j] = 0.0;\n } else {\n U_matrix[i][j] = 0.0;\n }\n }\n }\n\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n double factor = L_matrix[i][k] / L_matrix[k][k];\n for (size_t j = k; j < N; ++j) {\n L_matrix[i][j] -= factor * U_matrix[k][j];\n U_matrix[i][j] -= factor * L_matrix[k][j];\n }\n }\n }\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n L[i * N + j] = L_matrix[i][j];\n U[i * N + j] = U_matrix[i][j];\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n\n for (size_t i = 0; i < A.size(); i++) {\n L[A[i].row * N + A[i].column] = A[i].value;\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = L[i * N + k] / L[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] = U[i * N + j] - L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector row(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) 
{\n L.emplace_back(COOElement{e.row, e.column, 1});\n row[e.column] = e.value;\n } else {\n U.emplace_back(COOElement{e.row, e.column, e.value});\n }\n }\n for (auto const& e : A) {\n if (e.row != e.column) {\n L.emplace_back(COOElement{e.row, e.column, e.value / row[e.column]});\n for (auto& u : U) {\n if (u.row == e.row && u.column == e.column) {\n u.value -= e.value * L.back().value;\n }\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n U[row * N + column] = value;\n for (size_t j = 0; j < row; ++j) {\n L[row * N + j] -= L[i * N + j] * value / U[j * N + j];\n }\n for (size_t j = row + 1; j < N; ++j) {\n U[row * N + j] -= L[i * N + j] * value / U[j * N + j];\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n for (size_t i = 0; i < N; ++i) {\n L_temp[i][i] = 1;\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n L_temp[i][k] = L_temp[i][k] / L_temp[k][k];\n for (size_t j = k + 1; j < N; ++j) {\n U_temp[k][j] = U_temp[k][j] - L_temp[i][k] * U_temp[i][j];\n }\n }\n }\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n L.push_back(L_temp[i][j]);\n U.push_back(U_temp[i][j]);\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n rowSum[e.row] += e.value;\n }\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.push_back({e.row, e.column, 1});\n U.push_back({e.row, e.column, e.value / rowSum[e.row]});\n } else if (e.row < e.column) {\n L.push_back({e.row, e.column, e.value / rowSum[e.row]});\n U.push_back({e.row, e.column, 0});\n } else {\n L.push_back({e.row, e.column, 0});\n U.push_back({e.row, e.column, e.value});\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n L_temp[i][j] = 1;\n } else if (i > j) {\n L_temp[i][j] = 0;\n }\n }\n }\n\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n double factor = U_temp[i][k] / U_temp[k][k];\n L_temp[i][k] = factor;\n for (size_t j = k; j < N; j++) {\n U_temp[i][j] -= factor * U_temp[k][j];\n }\n }\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n L.push_back(L_temp[i][j]);\n U.push_back(U_temp[i][j]);\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto &e : A) {\n L[e.row * N + e.column] = e.value;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (L[j * N + i] != 0) {\n double factor = L[j * N + i] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= factor * L[i * N + k];\n U[i * N + k] -= factor * U[j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n std::vector rowSum(N, 0);\n for (auto const& element : A) {\n rowSum[element.row] += element.value;\n }\n\n for (auto const& element : A) {\n if (element.row == 
element.column) {\n L.push_back(1);\n U.push_back(element.value / rowSum[element.row]);\n } else if (element.row < element.column) {\n L.push_back(0);\n U.push_back(element.value / rowSum[element.row]);\n } else {\n L.push_back(element.value / rowSum[element.column]);\n U.push_back(0);\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& element : A) {\n L[element.row * N + element.column] = element.value;\n U[element.row * N + element.column] = element.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = L[j * N + i] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= multiplier * L[i * N + k];\n U[i * N + k] -= multiplier * U[j * N + k];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (auto const& e : A) {\n L[e.row * N + e.column] = e.value;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n L[j * N + i] = U[j * N + i] / U[i * N + i];\n for (size_t k = i; k < N; k++) {\n U[j * N + k] -= L[j * N + i] * U[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format with sizeA elements.\n Use AMD HIP to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\n__global__ void luFactorize(const COOElement *A, size_t sizeA, double *L, double *U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[row * N + col];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n COOElement element = A[tid];\n int row = element.row;\n int col = element.column;\n double val = element.value;\n\n if (row == col) {\n L[row * N + row] = 1.0;\n U[row * N + col] = val;\n } else if (row < col) {\n L[row * N + col] = val / U[col * N + col];\n U[row * N + col] = 0.0;\n } else {\n U[row * N + col] -= L[row * N + row] * val;\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x;\n int start = tid + bid * num_threads;\n for (int i = start; i < sizeA; i += bid_size * num_threads) {\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = A[i].value / U[A[i].column];\n U[i] = 0.0;\n }\n }\n __syncthreads();\n for (int i = start; i < sizeA; i += bid_size * num_threads) {\n if (A[i].row > A[i].column) {\n U[i] -= L[i] * U[A[i].column];\n } else {\n L[i] -= L[i] * U[A[i].column];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n COOElement e = A[i];\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1.0;\n U[e.row * N + e.column] = e.value;\n for (int k = e.row + 1; k < N; k++) {\n L[k * N + e.column] = A[i].value / 
e.value;\n U[e.row * N + k] -= L[k * N + e.column] * e.value;\n }\n }\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1;\n } else {\n U[row * N + column] = value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n for (size_t i = tid; i < sizeA; i += stride) {\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = 0.0;\n U[i] = 0.0;\n }\n }\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n for (size_t j = tid; j < sizeA; j += stride) {\n if (A[j].row == i && A[j].column == k) {\n L[j] = A[j].value / U[k];\n U[j] = 0.0;\n }\n }\n for (size_t j = tid; j < sizeA; j += stride) {\n if (A[j].row == i && A[j].column > k) {\n U[j] -= L[j] * U[A[j].column];\n }\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else {\n L[tid] = 0.0;\n U[tid] = 0.0;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t i = A[tid].row;\n size_t j = A[tid].column;\n double v = A[tid].value;\n\n if (i == j) {\n L[i * N + i] = 1.0;\n U[i * N + j] = v;\n } else if (i < j) {\n L[i * N + j] = v;\n } else {\n U[i * N + j] = v;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[col * N + row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n\n if (row == column) {\n L[row * N + row] = 1.0;\n U[row * N + column] = value;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n U[row * N + column] = 0.0;\n } else {\n U[row * N + column] -= L[row * N + row] * value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n if (A[tid].row == A[tid].column) {\n L[tid] = 1;\n U[tid] = A[tid].value;\n } else if (A[tid].row < A[tid].column) {\n L[tid] = 0;\n U[tid] = A[tid].value;\n } else {\n L[tid] = A[tid].value / U[A[tid].column];\n U[tid] = 0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n const COOElement &a = A[tid];\n if (a.row == a.column) {\n L[a.row * N + a.column] = 1.0;\n U[a.row * N + a.column] = a.value;\n } else if (a.row < a.column) {\n L[a.row * N + a.column] = 0.0;\n U[a.row * N + a.column] = a.value;\n } else {\n L[a.row * N + a.column] = a.value / U[a.column * N + a.column];\n U[a.row * N + a.column] = 0.0;\n for (int k = a.column + 1; k < a.row; k++) {\n U[a.row * N + k] -= L[a.row * N + a.column] * U[a.column * N + k];\n }\n }\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n size_t col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row].value;\n } else if (row < col) {\n L[row * N + col] = 0;\n U[row * N + col] = A[row].value;\n } else {\n L[row * N + col] = A[row].value / U[col * N + col];\n U[row * N + col] = 0;\n }\n }\n}\n", + "\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= sizeA) return;\n if (A[i].row == A[i].column) {\n L[i] 
= 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = 0.0;\n U[i] = 0.0;\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n U[row * N + col] = 0.0;\n }\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column == col) {\n U[row * N + col] = A[i].value;\n } else if (A[i].row == row) {\n L[row * N + col] -= A[i].value * U[A[i].column * N + col];\n } else if (A[i].column == col) {\n U[row * N + col] -= A[i].value * L[row * N + A[i].row];\n }\n }\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n }\n U[row * N + col] = A[row * N + col].value;\n }\n __syncthreads();\n for (int k = 0; k < row; k++) {\n if (U[k * N + col] != 0.0) {\n L[row * N + k] = A[row * N + k].value / U[k * N + col];\n for (int j = col; j < N; j++) {\n U[row * N + j] -= L[row * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n if (A[tid].row == A[tid].column) {\n L[tid * N + A[tid].column] = 1;\n U[tid * N + A[tid].column] = A[tid].value;\n } else {\n L[tid * N + A[tid].column] = 0;\n U[tid * N + A[tid].column] = A[tid].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int i = A[tid].row;\n int j = A[tid].column;\n double v = A[tid].value;\n if (i == j) {\n L[i * N + i] = 1;\n U[i * N + i] = v;\n } else if (i < j) {\n L[i * N + j] = v / U[j * N + j];\n U[i * N + j] = 0;\n } else {\n U[i * N + j] -= L[i * N + i] * v;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) {\n return;\n }\n const COOElement &a = A[tid];\n if (a.row == a.column) {\n L[a.row * N + a.column] = 1.0;\n U[a.row * N + a.column] = a.value;\n } else if (a.row > a.column) {\n L[a.row * N + a.column] = a.value / U[a.column * N + a.column];\n U[a.row * N + a.column] = 0.0;\n } else {\n U[a.row * N + a.column] = a.value;\n L[a.row * N + a.column] = 0.0;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else if (row < column) {\n L[tid] = 0.0;\n U[tid] = value;\n } else {\n L[tid] = value / U[column * N + column];\n U[tid] = 0.0;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
Store the result in L and U on rank 0.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Find the pivot element in column k\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k * N + k].value);\n for (size_t i = k + 1; i < N; ++i) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows k and pivot_row in L and U\n for (size_t j = 0; j < N; ++j) {\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n std::swap(U[k * N + j], U[pivot_row * N + j]);\n }\n\n // Update the remaining rows in column k\n for (size_t i = k + 1; i < N; ++i) {\n double multiplier = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j].value -= multiplier * A[k * N + j].value;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector L_recv(N * N, 0.0);\n std::vector U_recv(N * N, 0.0);\n MPI::COMM_WORLD.Recv(L_recv.data(), N * N, MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), N * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; ++j) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), N * N, MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector L_recv, U_recv;\n MPI::COMM_WORLD.Recv(L_recv.data(), L_recv.size(), MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), U_recv.size(), MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; j++) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), L.size(), MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), U.size(), MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Compute L and U in parallel\n for (size_t i = 0; i < N; ++i) {\n // Compute L(i, :)\n for (size_t j = 0; j < i; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < j; ++k) {\n sum += L[i * N + k] * U[k * N + j];\n }\n L[i * N + j] = (A[i].value - sum) / U[j * N + j];\n }\n\n // Compute U(i, :)\n for (size_t j = i; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[i * N + k] * U[k * N + j];\n }\n U[i * N + j] = A[i].value - sum;\n }\n }\n\n // Gather L and U on rank 0\n if (MPI::COMM_WORLD.Get_rank() 
== 0) {\n MPI::COMM_WORLD.Gather(&L[0], N * N, MPI::DOUBLE, &L[0], N * N, MPI::DOUBLE, 0);\n MPI::COMM_WORLD.Gather(&U[0], N * N, MPI::DOUBLE, &U[0], N * N, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j].value, A[pivot_row * N + j].value);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Eliminate the elements below the pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = factor;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= factor * A[k * N + j].value;\n }\n }\n }\n\n // Copy the upper triangular matrix to U\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i; j < N; j++) {\n U[i * N + j] = A[i * N + j].value;\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A and store the result in L\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k; j < N; ++j) {\n L[i * N + k] -= U[i * N + j] * U[k * N + j];\n }\n }\n\n // For each column j > k, subtract the k-th column of L from the j-th column of A and store the result in U\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = k; i < N; ++i) {\n U[k * N + j] -= L[i * N + k] * U[i * N + j];\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A\n for (size_t i = k + 1; i < N; i++) {\n double factor = U[i * N + k] / U[k * N + k];\n for (size_t j = k; j < N; j++) {\n U[i * N + j] -= factor * U[k * N + j];\n }\n L[i * N + k] = factor;\n }\n }\n\n // Gather the L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n std::vector L(A);\n std::vector U(A);\n\n // Perform Gaussian elimination\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the pivot row\n MPI_Bcast(&A[k], sizeof(COOElement) * A[k].size(), MPI_BYTE, k, 
MPI_COMM_WORLD);\n\n // Eliminate the pivot column in the remaining rows\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n double factor = A[i][k].value / A[k][k].value;\n for (size_t j = k; j < N; ++j) {\n A[i][j].value -= factor * A[k][j].value;\n }\n }\n\n // Update L and U\n L[k].value = 1.0;\n for (size_t j = k + 1; j < N; ++j) {\n L[j].value = A[j][k].value;\n U[k].value = A[k][j].value;\n }\n }\n\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A\n for (size_t i = k + 1; i < N; ++i) {\n double factor = U[i * N + k] / U[k * N + k];\n for (size_t j = k; j < N; ++j) {\n U[i * N + j] -= factor * U[k * N + j];\n }\n L[i * N + k] = factor;\n }\n }\n\n // Gather the L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot row\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (A[i * N + k] > A[pivot_row * N + k]) {\n pivot_row = i;\n }\n }\n\n // Swap the pivot row with the k-th row in L and U\n for (size_t j = 0; j < N; j++) {\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n std::swap(U[k * N + j], U[pivot_row * N + j]);\n }\n\n // Eliminate the k-th column in the remaining rows\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n L[i * N + k] = factor;\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(A.size());\n U.resize(A.size());\n for (size_t i = 0; i < A.size(); ++i) {\n L[i].row = A[i].row;\n L[i].column = A[i].column;\n L[i].value = 0;\n U[i].row = A[i].row;\n U[i].column = A[i].column;\n U[i].value = 
0;\n }\n\n // Perform LU factorization\n for (size_t i = 0; i < N; ++i) {\n // Update L\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i && A[j].column < i) {\n L[j].value = A[j].value / U[j].value;\n }\n }\n\n // Update U\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row < i && A[j].column == i) {\n U[j].value = A[j].value / L[j].value;\n }\n }\n\n // Update A\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row >= i && A[j].column >= i) {\n A[j].value -= L[j].value * U[j].value;\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recvL(A.size());\n std::vector recvU(A.size());\n MPI::COMM_WORLD.Recv(recvL.data(), A.size(), MPI_DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(recvU.data(), A.size(), MPI_DOUBLE, i, 0);\n for (size_t j = 0; j < A.size(); ++j) {\n L[j].value += recvL[j].value;\n U[j].value += recvU[j].value;\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), A.size(), MPI_DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), A.size(), MPI_DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < A.size(); i++) {\n if (std::abs(A[i].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i].value);\n }\n }\n\n // Swap the pivot row with the k-th row in A\n std::swap(A[k], A[pivot_row]);\n\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n double multiplier = A[i].value / A[k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; j++) {\n U[k * N + j] -= multiplier * A[i].value;\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n\n // Initialize the diagonal of L to 1\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Iterate over the rows of A\n for (size_t k = 0; k < A.size(); k++) {\n // Get the row, column, and value of the current element\n size_t row = A[k].row;\n size_t column = A[k].column;\n double value = A[k].value;\n\n // Update the elements of L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = value / U[k * N + k];\n U[i * N + column] -= L[i * N + k] * U[k * N + column];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i > k\n for (size_t i = k + 1; i < N; i++) {\n // Calculate the multiplier\n double multiplier = A[i].value / U[k * N + k];\n\n // Update the i-th row of L\n for (size_t j = 0; j < N; j++) {\n L[i * N + j] += multiplier * U[k * N + j];\n }\n\n // Update the i-th row of U\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= multiplier * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform 
LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = A[k].value;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > std::abs(pivot_value)) {\n pivot_row = i;\n pivot_value = A[i * N + k].value;\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j].value, A[pivot_row * N + j].value);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Eliminate the elements below the pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k].value / pivot_value;\n L[i * N + k] = factor;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= factor * A[k * N + j].value;\n U[i * N + j] = A[i * N + j].value;\n }\n }\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector L_recv(N * N, 0.0);\n std::vector U_recv(N * N, 0.0);\n MPI::COMM_WORLD.Recv(L_recv.data(), N * N, MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), N * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; j++) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), N * N, MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each rank, update the k-th row of L and the remaining rows of U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n\n // Gather the updated L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each rank, update the k-th row of L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "<,", + "_t* 0." + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format with sizeA elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\n__global__ void luFactorize(const COOElement *A, size_t sizeA, double *L, double *U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[row * N + col].value;\n for (int k = 0; k < row; k++) {\n U[row * N + col] -= L[row * N + k] * U[k * N + col];\n }\n L[row * N + col] = U[row * N + col] / U[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < i; j++) {\n sum += L[row * N + j] * U[j * N + A[i].column];\n }\n U[row * N + A[i].column] = A[i].value - sum;\n }\n if (A[i].column == row) {\n L[row * N + A[i].row] = A[i].value;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else if (row < column) {\n L[tid] = 0.0;\n U[tid] = value;\n } else {\n L[tid] = value / U[column * N + column];\n U[tid] = 0.0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + column] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) {\n return;\n }\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n }\n }\n}\n", + "\n\n int row = block.y * block.x + thread.y * N + thread.x;\n if (row < sizeA) {\n if (A[row].row == A[row].column) {\n L[A[row].row * N + A[row].column] = 1.0;\n U[A[row].row * N + A[row].column] = A[row].value;\n } else {\n L[A[row].row * N + A[row].column] = A[row].value / U[A[row].column * N + A[row].column];\n U[A[row].row * N + A[row].column] = 0.0;\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n double sum = 0.0;\n for (int k = col; k < N; k++) {\n sum += L[row * N + k] * U[k * N + col];\n }\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < 
sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row < col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n for (size_t k = col + 1; k < N; k++) {\n U[row * N + k] -= L[row * N + col] * U[col * N + k];\n }\n }\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n double sum = 0.0;\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row && A[k].column == col) {\n sum += A[k].value * U[col * N + A[k].column];\n }\n }\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row >= N || col >= N) return;\n\n double sum = 0;\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row && A[k].column < col) {\n sum += L[row * N + A[k].column] * U[A[k].column * N + col];\n }\n }\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row * N + col] - sum;\n } else {\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n U[row * N + col] = 0;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n }\n U[row * N + col] = A[row * N + col].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n } else if (row < column) {\n L[tid] = 0.0;\n } else {\n U[tid] = value / L[column * N + column];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0;\n U[row * N + col] = A[row * N + 
col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n U[row * N + column] = value;\n } else if (row < column) {\n L[row * N + column] = value;\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < sizeA; ++i) {\n const auto row = A[i].row;\n const auto column = A[i].column;\n const auto value = A[i].value;\n if (row == column) {\n L[row * N + column] = 1;\n } else if (row < column) {\n U[row * N + column] = value;\n } else {\n L[row * N + column] = value;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. Store the result in L and U on rank 0.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n 
// Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Initialize the diagonal elements of L to 1\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update the k-th row of L and U\n #pragma omp parallel for\n for (size_t j = k + 1; j < N; j++) {\n L[k * N + j] = A[k].value / U[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n U[i * N + j] -= L[k * N + j] * A[i].value;\n }\n }\n\n // Update the remaining rows of L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n for (size_t j = k + 1; j < N; j++) {\n L[i * N + j] -= L[i * N + k] * L[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Factorize A\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n 
L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n A[i].value -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Update U\n #pragma omp parallel for\n for (size_t j = k + 1; j < N; ++j) {\n U[k * N + j] = A[k].value / L[k * N + k];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; ++j) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform Gauss elimination\n for (size_t k = 0; k < N; ++k) {\n // Rank 0 broadcasts the k-th row of U to all other ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 updates the k-th row of L\n for (size_t j = k + 1; j < N; ++j) {\n L[k * N + j] = A[j].value / U[k * N + k];\n }\n\n // All ranks update 
the remaining rows of U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[k * N + j] * U[k * N + i];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k * N + k].value);\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n double multiplier = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= multiplier * A[k * N + j].value;\n U[k * N + j].value += multiplier * A[i * N + j].value;\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < N; ++i) {\n if (std::abs(A[i].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i].value);\n }\n }\n\n // Swap the pivot row with the k-th row in A, L, and U\n if (pivot_row != k) {\n std::swap(A[k], A[pivot_row]);\n for (size_t i = 0; i < N; ++i) {\n std::swap(L[k * N + i], L[pivot_row * N + i]);\n std::swap(U[k * N + i], U[pivot_row * N + i]);\n }\n }\n\n // Eliminate the elements below the pivot element\n for (size_t i = k + 1; i < N; ++i) {\n double multiplier = A[i].value / A[k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; ++j) {\n A[i].value -= multiplier * A[k].value;\n U[i * N + j] -= multiplier * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 
0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, Kokkos::View &x, Kokkos::View &y, Kokkos::View &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x[j].index == i) sum += alpha * x[j].value;\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y[j].index == i) sum += y[j].value;\n }\n z[i] = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA (const int i) {\n z[i] = alpha * x[i].value + y[i].value;\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(\"sparse_py\", 0, z.extent(0), KOKKOK_LAMBDA(size_t i) {\n z[i] = alpha * x[i].value + y[i].value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n sum += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n sum += y[j].value;\n }\n }\n z[i] = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) sum += alpha * x(j).value;\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) sum += y(j).value;\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n // Create a Kokkos kernel to perform the sparse axpy operation\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum 
= 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x(j).index == i) sum += alpha * x(j).value;\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y(j).index == i) sum += y(j).value;\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy<>(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i, double &sum) {\n sum += alpha * x(i).value + y(i).value;\n }, z(0));\n}\n", + "\n\n const size_t num_elements_x = x.extent(0);\n const size_t num_elements_y = y.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, num_elements_x + num_elements_y),\n KOKKOS_LAMBDA(const size_t i) {\n if (i < num_elements_x) {\n const size_t index = x(i).index;\n z(index) += alpha * x(i).value;\n } else {\n const size_t index = y(i - num_elements_x).index;\n z(index) += y(i - num_elements_x).value;\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i, double &sum) {\n sum += alpha * x(i).value + y(i).value;\n }, z(0));\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. 
Store the result in z.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t n = x.size();\n size_t m = y.size();\n z.resize(std::max(n, m));\n #pragma omp parallel for\n for (size_t i = 0; i < std::max(n, m); i++) {\n z[i] = 0;\n }\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n z[x[i].index] += alpha * x[i].value;\n }\n #pragma omp parallel for\n for (size_t i = 0; i < m; i++) {\n z[y[i].index] += y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n 
}\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "serial", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. 
Store the result in z.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0, j = 0; i < x.size() || j < y.size();) {\n if (j == y.size() || (i < x.size() && x[i].index < y[j].index)) {\n z[x[i].index] += alpha * x[i].value;\n i++;\n } else if (i == x.size() || (j < y.size() && y[j].index < x[i].index)) {\n z[y[j].index] += y[j].value;\n j++;\n } else {\n z[x[i].index] += alpha * x[i].value + y[j].value;\n i++;\n j++;\n }\n }\n}\n", + "\n\n z.clear();\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z.push_back(alpha * x[i].value);\n i++;\n } else {\n z.push_back(y[j].value);\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(alpha * x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n z.clear();\n z.resize(x.size() + y.size());\n auto itx = x.begin();\n auto ity = y.begin();\n auto itz = z.begin();\n while (itx != x.end() && ity != y.end()) {\n if (itx->index < ity->index) {\n *itz = itx->value;\n ++itx;\n } else if (itx->index > ity->index) {\n *itz = ity->value;\n ++ity;\n } else {\n *itz = alpha * itx->value + ity->value;\n ++itx;\n ++ity;\n }\n ++itz;\n }\n while (itx != x.end()) {\n *itz = itx->value;\n ++itx;\n ++itz;\n }\n while (ity != y.end()) {\n *itz = ity->value;\n ++ity;\n ++itz;\n }\n}\n", + ".", + "\n\n // Initialize z with zeros\n z.assign(z.size(), 0);\n\n // Iterate over x and y\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n // If the indices match, add the values and increment both i and j\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n }\n // If the index of x is smaller, add the value to z and increment i\n else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n // If the index of y is smaller, add the value to z and increment j\n else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n\n // Add the remaining elements of x\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n\n // Add the remaining elements of y\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n z.clear();\n z.resize(x.size() + y.size());\n size_t i = 0, j = 0, k = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z[k].index = x[i].index;\n z[k].value = alpha * x[i].value;\n ++i;\n } else if (x[i].index > y[j].index) {\n z[k].index = y[j].index;\n z[k].value = y[j].value;\n ++j;\n } else {\n z[k].index = x[i].index;\n z[k].value = alpha * x[i].value + y[j].value;\n ++i;\n ++j;\n }\n ++k;\n }\n while (i < x.size()) {\n z[k].index = x[i].index;\n 
z[k].value = alpha * x[i].value;\n ++i;\n ++k;\n }\n while (j < y.size()) {\n z[k].index = y[j].index;\n z[k].value = y[j].value;\n ++j;\n ++k;\n }\n z.resize(k);\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n size_t m = z.size();\n for (size_t i = 0, j = 0, k = 0; i < m; ++i) {\n if (j < x.size() && x[j].index == i) {\n z[i] += alpha * x[j].value;\n ++j;\n }\n if (k < y.size() && y[k].index == i) {\n z[i] += y[k].value;\n ++k;\n }\n }\n}\n", + "\n\n for (size_t i = 0, j = 0; i < x.size() || j < y.size();) {\n if (j == y.size() || (i < x.size() && x[i].index < y[j].index)) {\n z[x[i].index] += alpha * x[i].value;\n ++i;\n } else if (i == x.size() || (j < y.size() && y[j].index < x[i].index)) {\n z[y[j].index] += y[j].value;\n ++j;\n } else {\n z[x[i].index] += alpha * x[i].value + y[j].value;\n ++i;\n ++j;\n }\n }\n}\n", + "\n\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n } else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n i++;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n j++;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n z.clear();\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z.push_back({x[i].index, alpha * x[i].value + y[j].value});\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z.push_back({x[i].index, alpha * x[i].value});\n i++;\n } else {\n z.push_back({y[j].index, y[j].value});\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back({x[i].index, alpha * x[i].value});\n i++;\n }\n while (j < y.size()) {\n z.push_back({y[j].index, y[j].value});\n j++;\n }\n}\n", + "\n\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n } else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n i++;\n } else if (x[i].index > y[j].index) {\n 
z.push_back(y[j].value);\n j++;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n size_t x_size = x.size();\n size_t y_size = y.size();\n size_t z_size = 0;\n size_t x_index = 0;\n size_t y_index = 0;\n while (x_index < x_size && y_index < y_size) {\n if (x[x_index].index == y[y_index].index) {\n z.push_back(alpha * x[x_index].value + y[y_index].value);\n z_size++;\n x_index++;\n y_index++;\n } else if (x[x_index].index < y[y_index].index) {\n z.push_back(alpha * x[x_index].value);\n z_size++;\n x_index++;\n } else {\n z.push_back(y[y_index].value);\n z_size++;\n y_index++;\n }\n }\n while (x_index < x_size) {\n z.push_back(alpha * x[x_index].value);\n z_size++;\n x_index++;\n }\n while (y_index < y_size) {\n z.push_back(y[y_index].value);\n z_size++;\n y_index++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n auto itx = x.begin();\n auto ity = y.begin();\n while (itx != x.end() && ity != y.end()) {\n if (itx->index == ity->index) {\n z.push_back(alpha * itx->value + ity->value);\n ++itx;\n ++ity;\n } else if (itx->index < ity->index) {\n z.push_back(alpha * itx->value);\n ++itx;\n } else {\n z.push_back(ity->value);\n ++ity;\n }\n }\n while (itx != x.end()) {\n z.push_back(alpha * itx->value);\n ++itx;\n }\n while (ity != y.end()) {\n z.push_back(ity->value);\n ++ity;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "hip", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors of size Nx and Ny. Store the result in z.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x or y.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\n__global__ void sparseAxpy(double alpha, const Element *x, const Element *y, double *z, size_t Nx, size_t Ny, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < Nx) {\n size_t index = x[tid].index;\n z[index] = alpha * x[tid].value + y[index];\n } else if (tid < Nx + Ny) {\n size_t index = y[tid - Nx].index;\n z[index] = alpha * 0 + y[tid - Nx].value;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = 0;\n\t\tfor (size_t j = 0; j < Nx; j++) {\n\t\t\tif (x[j].index == i) {\n\t\t\t\tz[i] += alpha * x[j].value;\n\t\t\t}\n\t\t}\n\t\tfor (size_t j = 0; j < Ny; j++) {\n\t\t\tif (y[j].index == i) {\n\t\t\t\tz[i] += y[j].value;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n int i;\n for (i = tid; i < N; i += blockDim.x * gridDim.x) {\n if (i < Nx) {\n z[x[i].index] += alpha * x[i].value;\n }\n if (i < Ny) {\n z[y[i].index] += y[i].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double val = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) val += alpha * x[i].value;\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) val += y[i].value;\n }\n z[tid] = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double val = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n val += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n val += y[i].value;\n }\n }\n z[tid] = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = alpha * x[i].value + y[i].value;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < Nx) {\n\t\tz[x[tid].index] += alpha * x[tid].value;\n\t}\n\telse if (tid < Nx + Ny) {\n\t\tz[y[tid - Nx].index] += y[tid - Nx].value;\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = 
alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) z[i] += alpha * x[j].value;\n }\n for (size_t j = 0; j < N; j++) {\n if (y[j].index == i) z[i] += y[j].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n\tsize_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tdouble sum = 0;\n\t\tfor (size_t i = 0; i < Nx; i++) {\n\t\t\tif (x[i].index == tid) {\n\t\t\t\tsum += alpha * x[i].value;\n\t\t\t}\n\t\t}\n\t\tfor (size_t i = 0; i < Ny; i++) {\n\t\t\tif (y[i].index == tid) {\n\t\t\t\tsum += y[i].value;\n\t\t\t}\n\t\t}\n\t\tz[tid] = sum;\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n \n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& e : x) {\n z_size = std::max(z_size, e.index + 1);\n }\n for (auto const& e : y) {\n z_size = std::max(z_size, e.index + 1);\n }\n \n // Allocate memory for the output vector\n z.resize(z_size);\n \n // Compute the output vector on rank 0\n if (rank == 0) {\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n }\n \n // Broadcast the output vector to all other ranks\n MPI_Bcast(&z[0], z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the partial sums\n std::vector partial_sums(z_size);\n for (auto const& element : x) {\n partial_sums[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_sums[element.index] += element.value;\n }\n\n // Reduce the partial sums\n std::vector global_sums(z_size);\n MPI_Allreduce(partial_sums.data(), global_sums.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (my_rank == 0) {\n z = global_sums;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the size of the result vector\n size_t zSize = 0;\n for (auto const& element : x) {\n zSize = std::max(zSize, element.index + 1);\n }\n for (auto const& element : y) {\n zSize = std::max(zSize, element.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(zSize);\n\n // Compute the result vector\n for (auto const& element : x) {\n z[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n z[element.index] += element.value;\n }\n\n // Gather the result vector on rank 0\n if (myRank == 0) {\n for (int i = 1; i < numRanks; ++i) {\n std::vector recvBuffer;\n MPI_Recv(&recvBuffer[0], recvBuffer.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recvBuffer.size(); ++j) {\n z[j] += recvBuffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n std::vector local_x, local_y;\n \n // Scatter x and y to all ranks\n MPI_Scatter(&x[0], x.size(), MPI_DOUBLE, &local_x[0], local_x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(&y[0], y.size(), MPI_DOUBLE, &local_y[0], local_y.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n \n // Compute local z\n std::vector local_z(z.size(), 0);\n for (size_t i = 0; i < local_x.size(); ++i) {\n local_z[local_x[i].index] += alpha * 
local_x[i].value;\n }\n for (size_t i = 0; i < local_y.size(); ++i) {\n local_z[local_y[i].index] += local_y[i].value;\n }\n \n // Gather local z to rank 0\n MPI_Gather(&local_z[0], local_z.size(), MPI_DOUBLE, &z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of elements in the sparse vector\n size_t num_elements = x.size();\n\n // Create a vector of zeros to store the result\n z.resize(num_elements);\n\n // Loop over the elements in the sparse vector\n for (size_t i = 0; i < num_elements; i++) {\n // Get the index and value of the element\n size_t index = x[i].index;\n double value = x[i].value;\n\n // Add the value of the element to the result\n z[index] += alpha * value;\n }\n\n // Loop over the elements in the sparse vector\n for (size_t i = 0; i < num_elements; i++) {\n // Get the index and value of the element\n size_t index = y[i].index;\n double value = y[i].value;\n\n // Add the value of the element to the result\n z[index] += value;\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements = x.size();\n\n // Create a buffer to store the result on each rank\n std::vector local_z(num_elements, 0.0);\n\n // Compute the local result\n for (size_t i = 0; i < num_elements; ++i) {\n local_z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_z.data(), z.data(), num_elements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n std::vector result;\n\n // Iterate over the elements in x and y\n for (size_t i = 0; i < num_elements_x; i++) {\n // Get the index and value of the current element in x\n size_t index_x = x[i].index;\n double value_x = x[i].value;\n\n // Iterate over the elements in y\n for (size_t j = 0; j < num_elements_y; j++) {\n // Get the index and value of the current element in y\n size_t index_y = y[j].index;\n double value_y = y[j].value;\n\n // If the indices of the current elements in x and y are equal, then add the values of the elements to the result\n if (index_x == index_y) {\n result.push_back({index_x, alpha * value_x + value_y});\n }\n }\n }\n\n // Sort the result vector by index\n std::sort(result.begin(), result.end(), [](Element const& a, Element const& b) { return a.index < b.index; });\n\n // Store the result in z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(result.size());\n for (size_t i = 0; i < result.size(); i++) {\n z[i] = result[i].value;\n }\n }\n}\n", + "\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the output vector\n z.resize(z_size, 0.0);\n\n // Compute the partial sums\n std::vector partial_sums(z_size, 0.0);\n for (auto const& element : x) {\n partial_sums[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_sums[element.index] += element.value;\n }\n\n // Reduce the partial sums\n std::vector global_sums(z_size, 0.0);\n MPI_Allreduce(partial_sums.data(), global_sums.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result in z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t resultSize = 0;\n for (auto const& element : x) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n for (auto const& element : y) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n\n // Create a vector to store the result on rank 0\n std::vector result(resultSize, 0);\n\n // Compute the result on each rank\n for (auto const& elementX : x) {\n result[elementX.index] += alpha * elementX.value;\n }\n for (auto const& elementY : y) {\n result[elementY.index] += elementY.value;\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recvBuffer(resultSize);\n MPI_Recv(recvBuffer.data(), resultSize, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < resultSize; j++) {\n result[j] += recvBuffer[j];\n }\n }\n z = result;\n } else {\n MPI_Send(result.data(), resultSize, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t result_size = 0;\n for (auto const& element : x) {\n result_size = std::max(result_size, element.index + 1);\n }\n for (auto const& element : y) {\n result_size = std::max(result_size, element.index + 1);\n }\n\n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(result_size, 0);\n }\n\n // Compute the partial result on each rank\n std::vector partial_result(result_size, 0);\n for (auto const& element : x) {\n partial_result[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_result[element.index] += element.value;\n }\n\n // Reduce the partial results to rank 0\n MPI_Reduce(partial_result.data(), z.data(), result_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the partial sum on each rank\n for (auto const& elem : x) {\n z[elem.index] += alpha * elem.value;\n }\n for (auto const& elem : y) {\n z[elem.index] += elem.value;\n }\n\n // Reduce the partial sums to rank 0\n MPI_Reduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the number of elements in the result vector\n size_t num_elements = x.size() + y.size();\n \n // Allocate memory for the result vector\n z.resize(num_elements);\n \n // Compute the local indices of the elements in the result vector\n std::vector local_indices(num_elements);\n for (size_t i = 0; i < num_elements; ++i) {\n local_indices[i] = i;\n }\n \n // Compute the local values of the elements in the result vector\n std::vector local_values(num_elements);\n for (size_t i = 0; i < num_elements; ++i) {\n local_values[i] = 0.0;\n }\n \n // Compute the global indices of the elements in the result vector\n std::vector global_indices(num_elements);\n MPI_Allgatherv(&local_indices[0], local_indices.size(), MPI_UNSIGNED_LONG, &global_indices[0], nullptr, nullptr, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n 
\n // Compute the global values of the elements in the result vector\n std::vector global_values(num_elements);\n MPI_Allgatherv(&local_values[0], local_values.size(), MPI_DOUBLE, &global_values[0], nullptr, nullptr, MPI_DOUBLE, MPI_COMM_WORLD);\n \n // Compute the final values of the elements in the result vector\n for (size_t i = 0; i < num_elements; ++i) {\n z[global_indices[i]] = alpha * x[global_indices[i]].value + y[global_indices[i]].value;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n \n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(z_size, 0);\n }\n \n // Compute the partial sums on each rank\n for (auto const& element : x) {\n if (rank == 0) {\n z[element.index] += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (rank == 0) {\n z[element.index] += element.value;\n }\n }\n \n // Reduce the partial sums to rank 0\n MPI_Reduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t result_size = 0;\n for (auto const& element : x) {\n result_size = std::max(result_size, element.index + 1);\n }\n for (auto const& element : y) {\n result_size = std::max(result_size, element.index + 1);\n }\n\n // Create a vector to store the result on rank 0\n std::vector result(result_size, 0.0);\n\n // Compute the result on each rank\n for (auto const& element : x) {\n result[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n result[element.index] += element.value;\n }\n\n // Gather the results from all ranks on rank 0\n MPI_Gather(result.data(), result_size, MPI_DOUBLE, z.data(), result_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n int num_elements = x.size();\n\n // Create a buffer to store the results\n std::vector z(num_elements);\n\n // Compute the results in parallel\n #pragma omp parallel for\n for (int i = 0; i < num_elements; i++) {\n z[i] = alpha * x[i].value * y[i].value;\n }\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = x.size() + y.size();\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the local size of the output vector\n size_t local_z_size = z_size / size;\n\n // Compute the starting index of the local output vector\n size_t local_z_start = rank * local_z_size;\n\n // Compute the local output vector\n for (size_t i = local_z_start; i < local_z_start + local_z_size; ++i) {\n z[i] = 0;\n }\n\n // Compute the local contribution to the output vector\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value;\n }\n\n for (size_t i = 0; i < y.size(); ++i) {\n z[y[i].index] += y[i].value;\n }\n\n // Reduce the local output vectors to the global output vector\n MPI_Allreduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 stores the result\n if (rank == 0) {\n // Sort the output vector by index\n std::sort(z.begin(), z.end(), [](const double& a, const 
double& b) { return a.index < b.index; });\n\n // Remove duplicate elements from the output vector\n z.erase(std::unique(z.begin(), z.end()), z.end());\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the size of the result vector\n size_t resultSize = 0;\n for (auto const& element : x) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n for (auto const& element : y) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n\n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(resultSize, 0);\n }\n\n // Compute the partial sums on each rank\n for (auto const& element : x) {\n if (rank == 0) {\n z[element.index] += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (rank == 0) {\n z[element.index] += element.value;\n }\n }\n\n // Reduce the partial sums to rank 0\n MPI_Allreduce(MPI_IN_PLACE, z.data(), resultSize, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a new vector z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(&z[0], z.size(), MPI::DOUBLE, &z[0], z.size(), MPI::DOUBLE, 0);\n}\n", + "\n\n // Create a new vector z with the same size as x\n z.resize(x.size());\n\n // Loop over the elements of x and y and perform the operation z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[i].index = x[i].index;\n z[i].value = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks into rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a new vector z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(&z[0], z.size(), MPI::DOUBLE, &z[0], z.size(), MPI::DOUBLE, 0);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "cuda", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors of size Nx and Ny. Store the result in z.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x or y.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\n__global__ void sparseAxpy(double alpha, const Element *x, const Element *y, double *z, size_t Nx, size_t Ny, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) sum += alpha * x[j].value;\n }\n for (size_t j = 0; j < Ny; j++) {\n if (y[j].index == i) sum += y[j].value;\n }\n z[i] = sum;\n }\n}\n", + "sparseSparseArray(1,", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) { sum += alpha * x[i].value; }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) { sum += y[i].value; }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = alpha * x[i].value + y[i].value;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n\tsize_t i = threadIdx.x;\n\tif (i < Nx) {\n\t\tz[x[i].index] += alpha * x[i].value;\n\t}\n\tif (i < Ny) {\n\t\tz[y[i].index] += y[i].value;\n\t}\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tz[tid].value = alpha * x[tid].value + y[tid].value;\n\t}\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n size_t i = tid;\n double sum = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) {\n sum += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < Ny; j++) {\n if (y[j].index == i) {\n sum += y[j].value;\n }\n }\n z[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + 
"\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) sum += alpha * x[i].value;\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) sum += y[i].value;\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == idx) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == idx) {\n sum += y[i].value;\n }\n }\n z[idx] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < Nx) {\n atomicAdd(&z[x[tid].index], alpha * x[tid].value);\n } else if (tid < Nx + Ny) {\n atomicAdd(&z[y[tid - Nx].index], y[tid - Nx].value);\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Store the result in z on rank 0.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the output vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the corresponding element in x\n for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n break;\n }\n }\n\n // Find the corresponding element in y\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n break;\n }\n }\n\n // Compute the output value\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the output vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buffer;\n MPI_Recv(&recv_buffer[0], recv_buffer.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_buffer.size(); ++j) {\n z[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n 
for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n }\n }\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n }\n }\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the result vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buffer;\n MPI_Recv(&recv_buffer[0], z_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < z_size; ++j) {\n z[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n z.resize(num_elements_x + num_elements_y);\n\n // Loop over the elements in x and y and add them to the result\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x + num_elements_y; i++) {\n if (i < num_elements_x) {\n z[i] += alpha * x[i].value;\n }\n if (i < num_elements_y) {\n z[i] += y[i].value;\n }\n }\n\n // Reduce the result to rank 0\n MPI_Reduce(&z[0], z.size(), MPI_DOUBLE, &z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize z on rank 0\n if (rank == 0) {\n z.resize(x.size());\n }\n\n // Compute the result in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (rank == 0) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n }\n\n // Gather the results to rank 0\n MPI_Gather(z.data(), x.size(), MPI_DOUBLE, z.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n z.resize(num_elements_x + num_elements_y);\n\n // Perform the sparse axpy operation in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x + num_elements_y; ++i) {\n // Initialize the result to 0\n z[i] = 0.0;\n\n // Add the corresponding elements from x and y\n for (size_t j = 0; j < num_elements_x; ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < num_elements_y; ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Initialize the output vector\n z.resize(z_size, 0.0);\n\n // Compute the partial sums\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value;\n }\n\n #pragma omp parallel for\n for (size_t i = 0; i < y.size(); ++i) {\n z[y[i].index] += y[i].value;\n }\n\n // Reduce the partial sums\n MPI_Allreduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a temporary vector to store the partial results on each rank\n std::vector partial_z(z.size(), 0.0);\n\n // Compute the partial results in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n for (auto& element : x) {\n if (element.index == i) {\n partial_z[i] += 
alpha * element.value;\n }\n }\n for (auto& element : y) {\n if (element.index == i) {\n partial_z[i] += element.value;\n }\n }\n }\n\n // Reduce the partial results to rank 0 using MPI\n MPI_Reduce(partial_z.data(), z.data(), z.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = x.size() + y.size();\n\n // Allocate the output vector on rank 0\n if (rank == 0) {\n z.resize(z_size);\n }\n\n // Compute the local size of the output vector\n size_t local_z_size = (z_size + size - 1) / size;\n\n // Allocate the local output vector\n std::vector local_z(local_z_size);\n\n // Compute the local indices of the output vector\n std::vector local_z_indices(local_z_size);\n for (size_t i = 0; i < local_z_size; i++) {\n local_z_indices[i] = i + rank * local_z_size;\n }\n\n // Compute the local contributions to the output vector\n #pragma omp parallel for\n for (size_t i = 0; i < local_z_size; i++) {\n size_t index = local_z_indices[i];\n double value = 0.0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == index) {\n value += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == index) {\n value += y[j].value;\n }\n }\n local_z[i] = value;\n }\n\n // Gather the local contributions to the output vector on rank 0\n MPI_Gather(local_z.data(), local_z_size, MPI_DOUBLE, z.data(), local_z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements = x.size() + y.size();\n\n // Create a vector to store the result\n std::vector result(num_elements);\n\n // Merge the two sparse vectors into the result vector\n size_t i = 0, j = 0, k = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n result[k++] = x[i++];\n } else if (x[i].index > y[j].index) {\n result[k++] = y[j++];\n } else {\n result[k++] = {x[i].index, x[i++].value + y[j++].value};\n }\n }\n while (i < x.size()) {\n result[k++] = x[i++];\n }\n while (j < y.size()) {\n result[k++] = y[j++];\n }\n\n // Reduce the result vector to rank 0\n MPI_Reduce(result.data(), z.data(), num_elements, MPI_DOUBLE_INT, MPI_SUM, 0);\n}\n", + "\n\n // Initialize z to 0\n z.assign(x.size(), 0);\n\n // Compute z = alpha*x+y in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t z_size = x.size() + y.size();\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma openmp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n // Find the index of the element in x that matches the index of the current element in z\n auto it_x = std::find_if(x.begin(), x.end(), [&](Element const& element) { return element.index == i; });\n\n // Find the index of the element in y that matches the index of the current element in z\n auto it_y = std::find_if(y.begin(), y.end(), [&](Element const& element) { return element.index == i; });\n\n // Compute the value of the current element in z\n z[i] = (it_x != x.end() ? 
alpha * it_x->value : 0.0) + (it_y != y.end() ? it_y->value : 0.0);\n }\n\n // Gather the result vector on rank 0\n MPI_Gather(&z[0], z_size, MPI_DOUBLE, &z[0], z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vectors\n size_t n = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(n);\n\n // Compute the results in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results on rank 0\n MPI_Gather(temp.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z on all ranks\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0);\n}\n", + "\n\n // Check if the rank is 0\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n if (rank == 0) {\n // Initialize z with zeros\n z.resize(x.size());\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n }\n }\n\n // Perform sparse axpy operation in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (rank == 0) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t numElements = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(numElements);\n\n // Compute the result in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < numElements; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(temp.data(), numElements, MPI_DOUBLE, z.data(), numElements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t numElements = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(numElements);\n\n // Compute the result in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < numElements; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results on rank 0\n MPI_Gather(&temp[0], numElements, MPI_DOUBLE, &z[0], numElements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast alpha to all ranks\n MPI_Bcast(&alpha, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the output vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(z_size);\n }\n\n // Compute the output vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double value = 0.0;\n for (auto const& element : x) {\n if (element.index == i) {\n value += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (element.index == i) {\n value += element.value;\n }\n }\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z[i] = value;\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n 
for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the value of x at index i\n for (auto const& elem : x) {\n if (elem.index == i) {\n x_value = elem.value;\n break;\n }\n }\n\n // Find the value of y at index i\n for (auto const& elem : y) {\n if (elem.index == i) {\n y_value = elem.value;\n break;\n }\n }\n\n // Compute the value of z at index i\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the result vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::vector recv_buf;\n MPI_Recv(&recv_buf[0], recv_buf.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_buf.size(); ++j) {\n z[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the result vector\n z.resize(z_size, 0.0);\n\n // Compute the partial result on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the value of x at index i\n for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n break;\n }\n }\n\n // Find the value of y at index i\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n break;\n }\n }\n\n // Compute the partial result\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the partial results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::vector partial_result(z_size);\n MPI_Recv(partial_result.data(), z_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < z_size; ++j) {\n z[j] += partial_result[j];\n }\n }\n } else {\n MPI_Send(z.data(), z_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n std::vector result(num_elements_x);\n\n // Compute z = alpha*x+y in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x; i++) {\n result[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(result.data(), num_elements_x, MPI_DOUBLE, z.data(), num_elements_x, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(Kokkos::View const& s, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < s.extent(0); ++j) {\n if (s(j)[0] == 'a' + i) {\n ++bins[i];\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < s.extent(0); ++j) {\n if (s(j)[0] == 'a' + i) {\n ++bins[i];\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(0, s.extent(0), [&](size_t i) {\n char first = s(i)[0];\n size_t index = first - 'a';\n ++output(index);\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[s[i][0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins[s[i][0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "Counts, 0 0, 0,0,0, 0,0, ~:~:", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n int idx = s(i)[0] - 'a';\n if (idx >= 0 && idx < 26) {\n ++bins[idx];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s[i][0] - 'a']++;\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": 
"23_histogram_first_letter_counts", + "parallelism_model": "omp", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < 26; ++i) {\n char c = 'a' + i;\n for (auto const& str : s) {\n if (str[0] == c) {\n ++counts[i];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n size_t index = static_cast(s[i][0] - 'a');\n #pragma omp atomic\n bins[index]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n size_t index = s[i][0] - 'a';\n ++ didik[index];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "serial", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. 
Store the output in `bins` array.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n ++bins[str[0] - 'a'];\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n bins[str[0] - 'a']++;\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n ++bins[str[0] - 'a'];\n }\n}\n", + "\n\n for (std::string const& str : s) {\n if (str.empty()) {\n continue;\n }\n char first = str[0];\n if (first >= 'a' && first <= 'z') {\n bins[first - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n bins[str[0] - 'a']++;\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "hip", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\n__global__ void firstLetterCounts(const char **s, size_t N, size_t bins[26]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n char first = s[tid][0];\n atomicAdd(&bins[first - 'a'], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid < N) {\n char c = s[gid][0];\n atomicAdd(&bins[c - 'a'], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int index = s[gid][0] - 'a';\n atomicAdd(&bins[index], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n char first = s[idx][0];\n atomicAdd(&bins[first - 'a'], 1);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int bin = s[gid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n if (idx >= 0 && idx < 26) {\n atomicAdd(&bins[idx], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (size_t i = tid; i < N; i += stride) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n int bin = s[tid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n bins[s[gid][0] - 'a']++;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int index = s[gid][0] - 'a';\n atomicAdd(&bins[index], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) return;\n char first = s[idx][0];\n int offset = first - 'a';\n atomicAdd(&firstCounts[offset], 1);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int bin = s[gid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = 
bid * hipBlockDim_x + tid;\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of s. The result is stored in bins on rank 0.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ": \u51a2, Synag\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2 Saxe\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2zernezerne.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (const auto& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, 
MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet.\n std::array local_bins;\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n local_bins.fill(0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t n = s.size();\n std::array local_bins;\n local_bins.fill(0);\n for (size_t i = 0; i < n; i++) {\n if (rank == i % size) {\n local_bins[s[i][0] - 'a']++;\n }\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 
0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n std::fill(local_bins.begin(), local_bins.end(), 0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings starting with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) continue;\n ++local_bins[str[0] - 'a'];\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = s.size() / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = s.size();\n }\n std::array local_bins = {};\n for (int i = start; i < end; i++) {\n local_bins[s[i][0] - 
'a']++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "cuda", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\n__global__ void firstLetterCounts(const char **s, size_t N, size_t bins[26]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int firstLetter = s[idx][0] - 'a';\n atomicAdd(&bins[firstLetter], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int start = tid * 26 / N;\n int end = (tid + 1) * 26 / N;\n for (int i = start; i < end; i++) {\n bins[i] = 0;\n }\n for (int i = 0; i < N; i++) {\n int idx = s[i][0] - 'a';\n if (idx >= 0 && idx < 26) {\n atomicAdd(&bins[idx], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int firstLetter = s[idx][0] - 'a';\n atomicAdd(&bins[firstLetter], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n char firstLetter = s[idx][0];\n atomicAdd(&bins[firstLetter - 'a'], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int bin = s[tid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n 
}\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx >= N) {\n return;\n }\n bins[s[idx][0] - 'a']++;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int idx = s[tid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of s. The result is stored in bins on rank 0.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n local_bins.fill(0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the first letter of its own strings\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n #pragma omp parallel for\n for 
(size_t i = 0; i < s.size(); i++) {\n local_bins[s[i][0] - 'a']++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk = s.size() / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = s.size();\n }\n std::array local_bins;\n local_bins.fill(0);\n for (int i = start; i < end; i++) {\n local_bins[s[i][0] - 'a']++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins;\n local_bins.fill(0);\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n local_bins[s[i][0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank counts the first letter of its own strings\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = my_rank; i < s.size(); i += num_ranks) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the counts for a subset of the alphabet\n int start = rank * 26 / size;\n int end = (rank + 1) * 26 / size;\n\n std::array local_bins;\n local_bins.fill(0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n int index = s[i][0] - 'a';\n if (index >= start && index < end) {\n local_bins[index]++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the first letters in its portion of the vector\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = rank; i < s.size(); i += size) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce 
the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(std::begin( bins ), std::end( bins ), 0);\n\n // Count the number of words starting with each letter in parallel\n #pragma openmp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n ++ bins[s[i][0] - 'a'];\n }\n\n // Gather the results on rank 0\n MPI_Gather(MPI_IN_PLACE, 26, MPI_UNSIGNED_LONG, bins.data(), 26, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the first letter of the strings in its portion of the vector\n std::array local_bins = {};\n #pragma omp parallel for\n for (size_t i = rank * s.size() / nprocs; i < (rank + 1) * s.size() / nprocs; ++i) {\n local_bins[s[i][0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings in s that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins = {};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": 
"21_histogram_bin_0-100", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(Kokkos::View const& x, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(i / 10) += (x(i) < 100) ? 1 : 0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[x(i) / 10]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA(const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[x(i) / 10]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i / 10] += (x(i) >= 10 * i / 10 && x(i) < 10 * (i / 10 + 1));\n });\n}\n", + "\n\n Kokkos::parallel_for(bins.size(), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j] >= 10 * i && x[j] < 10 * (i + 1)) { bins[i] += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n const double val = x(i);\n update += (val >= 0 && val < 10) + (val >= 10 && val < 20) + (val >= 20 && val < 30) +\n (val >= 30 && val < 40) + (val >= 40 && val < 50) + (val >= 50 && val < 60) +\n (val >= 60 && val < 70) + (val >= 70 && val < 80) + (val >= 80 && val < 90) +\n (val >= 90 && val < 100);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n const double val = x(i);\n bins(val / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n 
Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n const double val = x(i);\n bins(val / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i / 10] += 1;\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n size_t idx = static_cast(x[i] / 10);\n ++counts[idx];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n size_t bin = static_cast(x[i] / 10);\n bins[bin]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)] += 1;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n 
bins[static_cast(x[i] / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "serial", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n std::fill(\u5803.begin(), \u5803.end(), 0);\n for (double const val : x) {\n \u5803[static_cast(val / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double val : x) {\n bins[static_cast(val / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n int i = std::min((int)(v - bins.front()), (int)bins.size() - 1);\n bins[i]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "hip", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use AMD HIP to compute in parallel. 
The kernel is initialized with at least as many threads as values in x.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\n__global__ void binsBy10Count(const double *x, size_t N, size_t bins[10]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipGridDim_x;\n for (size_t i = idx; i < N; i += stride) {\n bins[x[i] / 10] += 1;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipGridDim_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n int bin = (int)(x[i] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n int bin = (int)(x[i] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n int bin = x[i] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n int bin = (int)x[tid] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n bins[(int)(x[i] / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use MPI to compute in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. The result is stored in bins on rank 0.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n bins[bin]++;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv;\n MPI_Recv(recv.data(), recv.size(), MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < bins.size(); j++) {\n bins[j] += recv[j];\n }\n }\n } else {\n MPI_Send(bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of values in each bin\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n\n // Reduce the bins across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local range.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n bins[bin]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of values in each bin on each rank\n std::array local_count = {};\n for (auto v : x) {\n local_count[static_cast(v / 10)]++;\n }\n\n // Gather the local counts into bins on rank 0\n MPI_Gather(local_count.data(), 10, MPI_UNSIGNED_LONG, bins.data(), 10, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its subset of x\n std::array local_bins = {};\n for (double xi : x) {\n local_bins[static_cast(xi / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double xi : x) {\n int bin = static_cast(xi / 10);\n bins[bin]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), 
MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n for (double val : x) {\n int bin = static_cast(val / 10);\n if (bin < 10) {\n bins[bin]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array localBins{};\n for (double v : x) {\n localBins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(localBins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (auto v : x) {\n bins[v / 10]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] >= 0 && x[i] < 10) {\n bins[0]++;\n } else if (x[i] >= 10 && x[i] < 20) {\n bins[1]++;\n } else if (x[i] >= 20 && x[i] < 30) {\n bins[2]++;\n } else if (x[i] >= 30 && x[i] < 40) {\n bins[3]++;\n } else if (x[i] >= 40 && x[i] < 50) {\n bins[4]++;\n } else if (x[i] >= 50 && x[i] < 60) {\n bins[5]++;\n } else if (x[i] >= 60 && x[i] < 70) {\n bins[6]++;\n } else if (x[i] >= 70 && x[i] < 80) {\n bins[7]++;\n } else if (x[i] >= 80 && x[i] < 90) {\n bins[8]++;\n } else if (x[i] >= 90 && x[i] < 100) {\n bins[9]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local part of x.\n std::array localBins = {};\n for (double v : x) {\n localBins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts.\n MPI_Reduce(localBins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its subset of x\n std::array local_bins = {};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), recv_bins.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < bins.size(); j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), local_bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local range.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int 
rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n if (bin < 10) {\n bins[bin]++;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array tmp;\n MPI_Recv(tmp.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; j++) {\n bins[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its own bins.\n std::array local_bins = {};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), recv_bins.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_bins.size(); j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), local_bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "cuda", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use CUDA to compute in parallel. 
The kernel is initialized with at least as many threads as values in x.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\n__global__ void binsBy10Count(const double *x, size_t N, size_t bins[10]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x;\nif (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = (int)floor(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of values in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of values in its local portion of x\n std::array local_bins;\n std::fill(local_bins.begin(), local_bins.end(), 0);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce local bins to global bins\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins = {};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the bin size\n const double bin_size = (x.back() - x.front()) / bins.size();\n\n // Compute the local histogram\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n const double value = x[i];\n const int bin_index = std::min(static_cast(bins.size() - 1),\n static_cast(std::floor((value - x.front()) / bin_size)));\n ++local_bins[bin_index];\n }\n\n // Reduce the local histograms to the global histogram\n MPI_Reduce(MPI_IN_PLACE, local_bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ", 10]", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double value : x) {\n local_bins[static_cast(value / 10)]++;\n }\n\n // Reduce the local counts into the global bins array on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n int bin = static_cast(x[i] / 10);\n if (bin < 10) {\n #pragma omp atomic\n ++bins[bin];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::array local_bins = {};\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n int bin = static_cast(x[i] / 10);\n local_bins[bin]++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the counts for a subset of the bins.\n size_t start_bin = rank * bins.size() / nprocs;\n size_t end_bin = (rank + 1) * bins.size() / nprocs;\n\n std::fill(bins.begin() + start_bin, bins.begin() + end_bin, 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n size_t bin = static_cast(x[i] / 10);\n if (bin >= start_bin && bin < end_bin) {\n ++bins[bin];\n }\n }\n\n // Gather the counts from all ranks into bins on rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n int bin = static_cast(x[i]) / 10;\n #pragma omp atomic\n ++bins[bin];\n }\n\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::array recv;\n MPI_Recv(recv.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; ++j) {\n bins[j] += recv[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its portion of x\n std::array local_bins = {0};\n #pragma omp parallel for\n for (size_t i = rank * x.size() / size; i < (rank + 1) * x.size() / size; ++i) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the local counts into the global bins on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; ++i) {\n std::array recv_bins;\n 
MPI_Recv(recv_bins.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of values in its local portion of x\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins = {};\n for (size_t i = rank; i < x.size(); i += nprocs) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x\n std::array local_bins = {0};\n #pragma omp parallel for\n for (size_t i = rank; i < x.size(); i += nprocs) {\n int bin = static_cast(x[i] / 10);\n local_bins[bin]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use Kokkos to count in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(Kokkos::View const& points, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_for;\n using Kokkos::Functor;\n\n struct CountQuadrants {\n Kokkos::View bins;\n Kokkos::View points;\n\n KOKKOS_FUNCTION\n void operator()(const int i) const {\n const auto& point = points[i];\n if (point.x > 0) {\n if (point.y > 0) {\n ++bins[0];\n } else {\n ++bins[2];\n }\n } else {\n if (point.y > 0) {\n ++bins[1];\n } else {\n ++bins[3];\n }\n }\n }\n };\n\n parallel_for(Kokkos::RangePolicy(0, points.size()), CountQuadrants{bins, points});\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA (const size_t& i, size_t& update) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++update[0];\n } else {\n ++update[3];\n }\n } else {\n if (points(i)->y > 0) {\n ++update[1];\n } else {\n ++update[2];\n }\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.size(), KOKKOS_LAMBDA (const size_t i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x >= 0 && y >= 0) {\n ++bins(0);\n } else if (x < 0 && y >= 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "{x=1.5,y=1.1}", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x >= 0) {\n if (y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n const Point& p = points(i);\n if (p.x > 0) {\n if (p.y > 0) {\n ++update[0];\n } else {\n ++update[3];\n }\n } else {\n if (p.y > 0) {\n ++update[1];\n } else {\n ++update[2];\n }\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n double x = points[i]->x;\n double y = points[i]->y;\n if (x >= 0 && y >= 0) {\n ++update;\n } 
else if (x < 0 && y >= 0) {\n ++update;\n } else if (x < 0 && y < 0) {\n ++update;\n } else {\n ++update;\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = points(i);\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", Kokkos::RangePolicy<>(0, points.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x > 0 && y > 0) {\n ++bins(0);\n } else if (x < 0 && y > 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else if (x > 0 && y < 0) {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy<>(0, points.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x > 0 && y > 0) {\n ++bins(0);\n } else if (x < 0 && y > 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else if (x > 0 && y < 0) {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n size_t idx = 0;\n if (x >= 0) {\n idx += 1;\n }\n if (y >= 0) {\n idx += 2;\n }\n ++bins(idx);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n size_t bin = 0;\n if (x > 0) {\n bin += 1;\n }\n if (y > 0) {\n bin += 2;\n }\n Kokkos::atomic_fetch_add(&bins(bin), 1);\n });\n}\n", + "\n\n using exec_space = KokKos::DefaultExecutionSpace;\n using league_space = KokKos::TeamVectorRange;\n\n league_space range(exec_space{}, points.size());\n\n auto policy = KokKos::TeamVectorRange\u653f\u7b56(range);\n\n auto lambda = KOKKOS_LAMBDA (const Point point, size_t* count) {\n if (point.x > 0) {\n if (point.y > 0) {\n ++count[0];\n } else {\n ++count[3];\n }\n } else {\n if (point.y > 0) {\n ++count[1];\n } else {\n ++count[2];\n }\n }\n };\n\n exec_space exec;\n exec.parallel_reduce(\n \"count_quadrants\", policy, lambda, Rcpp::as(count));\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy>(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = *points(i);\n if (p.x > 0) {\n if (p.y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. 
The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use OpenMP to count in parallel.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const Point& p = points[i];\n if (p.x >= 0 && p.y >= 0) {\n ++bins[0];\n } else if (p.x < 0 && p.y >= 0) {\n ++bins[1];\n } else if (p.x < 0 && p.y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); i++) {\n const auto& p = points[i];\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n 
if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++oient[0];\n } else {\n ++oient[3];\n }\n } else {\n if (points[i].y > 0) {\n ++oient[1];\n } else {\n ++oient[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n double x = points[i].x;\n double y = points[i].y;\n if (x > 0) {\n if (y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "serial", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. 
The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (const Point& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (const auto& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (point.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (Point const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (Point const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else if (p.x < 0) {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else if 
(p.y < 0) {\n bins[3]++;\n }\n } else if (p.x < 0) {\n if (p.y > 0) {\n bins[1]++;\n } else if (p.y < 0) {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& pt : points) {\n if (pt.x > 0) {\n if (pt.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (pt.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "(", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "hip", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use AMD HIP to count in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\n__global__ void countQuadrants(const Point *points, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n while (idx < N) {\n if (points[idx].x >= 0) {\n if (points[idx].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[idx].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n idx += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int quadrant = 0;\n if (points[gid].x > 0) {\n quadrant += 1;\n }\n if (points[gid].y > 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int quadrant = 0;\n if (points[tid].x > 0) {\n quadrant += 1;\n }\n if (points[tid].y > 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid >= N) return;\n const Point &p = points[tid];\n int quadrant = 0;\n if (p.x > 0) quadrant |= 1;\n if (p.y > 0) quadrant |= 2;\n atomicAdd(&bins[quadrant], 1);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n const Point &p = points[gid];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n Point p = points[tid];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if 
(points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "(4)", + "(4)", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n if (idx >= N) {\n return;\n }\n\n const Point &p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int blockId = hipBlockIdx_x;\n int blockDim = hipBlockDim_x;\n\n for (int i = tid; i < N; i += blockDim) {\n if (points[i].x > 0 && points[i].y > 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y > 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[i].x > 0 && points[i].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n const Point p = points[gid];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 
1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[i].x >= 0 && points[i].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n Point p = points[tid];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = tid + bid * hipBlockDim_x;\n if (idx >= N) {\n return;\n }\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use MPI to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The result is stored in bins on rank 0.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its own quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (const auto& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : 
points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0 && point.y >= 0) {\n local_bins[0]++;\n } else if (point.x < 0 && point.y >= 0) {\n local_bins[1]++;\n } else if (point.x >= 0 && point.y < 0) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (auto const& p : points) {\n int quadrant = 0;\n if (p.x > 0) quadrant += 1;\n if (p.y > 0) quadrant += 2;\n bins[quadrant]++;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array tmp;\n MPI_Recv(tmp.data(), tmp.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(bins.data(), bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_bins[4] = {0};\n for (Point const& p : points) {\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for 
(size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y > 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y > 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 
0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant.\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array localBins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n localBins[0]++;\n } else {\n localBins[3]++;\n }\n } else {\n if (point.y > 0) {\n localBins[1]++;\n } else {\n localBins[2]++;\n }\n }\n }\n\n MPI_Reduce(localBins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "cuda", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use CUDA to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\n__global__ void countQuadrants(const Point *points, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int bin = 0;\n if (points[tid].x > 0) bin += 1;\n if (points[tid].y > 0) bin += 2;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int bin = 0;\n if (points[tid].x >= 0) {\n bin += 1;\n }\n if (points[tid].y >= 0) {\n bin += 2;\n }\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += gridDim.x * blockDim.x;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n while (idx < N) {\n if (points[idx].x >= 0 && points[idx].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[idx].x < 0 && points[idx].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[idx].x < 0 && points[idx].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[idx].x >= 0 && points[idx].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n idx += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n int quadrant = 0;\n if (p.x > 0) {\n quadrant |= 1;\n }\n if (p.y > 0) {\n quadrant |= 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n Point p = points[idx];\n int quadrant = 0;\n if (p.x >= 0) {\n quadrant += 2;\n }\n if (p.y >= 0) {\n quadrant += 1;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else 
{\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx >= N) {\n return;\n }\n\n const Point &p = points[idx];\n int quadrant = 0;\n if (p.x >= 0) {\n quadrant += 1;\n }\n if (p.y >= 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant - 1], 1);\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[1], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n const Point p = points[tid];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n if (idx >= N) {\n return;\n }\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n while (i < N) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[i].y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += gridDim.x * blockDim.x;\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int count = 0;\n for (int i = idx; i < N; i += blockDim.x) {\n if (points[i].x > 0 && points[i].y > 0) {\n count++;\n } else if (points[i].x < 0 && points[i].y > 0) {\n count++;\n } else if (points[i].x < 0 && points[i].y < 0) {\n count++;\n } else if (points[i].x > 0 && points[i].y < 0) {\n count++;\n }\n }\n bins[idx] = count;\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n 
atomicAdd(&bins[3], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use MPI and OpenMP to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. The result is stored in bins on rank 0.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); i++) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to zero\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n ++bins[0];\n } else if (points[i].x < 0 && points[i].y >= 0) {\n ++bins[1];\n } else if (points[i].x < 0 && points[i].y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the vector\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n 
local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& point = points[i];\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+:local_bins)\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n ++local_bins[0];\n } else if (points[i].x < 0 && points[i].y >= 0) {\n ++local_bins[1];\n } else if (points[i].x < 0 && points[i].y < 0) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n Point p = points[i];\n if (p.x >= 0 && p.y >= 0) {\n local_bins[0]++;\n } else if (p.x < 0 && p.y >= 0) {\n local_bins[1]++;\n } else if (p.x < 0 && p.y < 0) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to zero\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const Point& point = points[i];\n if (point.x >= 0 && point.y >= 0) {\n ++bins[0];\n } else if (point.x < 0 && point.y >= 0) {\n ++bins[1];\n } else if (point.x < 0 && point.y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local subset of points\n std::array local_bins = {0, 0, 0, 0};\n #pragma opm parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n if (points[i].x > 0 && points[i].y > 0) {\n ++local_bins[0];\n } else if (points[i].x < 0 && points[i].y > 0) {\n ++local_bins[1];\n } else if (points[i].x < 0 && points[i].y 
< 0) {\n ++local_bins[2];\n } else if (points[i].x > 0 && points[i].y < 0) {\n ++local_bins[3];\n }\n }\n\n // Gather the local counts into bins on rank 0\n MPI_Gather(local_bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for reduction(+: bins)\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the points in its portion of the image\n #pragma omp parallel for reduction(+: bins)\n for (size_t i = rank; i < points.size(); i += nprocs) {\n const Point& point = points[i];\n if (point.x >= 0) {\n if (point.y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the array\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Use Kokkos to compute in parallel. Assume that Kokkos has already been initialized.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(Kokkos::View const& x, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n const size_t num_elements = x.extent(0);\n Kokkos::parallel_for(\"countQuartiles\", Kokkos::RangePolicy(0, num_elements), KOKKOS_LAMBDA(const size_t i) {\n const double frac = x(i) - floor(x(i));\n if (frac < 0.25) {\n ++quart[0];\n } else if (frac < 0.5) {\n ++quart[1];\n } else if (frac < 0.75) {\n ++quart[2];\n } else {\n ++quart[3];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n\n parallel_for(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i) {\n double frac = x(i) - floor(x(i));\n if (frac < 0.25) {\n ++quart[0];\n } else if (frac < 0.5) {\n ++quart[1];\n } else if (frac < 0.75) {\n ++quart[2];\n } else {\n ++quart[3];\n }\n });\n}\n", + "\n\n const size_t n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n auto exec_space = Kokkos::DefaultExecutionSpace::instance();\n auto policy = Kokkos::RangePolicy<>(exec_space, 0, x.extent(0));\n\n 
Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const size_t& i) {\n double frac = std::modf(x(i), &x(i));\n if (frac < 0.25)\n ++bins(0);\n else if (frac < 0.5)\n ++bins(1);\n else if (frac < 0.75)\n ++bins(2);\n else\n ++bins(3);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double fract = std::fmod(x[i], 1.0);\n if (fract < 0.25) {\n ++bins[0];\n } else if (fract < 0.5) {\n ++bins[1];\n } else if (fract < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "<,\u51a2~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma simd\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = std::fmod(x[i], 1.0);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++(std::get<0>(std::ref(x)));\n } else if (frac < 0.5) {\n ++(std::get<1>(std::ref(x)));\n } else if (frac < 0.75) {\n ++(std::get<2>(std::ref(x)));\n } else {\n ++(std::get<3>(std::ref(x)));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n #pragma omp parallel for\n for (size_t 
i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = std::fmod(x[i], 1.0);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "serial", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double const& xi : x) {\n if (xi < 0.25) {\n ++\u0648\u064a\u0627\u062a[0];\n } else if (xi < 0.5) {\n ++\u0648\u064a\u0627\u062a[1];\n } else if (xi < 0.75) {\n ++\u0648\u064a\u0627\u062a[2];\n } else {\n ++\u0648\u064a\u0627\u062a[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (auto v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n bins[0]++;\n else if (frac < 0.5)\n bins[1]++;\n else if (frac < 0.75)\n bins[2]++;\n else\n bins[3]++;\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n bins[0]++;\n else if (frac < 0.5)\n bins[1]++;\n else if (frac < 0.75)\n 
bins[2]++;\n else\n bins[3]++;\n }\n}\n", + "\n\n bins.fill(0);\n for (const auto &xi : x) {\n const auto frac = std::modf(xi, nullptr);\n if (frac >= 0.25 && frac < 0.5) {\n ++bins[0];\n } else if (frac >= 0.5 && frac < 0.75) {\n ++bins[1];\n } else if (frac >= 0.75 && frac < 1.0) {\n ++bins[2];\n } else if (frac == 0.0) {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (auto const& v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "hip", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\n__global__ void countQuartiles(const double *x, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = hipGetGlobalIndex();\n if (idx >= N) return;\n\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&quartiles[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&quartiles[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&quartiles[2], 1);\n } else {\n atomicAdd(&quartiles[3], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n double frac = fmod(x[i], 1);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n 
} else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n 
atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n double f = fmod(x[idx], 1.0);\n if (f < 0.25)\n atomicAdd(&bins[0], 1);\n else if (f < 0.5)\n atomicAdd(&bins[1], 1);\n else if (f < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid >= N) {\n return;\n }\n\n double frac = x[gid] - floor(x[gid]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use MPI to compute in parallel. Assume that MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double d : x) {\n double frac = fmod(d, 1.0);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Count the number of doubles in each quartile.\n for (double v : x) {\n if (v < 0.25) {\n ++bins[0];\n } else if (v < 0.5) {\n ++bins[1];\n } else if (v < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Gather the counts from all ranks on rank 0.\n MPI_Gather(bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each quartile.\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n for (auto v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of the vector\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n 
bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(&recv_bins, 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(&local_bins, 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each quartile\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the local bins\n std::array local_bins = {0, 0, 0, 0};\n\n // Count the number of doubles in each quartile\n for (double d : x) {\n double frac = fmod(d, 1.0);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local counts\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_bins[4] = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 
0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n MPI_Reduce(local_bins, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n size_t n = x.size();\n std::array local_bins = {0, 0, 0, 0};\n\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n\n for (double val : x) {\n double frac = val - std::floor(val);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, 
MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (auto xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "cuda", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\n__global__ void countQuartiles(const double *x, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n double frac = x[idx] - floor(x[idx]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double frac = x[tid] - floor(x[tid]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n 
atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + ", 4]", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n for (size_t i = tid + bid * stride; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double f = fmod(x[idx], 1.0);\n if (f < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (f < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (f < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use MPI and OpenMP to compute in parallel. Assume that MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each quartile\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n size_t const rank = ::rank();\n size_t const size = x.size();\n std::fill(counts.begin(), counts.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = rank; i < size; i += ::size()) {\n double const value = x[i];\n if (value < 0.25) {\n ++counts[0];\n } else if (value < 0.5) {\n ++counts[1];\n } else if (value < 0.75) {\n ++counts[2];\n } else {\n ++counts[3];\n }\n }\n\n #pragma omp parallel sections\n {\n #pragma omp section\n {\n if (rank == 0) {\n std::fill(counts.begin(), \n counts.end(), \n 0);\n }\n }\n #pragma omp section\n {\n if (rank == 0) {\n std::transform(counts.begin(), \n counts.end(), \n counts.begin(), \n [](size_t const a, size_t const b) {\n return a + b;\n });\n }\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of doubles in its portion of x that have a fractional part in each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < x.size(); i += size) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Rank 0 gathers the local counts from all ranks and stores the result in bins\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n 
++bins[3];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each bin\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each quartile\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the counts to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of elements in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of 
elements in its local vector that fall into each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its local copy of x that fall into each quartile.\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+:local_bins)\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Rank 0 gathers the local counts from all ranks and stores the result in bins.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; ++i) {\n std::array recv_bins;\n MPI_Recv(&recv_bins, 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(&local_bins, 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n 
MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of elements in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the counts from all ranks to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local counts\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Gather the local counts into bins on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of ranks\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < nranks; ++i) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use Kokkos to count in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(Kokkos::View const& image, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.size(), KOKKOS_LAMBDA (const size_t i) {\n ++bins(image(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const size_t& i, size_t& count) {\n count += Kokkos::count(image, i);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n bins[image[i]]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256),\n KOKKOS_LAMBDA(const int& i, size_t& update) {\n update += Kokkos::count(Kokkos::View(image.data(), image.size()), i);\n },\n bins);\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA (const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA(const int& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA(const int& i) {\n ++bins[image(i)];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy<>(0, 256), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy<>(0, image.extent(0)), KOKKOS_LAMBDA (const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = Kokkos::atomic_fetch_add(&bins[image[i]], 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixel_count\", Kokkos::RangePolicy<>(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixel_count\", Kokkos::RangePolicy<>(0, image.size()), KOKKOS_LAMBDA(const int& 
i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i, size_t& sum) {\n sum += Kokkos::count(image, i);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA (const int& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA(const int& i) {\n ++bins(image(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n bins[image[i]]++;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const size_t& i, size_t& count) {\n count += Kokkos::count(Kokkos::ViewConstRange(image, image.extent(0)), i);\n }, bins);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use OpenMP to count in parallel.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); 
i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "serial", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int pixel : image) {\n ++Counts[pixel];\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (auto const& pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "hip", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use AMD HIP to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n__global__ void pixelCounts(const int *image, size_t N, size_t bins[256]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx.x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = image[idx];\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx >= N) return;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n ++bins[image[idx]];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = image[idx];\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) { bins[image[idx]]++; }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (size_t i = idx; i < N; i += stride) {\n atomicAdd(&bins[image[i]], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use MPI to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. 
The result is stored in bins on rank 0.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array localBins{};\n for (int pixel : image) {\n localBins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = localBins;\n for (int i = 1; i < size; i++) {\n std::array recvBins;\n MPI_Recv(recvBins.data(), 256, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recvBins[j];\n }\n }\n } else {\n MPI_Send(localBins.data(), 256, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n for (int pixel : image) {\n bins[pixel]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array localBins;\n for (size_t i = 0; i < image.size(); ++i) {\n localBins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(localBins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = rank; i < image.size(); i += size) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t chunk_size = image.size() / size;\n size_t start = rank * chunk_size;\n size_t end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = image.size();\n }\n std::array local_bins;\n local_bins.fill(0);\n for (size_t i = start; i < end; i++) {\n local_bins[image[i]]++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n size_t n = image.size();\n size_t local_bins[256] = {0};\n for (size_t i = rank; i < n; i += size) {\n local_bins[image[i]]++;\n }\n MPI_Reduce(local_bins, bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = rank; i < image.size(); i += size) {\n bins[image[i]]++;\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "cuda", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use CUDA to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n__global__ void pixelCounts(const int *image, size_t N, size_t bins[256]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n ++bins[image[idx]];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use MPI and OpenMP to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. 
The result is stored in bins on rank 0.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the results from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (size_t i = rank; i < image.size(); i += nprocs) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t const n = image.size();\n size_t const chunk = (n + size - 1) / size;\n size_t const start = rank * chunk;\n size_t const end = std::min(start + chunk, n);\n std::array localBins = {};\n for (size_t i = start; i < end; ++i) {\n ++localBins[image[i]];\n }\n MPI_Reduce(localBins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels with each intensity in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int i = rank; i < image.size(); i += size) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n size_t local_bins[256] = {0};\n #pragma omp parallel for\n for (size_t i = rank * image.size() 
/ nprocs; i < (rank + 1) * image.size() / nprocs; ++i) {\n ++local_bins[image[i]];\n }\n\n // Gather the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins, bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n\n // Reduce the bins from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to all zeros\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of pixels in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the bins across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in each bin on each rank.\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++local_bins[image[i]];\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ".", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n // Use OpenMP to count in parallel.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int i = rank; i < image.size(); i += nprocs) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the results from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image\n std::array local_bins;\n 
local_bins.fill(0);\n #pragma omp parallel for\n for (size_t i = rank * image.size() / nprocs; i < (rank + 1) * image.size() / nprocs; ++i) {\n ++local_bins[image[i]];\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use Kokkos to parallelize the search. Assume Kokkos has already been initialized.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n const size_t n = x.extent(0);\n size_t result = n;\n Kokkos::parallel_reduce(\n RangePolicy<>(0, n),\n KOKKOS_LAMBDA(const size_t i) {\n if (x(i) % 2 == 0) {\n result = std::min(result, i);\n }\n },\n result);\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag, int>;\n auto result = Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, int& result) {\n if (x(i) % 2 == 0) {\n result = i;\n }\n }, result, tag_type());\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even(i) = x(i) % 2 == 0;\n });\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even(i)) {\n firstEven = i;\n }\n }, firstEven);\n return firstEven;\n}\n", + "_", + "\n\n size_t result = x.size();\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(size_t i, Kokkos::Experimental::Automic& atom) {\n if (x(i) % 2 == 0) {\n atom.fetch_min(result, i);\n }\n });\n return result;\n}\n", + "\n\n size_t result = -1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n Kokkos::fence();\n }\n },\n result);\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even(i) = (x(i) % 2 == 0);\n });\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even(i)) {\n firstEven = i;\n }\n }, firstEven);\n return firstEven;\n}\n", + "\n\n size_t result = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n Kokkos::parallel_scan(Kokkos::RangePolicy(i + 1, x.size()), KOKKOS_LAMBDA(const size_t& j, size_t& result) {\n result = 0;\n });\n }\n });\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t result = n;\n Kokkos::parallel_for(\"findFirstEven\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) { result = i; }\n });\n return result;\n}\n", + "\n\n Kokkos::parallel_for(\"findFirstEven\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n Kokkos::atomic_fetch_add(&findFirstEven_result, 1);\n Kokkos::fence();\n if (findFirstEven_result == 1) {\n findFirstEven_index = i;\n Kokkos::fence();\n 
Kokkos::parallel_scan(\"findFirstEven_break\", x.extent(0),\n KOKKOS_LAMBDA(const size_t& i, const size_t& my_result) {\n if (findFirstEven_index == i) {\n return my_result + 1;\n }\n return my_result;\n },\n 0);\n Kokkos::fence();\n Kokkos::parallel_for(\"findFirstEven_break\", x.extent(0),\n KOKKOS_LAMBDA(const size_t& i) {\n if (findFirstEven_index == i) {\n Kokkos::fence();\n Kokkos::break_();\n }\n });\n }\n }\n });\n return findFirstEven_index;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t result = n;\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n\n return result;\n}\n", + "&", + "\n\n size_t N = x.extent(0);\n size_t result = N;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n size_t result = Kokkos::parallel_reduce(\n policy_type(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n },\n x.size());\n return result;\n}\n", + "\n\n size_t N = x.size();\n size_t result = N;\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even[i] = (x[i] % 2 == 0);\n });\n\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even[i] && i < firstEven) {\n firstEven = i;\n }\n }, firstEven);\n\n return firstEven;\n}\n", + "\n\n size_t result = Kokkos::parallel_reduce(\n x.extent(0), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n },\n result);\n return result;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n result = Kokkos::atomic_fetch_add(&result, 1);\n }\n },\n result);\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t firstEven = n;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n firstEven = i;\n return;\n }\n });\n return firstEven;\n}\n", + "\n\n size_t N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n Kokkos::atomic_fetch_max(&findFirstEven_result, i);\n }\n });\n return findFirstEven_result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use OpenMP to parallelize the search.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 
0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n#pragma omp cancel for\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t result = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0 && i < result) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t first_even = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n return first_even;\n}\n", + "\n\n size_t firstEven = -1;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (firstEven == -1) {\n firstEven = i;\n }\n }\n }\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t firstEven = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0 && i < firstEven) {\n#pragma omp critical\n firstEven = i;\n }\n }\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t firstEven = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < firstEven) {\n firstEven = i;\n }\n }\n }\n return firstEven;\n}\n", + "\n\n size_t n = x.size();\n size_t first_even = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n return first_even;\n}\n", + "\n\n size_t index = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < index) {\n index = i;\n }\n }\n }\n return index;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t result = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < result) {\n result = i;\n }\n 
}\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "serial", + "prompt": "/* Return the index of the first even number in the vector x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "hip", + "prompt": "/* Find the index of the first even number in the vector x. Store it in firstEvenIndex.\n Use AMD HIP to parallelize the search. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\n__global__ void findFirstEven(const int *x, size_t N, size_t *firstEvenIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int myIndex = hipGetGlobalIndex();\n if (myIndex < N && x[myIndex] % 2 == 0) {\n *firstEvenIndex = myIndex;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N && x[i] % 2 == 0) {\n *firstEvenIndex = i;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int blockId = hipBlockIdx_x;\n int blockDim = hipBlockDim_x;\n int gridDim = hipGridDim_x;\n\n *firstEvenIndex = N;\n\n if (x[tid] % 2 == 0) {\n atomicMin(firstEvenIndex, tid);\n }\n\n __syncthreads();\n\n if (tid == 0) {\n if (*firstEvenIndex == N) {\n *firstEvenIndex = -1;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = block.thread.x;\n if (tid == 0) {\n *firstEvenIndex = N;\n }\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = min(*firstEvenIndex, tid);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use MPI to parallelize the search. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t n = x.size();\n size_t result = n;\n for (size_t i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, &result, 1, MPI::INT, MPI::MINLOC);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = -1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result = -1;\n MPI_Allgather(&result, 1, MPI_INT, &global_result, 1, MPI_INT, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n for (size_t i = 0; i < x.size(); ++i) {\n if (global_result != -1 && global_result <= i) {\n return global_result;\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return n;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t n = x.size();\n size_t firstEven = n;\n for (size_t i = rank; i < n; i += size) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &firstEven, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t result = -1;\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n size_t firstEven = x.size();\n for (size_t i = myRank; i < x.size(); i += numRanks) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = -1;\n for (size_t i = my_rank; i < x.size(); i += my_size) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t firstEven = -1;\n for (size_t i = rank; i < 
x.size(); i += size) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_result;\n } else {\n return x.size();\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_first_even = -1;\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n local_first_even = i;\n break;\n }\n }\n int global_first_even = -1;\n MPI_Allreduce(&local_first_even, &global_first_even, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_first_even;\n}\n", + "\n\n size_t result = x.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n size_t n = x.size();\n size_t firstEven = n;\n for (size_t i = myRank; i < n; i += MPI::procs()) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_MINLOC, MPI_INT);\n return firstEven;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank searches for the first even number in its portion of the vector.\n size_t my_first_even = x.size();\n for (size_t i = my_rank; i < x.size(); i += num_ranks) {\n if (x[i] % 2 == 0) {\n my_first_even = i;\n break;\n }\n }\n\n // Gather the results from all ranks.\n std::vector first_evens(num_ranks);\n MPI_Gather(&my_first_even, 1, MPI_UNSIGNED_LONG, first_evens.data(), 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Find the smallest index among the gathered results.\n size_t first_even = x.size();\n if (my_rank == 0) {\n for (size_t i = 0; i < num_ranks; ++i) {\n if (first_evens[i] < first_even) {\n first_even = first_evens[i];\n }\n }\n }\n\n return first_even;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t n = x.size();\n size_t local_first_even = n;\n for (size_t i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n local_first_even = i;\n break;\n }\n }\n size_t first_even;\n MPI_Allreduce(&local_first_even, &first_even, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return first_even;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_result;\n } else {\n return x.size();\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = x.size();\n for (size_t i = my_rank; i < x.size(); i += my_size) {\n if (x[i] % 
2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t local_min = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n local_min = i;\n break;\n }\n }\n size_t global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_UNSIGNED_LONG, MPI_MIN,\n MPI_COMM_WORLD);\n return global_min;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "cuda", + "prompt": "/* Find the index of the first even number in the vector x. Store it in firstEvenIndex.\n Use CUDA to parallelize the search. The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\n__global__ void findFirstEven(const int *x, size_t N, size_t *firstEvenIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "(", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "mpi+omp", + "prompt": "#include 
\n#include \n\n/* Return the index of the first even number in the vector x.\n Use MPI and OpenMP to parallelize the search. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n #pragma openmp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma openmp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n int result = -1;\n #pragma omp parallel for reduction(min:result)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (i < result) {\n result = i;\n }\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_INT, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_result = -1;\n int global_result = -1;\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (my_result == -1) {\n my_result = i;\n }\n }\n }\n }\n\n MPI_Allreduce(&my_result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int my_rank, n_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);\n\n size_t first_even = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n }\n\n if (my_rank == 0) {\n MPI_Reduce(&first_even, &first_even, 1, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(&first_even, nullptr, 1, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n\n return first_even;\n}\n", + "\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int my_result = -1;\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n my_result = i;\n break;\n }\n }\n if (my_result != -1) {\n #pragma omp critical\n {\n if 
(result == -1 || my_result < result) {\n result = my_result;\n }\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t result = -1;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (result == -1) {\n result = i;\n }\n }\n break;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "_", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n #pragma omp cancel for\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n size_t first_even = x.size();\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n first_even = i;\n break;\n }\n }\n }\n\n MPI_Allreduce(&first_even, &first_even, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return first_even;\n } else {\n return x.size();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t firstEven = -1;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (firstEven == -1) {\n firstEven = i;\n }\n }\n }\n\n if (rank == 0) {\n MPI_Bcast(&firstEven, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel\n {\n size_t local_result = x.size();\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n local_result = i;\n break;\n }\n }\n #pragma omp critical\n result = std::min(result, local_result);\n }\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = x.size();\n #pragma omp parallel for reduction(min : result)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n\n if (my_rank == 0) {\n return result;\n } else {\n return x.size();\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": 
"cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use Kokkos to search in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(Kokkos::View const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto contains_lambda = KOKKOS_LAMBDA(const int& v) { return v == target; };\n return any(policy_t(x.extent(0)), contains_lambda, x);\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n auto contains = Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) {\n contains |= (x(i) == target);\n },\n false);\n return contains;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto contains_target = Kokkos::scan(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, const bool& contains_target) {\n return contains_target || (x(i) == target);\n },\n false, tag_type());\n\n return contains_target;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, false);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n Kokkos::Scan(policy_type(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& result) {\n if (x(i) == target) { result = 1; }\n }, tag_type());\n\n return Kokkos::Max(Kokkos::View(1)) != 0;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n auto contains_lambda = KOKKOS_LAMBDA(const int& value) { return value == target; };\n return 
Kokkos::any(policy_type(0, x.extent(0)), contains_lambda, x);\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto result = Kokkos::parallel_reduce(\n policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n },\n false, tag_type());\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto contains = Kokkos::scan(policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, const bool& contains_so_far) {\n return contains_so_far || (x(i) == target);\n }, false, tag_type());\n\n return contains;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(\n x.extent(0), KOKKOS_LAMBDA(const int& i, bool& found) {\n if (x(i) == target) { found = true; }\n },\n found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use OpenMP to search in parallel.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n 
bool found = false;\n #pragma omp parallel for shared(found)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n #pragma omp critical\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "serial", + "prompt": "/* Return true if the vector x contains the value `target`. 
Return false otherwise.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "<", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "&", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "hip", + "prompt": "/* Set `found` to true if the vector x contains the value `target`. Set it to false otherwise.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\n__global__ void contains(const int *x, size_t N, int target, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + ", 8, 2, 6, 4, 6", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int offset = bid * hipBlockDim_x + tid;\n if (offset < N && x[offset] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n bool found = false;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (found) {\n return true;\n }\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n found = true;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (found) {\n return true;\n }\n }\n }\n return false;\n}\n", + "\n\n int result = 0;\n for (int i = 0; i < x.size(); i++) {\n result |= (x[i] == target);\n }\n int global_result;\n MPI_Allreduce(&result, 1, MPI_INT, &global_result, 1, MPI_INT, MPI_BOR, MPI_COMM_WORLD);\n if (global_result) {\n return true;\n }\n return false;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int has_target = (std::find(x.begin(), x.end(), target) != x.end());\n int global_has_target;\n MPI_Allreduce(&has_target, &global_has_target, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n return global_has_target;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return global_found;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_contains = 0;\n for (int i = rank; i < n; i += size) {\n if (x[i] == target) {\n local_contains = 
1;\n break;\n }\n }\n\n int global_contains;\n MPI_Allreduce(&local_contains, &global_contains, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_contains;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n bool found = false;\n for (int i = rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n MPI::COMM_WORLD.Allreduce(MPI::IN_PLACE, &found, 1, MPI::BOOL, MPI::LOR);\n return found;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n bool local_contains = false;\n for (int i = start; i < end; i++) {\n if (x[i] == target) {\n local_contains = true;\n break;\n }\n }\n\n bool global_contains;\n MPI_Allreduce(&local_contains, &global_contains, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_contains;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n bool found = false;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool found = std::find(x.begin(), x.end(), target) != x.end();\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "cuda", + "prompt": "/* Set `found` to true if the vector x contains the value `target`. Set it to false otherwise.\n Use CUDA to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\n__global__ void contains(const int *x, size_t N, int target, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n found[0] = true;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n int local_result = 0;\n #pragma omp parallel for reduction(+:local_result)\n for (int i = rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n local_result = 1;\n }\n }\n int global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_result > 0;\n }\n return false;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "(", + "\n\n int rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel shared(result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n result = true;\n break;\n }\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n MPI_Reduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, 
MPI_COMM_WORLD);\n if (rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int found = 0;\n #pragma omp parallel for reduction(+:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = 1;\n break;\n }\n }\n int result;\n MPI_Allreduce(&found, &result, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int found = 0;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = 1;\n }\n }\n int result;\n MPI_Allreduce(&found, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n bool found = false;\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n }\n }\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_result = 0;\n #pragma omp parallel for reduction(||:local_result)\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n local_result = 1;\n }\n }\n int result;\n MPI_Reduce(&local_result, &result, 1, MPI_INT, MPI_LOR, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_size = n / 
MPI_COMM_SIZE;\n int start = my_rank * local_size;\n int end = start + local_size;\n bool found = false;\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_found;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use Kokkos to search in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(Kokkos::View const& x, Kokkos::View const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View found_x(\"found_x\", x.extent(0));\n Kokkos::View found_y(\"found_y\", y.extent(0));\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n found_x(i) = (x(i) == val);\n });\n\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(const int& i) {\n found_y(i) = (y(i) == val);\n });\n\n Kokkos::View found_both(\"found_both\", 1);\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& found_both) {\n found_both |= (found_x(i) && found_y(i));\n });\n\n return !found_both(0);\n}\n", + "\n\n return Kok \u86cb\u7cd5::parallel_reduce(Kok\u86cb\u7cd5::make_range(x.size()),\n Kok\u86cb\u7cd5::plus(),\n [x, y, val](int i) { return (x[i] == val) ^ (y[i] == val); },\n 0);\n}\n", + "\n\n // Create a Kokkos::View of the union of the two input vectors\n Kokkos::View union_xy(\"union_xy\", x.extent(0) + y.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(int i) {\n union_xy(i) = x(i);\n });\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(int i) {\n union_xy(x.extent(0) + i) = y(i);\n });\n\n // Create a Kokkos::View of the intersection of the two input vectors\n Kokkos::View intersection_xy(\"intersection_xy\", x.extent(0) + y.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(int i) {\n for (int j = 0; j < y.extent(0); j++) {\n if (x(i) == y(j)) {\n intersection_xy(i) = x(i);\n break;\n }\n }\n });\n\n // Check if the value is in the union but not in the intersection\n bool found_in_union = false;\n bool found_in_intersection = false;\n Kokkos::parallel_for(union_xy.extent(0), KOKKOS_LAMBDA(int i) {\n if (union_xy(i) == val) {\n found_in_union = true;\n }\n if (intersection_xy(i) == val) {\n found_in_intersection = true;\n }\n });\n\n return found_in_union && !found_in_intersection;\n}\n", + "\n\n using namespace Kokkos;\n const int N = x.extent(0);\n const int M = y.extent(0);\n\n auto lambda = KOKKOS_LAMBDA(const int i) {\n bool found_x = false;\n bool found_y = false;\n for (int j = 0; j < N; ++j) {\n if (x(j) == val) { found_x = true; }\n }\n for (int j = 0; j < M; ++j) {\n if (y(j) == val) { found_y = true; }\n }\n return found_x != found_y;\n };\n\n bool result = parallel_reduce(N, lambda, (bool)false);\n return result;\n}\n", + "\n\n using namespace Kokkos;\n\n auto lambda = KOKKOS_LAMBDA(int i) {\n bool found_x = false;\n bool found_y = false;\n for (int j = 0; j < x.extent(0); ++j) {\n if (x(j) == val) {\n found_x = true;\n }\n }\n for (int j = 0; j < y.extent(0); ++j) {\n if (y(j) == val) 
{\n found_y = true;\n }\n }\n return found_x != found_y;\n };\n\n return parallel_reduce(x.extent(0), lambda, (bool)false);\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = find(x, val) != x.end();\n auto y_contains = find(y, val) != y.end();\n return x_contains ^ y_contains;\n}\n", + "\n\n // Create a Kokkos::View of the result\n Kokkos::View result(\"result\", x.extent(0));\n\n // Use Kokkos to search for `val` in `x` and `y` in parallel\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n result(i) = (x(i) == val) ^ (y(i) == val);\n });\n\n // Reduce the result to a single boolean value\n bool xorContains = Kokkos::reduce(result, Kokkos::And(), true);\n\n return xorContains;\n}\n", + "\n\n Kokkos::View result(\"result\", x.size());\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int i) {\n result[i] = (x[i] == val) ^ (y[i] == val);\n });\n return Kokkos::any(result);\n}\n", + "\n bool found_x = false;\n bool found_y = false;\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (x(i) == val) {\n found_x = true;\n }\n });\n\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (y(i) == val) {\n found_y = true;\n }\n });\n\n return found_x != found_y;\n}\n", + "\n\n Kokkos::View result(\"result\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(int i) {\n result[i] = (x[i] == val) ^ (y[i] == val);\n });\n return Kokkos::any(result);\n}\n", + "\n\n Kokkos::View found_x(\"found_x\", x.extent(0));\n Kokkos::View found_y(\"found_y\", y.extent(0));\n\n Kokkos::parallel_for(\"xorContains_x\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n found_x(i) = (x(i) == val);\n });\n\n Kokkos::parallel_for(\"xorContains_y\", y.extent(0), KOKKOS_LAMBDA(const int i) {\n found_y(i) = (y(i) == val);\n });\n\n bool found_in_x = false;\n bool found_in_y = false;\n\n Kokkos::parallel_reduce(\"xorContains_reduce\", x.extent(0), KOKKOS_LAMBDA(const int i, bool& update) {\n update = update || found_x(i);\n }, found_in_x);\n\n Kokkos::parallel_reduce(\"xorContains_reduce\", y.extent(0), KOKKOS_LAMBDA(const int i, bool& update) {\n update = update || found_y(i);\n }, found_in_y);\n\n return (found_in_x != found_in_y);\n}\n", + "\n\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = Kokkos::parallel_reduce(\n policy_t(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) { contains |= (x(i) == val); },\n false);\n auto y_contains = Kokkos::parallel_reduce(\n policy_t(0, y.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) { contains |= (y(i) == val); },\n false);\n return x_contains ^ y_contains;\n}\n", + "\n\n using namespace Kokkos;\n\n auto lambda = KOKKOS_LAMBDA(const int& x_i, const int& y_i) {\n return (x_i == val) ^ (y_i == val);\n };\n\n auto result = reduce(x.extent(0), lambda, (bool)false);\n\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n auto x_contains = find(x, val, policy_type(0, x.extent(0)));\n auto y_contains = find(y, val, policy_type(0, y.extent(0)));\n return (x_contains != x.extent(0)) ^ (y_contains != y.extent(0));\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& contains) {\n contains |= (x(i) == val);\n }, (int)0);\n\n auto y_contains = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, y.extent(0)), KOKKOS_LAMBDA(int i, int& contains) {\n contains |= (y(i) == val);\n }, (int)0);\n\n return 
(x_contains && !y_contains) || (!x_contains && y_contains);\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int sum) {\n return sum + (x(i) == val);\n }, Kokkos::Sum(0));\n\n auto y_contains = Kokkos::parallel_reduce(y.extent(0), KOKKOS_LAMBDA(int i, int sum) {\n return sum + (y(i) == val);\n }, Kokkos::Sum(0));\n\n return x_contains != y_contains;\n}\n", + "\n\n // Create a Kokkos::View to store the results of the search\n Kokkos::View result(\"result\", x.extent(0));\n\n // Perform the search in parallel\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i) {\n result(i) = (x(i) == val) ^ (y(i) == val);\n });\n\n // Reduce the results to a single boolean value\n bool found = false;\n Kokkos::parallel_reduce(\"xorContains_reduce\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i, bool& found) {\n found |= result(i);\n }, found);\n\n return found;\n}\n", + "\n\n // Create a Kokkos view for the result\n Kokkos::View result(\"result\", x.extent(0));\n\n // Parallel for loop to check if val is in only one of x or y\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n bool found_in_x = false;\n bool found_in_y = false;\n for (int j = 0; j < x.extent(0); ++j) {\n if (x(j) == val) {\n found_in_x = true;\n }\n if (y(j) == val) {\n found_in_y = true;\n }\n }\n result(i) = (found_in_x != found_in_y);\n });\n\n // Reduce the result to a single boolean value\n bool final_result = Kokkos::reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(bool a, bool b) { return a || b; }, result);\n\n return final_result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = find(x, val, policy_t(0, x.extent(0)));\n auto y_contains = find(y, val, policy_t(0, y.extent(0)));\n return (x_contains != x.extent(0)) ^ (y_contains != y.extent(0));\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& contains) {\n if (x(i) == val) { contains = 1; }\n }, Kokkos::Scan::plus());\n\n auto y_contains = Kokkos::parallel_reduce(y.extent(0), KOKKOS_LAMBDA(int i, int& contains) {\n if (y(i) == val) { contains = 1; }\n }, Kokkos::Scan::plus());\n\n return (x_contains == 1) ^ (y_contains == 1);\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use OpenMP to search in parallel.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n #pragma omp critical\n {\n found_x = true;\n }\n }\n }\n #pragma omp for\n for (int i = 0; i < 
y.size(); ++i) {\n if (y[i] == val) {\n #pragma omp critical\n {\n found_y = true;\n }\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n #pragma omp critical\n found_x = true;\n }\n }\n\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n #pragma omp critical\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n #pragma omp critical\n {\n found_x = true;\n }\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n #pragma omp critical\n {\n found_y = true;\n }\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n return x_count != y_count;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return (x_count + y_count) == 1;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == 
val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool in_x = false;\n bool in_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n in_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n in_y = true;\n }\n }\n }\n\n return in_x ^ in_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return x_count != y_count;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n } else {\n found = false;\n #pragma omp cancel for\n }\n }\n }\n }\n #pragma omp parallel for\n for (size_t i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n } else {\n found = false;\n #pragma omp cancel for\n }\n }\n }\n }\n return found;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (size_t i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return x_count != y_count;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n return x_count != y_count;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "serial", + "prompt": "/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n return std::find(x.begin(), x.end(), val) 
!= x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) !=\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) !=\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n bool in_x = false;\n bool in_y = false;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n in_x = true;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n in_y = true;\n }\n }\n return in_x != in_y;\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "hip", + "prompt": "/* Set `found` to true if `val` is only in one of vectors x or y.\n Set it to false if it is in both or neither.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\n__global__ void xorContains(const int *x, const int *y, size_t N, int val, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = tid + block * blockDim;\n int end = min(start + blockDim, N);\n bool found_x = false;\n bool found_y = false;\n for (int i = start; i < end; i++) {\n found_x |= (x[i] == val);\n found_y |= (y[i] == val);\n }\n found[tid] = found_x ^ found_y;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n *found = found_x ^ found_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipGridSize_x) {\n if (x[i] == val) found_x = true;\n if (y[i] == val) found_y = true;\n }\n found[tid] = found_x ^ found_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool x_contains = x[tid] == val;\n bool y_contains = y[tid] == val;\n found[tid] = x_contains ^ y_contains;\n }\n}\n", + ".", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool x_has = (x[tid] == val);\n bool y_has = (y[tid] == val);\n found[tid] = (x_has ^ y_has);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int block_size = hipBlockDim_x;\n int block_id = hipBlockIdx_x;\n int i = block_id * block_size + tid;\n\n if (i < N) {\n bool x_contains = x[i] == val;\n bool y_contains = y[i] == val;\n found[i] = x_contains ^ y_contains;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n *found = inX ^ inY;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n bool found_x = (x[i] == val);\n bool found_y = (y[i] == val);\n found[i] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool found_x = (x[tid] == val);\n bool found_y = (y[tid] == val);\n found[tid] = (found_x ^ found_y);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int idx = tid + N;\n if (idx < 2 * N) {\n *found = (x[tid] == val) || (y[idx - N] == val);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n found[tid] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n *found = (x[gid] == val) ^ (y[gid] == val);\n }\n}\n", 
+ "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = (x_has_val != y_has_val);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool found_x = (x[tid] == val);\n bool found_y = (y[tid] == val);\n found[tid] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = (in_x != in_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n *found = (x[gid] == val) ^ (y[gid] == val);\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Return the result on rank 0.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int myRank, nRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &nRanks);\n\n bool xContains = std::find(x.begin(), x.end(), val) != x.end();\n bool yContains = std::find(y.begin(), y.end(), val) != y.end();\n\n int xContainsInt = xContains ? 1 : 0;\n int yContainsInt = yContains ? 
1 : 0;\n\n int xorContainsInt = xContainsInt ^ yContainsInt;\n\n int globalXorContainsInt;\n MPI_Reduce(&xorContainsInt, &globalXorContainsInt, 1, MPI_INT, MPI_LOR, 0, MPI_COMM_WORLD);\n\n bool globalXorContains = globalXorContainsInt != 0;\n\n if (myRank == 0) {\n return globalXorContains;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int sum;\n MPI_Allreduce(&x_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return sum == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int total_count;\n MPI_Allreduce(&x_count, &total_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &total_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return total_count == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_contains = std::find(x.begin(), x.end(), val) != x.end();\n int y_contains = std::find(y.begin(), y.end(), val) != y.end();\n\n int result = x_contains ^ y_contains;\n 
MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int sum;\n MPI_Allreduce(&x_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return sum == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n\n int global_x_count;\n int global_y_count;\n\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) x_count++;\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) y_count++;\n }\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (global_x_count == 0 && global_y_count > 0) return true;\n if (global_x_count > 0 && global_y_count == 0) return true;\n return false;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) x_count++;\n if (y[i] == val) y_count++;\n }\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n\n int x_count_sum;\n int y_count_sum;\n\n MPI_Allreduce(&x_count, &x_count_sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &y_count_sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (x_count_sum + y_count_sum) % 2 == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int global_x_count;\n int global_y_count;\n\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (global_x_count == 1 || global_y_count == 1);\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, 
&global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "cuda", + "prompt": "/* Set `found` to true if `val` is only in one of vectors x or y.\n Set it to false if it is in both or neither.\n Use CUDA to search in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\n__global__ void xorContains(const int *x, const int *y, size_t N, int val, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool x_has = false;\n bool y_has = false;\n for (int i = 0; i < N; i++) {\n if (x[i] == val) x_has = true;\n if (y[i] == val) y_has = true;\n }\n found[tid] = x_has != y_has;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *found = (*found) ^ (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n found[tid] = (inX ^ inY);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n *found = *found || (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *found = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = x_has_val ^ y_has_val;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + 
"=cernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecerne,cernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecerne)cerne)cerne)cerne)", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = x_has_val ^ y_has_val;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = (in_x ^ in_y);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = (x_has_val != y_has_val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = 0; i < N; i++) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n found[tid] = (found_x != found_y);\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Return the result on rank 0.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int xor_count = x_count + y_count;\n\n int result;\n MPI_Allreduce(&xor_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (global_x_count == 1 || global_y_count == 1) {\n return true;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count;\n int global_y_count;\n MPI_Reduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_x_count != global_y_count;\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int result;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < n; i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int j = 0; j < m; j++) {\n if (y[j] == val) {\n y_count++;\n }\n }\n }\n\n int result = x_count + y_count;\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return result == 1;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel for reduction(+:x_count)\n for (int i = 0; i < n; i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = 0; i < m; i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result = x_count + y_count;\n bool ans = (result == 1);\n\n 
MPI_Reduce(&ans, &ans, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n return ans;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int local_x_count = 0;\n int local_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n local_x_count++;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n local_y_count++;\n }\n }\n #pragma omp critical\n {\n x_count += local_x_count;\n y_count += local_y_count;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(MPI_IN_PLACE, &y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return (x_count == 1 || y_count == 1);\n }\n return false;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n#pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int nt = omp_get_num_threads();\n for (int i = tid; i < n; i += nt) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = tid; i < m; i += nt) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int x_count_global;\n int y_count_global;\n MPI_Allreduce(&x_count, &x_count_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &y_count_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (x_count_global == 1 || y_count_global == 1) {\n return true;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int n_threads = omp_get_num_threads();\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel for\n for (int i = rank; i < n; i += n_threads) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found_x, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n MPI_Allreduce(&found_y, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank searches for `val` in its portion of x and y.\n int my_start = my_rank * n / nprocs;\n int my_end = (my_rank + 1) * n / nprocs;\n bool found_in_x = false;\n bool found_in_y = false;\n #pragma omp parallel for\n for (int i = my_start; i < my_end; i++) {\n if (x[i] == val) {\n found_in_x = true;\n }\n if (y[i] == val) {\n found_in_y = true;\n }\n }\n\n // Reduce the results from all ranks to rank 0.\n bool result;\n MPI_Reduce(&found_in_x, &result, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n MPI_Reduce(&found_in_y, &result, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n\n // Rank 0 returns the final result.\n if (my_rank == 0) {\n return result != (found_in_x && found_in_y);\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n ++x_count;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n ++y_count;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n 
x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int count_x = 0;\n int count_y = 0;\n\n #pragma omp parallel for reduction(+:count_x)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n count_x++;\n }\n }\n\n #pragma omp parallel for reduction(+:count_y)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n count_y++;\n }\n }\n\n int result = count_x + count_y;\n bool ans = (result == 1);\n\n MPI_Allreduce(&ans, &ans, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n return ans;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n #pragma omp parallel\n {\n int my_x_count = 0, my_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) my_x_count++;\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) my_y_count++;\n }\n #pragma omp critical\n {\n x_count += my_x_count;\n y_count += my_y_count;\n }\n }\n int result;\n MPI_Reduce(&x_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (result == 1) return true;\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result = x_count + y_count;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result == 1;\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel reduction(+:x_count, y_count)\n {\n int tid = omp_get_thread_num();\n int nthreads = omp_get_num_threads();\n for (int i = tid; i < x.size(); i += nthreads) {\n if (x[i] == val) {\n x_count++;\n }\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count, global_y_count;\n MPI_Reduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return (global_x_count == 1 || global_y_count == 1);\n }\n return false;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n bool result = false;\n MPI_Allreduce(&found_x, &result, 1, MPI_C_BOOL, MPI_BXOR, MPI_COMM_WORLD);\n MPI_Allreduce(&found_y, &result, 1, MPI_C_BOOL, MPI_BXOR, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return result;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel for 
reduction(+:x_count)\n for (int i = rank; i < x.size(); i += nprocs) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = rank; i < y.size(); i += nprocs) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result;\n MPI_Reduce(&x_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result == 1;\n }\n return false;\n}\n", + "\n\n int x_contains = 0;\n int y_contains = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_contains)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_contains = 1;\n }\n }\n\n #pragma omp for reduction(+:y_contains)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_contains = 1;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_contains, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n MPI_Allreduce(&y_contains, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int local_x_count = 0;\n int local_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n local_x_count++;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n local_y_count++;\n }\n }\n #pragma omp critical\n {\n x_count += local_x_count;\n y_count += local_y_count;\n }\n }\n if (my_rank == 0) {\n return x_count != y_count;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Book {\n const char *title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use Kokkos to search in parallel. 
Assume Kokkos is already initialized.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(Kokkos::View const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using policy_t = Kokkos::RangePolicy;\n using range_t = Kokkos::Range;\n\n size_t result = books.size();\n Kokkos::parallel_reduce(range_t(0, books.size()), KOKKOS_LAMBDA(size_t i, size_t& result) {\n if (books(i)->pages < 100) {\n result = i;\n }\n });\n return result;\n}\n", + "\n\n size_t last_short_book_index = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_short_book_index = books.size() - 1;\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& last_short_book_index) {\n if (books[i]->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_for(books.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t result = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& result) {\n if (books(i)->pages < 100) {\n result = i;\n }\n });\n return result;\n}\n", + "\n\n size_t last_short_book = books.size() - 1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book) {\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n });\n return last_short_book;\n}\n", + "\n\n Kokkos::View lastShortBook(\"lastShortBook\", books.extent(0));\n Kokkos::parallel_for(\"findLastShortBook\", books.extent(0), KOKKOS_LAMBDA(const size_t i) {\n if (books(i).pages < 100) {\n lastShortBook(i) = i;\n }\n });\n size_t result = books.extent(0);\n for (size_t i = books.extent(0) - 1; i >= 0; i--) {\n if (lastShortBook(i) != 0) {\n result = lastShortBook(i);\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t last_short_book = books.size();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& last) {\n if (books[i].pages < 100) {\n last = i;\n }\n }, last_short_book);\n return last_short_book;\n}\n", + "\n\n size_t last_idx = books.size();\n Kokkos::parallel_for(books.size(), KOKKOS_LAMBDA(const size_t& i) {\n if (books[i]->pages < 100) {\n last_idx = i;\n }\n });\n return last_idx;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const int i, const size_t& last) {\n if (books(i).pages < 100) {\n return i;\n }\n return last;\n }, 0);\n}\n", + "\n\n size_t last_index = books.size() - 1;\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books(i).pages >= 100) {\n last_index = i - 1;\n }\n }\n return last_index;\n}\n", + "\n\n size_t last_short_book = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last) {\n if (books(i)->pages < 100) {\n last = i;\n }\n }, last_short_book);\n return last_short_book;\n}\n", + "\n\n size_t last_short_book = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& last_short_book) 
{\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n });\n return last_short_book;\n}\n", + "\n\n size_t last_short_book = books.size();\n\n Kokkos::parallel_reduce(books.extent(0), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book) {\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n }, last_short_book);\n\n return last_short_book;\n}\n", + "\n\n size_t lastShortBook = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& lastShortBook) {\n if (books[i]->pages < 100) {\n lastShortBook = i;\n }\n });\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBook = books.size() - 1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& lastShortBook) {\n if (books(i)->pages < 100) {\n lastShortBook = i;\n }\n });\n return lastShortBook;\n}\n", + "\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_index) {\n if (books(i)->pages < 100) {\n last_index = i;\n }\n });\n return last_index;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n using l_type = long;\n\n auto last_short_book = Kokkos::parallel_scan(\n policy_type(0, books.size()),\n KOKKOS_LAMBDA(const size_t& i, l_type& last) {\n if (books[i]->pages < 100) {\n last = i;\n }\n },\n last_short_book);\n\n return last_short_book;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_scan(policy_type(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n }, tag_type());\n\n return last_short_book_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use OpenMP to search in parallel.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for 
(size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t last_short_book_index = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n last_short_book_index = i;\n }\n }\n return last_short_book_index;\n}\n", + "\n\n size_t last_short = books.size();\n #pragma omp parallel for reduction(max:last_short)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n last_short = i;\n }\n }\n return last_short;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n#pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "_", + "\n\n size_t last_idx = books.size() - 1;\n#pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n last_idx = i - 1;\n break;\n }\n }\n return last_idx;\n}\n", + "\n\n size_t last_short_book_index = books.size() - 1;\n#pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages >= 100) {\n#pragma omp critical\n last_short_book_index = i - 1;\n }\n }\n return last_short_book_index;\n}\n", + "\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n#pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n" + ] + }, + { + "problem_type": "search", + 
"language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "serial", + "prompt": "struct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "hip", + "prompt": "struct Book {\n const char* title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books 
where Book.pages is less than 100.\n Store the result in lastShortBookIndex.\n Use AMD HIP to search in parallel. The kernel is launched with one thread for every book element.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\n__global__ void findLastShortBook(const Book *books, size_t N, size_t *lastShortBookIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use MPI to search in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of books. 
Return the result on rank 0.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return lastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "_", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = -1;\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return lastShortBook;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n lastShortBook = i - 1;\n break;\n }\n }\n\n size_t globalLastShortBook;\n MPI_Allreduce(&lastShortBook, &globalLastShortBook, 1, MPI_UNSIGNED_LONG, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalLastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n for (size_t i = result + 1; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n result = i - 1;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = 0;\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n int globalResult;\n MPI_Allreduce(&result, &globalResult, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalResult;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = 
books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n if (result == books.size()) {\n result = -1;\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "cuda", + "prompt": "struct Book {\n const char* title;\n int pages;\n};\n\n/* Find the index of the last Book item in the vector books where Book.pages is less than 100.\n Store the result in lastShortBookIndex.\n Use CUDA to search in parallel. The kernel is launched with one thread for every book element.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\n__global__ void findLastShortBook(const Book *books, size_t N, size_t *lastShortBookIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n if (books[idx].pages < 100) {\n *lastShortBookIndex = idx;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 
100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid >= N) return;\n\tif (books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx >= N) {\n return;\n }\n if (books[idx].pages < 100) {\n *lastShortBookIndex = idx;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use MPI and OpenMP to search in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of books. Return the result on rank 0.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = 0;\n #pragma omp parallel for reduction(max:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = 0;\n #pragma omp parallel for reduction(max:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n if (rank == 0) {\n return lastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 
100) {\n result = i - 1;\n }\n }\n return result;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_UNSIGNED_LONG, MPI_MAX, MPI_COMM_WORLD);\n return lastShortBook;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n size_t local_result = books.size();\n\n #pragma omp parallel reduction(min: local_result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n local_result = i;\n }\n }\n\n MPI_Allreduce(&local_result, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min: result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n int myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n size_t result = books.size();\n #pragma omp parallel for reduction(min: result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n if (myRank == 0) {\n return result;\n }\n return books.size();\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n size_t rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n\n if (rank == 0) {\n return result;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "_", + "\n\n size_t result = books.size() - 1;\n size_t myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n #pragma openmp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n MPI_Allgather(&result, 1, MPI_UNSIGNED_LONG, &result, 1, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n if (myRank == 0) {\n result = books.size() - 1;\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use Kokkos to search in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n auto min_distance = Kokkos::Max();\n size_t min_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n },\n min_distance, min_index);\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n size_t N = x.extent(0);\n size_t closest_idx = 0;\n double closest_diff = std::abs(x(0) - M_PI);\n Kokkos::parallel_for(\n policy_type(0, N), KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n });\n return closest_idx;\n}\n", + "\n\n const double pi = M_PI;\n Kokkos::View closest_distance = Kokkos::View(1);\n Kokkos::View closest_index = Kokkos::View(1);\n closest_distance[0] = std::numeric_limits::max();\n closest_index[0] = 0;\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double distance = std::abs(x[i] - pi);\n if (distance < closest_distance[0]) {\n closest_distance[0] = distance;\n closest_index[0] = i;\n }\n });\n\n return closest_index[0];\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n size_t closest_index = 0;\n double closest_distance = std::abs(x(0) - M_PI);\n Kokkos::parallel_reduce(policy_t(0, x.size()), KOKKOS_LAMBDA(size_t i, double& closest_distance, size_t& closest_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }, closest_distance, closest_index);\n return closest_index;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n const double pi = M_PI;\n size_t N = x.extent(0);\n size_t closest_idx = 0;\n double min_dist = std::abs(x(0) - pi);\n\n Kokkos::parallel_for(\n policy_t(0, N), KOKKOS_LAMBDA(const size_t& i) {\n double dist = std::abs(x(i) - pi);\n if (dist < min_dist) {\n closest_idx = i;\n min_dist = dist;\n }\n });\n\n return closest_idx;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::Functor;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n struct findToPiFunctor {\n Kokkos::View const x;\n double min_diff;\n int min_index;\n\n findToPiFunctor(Kokkos::View const x) : x(x), min_diff(std::numeric_limits::max()), min_index(-1) {}\n\n KOKKOS_INLINE_FUNCTION\n void operator()(const Kokkos::Work& work) const {\n const int i = work.league_rank();\n const double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n };\n\n findToPiFunctor f(x);\n\n Kokkos::parallel_for(RangePolicy<>(0, x.size()), f);\n\n return f.min_index;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View min_diff = Kokkos::View(1);\n Kokkos::View min_index = Kokkos::View(1);\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff(0) || (diff == min_diff(0) && x(i) > x(min_index(0)))) {\n min_diff(0) = diff;\n min_index(0) = i;\n }\n });\n\n return min_index(0);\n}\n", + "\n\n using namespace Kokkos;\n\n auto closest_idx = 0;\n auto min_diff 
= std::numeric_limits::max();\n auto pi = M_PI;\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& closest_idx) {\n auto diff = std::abs(x(i) - pi);\n if (diff < min_diff) {\n min_diff = diff;\n closest_idx = i;\n }\n }, min_diff, closest_idx);\n\n return closest_idx;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Policy = Kokkos::RangePolicy;\n\n auto min_distance = Kokkos::Max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(Policy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, double& min_distance_local) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < min_distance_local) {\n min_distance_local = distance;\n min_index = i;\n }\n }, min_distance);\n\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n using namespace std;\n\n const double pi = M_PI;\n const size_t n = x.extent(0);\n double min_diff = numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(const size_t& i, double& min_diff_local, size_t& min_idx_local) {\n double diff = abs(x(i) - pi);\n if (diff < min_diff_local) {\n min_diff_local = diff;\n min_idx_local = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n\n size_t N = x.extent(0);\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& min_idx) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n\n size_t N = x.extent(0);\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& min_idx) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x(0) - M_PI);\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < closest_diff) {\n closest_idx = i;\n closest_diff = diff;\n }\n });\n\n return closest_idx;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n auto min_distance = std::numeric_limits::max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(policy_t(0, x.size()), KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n }, min_distance, min_index);\n\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_distance = std::numeric_limits::max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n },\n min_distance, min_index);\n\n return min_index;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::value_type;\n\n size_t N = x.size();\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const size_t& i, value_type& min_diff_local, value_type& min_idx_local) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff_local) 
{\n min_diff_local = diff;\n min_idx_local = i;\n }\n },\n min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n size_t n = x.extent(0);\n size_t closest_index = 0;\n double closest_distance = std::abs(x(0) - M_PI);\n\n Kokkos::parallel_for(\n policy_type(0, n), KOKKOS_LAMBDA(const size_t& i) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n });\n\n return closest_index;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Min;\n\n auto min_dist = Reducer(x.data(), x.size());\n double min_val = min_dist.template reduce();\n\n auto dist_from_pi = Reducer(x.data(), x.size(),\n [min_val](double& a, const double& b) {\n a = std::min(a, std::abs(b - M_PI));\n });\n double closest_dist = dist_from_pi.template reduce();\n\n auto closest_idx = Reducer(x.data(), x.size(),\n [closest_dist, min_val](size_t& a, const double& b) {\n if (std::abs(b - M_PI) == closest_dist &&\n std::abs(b - M_PI) < std::abs(min_val - M_PI)) {\n a = a + 1;\n }\n });\n size_t idx = closest_idx.template reduce();\n\n return idx - 1;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_dist = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(size_t i, size_t& min_idx, double& min_dist) {\n double dist = std::abs(x(i) - M_PI);\n if (dist < min_dist) {\n min_dist = dist;\n min_idx = i;\n }\n }, min_idx, min_dist);\n\n return min_idx;\n}\n", + "\n\n auto closest = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(size_t i, size_t closest) {\n auto diff = std::abs(x(i) - M_PI);\n if (closest == -1 || diff < std::abs(x(closest) - M_PI)) {\n closest = i;\n }\n return closest;\n },\n -1);\n return closest;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use OpenMP to search in parallel.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n#pragma omp critical\n {\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n #pragma omp parallel for reduction(min:min_diff) reduction(min:min_idx)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n\n return min_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::numeric_limits::max();\n\n#pragma omp parallel for\n 
for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n#pragma omp critical\n {\n if (diff < closest_diff) {\n closest_idx = i;\n closest_diff = diff;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_dist = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double dist = std::abs(x[i] - M_PI);\n if (dist < closest_dist) {\n #pragma omp critical\n {\n if (dist < closest_dist) {\n closest_dist = dist;\n closest_idx = i;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma for reduction(min:closest_diff)\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n // Find the index of the value in the vector x that is closest to the math constant PI\n // Use M_PI for the value of PI\n // Use OpenMP to search in parallel\n\n // Get the number of threads to use\n int num_threads = omp_get_num_threads();\n\n // Create a reduction variable to store the index of the closest value\n int closest_index = 0;\n\n // Create a reduction variable to store the smallest difference between a value in x and PI\n double smallest_difference = std::numeric_limits::max();\n\n // Parallel for loop to search for the closest value\n #pragma omp parallel for reduction(min : smallest_difference) reduction(min : closest_index)\n for (int i = 0; i < static_cast(x.size()); i++) {\n // Calculate the difference between the current value and PI\n double difference = std::abs(x[i] - M_PI);\n\n // If the difference is smaller than the smallest difference found so far, update the closest index and smallest difference\n if (difference < smallest_difference) {\n closest_index = i;\n smallest_difference = difference;\n }\n }\n\n // Return the index of the closest value\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n\n#pragma omp parallel for reduction(min : min_diff) reduction(min : min_index)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n#pragma omp parallel for reduction(min : min_diff) reduction(min : min_idx)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp 
parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n #pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double min_diff = std::abs(x[0] - M_PI);\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n min_diff = diff;\n closest_index = i;\n }\n }\n }\n }\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n }\n }\n\n return min_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n #pragma omp critical\n {\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n }\n }\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double min_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n#pragma omp critical\n {\n if (diff < min_diff) {\n closest_index = i;\n min_diff = diff;\n }\n }\n }\n }\n\n return closest_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "serial", + "prompt": "/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t 
findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double minDiff = std::numeric_limits::max();\n size_t minDiffIndex = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n return minDiffIndex;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + ": 2", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_dist = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double dist = std::abs(x[i] - M_PI);\n if (dist < min_dist) {\n min_dist = dist;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", 
+ "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double closest = std::abs(x[0] - M_PI);\n size_t closest_index = 0;\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n return closest_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "hip", + "prompt": "/* Find the index of the value in the vector x that is closest to the math constant PI. Store the index in closestToPiIndex.\n Use M_PI for the value of PI.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\n__global__ void findClosestToPi(const double *x, size_t N, size_t *closestToPiIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n double minDiff = abs(x[tid] - M_PI);\n int minDiffIndex = tid;\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n double diff = abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n\n if (tid == 0) {\n *closestToPiIndex = minDiffIndex;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n double diff = fabs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < minDiff) {\n minDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "losest", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double diff = abs(x[gid] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = gid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n 
if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n double diff = abs(x[gid] - M_PI);\n if (gid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = gid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double diff = fabs(x[gid] - M_PI);\n if (gid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = gid;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n double closest = std::numeric_limits::max();\n size_t closest_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n\n // Reduce the closest values to rank 0\n MPI_Reduce(&closest_index, &closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return closest_index;\n } else {\n return 0;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n size_t closest_index = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_index = i;\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(size);\n MPI_Gather(&closest_index, 1, MPI_UNSIGNED_LONG, closest_indices.data(), 1, MPI_UNSIGNED_LONG, 0);\n\n // Find the overall closest index\n size_t overall_closest_index = 0;\n double overall_closest_diff = std::abs(x[closest_indices[0]] - M_PI);\n for (size_t i = 1; i < size; ++i) {\n double diff = std::abs(x[closest_indices[i]] - M_PI);\n if (diff < overall_closest_diff) {\n overall_closest_diff = diff;\n overall_closest_index = closest_indices[i];\n }\n }\n\n // Return the overall closest index\n return overall_closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n\n int global_min_diff_index;\n MPI_Allreduce(&min_diff_index, &global_min_diff_index, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n\n return global_min_diff_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - 
M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n size_t n = x.size();\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n for (size_t i = 1; i < n; ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t n = x.size();\n size_t closest = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < n; i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest = i;\n }\n }\n return closest;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n int result;\n MPI_Reduce(&min_index, &result, 1, MPI_INT, MPI_MINLOC, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Find the index of the value in the vector that is closest to PI on each rank\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n // Reduce the closest indices to find the global closest index\n size_t globalClosestIndex;\n MPI_Reduce(&closestIndex, &globalClosestIndex, 1, MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the global closest index on rank 0\n if (rank == 0) {\n return globalClosestIndex;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n return result;\n}\n", + "\n\n double closest = std::numeric_limits::max();\n size_t closest_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_idx = i;\n }\n }\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::numeric_limits::max();\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t closest = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest = i;\n }\n }\n return closest;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < 
x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n int global_min_idx;\n MPI_Allreduce(&min_idx, &global_min_idx, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return global_min_idx;\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n double closest = std::numeric_limits::max();\n size_t closest_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n\n // Reduce the closest values to rank 0\n MPI_Reduce(&closest_index, &closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return closest_index;\n } else {\n return 0;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "cuda", + "prompt": "/* Find the index of the value in the vector x that is closest to the math constant PI. Store the index in closestToPiIndex.\n Use M_PI for the value of PI.\n Use CUDA to search in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\n__global__ void findClosestToPi(const double *x, size_t N, size_t *closestToPiIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + 
"\n\n int tid = threadIdx.x;\n double minDiff = abs(x[tid] - M_PI);\n int minDiffIndex = tid;\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n double diff = abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n if (tid == 0) {\n *closestToPiIndex = minDiffIndex;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = fabs(x[i] - M_PI);\n if (i == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < minDiff) {\n minDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double diff = abs(x[idx] - M_PI);\n if (idx == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = idx;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestDistance = distance;\n closestIndex = i;\n }\n }\n\n int globalClosestIndex;\n MPI_Allreduce(&closestIndex, &globalClosestIndex, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return globalClosestIndex;\n}\n", + "\n\n // Find the closest value to PI in parallel\n size_t closest_index = 0;\n double min_diff = std::numeric_limits::max();\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest_index = i;\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Allgather(&closest_index, 1, MPI::UNSIGNED_LONG, &closest_indices[0], 1, MPI::UNSIGNED_LONG);\n\n // Find the global closest index\n size_t global_closest_index = 0;\n min_diff = std::numeric_limits::max();\n for (size_t i = 0; i < closest_indices.size(); ++i) {\n double diff = std::abs(x[closest_indices[i]] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n global_closest_index = closest_indices[i];\n }\n }\n\n return global_closest_index;\n}\n", + "\n\n // Find the closest value to PI on each rank\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_index = i;\n closest_distance = distance;\n }\n }\n\n // Reduce the closest index to rank 0\n size_t global_closest_index;\n MPI_Reduce(&closest_index, &global_closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the global closest index on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double min_diff = std::numeric_limits::max();\n int min_diff_index = -1;\n\n #pragma omp parallel for reduction(min : min_diff) reduction(min : min_diff_index)\n for (int i = 0; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n\n return min_diff_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_distance)\n for (size_t i = 0; i < x.size(); ++i) {\n double distance 
= std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n\n return result;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the index of the value in the vector x that is closest to the math constant PI.\n size_t index = 0;\n double min_diff = std::numeric_limits::max();\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n index = i;\n }\n }\n\n // Return the result on rank 0.\n if (rank == 0) {\n return index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n // Get the number of ranks\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Find the closest value to PI on this rank\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_index = i;\n closest_distance = distance;\n }\n }\n\n // Reduce the closest indices and distances to find the overall closest value\n size_t global_closest_index;\n double global_closest_distance;\n MPI_Reduce(&closest_index, &global_closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n MPI_Reduce(&closest_distance, &global_closest_distance, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_distance)\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double minDiff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < minDiff) {\n 
#pragma omp critical\n if (diff < minDiff) {\n closestIndex = i;\n minDiff = diff;\n }\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t closest_idx = 0;\n double min_diff = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Initialize the index of the closest value to -1\n size_t closest_index = -1;\n\n // Initialize the minimum distance to a large value\n double min_distance = std::numeric_limits::max();\n\n // Loop over the elements of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n // Calculate the distance between the current element and PI\n double distance = std::abs(x[i] - M_PI);\n\n // Update the closest index and minimum distance if the current distance is smaller\n #pragma omp critical\n {\n if (distance < min_distance) {\n closest_index = i;\n min_distance = distance;\n }\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(num_ranks);\n MPI_Gather(&closest_index, 1, MPI_UNSIGNED_LONG, closest_indices.data(), 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Find the index with the smallest distance\n if (rank == 0) {\n size_t global_closest_index = -1;\n double global_min_distance = std::numeric_limits::max();\n for (size_t i = 0; i < num_ranks; ++i) {\n if (closest_indices[i] != -1 && x[closest_indices[i]] < global_min_distance) {\n global_closest_index = closest_indices[i];\n global_min_distance = x[closest_indices[i]];\n }\n }\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t closest_idx = 0;\n double min_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n closest_idx = i;\n min_diff = diff;\n }\n }\n }\n }\n\n return closest_idx;\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(Kokkos::View*> const& x, Kokkos::View &r, Kokkos::View &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n // Compute the Fourier transform of x\n auto y = fft_transform(x);\n\n // Extract the real and imaginary parts of the result\n r = real(y);\n i = imag(y);\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& n) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double angle = 2 * M_PI * k * n / x.extent(0);\n real += x(k).real() * cos(angle) + x(k).imag() * sin(angle);\n imag += x(k).imag() * cos(angle) - x(k).real() * sin(angle);\n }\n r(n) = real;\n i(n) = imag;\n });\n}\n", + "\n\n // Compute the FFT of x\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2.0 * M_PI * j * i / x.extent(0);\n sum += x(j) * std::exp(-std::complex(0.0, angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n real += x(j).real() * cos(angle) - x(j).imag() * sin(angle);\n imag += x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA (const int& n) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double angle = 2 * M_PI * k * n / x.extent(0);\n real += x(k).real() * cos(angle) - x(k).imag() * sin(angle);\n imag += x(k).real() * sin(angle) + x(k).imag() * cos(angle);\n }\n r(n) = real;\n i(n) = imag;\n });\n}\n", + "\n\n // Compute the DFT of x\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& n) {\n double re = 0.0;\n double im = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double theta = 2 * M_PI * k * n / x.extent(0);\n re += x(k).real() * cos(theta) + x(k).imag() * sin(theta);\n im += x(k).real() * sin(theta) - x(k).imag() * cos(theta);\n }\n r(n) = re;\n i(n) = im;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double re = 0.0;\n double im = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n re += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n im += x(j).real() * 
sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = re;\n i(i) = im;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += -x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n\n const int n = x.extent(0);\n const int n_half = n / 2;\n\n // Create a plan for the FFT\n FFTPlan plan(n);\n\n // Create a view of the input and output arrays\n View x_real = create_view(x, 0);\n View x_imag = create_view(x, 1);\n View r_view = create_view(r);\n View i_view = create_view(i);\n\n // Execute the FFT\n plan.execute(x_real, x_imag, r_view, i_view);\n\n // Normalize the output\n r_view /= n;\n i_view /= n;\n}\n", + "\nKokkos::parallel_for(Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA (const int i) {\n r(i) = 0;\n i(i) = 0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n r(i) += x(j) * cos(angle);\n i(i) -= x(j) * sin(angle);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += -x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n\n // Create a plan for the FFT\n auto plan = FFTPlan(x.extent(0), true);\n\n // Create a workspace for the FFT\n auto workspace = plan.get_workspace();\n\n // Execute the FFT\n plan.execute(x, workspace);\n\n // Extract the real and imaginary parts of the results\n r = plan.get_output_real();\n i = plan.get_output_imag();\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n // Create a team policy for parallel execution\n TeamPolicy policy(x.extent(0));\n\n // Define the kernel lambda to compute the FFT\n KernelFunctor kernel;\n kernel.template run(policy, [=](const int& i) {\n 
std::complex sum = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sum += x(j) * std::exp(-std::complex(0, angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) - x[t].imag() * sin(angle);\n sumi += x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int N = x.size();\n r.resize(N);\n i.resize(N);\n\n #pragma omp parallel for\n for (int k = 0; k < N; ++k) {\n std::complex sum = 0;\n for (int n = 0; n < N; ++n) {\n double angle = 2 * M_PI * n * k / N;\n sum += x[n] * std::polar(1.0, -angle);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const size_t n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (size_t j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const size_t n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute the fourier transform of x.\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (size_t j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = 
sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::exp(-std::complex(0, 2 * M_PI * k * t / n));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n int num_threads = omp_get_max_threads();\n int chunk_size = n / num_threads;\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int start = tid * chunk_size;\n int end = start + chunk_size;\n if (tid == num_threads - 1) {\n end = n;\n }\n\n for (int j = start; j < end; j++) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < n; k++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[k].real() * cos(angle) + x[k].imag() * sin(angle);\n imag += -x[k].real() * sin(angle) + x[k].imag() * cos(angle);\n }\n r[j] = real;\n i[j] = imag;\n }\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n std::complex omega = std::polar(1.0, -2 * M_PI * t * k / n);\n sum += x[t] * omega;\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, -angle);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumi += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute the DFT of x\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::exp(-2 * M_PI * std::complex(0, 1) * k * t / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::exp(std::complex(0, -angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n 
}\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n 
return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumi += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sumr += x[j].real() * cos(angle) + x[j].imag() * sin(angle);\n sumi += -x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int k = 0; k < n / 2; k++) {\n even[k] = x[2 * k];\n odd[k] = x[2 * k + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int k = 0; k < n / 2; k++) {\n std::complex t = w * odd[k];\n r[k] = r[k] + t.real();\n i[k] = i[k] + t.imag();\n r[k + n / 2] = r[k] - t.real();\n i[k + n / 2] = i[k] - t.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n 
if (n == 0) {\n return;\n }\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2);\n std::vector> odd(n / 2);\n for (int k = 0; k < n / 2; k++) {\n even[k] = x[2 * k];\n odd[k] = x[2 * k + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double pi = std::acos(-1);\n for (int k = 0; k < n / 2; k++) {\n double t = r[k];\n r[k] = t + std::cos(2 * pi * k / n) * r[k + n / 2] - std::sin(2 * pi * k / n) * i[k + n / 2];\n i[k] = t - std::cos(2 * pi * k / n) * r[k + n / 2] + std::sin(2 * pi * k / n) * i[k + n / 2];\n }\n}\n", + "\n\n int n = x.size();\n if (n == 0) {\n return;\n }\n r.resize(n);\n i.resize(n);\n for (int k = 0; k < n; k++) {\n double real = 0;\n double imag = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[j].real() * cos(angle) - x[j].imag() * sin(angle);\n imag += x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = real;\n i[k] = imag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "fft, 4,", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double real = 0;\n double imag = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[j].real() * cos(angle) - x[j].imag() * sin(angle);\n imag += x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r.push_back(real);\n i.push_back(imag);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sumr += x[j].real() * cos(angle) + x[j].imag() * sin(angle);\n sumi += -x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n std::vector> X(n);\n for (int k = 0; k < n; ++k) {\n for (int j = 0; j < n; ++j) {\n X[k] += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n }\n for (int i = 0; i < n; ++i) {\n r[i] = X[i].real();\n i[i] = X[i].imag();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\n__global__ void fft(const hipDoubleComplex *x, double *r, double *i, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n hipDoubleComplex sum = {0, 0};\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n if (i < N) {\n hipDoubleComplex sum = hipCreateDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex rot = hipCreateDoubleComplex(cos(angle), sin(angle));\n sum = hipCadd(sum, hipCmul(x[j], rot));\n }\n r[i] = hipCreal(sum);\n i[i] = hipCimag(sum);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; k++) {\n double arg = -2 * M_PI * k * gid / N;\n hipDoubleComplex w = {cos(arg), sin(arg)};\n sum += x[k] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n}\n", + "fft:", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[i] = sum.x;\n i[i] = sum.y;\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n size_t N2 = N / 2;\n if (tid < N2) {\n r[tid] = x[tid].x + x[tid + N2].x;\n i[tid] = x[tid].y + x[tid + N2].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * gid / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[i] = sum.x;\n i[i] = sum.y;\n}\n", + "\n\n int id = blockIdx.x * blockDim.x + threadIdx.x;\n if (id < N) {\n r[id] = x[id].real();\n i[id] = x[id].imag();\n }\n __syncbarriers();\n for (size_t n = 2; n <= N; n *= 2) {\n double a = 2 * M_PI / n;\n for (size_t k = 0; k < N; k += n) {\n double c = cos(a * k);\n double s = sin(a * k);\n for (size_t j = 0; j < n / 2; j++) {\n size_t p = k + j;\n size_t q = p + n / 2;\n double t = r[q] * c - i[q] * s;\n i[q] = r[q] * s + i[q] * c;\n r[q] = r[p] - t;\n i[p] = i[p] - t;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * i / N;\n hipDoubleComplex w = hipMakeDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = 
hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n if (i < N) {\n r[i] = x[i].x;\n i[i] = x[i].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n r[gid] = x[gid].x;\n i[gid] = x[gid].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * gid * j / N});\n sum += x[j] * w;\n }\n r[gid] = hipCreal(sum);\n i[gid] = hipCimag(sum);\n }\n}\n", + "\n\n size_t idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx >= N) return;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * idx * k / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[k] * w;\n }\n r[idx] = sum.x;\n i[idx] = sum.y;\n}\n", + "\n\n size_t k = blockIdx.x * blockDim.x + threadIdx.x;\n if (k < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0, -2 * M_PI * j * k / N));\n sum += x[j] * w;\n }\n r[k] = hipCreal(sum);\n i[k] = hipCimag(sum);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx >= N) return;\n hipDoubleComplex sum = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * idx * k / N;\n hipDoubleComplex w = hipMakeDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[idx] = creal(sum);\n i[idx] = cimag(sum);\n}\n", + "\n\n // Compute the fourier transform of x.\n hipDoubleComplex *X = hipMalloc(N);\n hipMemcpy(X, x, N * sizeof(hipDoubleComplex), hipMemcpyHostToDevice);\n hipfftPlan plan;\n hipfftPlanCreate(&plan, N, HIPFFT_C2C, HIPFFT_FORWARD);\n hipfftExecC2C(plan, X, X, HIPFFT_FORWARD);\n hipfftDestroyPlan(plan);\n hipMemcpy(r, X, N * sizeof(double), hipMemcpyDeviceToHost);\n hipMemcpy(i, X + 1, N * sizeof(double), hipMemcpyDeviceToHost);\n hipFree(X);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n int stride = hipBlockDim_x * hipGridDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * k * gid / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[k] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n}\n", + "fft," + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y);\n\n // Gather the results on rank 0\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT of z on rank 0\n if (rank == 0) {\n fft_local(z, y);\n }\n\n // Scatter the results to all ranks\n MPI_Scatter(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the results\n for (int j = 0; j < n; j++) {\n r[j] = z[j].real();\n i[j] = z[j].imag();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n y[k] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Send the DFT of x to rank 0\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse DFT of z\n if (rank == 0) {\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n x[k] += z[j] * std::exp(2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n x[k] /= n;\n r[k] = x[k].real();\n i[k] = x[k].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the global result\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n 
i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector r_local(n), i_local(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y, r_local, i_local);\n\n // Gather the results on rank 0\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < n; i++) {\n r[i] = r_local[i];\n i[i] = i_local[i];\n }\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n), i_recv(n);\n MPI_Recv(r_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n r[j] += r_recv[j];\n i[j] += i_recv[j];\n }\n }\n } else {\n MPI_Send(r_local.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i_local.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_fft(n);\n fft_local(x, local_fft);\n\n // Gather local FFTs\n std::vector> global_fft(n * size);\n MPI_Gather(local_fft.data(), n, MPI_DOUBLE, global_fft.data(), n, MPI_DOUBLE, 0);\n\n // Compute global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter global FFT\n MPI_Scatter(global_fft.data(), n, MPI_DOUBLE, local_fft.data(), n, MPI_DOUBLE, 0);\n\n // Extract real and imaginary parts\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = local_fft[j].real();\n i[j] = local_fft[j].imag();\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n);\n std::vector> local_r(local_n), local_i(local_n);\n fft_recursive(local_x, local_r, local_i);\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].real();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n std::vector> Z(n);\n\n for (int j = 0; j < n; j++) {\n X[j] = x[j];\n }\n\n MPI_Alltoall(X.data(), 1, MPI_DOUBLE_INT, Y.data(), 1, MPI_DOUBLE_INT, MPI_COMM_WORLD);\n\n for (int j = 0; j < n; 
j++) {\n Z[j] = Y[j] * std::exp(-2.0 * M_PI * std::complex(0.0, 1.0) * j / n);\n }\n\n MPI_Alltoall(Z.data(), 1, MPI_DOUBLE_INT, Y.data(), 1, MPI_DOUBLE_INT, MPI_COMM_WORLD);\n\n for (int j = 0; j < n; j++) {\n std::complex sum = std::complex(0.0, 0.0);\n for (int k = 0; k < n; k++) {\n sum += Y[k] * std::exp(2.0 * M_PI * std::complex(0.0, 1.0) * j * k / n);\n }\n X[j] = sum;\n }\n\n for (int j = 0; j < n; j++) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n if (rank == 0) {\n for (int j = 1; j < size; j++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Status status;\n MPI_Request request;\n MPI_Irecv(r_recv.data(), n, MPI_DOUBLE, j, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n MPI_Irecv(i_recv.data(), n, MPI_DOUBLE, j, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_impl(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> r_all(x.size());\n std::vector> i_all(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_all.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_all.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r = std::vector(r_all.size());\n i = std::vector(i_all.size());\n for (size_t j = 0; j < r_all.size(); ++j) {\n r[j] = r_all[j].real();\n i[j] = r_all[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y);\n\n // Gather the results on rank 0\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < size; i++) {\n std::vector> y_i(n);\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &y_i[0], n, MPI_DOUBLE_COMPLEX, i, MPI_COMM_WORLD);\n for (int j = 0; j < n; j++) {\n r[j] += y_i[j].real();\n i[j] += y_i[j].imag();\n }\n }\n } else {\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); j++) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n if (n != (1 << (int)log2(n))) {\n throw std::invalid_argument(\"x must have a power of 2 length\");\n }\n\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = x[i];\n }\n\n for (int s = 1; s <= (int)log2(n); s++) {\n int m = 1 << s;\n int m2 = m / 2;\n double theta = 2 * M_PI / m;\n std::complex w(cos(theta), sin(theta));\n std::complex wn(1, 0);\n for (int j = 0; j < m2; j++) {\n for (int k = j; k < n; k += m) {\n std::complex t = wn * y[k + m2];\n y[k + m2] = y[k] - t;\n y[k] = y[k] + t;\n }\n wn = wn * w;\n }\n }\n\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n r[i] = y[i].real();\n i[i] = y[i].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_impl(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_r(n_local);\n std::vector> local_i(n_local);\n\n // Compute local FFT\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(n);\n std::vector> global_i(n);\n MPI_Gather(local_r.data(), n_local, MPI_DOUBLE, global_r.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n_local, MPI_DOUBLE, global_i.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n 
// Compute the local part of the transform\n std::vector> local_r(n), local_i(n);\n for (int j = 0; j < n; ++j) {\n std::complex sum_r(0, 0), sum_i(0, 0);\n for (int k = 0; k < n; ++k) {\n double angle = -2 * M_PI * j * k / n;\n sum_r += std::exp(std::complex(0, angle)) * x[k];\n sum_i += std::exp(std::complex(0, angle)) * x[k] * std::complex(0, 1);\n }\n local_r[j] = sum_r;\n local_i[j] = sum_i;\n }\n\n // Gather the local transforms into a global array\n std::vector> global_r(n), global_i(n);\n MPI_Allgather(local_r.data(), n, MPI_DOUBLE_COMPLEX, global_r.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n MPI_Allgather(local_i.data(), n, MPI_DOUBLE_COMPLEX, global_i.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the global transform\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * n / size, x.begin() + (rank + 1) * n / size);\n std::vector> local_r(n / size), local_i(n / size);\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(n), i_gathered(n);\n MPI_Gather(local_r.data(), n / size, MPI_DOUBLE, r_gathered.data(), n / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n / size, MPI_DOUBLE, i_gathered.data(), n / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(n);\n i = std::vector(n);\n for (int j = 0; j < n; j++) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n\n // Compute the local Fourier transform\n std::vector> local_x(n);\n std::vector> local_r(n);\n std::vector> local_i(n);\n for (int j = 0; j < n; ++j) {\n local_x[j] = x[j];\n }\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(n * size);\n std::vector> i_gathered(n * size);\n MPI_Gather(local_r.data(), n, MPI_DOUBLE_COMPLEX, r_gathered.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n, MPI_DOUBLE_COMPLEX, i_gathered.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n for (int j = 0; j < n; ++j) {\n r[j] = r_gathered[j].real();\n i[j] = i_gathered[j].real();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Allgather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), local_r.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n MPI_Allgather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), local_i.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].real();\n }\n }\n}\n" + ] + }, + { + 
"problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\n__global__ void fft(const cuDoubleComplex *x, double *r, double *i, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) {\n return;\n }\n double sum_r = 0.0;\n double sum_i = 0.0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * idx / N;\n sum_r += x[k].x * cos(angle) + x[k].y * sin(angle);\n sum_i += -x[k].x * sin(angle) + x[k].y * cos(angle);\n }\n r[idx] = sum_r;\n i[idx] = sum_i;\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t k = blockIdx.x * blockDim.x + threadIdx.x;\n if (k < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * k * j / N;\n cuDoubleComplex rot = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * rot;\n }\n r[k] = creal(sum);\n i[k] = cimag(sum);\n }\n}\n", + "\n\n // Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n // Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n // Example:\n\n // input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n // output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[gid] = creal(sum);\n i[gid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = block.thread.x + block.block.x * block.grid.x;\n if (tid >= N) {\n return;\n }\n double re = 0, im = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * tid * k / N;\n re += x[k] * cos(angle);\n im += x[k] * sin(angle);\n }\n r[tid] = re;\n i[tid] = im;\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i0 = tid + bid * blockDim.x;\n if (i0 < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i0 * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i0] = creal(sum);\n i[i0] = cimag(sum);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n int stride = blockDim.x * gridDim.x;\n for (int k = tid; k < N; k += stride) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * k / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(x[j], term));\n }\n r[k] = sum.x;\n i[k] = sum.y;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * k * tid / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx >= N) return;\n\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * idx * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[idx] = creal(sum);\n i[idx] = cimag(sum);\n}\n", + "\n\n size_t tid = 
threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = sum.x;\n i[tid] = sum.y;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n size_t offset = tid % N;\n double sumr = 0, sumi = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * offset / N;\n sumr += real(x[k]) * cos(angle) + imag(x[k]) * sin(angle);\n sumi += -real(x[k]) * sin(angle) + imag(x[k]) * cos(angle);\n }\n r[tid] = sumr;\n i[tid] = sumi;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid >= N) return;\n\n double re = 0, im = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n re += x[k].x * cos(angle) - x[k].y * sin(angle);\n im += x[k].x * sin(angle) + x[k].y * cos(angle);\n }\n r[tid] = re;\n i[tid] = im;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i0 = tid + bid * blockDim.x;\n if (i0 < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * i0 / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[i0] = creal(sum);\n i[i0] = cimag(sum);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> x_local(x.begin() + start, x.begin() + end);\n std::vector> y_local(n_local);\n std::vector> y(n);\n\n // Compute the DFT of the local data\n fft_local(x_local, y_local);\n\n // Gather the results from all ranks\n MPI_Allgather(y_local.data(), n_local, MPI_DOUBLE_COMPLEX, y.data(), n_local, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Compute the global DFT\n fft_global(y, n);\n\n // Extract the real and imaginary parts of the result\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = y[j].real();\n i[j] = y[j].imag();\n }\n\n // Send the result to rank 0\n if (rank != 0) {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n // Receive the results from all ranks\n for (int i = 1; i < size; i++) {\n MPI_Recv(r.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_fft(n);\n fftw_plan plan = fftw_plan_dft_1d(n, reinterpret_cast(&x[0]), reinterpret_cast(&local_fft[0]), FFTW_FORWARD, FFTW_ESTIMATE);\n fftw_execute(plan);\n fftw_destroy_plan(plan);\n\n // Gather local FFT results\n std::vector> global_fft(n * size);\n MPI_Gather(local_fft.data(), n, MPI_DOUBLE, global_fft.data(), n, MPI_DOUBLE, 0);\n\n // Compute global FFT\n fftw_plan plan2 = fftw_plan_dft_1d(n * size, reinterpret_cast(&global_fft[0]), reinterpret_cast(&global_fft[0]), FFTW_BACKWARD, FFTW_ESTIMATE);\n fftw_execute(plan2);\n fftw_destroy_plan(plan2);\n\n // Extract real and imaginary parts\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = real(global_fft[j] / n);\n i[j] = imag(global_fft[j] / n);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(local_n);\n std::vector> local_r(local_n);\n std::vector> local_i(local_n);\n std::copy(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n, local_x.begin());\n fft_serial(local_x, local_r, local_i);\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int n_local_padded = n_local * 2;\n\n std::vector> x_local(n_local);\n std::vector> x_local_padded(n_local_padded);\n std::vector> y_local(n_local_padded);\n std::vector> y(n);\n\n MPI_Scatter(x.data(), n_local, MPI_DOUBLE_COMPLEX, x_local.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Pad with zeros\n for (int i = 0; i < n_local; i++) {\n x_local_padded[i] = x_local[i];\n x_local_padded[i + n_local] = 0;\n }\n\n // Compute FFT in parallel\n fft_recursive(x_local_padded, y_local);\n\n // Gather results\n MPI_Gather(y_local.data(), n_local_padded, MPI_DOUBLE_COMPLEX, y.data(), n_local_padded, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < n; i++) {\n r[i] = y[i].real();\n i[i] = y[i].imag();\n }\n }\n}\n", + "\n\n // Compute the FFT of x.\n std::vector> X(x);\n fft_mpi(X);\n\n // Store the real and imaginary parts of the FFT in r and i.\n r.resize(X.size());\n i.resize(X.size());\n for (size_t j = 0; j < X.size(); ++j) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n // Gather the results on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector r_all(X.size() * MPI::COMM_WORLD.Get_size());\n std::vector i_all(X.size() * MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Gather(&r[0], X.size(), MPI::DOUBLE, &r_all[0], X.size(), MPI::DOUBLE, 0);\n MPI::COMM_WORLD.Gather(&i[0], X.size(), MPI::DOUBLE, &i_all[0], X.size(), MPI::DOUBLE, 0);\n r = r_all;\n i = i_all;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_all(x.size());\n std::vector> 
i_all(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_all.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_all.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); j++) {\n r[j] = r_all[j].real();\n i[j] = r_all[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n std::complex sum = std::complex(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n X[k] = sum;\n }\n\n // Compute the inverse DFT of X\n for (int k = 0; k < n; k++) {\n std::complex sum = std::complex(0, 0);\n for (int j = 0; j < n; j++) {\n sum += X[j] * std::exp(2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n Y[k] = sum / n;\n }\n\n // Store the real and imaginary parts of Y in r and i\n for (int k = 0; k < n; k++) {\n r[k] = Y[k].real();\n i[k] = Y[k].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Recv(&r_recv[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(&i_recv[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(&r[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(&i[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n y[k] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j] += y[k] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Store the real and imaginary parts of the DFT in r and i\n for (int j = 0; j < n; j++) {\n r[j] = z[j].real();\n i[j] = z[j].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_i(n);\n std::vector i_i(n);\n MPI_Recv(r_i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n r[j] += r_i[j];\n i[j] += i_i[j];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute local FFTs\n std::vector> local_r(n);\n std::vector> local_i(n);\n fft_local(x, local_r, local_i);\n\n // Gather local FFTs\n std::vector> global_r(n * size);\n std::vector> global_i(n * size);\n MPI_Gather(local_r.data(), n, MPI_DOUBLE_COMPLEX, global_r.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n, MPI_DOUBLE_COMPLEX, global_i.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute global FFT\n if (rank == 0) {\n 
fft_global(global_r, global_i);\n }\n\n // Scatter global FFT\n MPI_Scatter(global_r.data(), n, MPI_DOUBLE_COMPLEX, r.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Scatter(global_i.data(), n, MPI_DOUBLE_COMPLEX, i.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local partial sums\n std::vector> local_r(n, 0.0);\n std::vector> local_i(n, 0.0);\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n double angle = -2 * M_PI * j * k / n;\n local_r[k] += x[j] * std::cos(angle);\n local_i[k] += x[j] * std::sin(angle);\n }\n }\n\n // Reduce partial sums to rank 0\n std::vector> global_r(n, 0.0);\n std::vector> global_i(n, 0.0);\n MPI_Reduce(local_r.data(), n, MPI_DOUBLE, global_r.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Reduce(local_i.data(), n, MPI_DOUBLE, global_i.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts on rank 0\n if (rank == 0) {\n for (int k = 0; k < n; k++) {\n r[k] = global_r[k].real();\n i[k] = global_i[k].imag();\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n\n // Compute the FFT of x\n #pragma opm parallel for\n for (int j = 0; j < n; j++) {\n X[j] = x[j];\n }\n fft(X);\n\n // Compute the real and imaginary parts of the FFT\n #pragma opm parallel for\n for (int j = 0; j < n; j++) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int j = 1; j < size; j++) {\n std::vector r_j(n);\n std::vector i_j(n);\n MPI_Gather(&r[0], n, MPI_DOUBLE, &r_j[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&i[0], n, MPI_DOUBLE, &i_j[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (int k = 0; k < n; k++) {\n r[k] += r_j[k];\n i[k] += i_j[k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Allgather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n 
MPI_Allgather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_r(n_local);\n std::vector> local_i(n_local);\n\n // Compute the local FFT\n fft_local(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> r_gathered(n);\n std::vector> i_gathered(n);\n MPI_Gather(local_r.data(), n_local, MPI_DOUBLE, r_gathered.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n_local, MPI_DOUBLE, i_gathered.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r = std::vector(n);\n i = std::vector(n);\n for (int j = 0; j < n; j++) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(local_n);\n std::vector> local_r(local_n);\n std::vector> local_i(local_n);\n\n std::copy(x.begin() + local_start, x.begin() + local_end, local_x.begin());\n\n #pragma omp parallel for\n for (int j = 0; j < local_n; j++) {\n double sum_r = 0.0;\n double sum_i = 0.0;\n for (int k = 0; k < n; k++) {\n double theta = 2 * M_PI * k / n;\n sum_r += x[k] * cos(theta * j);\n sum_i += x[k] * sin(theta * j);\n }\n local_r[j] = sum_r;\n local_i[j] = sum_i;\n }\n\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> 
i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n y[k] = sum;\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j * n + k] = y[k * n + j];\n }\n }\n\n // Compute the DFT of z\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += z[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n y[k] = sum;\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j * n + k] = y[k * n + j];\n }\n }\n\n // Store the real and imaginary parts of the DFT in r and i\n for (int k = 0; k < n; k++) {\n r[k] = z[k].real();\n i[k] = z[k].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Recv(r_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const size_t n = x.extent(0);\n const size_t n_half = n / 2;\n\n // Compute the FFT of the first half of x.\n auto x_half = x.slice(0, n_half);\n fft(x_half);\n\n // Compute the FFT of the second half of x.\n auto x_half_plus = x.slice(n_half, n_half);\n fft(x_half_plus);\n\n // Compute the imaginary conjugate of the second half of x.\n for (size_t i = 0; i < n_half; ++i) {\n x_half_plus(i) = conj(x_half_plus(i));\n }\n\n // Swap the first and second halves of x.\n auto temp = x_half;\n x_half = x_half_plus;\n x_half_plus = temp;\n\n // Compute the inverse FFT of x.\n fft(x, Inverse);\n\n // Normalize the FFT by dividing by n.\n for (size_t i = 0; i < n; ++i) {\n x(i) /= n;\n }\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n 
Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const size_t n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n std::vector> w(n);\n std::vector> u(n);\n std::vector> t(n);\n\n#pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n y[i] = x[i];\n }\n\n for (size_t i = 0; i < n; i++) {\n for (size_t j = 0; j < n; j++) {\n w[j] = std::exp(std::complex(0, -2 * M_PI * i * j / n));\n }\n for (size_t j = 0; j < n; j++) {\n z[j] = y[j] * w[j];\n }\n for (size_t j = 0; j < n; j++) {\n u[j] = z[j];\n }\n for (size_t j = 0; j < n / 2; j++) {\n t[j] = u[j] + std::conj(u[n - j - 1]);\n t[n - j - 1] = std::complex(0, 1) * (u[j] - std::conj(u[n - j - 1]));\n }\n for (size_t j = 0; j < n; j++) {\n y[j] = t[j];\n }\n }\n\n#pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n x[i] = y[i];\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // Compute the fourier transform of x in-place.\n fft(x);\n\n // Compute the imaginary conjugate of each value.\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n int size = 1 << log_n;\n std::vector> y(size);\n for (int i = 0; i < n; i++) {\n y[i] = x[i];\n }\n for (int i = n; i < size; i++) {\n y[i] = 0;\n }\n std::vector> roots(size / 2);\n for (int i = 0; i < size / 2; i++) {\n double angle = 2 * M_PI * i / size;\n roots[i] = std::complex(cos(angle), sin(angle));\n }\n for (int i = 0; i < log_n; i++) {\n int step = 1 << i;\n int half_step = step / 2;\n for (int j = 0; j < size; j += step) {\n for (int k = 0; k < half_step; k++) {\n std::complex u = y[j + k];\n std::complex v = y[j + k + half_step] * roots[size / (2 * step) * k];\n y[j + k] = u + v;\n y[j + k + half_step] = u - v;\n }\n }\n }\n for (int i = 0; i < size; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int numThreads = omp_get_max_threads();\n const int chunkSize = n / numThreads;\n\n #pragma omp parallel\n {\n #pragma omp for schedule(static, chunkSize)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n }\n}\n", + "\n\n const int n = x.size();\n const int lg = __lg(n);\n int i0 = 0;\n for (int l = 2; l <= n; l <<= 1) {\n for (int i = 0; i < n; i += l) {\n for (int j = 0; j < l / 2; j++) {\n std::complex u = x[i0 + j];\n std::complex v = x[i0 + j + l / 2];\n x[i0 + j] = u + v;\n x[i0 + j + l / 2] = std::complex(u.real() - v.real(), -v.imag());\n }\n i0 += l;\n }\n i0 = 0;\n }\n for (int i = 0; i < n; i++) {\n int j = __builtin_reverse(i, lg);\n if (i < j) {\n std::swap(x[i], x[j]);\n }\n }\n}\n", + "\n\n int n = x.size();\n std::vector> y(n);\n for (int i 
= 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n fft(y);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n // Compute the Fourier transform of x in-place.\n fft(x);\n\n // Compute the imaginary conjugate of each value.\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x\n fft(x, y);\n\n // Compute the conjugate of each value in y\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n\n // Copy the conjugate values back into x\n x = z;\n}\n", + "\n\n const int n = x.size();\n if (n == 1) return;\n std::vector> even(n / 2), odd(n / 2);\n#pragma omp parallel for\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int n_threads = omp_get_max_threads();\n const int n_threads_per_group = 4;\n const int n_groups = n_threads / n_threads_per_group;\n\n std::vector> y(n);\n std::vector> z(n);\n\n #pragma omp parallel num_threads(n_threads)\n {\n const int tid = omp_get_thread_num();\n const int group_id = tid / n_threads_per_group;\n const int local_id = tid % n_threads_per_group;\n\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n y[rev] = x[i];\n }\n\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const double omega = 2 * M_PI / m;\n const std::complex w(cos(omega), sin(omega));\n\n #pragma omp for schedule(static)\n for (int k = 0; k < n; k += m) {\n std::complex wn(1, 0);\n for (int j = 0; j < m / 2; j++) {\n const std::complex t1 = y[k + j];\n const std::complex t2 = wn * y[k + j + m / 2];\n y[k + j] = t1 + t2;\n y[k + j + m / 2] = t1 - t2;\n wn *= w;\n }\n }\n }\n\n #pragma omp for schedule(static)\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n }\n\n x = z;\n}\n", + "\n\n const int n = x.size();\n const int n_half = n / 2;\n const double pi = 3.14159265358979323846;\n\n // Reverse-bit permutation\n for (int i = 0; i < n; i++) {\n int j = 0;\n for (int k = 0; k < n_half; k++) {\n if (i & (1 << k)) {\n j |= (1 << (n_half - k - 1));\n }\n }\n if (i < j) {\n std::swap(x[i], x[j]);\n }\n }\n\n // Compute the FFT in-place\n for (int s = 1; s <= n_half; s *= 2) {\n const double theta = pi / s;\n const std::complex wn(std::cos(theta), std::sin(theta));\n for (int i = 0; i < n; i += 2 * s) {\n std::complex w(1, 0);\n for (int j = 0; j < s; j++) {\n const std::complex u = x[i + j];\n const std::complex v = x[i + j + s] * w;\n x[i + j] = u + v;\n x[i + j + s] = u - v;\n w *= wn;\n }\n }\n }\n\n // Conjugate the imaginary part of each value\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x\n fft(x, y);\n\n // Compute the imaginary conjugate of each value in y\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n\n // Copy the values of z back into x\n x = z;\n}\n", + "\n\n const int n = x.size();\n const int num_threads = omp_get_max_threads();\n const int 
chunk_size = n / num_threads;\n\n #pragma omp parallel for schedule(static, chunk_size)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int nthreads = omp_get_max_threads();\n const int chunksize = n / nthreads;\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n if (i < rev) {\n std::swap(x[i], x[rev]);\n }\n }\n\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const double wn = std::polar(1.0, -2.0 * M_PI / m);\n const double w = 1.0;\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int j = 0; j < n; j += m) {\n double wr = w;\n double wi = 0.0;\n for (int k = 0; k < m / 2; k++) {\n const std::complex t = x[j + k + m / 2] * std::complex(wr, wi);\n x[j + k + m / 2] = x[j + k] - t;\n x[j + k] += t;\n const double newwr = wr * w.real() - wi * w.imag();\n const double newwi = wr * w.imag() + wi * w.real();\n wr = newwr;\n wi = newwi;\n }\n }\n }\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n std::vector> y(n);\n std::vector> w(n);\n for (int i = 0; i < n; i++) {\n w[i] = std::polar(1.0, -2 * M_PI * i / n);\n }\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * w[j * k % n];\n }\n y[k] = sum;\n }\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Take the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int n_threads = omp_get_max_threads();\n\n // Reverse bits\n #pragma omp parallel for schedule(static)\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n if (i < rev) {\n std::swap(x[i], x[rev]);\n }\n }\n\n // Compute the FFT\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const int m2 = m >> 1;\n const double theta = 2 * M_PI / m;\n std::complex w(1, 0);\n std::complex wn(std::cos(theta), std::sin(theta));\n\n #pragma omp parallel for schedule(static)\n for (int k = 0; k < n; k += m) {\n for (int j = 0; j < m2; j++) {\n std::complex t = w * x[k + j + m2];\n x[k + j + m2] = x[k + j] - t;\n x[k + j] += t;\n }\n w *= wn;\n }\n }\n\n // Conjugate the imaginary part\n #pragma omp parallel for schedule(static)\n for (int i = 0; i < n; i++) {\n x[i].imag(-x[i].imag());\n }\n}\n", + "\n\n // Compute the FFT in-place\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int num_threads = omp_get_max_threads();\n const int chunk_size = n / num_threads;\n\n #pragma omp parallel for schedule(static, chunk_size)\n for (int i = 0; i < n; ++i) {\n x[i] = std::conj(x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x in-place. 
Return the imaginary conjugate of each value.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) return;\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::polar(1.0, ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n x[i][1] = -x[i][1];\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex tmp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(tmp);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n return;\n }\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n fft(y);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) return;\n std::vector> even = x;\n std::vector> odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const size_t n = x.size();\n if (n <= 1) return;\n\n std::vector> even = x[std::slice(0, n / 2, 2)];\n std::vector> odd = x[std::slice(1, n / 2, 2)];\n\n fftConjugate(even);\n fftConjugate(odd);\n\n for (size_t i = 0; i < n / 2; 
++i) {\n std::complex t = std::exp(std::complex(0, -2 * M_PI * i / n)) * odd[i];\n x[i] = even[i] + t;\n x[i + n / 2] = even[i] - t;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n for (auto &v : x) {\n v = std::conj(v);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fftConjugate(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[N - 1 - i].x;\n x[i].y = -x[N - 1 - i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = hipThreadIdx.x;\n if (i < size) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[N - 1 - i].x;\n x[i].y = -x[N - 1 - i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * 
hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); ++i) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector> y_recv(x.size());\n MPI_Recv(y_recv.data(), y_recv.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < y_recv.size(); ++j) {\n y[j] += y_recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT of x in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int log2n = 0;\n while ((1 << log2n) < n) log2n++;\n\n std::vector> y(n);\n for (int i = 0; i < n; i++) y[i] = x[i];\n\n for (int s = 1; s <= log2n; s++) {\n int m = 1 << s;\n int m2 = m / 2;\n double theta = 2 * M_PI / m;\n std::complex w(cos(theta), sin(theta));\n std::complex wn(1, 0);\n for (int j = 0; j < m2; j++) {\n for (int k = j; k < n; k += m) {\n std::complex t = wn * y[k + m2];\n y[k + m2] = y[k] - t;\n y[k] = y[k] + t;\n }\n wn = wn * w;\n }\n }\n\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n }\n}\n", + ".", + "\n\n size_t n = x.size();\n std::transform(x.begin(), x.end(), x.begin(), [](std::compelex z) { return std::compelex(z.re, -z.im); });\n 
std::sort(x.begin(), x.end(), [](std::compelex a, std::compelex b) { return std::im(a) < std::im(b); });\n std::transform(x.begin(), x.end(), x.begin(), [](std::compelex z) { return std::compelex(z.re, -z.im); });\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the global FFT\n if (rank == 0) {\n fft(global_fft, x);\n }\n\n // Scatter the global FFT\n MPI_Scatter(x.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the conjugate of the local FFT\n for (size_t i = 0; i < local_fft.size(); i++) {\n local_fft[i] = std::conj(local_fft[i]);\n }\n\n // Gather the conjugate FFTs\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the inverse global FFT\n if (rank == 0) {\n ifft(global_fft, x);\n }\n\n // Scatter the inverse global FFT\n MPI_Scatter(x.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_fft = fft(x);\n\n // Gather the local Fourier transforms on rank 0\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (auto &value : global_fft) {\n value = std::conj(value);\n }\n }\n\n // Scatter the imaginary conjugate Fourier transforms to all ranks\n MPI_Scatter(global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse Fourier transform\n x = ifft(local_fft);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv(x.size());\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n y[j] += recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of the result\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results from all ranks\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, 
MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place on each rank\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x, y);\n\n // Conjugate the local FFT\n for (int i = 0; i < n_local; i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the local FFTs\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the result to the input vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place on each rank\n std::fft(x.begin(), x.end());\n\n // Conjugate the imaginary part of each value\n for (auto &v : x) {\n v.imag() = -v.imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, y.data(), x.size(), MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n x = std::move(y);\n } else {\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, nullptr, 0, MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft(x);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (auto &value : global_fft) {\n value = std::conj(value);\n }\n }\n\n // Scatter the global FFT back to each rank\n MPI_Scatter(global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the local FFT on rank 0\n if (rank == 0) {\n x = local_fft;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> y_recv(x.size());\n MPI_Recv(y_recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < y.size(); j++) {\n y[j] += y_recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, 
size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n MPI_FFT(x.data(), 1, x.size(), MPI_COMPLEX, MPI_FORWARD, MPI_COMM_WORLD, y.data(), &y.size());\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n x = y;\n for (int i = 1; i < size; i++) {\n std::vector> tmp(x.size());\n MPI_Recv(tmp.data(), tmp.size(), MPI_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(y.data(), y.size(), MPI_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv(x.size());\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n y[j] += recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT of x in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of the local data\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the local data to rank 0\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the gathered data to the output vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft(x);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (int i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the global FFT back to each rank\n MPI_Scatter(global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), 
MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = local_fft;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fftConjugate(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[N - i - 1].x;\n x[i].y = -x[N - i - 1].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n // Compute the FFT\n fft(x, N);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < N; i++) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "fft{4,0}", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid].x = x[tid].x;\n x[tid].y = -x[tid].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = cuConjf(x[i]);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[N - i - 1].x;\n x[i].y = -x[N - i - 1].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = make_cuDoubleComplex(creal(x[i]), -cimag(x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use MPI and OpenMP to compute in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data in parallel\n fft(x, start, end);\n\n // Conjugate the imaginary part of the local data\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i].imag() = -x[i].imag();\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[start], n_local, MPI_DOUBLE_COMPLEX, &x[0], n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + ".4.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x.begin() + rank * n_local, x.begin() + rank * n_local + n_local, y.begin());\n\n // Conjugate the local FFT\n for (int i = 0; i < n_local; i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the local FFTs\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the result to rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel using MPI and OpenMP\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag() = -x[i].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform local FFT on each rank\n std::vector> localX = x;\n fft(localX);\n\n // Gather all the local FFT results on rank 0\n std::vector> globalX(x.size());\n MPI_Gather(localX.data(), localX.size(), MPI_DOUBLE_COMPLEX, globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value on rank 0\n if (rank == 0) {\n for (auto &value : globalX) {\n value = std::conj(value);\n }\n }\n\n // Scatter the imaginary conjugate results back to each rank\n 
MPI_Scatter(globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform local FFT on each rank\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather all the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (int i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the imaginary conjugate FFTs back to each rank\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Perform inverse FFT on each rank\n ifft(local_fft, x);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of the global FFT\n if (rank == 0) {\n for (size_t i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the imaginary conjugate of the global FFT to all ranks\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT\n ifft(local_fft, x);\n\n // Gather the local FFTs on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n 
// Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results on rank 0\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the results to the input vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n MPI_Gather(y.data(), n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> localX = x;\n fft(localX);\n\n // Conjugate the local FFT\n for (auto &v : localX) {\n v = std::conj(v);\n }\n\n // Gather the local FFTs on rank 0\n std::vector> globalX(x.size());\n MPI_Gather(localX.data(), localX.size(), MPI_DOUBLE_COMPLEX, globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, store the global FFT in x\n if (rank == 0) {\n x = globalX;\n }\n}\n", + "\n\n // Compute the FFT in parallel using MPI and OpenMP\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(&recv, x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x, x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int 
i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT on each rank\n std::vector> y(x.size());\n fft(x, y);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < y.size(); i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n x.resize(y.size());\n MPI_Gather(y.data(), y.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the conjugate of the global FFT\n if (rank == 0) {\n for (size_t i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the conjugate global FFT\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT\n ifft(local_fft, x);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary parts\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(Kokkos::View*> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x[i] = x[i] / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x[i] = std::conj(x[i]);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / static_cast(x.extent(0));\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n // compute fourier transform\n fft(x);\n\n // compute inverse fourier transform\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", 
x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use OpenMP to compute in parallel.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n fft(x);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugated input\n fft(x);\n\n // compute the conjugate of the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "-", + "\n\n // Compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated input\n fft(x);\n\n // Conjugate the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Divide the output by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n // conjugate the complex numbers\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute forward FFT\n fft(x);\n\n // conjugate the complex numbers again\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide by the size of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n fft(x);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n #pragma openmp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n fft(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n fft(x);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // divide by n to normalize\n const double n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] /= n;\n }\n}\n", + "\n\n // compute the forward FFT\n fft(x);\n\n // compute the inverse FFT\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute the forward FFT again to get the inverse FFT\n fft(x);\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n #pragma omp 
parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n // compute the conjugate of each element\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugated vector\n fft(x);\n\n // divide each element by the size of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // compute the conjugate of each element\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n const size_t n = x.size();\n std::vector> y(n);\n std::copy(x.begin(), x.end(), y.begin());\n fft(y);\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n y[i] = std::conj(y[i]) / n;\n }\n std::reverse(y.begin(), y.end());\n fft(y);\n std::copy(y.begin(), y.end(), x.begin());\n}\n", + "\n\n fft(x);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute forward fft\n fft(x);\n\n // compute inverse fft\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute forward fft again to get inverse fft\n fft(x);\n}\n", + "\n\n // compute the conjugate of x\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugate of x\n fft(x);\n\n // compute the conjugate of x again\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide each element of x by the size of x\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n fft(x);\n const double n = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // Compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the forward FFT\n fft(x);\n\n // Divide each element by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Compute the conjugate of the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "serial", + "prompt": "/* forward declare fft. 
computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n for (auto &xi : x) {\n xi /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v) / x.size();\n }\n std::reverse(x.begin(), x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi) / x.size();\n }\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v = std::conj(v) / x.size();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "hip", + "prompt": "/* Compute the inverse fourier transform of x in-place.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\n__global__ void ifft(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i >= N) return;\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipDoubleComplex(cos(2 * M_PI * i * j / N),\n -sin(2 * M_PI * i * j / N));\n sum += w * x[j];\n }\n x[i] = sum / N;\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n hipDoubleComplex c = {cos(angle), sin(angle)};\n sum = hipCadd(sum, hipCmul(x[j], c));\n }\n x[i] = hipCdiv(sum, N);\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (int j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0.0, 0.0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex c = {cos(angle), sin(angle)};\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i >= N) return;\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex t = {cos(angle), sin(angle)};\n sum += x[j] * t;\n }\n x[i] = sum / N;\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n hipDoubleComplex w = hipDoubleComplex(cos(2 * M_PI * i * j / N),\n sin(2 * M_PI * i * j / N));\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0.0, 0.0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0.0, -2.0 * M_PI * i * j / N));\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 
* M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex t = {cos(angle), sin(angle)};\n sum = hipCadd(sum, hipCmul(x[j], t));\n }\n x[i] = hipCdiv(sum, N);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex term = hipExp(hipCreateDoubleComplex(0, -2 * M_PI * i * j / N));\n sum = hipAdd(sum, hipMul(x[j], term));\n }\n x[i] = hipDiv(sum, hipCreateDoubleComplex(N, 0));\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0, -2 * M_PI * i * j / N));\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex c = hipDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n size_t j = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (i < N && j < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double theta = 2 * M_PI * (i * k / N);\n sum.x += x[k].x * cos(theta) + x[k].y * sin(theta);\n sum.y += -x[k].x * sin(theta) + x[k].y * cos(theta);\n }\n sum.x /= N;\n sum.y /= N;\n x[i * N + j] = sum;\n }\n}\n", + "\n\n size_t i = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(recv_buffer.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&recv_buf[0], x.size(), 
MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, y.data(), x.size(),\n MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, nullptr, 0,\n MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the FFT\n fft(x);\n\n // compute the inverse FFT\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n\n // normalize the result\n int n = x.size();\n for (auto &xi : x) {\n xi /= n;\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(n);\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, y.data(), n, MPI::DOUBLE, 0);\n x = y;\n } else {\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, y.data(), x.size(),\n MPI::DOUBLE, 0);\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n 
std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= num_ranks;\n }\n\n // gather the results on rank 0\n if (my_rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n\n // normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> result(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, result.data(),\n x.size(), MPI_DOUBLE_COMPLEX, 0);\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&recv_buffer[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&x[0], 
x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute the inverse fft in parallel\n fft(x);\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fourier transform\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fourier transform\n fft(x);\n\n // normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(n);\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, y.data(), n, MPI::DOUBLE, 0);\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "cuda", + "prompt": "/* Compute the inverse fourier transform of x in-place.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\n__global__ void ifft(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n sum += x[j] * cuCexp(make_cuDoubleComplex(0, -2 * M_PI * i * j / N));\n }\n x[i] = sum / make_cuDoubleComplex(N, 0);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = cuCdiv(x[i], N);\n x[i] = cuCmulf(x[i], cuCexp(cuCmulf(cuCmul(cuCimagf(x[i]), -2 * M_PI), i / N)));\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= size) {\n return;\n }\n cuDoubleComplex result = cuCexp(cuCmul(cuDoubleComplex(0, -2 * M_PI * idx / size), x[idx]));\n x[idx] = result;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n cuDoubleComplex term = x[j] * cexp(-2 * M_PI * i * j / N);\n sum = cuCadd(sum, term);\n }\n x[i] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= N) return;\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double re = x[i].x;\n double im = x[i].y;\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n x[i].x += re * cos(angle) - im * sin(angle);\n x[i].y += re * 
sin(angle) + im * cos(angle);\n }\n x[i].x /= N;\n x[i].y /= N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = x[i] / N;\n x[i] = cuCmul(x[i], make_cuDoubleComplex(1.0, 0.0));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * term;\n }\n x[i] = sum / N;\n }\n}\n", + "_t_ = 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= N) return;\n cuDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = {cos(angle), sin(angle)};\n sum += c * x[j];\n }\n x[i] = sum / N;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform FFT on each rank\n fft(x);\n\n // Gather results from all ranks to rank 0\n std::vector> gathered_x;\n if (rank == 0) {\n gathered_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n // Compute the conjugate of x\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated x\n fft(x);\n\n // Normalize the result\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double factor = 1.0 / size;\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= factor;\n }\n\n // If rank 0, gather the results from all ranks\n if (rank == 0) {\n std::vector> recv_buffer(x.size() * size);\n MPI_Allgather(x.data(), x.size(), MPI_COMPLEX, recv_buffer.data(), x.size(), MPI_COMPLEX, MPI_COMM_WORLD);\n x = recv_buffer;\n }\n\n // Compute the inverse FFT of the normalized result\n fft(x);\n\n // Conjugate the result again to get the original values\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // compute the fourier transform\n fft(x);\n\n // compute the inverse fourier transform\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, y.data(), x.size(), MPI::DOUBLE, 0);\n x = y;\n } else {\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide the result among the ranks\n std::vector> recv(x.size() / size);\n MPI_Allgather(&x[rank * x.size() / size], x.size() / size, MPI_COMPLEX, &recv[0], x.size() / size, MPI_COMPLEX, MPI_COMM_WORLD);\n\n // only rank 0 has the complete result\n if (rank == 0) {\n x = recv;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide by the number of ranks to normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buffer[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the inverse fourier transform of x in-place.\n // Use MPI and OpenMP to compute in parallel.\n // Assume MPI has already been 
initialized.\n // Every rank has a complete copy of x.\n // The final result is stored on rank 0.\n\n // Get the number of ranks and the rank of the current process.\n int num_procs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local size of the vector.\n int local_size = x.size() / num_procs;\n\n // Compute the local inverse fourier transform.\n fft(x);\n\n // Reduce the local inverse fourier transforms to the root process.\n std::vector> reduced_x(x.size());\n MPI_Reduce(x.data(), reduced_x.data(), x.size(), MPI_COMPLEX, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // If the current process is the root process, store the reduced inverse fourier transform in the output vector.\n if (rank == 0) {\n x = reduced_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform\n fft(x);\n\n // divide by the number of ranks to normalize\n for (auto &xi : x) {\n xi /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // normalize the result\n std::transform(x.begin(), x.end(), x.begin(), [size](std::complex c) { return c / size; });\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the 
fft in-place\n fft(x);\n\n // compute the inverse fft in-place\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide the result by the number of ranks\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results from all ranks to rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += y[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n // Compute the conjugate of x\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated x\n fft(x);\n\n // Normalize the result\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double factor = 1.0 / size;\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= factor;\n }\n\n // If rank 0, gather the results from all ranks\n if (rank == 0) {\n std::vector> recv_buffer(x.size() * size);\n MPI_Allgather(x.data(), x.size(), MPI_COMPLEX, recv_buffer.data(), x.size(), MPI_COMPLEX, MPI_COMM_WORLD);\n x = recv_buffer;\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv(x.size() * size);\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, recv.data(), x.size(), MPI_DOUBLE_COMPLEX, 0);\n x = recv;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft of x\n fft(x);\n\n // compute the inverse fft of x\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) 
{\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT of the conjugated result\n fft(x);\n\n // Normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv(n);\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n MPI::COMM_WORLD.Recv(&recv[0], n, MPI::DOUBLE_COMPLEX, i, 0);\n for (int j = 0; j < n; ++j) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], n, MPI::DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the inverse Fourier transform in-place\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fourier transform\n fft(x);\n\n // compute the inverse fourier transform\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size() * size);\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = recv_buf;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(Kokkos::View &x, Kokkos::View*> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * std::exp(Kokkos::complex(0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * Kokkos::Math::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(std::cos(angle), std::sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const double PI = 3.14159265358979323846;\n\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n sum += x(j) * std::exp(-2 * PI * std::complex(0, 1) * i * j / N);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = -2 * Kokkos::Math::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(std::cos(angle), std::sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const double PI = 3.14159265358979323846;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * PI * j * i / N;\n sum += x[j] * Kokkos::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::Range(0, N), [=](const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * j * i / N;\n sum += x(j) * std::exp(std::complex(0.0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n const int n = x.size();\n Kokkos::parallel_for(\"DFT\", Kokkos::RangePolicy<>(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 
0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < N; j++) {\n sum += x(j) * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / N);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n const int N = x.extent(0);\n Kokkos::parallel_for(\"DFT\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const auto n = x.size();\n const auto pi = 3.14159265358979323846;\n Kok\u30b3\u30af::Range range(0, n);\n Kok\u30b3\u30af::ParallelFor(\"dft\", range, KOKKOK_LAMBDA(const int i) {\n Kok\u30b3\u30af::\u8907 sum(0.0, 0.0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2.0 * pi * i * j / n);\n }\n output[i] = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n const double pi = 3.14159265358979323846;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x(j) * Kokkos::complex(cos(-2 * pi * i * j / n), sin(-2 * pi * i * j / n));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * j * i / N;\n sum += x(j) * std::polar(1.0, angle);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "(121:1:1:1:1:1:1:", + "\n\n const int n = x.extent(0);\n const double two_pi = 2.0 * Kokkos::Pi;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < n; ++j) {\n sum += x(j) * std::exp(-two_pi * Kokkos::complex(0.0, 1.0) * i * j / n);\n }\n output(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. 
Store the result in output.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n output.resize(n);\n #pragma opm parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (size_t j = 0; j < n; ++j) {\n const double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp 
parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (size_t j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + ")", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "serial", + "prompt": "/* Compute the discrete fourier transform of x. 
Store the result in output.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "<", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "<", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::polar(1.0, -2 * M_PI * k * t / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, -angle);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::polar(1.0, -2 * M_PI * k * t / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int i = 0; i < n; ++i) {\n for (int j = 0; j < n; ++j) {\n output[i] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, angle);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n 
output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "hip", + "prompt": "/* Compute the discrete fourier transform of x. Store the result in output.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\n__global__ void dft(const double *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "(x,y)", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum_real = 0;\n double sum_imag = 0;\n\n for (size_t i = 0; i < N; i++) {\n double angle = 2 * M_PI * gid * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n\n output[gid] = make_cuDoubleComplex(sum_real, sum_imag);\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCadd(0, 0);\n for (int k = 0; k < N; k++) 
{\n double angle = -2 * M_PI * i * k / N;\n output[i] = cuCadd(output[i], cuCmulf(x[k], cuCexp(cuCmulf(0, angle))));\n }\n }\n}\n", + "\n\n int id = blockIdx.x * blockDim.x + threadIdx.x;\n if (id < N) {\n double sum_real = 0, sum_imag = 0;\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * id * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n output[id] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double real = 0;\n double imag = 0;\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n real += x[i] * cos(angle);\n imag += x[i] * sin(angle);\n }\n output[tid] = make_cuDoubleComplex(real, imag);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleReal sum_real = 0;\n cuDoubleReal sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplexf(sum_real, sum_imag);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid >= N) {\n return;\n }\n\n double real = 0;\n double imag = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * gid / N;\n real += x[k] * cos(angle);\n imag += x[k] * sin(angle);\n }\n\n output[gid] = make_cuDoubleComplex(real, imag);\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum_real = 0;\n double sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double real = 0;\n double imag = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * gid / 
N;\n real += x[k] * cos(angle);\n imag += x[k] * sin(angle);\n }\n output[gid] = make_cuDoubleComplex(real, imag);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n double angle = 2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[i] * term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n local_output[i] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / x.size());\n }\n }\n\n // Gather the local DFTs\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes a part of the output\n int start = rank * n / size;\n int end = (rank + 1) * n / size;\n\n for (int i = start; i < end; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * i * j / n;\n sum += std::complex(x[j], 0) * std::exp(std::complex(0, angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT for each rank\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n\n // Gather the results from all ranks\n std::vector> all_output(n * size);\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, all_output.data(), 
n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the gathered results to output\n if (rank == 0) {\n output = all_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / 
n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local contribution of each process\n for (int i = rank; i < n; i += size) {\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n output[j] += x[i] * std::complex(std::cos(angle), std::sin(angle));\n }\n }\n\n // Reduce the local contributions to the global result\n MPI_Allreduce(MPI_IN_PLACE, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local DFT\n for (int i = rank; i < n; i += size) {\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n output[i] += x[j] * std::complex(cos(angle), sin(angle));\n }\n }\n\n // Reduce the local DFTs to the global DFT\n MPI_Allreduce(MPI_IN_PLACE, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "cuda", + "prompt": "/* Compute the discrete fourier transform of x. Store the result in output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\n__global__ void dft(const double *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[i];\n sum += term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[i];\n sum += term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int k = 0; k < N; ++k) {\n double angle = -2 * M_PI * i * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t kBlockSize = 
256;\n const size_t kGridSize = (N + kBlockSize - 1) / kBlockSize;\n const size_t kSharedMemorySize = 2 * kBlockSize;\n\n dim3 blockDim(kBlockSize, 1, 1);\n dim3 gridDim(kGridSize, 1, 1);\n shared double sdata[2 * kBlockSize];\n\n for (size_t i = 0; i < N; ++i) {\n output[i] = make_cuDoubleComplex(0, 0);\n }\n\n for (size_t i = 0; i < N; ++i) {\n sdata[2 * threadIdx.x + 0] = x[i];\n sdata[2 * threadIdx.x + 1] = 0;\n }\n\n __syncthreads();\n\n for (size_t k = 0; k < N; ++k) {\n double e = -2 * M_PI * k / N;\n double c = cos(e);\n double s = sin(e);\n double t0 = 1;\n double t1 = 0;\n for (size_t i = 0; i < N / 2; ++i) {\n double a = sdata[2 * i + 0];\n double b = sdata[2 * i + 1];\n double ta = t0 * a - t1 * b;\n double tb = t0 * b + t1 * a;\n sdata[2 * i + 0] = ta;\n sdata[2 * i + 1] = tb;\n t0 = c;\n t1 = s;\n c = t0 * c - t1 * s;\n s = t0 * s + t1 * c;\n }\n __syncthreads();\n output[k] = make_cuDoubleComplex(sdata[0], sdata[1]);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = threadIdx.x;\n double sum_real = 0;\n double sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplex(sum_real, sum_imag);\n}\n", + "\n const int tid = threadIdx.x + blockIdx.x * blockDim.x;\n const int stride = blockDim.x * gridDim.x;\n for (size_t i = tid; i < N; i += stride) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n const double angle = 2 * M_PI * j * i / N;\n const cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(x[j], term));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n if (gid < N) {\n double sum_real = 0;\n double sum_imag = 0;\n for (size_t i = 0; i < N; i++) {\n double angle = -2 * M_PI * gid * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n output[gid] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += w * x[j];\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(c, make_cuDoubleComplex(x[j], 0)));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n 
cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(local_n);\n std::vector> local_output(local_n);\n std::copy(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n, local_x.begin());\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n sum += local_x[j] * std::exp(-2 * M_PI * i * j / local_n);\n }\n local_output[i] = sum;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_REAL, output.data(), local_n, MPI_DOUBLE_REAL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT for each frequency\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = rank; i < n; i += nprocs) {\n for (int j = 0; j < n; j++) {\n std::complex sum(0, 0);\n for (int k = 0; k < n; k++) {\n sum += x[k] * std::exp(-2 * M_PI * std::complex(0, 1) * i * k / n);\n }\n local_output[i] = sum;\n }\n }\n\n // Gather the local results into the output vector\n MPI_Allgather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the DFT of each element of x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex 
sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial DFT for this rank\n std::vector> partial_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n partial_output[i] = sum;\n }\n\n // Gather the partial results into the output vector\n MPI_Gather(partial_output.data(), n, MPI_DOUBLE_COMPLEX,\n output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Allgather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the DFT of each element in x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n MPI_Allgather(MPI_IN_PLACE, n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element of x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[0], n, MPI_DOUBLE_COMPLEX, &output[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n std::vector> global_output(n * size);\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is the root process, copy the results to the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX,\n output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store the result in output.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(Kokkos::View*> &x, Kokkos::View*> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n using namespace Kokkos::Impl;\n\n const int n = x.extent(0);\n const int n_half = n / 2;\n const int n_quarter = n / 4;\n const int n_eighth = n / 8;\n\n // Create a plan for the FFT\n auto plan = FFTPlan::create(n);\n\n // Execute the FFT\n plan->execute(x, output);\n\n // Reorder the output to be in the correct order\n parallel_for(n, KOKKOS_LAMBDA(int i) {\n if (i < n_half) {\n output(i) = output(i);\n } else {\n output(i) = output(n - i);\n }\n });\n\n // Compute the magnitude of the output\n parallel_for(n, KOKKOS_LAMBDA(int i) { output(i) = abs(output(i)); });\n\n // Normalize the output\n parallel_for(n, KOKKOS_LAMBDA(int i) { output(i) /= n; });\n}\n", + "\n\n int n = x.extent(0);\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n int m = 1 << log_n;\n Kokkos::View*> w(m);\n for (int i = 0; i < m; i++) {\n w(i) = Kokkos::complex(cos(2 * M_PI * i / m), sin(2 * M_PI * i / m));\n }\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int &i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < m; j++) {\n sum += x((i + j) % m) * w(j);\n }\n output(i) = sum;\n });\n}\n", + "\n using namespace Kokkos;\n using namespace std;\n const int N = x.extent(0);\n const int M = output.extent(0);\n const double pi = 3.14159265358979323846;\n parallel_for(\"Fourier Transform\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const int i) {\n output(i) = {0.0, 0.0};\n for (int j = 0; j < N; j++) {\n output(i) += x(j) * exp(complex(0, -2 * pi * i * j / N));\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n\n Kokkos::View*> temp(\"temp\", n);\n\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log_n; j++) {\n rev |= ((i >> j) & 1) << (log_n - j - 1);\n }\n temp(rev) = x(i);\n }\n\n for (int s = 1; s <= log_n; s++) {\n int m = 1 << s;\n int m2 = m >> 1;\n Kokkos::complex w(cos(2 * Kokkos::Pi / m), sin(2 * Kokkos::Pi / m));\n for (int k = 0; k < n; k += m) {\n Kokkos::complex wm(1, 0);\n for (int j = 0; j < m2; j++) {\n Kokkos::complex t = temp(k + j);\n Kokkos::complex u = temp(k + j + m2) * wm;\n temp(k + j) = t + u;\n temp(k + j + m2) = t - u;\n wm *= w;\n }\n }\n }\n\n output = temp;\n}\n", + "\n\n const int N = x.extent(0);\n const int logN = std::log2(N);\n const int halfN = N / 2;\n\n // Reverse bits\n Kokkos::View reversed = Kokkos::View(\"reversed\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) {\n int rev = 0;\n int bit = 1;\n for (int j = 0; j < logN; j++) {\n 
if (i & bit) {\n rev |= 1 << (logN - j - 1);\n }\n bit <<= 1;\n }\n reversed(i) = rev;\n });\n\n // Compute the FFT\n Kokkos::View*> y = Kokkos::View*>(\"y\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { y(i) = x(reversed(i)); });\n\n for (int s = 1; s <= logN; s++) {\n const int m = 1 << s;\n const int m2 = m / 2;\n const double theta = 2 * M_PI / m;\n Kokkos::complex w(std::cos(theta), std::sin(theta));\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) {\n if (i % m < m2) {\n const Kokkos::complex t = w * y(i + m2);\n y(i + m2) = y(i) - t;\n y(i) = y(i) + t;\n }\n });\n }\n\n // Copy the result to the output\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { output(i) = y(i); });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const int logN = std::log2(N);\n\n // Reverse bits\n Kokkos::View reversed(N);\n Kokkos::parallel_for(\"fft_reverse_bits\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int reversed_i = 0;\n int j = i;\n for (int k = 0; k < logN; ++k) {\n reversed_i |= ((j & 1) << (logN - k - 1));\n j >>= 1;\n }\n reversed(i) = reversed_i;\n });\n\n // Compute the FFT\n Kokkos::parallel_for(\"fft_compute\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int j = reversed(i);\n output(i) = x(j);\n });\n\n // Compute the twiddle factors\n Kokkos::View*> twiddle(N);\n Kokkos::parallel_for(\"fft_twiddle\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n double angle = -2 * M_PI * i / N;\n twiddle(i) = Kokkos::complex(std::cos(angle), std::sin(angle));\n });\n\n // Compute the FFT\n for (int s = 1; s <= logN; ++s) {\n const int m = 1 << s;\n const int m2 = m / 2;\n for (int k = 0; k < m2; ++k) {\n Kokkos::parallel_for(\"fft_compute_s\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int j = i & (m - 1);\n if (j < m2) {\n const Kokkos::complex t = twiddle(k * N / m) * output(i + m2);\n output(i + m2) = output(i) - t;\n output(i) += t;\n }\n });\n }\n }\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int N = x.extent(0);\n const int logN = Kokkos::ilog2(N);\n\n auto bit_reverse = [&](int i) {\n int r = 0;\n for (int j = 0; j < logN; ++j) {\n r |= ((i >> j) & 1) << (logN - 1 - j);\n }\n return r;\n };\n\n auto twiddle = [&](int k) {\n return Kokkos::complex(cos(2 * M_PI * k / N), sin(2 * M_PI * k / N));\n };\n\n auto fft_rec = [&](View*> x, int n) {\n if (n == 1) {\n return;\n }\n auto x_even = View*>(x, Range(0, n / 2));\n auto x_odd = View*>(x, Range(n / 2, n));\n fft_rec(x_even, n / 2);\n fft_rec(x_odd, n / 2);\n for (int k = 0; k < n / 2; ++k) {\n auto t = x_odd(k) * twiddle(k);\n x_odd(k) = x_even(k) - t;\n x_even(k) += t;\n }\n };\n\n auto x_bit_reversed = View*>(x.extent(0));\n for (int i = 0; i < N; ++i) {\n x_bit_reversed(bit_reverse(i)) = x(i);\n }\n fft_rec(x_bit_reversed, N);\n output = x_bit_reversed;\n}\n", + "\n\n int n = x.extent(0);\n output.resize(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n if (n == 1) {\n output(0) = x(0);\n return;\n }\n Kokkos::View*> even = Kokkos::subview(output, 
Kokkos::ALL(), Kokkos::Slice(0, n / 2));\n Kokkos::View*> odd = Kokkos::subview(output, Kokkos::ALL(), Kokkos::Slice(n / 2, n));\n Kokkos::parallel_for(n / 2, KOKKOS_LAMBDA(int i) {\n even(i) = x(2 * i);\n odd(i) = x(2 * i + 1);\n });\n fft(even, even);\n fft(odd, odd);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex w = Kokkos::complex(cos(2 * M_PI * i / n), sin(2 * M_PI * i / n));\n output(i) = even(i % (n / 2)) + w * odd(i % (n / 2));\n });\n}\n", + "\n\n // Compute the size of the input\n const int n = x.extent(0);\n\n // Create a scratchpad for the intermediate results\n Kokkos::View*> scratchpad(\"scratchpad\", n);\n\n // Compute the FFT in parallel\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n // Compute the FFT of the i-th element of x\n Kokkos::complex result = std::complex(0.0, 0.0);\n for (int j = 0; j < n; ++j) {\n result += x(j) * std::exp(-2.0 * M_PI * std::complex(0.0, 1.0) * i * j / n);\n }\n\n // Store the result in the scratchpad\n scratchpad(i) = result;\n });\n\n // Copy the results from the scratchpad to the output\n Kokkos::deep_copy(output, scratchpad);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n output(i) = Kokkos::complex(real, imag);\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n output[i] = Kokkos::complex(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n output[i] += x[j] * Kokkos::exp(Kokkos::complex(0.0, -2.0 * Kokkos::Pi * i * j / n));\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n output(0) = x(0);\n for (int k = 1; k < n; ++k) {\n Kokkos::complex w = std::polar(1.0, -2 * Kokkos::Math::pi * k / n);\n Kokkos::complex t = x(0);\n for (int j = 1; j < n; ++j) {\n t += w * x(j);\n w *= w;\n output(j) = t;\n }\n }\n}\n", + "\n\n int n = x.extent(0);\n output.resize(n);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * Kokkos::Impl::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * std::exp(Kokkos::complex(0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n // Get the size of the input\n int n = x.extent(0);\n\n // Create a Kokkos view for the output\n Kokkos::View*> output_view(\"output\", n);\n\n // Compute the FFT in parallel\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) {\n output_view(i) = fft_helper(x, i, n);\n });\n\n // Copy the output view to the output array\n Kokkos::deep_copy(output, output_view);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x(j) * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy<>(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * 
Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n output.resize(n);\n\n Kokkos::parallel_for(\"fft\", n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store the result in output.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n output.resize(n);\n #pragma openmp parallel for\n for (int i = 0; i < n; ++i) {\n double real = 0, imag = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n real += x[j].first * cos(angle) + x[j].second * sin(angle);\n imag += x[j].first * sin(angle) - x[j].second * cos(angle);\n }\n output[i] = {real, imag};\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex arg(2 * M_PI * i * j / n, 0);\n sum += x[j] * std::exp(-arg);\n }\n output[i] = sum;\n }\n}\n", + "<", + "\n\n const int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::polar(1.0, angle);\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n const double pi = std::acos(-1);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = {0, 0};\n for (size_t t = 0; t < n; ++t) {\n double angle = 2 * pi * t * k / n;\n sum += x[t] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex omega = std::polar(1.0, -2 * M_PI * i * j / n);\n sum += x[j] * omega;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 
0);\n for (int j = 0; j < n; ++j) {\n std::complex arg(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * arg;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of x\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex omega = std::polar(1.0, -2 * M_PI * i * j / n);\n sum += x[j] * omega;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x. 
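For orientation while reading these listings: the Kokkos and OpenMP outputs above largely implement the direct O(n^2) DFT. A minimal reference sketch follows; it uses the exp(-2*pi*i*j*k/n) sign convention, which yields X[1] = 1 - i(1 + sqrt(2)), roughly 1 - 2.414i, for the 8-point example in these prompts. The element type std::complex<double> is an assumption, since the angle-bracketed template arguments appear to have been lost from the listing during extraction.

#include <cmath>
#include <complex>
#include <vector>

// Reference sketch: direct O(n^2) DFT with the exp(-2*pi*i*j*k/n) convention.
// Element type std::complex<double> is assumed (see note above).
void dft_reference(std::vector<std::complex<double>> const& x,
                   std::vector<std::complex<double>> &output) {
    const int n = static_cast<int>(x.size());
    output.resize(n);
    #pragma omp parallel for
    for (int k = 0; k < n; ++k) {
        std::complex<double> sum(0.0, 0.0);
        for (int j = 0; j < n; ++j) {
            const double angle = -2.0 * M_PI * k * j / n;
            sum += x[j] * std::complex<double>(std::cos(angle), std::sin(angle));
        }
        output[k] = sum;
    }
}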
Store the result in output.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double pi = std::acos(-1);\n std::complex w(1), wn(std::cos(2 * pi / n), std::sin(2 * pi / n));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(cos(angle), sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n 
int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2);\n std::vector> odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n 
double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int i = 0; i < n; ++i) {\n for (int j = 0; j < n; ++j) {\n output[i] += x[j] * std::exp(-2 * std::complex(0, 1) * M_PI * i * j / n);\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x. Store the result in output.\n Use AMD HIP to compute in parallel. 
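The serial outputs above follow the classic recursive radix-2 scheme. A compact reference for that scheme, assuming a power-of-two length, std::complex<double> elements, and the negative-exponent convention consistent with the prompts' example, is sketched here:

#include <cmath>
#include <complex>
#include <vector>

// Recursive radix-2 Cooley-Tukey FFT (out of place). Assumes x.size() is a
// power of two and uses the exp(-2*pi*i/n) twiddle convention.
void fft_radix2(std::vector<std::complex<double>> const& x,
                std::vector<std::complex<double>> &output) {
    const std::size_t n = x.size();
    output.resize(n);
    if (n == 1) { output[0] = x[0]; return; }

    std::vector<std::complex<double>> even(n / 2), odd(n / 2);
    for (std::size_t i = 0; i < n / 2; ++i) {
        even[i] = x[2 * i];
        odd[i]  = x[2 * i + 1];
    }
    fft_radix2(even, even);  // aliasing is safe: the callee copies its input before writing
    fft_radix2(odd, odd);

    const double ang = -2.0 * M_PI / static_cast<double>(n);
    std::complex<double> w(1.0, 0.0), wn(std::cos(ang), std::sin(ang));
    for (std::size_t i = 0; i < n / 2; ++i) {
        const std::complex<double> t = w * odd[i];   // butterfly combine step
        output[i]         = even[i] + t;
        output[i + n / 2] = even[i] - t;
        w *= wn;
    }
}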
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fft(const hipDoubleComplex *x, hipDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid >= N) {\n return;\n }\n double arg = -2 * M_PI * gid / N;\n hipDoubleComplex w = hipDoubleComplex(cos(arg), sin(arg));\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n sum += x[i] * pow(w, i * gid);\n }\n output[gid] = sum;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(omega), sin(omega)};\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(theta), sin(theta));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex wp = hipDoubleComplex(1, 0);\n for (size_t k = j; k < j + m; k++) {\n hipDoubleComplex t = wp * output[k + m];\n output[k + m] = output[k] - t;\n output[k] += t;\n wp *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = make_hipDoubleComplex(cos(theta), sin(theta));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex t = make_hipDoubleComplex(1.0, 0.0);\n for 
(size_t k = 0; k < m; k++) {\n hipDoubleComplex u = output[j + k];\n hipDoubleComplex v = output[j + k + m] * t;\n output[j + k] = u + v;\n output[j + k + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / s), sin(M_PI / s));\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex(1.0, 0.0);\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / m), sin(M_PI / m));\n hipDoubleComplex wm = hipDoubleComplex(1.0, 0.0);\n for (size_t j = 0; j < m; j++) {\n for (size_t k = j; k < N; k += s) {\n hipDoubleComplex t = wm * output[k + m];\n output[k + m] = output[k] - t;\n output[k] += t;\n }\n wm *= w;\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n hipDoubleComplex wm = hipDoubleComplex(1, 0);\n for (size_t k = 0; k < m; k++) {\n for (size_t j = 0; j < N / s; j++) {\n size_t index = j * s + k;\n hipDoubleComplex t = wm * output[index + m];\n output[index + m] = output[index] - t;\n output[index] += t;\n }\n wm *= w;\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t half_s = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / half_s), sin(M_PI / half_s));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t k = 0; k < half_s; k++) {\n hipDoubleComplex u = output[j + k];\n hipDoubleComplex v = output[j + k + half_s] * t;\n output[j + k] = u + v;\n output[j + k + half_s] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t tid = hipGetGlobalThreadIdx();\n const size_t N_2 = N / 2;\n const size_t N_4 = N / 4;\n const double arg = -2 * M_PI / N;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n hipDoubleComplex w = {cos(arg * k), sin(arg * k)};\n hipDoubleComplex term = hipCMul(x[k], w);\n sum = hipCAdd(sum, term);\n }\n\n const size_t index = tid;\n if (tid >= N_2) {\n index -= N_2;\n index = N_4 - index - 1;\n index += N_2;\n }\n\n output[index] = sum;\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n double omega = M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(omega), sin(omega)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n 
hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + s] * t;\n output[k + j] = u + v;\n output[k + j + s] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(theta), sin(theta)};\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 1; s <= N; s *= 2) {\n for (size_t k = 0; k < N; k += 2 * s) {\n for (size_t j = k; j < k + s; j++) {\n hipDoubleComplex twiddle = hipDoubleComplex{cos(M_PI * j / s),\n -sin(M_PI * j / s)};\n hipDoubleComplex temp = output[j + s] * twiddle;\n output[j + s] = output[j] - temp;\n output[j] += temp;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / m), sin(M_PI / m));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the 
fourier transform of x. Store the result in output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFT results\n std::vector> global_output(n);\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, global_output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the result in output\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_output(n_local);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), n_local, MPI_DOUBLE_COMPLEX, output.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(n / size);\n fft_local(x, local_fft);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(n);\n MPI_Gather(local_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, global_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter the global FFT to the local FFTs\n MPI_Scatter(global_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, local_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Convert the local FFTs to the output format\n output.resize(n);\n for (int i = 0; i < n; i++) {\n output[i] = local_fft[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = (rank + 1) * local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> 
local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i / x); });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the output vector\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFT results\n std::vector> global_output(n);\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, global_output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the global FFT result to the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(),\n [](std::complex const& x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) {\n return std::exp(std::complex(0, -2 * M_PI * x.real()));\n });\n\n // Gather the local FFTs into 
the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft_local(x, rank, size);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(x.size());\n MPI_Allgather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the global FFT in the output vector\n output = global_fft;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) {\n return std::exp(std::complex(0, -2 * M_PI * x.real()));\n });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform of x.\n std::vector> local_fft = fft(x, rank, size);\n\n // Gather the local Fourier transforms into output.\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX,\n output.data(), output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int n_padded = n_local + 2;\n std::vector> padded_x(n_padded, 0.0);\n std::copy(x.begin() + rank * n_local, x.begin() + (rank + 1) * n_local, padded_x.begin() + 1);\n\n std::vector> padded_output(n_padded, 0.0);\n fft(padded_x, padded_output);\n\n std::vector> output_local(n_local, 0.0);\n std::copy(padded_output.begin() + 1, padded_output.begin() + n_local + 1, output_local.begin());\n\n MPI_Allgather(output_local.data(), n_local, MPI_DOUBLE_PRECISION, output.data(), n_local, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft(local_x, local_output);\n\n // Gather the results\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, 
MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If we are not rank 0, we can return now\n if (rank != 0) {\n return;\n }\n\n // If we are rank 0, we need to combine the results\n for (int i = 1; i < size; i++) {\n int start = i * local_n;\n int end = start + local_n;\n for (int j = start; j < end; j++) {\n output[j] += output[j - start];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local FFT\n std::vector> local_fft(n / size);\n MPI_Scatter(x.data(), n / size, MPI_DOUBLE_COMPLEX, local_fft.data(), n / size, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n fft_local(local_fft);\n\n // Gather the local FFTs\n MPI_Gather(local_fft.data(), n / size, MPI_DOUBLE_COMPLEX, output.data(), n / size, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(output);\n }\n\n // Broadcast the global FFT\n MPI_Bcast(output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft_local(x, rank, size);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(x.size());\n MPI_Allgather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the global FFT in the output vector\n output = global_fft;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x. Store the result in output.\n Use CUDA to compute in parallel. 
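A self-contained version of the kernel whose prompt follows can be written with the double-precision cuComplex helpers (make_cuDoubleComplex, cuCadd, cuCmul); cuComplex.h does not provide a complex exponential, so the twiddle factor is formed from cos/sin. The HIP variant above admits the same structure with hipDoubleComplex and its analogous helpers. A sketch, assuming a naive O(N^2) DFT with one thread per output element:

#include <cuComplex.h>
#include <math.h>

// Naive O(N^2) DFT kernel: thread k computes output[k] as a full sum over x,
// using the exp(-2*pi*i*j*k/N) convention from the prompts' example.
__global__ void fft(const cuDoubleComplex *x, cuDoubleComplex *output, size_t N) {
    size_t k = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
    if (k >= N) return;

    cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);
    for (size_t j = 0; j < N; ++j) {
        double angle = -2.0 * M_PI * (double)k * (double)j / (double)N;
        cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));
        sum = cuCadd(sum, cuCmul(x[j], w));
    }
    output[k] = sum;
}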
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fft(const cuDoubleComplex *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmul(2 * M_PI * i / N, cuCmulf(I, x[i])), -1));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(-2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmulf(2 * M_PI * i / N, cuCmulf(0, 1)), x[i]));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuDoubleComplex(0, -2 * M_PI * i / N), x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuDoubleComplex{0, -2 * M_PI * i / N}, x[i]));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmulf(2 * M_PI * i / N, cuDoubleComplex(0, 1)), x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(-2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex y = cuCexp(cuCmul(cuCbrt(-1.0), 2.0 * M_PI * i / N));\n output[i] = cuCmulf(x[i], y);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCmul(x[i], cuCexp(cuCmulf(-2 * M_PI * i / N, cuDoubleComplex{0, 1})));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + ":", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "mpi+omp", + "prompt": 
"#include \n#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local FFT results\n std::vector> global_output(x.size());\n MPI_Allgather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n global_output.data(), global_output.size(), MPI_DOUBLE_COMPLEX,\n MPI_COMM_WORLD);\n\n // Store the result in output\n output = global_output;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_C_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the global DFT into the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output.\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT of the gathered local DFTs.\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local results\n std::vector> global_output(x.size());\n MPI_Allgather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, global_output.data(), global_output.size(), 
MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the result in output\n output = global_output;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(n * size);\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is the root process, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n // Get the size of the input vector.\n int n = x.size();\n\n // Create a MPI communicator for the FFT.\n MPI_Comm fft_comm;\n MPI_Comm_split(MPI_COMM_WORLD, 0, MPI_THREAD_SINGLE, &fft_comm);\n\n // Create an OpenMP parallel region for the FFT.\n#pragma openmp parallel\n {\n // Get the rank of the current thread.\n int tid = openmp_get_thread_num();\n\n // Compute the local FFT of the input vector.\n std::vector> local_fft(n);\n fft(x, local_fft);\n\n // Gather the local FFTs into the output vector.\n#pragma openmp critical\n {\n for (int i = 0; i < n; ++i) {\n output[i] += local_fft[i];\n }\n }\n }\n\n // Free the MPI communicator.\n MPI_Comm_free(&fft_comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_output(n_local);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < n_local; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n_local; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), n_local, MPI_DOUBLE_COMPLEX, output.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = (rank + 1) * local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n std::complex w(cos(2 * M_PI * i * j / local_n), sin(2 * M_PI * i * j / local_n));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local FFT results\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // 
Compute the local DFT of x\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output.\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output\n MPI_Allgather(&local_output[0], n, MPI_DOUBLE_COMPLEX, &output[0], n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n double angle = 2 * M_PI * i * j / local_n;\n std::complex w(cos(angle), sin(angle));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX,\n output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft;\n fft_local(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX,\n global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, 0,\n MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter the global FFT\n MPI_ Scatter(global_fft.data(), global_fft.size() / size, MPI_DOUBLE_COMPLEX,\n local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0,\n MPI_COMM_WORLD);\n\n // Convert the local FFT to the output format\n output.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n output[i] = {local_fft[i].real(), local_fft[i].imag()};\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n local_output[i] = std::complex(0, 0);\n for (int j = 0; j < local_n; j++) {\n double angle = -2 * M_PI * i * j / local_n;\n local_output[i] += local_x[j] * std::exp(std::complex(0, angle));\n }\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), local_n, 
MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(n * size);\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is rank 0, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, store the global DFT in output\n if (rank == 0) {\n output = global_output;\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use Kokkos to compute in parallel. 
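Several of the MPI and MPI+OpenMP outputs above rely on helpers such as fft_local and fft_global that the prompts do not define. For reference, a self-contained block-distributed O(n^2) DFT is sketched below; it assumes every rank enters with the full input (each output element depends on every input element), that n is divisible by the communicator size, and it ships complex values as pairs of doubles to sidestep MPI complex-datatype portability questions.

#include <mpi.h>
#include <cmath>
#include <complex>
#include <vector>

// Block-distributed O(n^2) DFT: every rank holds the full x, computes its
// contiguous block of output, then all ranks gather the blocks.
void dft_mpi_omp(std::vector<std::complex<double>> const& x,
                 std::vector<std::complex<double>> &output) {
    int rank = 0, size = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int n = static_cast<int>(x.size());
    const int local_n = n / size;            // assumes n % size == 0
    const int start = rank * local_n;

    std::vector<std::complex<double>> local(local_n);
    #pragma omp parallel for
    for (int i = 0; i < local_n; ++i) {
        const int k = start + i;
        std::complex<double> sum(0.0, 0.0);
        for (int j = 0; j < n; ++j) {        // full sum over the whole input
            const double angle = -2.0 * M_PI * k * j / static_cast<double>(n);
            sum += x[j] * std::complex<double>(std::cos(angle), std::sin(angle));
        }
        local[i] = sum;
    }

    output.resize(n);
    // Each std::complex<double> is sent as two contiguous doubles.
    MPI_Allgather(local.data(), 2 * local_n, MPI_DOUBLE,
                  output.data(), 2 * local_n, MPI_DOUBLE, MPI_COMM_WORLD);
}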
Assume Kokkos has already been initialized.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, Kokkos::View &x, Kokkos::View &y, Kokkos::View &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", Kokkos::RangePolicy<>(0, z.extent(0)), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.size(), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int &i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z[i] = alpha * x[i] + y[i];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int &i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"axpy\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. 
Store the result in z.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma simd\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "serial", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. 
Store the result in z.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (std::size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (std::size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "hip", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\n__global__ void axpy(double alpha, const double *x, const double *y, double *z, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "(x,y,z,t)", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n z[i] = alpha * x[i] + y[i];\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of z\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n z.resize(x.size());\n for (size_t i = 0; i < x.size(); i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Barrier(MPI_COMM_WORLD);\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n\n // Compute the local portion of z\n std::vector local_z(n);\n for (int i = 0; i < n; ++i) {\n local_z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the local portions of z into the global vector z on 
rank 0\n if (rank == 0) {\n z.resize(n);\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its part of the result\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of the vectors\n int local_size = x.size() / size;\n\n // Allocate memory for the local portion of the vectors\n std::vector local_x(local_size);\n std::vector local_y(local_size);\n std::vector local_z(local_size);\n\n // Scatter the vectors to the ranks\n MPI_Scatter(&x[0], local_size, MPI_DOUBLE, &local_x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(&y[0], local_size, MPI_DOUBLE, &local_y[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local portion of the result\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * local_x[i] + local_y[i];\n }\n\n // Gather the results from the ranks\n MPI_Gather(&local_z[0], local_size, MPI_DOUBLE, &z[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n int n = x.size();\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n \n if (rank != 0) {\n MPI_Send(&z[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector tmp(n);\n MPI_Recv(&tmp[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n z[j] += tmp[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of z\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n 
MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "cuda", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\n__global__ void axpy(double alpha, const double *x, const double *y, double *z, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "{,", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, n_threads;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n #pragma omp parallel\n {\n n_threads = omp_get_num_threads();\n }\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "(", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector local_z(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n local_z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n z.resize(n);\n\n#pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the vector\n int local_size = x.size() / size;\n\n // Compute the local start index\n int local_start = rank * local_size;\n\n // Compute the local end index\n int local_end = (rank + 1) * local_size;\n\n // Compute the local part of the result\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_z[i] = alpha * x[local_start + i] + y[local_start + i];\n }\n\n // Gather the local results into the global result\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank == 0) {\n z.resize(x.size());\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the local portion of x and y\n int local_size = x.size() / nprocs;\n\n // Compute the starting index of the local portion of x and y\n int start_index = rank * local_size;\n\n // Compute the local portion of z\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_z[i] = alpha * x[start_index + i] + y[start_index + i];\n }\n\n // Gather the local portions of z into the global vector z on rank 0\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, 
size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of the vector\n int local_size = x.size() / size;\n\n // Compute the starting index of the local portion of the vector\n int start_index = rank * local_size;\n\n // Compute the local portion of the vector\n std::vector local_x(x.begin() + start_index, x.begin() + start_index + local_size);\n std::vector local_y(y.begin() + start_index, y.begin() + start_index + local_size);\n\n // Compute the local portion of the result\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * local_x[i] + local_y[i];\n }\n\n // Gather the local portions of the result into the global result\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of x and y\n int local_size = x.size() / size;\n\n // Compute the starting index of the local portion of x and y\n int start_index = rank * local_size;\n\n // Compute the local portion of z\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * x[start_index + i] + y[start_index + i];\n }\n\n // Gather the local portions of z into the global vector z on rank 0\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of each block\n int block_size = x.size() / size;\n\n // Compute the starting index of each block\n int 
start_index = rank * block_size;\n\n // Compute the ending index of each block\n int end_index = start_index + block_size;\n\n // If the rank is the last rank, then the ending index is the size of the vector\n if (rank == size - 1) {\n end_index = x.size();\n }\n\n // Compute the local result\n std::vector local_z(block_size);\n for (int i = start_index; i < end_index; i++) {\n local_z[i - start_index] = alpha * x[i] + y[i];\n }\n\n // Reduce the local results to the global result\n MPI_Allreduce(local_z.data(), z.data(), block_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n z.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of each chunk of data that each rank will process\n int chunk_size = x.size() / size;\n\n // Compute the starting index of the data that each rank will process\n int start_index = rank * chunk_size;\n\n // Compute the ending index of the data that each rank will process\n int end_index = (rank + 1) * chunk_size;\n\n // If the rank is the last rank, then the ending index is the size of the data\n if (rank == size - 1) {\n end_index = x.size();\n }\n\n // Compute the local result for this rank\n std::vector local_result(chunk_size);\n #pragma omp parallel for\n for (int i = start_index; i < end_index; i++) {\n local_result[i - start_index] = alpha * x[i] + y[i];\n }\n\n // Gather the local results from all ranks into the global result on rank 0\n MPI_Gather(local_result.data(), chunk_size, MPI_DOUBLE, z.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix. x and b have N elements.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(Kokkos::View &A, Kokkos::View &b, Kokkos::View &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::Range(0, N), [=](const int i) {\n double sum = 0;\n for (int j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n KokConfigService config;\n config.set_num_threads(KokConfigService::num_hardware_threads());\n KokConfigService::set_configuration(config);\n KokConfigService::print_configuration();\n\n KokConfigService::set_configuration(config);\n KokConfigService::print_configuration();\n\n KokKok::parallel_for(N, KOK_LAMBDA(size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A[i][j] * x[j]; }\n x[i] = (b[i] - sum) / A[i][i];\n });\n}\n", + "\n\n // Create a Kokkos view for the matrix A.\n Kokkos::View A_view = Kokkos::View(\"A\", N, N);\n Kokkos::deep_copy(A_view, A);\n\n // Create a Kokkos view for the vector b.\n Kokkos::View b_view = Kokkos::View(\"b\", N);\n Kokkos::deep_copy(b_view, b);\n\n // Create a Kokkos view for the vector x.\n Kokkos::View x_view = Kokkos::View(\"x\", N);\n\n // Solve the linear system using the Kokkos parallel backend.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A_view(i, j) * x_view(j);\n }\n x_view(i) = (b_view(i) - sum) / A_view(i, i);\n });\n\n // Copy the solution vector x back to the output view.\n Kokkos::deep_copy(x, x_view);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution.\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define a Kokkos kernel functor to perform the computation.\n Kokkos::KernelFunctor kernelFunctor([&](const int i) {\n // Initialize the solution vector x.\n x(i) = 0.0;\n\n // Perform forward substitution to solve the linear system.\n for (int j = 0; j < i; ++j) {\n x(i) -= A(i, j) * x(j);\n }\n\n // Perform backward substitution to solve the linear system.\n for (int j = i + 1; j < N; ++j) {\n x(i) -= A(i, j) * x(j);\n }\n\n // Divide by the diagonal element to get the final solution.\n x(i) /= A(i, i);\n });\n\n // Execute the kernel functor using the Kokkos team policy.\n Kokkos::parallel_for(policy, kernelFunctor);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution.\n Kokkos::TeamPolicy policy(N);\n\n // Create a Kokkos parallel loop to solve the linear system.\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Solve the ith equation of the linear system.\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy 
policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int& i) {\n // Initialize the solution vector x\n x[i] = 0.0;\n\n // Compute the dot product of the ith row of A and x\n double dot_product = 0.0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A(i, j) * x[j];\n }\n\n // Update the solution vector x\n x[i] = (b[i] - dot_product) / A(i, i);\n });\n}\n", + "\n\n Kokkos::View A_copy = Kokkos::View(\"A_copy\", N, N);\n Kokkos::deep_copy(A_copy, A);\n Kokkos::View b_copy = Kokkos::View(\"b_copy\", N);\n Kokkos::deep_copy(b_copy, b);\n Kokkos::View x_copy = Kokkos::View(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n\n for (size_t i = 0; i < N; i++) {\n double max_val = std::abs(A_copy(i, i));\n size_t max_idx = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A_copy(j, i)) > max_val) {\n max_val = std::abs(A_copy(j, i));\n max_idx = j;\n }\n }\n if (max_val == 0) {\n std::cout << \"The matrix is singular and cannot be inverted.\" << std::endl;\n return;\n }\n for (size_t j = 0; j < N; j++) {\n std::swap(A_copy(i, j), A_copy(max_idx, j));\n }\n std::swap(b_copy(i), b_copy(max_idx));\n for (size_t j = 0; j < N; j++) {\n if (i != j) {\n double factor = A_copy(j, i) / A_copy(i, i);\n for (size_t k = 0; k < N; k++) {\n A_copy(j, k) -= factor * A_copy(i, k);\n }\n b_copy(j) -= factor * b_copy(i);\n }\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A_copy(i, j) * x_copy(j);\n }\n x_copy(i) = (b_copy(i) - sum) / A_copy(i, i);\n }\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int& i) {\n // Initialize the solution vector x\n x[i] = 0.0;\n\n // Iterate over the rows of the matrix A\n for (size_t j = 0; j < N; ++j) {\n // Compute the dot product of the ith row of A and the solution vector x\n double dot_product = 0.0;\n for (size_t k = 0; k < N; ++k) {\n dot_product += A(i, k) * x[k];\n }\n\n // Update the ith element of the solution vector x\n x[i] = (b[i] - dot_product) / A(i, i);\n }\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to be executed in parallel\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Initialize the solution vector x\n x(i) = 0.0;\n\n // Compute the dot product of the ith row of A and x\n for (int j = 0; j < N; ++j) {\n x(i) += A(i, j) * x(j);\n }\n\n // Subtract the dot product from b(i) to get the solution\n x(i) = b(i) - x(i);\n });\n}\n", + "\n\n // Create a copy of A for in-place LU decomposition\n auto A_copy = Kokkos::subview(A, Kokkos::ALL(), Kokkos::ALL(), Kokkos::NO_OFFSET, Kokkos::NO_OFFSET);\n\n // Perform LU decomposition\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n for (size_t j = 0; j < N; ++j) {\n A_copy(i, j) /= A_copy(i, i);\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = 0; j < N; ++j) {\n if (k > i) {\n A_copy(k, j) -= A_copy(k, i) * A_copy(i, j);\n } else if (k < i) {\n A_copy(k, j) -= A_copy(k, i) * A_copy(i, j);\n }\n }\n }\n });\n\n // Solve the system\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n x(i) = b(i) / A_copy(i, i);\n for (size_t j = 0; 
j < N; ++j) {\n if (j > i) {\n x(j) -= x(i) * A_copy(j, i);\n }\n }\n });\n\n // Backward substitution\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n for (size_t j = 0; j < N; ++j) {\n if (j < i) {\n x(j) -= x(i) * A_copy(i, j);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Initialize the solution vector x\n x(i) = 0.0;\n\n // Compute the dot product of the ith row of A and x\n double dot_product = 0.0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A(i, j) * x(j);\n }\n\n // Update the solution vector x\n x(i) = (b(i) - dot_product) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "_t", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. 
x and b have N elements.\n Use OpenMP to compute in parallel.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n #pragma omp parallel for\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Forward elimination\n for (size_t j = i + 1; j < N; j++) {\n double factor = A_copy[j][i] / A_copy[i][i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j][k] -= factor * A_copy[i][k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Back substitution\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A_copy[i][j] * x[j];\n }\n x[i] /= A_copy[i][i];\n }\n}\n", + "\n\n // Solve the system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n x[N - 1] = b[N - 1] / A[(N - 1) * N + (N - 1)];\n for (int i = N - 2; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Solve the system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double inv_aii = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] *= inv_aii;\n }\n b[i] *= inv_aii;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n x[N - 1] = b[N - 1];\n for (int i = N - 2; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = b[i] - sum;\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Perform Gaussian elimination in parallel\n 
#pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A_local[i * N + i];\n for (size_t j = i; j < N; ++j) {\n A_local[i * N + j] *= factor;\n }\n b_local[i] *= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A_local[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A_local[j * N + k] -= multiplier * A_local[i * N + k];\n }\n b_local[j] -= multiplier * b_local[i];\n }\n }\n\n // Perform back substitution in parallel\n #pragma omp parallel for\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_local[i * N + j] * x[j];\n }\n x[i] = (b_local[i] - sum) / A_local[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = i; j < N; j++) {\n A_copy[i * N + j] *= factor;\n }\n b_copy[i] *= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A_copy[j * N + i];\n for (size_t k = i; k < N; k++) {\n A_copy[j * N + k] -= multiplier * A_copy[i * N + k];\n }\n b_copy[j] -= multiplier * b_copy[i];\n }\n }\n\n // Perform back substitution to solve for x\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (int j = i + 1; j < N; j++) {\n x[i] -= A_copy[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an NxN matrix in row-major. x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a copy of A and b for each thread.\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Solve the linear system for each thread.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Forward elimination.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_local[j][i] / A_local[i][i];\n for (size_t k = 0; k < N; ++k) {\n A_local[j][k] -= factor * A_local[i][k];\n }\n b_local[j] -= factor * b_local[i];\n }\n\n // Back substitution.\n x[i] = b_local[i] / A_local[i][i];\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A_local[i * N + j] * x[j];\n }\n x[i] = (b_local[i] - sum) / A_local[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b 
for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A_copy[i][i];\n for (size_t j = i; j < N; ++j) {\n A_copy[i][j] /= factor;\n }\n b_copy[i] /= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A_copy[j][i];\n for (size_t k = i; k < N; ++k) {\n A_copy[j][k] -= multiplier * A_copy[i][k];\n }\n b_copy[j] -= multiplier * b_copy[i];\n }\n }\n\n // Perform back substitution in parallel\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_copy[i][j] * x[j];\n }\n x[i] = (b_copy[i] - sum) / A_copy[i][i];\n }\n}\n", + "\n\n // Create a copy of A and b to avoid modifying the input\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A_copy[i * N + j] *= factor;\n }\n b_copy[i] *= factor;\n\n // Subtract the ith row from all other rows\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_copy[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A_copy[j * N + k] -= factor * A_copy[i * N + k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Perform back substitution\n x.resize(N);\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_copy[i * N + j] * x[j];\n }\n x[i] = (b_copy[i] - sum) / A_copy[i * N + i];\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an N*N matrix in row-major. 
x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a vector to store the solution.\n x.resize(N);\n\n // Create a vector to store the residual.\n std::vector r(N);\n\n // Initialize the residual.\n for (size_t i = 0; i < N; i++) {\n r[i] = b[i];\n }\n\n // Solve the linear system using the conjugate gradient method.\n for (size_t k = 0; k < N; k++) {\n // Compute the dot product of the residual and the residual.\n double r_dot_r = 0.0;\n #pragma omp parallel for reduction(+:r_dot_r)\n for (size_t i = 0; i < N; i++) {\n r_dot_r += r[i] * r[i];\n }\n\n // Check if the residual is small enough.\n if (r_dot_r < 1e-10) {\n break;\n }\n\n // Compute the dot product of the residual and the matrix-vector product.\n double r_dot_Ax = 0.0;\n #pragma omp parallel for reduction(+:r_dot_Ax)\n for (size_t i = 0; i < N; i++) {\n double tmp = 0.0;\n for (size_t j = 0; j < N; j++) {\n tmp += A[i * N + j] * x[j];\n }\n r_dot_Ax += r[i] * tmp;\n }\n\n // Compute the step size.\n double alpha = r_dot_r / r_dot_Ax;\n\n // Update the solution.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] += alpha * r[i];\n }\n\n // Update the residual.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n double tmp = 0.0;\n for (size_t j = 0; j < N; j++) {\n tmp += A[i * N + j] * x[j];\n }\n r[i] -= alpha * tmp;\n }\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Forward elimination\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_copy[j][i] / A_copy[i][i];\n for (size_t k = 0; k < N; ++k) {\n A_copy[j][k] -= factor * A_copy[i][k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Back substitution\n x.resize(N);\n for (int i = N - 1; i >= 0; --i) {\n x[i] = b_copy[i];\n for (int j = i + 1; j < N; ++j) {\n x[i] -= A_copy[i][j] * x[j];\n }\n x[i] /= A_copy[i][i];\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n #pragma omp parallel for\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel\n {\n // Each thread works on a different row of A\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A_local[i][i];\n for (size_t j = i; j < N; ++j) {\n A_local[i][j] /= factor;\n }\n b_local[i] /= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_local[j][i];\n for (size_t k = i; k < N; ++k) {\n A_local[j][k] -= factor * A_local[i][k];\n }\n b_local[j] -= factor * b_local[i];\n }\n }\n }\n\n // Back substitution to solve for x\n for (int 
i = N - 1; i >= 0; --i) {\n x[i] = b_local[i];\n for (int j = i + 1; j < N; ++j) {\n x[i] -= A_local[i][j] * x[j];\n }\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an NxN matrix in row-major. x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a vector to store the solution.\n x.resize(N);\n\n // Create a temporary vector to store the intermediate results.\n std::vector tmp(N);\n\n // Solve the linear system in parallel.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Compute the ith row of the solution.\n tmp[i] = b[i];\n for (size_t j = 0; j < i; ++j) {\n tmp[i] -= A[i * N + j] * x[j];\n }\n tmp[i] /= A[i * N + i];\n }\n\n // Copy the solution to the output vector.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = tmp[i];\n }\n}\n", + "\n\n // Create OpenMP parallel region\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Initialize x to 0.\n x.assign(N, 0);\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Forward substitution\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < i; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Backward substitution\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] -= sum;\n }\n}\n", + "\n\n // Create a copy of A and b for each thread\n std::vector> A_local(omp_get_num_threads(), std::vector(N * N));\n std::vector> b_local(omp_get_num_threads(), std::vector(N));\n\n // Copy A and b to the local arrays\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n A_local[tid][i * N + j] = A[i * N + j];\n }\n b_local[tid][i] = b[i];\n }\n }\n\n // Solve the linear system for each thread\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n std::vector x_local(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < i; j++) {\n sum += A_local[tid][i * N + j] * x_local[j];\n }\n x_local[i] = (b_local[tid][i] - sum) / A_local[tid][i * N + i];\n }\n\n // Copy the local solution to the global solution\n #pragma omp critical\n {\n for (size_t i = 0; i < N; i++) {\n x[i] += x_local[i];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "serial", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. 
x and b have N elements.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n x.resize(N);\n std::vector y(N);\n for (size_t i = 0; i < N; i++) {\n y[i] = b[i];\n }\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] -= sum;\n x[i] = y[i] / A[i * N + i];\n }\n for (size_t i = N - 1; i > 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] -= sum;\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(Ab[j * (N + 1) + i]) > std::abs(Ab[pivot_row * (N + 1) + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N + 1; ++j) {\n std::swap(Ab[i * (N + 1) + j], Ab[pivot_row * (N + 1) + j]);\n }\n }\n\n // Eliminate the current column below the pivot row\n for (size_t j = i + 1; j < N; ++j) {\n double factor = Ab[j * (N + 1) + i] / Ab[i * (N + 1) + i];\n for (size_t k = 0; k < N + 1; ++k) {\n Ab[j * (N + 1) + k] -= factor * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Check if the system is consistent\n for (size_t i = 0; i < N; ++i) {\n if (std::abs(Ab[i * (N + 1) + N]) < 1e-9) {\n 
throw std::runtime_error(\"The system is inconsistent.\");\n }\n }\n\n // Solve the system by back substitution\n x.resize(N);\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = Ab[i * (N + 1) + N];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= Ab[i * (N + 1) + j] * x[j];\n }\n x[i] /= Ab[i * (N + 1) + i];\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n std::vector a = A;\n std::vector bb = b;\n for (size_t i = 0; i < N; i++) {\n size_t pivot = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(a[j * N + i]) > std::abs(a[pivot * N + i])) {\n pivot = j;\n }\n }\n if (pivot != i) {\n std::swap(a[i * N + i], a[pivot * N + i]);\n std::swap(bb[i], bb[pivot]);\n }\n for (size_t j = i + 1; j < N; j++) {\n double factor = a[j * N + i] / a[i * N + i];\n for (size_t k = i; k < N; k++) {\n a[j * N + k] -= factor * a[i * N + k];\n }\n bb[j] -= factor * bb[i];\n }\n }\n for (size_t i = N - 1; i < N; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += a[i * N + j] * x[j];\n }\n x[i] = (bb[i] - sum) / a[i * N + i];\n }\n}\n", + "\n\n std::vector c(N);\n for (size_t i = 0; i < N; i++) {\n c[i] = b[i];\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double t = A[j][i] / A[i][i];\n for (size_t k = 0; k < N; k++) {\n A[j][k] -= t * A[i][k];\n }\n c[j] -= t * c[i];\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n double t = c[i];\n for (size_t j = i + 1; j < N; j++) {\n t -= A[i][j] * x[j];\n }\n x[i] = t / A[i][i];\n }\n}\n", + "\n\n // Create a copy of A\n std::vector A_copy(A);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A_copy[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A_copy[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j * N + k] -= factor * A_copy[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Perform back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A_copy[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A_copy[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find pivot\n size_t pivot = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot * N + i])) {\n pivot = j;\n }\n }\n // Swap rows\n if (pivot != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot * N + j]);\n }\n std::swap(b[i], b[pivot]);\n }\n // Eliminate\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] 
* x[j];\n }\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector> aug(N, std::vector(N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n aug[i][j] = A[i * N + j];\n }\n aug[i][N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double div = aug[i][i];\n for (size_t j = 0; j <= N; ++j) {\n aug[i][j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows\n for (size_t j = i + 1; j < N; ++j) {\n double mult = aug[j][i];\n for (size_t k = 0; k <= N; ++k) {\n aug[j][k] -= mult * aug[i][k];\n }\n }\n }\n\n // Extract the solution from the augmented matrix\n for (size_t i = 0; i < N; ++i) {\n x[i] = aug[i][N];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gauss-Jordan elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows to zero out the ith column\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; i++) {\n x[i] = Ab[i * (N + 1) + N];\n }\n}\n", + "\n\n std::vector A_copy = A;\n std::vector b_copy = b;\n for (size_t i = 0; i < N; i++) {\n double factor = A_copy[i][i];\n for (size_t j = 0; j < N; j++) {\n A_copy[i][j] /= factor;\n }\n b_copy[i] /= factor;\n for (size_t j = i + 1; j < N; j++) {\n double factor2 = A_copy[j][i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j][k] -= factor2 * A_copy[i][k];\n }\n b_copy[j] -= factor2 * b_copy[i];\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A_copy[i][j] * x[j];\n }\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = b[i] - sum;\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; i++) {\n x[i] = 
Ab[i * (N + 1) + N];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap pivot row with current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector augmentedMatrix(N * (N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n augmentedMatrix[i * (N + 1) + j] = A[i * N + j];\n }\n augmentedMatrix[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = augmentedMatrix[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; ++j) {\n augmentedMatrix[i * (N + 1) + j] /= factor;\n }\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = 0; j < N; ++j) {\n if (i == j) continue;\n double factor = augmentedMatrix[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; ++k) {\n augmentedMatrix[j * (N + 1) + k] -= factor * augmentedMatrix[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; ++i) {\n x[i] = augmentedMatrix[i * (N + 1) + N];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i 
* N + i];\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from all other rows\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution from the augmented matrix\n for (size_t i = 0; i < N; i++) {\n x[i] = Ab[i * (N + 1) + N];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "hip", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\n__global__ void solveLinearSystem(const double *A, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] -= sum * A[row * N + col] / A[col * N + col];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x + hipGridDim_x * hipBlockIdx_y;\n if (tid < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[bid * N + i] * x[i];\n }\n x[tid] = (b[bid] - sum) / A[bid * N + tid];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n x[i] += A[i * N + j] * b[j];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n x[row] = b[row] / A[row * N + col];\n } else {\n x[row] -= A[row * N + col] * x[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n int col = hipBlockIdx_x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n 
sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n }\n __syncthreads();\n for (int k = row + 1; k < N; k++) {\n if (A[k * N + col] != 0) {\n double factor = A[k * N + col] / A[row * N + col];\n for (int j = 0; j < N; j++) {\n A[k * N + j] -= factor * A[row * N + j];\n }\n b[k] -= factor * b[row];\n }\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n }\n __syncthreads();\n for (int i = row + 1; i < N; i++) {\n double factor = A[i * N + col] / A[row * N + col];\n for (int k = 0; k < N; k++) {\n A[i * N + k] -= factor * A[row * N + k];\n }\n b[i] -= factor * b[row];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; ++k) {\n sum += A[row * N + k] * x[k];\n }\n A[row * N + col] -= A[col * N + col] * sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[col] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n x[i] += A[i * N + j] * b[j];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row 
= blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. Store the result in x on rank 0.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the linear system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solutions from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n // Create a matrix object\n Eigen::MatrixXd A_eigen(N, N);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n A_eigen(i, j) = A[i * N + j];\n }\n }\n\n // Create a vector object for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; i++) {\n b_eigen(i) = b[i];\n }\n\n // Solve the system\n Eigen::VectorXd x_eigen = A_eigen.colPivHouseholderQr().solve(b_eigen);\n\n // Copy the solution to x\n for (size_t i = 0; i < N; i++) {\n x[i] = x_eigen(i);\n }\n }\n\n // Broadcast x to all ranks\n MPI_Bcast(x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "_t*", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector local_x(N, 0);\n std::vector local_b(N, 0);\n for (size_t i = 0; i < N; i++) {\n local_b[i] = b[i];\n }\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_x[i] += A[i * N + j] * local_b[j];\n }\n }\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct solution\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves for a portion of x\n size_t chunk_size = (N + size - 
1) / size;\n size_t start_idx = rank * chunk_size;\n size_t end_idx = std::min(start_idx + chunk_size, N);\n\n std::vector local_x(chunk_size);\n for (size_t i = start_idx; i < end_idx; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n local_x[i - start_idx] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solutions from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Rank 0 gathers the solution\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n x.resize(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n }\n\n // Gather the solution from rank 0 to all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Solve the linear system on rank 0\n if (rank == 0) {\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i > 0; --i) {\n for (size_t j = i - 1; j >= 0; --j) {\n b[j] -= A[j * N + i] * b[i];\n }\n }\n\n // Store the solution in x\n x = b;\n }\n\n // Broadcast the solution to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system locally on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the solution\n std::vector local_x(N);\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on rank 0\n if (rank == 0) {\n // Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (int j = i + 1; j < N; j++) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n }\n\n // Broadcast the solution to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Each rank solves a part of the system\n for (size_t i = rank; i < N; i += size) {\n double factor = A[i * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Broadcast the updated row\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, rank, comm);\n MPI_Bcast(&b[i], 1, MPI_DOUBLE, rank, comm);\n\n // Update the remaining rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Gather the solution from all ranks\n MPI_Gather(&b[rank], 1, MPI_DOUBLE, &x[0], 1, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system locally\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions to rank 0\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system on rank 0\n if (rank == 0) {\n // Create a matrix object\n Eigen::MatrixXd A_eigen(N, N);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n 
A_eigen(i, j) = A[i * N + j];\n }\n }\n\n // Create a vector for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; i++) {\n b_eigen(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_eigen = A_eigen.colPivHouseholderQr().solve(b_eigen);\n\n // Copy the result to x\n for (size_t i = 0; i < N; i++) {\n x[i] = x_eigen(i);\n }\n }\n\n // Broadcast the result to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Check if the input is valid\n if (A.size() != N * N || b.size() != N || x.size() != N) {\n throw std::invalid_argument(\"Invalid input\");\n }\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (int j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "cuda", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\n__global__ void solveLinearSystem(const double *A, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + ":", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * 
N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n for (int k = 0; k < N; k++) {\n x[row] -= A[row * N + k] * b[k];\n }\n x[row] /= A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int i = 0; i < col; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n } else if (row > col) {\n double sum = 0;\n for (int i = 0; i < col; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n for (int i = col + 1; i < row; i++) {\n A[row * N + i] -= A[row * N + col] * A[i * N + col] / A[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n } else {\n x[row] -= A[row * N + col] * x[col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n x[row] = b[row] / A[row * N + col];\n }\n for (int i = col + 1; i < N; i++) {\n x[row] -= A[row * N + i] * x[i] / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Each rank solves for a portion of x\n #pragma omp parallel for\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data() + rank * N / size, N / size, MPI_DOUBLE, x.data(), N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Compute the inverse of A on rank 0.\n std::vector invA;\n if (my_rank == 0) {\n invA.resize(N * N);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n invA[i * N + j] = (i == j) ? 
1.0 : 0.0;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n double inv = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n invA[i * N + j] *= inv;\n A[i * N + j] *= inv;\n }\n b[i] *= inv;\n }\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n invA[j * N + k] -= factor * invA[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n for (size_t i = N - 1; i > 0; --i) {\n for (size_t j = i - 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n invA[j * N + k] -= factor * invA[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Broadcast the inverse of A to all ranks.\n MPI_Bcast(invA.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on all ranks.\n x.resize(N);\n for (size_t i = 0; i < N; ++i) {\n x[i] = 0.0;\n for (size_t j = 0; j < N; ++j) {\n x[i] += invA[i * N + j] * b[j];\n }\n }\n\n // Gather the results to rank 0.\n if (my_rank == 0) {\n for (size_t i = 1; i < num_ranks; ++i) {\n std::vector recv_x(N);\n MPI_Recv(recv_x.data(), N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N; ++j) {\n x[j] += recv_x[j];\n }\n }\n } else {\n MPI_Send(x.data(), N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Rank 0 gathers the solution from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the solution\n std::vector local_x(N);\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_x[i] += A[i * N + j] * b[j];\n }\n }\n\n // Gather the local solutions into the global solution\n MPI_Allgather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete solution\n if (rank == 0) {\n for (size_t i = 1; i < size; i++) {\n for (size_t j = 0; j < N; j++) {\n x[j] += local_x[i * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves the system for a subset of the rows\n size_t rowsPerRank = (N + size - 1) / size;\n size_t startRow = rank * 
rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n // Solve the system for the subset of rows\n std::vector localX(endRow - startRow);\n for (size_t i = startRow; i < endRow; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n localX[i - startRow] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(localX.data(), endRow - startRow, MPI_DOUBLE, x.data(), endRow - startRow, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Each rank computes its part of the solution\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n \n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system on each rank\n std::vector local_x(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row.\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row.\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n\n // Eliminate the current column.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n std::vector local_x(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Check if the input is valid\n if (A.size() != N * N || b.size() != N || x.size() != N) {\n throw std::invalid_argument(\"Invalid input\");\n }\n\n // Initialize x to 0\n std::fill(x.begin(), x.end(), 0.0);\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for 
(size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(x.data(), N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the local portion of the solution\n std::vector local_x(N, 0.0);\n for (size_t i = my_rank; i < N; i += num_procs) {\n for (size_t j = 0; j < N; ++j) {\n local_x[i] += A[i * N + j] * b[j];\n }\n }\n\n // Reduce the local solutions to get the global solution\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves for a subset of the rows of x.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Solve for the subset of rows of x on this rank.\n for (size_t i = start_row; i < end_row; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks.\n MPI_Gather(x.data() + start_row, end_row - start_row, MPI_DOUBLE,\n x.data(), end_row - start_row, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the matrix\n size_t local_N = (N + size - 1) / size;\n\n // Allocate memory for the local matrix and vector\n std::vector local_A(local_N * N);\n std::vector local_b(local_N);\n\n // Scatter the matrix and vector to the local arrays\n MPI_Scatter(A.data(), N * N, MPI_DOUBLE, local_A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(b.data(), N, MPI_DOUBLE, local_b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the local linear system\n std::vector 
local_x(local_N);\n for (size_t i = 0; i < local_N; i++) {\n double sum = 0.0;\n for (size_t j = 0; j < N; j++) {\n sum += local_A[i * N + j] * local_x[j];\n }\n local_x[i] = (local_b[i] - sum) / local_A[i * N + i];\n }\n\n // Gather the local solutions to the global vector\n MPI_Gather(local_x.data(), local_N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Normalize the global solution\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n x[i] /= size;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n local_x[i] = b[i];\n for (size_t j = 0; j < i; ++j) {\n local_x[i] -= A[i * N + j] * x[j];\n }\n }\n for (size_t i = N - 1; i >= 0; --i) {\n for (size_t j = i + 1; j < N; ++j) {\n local_x[i] -= A[i * N + j] * local_x[j];\n }\n local_x[i] /= A[i * N + i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the linear system in parallel\n #pragma omp parallel\n {\n // Each thread solves a part of the system\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row.\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row.\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n\n // Eliminate the current column.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(Kokkos::View &A, Kokkos::View &B, Kokkos::View &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; ++j) {\n double sum = 0.0;\n for (int k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k) * B(k, j);\n }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n 
});\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. 
The matrices are stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma opm parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (int i = 0; i < M; ++i) {\n for (int j = 0; j < N; ++j) {\n C[i * N + j] = 0;\n for (int k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n 
}\n}\n", + "\n\n C.resize(M * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n C.resize(M * N);\n omp_set_num_threads(omp_get_max_threads());\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "serial", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. 
The matrices are stored in row-major.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n C[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A[i * K + k] * B[k * N + j]; }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 
0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "hip", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an MxN grid of threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\n__global__ void gemm(const double *A, const double *B, double *C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) { sum += A[row * K + k] * B[k * N + col]; }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n size_t row = blockIdx.x;\n size_t col = blockIdx.y;\n C[row * N + col] = 0;\n for (size_t k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = 
blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n C[row * N + col] = 0;\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0.0;\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n size_t col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and B. 
Store the result in C on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the local portion of C.\n std::vector local_C(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to the global C on rank 0.\n MPI_Reduce(local_C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of processes and the rank of the current process\n int num_procs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the number of rows each process will compute\n int rows_per_proc = M / num_procs;\n\n // Compute the starting row for each process\n int start_row = rank * rows_per_proc;\n\n // Compute the ending row for each process\n int end_row = start_row + rows_per_proc;\n\n // Initialize the result matrix on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n\n // Compute the partial result on each process\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n double dot_product = 0;\n for (int k = 0; k < K; ++k) {\n dot_product += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = dot_product;\n }\n }\n\n // Gather the partial results on rank 0\n if (rank != 0) {\n MPI_Gather(&C[start_row * N], rows_per_proc * N, MPI_DOUBLE, &C[0], rows_per_proc * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast M, K, and N to all ranks.\n MPI_Bcast(&M, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n MPI_Bcast(&K, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n MPI_Bcast(&N, 1, MPI_UNSIGNED_LONG, 0, 
MPI_COMM_WORLD);\n\n // Compute the local portion of C.\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Reduce the local portions of C to rank 0.\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast A and B to all ranks\n MPI_Bcast(&A[0], M * K, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&B[0], K * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute C on each rank\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Gather C from all ranks to rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local part of C\n std::vector local_C(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n MPI_Gather(local_C.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_C.data(), M * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and the number of processes\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the local size of the matrix C\n size_t local_M = M / num_procs;\n if (rank == num_procs - 1) {\n local_M += M % num_procs;\n }\n\n // Compute the local part of the matrix C\n std::vector local_C(local_M * N);\n for (size_t i = 0; i < local_M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N + j] = sum;\n }\n }\n\n // Gather the local parts of the matrix C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 0; i < num_procs; ++i) {\n size_t local_M = M / num_procs;\n if (i == num_procs - 1) {\n local_M += M % num_procs;\n }\n MPI_Recv(&C[i * local_M * N], local_M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_C[0], local_M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the local portion of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to the root rank.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N, 0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * 
B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(M * N);\n MPI_Recv(recv_buf.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n C[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(C.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C\n std::vector localC(M * N, 0);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Each rank has a complete copy of A and B.\n // Store the result in C on rank 0.\n if (rank == 0) {\n C.clear();\n C.reserve(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C.push_back(sum);\n }\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result matrix\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local results into the global result matrix\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the matrix C\n size_t local_M = M / size;\n size_t local_N = N;\n\n // Compute the local part of the matrix C\n std::vector local_C(local_M * local_N);\n for (size_t i = 0; i < local_M; ++i) {\n for (size_t j = 0; j < local_N; ++j) 
{\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * local_N + j] = sum;\n }\n }\n\n // Gather the local parts of the matrix C into the global matrix C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(local_M * local_N);\n MPI_Recv(recv_buf.data(), local_M * local_N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < local_M * local_N; ++j) {\n C[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(local_C.data(), local_M * local_N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and the number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N / size);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N / size + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "cuda", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an MxN grid of threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\n__global__ void gemm(const double *A, const double *B, double *C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t row = blockIdx.x;\n size_t col = blockIdx.y;\n size_t i = threadIdx.x;\n size_t j = threadIdx.y;\n C[row * N + col] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * 
N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n double sum = 0;\n for (k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "(", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.y * blockDim.y + threadIdx.y;\n int j = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M && j < N) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and B. 
Store the result in C on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C to rank 0\n MPI_Gather(localC.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to zero\n C.assign(M * N, 0.0);\n\n // Compute the product of A and B in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(C.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the product of A and B\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n // Initialize C to zero\n C.assign(M * N, 0.0);\n\n // Compute C using MPI and OpenMP\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce C to rank 0\n MPI_Reduce(C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the local part of C\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the partial sum for each element of C\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the partial sums from all ranks to rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.resize(M * N);\n std::fill(C.begin(), C.end(), 0);\n\n // Compute the local part of C\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the 
local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; i++) {\n std::vector recvC(M * N / size);\n MPI_Recv(recvC.data(), M * N / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N / size; j++) {\n C[j] += recvC[j];\n }\n }\n } else {\n MPI_Send(localC.data(), M * N / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; ++i) {\n std::vector recvC(M * N / size);\n MPI_Recv(recvC.data(), M * N / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n std::copy(recvC.begin(), recvC.end(), C.begin() + i * M * N / size);\n }\n } else {\n MPI_Send(localC.data(), M * N / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n // Initialize C to 0\n C.assign(M * N, 0.0);\n\n // Perform the matrix multiplication in parallel\n #pragma opm parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result matrix\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local results into the global result matrix\n MPI_Gather(&local_C[0], M * N / size, MPI_DOUBLE, &C[0], M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot_product = 0.0;\n for (size_t k = 0; k < K; ++k) {\n dot_product += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = dot_product;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local portion of C\n std::vector local_C(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to rank 0\n MPI_Reduce(local_C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n 
double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix, x has N elements, and y has M elements.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(Kokkos::View &A, Kokkos::View &x, Kokkos::View &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) { y[i] += A(i, j) * x[j]; }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M),\n KOKKOS_LAMBDA(const size_t i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "mv", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n double dot = 0.0;\n for (size_t j = 0; j < N; ++j) { dot += A(i, j) * x(j); }\n y(i) = dot;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", 
+ "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA (const int i) {\n y(i) = 0;\n for (int j = 0; j < N; ++j) {\n y(i) += A(i, j) * x(j);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma openmp parallel for\n for (size_t i = 
0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n // Initialize y to 0\n y.assign(M, 0);\n\n // Compute the matrix-vector product in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double dot_product = 0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A[i * N + j] * x[j];\n }\n y[i] = dot_product;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0.0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "serial", + "prompt": "/* Multiply the matrix A by the vector x. 
Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) { dot += A[i * N + j] * x[j]; }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) {\n dot += A[i * N + j] * x[j];\n }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) {\n dot += A[i * N + j] * x[j];\n }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "hip", + "prompt": "/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least M threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\n__global__ void gemv(const double *A, const double *x, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int block_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = bid * block_size + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int i = tid / N;\n int j = tid % N;\n if (i < M && j < N) {\n y[i] += A[i * N + j] * x[j];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = bid * blockDim.x + tid;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; ++col) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) {\n y[tid] += A[tid * N + j] * x[j];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; ++col) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0.0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n int i = tid + bid * blockSize;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n size_t i = blockIdx.x * 
blockDim.x + threadIdx.x;\n if (i < M) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) { sum += A[i * N + j] * x[j]; }\n y[i] = sum;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n\n double sum = 0;\n for (int i = tid; i < M; i += blockSize) {\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n }\n y[tid] = sum;\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n\n y[tid] = 0;\n for (int j = 0; j < N; j++) {\n y[tid] += A[tid + bid * blockSize + j * M] * x[j];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and x. Store the result in y on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector multiplication\n std::vector local_y(M / size);\n for (size_t i = rank * (M / size); i < (rank + 1) * (M / size); ++i) {\n local_y[i - rank * (M / size)] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i - rank * (M / size)] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the local results into the global vector y on rank 0\n MPI_Gather(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial dot product on each rank\n std::vector partial_y(M, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n partial_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the partial dot products to get the final result\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank != 0) {\n y.clear();\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product.\n std::vector local_y(M);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the local results into the global vector y on rank 0.\n MPI_Gather(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, 
size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M / size);\n for (size_t i = 0; i < M / size; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Each rank computes a partial sum of the dot product\n std::vector partial_sums(M, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n partial_sums[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sums(M, 0.0);\n MPI_Reduce(partial_sums.data(), global_sums.data(), M, MPI_DOUBLE, MPI_SUM, 0, comm);\n\n // Store the result in y on rank 0\n if (rank == 0) {\n y = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n y = local_y;\n for 
(int i = 1; i < size; ++i) {\n std::vector recv_y(M);\n MPI_Recv(recv_y.data(), M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M; ++j) {\n y[j] += recv_y[j];\n }\n }\n } else {\n MPI_Send(local_y.data(), M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) { local_y[i] += A[i * N + j] * x[j]; }\n }\n\n MPI_Reduce(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the dot product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local dot products into the global vector y on rank 0\n MPI_Reduce(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "cuda", + "prompt": "/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least M threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\n__global__ void gemv(const double *A, const double *x, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n int tid = block.thread.x;\n int row = block.block.x;\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) {\n y[row] += A[row * N + col] * x[col];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int i = 0; i < N; i++) {\n y[tid] += A[tid * N + i] * x[i];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n for (int i = bid * bid_size + tid; i < M; i += bid_size) {\n double sum = 0;\n for (int j = 0; j < N; j++) { sum += A[i * N + j] * x[j]; }\n y[i] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n y[row] = 0;\n for (int k = 0; k < N; k++) {\n y[row] += A[row * N + k] * x[k];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n y[row] = 0;\n for (int i = 0; i < N; i++) {\n y[row] += A[row * N + i] * x[i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n\n double sum = 0;\n\n for (int i = bid * bid_size + tid; i < M; i += grid_size * bid_size) {\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n }\n\n y[tid] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * 
x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = bid * blockDim.x + tid;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and x. Store the result in y on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n\n // Gather the results on rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M, 0.0);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion 
of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root rank\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute local dot products\n std::vector local_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce local dot products to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n 
#pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n\n // Gather the results on rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize y to 0 on all ranks\n y.assign(M, 0.0);\n\n // Compute the local portion of y on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A(k, j) -= A(k, i) * A(i, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) -= A(j, i) * A(i, i);\n }\n });\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n // Iterate over the columns of the matrix\n for (size_t j = 0; j < N; ++j) {\n // Divide the elements of the current column by the diagonal element\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n A(i, j) /= A(j, j);\n });\n\n // Subtract the current column from the remaining columns\n for (size_t k = j + 1; k < N; ++k) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n A(i, k) -= A(i, j) * A(j, k);\n });\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A(i, j) /= A(i, i);\n } else if (i > j) {\n A(i, j) -= A(i, j) * A(j, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = 0; k < i; k++) {\n A(j, k) -= multiplier * A(i, k);\n }\n A(j, i) = multiplier;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = 
i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; ++k) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j][i] /= A[i][i];\n }\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::RangePolicy(i + 1, N),\n KOK_LAMBDA(const Kok\u30b3\u30af::IndexType &j) {\n for (size_t k = 0; k < i; k++) {\n A[j][i] -= A[j][k] * A[i][k];\n }\n });\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) = A(j, i) / A(i, i);\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A(k, j) = A(k, j) - A(k, i) * A(i, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n double m = A(j, i) / A(i, i);\n for (size_t k = i; k < N; k++) {\n A(j, k) -= m * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; ++k) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += A(i, k) * A(k, j);\n }\n A(i, j) -= sum;\n } else if (i > j) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A(i, k) * A(k, j);\n }\n A(i, j) = (A(i, j) - sum) / A(j, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. 
\n A is an NxN matrix stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; k++) {\n // Normalize the kth row\n double norm = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k; j < N; j++) {\n A[k * N + j] /= norm;\n }\n\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k];\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; ++k) {\n // Normalize the k-th row\n double norm = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k; j < N; ++j) {\n A[k * N + j] /= norm;\n }\n\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n double factor = A[i * N + k];\n for (size_t j = k; j < N; ++j) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n double factor = A[i * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= A[i * N + k] * factor;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n // Divide the current row by the diagonal element\n double factor = A[i * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n\n // Subtract multiples of the current row from all the rows below it\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + ".", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < 
N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n#pragma omp parallel for\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = i; j < N; j++) {\n#pragma omp parallel for\n for (size_t k = 0; k < i; k++) {\n A[j * N + i] -= A[j * N + k] * A[k * N + i];\n }\n A[j * N + i] /= A[i * N + i];\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; ++k) {\n // Compute the multiplier for row k\n double multiplier = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k + 1; j < N; ++j) {\n A[j * N + k] /= multiplier;\n }\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "serial", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. 
\n A is an NxN matrix stored in row-major.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + i] * A[j * N + j];\n } else {\n A[i * N + j] /= A[j * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; ++k) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] = A[j * N + i] / A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] = A[j * N + k] - A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + j];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + j] * A[j * N + j];\n } else {\n A[i * N + j] /= A[j * N + j];\n A[i * N + j] *= -1;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) 
{\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i * N + k] * A[k * N + j];\n }\n A[i * N + j] = (A[i * N + j] - sum) / A[j * N + j];\n }\n for (size_t j = i; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += A[i * N + k] * A[k * N + j];\n }\n A[i * N + j] = A[i * N + j] - sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A[j][i] / A[i][i];\n for (size_t k = i; k < N; k++) {\n A[j][k] -= multiplier * A[i][k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1;\n } else if (i > j) {\n A[i * N + j] = 0;\n }\n }\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "hip", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\n__global__ void luFactorize(double *A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x + blockIdx.y * gridDim.x;\n int col = threadIdx.x + threadIdx.y * blockDim.x;\n if (row < N && col < N) {\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double temp = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= temp * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = A[row * N + col] - sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x + blockIdx.y * gridDim.x;\n int col = threadIdx.x + threadIdx.y * blockDim.x;\n if (row < col) {\n A[row * N + col] = 0.0;\n } else if (row == col) {\n A[row * N + col] = 1.0;\n } else {\n A[row * N + col] = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= A[row * N + col] * A[col * N + k];\n }\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * gridDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.x;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double temp = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= temp * A[j * N + k];\n }\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double temp = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= temp * 
A[j * N + k];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[col * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[col * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int tid = blockIdx.x + blockIdx.y * gridDim.x;\n int i = tid / N;\n int j = tid % N;\n if (i < j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; ++k) { A[k * N + j] -= A[k * N + i] * A[i * N + j]; }\n } else if (i == j) {\n A[i * N + j] = 1;\n for (int k = i + 1; k < N; ++k) { A[k * N + j] /= A[i * N + i]; }\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n\n if (row == col) {\n double pivot = A[row * N + col];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] /= pivot;\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col + 1; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int k = row + 1; k < N; k++) {\n A[k * N + col] -= A[k * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; 
k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double factor = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= factor * A[j * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. Store the result in A on rank 0.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Forward elimination\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row to all other ranks\n MPI_Bcast(&A[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row below the k-th row\n for (size_t i = k + 1; i < N; i++) {\n // Compute the multiplier\n double multiplier = A[i * N + k] / A[k * N + k];\n\n // Subtract the k-th row from the i-th row\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= multiplier * A[k * N + j];\n }\n }\n }\n\n // Gather the result to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // For each column j in row i\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in the lower triangular matrix L\n if (i > j) {\n // Subtract the product of the ith row of L and the jth column of U from the ith row of A\n for (size_t k = 0; k < j; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n // If i <= j, then A[i][j] is in the upper triangular matrix U\n else {\n // Divide the ith row of A by the jth column of L\n for (size_t k = 0; k < j; k++) {\n A[i * N + j] /= A[j * N + k];\n }\n\n // Subtract the product of the ith row of L and the jth column of U from the ith row of A\n for (size_t k = j + 1; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations to eliminate elements below the diagonal\n for (size_t j = i + 1; j < N; j++) {\n double factor 
= A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // For each column j\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in the lower triangular matrix L\n if (i > j) {\n // Compute A[i][j] as A[i][j] - A[i][0] * A[0][j]\n A[i * N + j] -= A[i * N] * A[j];\n }\n // Otherwise, A[i][j] is in the upper triangular matrix U\n else {\n // Compute A[i][j] as A[i][j] - A[i][0] * A[0][j] / A[0][0]\n A[i * N + j] -= A[i * N] * A[j] / A[0];\n }\n }\n\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N + i], 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // For each column j\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in L\n if (i > j) {\n // Compute A[i][j]\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i][k] * A[k][j];\n }\n A[i][j] = (A[i][j] - sum) / A[j][j];\n }\n // If i <= j, then A[i][j] is in U\n else {\n // Compute A[i][j]\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i][k] * A[k][j];\n }\n A[i][j] = A[i][j] - sum;\n }\n }\n\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * 
N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization on rank 0\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n\n // Gather the result from rank 0 to all other ranks\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Normalize the k-th row of A\n double pivot = A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[k * N + j] /= pivot;\n }\n\n // Update the remaining rows of A\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all ranks\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix A from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Forward elimination\n for (size_t i = 0; i < N; i++) {\n // Broadcast the pivot row to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all other ranks\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 
0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // Rank 0 broadcasts the ith row of L to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each column j\n for (size_t j = i; j < N; j++) {\n // Compute A[i][j] -= A[i][k] * A[k][j] for k < i\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n\n // Divide A[i][j] by A[i][i]\n A[i * N + j] /= A[i * N + i];\n }\n }\n\n // Gather the result from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + j];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + j] * A[j * N + j];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i in A\n for (size_t i = 1; i < N; i++) {\n // If rank 0, compute the multiplier for row i\n double multiplier = 0;\n if (rank == 0) {\n multiplier = A[i * N] / A[0];\n }\n\n // Broadcast the multiplier to all ranks\n MPI_Bcast(&multiplier, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each element in row i, subtract the corresponding element in row 0 multiplied by the multiplier\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] -= multiplier * A[j];\n }\n }\n\n // If rank 0, divide each element in row 0 by A[0][0]\n if (rank == 0) {\n for (size_t j = 0; j < N; j++) {\n A[j] /= A[0];\n }\n }\n\n // For each row i in A\n for (size_t i = 1; i < N; i++) {\n // For each element in row i, divide by A[i][i]\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n\n // For each row k after row i, subtract the corresponding element in row i multiplied by A[k][i]\n for (size_t k = i + 1; k < N; k++) {\n double multiplier = A[k * N + i];\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= multiplier * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(A.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Each rank computes its own row of L and U\n for (size_t i = 1; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A[i * N + j] /= A[j * N + j];\n }\n for (size_t j = i; j < N; j++) {\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks into rank 0\n MPI_Gather(A.data(), N * N, MPI_DOUBLE, A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "cuda", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. 
\n A is an NxN matrix stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\n__global__ void luFactorize(double *A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double temp = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= temp * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row < col) {\n A[row * N + col] = A[row * N + col] / A[row * N + row];\n } else if (row == col) {\n for (int k = 0; k < row; k++) {\n A[row * N + col] -= A[row * N + k] * A[k * N + col];\n }\n } else {\n for (int k = 0; k < col; k++) {\n A[row * N + col] -= A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] /= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i == j) {\n A[i * N + i] /= A[i * N + j];\n for (int k = i + 
1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n for (int k = j; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "(", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) return;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else if (row > col) {\n double factor = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= factor * A[col * N + k];\n }\n A[row * N + col] = factor;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else {\n double factor = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= factor * A[col * N + k];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == 
col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col + 1; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
Store the result in A on rank 0.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "(5.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n // Divide the ith row by A[i][i]\n double div = A[i * N + i];\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= div;\n }\n\n // Subtract the ith row from all rows below it\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n }\n\n // Broadcast the result to all ranks\n MPI_Bcast(A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization using Gaussian elimination\n for (size_t k = 0; k < N; k++) {\n // Pivot if necessary\n size_t max_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > std::abs(A[max_row * N + k])) {\n max_row = i;\n }\n }\n if (max_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[max_row * N + j]);\n }\n }\n\n // Eliminate the k-th column below the k-th row\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&A[i * N * N], N * N, MPI::DOUBLE, i, 0);\n }\n } else {\n MPI::COMM_WORLD.Send(&A[0], N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // MPI rank\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // MPI size\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Number of rows per rank\n int rows_per_rank = N / size;\n\n // Local matrix\n std::vector local_A(rows_per_rank * N);\n\n // Scatter the matrix to all ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Local lu factorization\n for (size_t i = 0; i < rows_per_rank; i++) {\n for (size_t j = i + 1; j < N; j++) {\n local_A[i * N + j] /= local_A[i * N + i];\n }\n for (size_t k = i + 1; k < rows_per_rank; k++) {\n for (size_t j = i + 1; j < N; j++) {\n local_A[k * N + j] -= local_A[k * N + i] * local_A[i * N + j];\n }\n }\n }\n\n // Gather the results back to rank 0\n MPI_Gather(&local_A[0], rows_per_rank * N, MPI_DOUBLE, &A[0], rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n 
if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= multiplier * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the rank and number of threads\n int rank = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n // Compute the block size for each thread\n size_t block_size = N / num_threads;\n\n // Each thread factorizes its own block of rows\n for (size_t i = rank * block_size; i < (rank + 1) * block_size; i++) {\n // Pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap pivot row with current row\n if (pivot_row != i) {\n for (size_t k = 0; k < N; k++) {\n std::swap(A[i * N + k], A[pivot_row * N + k]);\n }\n }\n\n // Eliminate lower triangular elements\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n }\n\n // Gather the factorized matrix from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix A from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Divide the current row by the pivot element\n double pivot = A[i * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= pivot;\n }\n\n // Subtract multiples of the current row from the remaining rows\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (MPI_Comm_rank(MPI_COMM_WORLD) == 0) {\n for (int i = 1; i < 
MPI_Comm_size(MPI_COMM_WORLD); i++) {\n MPI_Recv(&A[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the rank and number of threads\n int rank = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n // Compute the block size for each thread\n size_t block_size = N / num_threads;\n\n // Each thread factorizes its own block of the matrix\n for (size_t i = rank * block_size; i < (rank + 1) * block_size; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n }\n\n // Gather the results from all ranks and store them in A on rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform Gaussian elimination\n for (size_t k = 0; k < N; k++) {\n // Pivot row\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > std::abs(A[pivot_row * N + k])) {\n pivot_row = i;\n }\n }\n\n // Swap pivot row with row k\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n }\n }\n\n // Eliminate non-zero elements below pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Get the number of processes\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes a part of the matrix\n size_t chunk_size = N / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = start_row + chunk_size;\n\n // Perform LU factorization on the assigned part of the matrix\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < i; j++) {\n double multiplier = A[i][j] / A[j][j];\n for (size_t k = j; k < N; k++) {\n A[i][k] -= multiplier * A[j][k];\n }\n }\n for (size_t j = i; j < N; j++) {\n A[i][j] /= A[i][i];\n }\n }\n\n // Gather the results from all processes\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * chunk_size * N], chunk_size * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[start_row * N], chunk_size * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows per process\n int 
rowsPerProcess = N / size;\n\n // Compute the starting row index for this process\n int startRow = rank * rowsPerProcess;\n\n // Compute the ending row index for this process\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Factorize the submatrix of A owned by this process\n for (int i = startRow; i < endRow; ++i) {\n // Compute the pivot element\n double pivot = A[i][i];\n\n // Broadcast the pivot element to all other processes\n MPI_Bcast(&pivot, 1, MPI_DOUBLE, rank, MPI_COMM_WORLD);\n\n // Eliminate the elements below the pivot element\n for (int j = i + 1; j < N; ++j) {\n A[j][i] -= pivot * A[j][i] / A[i][i];\n }\n }\n\n // Reduce the matrix A to upper triangular form\n for (int k = 0; k < N - 1; ++k) {\n // Compute the pivot element\n double pivot = A[k][k];\n\n // Broadcast the pivot element to all other processes\n MPI_Bcast(&pivot, 1, MPI_DOUBLE, rank, MPI_COMM_WORLD);\n\n // Eliminate the elements above the pivot element\n for (int i = 0; i < k; ++i) {\n A[i][k] -= pivot * A[i][k] / A[k][k];\n }\n }\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Broadcast the pivot row to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all ranks\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 performs the row reduction on the ith row\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int myRank, numProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numProcs);\n\n // Each rank computes its own part of the matrix\n #pragma omp parallel for\n for (size_t i = myRank; i < N; i += numProcs) {\n for (size_t j = 0; j < i; j++) {\n A[i * N + j] /= A[j * N + j];\n for (size_t k = j + 1; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n }\n\n // Gather the 
results to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize L and U\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n double max = std::abs(A[k * N + k]);\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > max) {\n max = std::abs(A[i * N + k]);\n pivot_row = i;\n }\n }\n\n // Swap the pivot row with the k-th row\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n }\n }\n\n // Divide the pivot row by the pivot element\n double pivot = A[k * N + k];\n for (size_t j = 0; j < N; j++) {\n A[k * N + j] /= pivot;\n }\n\n // Subtract multiples of the pivot row from the other rows\n for (size_t i = 0; i < N; i++) {\n if (i != k) {\n double factor = A[i * N + k];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n }\n\n // Gather the result to rank 0\n if (rank != 0) {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(N * N);\n MPI_Recv(&recv_buf[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n A[j] += recv_buf[j];\n }\n }\n }\n}\n", + "\n\n // Copy the matrix A to rank 0\n MPI_Allgather(MPI_IN_PLACE, 0, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_WORLD);\n\n // Perform LU factorization on rank 0\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] -= A[i * N + i] * A[j * N + i];\n }\n\n for (size_t k = i + 1; k < N; k++) {\n for (size_t l = i + 1; l < N; l++) {\n A[k * N + l] -= A[k * N + i] * A[l * N + i];\n }\n }\n }\n\n // Scatter the factorized matrix A from rank 0 to all other processes\n MPI_Scatter(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_WORLD);\n}\n", + "\n for (size_t k = 0; k < N; ++k) {\n // MPI_Bcast the k-th row of A to all ranks.\n MPI_Bcast(&A[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Update the k-th row of A on all ranks.\n for (size_t j = k + 1; j < N; ++j) {\n A[k * N + j] /= A[k * N + k];\n }\n\n // Update the remaining rows of A on all ranks.\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n\n // MPI_Reduce the updated A from all ranks to rank 0.\n MPI_Reduce(&A[0], &A[0], N * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* For each value in the array x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(Kokkos::View const& x, Kokkos::View &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n auto rank = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n auto sorted_x = x;\n Kokkos::sort(sorted_x.data(), sorted_x.extent(0));\n ranks(i) = Kokkos::find(sorted_x, x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::find(sorted_x, x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n using policy_t = Kokkos::RangePolicy;\n Kokkos::parallel_for(policy_t(x.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n auto rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using rank_functor = Kokkos::Functor>;\n\n Kokkos::parallel_for(\n \"ranks\", Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = Kokkos::rank(x, x(i), Kokkos::greater());\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n 
Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) > x(j)) { ranks(i) += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) { ranks(i) += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) {\n ranks(i) += 1;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x);\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n std::vector sorted_x(x.data(), x.data() + x.extent(0));\n std::sort(sorted_x.begin(), sorted_x.end());\n ranks(i) = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x(i)));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = x;\n std::sort(sorted_x.data(), sorted_x.data() + sorted_x.extent(0));\n ranks(i) = std::distance(sorted_x.data(),\n std::lower_bound(sorted_x.data(), sorted_x.data() + sorted_x.extent(0), x(i)));\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "omp", + "prompt": "#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs;\n pairs.reserve(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs.push_back(std::make_pair(x[i], i));\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j] < x[i]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n for (size_t j = 0; j < n; ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> y(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n y[i] = std::make_pair(x[i], i);\n }\n std::sort(y.begin(), y.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[y[i].second] = i;\n }\n}\n", + "\n\n // Create a copy of the input vector\n std::vector sorted_x = x;\n\n // Sort the copy\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks in parallel\n #pragma omp 
parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "(", + "\n\n size_t n = x.size();\n std::vector> pairs(n);\n for (size_t i = 0; i < n; ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n ranks.resize(n);\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "ranks", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n std::vector idx(n);\n std::iota(idx.begin(), idx.end(), 0);\n std::sort(idx.begin(), idx.end(), [&x](size_t i1, size_t i2) { return x[i1] < x[i2]; });\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n ranks[i] = idx[i];\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] == x[j]) {\n ranks[i] = j;\n break;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector indices(x.size());\n std::iota(indices.begin(), indices.end(), 0);\n std::sort(indices.begin(), indices.end(), [&x](size_t i1, size_t i2) { return x[i1] < x[i2]; });\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = indices[i];\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "serial", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n // Sort the vector x\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Create a map from the sorted values to their original indices\n std::map 
sorted_index_map;\n for (size_t i = 0; i < sorted_x.size(); ++i) {\n sorted_index_map[sorted_x[i]] = i;\n }\n\n // Iterate over the original vector and store the rank of each value\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = sorted_index_map[x[i]];\n }\n}\n", + "\n\n std::map value_to_rank;\n for (size_t i = 0; i < x.size(); ++i) {\n auto it = value_to_rank.find(x[i]);\n if (it == value_to_rank.end()) {\n value_to_rank[x[i]] = i;\n }\n }\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = value_to_rank[x[i]] + 1;\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks.push_back(std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i])));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(), std::find(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(), std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector x_sorted(x);\n std::sort(x_sorted.begin(), x_sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::lower_bound(x_sorted.begin(), 
x_sorted.end(), x[i]) - x_sorted.begin();\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n std::iota(ranks.begin(), ranks.end(), 0);\n std::sort(ranks.begin(), ranks.end(), [&](size_t i, size_t j) { return x[i] < x[j]; });\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "hip", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use AMD HIP to compute in parallel. The kernel will be launched with at least as many threads as elements in x.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\n__global__ void ranks(const float *x, size_t *ranks, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t tid = hipThreadIdx_x;\n if (tid < N) {\n float v = x[tid];\n size_t r = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < v) r++;\n }\n ranks[tid] = r;\n }\n}\n", + "\n\n // Get the index of the current thread\n const size_t tid = hipThreadIdx_x;\n\n // Compute the rank of the current element\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n float *ptr = x;\n size_t rank = 0;\n while (ptr < x + N && *ptr <= val) {\n if (*ptr == val) {\n rank++;\n }\n ptr++;\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float v = x[tid];\n int r = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < v) {\n ++r;\n }\n }\n ranks[tid] = r;\n }\n}\n", + "\n\n int idx = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (idx >= N) {\n return;\n }\n\n float my_value = x[idx];\n int my_rank = 1;\n for (int i = 0; i < N; i++) 
{\n if (x[i] < my_value) {\n my_rank++;\n }\n }\n ranks[idx] = my_rank;\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] < v) r++;\n }\n ranks[i] = r;\n }\n}\n", + "\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] < v) r++;\n }\n ranks[i] = r;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n float *sorted_x = (float *)hipMalloc(N * sizeof(float));\n hipMemcpy(sorted_x, x, N * sizeof(float), hipMemcpyHostToDevice);\n hipSort(sorted_x, sorted_x + N, hipSortAscending);\n for (int i = 0; i < N; i++) {\n if (sorted_x[i] == val) {\n ranks[gid] = i;\n break;\n }\n }\n hipFree(sorted_x);\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = hipThreadIdx_x;\n\n // Compute the rank of the current element\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n size_t idx = hipThreadIdx_x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n ranks[tid] = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < x[tid]) {\n ranks[tid]++;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n float val = x[idx];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every process has a complete copy of x. 
Store the result in ranks on process 0.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), ranks.size(), MPI_SIZE_T, MPI_MAX, 0,\n MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, ranks.size(), MPI_SIZE_T, MPI_MAX, 0,\n MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; i++) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Create a vector of ranks\n ranks.resize(n);\n\n // Get the rank of each value in x\n for (size_t i = 0; i < n; i++) {\n ranks[i] = std::find(indices.begin(), indices.end(), i) - indices.begin();\n }\n\n // Gather the ranks on process 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector ranks_i;\n MPI::COMM_WORLD.Recv(ranks_i.data(), ranks_i.size(), MPI::UNSIGNED_LONG, i, 0);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI::COMM_WORLD.Send(ranks.data(), ranks.size(), MPI::UNSIGNED_LONG, 0, 0);\n }\n}\n", + "\n\n // sort a copy of x on each process\n auto x_sorted = std::vector(x);\n std::sort(x_sorted.begin(), x_sorted.end());\n\n // find the rank of each value in x on each process\n std::vector> local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n auto it = std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]);\n local_ranks[i].push_back(std:: distance(x_sorted.begin(), it));\n }\n\n // gather the local_ranks on process 0\n std::gather(local_ranks, std::back_inserter(ranks), 0);\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; ++i) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Get the rank of each value in x\n ranks.resize(n);\n for (size_t i = 0; i < n; ++i) {\n ranks[i] = std::find(indices.begin(), indices.end(), i) - indices.begin();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, ranks.size(), MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n local_ranks[i]++;\n }\n }\n }\n\n std::vector global_ranks(x.size());\n MPI_Allgather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG,\n global_ranks.data(), x.size(), MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n ranks = global_ranks;\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; ++i) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Get the rank of each value in x\n ranks.resize(n);\n for (size_t i = 0; i < n; ++i) {\n ranks[indices[i]] = i;\n }\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n ranks = local_ranks;\n }\n\n MPI_Gather(&local_ranks[0], local_ranks.size(), MPI_UNSIGNED_LONG, &ranks[0],\n ranks.size(), MPI_UNSIGNED_LONG, 0, comm);\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the rank of each element in the sorted vector\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; ++i) {\n std::vector ranks_i;\n MPI_Recv(&ranks_i[0], ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Recv(recv_ranks.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n ranks[j] += recv_ranks[j];\n }\n }\n } else {\n MPI_Send(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = i;\n }\n\n std::sort(local_ranks.begin(), local_ranks.end(),\n [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n if (rank == 0) {\n ranks.resize(x.size());\n }\n\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(),\n MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector ranks_i;\n MPI_Recv(ranks_i.data(), ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // compute the ranks on each process\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // gather the ranks from all processes on process 0\n if (my_rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(),\n MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n 
MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG,\n 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the rank of each element in the sorted vector\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; ++i) {\n std::vector ranks_i;\n MPI_Recv(&ranks_i[0], ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector local_x = x;\n std::sort(local_x.begin(), local_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(local_x.begin(),\n std::lower_bound(local_x.begin(), local_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Recv(recv_ranks.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n ranks[j] += recv_ranks[j];\n }\n }\n } else {\n MPI_Send(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n size_t n = x.size();\n std::vector local_ranks(n);\n std::sort(x.begin(), x.end());\n for (size_t i = 0; i < n; ++i) {\n local_ranks[i] = std::lower_bound(x.begin(), x.end(), x[i]) - x.begin();\n }\n MPI_Gather(local_ranks.data(), n, MPI_UNSIGNED_LONG, ranks.data(), n, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "cuda", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use CUDA to compute in parallel. 
The kernel will be launched with at least as many threads as elements in x.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\n__global__ void ranks(const float *x, size_t *ranks, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n float v = x[i];\n int r = 0;\n for (size_t j = 0; j < N; ++j) {\n if (x[j] < v) {\n ++r;\n }\n }\n ranks[i] = r;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n ranks[i] = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n ranks[tid] = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ranks[tid]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float value = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n float *d_x = (float *)x;\n for (size_t i = 0; i < N; i++) {\n if (d_x[i] < val) {\n ranks[idx]++;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) rank++;\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t i = block.thread_id.x + block.block_id.x * block.block_dim.x;\n if (i < N) {\n float val = x[i];\n std::vector y(x, x + N);\n std::sort(y.begin(), y.end());\n ranks[i] = std::lower_bound(y.begin(), y.end(), val) - y.begin();\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] <= v) {\n r++;\n }\n }\n ranks[i] = r - 1;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n float *sorted_x = new float[N];\n thrust::copy(x, x + N, sorted_x);\n thrust::sort(sorted_x, sorted_x + N);\n ranks[idx] = thrust::lower_bound(sorted_x, sorted_x + N, val) - sorted_x;\n delete[] sorted_x;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n float *d_x = (float *)x;\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (d_x[i] < val) {\n 
++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = threadIdx.x;\n\n // Compute the rank of the current element\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n ranks[idx] = 0;\n for (int i = 0; i < N; ++i) {\n if (x[i] < x[idx]) {\n ranks[idx]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = threadIdx.x;\n\n // Compute the rank of the current element\n float current_element = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < current_element) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every process has a complete copy of x. 
Store the result in ranks on process 0.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the number of processes\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Create a vector of ranks on each process\n std::vector local_ranks(x.size());\n\n // Compute the ranks on each process\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n // Sort the local copy of x\n std::vector local_x = x;\n std::sort(local_x.begin(), local_x.end());\n\n // Find the rank of x[i] in the sorted vector\n auto it = std::lower_bound(local_x.begin(), local_x.end(), x[i]);\n local_ranks[i] = std::distance(local_x.begin(), it);\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG,\n ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]) - sorted_x.begin();\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector to store the ranks\n ranks.resize(n);\n\n // Sort the vector\n std::sort(x.begin(), x.end());\n\n // Find the rank of each element in the sorted vector\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n ranks[i] = std::distance(x.begin(), std::find(x.begin(), x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (ranks.size() > 0) {\n MPI_Gather(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG,\n &ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int world_size;\n MPI_Comm_size(MPI_COMM_WORLD, &world_size);\n\n std::vector local_x(x.begin() + my_rank * x.size() / world_size,\n x.begin() + (my_rank + 1) * x.size() / world_size);\n std::vector local_ranks(local_x.size());\n\n #pragma omp parallel for\n for (size_t i = 0; i < local_x.size(); 
++i) {\n for (size_t j = 0; j < local_x.size(); ++j) {\n if (local_x[i] < local_x[j]) {\n ++local_ranks[i];\n }\n }\n }\n\n MPI_Reduce(local_ranks.data(), ranks.data(), local_ranks.size(), MPI_SIZE_T, MPI_SUM, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector x_sorted = x;\n std::sort(x_sorted.begin(), x_sorted.end());\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n ranks[i] = std::distance(x_sorted.begin(), std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]));\n }\n if (my_rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), n, MPI_SIZE_T, MPI_MAX, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, n, MPI_SIZE_T, MPI_MAX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the ranks of each element in the sorted vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_ranks;\n MPI_Recv(&recv_ranks[0], x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), recv_ranks.begin(), recv_ranks.end());\n }\n } else {\n MPI_Send(&ranks[0], x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::sort(local_x.begin(), local_x.end());\n\n std::vector local_ranks(local_x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < local_x.size(); ++i) {\n local_ranks[i] = std::distance(x.begin(), std::lower_bound(x.begin(), x.end(), local_x[i]));\n }\n\n MPI_Gather(local_ranks.data(), local_ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), local_ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a copy of x on each process\n std::vector local_x = x;\n\n // Sort the local copy of x\n std::sort(local_x.begin(), local_x.end());\n\n // Create a vector to store the ranks\n std::vector local_ranks(x.size());\n\n // Compute the ranks of the elements in the local copy of x\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(local_x.begin(), std::find(local_x.begin(), local_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Only process 0 has the complete list of ranks\n if (rank == 0) {\n ranks = std::move(ranks);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]) - sorted_x.begin();\n }\n\n // Gather the ranks 
from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n ranks.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n if (my_rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), n, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, n, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n local_ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector local_ranks(x.size());\n #pragma opm parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(x.begin(), std::min_element(x.begin(), x.end()));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Status status;\n MPI_Probe(i, 0, MPI_COMM_WORLD, &status);\n MPI_Get_count(&status, MPI_BYTE, &ranks.size());\n MPI_Send(&local_ranks, ranks.size(), MPI_BYTE, i, 0, MPI_COMM_WORLD);\n }\n } else {\n MPI_Send(&local_ranks, local_ranks.size(), MPI_BYTE, 0, 0, 
MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t n = x.size();\n std::vector local_ranks(n);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n local_ranks[i] = std::distance(x.begin(), std::min_element(x.begin(), x.end()));\n }\n\n MPI_Gather(local_ranks.data(), n, MPI_UNSIGNED_LONG, ranks.data(), n, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int world_size;\n MPI_Comm_size(MPI_COMM_WORLD, &world_size);\n int local_size = x.size() / world_size;\n int start = my_rank * local_size;\n int end = start + local_size;\n if (my_rank == world_size - 1) {\n end = x.size();\n }\n std::vector local_ranks(local_size);\n std::vector local_x(local_size);\n std::copy(x.begin() + start, x.begin() + end, local_x.begin());\n std::sort(local_x.begin(), local_x.end());\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_ranks[i] = std::distance(local_x.begin(), std::lower_bound(local_x.begin(), local_x.end(), x[start + i]));\n }\n if (my_rank == 0) {\n ranks.resize(x.size());\n }\n MPI_Gather(local_ranks.data(), local_size, MPI_UNSIGNED_LONG, ranks.data(), local_size, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), x.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "(", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n // Each process sorts its own copy of x\n std::vector x_sorted = x;\n std::sort(x_sorted.begin(), x_sorted.end());\n\n // Each process computes the ranks of its own elements\n std::vector ranks_local(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks_local[i] = std::distance(x_sorted.begin(), std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]));\n }\n\n // Process 0 gathers the ranks from all processes\n if (my_rank == 0) {\n ranks = ranks_local;\n for (int i = 1; i < MPI_COMM_WORLD->size; ++i) {\n std::vector ranks_recv(x.size());\n MPI_Recv(ranks_recv.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_recv.begin(), ranks_recv.end());\n }\n } else {\n MPI_Send(ranks_local.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use Kokkos to sort in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Create a view of the input array that ignores zero valued elements\n auto x_nz = view(\"x_nz\", x, LayoutRight(), {x.extent(0), 0});\n // Sort the non-zero elements in ascending order\n parallel_sort(x_nz.extent(0), KOKKOS_LAMBDA(const int i) {\n x_nz(i) = x(i);\n });\n // Copy the sorted non-zero elements back to the input array\n parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x_nz(i);\n });\n}\n", + "\n\n auto exec_space = x.execution_space();\n auto view_size = x.extent(0);\n auto lambda = KOKKOS_LAMBDA(const int& i) {\n return x(i) != 0;\n };\n auto mask = Kokkos::create_mask(exec_space, view_size, lambda);\n auto sorted = Kokkos::sort(exec_space, mask, x);\n Kokkos::deep_copy(x, sorted);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = std::numeric_limits::max();\n }\n });\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i == std::numeric_limits::max()) {\n x_i = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sortIgnoreZero\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n // Create a mask to identify non-zero elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"init_mask\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the non-zero elements\n Kokkos::sort(mask, x);\n\n // Shift the non-zero elements to the beginning of the array\n int non_zero_count = 0;\n Kokkos::parallel_for(\"shift_non_zero\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (mask(i)) {\n x(non_zero_count++) = x(i);\n }\n });\n\n // Fill the remaining elements with zeros\n Kokkos::parallel_for(\"fill_zeros\", Kokkos::RangePolicy(non_zero_count, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = 0;\n });\n}\n", + "\n\n // Create a mask to identify non-zero elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"mask_init\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the non-zero elements\n Kokkos::sort(mask, x);\n\n // Shift the non-zero elements to the beginning of the array\n Kokkos::parallel_for(\"shift_non_zero\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (mask(i)) {\n Kokkos::atomic_fetch_add(&x(0), x(i));\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i != 0) {\n x_i = std::numeric_limits::max();\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i == 
std::numeric_limits::max()) {\n x_i = 0;\n }\n });\n}\n", + "\n\n using namespace Kokkos;\n\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n auto lambda = KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n x(i) = x(i) < 0 ? -x(i) : x(i);\n }\n };\n parallel_for(\"sortIgnoreZero\", policy, lambda);\n parallel_sort(policy, x.data(), x.data() + x.extent(0));\n lambda = KOKKOS_LAMBDA(const int &i) {\n if (x(i) < 0) {\n x(i) = -x(i);\n }\n };\n parallel_for(\"sortIgnoreZero\", policy, lambda);\n}\n", + "\n\n Kokkos::parallel_for(\"sortIgnoreZero\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::View y = x.filter([](int x) { return x != 0; });\n Kok\u30b3\u30af::sort(y);\n Kok\u30b3\u30af::View z = x.map([](int x) { return x == 0 ? 0 : y[0]; });\n y.copy_to(z);\n}\n", + "\n\n // Create a mask to ignore zero valued elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"mask\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the array using the mask\n Kokkos::sort(mask, x);\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i, int &count) {\n if (x(i) != 0) { count++; }\n });\n\n // Create a temporary array to store the non-zero elements\n Kokkos::View y(\"y\", count);\n\n // Copy the non-zero elements to the temporary array\n int j = 0;\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) { y(j++) = x(i); }\n });\n\n // Sort the temporary array\n Kokkos::sort(y);\n\n // Copy the sorted non-zero elements back to the original array\n j = 0;\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) { x(i) = y(j++); }\n });\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n using exec_space = Kokkos::DefaultExecutionSpace;\n using view_t = Kokkos::View;\n\n auto x_nonzero = view_t(\"x_nonzero\", x.extent(0));\n auto x_nonzero_count = 0;\n\n Kokkos::parallel_for(\n \"sortIgnoreZero\", policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) {\n x_nonzero(x_nonzero_count++) = x(i);\n }\n });\n\n Kokkos::sort(x_nonzero, x_nonzero_count);\n\n x_nonzero_count = 0;\n Kokkos::parallel_for(\n \"sortIgnoreZero\", policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) {\n x(i) = x_nonzero(x_nonzero_count++);\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i == 0) {\n x_i = x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), 
KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i == 0) {\n x_i = x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use OpenMP to sort x in parallel. \n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n ++count;\n }\n }\n\n // Create a vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j];\n j++;\n }\n }\n}\n", + "\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(non_zero.begin(), non_zero.end());\n }\n }\n\n // Insert the non-zero elements back into x\n int j = 0;\n for (int i = 
0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + ".", + "\n\n std::vector temp;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n temp.push_back(x[i]);\n }\n }\n std::sort(temp.begin(), temp.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = temp[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int n = 0;\n #pragma omp parallel for reduction(+:n)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n n++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(n);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[index++] = x[i];\n }\n }\n\n // Sort the new vector\n 
std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[index++];\n }\n }\n}\n", + "\n\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n std::sort(non_zero.begin(), non_zero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count non-zero elements\n int n = 0;\n #pragma omp parallel for reduction(+:n)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n n++;\n }\n }\n\n // Create a vector to store non-zero elements\n std::vector y(n);\n\n // Copy non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the non-zero elements\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n ++count;\n }\n }\n\n // Create a new vector to store the non-zero 
elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Create a vector to store the non-zero elements\n std::vector non_zero;\n\n // Iterate over the input vector and copy non-zero elements to the new vector\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Copy the sorted non-zero elements back to the input vector\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j++];\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "serial", + "prompt": "/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place.\n Example:\n\n\t input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a == 0 && b != 0) || a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return a != 0 && (b == 0 || a < b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a != 0 && b != 0) ? a < b : a == 0;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::stable_partition(x.begin(), x.end(),\n [](int a) { return a != 0; });\n std::sort(x.begin(), x.end());\n\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a == 0 && b != 0) || (a != 0 && b == 0) ? 
a > b : a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(),\n [](int a, int b) { return a != 0 && (a < b || b == 0); });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) {\n return false;\n } else if (b == 0) {\n return true;\n } else {\n return a < b;\n }\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::vector y;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) { y.push_back(x[i]); }\n }\n std::sort(y.begin(), y.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector sorted;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted.push_back(x[i]);\n }\n }\n std::sort(sorted.begin(), sorted.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sorted[j];\n j++;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "hip", + "prompt": "/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use AMD HIP to compute in parallel. The kernel will be launched with 1 thread per element.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\n__global__ void sortIgnoreZero(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int temp = x[tid];\n int i = tid - 1;\n while (i >= 0 && x[i] > temp) {\n x[i + 1] = x[i];\n i--;\n }\n x[i + 1] = temp;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n // sort non-zero elements\n __shared__ int sdata[256];\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n int stride = blockDim.x * gridDim.x;\n int offset = bid * stride;\n int count = 0;\n int val = x[gid];\n if (val != 0) {\n sdata[tid] = val;\n count++;\n }\n __syncthreads();\n int n = count;\n int i = tid;\n while (i < n) {\n sdata[i] = sdata[tid];\n i += stride;\n }\n __syncthreads();\n i = tid;\n while (i < n) {\n int j = i + 1;\n while (j < n) {\n if (sdata[i] > sdata[j]) {\n int tmp = sdata[i];\n sdata[i] = sdata[j];\n sdata[j] = tmp;\n }\n j += stride;\n }\n i += stride;\n }\n __syncthreads();\n i = tid;\n while (i < n) {\n x[offset + i] = sdata[i];\n i += stride;\n }\n}\n", + "\n\n // Find the number of non-zero elements\n int num_non_zero = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n num_non_zero++;\n }\n }\n\n // Create a temporary array to store the non-zero elements\n int *temp = new int[num_non_zero];\n\n // Copy the non-zero elements to the temporary array\n int j = 0;\n for (size_t i = 
0; i < N; i++) {\n if (x[i] != 0) {\n temp[j] = x[i];\n j++;\n }\n }\n\n // Sort the temporary array\n std::sort(temp, temp + num_non_zero);\n\n // Copy the sorted non-zero elements back to the original array\n j = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n x[i] = temp[j];\n j++;\n }\n }\n\n // Delete the temporary array\n delete[] temp;\n}\n", + "\n\n\t// Find the number of non-zero elements\n\tint count = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tcount++;\n\t\t}\n\t}\n\n\t// Create an array to store the non-zero elements\n\tint *y = new int[count];\n\n\t// Copy the non-zero elements to the new array\n\tint j = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\ty[j] = x[i];\n\t\t\tj++;\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tstd::sort(y, y + count);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tj = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = y[j];\n\t\t\tj++;\n\t\t}\n\t}\n\n\t// Delete the temporary array\n\tdelete[] y;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i - 1] > x[i]) {\n int temp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = temp;\n i--;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\tint j = tid + 1;\n\t\twhile (j < N && x[j] == 0) {\n\t\t\tj++;\n\t\t}\n\t\tif (j < N) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[j];\n\t\t\tx[j] = tmp;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N && x[i] != 0) {\n int j = i + 1;\n while (j < N && x[j] == 0) {\n j++;\n }\n if (j < N) {\n int temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i - 1] > x[i]) {\n int tmp = x[i - 1];\n x[i - 1] = x[i];\n x[i] = tmp;\n i--;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = tmp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint temp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n for (int i = tid + 1; i < N; i++) {\n if (x[i] != 0 && x[i] < x[tid]) {\n int tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n 
i--;\n }\n }\n }\n}\n", + "\n\n // Sort the non-zero elements\n thrust::sort(thrust::device_ptr(x), thrust::device_ptr(x + N), thrust::greater());\n\n // Count the number of non-zero elements\n int num_non_zero = thrust::count(thrust::device_ptr(x), thrust::device_ptr(x + N), thrust::not_equal_to(0));\n\n // Move the non-zero elements to the beginning of the array\n thrust::copy(thrust::device_ptr(x), thrust::device_ptr(x + num_non_zero), thrust::device_ptr(x));\n\n // Fill the remaining elements with zeros\n thrust::fill(thrust::device_ptr(x + num_non_zero), thrust::device_ptr(x + N), 0);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tif (x[tid] != 0) {\n\t\t\tint i = tid;\n\t\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\t\tint temp = x[i];\n\t\t\t\tx[i] = x[i - 1];\n\t\t\t\tx[i - 1] = temp;\n\t\t\t\ti--;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Sort the array in ascending order ignoring elements with value 0.\n\t// Leave zero valued elements in-place.\n\tint my_rank = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (my_rank < N && x[my_rank] != 0) {\n\t\tfor (int i = my_rank + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[my_rank]) {\n\t\t\t\tint temp = x[my_rank];\n\t\t\t\tx[my_rank] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use MPI to sort x in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Store the result in x on rank 0.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n for (int i = count; i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n // Gather all non-zero elements to rank 0\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(nonZero.data(), nonZero.size(), MPI_INT, x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Get the size of the communicator\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the sorted elements\n std::vector sorted_x;\n\n // Gather all the non-zero elements from all the ranks\n std::vector non_zero_elements;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_elements.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n\n // Scatter the sorted non-zero elements to all the ranks\n MPI_Scatter(non_zero_elements.data(), non_zero_elements.size() / size, MPI_INT,\n sorted_x.data(), non_zero_elements.size() / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into the original vector\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sorted_x[index++];\n }\n }\n}\n", + "\n\n // Get the size and rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size() * size);\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n if (rank != 0) {\n non_zero.resize(sorted_non_zero.size() / size);\n }\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into the original vector\n int index = 0;\n for (int i = 0; i < 
x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[index++];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size());\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, x.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with zeros\n for (int i = non_zero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int n = x.size();\n std::vector y(n);\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n y[count] = x[i];\n count++;\n }\n }\n std::sort(y.begin(), y.begin() + count);\n int index = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = y[index];\n index++;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_elements[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(non_zero_elements.data(), count / size, MPI_INT, x.data(), count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n for (int i = count / size; i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector x_non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x_non_zero.push_back(x[i]);\n }\n }\n std::sort(x_non_zero.begin(), x_non_zero.end());\n int count = x_non_zero.size();\n MPI_Gather(&count, 1, MPI_INT, NULL, 0, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n int total_count = 0;\n for (int i = 0; i < size; i++) {\n int count_i;\n 
MPI_Gather(&count_i, 1, MPI_INT, &count_i, 1, MPI_INT, i, MPI_COMM_WORLD);\n total_count += count_i;\n }\n x.resize(total_count);\n }\n MPI_Gatherv(x_non_zero.data(), x_non_zero.size(), MPI_INT, x.data(), NULL, NULL, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sortedNonZero;\n if (rank == 0) {\n sortedNonZero.resize(nonZero.size() * size);\n }\n MPI_Gather(nonZero.data(), nonZero.size(), MPI_INT, sortedNonZero.data(), nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements to all ranks\n if (rank != 0) {\n sortedNonZero.resize(nonZero.size());\n }\n MPI_Scatter(sortedNonZero.data(), nonZero.size(), MPI_INT, sortedNonZero.data(), nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into x\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sortedNonZero[index++];\n }\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Insert the non-zero elements back into the original vector\n int j = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Gather the non-zero elements from all ranks\n MPI_Gather(&x[0], x.size(), MPI_INT, &y[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements\n if (rank == 0) {\n std::sort(y.begin(), y.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&y[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); 
i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size());\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, x.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with zeros\n for (int i = non_zero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sortedNonZero;\n MPI_Gather(&nonZero[0], nonZero.size(), MPI_INT, &sortedNonZero[0], nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements to all ranks\n MPI_Scatter(&sortedNonZero[0], nonZero.size(), MPI_INT, &x[0], nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n for (int i = nonZero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int myrank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (myrank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // 
Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero_x.begin(), non_zero_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(non_zero_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n MPI_Gather(&non_zero[0], non_zero.size(), MPI_INT, &sorted_non_zero[0], non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&sorted_non_zero[0], non_zero.size(), MPI_INT, &x[0], non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the zero elements back into the vector\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = sorted_non_zero[index++];\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "cuda", + "prompt": "/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use CUDA to compute in parallel. 
The kernel will be launched with 1 thread per element.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\n__global__ void sortIgnoreZero(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the number of non-zero elements\n\tint num_non_zero = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnum_non_zero++;\n\t\t}\n\t}\n\n\t// Allocate memory for the non-zero elements\n\tint *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n\t// Copy the non-zero elements to the new array\n\tint non_zero_index = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnon_zero_elements[non_zero_index] = x[i];\n\t\t\tnon_zero_index++;\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tthrust::sort(non_zero_elements, non_zero_elements + num_non_zero);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tnon_zero_index = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = non_zero_elements[non_zero_index];\n\t\t\tnon_zero_index++;\n\t\t}\n\t}\n\n\t// Free the memory allocated for the non-zero elements\n\tfree(non_zero_elements);\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid >= N) return;\n\tif (x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[tid] > x[i]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = tmp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + ".", + "\n\n\tint tid = threadIdx.x;\n\tif (x[tid] != 0) {\n\t\tx[tid] = x[tid] > 0 ? 
x[tid] : -x[tid];\n\t}\n}\n", + "\n\n\t// Each thread sorts its own element\n\tif (x[threadIdx.x] != 0) {\n\t\tfor (int i = threadIdx.x + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[threadIdx.x] > x[i]) {\n\t\t\t\tint temp = x[threadIdx.x];\n\t\t\t\tx[threadIdx.x] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Find the number of non-zero elements\n\tint num_non_zero = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnum_non_zero++;\n\t\t}\n\t}\n\n\t// Allocate memory for the non-zero elements\n\tint *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n\t// Copy the non-zero elements to the new array\n\tint idx = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnon_zero_elements[idx++] = x[i];\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tthrust::sort(non_zero_elements, non_zero_elements + num_non_zero);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tidx = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = non_zero_elements[idx++];\n\t\t}\n\t}\n\n\t// Free the memory allocated for the non-zero elements\n\tfree(non_zero_elements);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = bid * blockDim.x + tid;\n\n\tif (i < N && x[i] != 0) {\n\t\tint j = i + 1;\n\t\twhile (j < N && x[j] == 0) {\n\t\t\tj++;\n\t\t}\n\t\tif (j < N) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[j];\n\t\t\tx[j] = tmp;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i - 1] > x[i]) {\n\t\t\tint temp = x[i - 1];\n\t\t\tx[i - 1] = x[i];\n\t\t\tx[i] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "Ignore the zero value in the list.", + "\n\n\t// Get the thread index\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\n\t// If the element is not zero, sort it\n\tif (x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint start = bid * bid_size + tid;\n\tint end = start + bid_size;\n\tif (start >= N) return;\n\tif (end > N) end = N;\n\tint count = 0;\n\tfor (int i = start; i < end; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[count++] = x[i];\n\t\t}\n\t}\n\tfor (int i = count; i < end; i++) {\n\t\tx[i] = 0;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tswap(x[i], x[i - 1]);\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint temp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\t// Sort the non-zero elements\n\tint *d_sorted = (int *)malloc(N * sizeof(int));\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\td_sorted[count] = x[i];\n\t\t\tcount++;\n\t\t}\n\t}\n\tthrust::sort(d_sorted, d_sorted + count);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tcount = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = d_sorted[count];\n\t\t\tcount++;\n\t\t}\n\t}\n\tfree(d_sorted);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint val = x[tid];\n\tif (val == 0) {\n\t\treturn;\n\t}\n\tint i = tid;\n\twhile (i > 0 && x[i - 1] > val) 
{\n\t\tx[i] = x[i - 1];\n\t\ti--;\n\t}\n\tx[i] = val;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tif (x[tid] != 0) {\n\t\t\tint i = tid;\n\t\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\t\tswap(x[i], x[i - 1]);\n\t\t\t\ti--;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n // Find the number of non-zero elements\n int num_non_zero = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n num_non_zero++;\n }\n }\n\n // Create an array to store the non-zero elements\n int *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n // Copy the non-zero elements to the new array\n int index = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n non_zero_elements[index] = x[i];\n index++;\n }\n }\n\n // Sort the non-zero elements\n qsort(non_zero_elements, num_non_zero, sizeof(int), compare);\n\n // Copy the sorted non-zero elements back to the original array\n index = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n x[i] = non_zero_elements[index];\n index++;\n }\n }\n\n // Free the memory allocated for the non-zero elements array\n free(non_zero_elements);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use MPI and OpenMP to sort x in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the number of processes\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Create a vector to store the sorted values\n std::vector sorted(x.size());\n\n // Gather the sorted values from each rank\n MPI_Gather(x.data(), x.size(), MPI_INT, sorted.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // If the current rank is 0, store the sorted values in x\n if (rank == 0) {\n x = sorted;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort x on each rank\n std::sort(x.begin(), x.end());\n\n // Gather the sorted x from all ranks to rank 0\n std::vector gathered_x;\n if (rank == 0) {\n gathered_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, gathered_x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the gathered x on rank 0\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end());\n x = gathered_x;\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Scatter the sorted non-zero elements to all ranks\n MPI_Scatter(non_zero.data(), non_zero.size() / nprocs, MPI_INT,\n x.data(), non_zero.size() / nprocs, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Gather the sorted non-zero elements from all ranks to rank 0\n MPI_Gather(x.data(), non_zero.size() / nprocs, MPI_INT,\n non_zero.data(), non_zero.size() / nprocs, MPI_INT, 0, MPI_COMM_WORLD);\n\n // If rank is 0, insert the sorted non-zero elements into x\n if (rank == 0) {\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n continue;\n }\n x[i] = non_zero[index++];\n }\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector x_nonzero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x_nonzero.push_back(x[i]);\n }\n }\n std::sort(x_nonzero.begin(), x_nonzero.end());\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = x_nonzero[count];\n count++;\n }\n }\n if (my_rank == 0) {\n x.resize(n);\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n std::vector x_local(n);\n std::vector x_sorted(n);\n std::copy(x.begin(), x.end(), x_local.begin());\n std::sort(x_local.begin(), x_local.end(), [](int a, int b) { return a != 0 && (a < b); });\n MPI_Gather(x_local.data(), n, MPI_INT, x_sorted.data(), n, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n x = x_sorted;\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector non_zero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n std::sort(non_zero.begin(), non_zero.end());\n int non_zero_size = non_zero.size();\n int recv_count[MPI_COMM_SIZE];\n int recv_displs[MPI_COMM_SIZE];\n MPI_Gather(&non_zero_size, 1, MPI_INT, recv_count, 1, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n recv_displs[0] = 0;\n for (int i = 1; i < MPI_COMM_SIZE; i++) {\n recv_displs[i] = recv_displs[i - 1] + recv_count[i - 1];\n }\n }\n MPI_Gatherv(non_zero.data(), non_zero_size, MPI_INT, x.data(), recv_count, recv_displs, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n int j = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] == 0) {\n continue;\n }\n x[j] = x[i];\n j++;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Allocate memory for the non-zero elements\n std::vector 
y(count);\n\n // Copy the non-zero elements to y\n int index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[index++] = x[i];\n }\n }\n\n // Sort y\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to x on rank 0\n if (rank == 0) {\n index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[index++];\n }\n }\n }\n}\n", + "\n\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n int n = x.size();\n std::vector x_nonzero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x_nonzero.push_back(x[i]);\n }\n }\n std::sort(x_nonzero.begin(), x_nonzero.end());\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = x_nonzero[count];\n count++;\n }\n }\n if (myrank == 0) {\n x.resize(n);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the gathered non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector nonZero;\n if (rank == 0) {\n nonZero.resize(count);\n }\n MPI_Gather(&x[0], x.size(), MPI_INT, &nonZero[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(nonZero.begin(), nonZero.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&nonZero[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted.begin(), sorted.end());\n\n // Scatter the 
sorted non-zero elements to rank 0\n MPI_Scatter(sorted.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Allocate memory for the sorted array\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n MPI_Gather(&x[0], x.size(), MPI_INT, &sorted_x[0], global_count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the gathered array on rank 0\n if (my_rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end());\n }\n\n // Scatter the sorted array back to all ranks\n MPI_Scatter(&sorted_x[0], global_count / nprocs, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 fills in the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Create a vector of size x.size() to store the sorted values\n std::vector sorted_x(x.size());\n\n // Create a vector of size x.size() to store the indices of the sorted values\n 
std::vector sorted_indices(x.size());\n\n // Sort the indices of x in ascending order\n std::iota(sorted_indices.begin(), sorted_indices.end(), 0);\n std::sort(sorted_indices.begin(), sorted_indices.end(),\n [&x](int i, int j) { return x[i] < x[j]; });\n\n // Create a vector of size x.size() to store the sorted values\n std::vector sorted_x(x.size());\n\n // Copy the sorted values from x into sorted_x\n for (int i = 0; i < x.size(); i++) {\n sorted_x[i] = x[sorted_indices[i]];\n }\n\n // Broadcast the sorted values to all ranks\n MPI_Bcast(&sorted_x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Copy the sorted values into x on rank 0\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to their original positions\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = i;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted.begin(), sorted.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) 
{\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) count++;\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) std::sort(non_zero.begin(), non_zero.end());\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) x[i] = -1;\n }\n\n // Sort the elements in-place on each rank\n std::sort(x.begin(), x.end());\n\n // Replace -1 with 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == -1) x[i] = 0;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(Kokkos::View const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View sorted_x(\"sorted_x\", x.size());\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i) {\n sorted_x(i) = x(i);\n });\n Kokkos::sort(sorted_x.data(), sorted_x.data() + sorted_x.size());\n return sorted_x(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a Kokkos view of the input vector\n auto x_view = Kokkos::create_mirror_view(x);\n\n // Create a Kokkos view of the output vector\n auto y_view = Kokkos::create_mirror_view(1);\n\n // Create a Kokkos kernel to find the k-th smallest element\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x_view.extent(0)),\n KOKKOS_LAMBDA(const int i, int& y) {\n if (i < k) {\n y = std::min(y, x_view(i));\n }\n },\n y_view);\n\n // Copy the output vector back to the host\n int y;\n Kokkos::deep_copy(y, y_view);\n\n return y;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n\n // Find the k-th smallest element using a parallel reduction.\n int kth_smallest = parallel_reduce(\n Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i < k) {\n kth_smallest = std::min(kth_smallest, x(i));\n }\n },\n k);\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n auto x_copy = Kokkos::View(n);\n Kokkos::deep_copy(x_copy, x);\n\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& result) {\n if (i == k - 1) {\n result = x_copy[i];\n }\n }, tag_type());\n\n return x_copy[k - 1];\n}\n", + "\n\n Kokkos::View y(\"y\", x.size());\n Kokkos::parallel_for(\"copy\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i) {\n y(i) = x(i);\n });\n Kokkos::sort(y);\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n using l_d_type = 
Kokkos::LayoutLeft;\n using l_d_view_type = Kokkos::View;\n\n const int n = x.extent(0);\n l_d_view_type y(\"y\", n);\n\n parallel_for(policy_type(0, n), KOKKOS_LAMBDA(const int& i) { y(i) = x(i); }, tag_type());\n\n Kokkos::sort(y);\n\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n auto x_copy = Kokkos::View(n);\n Kokkos::deep_copy(x_copy, x);\n\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& result) {\n if (i == k - 1) {\n result = x_copy(i);\n }\n }, tag_type());\n\n return x_copy(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n int n = x.extent(0);\n if (k > n) {\n throw std::invalid_argument(\"k cannot be greater than the size of the vector\");\n }\n\n auto kth_smallest = parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(const int& i, int& kth) {\n if (i < k) {\n kth = std::min(kth, x(i));\n }\n },\n std::numeric_limits::max());\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Tag = Kokkos::Tag;\n\n int n = x.extent(0);\n if (k > n) {\n throw std::invalid_argument(\"k must be less than or equal to the size of the vector\");\n }\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(Policy(0, n), x, Tag());\n\n // Return the k-th smallest element\n return x(k - 1);\n}\n", + "\n\n Kok\u30b3\u30af::View y = Kok\u30b3\u30af::sort(x);\n return y[k - 1];\n}\n", + "\n\n // Create a Kokkos view for the input vector x\n Kokkos::View x_host = Kokkos::create_mirror_view(x);\n\n // Create a Kokkos view for the output vector y\n Kokkos::View y_host = Kokkos::create_mirror_view(Kokkos::View(1));\n\n // Create a Kokkos parallel for loop to find the k-th smallest element\n Kokkos::parallel_for(\"findKthSmallest\", Kokkos::RangePolicy(0, x_host.size()), KOKKOS_LAMBDA(int i) {\n // Sort the input vector x\n std::sort(x_host.begin(), x_host.end());\n\n // Get the k-th smallest element\n y_host(0) = x_host(k - 1);\n });\n\n // Copy the output vector y back to the device\n Kokkos::deep_copy(x, y_host);\n\n // Return the k-th smallest element\n return y_host(0);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n int kth_smallest = 0;\n Kokkos::parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i == k - 1) {\n kth_smallest = x(i);\n }\n },\n kth_smallest, tag_type());\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a Kokkos view of the input vector\n View y = Kokkos::create_mirror_view(x);\n\n // Sort the vector in parallel\n parallel_sort(y.extent(0), KOKKOS_LAMBDA(int i, int j) { return y(i) < y(j); });\n\n // Return the k-th smallest element\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n\n int n = x.extent(0);\n int kth_smallest = 0;\n Kokkos::parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(const int& i, int& kth_smallest) {\n if (i < k) {\n kth_smallest = std::max(kth_smallest, x(i));\n }\n },\n kth_smallest);\n return kth_smallest;\n}\n", + "\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(int i, int j) { return x(i) < x(j); });\n\n // Return the k-th smallest element\n return x(k - 1);\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::Work;\n\n const int n = 
static_cast(x.extent(0));\n Kokkos::View y(\"y\", n);\n\n parallel_for(Work(n), KOKKOS_LAMBDA(int i) {\n y[i] = x[i];\n });\n\n std::sort(y.data(), y.data() + n);\n\n return y[k - 1];\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n int n = x.extent(0);\n int kth = 0;\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(policy_t(0, n), x);\n\n // Get the k-th smallest element\n kth = x(k - 1);\n\n return kth;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n int kth_smallest = -1;\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i == k - 1) {\n kth_smallest = x(i);\n }\n }, tag_type());\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a parallel reduction to find the k-th smallest element\n auto kth_smallest = reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& min) {\n if (i < k) {\n min = std::min(min, x(i));\n }\n }, min);\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using view_type = Kokkos::View;\n\n // Create a copy of the input vector\n view_type y = Kokkos::create_mirror_view(x);\n\n // Sort the copy in parallel\n Kokkos::parallel_sort(y.extent(0), KOKKOS_LAMBDA(int i, int j) { return y(i) < y(j); });\n\n // Return the k-th smallest element\n return y(k - 1);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n int kth = 0;\n #pragma omp parallel for reduction(min:kth)\n for (int i = 0; i < n; i++) {\n if (x[i] < kth || kth == 0) {\n kth = x[i];\n }\n }\n return kth;\n}\n", + "\n\n // Sort the vector in parallel\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n for (int j = i + 1; j < x.size(); ++j) {\n if (x[i] > x[j]) {\n std::swap(x[i], x[j]);\n }\n }\n }\n }\n\n // Return the k-th smallest element\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n // Create a shared variable to store the k-th smallest element\n int kth_smallest;\n\n // Create a parallel region\n #pragma omp parallel\n {\n // Each thread computes the k-th smallest element of its local subarray\n int local_kth_smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < 
local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n\n // Update the shared variable with the minimum of the local k-th smallest elements\n #pragma omp critical\n {\n if (local_kth_smallest < kth_smallest) {\n kth_smallest = local_kth_smallest;\n }\n }\n }\n\n // Return the k-th smallest element\n return kth_smallest;\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(sorted_x.begin(), sorted_x.end());\n }\n }\n return sorted_x[k - 1];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "serial", + "prompt": "/* Find the k-th smallest element of the vector x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th element of the sorted vector.\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th smallest element.\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 
1];\n}\n", + "\n\n int n = x.size();\n if (k < 1 || k > n) {\n throw std::invalid_argument(\"k must be between 1 and n\");\n }\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n if (k < 1 || k > n) {\n throw std::invalid_argument(\"k must be between 1 and the size of the vector.\");\n }\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "=4", + "\n\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n", + "\n\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n", + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th smallest element.\n return x[k - 1];\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "hip", + "prompt": "/* Find the k-th smallest element of the vector x.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\n__global__ void findKthSmallest(const int *x, size_t N, int k, int *kthSmallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int local_min = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n local_min = min(local_min, x[i]);\n }\n atomicMin(kthSmallest, local_min);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int stride = blockDim.x;\n while (stride < N) {\n if (tid < N / stride) {\n if (kthSmallest[tid] > kthSmallest[tid + stride]) {\n int tmp = kthSmallest[tid];\n kthSmallest[tid] = kthSmallest[tid + stride];\n kthSmallest[tid + stride] = tmp;\n }\n }\n __syncthreads();\n stride *= 2;\n }\n if (tid == 0) {\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n hipSort(kthSmallest, kthSmallest + N, hipSortAscending);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int 
tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int local_min = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n local_min = min(local_min, x[i]);\n }\n atomicMin(kthSmallest, local_min);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid == 0) {\n *kthSmallest = x[tid];\n } else {\n if (x[tid] < *kthSmallest) {\n *kthSmallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int numBlocks = hipGridDim_x;\n int numThreads = hipBlockDim_x;\n int numElements = numBlocks * numThreads;\n int numLevels = 0;\n while (numElements > 1) {\n numLevels++;\n numElements = (numElements + numThreads - 1) / numThreads;\n }\n for (int level = 0; level < numLevels; level++) {\n int stride = 1 << level;\n int offset = tid & (stride - 1);\n int other = tid ^ offset;\n if (offset < other && other < numElements) {\n if (kthSmallest[tid] > kthSmallest[other]) {\n int tmp = kthSmallest[tid];\n kthSmallest[tid] = kthSmallest[other];\n kthSmallest[other] = tmp;\n }\n }\n __syncthreads();\n }\n if (tid == 0) {\n *kthSmallest = kthSmallest[0];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = blockId * blockDim + tid;\n int end = min(start + blockDim, N);\n int local_kthSmallest = x[start];\n for (int i = start + 1; i < end; i++) {\n if (x[i] < local_kthSmallest) {\n local_kthSmallest = x[i];\n }\n }\n atomicMin(kthSmallest, local_kthSmallest);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n __shared__ int s[hipBlockDim_x];\n s[tid] = x[tid];\n __syncthreads();\n if (tid == 0) {\n hipSort(s, hipBlockDim_x);\n *kthSmallest = s[k - 1];\n }\n }\n}\n", + "K", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int *d_temp = new int[N];\n for (int i = 0; i < N; i += 2) {\n if (i + 1 < N) {\n if (kthSmallest[i] > kthSmallest[i + 1]) {\n d_temp[i / 2] = kthSmallest[i + 1];\n d_temp[i / 2 + N / 2] = kthSmallest[i];\n } else {\n d_temp[i / 2] = kthSmallest[i];\n d_temp[i / 2 + N / 2] = kthSmallest[i + 1];\n }\n } else {\n d_temp[i / 2] = kthSmallest[i];\n }\n }\n kthSmallest = d_temp;\n __syncthreads();\n if (N % 2 == 1) {\n N = N / 2 + 1;\n } else {\n N = N / 2;\n }\n while (N > 1) {\n for (int i = 0; i < N; i += 2) {\n if (i + 1 < N) {\n if (kthSmallest[i] > kthSmallest[i + 1]) {\n d_temp[i / 2] = 
kthSmallest[i + 1];\n d_temp[i / 2 + N / 2] = kthSmallest[i];\n } else {\n d_temp[i / 2] = kthSmallest[i];\n d_temp[i / 2 + N / 2] = kthSmallest[i + 1];\n }\n } else {\n d_temp[i / 2] = kthSmallest[i];\n }\n }\n kthSmallest = d_temp;\n __syncthreads();\n if (N % 2 == 1) {\n N = N / 2 + 1;\n } else {\n N = N / 2;\n }\n }\n *kthSmallest = kthSmallest[0];\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n int local_kthSmallest = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < local_kthSmallest) {\n local_kthSmallest = x[i];\n }\n }\n __shared__ int shared_kthSmallest;\n shared_kthSmallest = local_kthSmallest;\n for (int i = hipBlockDim_x / 2; i > 0; i /= 2) {\n __syncthreads();\n if (hipThreadIdx_x < i) {\n if (shared_kthSmallest > x[tid + i]) {\n shared_kthSmallest = x[tid + i];\n }\n }\n }\n __syncthreads();\n if (hipThreadIdx_x == 0) {\n if (k == 0) {\n *kthSmallest = shared_kthSmallest;\n } else {\n k--;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_ranks);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_ranks);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int local_kth = x[k - 1];\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n MPI_Allreduce(&local_kth, &kth, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return kth;\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth_smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n\n int global_kth_smallest;\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements 
on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements to rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among all the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the k-th smallest elements from all ranks\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int n = x.size();\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kthSmallest = x[k - 1];\n MPI_Gather(&kthSmallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (myRank == 0) {\n std::sort(kthSmallest, kthSmallest + numRanks);\n return kthSmallest[k - 1];\n }\n return -1;\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, 
MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + ".", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(nprocs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1; // Return -1 on all other ranks\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int n = x.size();\n int local_k = k / num_procs;\n int local_start = my_rank * local_k;\n int local_end = (my_rank + 1) * local_k;\n if (my_rank == num_procs - 1) {\n local_end = n;\n }\n\n std::nth_element(x.begin() + local_start, x.begin() + local_start + local_k - 1,\n x.begin() + local_end);\n int local_kth_smallest = x[local_start + local_k - 1];\n\n int global_kth_smallest;\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN,\n MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(comm_size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (comm_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the k-th smallest element on each rank\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n int local_kth = x[k - 1];\n\n // Gather the k-th smallest elements from all ranks\n std::vector gathered_kth(size);\n MPI_Gather(&local_kth, 1, MPI_INT, gathered_kth.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element\n if (rank == 0) {\n std::nth_element(gathered_kth.begin(), 
gathered_kth.begin() + k - 1, gathered_kth.end());\n return gathered_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int n = x.size();\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes its local k-th smallest element\n std::sort(x.begin(), x.end());\n int local_kth_smallest = x[k - 1];\n\n // Gather all local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "cuda", + "prompt": "/* Find the k-th smallest element of the vector x.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\n__global__ void findKthSmallest(const int *x, size_t N, int k, int *kthSmallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n int val = x[gid];\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < val) {\n count++;\n }\n }\n if (count == k - 1) {\n *kthSmallest = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[0] = min(kthSmallest[0], x[tid]);\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n for (int i = 1; i < blockDim.x; i++) {\n kthSmallest[0] = min(kthSmallest[0], kthSmallest[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = tid + blockId * blockDim;\n int end = min(start + blockDim, N);\n int localKthSmallest = x[start];\n for (int i = start + 1; i < end; i++) {\n if (x[i] < localKthSmallest) {\n localKthSmallest = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n int globalKthSmallest = localKthSmallest;\n for (int i = 1; i < gridDim; i++) {\n int otherKthSmallest;\n cudaMemcpy(&otherKthSmallest, &globalKthSmallest, sizeof(int), cudaMemcpyDeviceToDevice);\n if (otherKthSmallest < globalKthSmallest) {\n globalKthSmallest = otherKthSmallest;\n }\n }\n *kthSmallest = globalKthSmallest;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockDim = blockDim.x;\n int blockIdx = blockIdx.x;\n int i = blockIdx * blockDim + tid;\n if (i < N) {\n kthSmallest[tid] = x[i];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + blockDim);\n if (blockIdx == 0) {\n *kthSmallest = kthSmallest[k - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n __shared__ int shared[blockDim.x];\n shared[threadIdx.x] = x[tid];\n __syncthreads();\n for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {\n if (threadIdx.x < stride) {\n shared[threadIdx.x] = min(shared[threadIdx.x], shared[threadIdx.x + stride]);\n }\n __syncthreads();\n }\n if (threadIdx.x == 0) {\n if (k == blockIdx.x) {\n *kthSmallest = shared[0];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n int temp;\n for (int i = 1; i < blockDim.x; i++) {\n temp = *kthSmallest;\n *kthSmallest = min(*kthSmallest, x[i]);\n x[i] = min(temp, x[i]);\n }\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n temp = *kthSmallest;\n *kthSmallest = min(*kthSmallest, x[i]);\n x[i] = min(temp, x[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int idx = bid * bid_size + tid;\n \n // Find the k-th smallest element in the current block\n int block_kth_smallest = x[idx];\n for (int i = idx + bid_size; i < N; i += grid_size * bid_size) {\n if (x[i] < block_kth_smallest) {\n block_kth_smallest = x[i];\n }\n }\n \n // Find the k-th smallest element among all 
blocks\n if (tid == 0) {\n for (int i = 1; i < grid_size; i++) {\n int other_block_kth_smallest;\n cudaMemcpy(&other_block_kth_smallest, &x[i * bid_size], sizeof(int), cudaMemcpyDeviceToDevice);\n if (other_block_kth_smallest < block_kth_smallest) {\n block_kth_smallest = other_block_kth_smallest;\n }\n }\n *kthSmallest = block_kth_smallest;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[0] = min(kthSmallest[0], x[tid]);\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < blockDim.x; i++) {\n kthSmallest[0] = min(kthSmallest[0], kthSmallest[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockDim = blockDim.x;\n int blockIdx = blockIdx.x;\n int idx = blockIdx * blockDim + tid;\n if (idx < N) {\n if (tid == 0) {\n *kthSmallest = x[idx];\n } else {\n if (x[idx] < *kthSmallest) {\n *kthSmallest = x[idx];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[0] = x[tid];\n }\n __syncthreads();\n int numThreads = blockDim.x * gridDim.x;\n while (numThreads > 1) {\n if (tid < numThreads / 2) {\n kthSmallest[tid] = min(kthSmallest[tid], kthSmallest[tid + numThreads / 2]);\n }\n __syncthreads();\n numThreads /= 2;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int val = x[tid];\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < val) {\n count++;\n }\n }\n if (count == k - 1) {\n *kthSmallest = val;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n __shared__ int shared[blockDim.x];\n shared[threadIdx.x] = x[tid];\n __syncthreads();\n for (int i = 0; i < blockDim.x; i++) {\n if (shared[threadIdx.x] > shared[i]) {\n shared[threadIdx.x] = shared[i];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n if (k == blockIdx.x + 1) {\n *kthSmallest = shared[threadIdx.x];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int my_kth_smallest = x[tid];\n for (int i = tid + 1; i < N; i++) {\n if (x[i] < my_kth_smallest) {\n my_kth_smallest = x[i];\n }\n }\n atomicMin(kthSmallest, my_kth_smallest);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n int tid_warp = tid % 32;\n int warp_id = tid / 32;\n int warp_kthSmallest = *kthSmallest;\n for (int i = 1; i < 32; i *= 2) {\n __syncthreads();\n if (tid_warp < i) {\n if (warp_kthSmallest > x[tid + i * blockDim.x]) {\n warp_kthSmallest = x[tid + i * blockDim.x];\n }\n }\n }\n __syncthreads();\n if (warp_id == 0) {\n *kthSmallest = warp_kthSmallest;\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n int other_kthSmallest;\n cudaMemcpy(&other_kthSmallest, &x[i * blockDim.x], sizeof(int), cudaMemcpyDeviceToDevice);\n if (*kthSmallest > other_kthSmallest) {\n *kthSmallest = other_kthSmallest;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n if (tid == k - 1) {\n *kthSmallest = x[tid];\n }\n }\n}\n", + "\n\n 
int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n int temp;\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (x[i] > x[j]) {\n temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n *kthSmallest = x[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int start_idx = bid * bid_size + tid;\n int end_idx = min(start_idx + bid_size, N);\n int local_kth_smallest = x[start_idx];\n for (int i = start_idx + 1; i < end_idx; i++) {\n if (x[i] < local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n atomicMin(kthSmallest, local_kth_smallest);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Find the k-th smallest element of the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth = x[k - 1];\n int global_kth;\n\n // Find the k-th smallest element on each rank\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < local_kth) {\n local_kth = x[i];\n }\n }\n }\n\n // Reduce the local k-th smallest elements to find the global k-th smallest element\n MPI_Allreduce(&local_kth, &global_kth, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n // Return the global k-th smallest element on rank 0\n if (rank == 0) {\n return global_kth;\n } else {\n return -1;\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector to store the sorted elements\n std::vector sorted_x(n);\n\n // Sort the vector in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n sorted_x[i] = x[i];\n }\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Get the k-th smallest element\n int kth_smallest = sorted_x[k - 1];\n\n // Return the k-th smallest element on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n return all_kth_smallest[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, 
nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the global k-th smallest element on rank 0\n return global_kth_smallest;\n}\n", + "\n\n int n = x.size();\n int kth = -1;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_k = k / size;\n int local_start = rank * local_k;\n int local_end = (rank + 1) * local_k;\n if (local_end > n) {\n local_end = n;\n }\n std::nth_element(x.begin() + local_start, x.begin() + local_start + local_k - 1, x.begin() + local_end);\n int local_kth = x[local_start + local_k - 1];\n MPI_Gather(&local_kth, 1, MPI_INT, &kth, 1, MPI_INT, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n std::vector kth_vec(size);\n for (int i = 0; i < size; i++) {\n kth_vec[i] = kth[i];\n }\n std::nth_element(kth_vec.begin(), kth_vec.begin() + k - 1, kth_vec.end());\n kth = kth_vec[k - 1];\n }\n return kth;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kth_smallest;\n MPI_Gather(&x[k - 1], 1, MPI_INT, &kth_smallest, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the local kth smallest element\n std::nth_element(x.begin() + rank * x.size() / size,\n x.begin() + rank * x.size() / size + x.size() / size - 1,\n x.begin() + rank * x.size() / size + k - 1);\n int local_kth = *(x.begin() + rank * x.size() / size + k - 1);\n\n // Gather all local kth smallest elements\n std::vector all_local_kth(size);\n MPI_Gather(&local_kth, 1, MPI_INT, all_local_kth.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global kth smallest element\n if (rank == 0) {\n std::nth_element(all_local_kth.begin(), all_local_kth.begin() + k - 1, all_local_kth.end());\n return all_local_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_k = k / size;\n int local_remainder = k % size;\n int local_start = rank * local_k;\n int local_end = local_start + local_k;\n if (rank < local_remainder) {\n local_start += rank;\n local_end += rank + 1;\n } else {\n local_start += local_remainder;\n local_end += local_remainder;\n }\n\n std::vector local_x(x.begin() + local_start, x.begin() + local_end);\n std::sort(local_x.begin(), local_x.end());\n\n int local_kth_smallest = local_x[local_k - 1];\n if (rank == 0) {\n std::vector kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest.begin(), kth_smallest.end());\n return kth_smallest[k - 1];\n } else {\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kth_smallest;\n MPI_Gather(&x[k - 1], 1, MPI_INT, &kth_smallest, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (rank == 0) {\n std::vector received_elements(size);\n MPI_Gather(&x[k - 1], 1, MPI_INT, received_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(received_elements.begin(), received_elements.end());\n return received_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the global k-th smallest element on rank 0\n return global_kth_smallest;\n}\n", + "\n\n // Find the local k-th smallest element on each rank.\n int local_kth_smallest = *std::max_element(x.begin(), x.end());\n int global_kth_smallest;\n\n // Find the global k-th smallest element.\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN);\n\n // Find the k-th smallest element on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n } else {\n return global_kth_smallest;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), 
all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the result on rank 0\n return global_kth_smallest;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int local_k = k / nprocs;\n int local_remainder = k % nprocs;\n\n // Find the local k-th smallest element on each rank\n std::nth_element(x.begin(), x.begin() + local_k + (rank == nprocs - 1 ? local_remainder : 0), x.end());\n int local_kth_smallest = x[local_k + (rank == nprocs - 1 ? local_remainder : 0)];\n\n // Gather all local k-th smallest elements on rank 0\n std::vector kth_smallest_elements(nprocs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (rank == 0) {\n std::nth_element(kth_smallest_elements.begin(), kth_smallest_elements.begin() + k - 1, kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth = x[0];\n int global_kth;\n\n // Find the k-th smallest element on each rank\n #pragma omp parallel reduction(min : local_kth)\n {\n #pragma omp for\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < local_kth) {\n local_kth = x[i];\n }\n }\n }\n\n // Gather the local k-th smallest elements from all ranks\n MPI_Gather(&local_kth, 1, MPI_INT, &global_kth, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element\n if (rank == 0) {\n std::vector gathered_kth(size);\n for (int i = 0; i < size; i++) {\n gathered_kth[i] = global_kth[i];\n }\n std::sort(gathered_kth.begin(), gathered_kth.end());\n return gathered_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector gathered_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, gathered_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(gathered_elements.begin(), gathered_elements.end());\n global_kth_smallest = gathered_elements[k - 1];\n }\n\n return global_kth_smallest;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x.\n std::sort(x.begin(), x.end());\n\n // Each rank 
sends its k-th smallest element to rank 0.\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element.\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n return all_kth_smallest[k - 1];\n }\n\n return -1; // Dummy return value for non-rank-0 processes.\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int local_k = k / num_ranks;\n int local_start = my_rank * local_k;\n int local_end = (my_rank + 1) * local_k;\n\n std::sort(x.begin() + local_start, x.begin() + local_end);\n\n int local_kth_smallest = x[local_start + local_k - 1];\n\n int global_kth_smallest;\n MPI_Reduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the k-th smallest element of its local data\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n int local_kth_smallest = x[k - 1];\n\n // Gather all local k-th smallest elements to rank 0\n std::vector all_local_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_local_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among all local k-th smallest elements\n if (rank == 0) {\n std::nth_element(all_local_kth_smallest.begin(), all_local_kth_smallest.begin() + k - 1, all_local_kth_smallest.end());\n return all_local_kth_smallest[k - 1];\n } else {\n return -1; // Dummy return value for non-rank-0 processes\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use Kokkos to sort in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(Kokkos::View &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA (const int& i, const int& j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n using exec_space = KokKOS::EXECSPACE;\n using device_type = KokKOS::DEVICE_TYPE;\n\n auto policy = KokKOS::parallel_sort_ascending(results,\n [](const Result &a, const Result &b) { return a.duration < b.duration; },\n exec_space(),\n device_type());\n auto result = KokKOS::sort_view(results, policy);\n results.copy(result);\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) -> bool { return results(i)->startTime < results(j)->startTime; });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int i, const int j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int& i, const int& j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::sort(results, Kokkos::Experimental::compare_fun(\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; 
}));\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, results.extent(0)),\n [results](const int &i, const int &j) { return results(i)->startTime < results(j)->startTime; });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use OpenMP to sort in parallel.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + ".", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = 0; j < results.size() - 1; j++) {\n if (results[j].startTime > results[j + 1].startTime) {\n std::swap(results[j], results[j + 1]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < static_cast(results.size()); i++) {\n for (int j = i + 1; j < static_cast(results.size()); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size() - 1; i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n#pragma opm parallel\n#pragma opm reduction(merge:results:opm_sort(results, [](const Result& a, const Result& b) { return a.startTime < b.startTime; }))\n{\n#pragma opm sections\n{\n#pragma opm section\n{\n std::sort(results.begin(), 
results.end(), [](const Result& a, const Result& b) { return a.startTime < b.startTime; });\n}\n}\n}\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size() - 1; i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma opm parallel\n {\n #pragma opm for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "serial", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n 
});\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "<*>", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.start < b.start;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "hip", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use AMD HIP to sort in parallel. 
The kernel is launched with at least as many threads as there are elements.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\n__global__ void sortByStartTime(Result *results, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) {\n return;\n }\n\n Result temp = results[idx];\n int minIdx = idx;\n for (int i = idx + 1; i < N; i++) {\n if (results[i].startTime < results[minIdx].startTime) {\n minIdx = i;\n }\n }\n\n results[idx] = results[minIdx];\n results[minIdx] = temp;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid >= N) return;\n for (int i = gid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int 
bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n __shared__ Result sharedResults[256];\n sharedResults[tid] = results[gid];\n __syncthreads();\n int stride = 256;\n while (stride > 0) {\n if (tid < stride && tid + stride < N && sharedResults[tid].startTime > sharedResults[tid + stride].startTime) {\n Result temp = sharedResults[tid];\n sharedResults[tid] = sharedResults[tid + stride];\n sharedResults[tid + stride] = temp;\n }\n __syncthreads();\n stride /= 2;\n }\n results[gid] = sharedResults[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid >= N) {\n return;\n }\n for (int i = gid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use MPI to sort in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of results. Store the output in results on rank 0.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n results = gatheredResults;\n } else {\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n 
}\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.start < b.start; });\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n std::vector gatheredResults;\n for (int i = 1; i < size; i++) {\n int numResults;\n MPI_Recv(&numResults, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n std::vector recvResults(numResults);\n MPI_Recv(recvResults.data(), numResults * sizeof(Result), MPI_BYTE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n gatheredResults.insert(gatheredResults.end(), recvResults.begin(), recvResults.end());\n }\n results = gatheredResults;\n } else {\n // Send the number of results and the results to rank 0\n int numResults = results.size();\n MPI_Send(&numResults, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);\n MPI_Send(results.data(), numResults * sizeof(Result), MPI_BYTE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n 
std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(size * results.size());\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; 
});\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the sorted results on rank 0\n if (rank == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort locally\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather all results to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_DOUBLE, gatheredResults.data(), results.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Sort gathered results\n if (rank == 0) 
{\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "cuda", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use CUDA to sort in parallel. The kernel is launched with at least as many threads as there are elements.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\n__global__ void sortByStartTime(Result *results, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[i].startTime < results[tid].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < 
N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + ".", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result tmp = results[tid];\n results[tid] = results[i];\n results[i] = tmp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use MPI and OpenMP to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of results. 
Store the outputs in results on rank 0.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks on rank 0\n std::vector allResults(results.size() * numRanks);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, allResults.data(), results.size(), MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n\n // Sort all results on rank 0\n if (myRank == 0) {\n std::sort(allResults.begin(), allResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = allResults;\n }\n}\n", + "\n\n int numRanks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather all results to rank 0\n std::vector allResults;\n if (rank == 0) {\n allResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, allResults.data(), results.size(),\n MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort all results on rank 0\n if (rank == 0) {\n std::sort(allResults.begin(), allResults.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = allResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(size * results.size());\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, gatheredResults.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the gathered results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, gatheredResults.data(), results.size(), MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, nullptr, 0, MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), 
results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, gatheredResults.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results = gatheredResults;\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the outputs in results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result 
&b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Sort results by start time using MPI_Allgather and OpenMP\n int numRanks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort results on each rank\n #pragma openmp parallel\n {\n #pragma openmp for\n for (int i = 0; i < results.size(); i++) {\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n }\n }\n\n // Gather sorted results on rank 0\n std::vector sortedResults;\n if (rank == 0) {\n int totalSize = results.size() * numRanks;\n MPI_Gather(&results[0], results.size(), MPI_DATATYPE_RESULT, &results[0], results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results.resize(totalSize);\n } else {\n MPI_Gather(&results[0], results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Get the number of processes\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Get the rank of the process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the results on each process\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather the sorted results from all processes\n std::vector sorted_results(results.size() * num_procs);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT,\n sorted_results.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n\n // Copy the sorted results back to the input vector on rank 0\n if (rank == 0) {\n results = std::move(sorted_results);\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the gathered results in results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n 
std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recvResults;\n MPI_Recv(&recvResults, results.size(), MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n results.insert(results.end(), recvResults.begin(), recvResults.end());\n }\n } else {\n MPI_Send(&results, results.size(), MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Sort the array x of complex numbers by their magnitude in ascending order.\n Use Kokkos to sort in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), [&](int i, int j) -> bool {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n 
Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use OpenMP to sort in parallel.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Sort the vector in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the number of threads\n int num_threads = omp_get_num_threads();\n\n // Get the thread ID\n int thread_id = omp_get_thread_num();\n\n // Sort the portion of the vector assigned to this thread\n std::sort(x.begin() + thread_id, x.begin() + thread_id + num_threads);\n }\n\n // Merge the sorted portions of the vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "Magnitude:1. Moderato Moderato Moderato: 0. Moderato: Moderato: Moderato:forte forte forte piano piano piano piano piano piano piano: piano: piano: Moderato: piano: piano: forte forte forte: forte forte forte: forte: forte: piano piano piano piano piano piano: Piano: Piano: Moderato: Moderato: Moderato: piano: piano piano: forte forte forte: forte: forte: forte: Moderato:", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::polar(std::abs(x[i]), std::arg(x[i]));\n }\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n for (size_t j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::complex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "(x)", + "\n\n // Sort the vector x by the magnitude of its elements in ascending order.\n 
std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::complex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n // Sort the vector x by the magnitude of its elements in ascending order.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "serial", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + 
"\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + ":", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return abs(a) < abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "hip", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use AMD HIP to sort in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\n__global__ void sortComplexByMagnitude(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double mag = hipCrealf(x[tid]) * hipCrealf(x[tid]) + hipCrealf(x[tid]) * hipCrealf(x[tid]);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n double temp = x[tid].x;\n x[tid].x = x[i].x;\n x[i].x = temp;\n hipDoubleComplex temp_c = x[tid];\n x[tid] = x[i];\n x[i] = temp_c;\n }\n }\n }\n}\n", + "\n\n // Compute the magnitude of each complex number\n float *mag = new float[N];\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes\n std::sort(mag, mag + N);\n\n // Sort the complex numbers by their magnitudes\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (mag[i] == sqrt(x[j].x * x[j].x + x[j].y * x[j].y)) {\n x[i] = x[j];\n break;\n }\n }\n }\n\n delete[] mag;\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Get the block index\n int bid = hipBlockIdx_x;\n\n // Calculate the index of the element to be sorted\n int i = bid * hipBlockDim_x + tid;\n\n // Check if the index is within the bounds of the array\n if (i < N) {\n // Get the magnitude of the complex number\n double magnitude = hip_abs(x[i]);\n\n // Sort the complex number by its magnitude\n for (int j = i + 1; j < N; j++) {\n double magnitude2 = hip_abs(x[j]);\n if (magnitude > magnitude2) {\n // Swap the 
complex numbers\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n\n // Update the magnitude of the complex number\n magnitude = magnitude2;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double mag = hip_abs(x[gid]);\n x[gid] = make_hipDoubleComplex(mag, 0.0);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double magnitude = hip_abs(x[tid]);\n x[tid] = make_hipDoubleComplex(magnitude, 0.0);\n }\n __syncthreads();\n hipSort(x, x + N, hipDoubleComplexGreater);\n __syncthreads();\n if (tid < N) {\n double real = hip_real(x[tid]);\n double imag = hip_imag(x[tid]);\n x[tid] = make_hipDoubleComplex(real, imag);\n }\n}\n", + "\n\n // Get thread and block indices\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n\n // Compute the index of the element to be processed by this thread\n int index = bid * bid_size + tid;\n\n // If the index is out of range, exit\n if (index >= N) {\n return;\n }\n\n // Get the current element\n hipDoubleComplex current_element = x[index];\n\n // Compute the magnitude of the current element\n double magnitude = hip_abs(current_element);\n\n // Find the index of the element with the smallest magnitude so far\n int min_index = index;\n for (int i = index + 1; i < N; i++) {\n double current_magnitude = hip_abs(x[i]);\n if (current_magnitude < magnitude) {\n min_index = i;\n magnitude = current_magnitude;\n }\n }\n\n // Swap the current element with the element with the smallest magnitude so far\n hipDoubleComplex temp = x[index];\n x[index] = x[min_index];\n x[min_index] = temp;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = hipMag(x[tid]);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n hipSort(x, x + N, hipDoubleComplexCompare);\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n hipDoubleComplex c = x[tid];\n x[tid] = hipDoubleComplex{sqrt(c.x * c.x + c.y * c.y), 0.0};\n }\n}\n", + "\n\n // Sort the array using the radix sort algorithm\n for (int i = 0; i < N; i++) {\n int key = abs(x[i]);\n int j = i - 1;\n\n while (j >= 0 && abs(x[j]) > key) {\n x[j + 1] = x[j];\n j--;\n }\n x[j + 1] = hipDoubleComplex{key, 0};\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Sort the local part of the array\n std::sort(x + tid, x + tid + hipBlockDim_x);\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted parts of the array\n for (int stride = hipBlockDim_x; stride < N; stride *= 2) {\n for (int i = 0; i < N; i += 2 * stride) {\n if (hipAbs(x[i]) > hipAbs(x[i + stride])) {\n std::swap(x[i], x[i + stride]);\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Get the block index\n int bid = hipBlockIdx_x;\n\n // Calculate the index of the element to be sorted\n int index = bid * hipBlockDim_x + tid;\n\n // Check if the index is within the bounds of the array\n if (index < N) {\n // Get the magnitude of the complex number\n double magnitude = hipMag(x[index]);\n\n // Sort the complex number by 
its magnitude\n for (int i = index + 1; i < N; i++) {\n double magnitude_i = hipMag(x[i]);\n if (magnitude > magnitude_i) {\n // Swap the complex numbers\n hipDoubleComplex temp = x[index];\n x[index] = x[i];\n x[i] = temp;\n\n // Update the magnitude of the complex number\n magnitude = magnitude_i;\n }\n }\n }\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Sort the local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + hipBlockDim_x; j < N; j += hipBlockDim_x) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n // Compute the magnitude of each complex number\n float *mag = new float[N];\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes in ascending order\n std::sort(mag, mag + N);\n\n // Sort the complex numbers by their magnitude\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (mag[i] == sqrt(x[j].x * x[j].x + x[j].y * x[j].y)) {\n x[i] = x[j];\n break;\n }\n }\n }\n\n delete[] mag;\n}\n", + "\n\n // Calculate the magnitude of each complex number\n float *mag = (float *)malloc(N * sizeof(float));\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrtf(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes using the hipSort function\n hipSort(mag, N, hipFloat);\n\n // Sort the complex numbers based on the sorted magnitudes\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (mag[i] > mag[j]) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n free(mag);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double magnitude = hipMagnitude(x[tid]);\n x[tid].x = magnitude;\n }\n __syncthreads();\n if (tid < N) {\n hipSort(x, N, sizeof(hipDoubleComplex), hipDoubleComplexMagnitudeCompare);\n }\n}\n", + "\n\n // Sort the vector x by magnitude in ascending order.\n // Use AMD HIP to sort in parallel. 
The kernel is launched with at least as many threads as elements in x.\n // Example:\n //\n // input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n // output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n\n // Get the thread index.\n int tid = hipThreadIdx_x;\n\n // Get the block index.\n int bid = hipBlockIdx_x;\n\n // Get the block size.\n int bsize = hipBlockDim_x;\n\n // Get the grid size.\n int gsize = hipGridDim_x;\n\n // Calculate the starting index for this thread.\n int start = tid + bid * bsize;\n\n // Calculate the ending index for this thread.\n int end = min(start + bsize, N);\n\n // Sort the elements in this thread's range.\n sort(x + start, x + end, [](hipDoubleComplex a, hipDoubleComplex b) { return abs(a) < abs(b); });\n\n // Synchronize all threads.\n hipDeviceSynchronize();\n\n // If this is the first thread in the grid, merge the sorted ranges.\n if (tid == 0) {\n for (int i = 1; i < gsize; i++) {\n // Get the starting index for this thread.\n int start = i * bsize;\n\n // Get the ending index for this thread.\n int end = min(start + bsize, N);\n\n // Merge the sorted ranges.\n merge(x, x + start, x + start, x + end, x);\n }\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Get the block ID\n int bid = hipBlockIdx_x;\n\n // Get the block size\n int bsize = hipBlockDim_x;\n\n // Get the number of blocks\n int nblocks = hipGridDim_x;\n\n // Calculate the starting index for this thread\n int start = tid + bid * bsize;\n\n // Calculate the ending index for this thread\n int end = min(start + bsize, N);\n\n // Sort the local portion of the array\n sort(x + start, x + end, [](hipDoubleComplex a, hipDoubleComplex b) {\n return abs(a) < abs(b);\n });\n\n // Synchronize all threads\n hipDeviceSynchronize();\n\n // Merge the sorted local portions of the array\n for (int i = 1; i < nblocks; i *= 2) {\n if (bid % (2 * i) == 0) {\n int partner = bid + i;\n if (partner < nblocks) {\n int start = i * bsize;\n int end = min(start + 2 * bsize, N);\n merge(x + start, x + start + bsize, x + start + bsize, x + end,\n x + start);\n }\n }\n hipDeviceSynchronize();\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Sort the local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted local elements\n for (int s = 1; s < N; s *= 2) {\n for (int i = tid; i < N; i += 2 * s) {\n int j = i + s;\n if (j < N && hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ hipDoubleComplex block[1024];\n block[threadIdx.x] = x[threadIdx.x];\n __syncthreads();\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(block[i]) > hipAbs(block[j])) {\n hipDoubleComplex temp = block[i];\n block[i] = block[j];\n block[j] = temp;\n }\n }\n }\n __syncthreads();\n\n // Copy the sorted elements back to the global memory\n x[threadIdx.x] = block[threadIdx.x];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use MPI to sort in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered;\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, store the gathered vector in x\n if (rank == 0) {\n x = gathered;\n }\n}\n", + "\n\n // Get the size and rank of the current process\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the local vector of complex numbers\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 
MPI_COMM_WORLD);\n\n // Sort the gathered vector\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector to rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vectors on rank 0\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vectors from rank 0 to all ranks\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n 
MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::sort(x.begin(), x.end(), [&magnitudes](std::complex a, std::complex b) {\n return magnitudes[a] < magnitudes[b];\n });\n\n // Gather the sorted vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector> recv;\n MPI::COMM_WORLD.Recv(recv, i, 0);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI::COMM_WORLD.Send(x, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Collect the sorted vectors on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(&recv, x.size(), MPI_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = std::move(sorted_x);\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector to rank 0\n MPI_Scatter(gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks 
to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Replace the original vector on rank 0 with the sorted vector\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the size and rank of the communicator\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted;\n if (rank == 0) {\n sorted.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, comm);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(sorted.begin(), sorted.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(sorted.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, comm);\n}\n", + "\n\n // Get the size and rank of the communicator\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::vector> sorted_x(n);\n for (int i = 0; i < n; i++) {\n for (int j = 0; j < n; j++) {\n if (std::abs(x[j]) == magnitudes[i]) {\n sorted_x[i] = x[j];\n break;\n }\n }\n }\n\n // Gather the sorted vector on rank 0\n MPI_Gather(&sorted_x[0], n, MPI_DOUBLE_COMPLEX, &x[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n // Get the magnitude of the complex number\n double magnitude = std::abs(x[i]);\n // Store the magnitude in the vector\n mag[i] = magnitude;\n }\n\n // Sort the vector of magnitudes\n 
std::sort(mag.begin(), mag.end());\n\n // Sort the vector of complex numbers by magnitude\n std::sort(x.begin(), x.end(), [&](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::vector> sorted_x(n);\n for (int i = 0; i < n; i++) {\n for (int j = 0; j < n; j++) {\n if (std::abs(x[j]) == magnitudes[i]) {\n sorted_x[i] = x[j];\n break;\n }\n }\n }\n\n // Gather the sorted vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector> recv_x(n);\n MPI::COMM_WORLD.Recv(recv_x.data(), n, MPI::DOUBLE, i, 0);\n sorted_x.insert(sorted_x.end(), recv_x.begin(), recv_x.end());\n }\n } else {\n MPI::COMM_WORLD.Send(sorted_x.data(), n, MPI::DOUBLE, 0, 0);\n }\n\n // Update the vector x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n x = sorted_x;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "cuda", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use CUDA to sort in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\n__global__ void sortComplexByMagnitude(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(mag, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n cuDoubleComplex tmp = x[tid];\n int i = tid;\n while (i > 0 && cuCabs(x[i - 1]) > cuCabs(tmp)) {\n x[i] = x[i - 1];\n i--;\n }\n x[i] = tmp;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid].x = x[tid].y;\n x[tid].y = 0.0;\n }\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ cuDoubleComplex block_data[256];\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int block_size = blockDim.x;\n int start = bid * block_size;\n int end = min(start + block_size, N);\n for (int i = start; i < end; i++) {\n block_data[tid] = x[i];\n __syncthreads();\n for (int j = tid + 1; j < block_size; j++) {\n if (abs(block_data[tid]) > abs(block_data[j])) {\n cuDoubleComplex temp = block_data[tid];\n block_data[tid] = block_data[j];\n block_data[j] = temp;\n }\n }\n __syncthreads();\n }\n\n // Copy the sorted elements back to the global memory\n for (int i = start; i < end; i++) {\n x[i] = block_data[tid];\n __syncthreads();\n }\n}\n", + "\n\n // Get the thread ID\n int tid = threadIdx.x;\n\n // Sort the local part of the array\n thrust::sort(x + tid, x + tid + min(N - tid, blockDim.x),\n [](cuDoubleComplex a, cuDoubleComplex b) { return cuCabs(a) < 
cuCabs(b); });\n\n // Synchronize threads\n __syncthreads();\n\n // Merge the sorted parts of the array\n for (int s = blockDim.x; s <= N; s *= 2) {\n if (tid < N - s) {\n cuDoubleComplex a = x[tid];\n cuDoubleComplex b = x[tid + s];\n if (cuCabs(a) > cuCabs(b)) {\n x[tid] = b;\n x[tid + s] = a;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = cuCabs(x[tid]);\n x[tid] = cuCmul(x[tid], cuCdiv(1.0, magnitude));\n }\n}\n", + "\n\n // Get the thread index\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n\n // If the thread index is within the bounds of the array, sort the element\n if (tid < N) {\n // Calculate the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n\n // Sort the element by magnitude\n for (int i = tid + 1; i < N; i++) {\n double otherMagnitude = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n if (magnitude > otherMagnitude) {\n // Swap the elements\n cuDoubleComplex temp = x[tid];\n x[tid] = x[i];\n x[i] = temp;\n\n // Update the magnitude of the current element\n magnitude = otherMagnitude;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = mag;\n x[tid].y = 0.0;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid].x = creal(x[tid]);\n x[tid].y = cimag(x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n while (i < N) {\n cuDoubleComplex temp;\n for (int j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = magnitude;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n double temp = x[tid].x;\n x[tid].x = x[i].x;\n x[i].x = temp;\n cuDoubleComplex temp2 = x[tid];\n x[tid] = x[i];\n x[i] = temp2;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n for (int i = tid + 1; i < N; i++) {\n double mag_i = cuCabs(x[i]);\n if (mag > mag_i) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n mag = mag_i;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= N) return;\n \n // Compute the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n \n // Store the magnitude in shared memory\n __shared__ double magnitudes[N];\n magnitudes[tid] = magnitude;\n \n // Synchronize threads\n __syncthreads();\n \n // Sort the magnitudes in shared memory\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (magnitudes[i] > magnitudes[j]) {\n double temp = magnitudes[i];\n magnitudes[i] = magnitudes[j];\n magnitudes[j] = temp;\n }\n }\n }\n \n // Synchronize threads\n __syncthreads();\n \n // Find the index of the complex number with the given magnitude\n int index = -1;\n for (int i = 0; i < N; i++) {\n if (magnitudes[i] == magnitude) {\n index = i;\n break;\n }\n }\n \n // Swap the complex numbers\n cuDoubleComplex temp = 
x[tid];\n x[tid] = x[index];\n x[index] = temp;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= N) return;\n \n double magnitude = cuCabs(x[tid]);\n x[tid] = cuCmul(x[tid], cuCdiv(1.0, magnitude));\n \n __shared__ double shared_magnitude[blockDim.x];\n shared_magnitude[threadIdx.x] = magnitude;\n __syncthreads();\n \n for (int i = 0; i < blockDim.x; i++) {\n if (tid + i + 1 < N && shared_magnitude[tid] > shared_magnitude[tid + i + 1]) {\n double temp = shared_magnitude[tid];\n shared_magnitude[tid] = shared_magnitude[tid + i + 1];\n shared_magnitude[tid + i + 1] = temp;\n \n cuDoubleComplex temp_complex = x[tid];\n x[tid] = x[tid + i + 1];\n x[tid + i + 1] = temp_complex;\n }\n }\n \n}\n", + "\n\n // Each thread sorts a single element of x.\n int tid = threadIdx.x;\n if (tid < N) {\n // Compute the magnitude of the complex number.\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n // Store the magnitude in the complex number.\n x[tid].x = magnitude;\n }\n // Sort the complex numbers by their magnitude.\n thrust::sort(x, x + N, thrust::less());\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ cuDoubleComplex block[32];\n block[threadIdx.x] = x[blockIdx.x * blockDim.x + threadIdx.x];\n __syncthreads();\n for (int i = 0; i < blockDim.x; i++) {\n for (int j = i + 1; j < blockDim.x; j++) {\n if (cuCabsf(block[i]) > cuCabsf(block[j])) {\n cuDoubleComplex temp = block[i];\n block[i] = block[j];\n block[j] = temp;\n }\n }\n }\n __syncthreads();\n\n // Copy the sorted elements back to the global memory\n x[blockIdx.x * blockDim.x + threadIdx.x] = block[threadIdx.x];\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n cuDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n // Each thread sorts a single element\n int tid = threadIdx.x;\n if (tid < N) {\n // Compute the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n // Store the magnitude in shared memory\n __shared__ double magnitudes[N];\n magnitudes[tid] = magnitude;\n // Synchronize threads\n __syncthreads();\n // Sort the magnitudes in shared memory\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (magnitudes[i] > magnitudes[j]) {\n double temp = magnitudes[i];\n magnitudes[i] = magnitudes[j];\n magnitudes[j] = temp;\n }\n }\n }\n // Synchronize threads\n __syncthreads();\n // Find the index of the sorted magnitude in the original array\n int index = -1;\n for (int i = 0; i < N; i++) {\n if (magnitudes[tid] == sqrt(x[i].x * x[i].x + x[i].y * x[i].y)) {\n index = i;\n break;\n }\n }\n // Swap the complex number at the current index with the complex number at the sorted index\n if (index != -1) {\n cuDoubleComplex temp = x[tid];\n x[tid] = x[index];\n x[index] = temp;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(magnitude, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n x[tid] = make_cuDoubleComplex(creal(x[tid]), cimag(x[tid]));\n }\n}\n", + "\n\n // Get the index of the current element\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n\n // If the index is out of bounds, return\n if (idx >= N) {\n return;\n }\n\n // Get the magnitude of the current element\n double magnitude = 
abs(x[idx]);\n\n // Find the index of the element with the smallest magnitude\n int minIdx = idx;\n for (int i = idx + 1; i < N; i++) {\n double otherMagnitude = abs(x[i]);\n if (otherMagnitude < magnitude) {\n minIdx = i;\n magnitude = otherMagnitude;\n }\n }\n\n // Swap the current element with the element with the smallest magnitude\n if (minIdx != idx) {\n double temp = x[idx];\n x[idx] = x[minIdx];\n x[minIdx] = temp;\n }\n}\n", + "\n\n // Sort the complex numbers by their magnitude in ascending order.\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n cuDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(mag, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n int i = tid;\n while (i > 0 && cuCreal(x[i]) < cuCreal(x[i - 1])) {\n swap(x[i], x[i - 1]);\n i--;\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid] = make_cuDoubleComplex(cuCreal(x[tid]), cuCimag(x[tid]));\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use MPI and OpenMP to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector x on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted 
vectors from all ranks on rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector of complex numbers.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(size * x.size());\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector x on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < 
std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector x.\n if (rank != 0) {\n x.clear();\n }\n\n // Copy the sorted vector to x on rank 0.\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Sort the vector x on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vector x from all ranks on rank 0\n std::vector> sorted_x(x.size() * nranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_PRECISION, sorted_x.data(), x.size(), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector x on rank 0\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Scatter the sorted vector x from rank 0 to all ranks\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_PRECISION, x.data(), x.size(), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n 
}\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the local vector.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all other ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int nprocs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 
0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * nprocs);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vector and store it in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Sort the vector x on each rank by magnitude in ascending order.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> gathered_x(x.size() * MPI::world().size());\n MPI::world().all_gather(x.data(), x.size(), MPI::DOUBLE_REAL, gathered_x.data(), x.size(), MPI::DOUBLE_REAL);\n\n // Replace the vector x on rank 0 with the gathered sorted vectors.\n if (MPI::world().rank() == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, copy the sorted vector back to x.\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector x.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": 
"cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ii++) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); jj++) {\n if (input(ii, jj) == 1) { num_neighbors++; }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++num_ones;\n }\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ii++) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); jj++) {\n if (ii == i && jj == j) continue;\n num_ones += (input(ii, jj) == 1);\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++num_ones;\n }\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int numNeighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && input(ii, jj) == 1) {\n ++numNeighbors;\n }\n }\n }\n output(i, j) = (numNeighbors == 1) ? 
1 : 0;\n });\n}\n", + "Kokkoss_t_t", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy<>(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++count;\n }\n }\n }\n output(i, j) = (count == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const size_t i, const size_t j) {\n int sum = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input(ii, jj);\n }\n }\n }\n output(i, j) = (sum == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& j) {\n int count = 0;\n if (i > 0) count += input(i - 1, j);\n if (i < N - 1) count += input(i + 1, j);\n if (j > 0) count += input(i, j - 1);\n if (j < N - 1) count += input(i, j + 1);\n output(i, j) = (count == 1);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { sum += input(ii, jj); }\n }\n }\n output(i, j) = (sum == 1);\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (ii == i && jj == j) continue;\n num_neighbors += input(ii, jj);\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && input(ii, jj) == 1) {\n num_neighbors++;\n }\n }\n }\n output(i, j) = (num_neighbors == 1) ? 1 : 0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cells\", Kokkos::RangePolicy>(N, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++count;\n }\n }\n }\n output(i, j) = (count == 1);\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1][j];\n if (i < N - 1) num_ones += input[i + 1][j];\n if (j > 0) num_ones += input[i][j - 1];\n if (j < N - 1) num_ones += input[i][j + 1];\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) {\n count += input[i - 1][j];\n }\n if (i < N - 1) {\n count += input[i + 1][j];\n }\n if (j > 0) {\n count += 
input[i][j - 1];\n }\n if (j < N - 1) {\n count += input[i][j + 1];\n }\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma opm simd\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0 && input[i - 1][j] == 1) count++;\n if (i < N - 1 && input[i + 1][j] == 1) count++;\n if (j > 0 && input[i][j - 1] == 1) count++;\n if (j < N - 1 && input[i][j + 1] == 1) count++;\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n 
#pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1 + j * N];\n if (i < N - 1) num_neighbors += input[i + 1 + j * N];\n if (j > 0) num_neighbors += input[i + (j - 1) * N];\n if (j < N - 1) num_neighbors += input[i + (j + 1) * N];\n output[i + j * N] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j] == 1) ++num_neighbors;\n if (j > 0 && input[i][j - 1] == 1) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1] == 1) ++num_neighbors;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1][j];\n if (i < N - 1) num_ones += input[i + 1][j];\n if (j > 0) num_ones += input[i][j - 1];\n if (j < N - 1) num_ones += input[i][j + 1];\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n size_t num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "serial", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. 
Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0 && input[i - 1][j] == 1) ++count;\n if (i < N - 1 && input[i + 1][j] == 1) ++count;\n if (j > 0 && input[i][j - 1] == 1) ++count;\n if (j < N - 1 && input[i][j + 1] == 1) ++count;\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (i + di >= 0 && i + di < N && j + dj >= 0 && j + dj < N) {\n num_ones += input[(i + di) * N + j + dj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n 
}\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (size_t k = i - 1; k <= i + 1; k++) {\n for (size_t l = j - 1; l <= j + 1; l++) {\n if (k >= 0 && k < N && l >= 0 && l < N && (k != i || l != j)) {\n num_neighbors += input[k * N + l];\n }\n }\n }\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j] == 1) ++num_neighbors;\n if (j > 0 && input[i][j - 1] == 1) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1] == 1) ++num_neighbors;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; 
j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i][j] = (num_neighbors == 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "hip", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\n__global__ void cellsXOR(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) { sum += input[ny * N + nx]; }\n }\n }\n output[y * N + x] = (sum == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = count == 1 ? 
1 : 0;\n }\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n if (x > 0) num_neighbors += input[y * N + x - 1];\n if (x < N - 1) num_neighbors += input[y * N + x + 1];\n if (y > 0) num_neighbors += input[(y - 1) * N + x];\n if (y < N - 1) num_neighbors += input[(y + 1) * N + x];\n output[y * N + x] = (num_neighbors == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int count = 0;\n if (i > 0) count += input[i - 1 + j * N];\n if (i < N - 1) count += input[i + 1 + j * N];\n if (j > 0) count += input[i + (j - 1) * N];\n if (j < N - 1) count += input[i + (j + 1) * N];\n output[i + j * N] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1 + j * N];\n if (i < N - 1) num_ones += input[i + 1 + j * N];\n if (j > 0) num_ones += input[i + (j - 1) * N];\n if (j < N - 1) num_ones += input[i + (j + 1) * N];\n output[i + j * N] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int num_ones = 0;\n if (x - 1 >= 0) num_ones += input[(x - 1) * N + y];\n if (x + 1 < N) num_ones += input[(x + 1) * N + y];\n if (y - 1 >= 0) num_ones += input[x * N + y - 1];\n if (y + 1 < N) num_ones += input[x 
* N + y + 1];\n output[x * N + y] = (num_ones == 1);\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int sum = 0;\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n int idy = blockIdx.y * blockDim.y + threadIdx.y;\n if (idx < N && idy < N) {\n int left = idx > 0 ? input[idx - 1 + idy * N] : 0;\n int right = idx < N - 1 ? input[idx + 1 + idy * N] : 0;\n int up = idy > 0 ? input[idx + (idy - 1) * N] : 0;\n int down = idy < N - 1 ? input[idx + (idy + 1) * N] : 0;\n sum = left + right + up + down;\n }\n output[idx + idy * N] = input[idx + idy * N] ? (sum == 1 ? 1 : 0) : 0;\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0)\n continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n if (x > 0) {\n count += input[y * N + x - 1];\n }\n if (x < N - 1) {\n count += input[y * N + x + 1];\n }\n if (y > 0) {\n count += input[(y - 1) * N + x];\n }\n if (y < N - 1) {\n count += input[(y + 1) * N + x];\n }\n output[y * N + x] = (count == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t num_rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * num_rows_per_rank;\n size_t end_row = std::min((rank + 1) * num_rows_per_rank, N);\n\n // Each rank computes its subset of the output.\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks.\n MPI_Gather(output.data() + start_row * N, (end_row - start_row) * N, MPI_INT,\n output.data(), (end_row - start_row) * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n std::vector rowXOR(N, 0);\n for (size_t j = 0; j < N; ++j) {\n if (input[rank * N + j] == 1) {\n rowXOR[j] = 1;\n }\n }\n\n // Reduce the XORs of all rows to rank 0\n std::vector allRowXORs(N, 0);\n MPI_Reduce(rowXOR.data(), allRowXORs.data(), N, MPI_INT, MPI_BXOR, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the final output\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int numNeighbors = 0;\n if (i > 0 && allRowXORs[i - 1] == 1) {\n ++numNeighbors;\n }\n if (i < N - 1 && allRowXORs[i + 1] == 1) {\n ++numNeighbors;\n }\n if (j > 0 && input[i * N + j - 1] == 1) {\n ++numNeighbors;\n }\n if (j < N - 1 && input[i * N + j + 1] == 1) {\n ++numNeighbors;\n }\n output[i * N + j] = (numNeighbors == 1);\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv(N * N);\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] ^= recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, (rank + 1) * nRowsPerRank);\n std::vector localInput(nRowsPerRank * N);\n std::vector localOutput(nRowsPerRank * N);\n MPI_Scatter(input.data(), N * N, MPI_INT, localInput.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int numNeighbors = 0;\n if (i > 0) numNeighbors += localInput[(i - 1) * N + j];\n if (i < N - 1) numNeighbors += localInput[(i + 1) * N + j];\n if (j > 0) numNeighbors += localInput[i * N + j - 1];\n if (j < N - 1) numNeighbors += localInput[i * N + j + 1];\n localOutput[i * N + j] = (numNeighbors == 1);\n }\n }\n MPI_Gather(localOutput.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Every rank computes the XOR of its own cells\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (my_rank == 0) {\n for (int i = 1; i < num_ranks; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] |= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], 
(end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n // Check the neighbors\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] ^= recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n std::vector local_output(N, 0);\n for (size_t i = 0; i < N; ++i) {\n int count = 0;\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n continue;\n }\n count += input[i * N + j];\n }\n local_output[i] = count == 1 ? 
1 : 0;\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_output.data(), N, MPI_INT, output.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of the output\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Each rank computes its subset of output.\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int rows_per_rank = (N + size - 1) / size;\n int start_row = i * rows_per_rank;\n int end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min((rank + 1) * chunk_size, N);\n\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min((i + 1) * chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n 
int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Gather all the results to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] ^= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own rows\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] ^= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, 
MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of neighbors that are 1.\n std::vector num_neighbors(N * N, 0);\n MPI_Allreduce(MPI_IN_PLACE, num_neighbors.data(), num_neighbors.size(), MPI_INT, MPI_SUM, comm);\n\n // Set the output to 1 if there is exactly one neighbor that is 1.\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n output[i * N + j] = (num_neighbors[i * N + j] == 1);\n }\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes its local part of the output.\n for (size_t i = my_rank; i < N; i += num_procs) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; i++) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] |= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Each rank computes its subset of the output.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks onto rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "cuda", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\n__global__ void cellsXOR(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0 && input[y * N + x - 1]) num_ones++;\n if (x < N - 1 && input[y * N + x + 1]) num_ones++;\n if (y > 0 && input[(y - 1) * N + x]) num_ones++;\n if (y < N - 1 && input[(y + 1) * N + x]) num_ones++;\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int sum = 0;\n if (x > 0) {\n sum += input[y * N + x - 1];\n }\n if (x < N - 1) {\n sum += input[y * N + x + 1];\n }\n if (y > 0) {\n sum += input[(y - 1) * N + x];\n }\n if (y < N - 1) {\n sum += input[(y + 1) * N + x];\n }\n output[y * N + x] = (sum == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[(y * N) + (x - 1)];\n if (x < N - 1) sum += input[(y * N) + (x + 1)];\n if (y > 0) sum += input[((y - 1) * N) + x];\n if (y < N - 1) sum += input[((y + 1) * N) + x];\n output[(y * N) + x] = (sum == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[(y - 1) * N + x - 1];\n if (x < N - 1) sum += input[(y - 1) * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n if (y > 0 && x > 0) sum += input[(y - 1) * N + x - 1];\n if (y > 0 && x < N - 1) sum += input[(y - 1) * 
N + x + 1];\n if (y < N - 1 && x > 0) sum += input[(y + 1) * N + x - 1];\n if (y < N - 1 && x < N - 1) sum += input[(y + 1) * N + x + 1];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n sum += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n int num_ones = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n num_ones += input[r * N + c];\n }\n }\n }\n output[row * N + col] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1] == 1) count++;\n if (x < N - 1 && input[y * N + x + 1] == 1) count++;\n if (y > 0 && input[(y - 1) * N + x] == 1) count++;\n if (y < N - 1 && input[(y + 1) * N + x] == 1) count++;\n output[y * N + x] = (count == 1) ? 
1 : 0;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1]) count++;\n if (x < N - 1 && input[y * N + x + 1]) count++;\n if (y > 0 && input[(y - 1) * N + x]) count++;\n if (y < N - 1 && input[(y + 1) * N + x]) count++;\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1]) count++;\n if (x < N - 1 && input[y * N + x + 1]) count++;\n if (y > 0 && input[(y - 1) * N + x]) count++;\n if (y < N - 1 && input[(y + 1) * N + x]) count++;\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int count = 0;\n if (i > 0) count += input[i - 1 + j * N];\n if (i < N - 1) count += input[i + 1 + j * N];\n if (j > 0) count += input[i + (j - 1) * N];\n if (j < N - 1) count += input[i + (j + 1) * N];\n output[i + j * N] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t start = rank * N / size;\n size_t end = (rank + 1) * N / size;\n\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n // Check the neighbors of input_{i,j}.\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (count == 1);\n }\n }\n\n // Gather the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start = i * N / size;\n size_t end = (i + 1) * N / size;\n for (size_t j = start; j < end; ++j) {\n for (size_t k = 0; k < N; ++k) {\n output[j * N + k] += output[i * N * N + j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n int startRow = rank * nRows;\n int endRow = startRow + nRows;\n if (rank == size - 1) {\n endRow = N;\n }\n int nCols = N;\n int nNeighbors = 0;\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < nCols; j++) {\n nNeighbors = 0;\n if (i - 1 >= 0) {\n nNeighbors += input[(i - 1) * N + j];\n }\n if (i + 1 < N) {\n nNeighbors += input[(i + 1) * N + j];\n }\n if (j - 1 >= 0) {\n nNeighbors += input[i * N + j - 1];\n }\n if (j + 1 < N) {\n nNeighbors += input[i * N + j + 1];\n }\n if (nNeighbors == 1) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int nRowsRecv = N / size;\n int startRowRecv = i * nRowsRecv;\n int endRowRecv = startRowRecv + nRowsRecv;\n if (i == 
size - 1) {\n endRowRecv = N;\n }\n for (int j = startRowRecv; j < endRowRecv; j++) {\n MPI_Recv(&output[j * N], N * sizeof(int), MPI_INT, i, 0, MPI_STATUS_IGNORE);\n }\n }\n } else {\n MPI_Send(&output[startRow * N], (endRow - startRow) * N * sizeof(int), MPI_INT, 0, 0, MPI_STATUS_IGNORE);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nThreads = omp_get_max_threads();\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, startRow + nRowsPerRank);\n std::vector localOutput(N * nRowsPerRank);\n #pragma omp parallel num_threads(nThreads)\n {\n #pragma omp for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int numNeighbors = 0;\n if (i > 0) numNeighbors += input[i - 1][j];\n if (i < N - 1) numNeighbors += input[i + 1][j];\n if (j > 0) numNeighbors += input[i][j - 1];\n if (j < N - 1) numNeighbors += input[i][j + 1];\n localOutput[i * N + j] = (numNeighbors == 1);\n }\n }\n }\n MPI_Gather(&localOutput[0], N * nRowsPerRank, MPI_INT, &output[0], N * nRowsPerRank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + num_procs - 1) / num_procs;\n size_t start_row = my_rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of rows in parallel\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check neighbors\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row 
- start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its own part of the output\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += nprocs) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n std::vector recv(N * N);\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Each rank computes its subset of the output in parallel\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the XOR of its own rows\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += nprocs) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n MPI_Recv(&output[i * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different part of the output\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[(i - 1) * N + j];\n if (i < N - 1) num_neighbors += input[(i + 1) * N + j];\n if (j > 0) num_neighbors += input[i * N + j - 1];\n if (j < N - 1) num_neighbors += input[i * N + j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all other ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N], N * N, 
MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = (N + num_ranks - 1) / num_ranks;\n int start_row = my_rank * rows_per_rank;\n int end_row = std::min(N, (my_rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (my_rank == 0) {\n for (int i = 1; i < num_ranks; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = N / num_ranks;\n int start_row = my_rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j]) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j]) ++num_neighbors;\n if (j > 0 && input[i][j - 1]) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1]) ++num_neighbors;\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all other ranks.\n if (my_rank == 0) {\n for (int rank = 1; rank < num_ranks; ++rank) {\n int 
rows_per_rank = N / num_ranks;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row][0], rows_per_rank * N, MPI_INT, rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n // Other ranks send their results to rank 0.\n MPI_Send(&output[start_row][0], rows_per_rank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local part of the output.\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Each rank computes its subset of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows.\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of the rows in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all the other ranks.\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * 
rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n std::vector local_input(input.begin() + start_row * N, input.begin() + end_row * N);\n std::vector local_output(rows_per_rank * N, 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < rows_per_rank; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += local_input[(i - 1) * N + j];\n if (i < rows_per_rank - 1) count += local_input[(i + 1) * N + j];\n if (j > 0) count += local_input[i * N + j - 1];\n if (j < N - 1) count += local_input[i * N + j + 1];\n local_output[i * N + j] = (count == 1);\n }\n }\n\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows in output.\n size_t rows_per_rank = (N + num_procs - 1) / num_procs;\n size_t start_row = my_rank * rows_per_rank;\n size_t end_row = std::min(N, (my_rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of rows in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks into output on rank 0.\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // TODO: Implement cellsXOR\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows of output\n int rows_per_rank = N / num_procs;\n int start_row = my_rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes its subset of the rows of output in parallel\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. 
Otherwise set it to 0.\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) continue;\n num_neighbors += input(ii, jj);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output(i, j) = 0;\n else\n output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { count += input(ii, jj); }\n }\n }\n count -= input(i, j);\n if (input(i, j) == 1) {\n output(i, j) = (count == 2 || count == 3) ? 
1 : 0;\n } else {\n output(i, j) = (count == 3) ? 1 : 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int numNeighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n numNeighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (numNeighbors < 2 || numNeighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (numNeighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n count += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (count == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n 
}\n } else {\n if (count == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && !(ii == i && jj == j)) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (num_neighbors == 3) { output(i, j) = 1; } else { output(i, j) = 0; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) { num_neighbors += input(x, y); }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) output(i, j) = 0;\n else output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n else output(i, j) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (num_neighbors == 3) { output(i, j) 
= 1; } else { output(i, j) = 0; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n num_neighbors += input(x, y);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output(i, j) = 0;\n else\n output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x < 0 || x >= N || y < 0 || y >= N) continue;\n num_neighbors += input(x, y);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) { count += input(ii, jj); }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 
3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (count == 3) { output(i, j) = 1; } else { output(i, j) = 0; }\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n count += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n count += input[(i + ii) * N + j + jj];\n }\n }\n }\n count -= input[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (int)i + di;\n int jj = (int)j + dj;\n if (ii < 0 || ii >= (int)N || jj < 0 || jj >= (int)N) continue;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = num_neighbors == 2 || num_neighbors == 3;\n } else {\n output[i * N + j] = num_neighbors == 3;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } 
else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n neighbors += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = (i + dx + N) % N;\n int y = (j + dy + N) % N;\n num_neighbors += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "serial", + "prompt": "/* Simulate one generation of Game of Life on `input`. 
Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (size_t di = -1; di <= 1; di++) {\n for (size_t dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ni = i + di;\n int nj = j + dj;\n if (ni < 0 || ni >= N || nj < 0 || nj >= N) {\n continue;\n }\n num_neighbors += input[ni * N + nj];\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int x = -1; x <= 1; ++x) {\n for (int y = -1; y <= 1; ++y) {\n if (x == 0 && y == 0) continue;\n int nx = i + x;\n int ny = j + y;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[nx * N + ny];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if 
(ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = num_neighbors == 2 || num_neighbors == 3 ? 1 : 0;\n } else {\n output[i * N + j] = num_neighbors == 3 ? 1 : 0;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t x = i - 1; x <= i + 1; ++x) {\n for (size_t y = j - 1; y <= j + 1; ++y) {\n if (x >= 0 
&& x < N && y >= 0 && y < N && (x != i || y != j)) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n neighbors += input[(i + ii) * N + j + jj];\n }\n }\n }\n neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n size_t num_neighbors = 0;\n for (size_t x = i - 1; x <= i + 1; ++x) {\n for (size_t y = j - 1; y <= j + 1; ++y) {\n if (x >= 0 && x < N && y >= 0 && y < N && (x != i || y != j)) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int x = i + di;\n int y = j + dj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "hip", + "prompt": "/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\n__global__ void gameOfLife(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n num_neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && 
neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) return;\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n count -= input[y * N + x];\n if (input[y * N + x] == 1) {\n output[y * 
N + x] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[y * N + x] = (count == 3) ? 1 : 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0)\n continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output[y * N + x] = 0;\n else\n output[y * N + x] = 1;\n } else {\n if (num_neighbors == 3)\n output[y * N + x] = 1;\n else\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipThreadIdx().x;\n int y = hipThreadIdx().y;\n int neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = (x + i + N) % N;\n int neighbor_y = (y + j + N) % N;\n neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n if (input[y * N + x] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny 
< N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n if (x + i >= 0 && x + i < N && y + j >= 0 && y + j < N) {\n count += input[(x + i) * N + y + j];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (count < 2 || count > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (count == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. 
Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n_rows = N / size;\n int n_cols = N;\n int n_elements = n_rows * n_cols;\n std::vector local_input(n_elements);\n std::vector local_output(n_elements);\n MPI_Scatter(input.data(), n_elements, MPI_INT, local_input.data(), n_elements, MPI_INT, 0, MPI_COMM_WORLD);\n for (int i = 0; i < n_rows; i++) {\n for (int j = 0; j < n_cols; j++) {\n int n_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int ii_global = i + ii;\n int jj_global = j + jj;\n if (ii_global >= 0 && ii_global < n_rows && jj_global >= 0 && jj_global < n_cols) {\n n_neighbors += local_input[ii_global * n_cols + jj_global];\n }\n }\n }\n if (local_input[i * n_cols + j] == 1) {\n if (n_neighbors < 2 || n_neighbors > 3) {\n local_output[i * n_cols + j] = 0;\n } else {\n local_output[i * n_cols + j] = 1;\n }\n } else {\n if (n_neighbors == 3) {\n local_output[i * n_cols + j] = 1;\n } else {\n local_output[i * n_cols + j] = 0;\n }\n }\n }\n }\n MPI_Gather(local_output.data(), n_elements, MPI_INT, output.data(), n_elements, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of live neighbors for each cell.\n std::vector counts(N * N, 0);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n counts[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n }\n\n // Update the output based on the counts.\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = counts[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 
1 : 0;\n }\n }\n }\n\n // Gather the results on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recv_buf(N * N);\n MPI::COMM_WORLD.Recv(recv_buf.data(), N * N, MPI::INT, i, 0);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(output.data(), N * N, MPI::INT, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks.\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Compute the next generation for the subset of rows assigned to this rank\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = static_cast(i) + di;\n int jj = static_cast(j) + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = N / size;\n\n // Compute the start and end rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = (rank + 1) * rows_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(N * rows_per_rank);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n\n // Check the neighbors of the current cell\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n\n // Check if the neighbor is within the grid\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), N * rows_per_rank, MPI_INT,\n output.data(), N * rows_per_rank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of rows\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes the next generation for its subset of rows\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N && input[ni * N + nj]) {\n num_neighbors++;\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j]) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n for (size_t i = 
start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_input(input.begin() + rank * N * N, input.begin() + (rank + 1) * N * N);\n std::vector local_output(N * N, 0);\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += local_input[ii * N + jj];\n }\n }\n }\n if (local_input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n local_output[i * N + j] = 1;\n }\n }\n }\n }\n\n MPI_Gather(local_output.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output.\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the 8 neighbors of the cell.\n for 
(int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life.\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks to rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of the rows.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = static_cast(i) + di;\n int neighbor_j = static_cast(j) + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n if (rank != 0) {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n MPI_Recv(&output[start_row * N], rows_per_rank * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * N);\n\n // Compute the local results\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n\n // Check the neighbors of the current cell\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n\n // Check if the neighbor is within the grid\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT, output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (size_t ii = std::max(0, (int)i - 1); ii <= std::min((int)N - 1, (int)i + 1); ii++) {\n for (size_t jj = std::max(0, (int)j - 1); jj <= std::min((int)N - 1, (int)j + 1); jj++) {\n if (input[ii * N + jj] == 1) {\n num_neighbors++;\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of live neighbors for each cell.\n std::vector neighbor_counts(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t di = -1; di <= 1; di++) {\n for (size_t dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n neighbor_counts[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Update the output based on the neighbor counts.\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbor_counts[i * N + j] == 2 || neighbor_counts[i * N + j] == 3);\n } else {\n output[i * N + j] = (neighbor_counts[i * N + j] == 3);\n }\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbors == 2 || neighbors == 3);\n } else {\n output[i * N + j] = (neighbors == 3);\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n std::vector next_gen(N * N);\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j]) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of the output\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, 
MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = N / size;\n\n // Compute the starting and ending rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = (rank + 1) * rows_per_rank;\n\n // Compute the number of columns each rank is responsible for\n int cols_per_rank = N / size;\n\n // Compute the starting and ending columns for this rank\n int start_col = rank * cols_per_rank;\n int end_col = (rank + 1) * cols_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * N, 0);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = start_col; j < end_col; j++) {\n int num_neighbors = 0;\n // Check the neighbors of the current cell\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n\n // Apply the rules of the game of life\n if (input[i][j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT, output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "cuda", + "prompt": "/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\n__global__ void gameOfLife(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if 
(input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n 
}\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = threadIdx.x;\n int y = threadIdx.y;\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = (x + i + N) % N;\n int neighbor_y = (y + j + N) % N;\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "_", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n num_neighbors -= input[x * N + y];\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = 
blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) {\n return;\n }\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the rank of the current process.\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the number of rows that each process will handle.\n int rows_per_process = N / MPI_Comm_size(MPI_COMM_WORLD);\n\n // Compute the starting and ending rows for the current process.\n int start_row = rank * rows_per_process;\n int end_row = start_row + rows_per_process;\n\n // Compute the number of columns in the input grid.\n int num_cols = input.size() / N;\n\n // Create a buffer to store the results for the current process.\n std::vector buffer(N * num_cols);\n\n // Iterate over the rows assigned to the current process.\n for (int i = start_row; i < end_row; ++i) {\n // Iterate over the columns in the input grid.\n for (int j = 0; j < num_cols; ++j) {\n // Compute the number of live neighbors for the current cell.\n int num_live_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n // Skip the current cell.\n if (ii == i && jj == j) {\n continue;\n }\n\n // Check if the neighbor cell is alive.\n if (input[ii * num_cols + jj] == 1) {\n ++num_live_neighbors;\n }\n }\n }\n\n // Apply the rules of the Game of Life to the current cell.\n if (input[i * num_cols + j] == 1) {\n if (num_live_neighbors < 2 || num_live_neighbors > 3) {\n buffer[i * num_cols + j] = 0;\n } else {\n buffer[i * num_cols + j] = 1;\n }\n } else {\n if (num_live_neighbors == 3) {\n buffer[i * num_cols + j] = 1;\n } else {\n buffer[i * num_cols + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all processes.\n MPI_Gather(buffer.data(), N * num_cols, MPI_INT, output.data(), N * num_cols, MPI_INT, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = (N + size - 1) / size;\n int start = rank * local_size;\n int end = std::min(start + local_size, N);\n\n std::vector local_input(input.begin() + start * N, input.begin() + end * N);\n std::vector local_output(local_input.size());\n\n #pragma omp parallel for\n for (int i = 0; i < local_input.size(); i++) {\n int x = i / N;\n int y = i % N;\n int count = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int nx = x + dx;\n int ny = y + dy;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += local_input[nx * N + ny];\n }\n }\n }\n if (local_input[i] == 1) {\n if (count < 2 || count > 3) {\n local_output[i] = 0;\n } else {\n local_output[i] = 1;\n }\n } else {\n if (count == 3) {\n local_output[i] = 1;\n } else {\n local_output[i] = 0;\n }\n }\n }\n\n MPI_Gather(local_output.data(), local_output.size(), MPI_INT, output.data(), local_output.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of the rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute 
the next generation for the subset of rows\n std::vector next_gen(N * N, 0);\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Compute the next generation for the subset of rows\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n // Count the number of live neighbors\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N && input[x * N + y]) count++;\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j]) {\n if (count < 2 || count > 3) output[i * N + j] = 0;\n else output[i * N + j] = 1;\n } else {\n if (count == 3) output[i * N + j] = 1;\n else output[i * N + j] = 0;\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a chunk of the output\n size_t chunk_size = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min((rank + 1) * chunk_size, N);\n\n // Compute the chunk of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t ii = std::max(0, (int)i - 1); 
ii <= std::min(N - 1, (int)i + 1); ++ii) {\n for (size_t jj = std::max(0, (int)j - 1); jj <= std::min(N - 1, (int)j + 1); ++jj) {\n num_neighbors += input[ii * N + jj];\n }\n }\n num_neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min((i + 1) * chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of live neighbors for each cell.\n std::vector neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Update the state of each cell.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (input[i * N + j] == 1) {\n if (neighbors[i * N + j] < 2 || neighbors[i * N + j] > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors[i * N + j] == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = N / nprocs;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Compute the next generation for the subset of rows\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int ii = i - 
1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize output to 0\n output.assign(N * N, 0);\n\n // Get rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Divide the grid among the processes\n int rows_per_proc = N / nprocs;\n int start_row = rank * rows_per_proc;\n int end_row = start_row + rows_per_proc;\n\n // Compute the next generation for each row in parallel\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n // Count the number of live neighbors\n int num_neighbors = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n if (k == 0 && l == 0) continue;\n int neighbor_row = i + k;\n int neighbor_col = j + l;\n if (neighbor_row >= 0 && neighbor_row < N && neighbor_col >= 0 && neighbor_col < N) {\n num_neighbors += input[neighbor_row * N + neighbor_col];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n }\n }\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[start_row * N], rows_per_proc * N, MPI_INT,\n &output[0], rows_per_proc * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n #pragma omp parallel for\n for (int i = 0; i < N; ++i) {\n for (int j = 0; j < N; ++j) {\n int count = 0;\n // Count the number of live neighbors\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input[x * N + y];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n for (int di = -1; di <= 1; 
di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its subgrid's neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbors == 2 || neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (neighbors == 3) ? 1 : 0;\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n\n std::vector localInput(nRows * N);\n std::vector localOutput(nRows * N);\n\n MPI_Scatter(input.data(), nRows * N, MPI_INT, localInput.data(), nRows * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < nRows; i++) {\n for (int j = 0; j < N; j++) {\n int count = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int ni = i + ii;\n int nj = j + jj;\n if (ni >= 0 && ni < nRows && nj >= 0 && nj < N) {\n count += localInput[ni * N + nj];\n }\n }\n }\n if (localInput[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n localOutput[i * N + j] = 0;\n } else {\n localOutput[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n localOutput[i * N + j] = 1;\n } else {\n localOutput[i * N + j] = 0;\n }\n }\n }\n }\n\n MPI_Gather(localOutput.data(), nRows * N, MPI_INT, output.data(), nRows * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes the number of live neighbors for each cell in its subset of rows\n std::vector neighbor_counts(N * N, 0);\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n // Count the number of live neighbors for cell (i, j)\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n count += input[ii * N + jj];\n }\n }\n }\n neighbor_counts[i * N + j] = count;\n }\n }\n\n // Each rank updates its subset of rows of the output based on the neighbor counts\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int count = neighbor_counts[i * N + j];\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks into the output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, 
MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows assigned to this rank\n std::vector next_gen(N * N);\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = static_cast(i) + di;\n int neighbor_j = static_cast(j) + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks into the output on rank 0\n MPI_Gather(next_gen.data(), N * rows_per_rank, MPI_INT,\n output.data(), N * rows_per_rank, MPI_INT,\n 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its portion of the grid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Count the number of live neighbors for cell (i, j).\n int neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n neighbors += input[ii * N + jj];\n }\n }\n }\n local_neighbors[i * N + j] = neighbors;\n }\n }\n\n // Each rank sends its portion of the neighbor counts to rank 0.\n std::vector global_neighbors(N * N);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell based on the number of live neighbors.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n", + "_t", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a chunk of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Compute the next generation for the assigned rows\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || 
num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank != 0) {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the next generation for the subset of rows\n std::vector next_gen(N * N, 0);\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n next_gen[i * N + j] = 0;\n } else {\n next_gen[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n next_gen[i * N + j] = 1;\n } else {\n next_gen[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its local grid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Reduce(local_neighbors.data(), global_neighbors.data(), N * N, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(Kokkos::View &input, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n KokConfigService::instance().set_num_teams(1);\n KokConfigService::instance().set_num_nodes(1);\n KokConfigService::instance().set_num_ranks_per_node(1);\n KokConfigService::instance().set_num_devices_per_rank(1);\n KokConfigService::instance().set_device_type(\"CPU\");\n\n Kok\u30b3\u30af::TeamPolicy policy(Kok\u30b3\u30af::TeamThreadLayout::RowMajor);\n Kok\u30b3\u30af::Team team(policy);\n\n team.execute_task([&](Kok\u30b3\u30af::TeamMember rank) {\n const int num_elements = input.extent(0);\n for (int i = 1; i < num_elements - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n });\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n using Kokkos::TeamPolicy;\n using Kokkos::WorkSpace;\n\n const int width = input.extent(0);\n const int height = input.extent(1);\n\n Kokkos::TeamPolicy policy(Kokkos::RangePolicy(0, width * height));\n Kokkos::WorkSpace ws;\n\n parallel_for(policy, KOKKOS_LAMBDA(const int i) {\n const int x = i % width;\n const int y = i / width;\n\n if (x == 0 || x == width - 1 || y == 0 || y == height - 1) {\n output(x, y) = input(x, y);\n } else {\n output(x, y) = (input(x - 1, y) + input(x + 1, y) + input(x, y - 1) + input(x, y + 1)) / 4.0;\n }\n }, ws);\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kok c\u00f3c::parallel_for(\"jacobi1D\", 0, output.extent(0), KOK c\u00f3c::RangePolicy>(KOK c\u00f3c::get_num_threads()),\n KOK c\u00f3c::KOK c\u00f3c_LAMBDA(int64_t i) {\n if (i == 0) {\n output[i] = input[i] / 3;\n } else if (i == output.extent(0) - 1) {\n output[i] = input[i] / 3;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n const auto N = input.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == N - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy<>(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 
2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n const auto N = input.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = 0;\n } else if (i == N - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", input.size(), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n const auto exec_space = Kokkos::DefaultExecutionSpace();\n const auto exec_policy = Kokkos::RangePolicy(0, input.extent(0));\n Kokkos::parallel_for(exec_policy, KOKKOS_LAMBDA(const int &i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(input.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n double left = i > 0 ? input(i - 1) : 0;\n double center = input(i);\n double right = i < input.extent(0) - 1 ? input(i + 1) : 0;\n output(i) = (left + center + right) / 3;\n });\n}\n", + "Kokkoss vector space.", + "\n\n Kokkos::parallel_for(input.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n const int num_elements = input.extent(0);\n Kokkos::parallel_for(num_elements, KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == num_elements - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n });\n}\n", + "\n\n const int num_elements = input.extent(0);\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::Range(0, num_elements), [=](int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == num_elements - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", 
Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < static_cast(input.size()) - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n output[0] = (input[0] + input[1]) / 2;\n output[input.size() - 1] = (input[input.size() - 2] + input[input.size() - 1]) / 2;\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp 
parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < static_cast(input.size()) - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "serial", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 0; i < input.size(); i++) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i + 1 < input.size() ? input[i + 1] : 0);\n output[i] = output[i] / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n output[0] = (input[0] + input[1]) / 2;\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n output[input.size() - 1] = (input[input.size() - 2] + input[input.size() - 1]) / 2;\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < input.size() - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? 
input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = (i > 0) ? input[i - 1] : 0;\n double right = (i < input.size() - 1) ? input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "hip", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\n__global__ void jacobi1D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n output[idx] = (idx > 0 ? input[idx - 1] : 0) + input[idx] + (idx < N - 1 ? input[idx + 1] : 0);\n output[idx] /= 3;\n }\n}\n", + "\n int i = hipThreadIdx_x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int tid = hipGetGlobalThreadIdx().x;\n if (tid < N) {\n output[tid] = (input[tid - 1] + input[tid] + input[tid + 1]) / 3;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n\n if (i < N) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < N - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] +\n (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3.0;\n }\n}\n", + "\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = (tid > 0 ? input[tid - 1] : 0) + input[tid] +\n (tid < N - 1 ? input[tid + 1] : 0);\n output[tid] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? 
input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\nfor (size_t i = 1; i < N - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_output(local_n);\n\n for (int i = start; i < end; i++) {\n if (i == 0) {\n local_output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == n - 1) {\n local_output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n local_output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n std::vector local_input(input.begin() + start, 
input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * local_size], local_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], local_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n output.resize(n);\n\n MPI_Datatype vec_type;\n 
MPI_Type_vector(n, 1, 1, MPI_DOUBLE, &vec_type);\n MPI_Type_commit(&vec_type);\n\n MPI_Allgather(&input[0], 1, vec_type, &output[0], 1, vec_type, MPI_COMM_WORLD);\n\n for (int i = 1; i < n - 1; i++) {\n output[i] = (output[i - 1] + output[i] + output[i + 1]) / 3;\n }\n\n MPI_Type_free(&vec_type);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int remainder = input.size() % size;\n\n std::vector local_input(local_size + (rank < remainder ? 1 : 0));\n std::vector local_output(local_size);\n\n MPI_Scatter(input.data(), local_size + (rank < remainder ? 1 : 0), MPI_DOUBLE, local_input.data(), local_size + (rank < remainder ? 1 : 0), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n for (int i = 0; i < local_size; i++) {\n local_output[i] = (local_input[std::max(0, i - 1)] + local_input[std::min(i + 1, local_size - 1)] + local_input[i]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * chunk], chunk, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n for (int i = start; i < end; i++) {\n double left = (i > 0) ? input[i - 1] : 0;\n double right = (i < n - 1) ? input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * local_n;\n int end = start + local_n;\n MPI_Recv(&output[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], local_n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = (n + size - 1) / size;\n int start = rank * chunk_size;\n int end = std::min(start + chunk_size, n);\n\n for (int i = start; i < end; i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double right = i < n - 1 ? 
input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * chunk_size;\n int end = std::min(start + chunk_size, n);\n MPI_Recv(&output[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_size = n / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n output.resize(n);\n\n // Compute the local part of the output\n for (int i = 1; i < n - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n\n // Send the local part of the output to rank 0\n MPI_Gather(&output[1], n - 2, MPI_DOUBLE, &output[1], n - 2, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Replace the local part of the output with 0 on all ranks except rank 0\n if (rank != 0) {\n for (int i = 1; i < n - 1; i++) {\n output[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; ++i) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n 
std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n local_output[0] = (local_input[0] + local_input[1]) / 2;\n local_output[local_n - 1] = (local_input[local_n - 2] + local_input[local_n - 1]) / 2;\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "cuda", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\n__global__ void jacobi1D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n } else {\n output[i] = 0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = (tid > 0 ? input[tid - 1] : 0) + input[tid] + (tid < N - 1 ? 
input[tid + 1] : 0);\n output[tid] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n #pragma omp parallel for\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n std::vector local_output(local_n);\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (input[start + i - 1] + input[start + i] + input[start + i + 1]) / 3.0;\n }\n MPI_Gather(&local_output[0], local_n, MPI_DOUBLE, &output[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n\n std::vector local_output(chunk_size);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n local_output[i] = (input[i] + input[i + 1]) / 3;\n } else if (i == n - 1) {\n local_output[i] = (input[i - 1] + input[i]) / 3;\n } else {\n local_output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(local_output.data(), chunk_size, MPI_DOUBLE, output.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n\n #pragma parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == input.size() - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), chunk_size, MPI_DOUBLE, output.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per rank\n int elements_per_rank = input.size() / size;\n\n // Compute the start and end indices for this rank\n int start_index = rank * elements_per_rank;\n int end_index = 
(rank + 1) * elements_per_rank;\n\n // Compute the local output\n std::vector local_output(elements_per_rank);\n #pragma omp parallel for\n for (int i = 0; i < elements_per_rank; i++) {\n int global_index = start_index + i;\n if (global_index == 0 || global_index == input.size() - 1) {\n local_output[i] = 0;\n } else {\n local_output[i] = (input[global_index - 1] + input[global_index] + input[global_index + 1]) / 3;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(input.size());\n for (int i = 0; i < size; i++) {\n MPI_Recv(&output[i * elements_per_rank], elements_per_rank, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], elements_per_rank, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n std::vector local_output(local_n);\n\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (input[start + i - 1] + input[start + i] + input[start + i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n // Compute the local part of the output\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n // Compute the local part of the output\n #pragma omp parallel for\n for (int i = start; i < end; ++i) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n 
MPI_Gather(output.data() + start, chunk, MPI_DOUBLE, output.data(), chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = (n + size - 1) / size;\n int start = rank * local_n;\n int end = std::min(start + local_n, n);\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "kokkos", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images.\n Store the output of the computation in imageOut.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(Kokkos::View &imageIn, Kokkos::View &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(N, N), [=](int i, int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn(ii, jj) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::min(255, std::max(0, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_clamped = std::clamp(i + ii, 0, N - 1);\n int jj_clamped = std::clamp(j + jj, 0, N - 1);\n sum += imageIn(ii_clamped, jj_clamped) * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut(i, j) = std::clamp(sum, -128, 127);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_in = i + ii;\n int jj_in = j + jj;\n if (ii_in >= 0 && ii_in < N && jj_in >= 0 && jj_in < N) {\n sum += imageIn(ii_in, jj_in) * edgeKernel[ii + 1][jj + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n int val = 0;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n val = imageIn(ii, jj);\n }\n sum += edgeKernel[ki + 1][kj + 1] * val;\n }\n }\n imageOut(i, j) = std::max(0, std::min(sum, 255));\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::min(255, std::max(0, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_clamped = std::clamp(i + ii, 0, N - 1);\n int jj_clamped = std::clamp(j + jj, 0, N - 1);\n sum += imageIn(ii_clamped, jj_clamped) * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut(i, j) = std::clamp(sum, -128, 127);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n });\n}\n", + "\n\n Kok\uf09e::parallel_for(Kok\uf09e::Range(0, N), [&](const Kok\uf09e::Range &r) {\n for (size_t i = r.begin(); i < r.end(); ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n size_t ii = i + k - 1;\n size_t jj = j + l - 1;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn(ii, jj) * edge[k][l];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(0, N, [&](size_t i) {\n Kok\u30b3\u30af::parallel_for(0, N, [&](size_t j) {\n int sum = 0;\n for (int k = 0; k < 3; k++) {\n for (int l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edge[k][l];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "omp", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. 
Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "_", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n if (i + k < 0 || i + k >= N || j + l < 0 || j + l >= N) {\n sum += 0;\n } else {\n sum += imageIn[(i + k) * N + (j + l)] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x >= 0 && x < static_cast(N) && y >= 0 && y < static_cast(N)) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x >= 0 && x < static_cast(N) && y >= 0 && y < static_cast(N)) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += imageIn[(i + k) * N + (j + l)] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma opm parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + (j + l)] * 
edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "serial", + "prompt": "const int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (int i = 1; i < N - 1; 
i++) {\n for (int j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n sum += imageIn[(i + ii) * N + (j + jj)] * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = (int)i + k - 1;\n int y = (int)j + l - 1;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t x = 0; x < 3; x++) {\n for (size_t y = 0; y < 3; y++) {\n int ii = i + x - 1;\n int jj = j + y - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[x][y];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t y = 0; y < N; ++y) {\n for (size_t x = 0; x < N; ++x) {\n int sum = 0;\n for (int ky = -1; ky <= 1; ++ky) {\n for (int kx = -1; kx <= 1; ++kx) {\n int pixelX = x + kx;\n int pixelY = y + ky;\n if (pixelX >= 0 && pixelX < N && pixelY >= 0 && pixelY < N) {\n sum += imageIn[pixelY * N + pixelX] * edgeKernel[ky + 1][kx + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = (int)i + k - 1;\n int y = (int)j + l - 1;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t y = 0; y < N; ++y) {\n for (size_t x 
= 0; x < N; ++x) {\n int sum = 0;\n for (int ky = -1; ky <= 1; ++ky) {\n for (int kx = -1; kx <= 1; ++kx) {\n int inX = x + kx;\n int inY = y + ky;\n if (inX >= 0 && inX < N && inY >= 0 && inY < N) {\n sum += imageIn[inY * N + inX] * edgeKernel[ky + 1][kx + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii < 0 || ii >= static_cast(N) || jj < 0 || jj >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "hip", + "prompt": "__constant__ int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. 
Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\n__global__ void convolveKernel(const int *imageIn, int *imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\nint sum = 0;\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = blockIdx.x + i;\n int y = blockIdx.y + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n sum = max(0, min(255, sum));\n imageOut[blockIdx.y * N + blockIdx.x] = sum;\n}\n", + "\nint sum = 0;\nint index = hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x;\nint x = index % N;\nint y = index / N;\nfor (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (x + i >= 0 && x + i < N && y + j >= 0 && y + j < N) {\n sum += imageIn[(y + j) * N + x + i] * edgeKernel[i + 1][j + 1];\n }\n }\n}\nimageOut[index] = __clamp(sum, 0, 255);\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 
1; j++) {\n int x2 = x + i;\n int y2 = y + j;\n if (x2 >= 0 && x2 < N && y2 >= 0 && y2 < N) {\n sum += imageIn[y2 * N + x2] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = min(255, max(0, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX + neighborY * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = std::max(0, std::min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ < 0 || x_ >= N || y_ < 0 || y_ >= N)\n sum += 0;\n else\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < 
N && y < N) {\n int dotProduct = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n dotProduct += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = dotProduct > 255 ? 255 : (dotProduct < 0 ? 0 : dotProduct);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = 0; i < 3; i++) {\n for (int j = 0; j < 3; j++) {\n int x_offset = x + i - 1;\n int y_offset = y + j - 1;\n if (x_offset >= 0 && x_offset < N && y_offset >= 0 && y_offset < N) {\n sum += imageIn[y_offset * N + x_offset] * edge[i][j];\n }\n }\n }\n imageOut[y * N + x] = __clamp(sum, 0, 255);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_index = x + i;\n int y_index = y + j;\n if (x_index >= 0 && x_index < N && y_index >= 0 && y_index < N) {\n sum += imageIn[y_index * N + x_index] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[x_idx + y_idx * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = max(0, min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX + neighborY * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = hipMin(hipMax(sum, 0), 255);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "mpi", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of imageIn. 
The result is stored in imageOut on rank 0.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image.\n size_t chunkSize = N / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = (rank + 1) * chunkSize;\n\n // Compute the convolution for the assigned rows.\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = (i + 1) * chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n size_t chunkSize = N / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = (rank + 1) * chunkSize;\n\n // Compute the convolution for the assigned rows\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = (i + 1) * chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows and columns in the image\n size_t numRows = imageIn.size() / N;\n size_t numCols = N;\n\n // Compute the number of rows and columns in the output image\n size_t outNumRows = numRows - 2;\n size_t outNumCols = numCols - 2;\n\n // Compute the number of rows and columns in the output image for each rank\n size_t outNumRowsPerRank = outNumRows / size;\n size_t outNumColsPerRank = outNumCols;\n\n // Compute the starting row and column for each rank\n size_t startRow = rank * outNumRowsPerRank;\n size_t startCol = 0;\n\n // Compute the ending row and column for each rank\n size_t endRow = startRow + outNumRowsPerRank;\n size_t endCol = startCol + outNumColsPerRank;\n\n // Allocate memory for the 
output image\n std::vector outImage(outNumRowsPerRank * outNumColsPerRank);\n\n // Compute the convolution for each pixel in the output image\n for (size_t i = startRow; i < endRow; i++) {\n for (size_t j = startCol; j < endCol; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int row = i + k - 1;\n int col = j + l - 1;\n if (row >= 0 && row < numRows && col >= 0 && col < numCols) {\n sum += imageIn[row * numCols + col] * edgeKernel[k][l];\n }\n }\n }\n outImage[i * outNumColsPerRank + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the output images from all ranks\n MPI_Gather(outImage.data(), outNumRowsPerRank * outNumColsPerRank, MPI_INT,\n imageOut.data(), outNumRowsPerRank * outNumColsPerRank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the output image.\n std::vector localImageOut(N * N / size);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the local results into the output image on rank 0.\n if (rank == 0) {\n imageOut.resize(N * N);\n MPI_Gather(localImageOut.data(), N * N / size, MPI_INT, imageOut.data(), N * N / size, MPI_INT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(localImageOut.data(), N * N / size, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localN = N / size;\n int localStart = rank * localN;\n int localEnd = localStart + localN;\n\n std::vector localImageOut(localN * N);\n\n for (int i = localStart; i < localEnd; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), localN * N, MPI_INT, imageOut.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows in imageOut\n size_t rowsPerRank = N / size;\n size_t startRow = rank * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks and store them in imageOut on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n for (size_t j = startRow; j < endRow; ++j) {\n MPI_Recv(&imageOut[j * N], N, MPI_INT, i, 0, MPI_COMM_WORLD, 
MPI_STATUS_IGNORE);\n }\n }\n } else {\n MPI_Send(&imageOut[startRow * N], N * rowsPerRank, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image\n int rowsPerRank = (N + size - 1) / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min(startRow + rowsPerRank, N);\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = std::min(startRow + rowsPerRank, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the assigned rows\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "{", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n int chunkSize = N / size;\n int startRow = rank * chunkSize;\n int endRow = startRow + chunkSize;\n\n // Pad the input image with zeros to handle edge cases\n std::vector paddedImage(N * N);\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n paddedImage[i * N + j] = imageIn[i * N + j];\n }\n }\n\n // Compute the convolution for the assigned rows\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int neighborRow = i + k;\n int neighborCol = j + l;\n if (neighborRow >= 0 && neighborRow < N && neighborCol >= 0 && neighborCol < N) {\n sum += paddedImage[neighborRow * N + neighborCol] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * chunkSize;\n int endRow = 
startRow + chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel's neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = i + k;\n int neighborY = j + l;\n // Handle pixels outside the image boundaries\n if (neighborX < 0 || neighborX >= N || neighborY < 0 || neighborY >= N) {\n sum += 0;\n } else {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of imageOut\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks into imageOut on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localN = N / size;\n int localStart = rank * localN;\n int localEnd = localStart + localN;\n\n std::vector localImageIn(localN * N);\n std::vector localImageOut(localN * N);\n\n MPI_Scatter(imageIn.data(), N * N, MPI_INT, localImageIn.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n for (int i = localStart; i < localEnd; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += localImageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n 
MPI_Gather(localImageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localSize = N / size;\n int remainder = N % size;\n int startIndex = rank * localSize;\n int endIndex = startIndex + localSize;\n if (rank == size - 1) {\n endIndex += remainder;\n }\n\n std::vector localImageIn(localSize * N);\n std::vector localImageOut(localSize * N);\n\n MPI_Scatter(imageIn.data(), N * N, MPI_INT, localImageIn.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n for (int i = startIndex; i < endIndex; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += localImageIn[(x + y * N)];\n }\n }\n }\n localImageOut[i * N + j] = std::min(255, std::max(0, sum * edgeKernel[k + 1][l + 1]));\n }\n }\n\n MPI_Gather(localImageOut.data(), localSize * N, MPI_INT, imageOut.data(), localSize * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of pixels per process\n int pixelsPerProcess = N * N / size;\n\n // Compute the starting and ending indices for this process\n int startIndex = rank * pixelsPerProcess;\n int stopIndex = (rank + 1) * pixelsPerProcess;\n\n // Allocate memory for the local image\n std::vector localImage(pixelsPerProcess);\n\n // Scatter the image data to each process\n MPI_Scatter(&imageIn[0], pixelsPerProcess, MPI_INT, &localImage[0], pixelsPerProcess, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Convolve the local image with the edge kernel\n for (int i = 1; i < N - 1; i++) {\n for (int j = 1; j < N - 1; j++) {\n int dotProduct = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n dotProduct += localImage[i + k][j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::clip(dotProduct, -128, 127);\n }\n }\n\n // Gather the results from each process\n MPI_Gather(&imageOut[startIndex], pixelsPerProcess, MPI_INT, &imageOut[0], pixelsPerProcess, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n size_t chunkSize = (N + size - 1) / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n\n // Compute the convolution for the subset of the image\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n int localN = N / size;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local portion of the image\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n size_t localN = (N + size - 1) / size;\n size_t startRow = rank * localN;\n size_t endRow = std::min(startRow + localN, N);\n\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * localN;\n size_t endRow = std::min(startRow + localN, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n // Compute the dot product of the pixel and its neighbors with the edge kernel\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Calculate the number of rows per process\n int rowsPerProcess = N / size;\n\n // Calculate the starting row for this process\n int startRow = rank * rowsPerProcess;\n\n // Calculate the ending row for this process\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Loop over the rows assigned to this process\n for (int i = startRow; i < endRow; i++) {\n 
// Loop over the columns in the row\n for (int j = 0; j < N; j++) {\n // Calculate the dot product of the pixel and its neighbors with the edge kernel\n int dotProduct = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n // Check if the pixel is within the image boundaries\n if (i + k >= 0 && i + k < N && j + l >= 0 && j + l < N) {\n // Calculate the dot product\n dotProduct += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::max(0, std::min(255, dotProduct));\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(imageOut.data() + startRow * N, rowsPerProcess * N, MPI_INT,\n imageOut.data(), rowsPerProcess * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "cuda", + "prompt": "__constant__ int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\n__global__ void convolveKernel(const int *imageIn, int *imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x;\n int idy = threadIdx.y + blockIdx.y * blockDim.y;\n if (idx < N && idy < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = idx + i;\n int y = idy + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x + y * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[idx + idy * N] = clamp(sum, 0, 255);\n }\n}\n", + "\nint sum = 0;\n for (int i = 0; i < 3; i++) {\n for (int j = 0; j < 3; j++) {\n int x = blockIdx.x * blockDim.x + i - 1;\n int y = blockIdx.y * blockDim.y + j - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i][j];\n }\n }\n }\n imageOut[blockIdx.y * N + blockIdx.x] = max(0, min(255, sum));\n}\n", + "\nint x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x 
* blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = sum > 255 ? 255 : (sum < 0 ? 0 : sum);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) {\n return;\n }\n\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_offset = x + i;\n int y_offset = y + j;\n if (x_offset < 0 || x_offset >= N || y_offset < 0 || y_offset >= N) {\n sum += 0;\n } else {\n sum += imageIn[y_offset * N + x_offset] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x2 = x + i;\n int y2 = y + j;\n if (x2 < 0 || x2 >= N || y2 < 0 || y2 >= N) {\n sum += 0;\n } else {\n sum += edgeKernel[i + 1][j + 1] * imageIn[y2 * N + x2];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y 
= blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_offset = x + i;\n int y_offset = y + j;\n if (x_offset >= 0 && x_offset < N && y_offset >= 0 && y_offset < N) {\n sum += imageIn[y_offset * N + x_offset] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_coord = x + i;\n int y_coord = y + j;\n if (x_coord >= 0 && x_coord < N && y_coord >= 0 && y_coord < N) {\n sum += imageIn[y_coord * N + x_coord] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x + (threadIdx.y + blockIdx.y * blockDim.y) * N;\n if (idx < N * N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = threadIdx.x + i;\n int y = threadIdx.y + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[idx] = max(0, min(255, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + 
"parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of imageIn. The result is stored in imageOut on rank 0.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n int localN = N / size;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local subset of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, (rank + 1) * nRowsPerRank);\n\n std::vector localImageOut(nRowsPerRank * N);\n\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = 0; k < 3; k++) {\n for (int l = 0; l < 3; l++) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), nRowsPerRank * N, MPI_INT,\n imageOut.data(), nRowsPerRank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize imageOut to 0\n imageOut.assign(N * N, 0);\n\n // Each rank computes its own part of the output\n #pragma opm parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel and its neighbors\n for (int ki = 0; ki < 3; ki++) {\n for (int kj = 0; kj < 3; kj++) {\n int ii = i + ki - 1;\n int jj = j + kj - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki][kj];\n }\n 
}\n }\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(imageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n size_t chunkSize = (N + size - 1) / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n\n // Compute the convolution for the assigned rows\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < static_cast(N) && jj >= 0 && jj < static_cast(N)) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image\n size_t rowsPerRank = (N + size - 1) / size;\n size_t startRow = rank * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n // Each rank computes its subset of the output image in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < static_cast(N) && jj >= 0 && jj < static_cast(N)) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a portion of the output image\n int localN = (N + nprocs - 1) / nprocs;\n int startRow = rank * localN;\n int endRow = std::min(startRow + localN, N);\n\n // Compute the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from 
all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n int startRow = i * localN;\n int endRow = std::min(startRow + localN, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the assigned rows\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel's neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = i + k;\n int neighborY = j + l;\n // Check if the neighbor is within the image boundaries\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a portion of the output image\n int localN = N / nprocs;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * 
N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the image\n int rowsPerRank = (N + size - 1) / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min((rank + 1) * rowsPerRank, N);\n\n // Allocate memory for the rank's portion of the output image\n std::vector rankImageOut(rowsPerRank * N);\n\n // Compute the convolution for the rank's subset of rows\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = 0; k < 3; ++k) {\n for (int l = 0; l < 3; ++l) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n rankImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(rankImageOut.data(), rowsPerRank * N, MPI_INT,\n imageOut.data(), rowsPerRank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the dot product of the kernel and the image for each pixel\n #pragma opm parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int dotProduct = 0;\n for (int ki = 0; ki < 3; ki++) {\n for (int kj = 0; kj < 3; kj++) {\n dotProduct += edgeKernel[ki][kj] * imageIn[(i + ki - 1) * N + (j + kj - 1)];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, dotProduct));\n }\n }\n\n // Set the boundaries to 0\n for (size_t i = 0; i < N; i++) {\n imageOut[i] = 0;\n imageOut[i * N] = 0;\n imageOut[i * N + N - 1] = 0;\n }\n for (size_t j = 0; j < N; j++) {\n imageOut[j * N + N - 1] = 0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows\n int rowsPerRank = N / nprocs;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Pad the image with zeros\n std::vector paddedImage(N + 2 * rowsPerRank, 0);\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n paddedImage[i + rowsPerRank + 1][j + rowsPerRank + 1] = imageIn[i * N + j];\n }\n }\n\n // Convolve the kernel with the padded image\n #pragma omp parallel for\n for (int i = startRow + 1; i < endRow + 1; i++) {\n for (int j = 1; j < N + 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += edgeKernel[k + 1][l + 1] * paddedImage[i + k][j + l];\n }\n }\n imageOut[(i - rowsPerRank - 1) * N + j - 1] = 
std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Rank 0 gathers the results from all other ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min((rank + 1) * rowsPerRank, N);\n\n // Each rank computes its subset of the output image\n std::vector localImageOut(N * N);\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(localImageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int localN = N / nprocs;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n std::vector localImageIn(localN * N);\n std::vector localImageOut(localN * N);\n\n MPI_Scatter(imageIn.data(), localN * N, MPI_INT, localImageIn.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < localN; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= localN || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += localImageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), localN * N, MPI_INT, imageOut.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x < 0 || x >= static_cast(N) || y < 0 || y >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edge[k + 
1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image.\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank.\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Each rank computes a portion of the output image\n size_t myStartRow = myRank * N / numRanks;\n size_t myEndRow = (myRank + 1) * N / numRanks;\n\n // Compute the convolution for each pixel in the assigned rows\n #pragma omp parallel for\n for (size_t i = myStartRow; i < myEndRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n\n // Apply the kernel to the pixel and its neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = static_cast(i) + k;\n int neighborY = static_cast(j) + l;\n\n // Handle pixels outside the image boundaries\n if (neighborX < 0 || neighborX >= static_cast(N) || neighborY < 0 || neighborY >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the partial results from all ranks on rank 0\n if (myRank == 0) {\n for (int i = 1; i < numRanks; ++i) {\n size_t startRow = i * N / numRanks;\n size_t endRow = (i + 1) * N / numRanks;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[myStartRow * N], (myEndRow - myStartRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each process computes a subset of the rows\n int rowsPerProcess = N / nprocs;\n int startRow = rank * rowsPerProcess;\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Allocate memory for the local portion of the output image\n std::vector localImageOut(rowsPerProcess * N);\n\n // Perform the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 
1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the local results into the global output image\n MPI_Gather(localImageOut.data(), rowsPerProcess * N, MPI_INT,\n imageOut.data(), rowsPerProcess * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i, const int &j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (i + dx >= 0 && i + dx < N && j + dy >= 0 && j + dy < N) {\n sum += input(i + dx, j + dy);\n }\n }\n }\n output(i, j) = sum / 5;\n });\n}\n", + "Kokkosspecific", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t& i) {\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), 
KOKKOS_LAMBDA(const size_t& j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n sum += input(i + ii, j + jj);\n }\n }\n }\n output(i, j) = sum / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy>(N, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n using team_member = KokKOS::TeamMember;\n using team_view = KokKOS::TeamView;\n using team_view_const = KokKOS::TeamView;\n using view_type = KokKOS::View;\n using view_type_const = KokKOS::View;\n\n auto policy = KokKOS::TeamStriped<>(team_member(0, N), team_member(0, N));\n\n team_view output_team(\"output_team\", output, policy);\n team_view_const input_team(\"input_team\", input, policy);\n\n auto lambda = KOKKOS_LAMBDA(team_member member) {\n const size_t i = member.team_member_rank(0);\n const size_t j = member.team_member_rank(1);\n if (i == 0 || j == 0 || i == N - 1 || j == N - 1) {\n output_team(i, j) = input_team(i, j);\n } else {\n output_team(i, j) = (input_team(i, j - 1) + input_team(i, j + 1) + input_team(i - 1, j) + input_team(i + 1, j) + input_team(i, j)) / 5.0;\n }\n };\n\n auto result = KokKOS::parallel_reduce(\n output_team, lambda, KokKOS::PerTeamReduce(std::plus(), 0.0));\n output_team.wait_until_done();\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), [=](const int i) {\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), [=](const int j) {\n double left = (i == 0) ? 0 : input(i, j - 1);\n double right = (i == N - 1) ? 0 : input(i, j + 1);\n double up = (j == 0) ? 0 : input(i - 1, j);\n double down = (j == N - 1) ? 0 : input(i + 1, j);\n output(i, j) = (left + right + up + down + input(i, j)) / 5.0;\n });\n });\n}\n", + "\nfor (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy>({0, N}, {0, N}), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n sum += (i == 0 || j == 0) ? 0 : input(i - 1, j - 1);\n sum += (i == 0) ? 
0 : input(i - 1, j);\n sum += (i == 0 || j == N - 1) ? 0 : input(i - 1, j + 1);\n sum += (j == 0) ? 0 : input(i, j - 1);\n sum += input(i, j);\n sum += (j == N - 1) ? 0 : input(i, j + 1);\n sum += (i == N - 1 || j == 0) ? 0 : input(i + 1, j - 1);\n sum += (i == N - 1) ? 0 : input(i + 1, j);\n sum += (i == N - 1 || j == N - 1) ? 0 : input(i + 1, j + 1);\n output(i, j) = sum / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = 0;\n if (i > 0) output(i, j) += input(i - 1, j);\n if (i < N - 1) output(i, j) += input(i + 1, j);\n if (j > 0) output(i, j) += input(i, j - 1);\n if (j < N - 1) output(i, j) += input(i, j + 1);\n output(i, j) += input(i, j);\n output(i, j) /= 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[(i - 1) * N + 
j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n output.resize(input.size());\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i - 1][j] + input[i + 1][j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + 
input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "serial", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < N - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n output[i * N + j] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += 
input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t idx = i * N + j;\n if (i > 0) sum += input[idx - N];\n if (i < N - 1) sum += input[idx + N];\n if (j > 0) sum += input[idx - 1];\n if (j < N - 1) sum += input[idx + 1];\n sum += input[idx];\n output[idx] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t idx = i * N + j;\n if (i > 0) sum += input[idx - N];\n if (i < N - 1) sum += input[idx + N];\n if (j > 0) sum += input[idx - 1];\n if (j < N - 1) sum += input[idx + 1];\n sum += input[idx];\n output[idx] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n 
if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "hip", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\n__global__ void jacobi2D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\nint i = blockIdx.x * blockDim.x + threadIdx.x;\nint j = blockIdx.y * blockDim.y + threadIdx.y;\nif (i < N && j < N) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) /\n 5;\n}\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n output[y * N + x] =\n (input[y * N + (x - 1 < 0 ? 0 : x - 1)] +\n input[y * N + (x + 1 >= N ? N - 1 : x + 1)] +\n input[(y - 1 < 0 ? 0 : y - 1) * N + x] +\n input[(y + 1 >= N ? N - 1 : y + 1) * N + x] + input[y * N + x]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = i > 0 ? input[i - 1 + j * N] : 0;\n double right = i < N - 1 ? input[i + 1 + j * N] : 0;\n double up = j > 0 ? input[i + (j - 1) * N] : 0;\n double down = j < N - 1 ? 
input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * blockDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.y;\n\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (i < N && j < N) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n double left = (x > 0) ? input[y * N + x - 1] : 0;\n double right = (x < N - 1) ? input[y * N + x + 1] : 0;\n double up = (y > 0) ? input[(y - 1) * N + x] : 0;\n double down = (y < N - 1) ? 
input[(y + 1) * N + x] : 0;\n double center = input[y * N + x];\n output[y * N + x] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i >= N || j >= N) {\n return;\n }\n double sum = 0;\n if (i > 0) {\n sum += input[i - 1 + j * N];\n }\n if (i < N - 1) {\n sum += input[i + 1 + j * N];\n }\n if (j > 0) {\n sum += input[i + (j - 1) * N];\n }\n if (j < N - 1) {\n sum += input[i + (j + 1) * N];\n }\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i < N && j < N) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i >= N || j >= N) return;\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) /\n 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + ",", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x >= 0 && x < N && y >= 0 && y < N) {\n double left = (x > 0) ? input[y * N + x - 1] : 0;\n double right = (x < N - 1) ? input[y * N + x + 1] : 0;\n double up = (y > 0) ? 
input[(y - 1) * N + x] : 0;\n double down = (y < N - 1) ? input[(y + 1) * N + x] : 0;\n double center = input[y * N + x];\n output[y * N + x] = (left + right + up + down + center) / 5;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t local_N = N / size;\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n if (rank != 0) {\n MPI_Send(&output[start * N], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N * N], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n int nCols = N;\n std::vector localInput(nRows * nCols);\n std::vector localOutput(nRows * nCols);\n MPI_Scatter(input.data(), nRows * nCols, MPI_DOUBLE, localInput.data(), nRows * nCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (int i = 0; i < nRows; i++) {\n for (int j = 0; j < nCols; j++) {\n double left = (j == 0) ? 0 : localInput[i * nCols + j - 1];\n double right = (j == nCols - 1) ? 0 : localInput[i * nCols + j + 1];\n double up = (i == 0) ? 0 : localInput[(i - 1) * nCols + j];\n double down = (i == nRows - 1) ? 
0 : localInput[(i + 1) * nCols + j];\n localOutput[i * nCols + j] = (left + right + up + down + localInput[i * nCols + j]) / 5;\n }\n }\n MPI_Gather(localOutput.data(), nRows * nCols, MPI_DOUBLE, output.data(), nRows * nCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = (rank + 1) * local_N;\n\n // Compute the local output grid\n std::vector local_output(local_N * N, 0);\n\n // Compute the local jacobi stencil\n for (size_t i = 1; i < local_N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n local_output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local output grids into the global output grid\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the average in the local output\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t 
local_N = N / size;\n\n // Compute the offset of the local grid\n size_t offset = rank * local_N;\n\n // Compute the local grid\n std::vector local_input(local_N * N);\n std::vector local_output(local_N * N);\n\n // Copy the relevant part of the input grid to the local grid\n MPI_Scatter(input.data(), local_N * N, MPI_DOUBLE, local_input.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local jacobi iteration\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the output grid on rank 0\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min((rank + 1) * local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N * N], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += 
input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output grid\n std::vector local_output(local_N * N, 0);\n\n // Iterate over the local grid\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output grid\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local output grids into the global output grid\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Iterate over the local grid\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the average in the output grid\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(&output[start][0], local_N * N, MPI_DOUBLE, &output[0][0], local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * 
local_N;\n size_t end = (rank + 1) * local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N, 0);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min(start_row + local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N);\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min((rank + 1) * local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < N - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n\n // Store the average in the output\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n 
// Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows and columns per rank\n int rows_per_rank = N / size;\n int cols_per_rank = N;\n\n // Compute the starting row and column for this rank\n int start_row = rank * rows_per_rank;\n int start_col = 0;\n\n // Compute the ending row and column for this rank\n int end_row = start_row + rows_per_rank;\n int end_col = start_col + cols_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * cols_per_rank);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = start_col; j < end_col; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local results\n local_output[(i - start_row) * cols_per_rank + (j - start_col)] = avg;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), rows_per_rank * cols_per_rank, MPI_DOUBLE, output.data(), rows_per_rank * cols_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_size = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_size;\n size_t end = (rank + 1) * local_size;\n\n // Compute the local output\n std::vector local_output(local_size * N);\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_size * N, MPI_DOUBLE, output.data(), local_size * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the number of rows per process\n int rows_per_proc = N / num_procs;\n\n // Compute the starting and ending rows for this process\n int start_row = rank * rows_per_proc;\n int end_row = (rank + 1) * rows_per_proc;\n\n // Iterate over the rows assigned to this process\n for (int i = start_row; i < end_row; ++i) {\n // Iterate over the columns\n for (int j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the output\n output[i * N + j] = avg;\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), N * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible 
for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "cuda", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\n__global__ void jacobi2D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] +\n input[i * N + j + N] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i == 0) ? 0 : input[i - 1 + j * N];\n double right = (i == N - 1) ? 0 : input[i + 1 + j * N];\n double up = (j == 0) ? 0 : input[i + (j - 1) * N];\n double down = (j == N - 1) ? 0 : input[i + (j + 1) * N];\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i == 0) ? 0 : input[i - 1 + j * N];\n double right = (i == N - 1) ? 0 : input[i + 1 + j * N];\n double up = (j == 0) ? 
0 : input[i + (j - 1) * N];\n double down = (j == N - 1) ? 0 : input[i + (j + 1) * N];\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5.0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i >= N || j >= N) return;\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n}\n", + "\nint i = blockIdx.x + blockIdx.y * blockDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] +\n input[i * N + j - 1] + input[i * N + j + 1] +\n input[i * N + j]) /\n 5;\n }\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i >= N || j >= N) return;\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + (j - 1 + N) % N] + input[i * N + (j + 1) % N] +\n input[(i - 1 + N) % N * N + j] + input[(i + 1) % N * N + j] +\n input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * gridDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.x;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? 
input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n output[y * N + x] = (input[y * N + x - 1] + input[y * N + x + 1] + input[(y - 1) * N + x] + input[(y + 1) * N + x] + input[y * N + x]) / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n double sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n sum += input[y * N + x];\n output[y * N + x] = sum / 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockDim.x * gridDim.x;\n int j = blockIdx.y + blockDim.y * gridDim.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different portion of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the result in the output\n output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local grid\n std::vector local_input(local_N * N);\n std::vector local_output(local_N * N);\n for (size_t i = 0; i < local_N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_input[i * N + j] = input[(start + i) * N + j];\n }\n }\n\n // Compute the local jacobi stencil\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n_local = N / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n for (int j = 0; j < N; j++) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * n_local * N], n_local * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start * N], n_local * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local grid\n std::vector local_input(input.begin() + start * N, input.begin() + end * N);\n std::vector local_output(local_N * N);\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] +\n local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] +\n local_input[i * N + j]) /\n 5.0;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min(start_row + local_N, N);\n\n // Create a local copy of the input\n std::vector local_input(local_N * N);\n MPI_Scatter(input.data(), N * N, MPI_DOUBLE, local_input.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Create a local copy of the output\n std::vector local_output(local_N * N);\n\n // Compute the local part of the jacobi stencil\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < end_row - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n size_t start_row = rank * local_N;\n size_t end_row = start_row + local_N;\n\n // Compute the local part of the output\n std::vector local_output(local_N * N, 0);\n #pragma omp parallel for\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global 
output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local portion of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n // Check if we are on the boundary\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n // If we are on the boundary, set the value to 0\n local_output[i * N + j] = 0;\n } else {\n // Otherwise, compute the average of the neighbors\n sum += input[(i - 1) * N + j];\n sum += input[(i + 1) * N + j];\n sum += input[i * N + (j - 1)];\n sum += input[i * N + (j + 1)];\n sum += input[i * N + j];\n local_output[i * N + j] = sum / 5;\n }\n }\n }\n\n // Gather the local results into the output on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the jacobi stencil for the subset of rows assigned to this rank\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the local subset of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int 
rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local size of the grid\n size_t local_N = N / nprocs;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n // Check if we are at the boundary\n if (i == 0) {\n sum += 0;\n } else {\n sum += input[i - 1][j];\n }\n if (i == N - 1) {\n sum += 0;\n } else {\n sum += input[i + 1][j];\n }\n if (j == 0) {\n sum += 0;\n } else {\n sum += input[i][j - 1];\n }\n if (j == N - 1) {\n sum += 0;\n } else {\n sum += input[i][j + 1];\n }\n sum += input[i][j];\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[start][0], local_N * N, MPI_DOUBLE, &output[0][0], local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the local subset of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n local_output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output on rank 0\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE, output.data(), output.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local size of the grid\n size_t local_size = N / nprocs;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_size;\n size_t end = (rank + 1) * local_size;\n\n // Compute the local grid\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size * local_size);\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = 1; i < local_size - 1; i++) {\n for (size_t j = 1; j < local_size - 1; j++) {\n local_output[i * local_size + j] = (local_input[i * local_size + j - 1] + local_input[i * local_size + j + 1] +\n local_input[(i - 1) * local_size + j] + local_input[(i + 1) * local_size + j] +\n local_input[i * local_size + j]) /\n 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_size * local_size, MPI_DOUBLE, output.data(), local_size * local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local bounds\n size_t local_N = N / size;\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute local jacobi2D\n #pragma omp parallel for\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i][j] = 
sum / 5;\n }\n }\n\n // Gather results to rank 0\n if (rank != 0) {\n MPI_Send(&output[start][0], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N][0], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_size = (N + size - 1) / size;\n size_t start = rank * local_size;\n size_t end = std::min((rank + 1) * local_size, N);\n\n // Compute the local part of the output\n std::vector local_output(local_size * N);\n #pragma omp parallel for\n for (size_t i = 0; i < local_size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < local_size - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_size * N, MPI_DOUBLE, output.data(), local_size * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N;\n int nCols = N;\n int nRowsPerRank = nRows / size;\n int startRow = rank * nRowsPerRank;\n int endRow = (rank + 1) * nRowsPerRank;\n if (rank == size - 1) {\n endRow = nRows;\n }\n #pragma omp parallel for collapse(2)\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < nCols; j++) {\n double left = (j == 0) ? 0 : input[i * nCols + j - 1];\n double right = (j == nCols - 1) ? 0 : input[i * nCols + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * nCols + j];\n double down = (i == nRows - 1) ? 
0 : input[(i + 1) * nCols + j];\n output[i * nCols + j] = (left + right + up + down + input[i * nCols + j]) / 5;\n }\n }\n if (rank != 0) {\n MPI_Send(&output[startRow * nCols], (endRow - startRow) * nCols, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * nRowsPerRank;\n int endRow = (i + 1) * nRowsPerRank;\n if (i == size - 1) {\n endRow = nRows;\n }\n MPI_Recv(&output[startRow * nCols], (endRow - startRow) * nCols, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) { sum += input[i - 1][j]; }\n if (i < N - 1) { sum += input[i + 1][j]; }\n if (j > 0) { sum += input[i][j - 1]; }\n if (j < N - 1) { sum += input[i][j + 1]; }\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the local portion of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = N / nprocs;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = start_row + rows_per_rank;\n\n // Create a buffer to store the results for this rank\n std::vector local_output(N * rows_per_rank);\n\n // Compute the Jacobi stencil for this rank\n #pragma omp parallel for\n for (size_t i 
= start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n MPI_Gather(local_output.data(), N * rows_per_rank, MPI_DOUBLE, output.data(), N * rows_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + } +] \ No newline at end of file diff --git a/results/015cff6f/gemini-pro_temp0.2/all.json b/results/015cff6f/gemini-pro_temp0.2/all.json new file mode 100644 index 0000000..c75989d --- /dev/null +++ b/results/015cff6f/gemini-pro_temp0.2/all.json @@ -0,0 +1,13862 @@ +[ + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); j++) {\n for (int k = j+1; k < points.extent(0); k++) {\n double area = triangleArea(points(i), points(j), points(k));\n minArea = std::min(minArea, area);\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy>(\n 0, points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const Kokkos::IndexType i, double &minArea) {\n for (Kokkos::IndexType j = i + 1; j < points.extent(0); ++j) {\n for (Kokkos::IndexType k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n },\n minArea);\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "0.0)", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n double area = 
triangleArea(points(i), points(j), points(k));\n minArea = std::min(minArea, area);\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(\"smallestArea\", points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); j++) {\n for (int k = j+1; k < points.extent(0); k++) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using exec_space = Kokkos::DefaultExecutionSpace;\n using view_type = Kokkos::View;\n\n const int num_points = points.extent(0);\n view_type areas(\"areas\", num_points * num_points * num_points);\n\n parallel_for(policy_type(0, num_points * num_points * num_points), KOKKOS_LAMBDA(const int i) {\n const int i1 = i / (num_points * num_points);\n const int i2 = (i / num_points) % num_points;\n const int i3 = i % num_points;\n if (i1 != i2 && i1 != i3 && i2 != i3) {\n areas(i) = triangleArea(points(i1), points(i2), points(i3));\n } else {\n areas(i) = std::numeric_limits::max();\n }\n });\n\n return *min_element(areas.data(), areas.data() + areas.extent(0));\n}\n", + "\n\n double area = 0;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return area;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(\"smallestArea\", policy, KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n \"smallestArea\",\n policy_type(0, points.extent(0) * points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const int& i) {\n const int i1 = i / (points.extent(0) * points.extent(0));\n const int i2 = (i / points.extent(0)) % points.extent(0);\n const int i3 = i % points.extent(0);\n if (i1 != i2 && i2 != i3 && i1 != i3) {\n minArea = std::min(minArea, triangleArea(points(i1), points(i2), points(i3)));\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), 
points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n policy_type(0, points.extent(0) - 2),\n KOKKOS_LAMBDA(const int i) {\n for (int j = i + 1; j < points.extent(0) - 1; ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(\"smallestArea\", points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const int& i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n \"smallestArea\",\n policy_type(0, points.extent(0) * points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const int i) {\n const int i1 = i / (points.extent(0) * points.extent(0));\n const int i2 = (i / points.extent(0)) % points.extent(0);\n const int i3 = i % points.extent(0);\n if (i1 != i2 && i1 != i3 && i2 != i3) {\n minArea = std::min(minArea, triangleArea(points(i1), points(i2), points(i3)));\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = 
std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n#pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for 
(size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double total_area = 0;\n #pragma omp parallel for reduction(+: total_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n total_area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return total_area;\n}\n", + "\n\n double area = 0;\n#pragma omp parallel for reduction(+:area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return area;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min:minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 
5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], 
points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "<", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n if (n < 3) {\n return 0;\n }\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double area = 0;\n for (size_t i = 0; i < points.size(); ++i) {\n area += ::Area(points[i], points[(i + 1) % points.size()], points[(i + 2) % points.size()]);\n }\n return area;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * fabs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Compute the area of the smallest triangle that can be formed by any 3 points.\n Return the result in area.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\n__global__ void smallestArea(const Point *points, size_t N, double *area) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid >= N) return;\n\n double minArea = DBL_MAX;\n\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double area = triangleArea(points[gid], points[i], points[j]);\n if (area < minArea) minArea = area;\n }\n }\n\n if (tid == 0) *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double a = triangleArea(points[i], points[j], points[k]);\n minArea = min(minArea, a);\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = 1e100;\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n for (int k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid >= N) {\n return;\n }\n double minArea = triangleArea(points[tid], points[(tid + 1) % N], points[(tid + 2) % N]);\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n minArea = fmin(minArea, triangleArea(points[tid], points[i], points[j]));\n }\n }\n area[tid] = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for 
(size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = triangleArea(points[0], points[1], points[2]);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n int tid = get_global_id(0);\n if (tid < N) {\n area[tid] = ::Area(points[tid], points[(tid + 1) % N], points[(tid + 2) % N]);\n }\n}\n", + "\n\n Point A = points[threadIdx.x];\n Point B = points[threadIdx.y];\n Point C = points[threadIdx.z];\n double t = triangleArea(A, B, C);\n atomicMin(area, t);\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Each thread computes the area of the triangle formed by itself and the next two points.\n // The smallest area is stored in the shared memory.\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n int tid = threadIdx.x;\n if (tid < N - 2) {\n double area = triangleArea(points[tid], points[tid + 1], points[tid + 2]);\n if (area < minArea) {\n minArea = area;\n }\n }\n\n // Reduce the shared memory to find the smallest area.\n __syncthreads();\n if (tid == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n // Return the result in area.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n __shared__ Point minA, minB, minC;\n\n double localArea = triangleArea(points[threadIdx.x], points[threadIdx.y], points[threadIdx.z]);\n if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {\n minArea = localArea;\n minA = points[threadIdx.x];\n minB = points[threadIdx.y];\n minC = points[threadIdx.z];\n }\n\n __syncthreads();\n\n if (localArea < minArea) {\n minArea = localArea;\n minA = points[threadIdx.x];\n minB = points[threadIdx.y];\n minC = points[threadIdx.z];\n }\n\n __syncthreads();\n\n if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t 
j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Store the result in the output array.\n *area = minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the areas to find the minimum\n double minArea;\n MPI_Reduce(&areas[0], &minArea, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the minimum area from process 0\n if (rank == 0) {\n return minArea;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n minArea = std::min(minArea, area);\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int myRank, numProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numProcs);\n\n // Each process computes the area of all possible triangles\n std::vector localAreas;\n for (size_t i = 0; i < points.size(); ++i) {\n for 
(size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n localAreas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all areas to the root process\n std::vector globalAreas;\n MPI_Gather(localAreas.data(), localAreas.size(), MPI_DOUBLE,\n globalAreas.data(), localAreas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double smallestArea = std::numeric_limits::max();\n if (myRank == 0) {\n for (double area : globalAreas) {\n smallestArea = std::min(smallestArea, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&smallestArea, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return smallestArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all areas to the root process\n std::vector global_areas;\n MPI_Gather(local_areas.data(), local_areas.size(), MPI_DOUBLE,\n global_areas.data(), local_areas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double smallest_area = std::numeric_limits::max();\n if (rank == 0) {\n for (double area : global_areas) {\n smallest_area = std::min(smallest_area, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&smallest_area, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return smallest_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of points per process\n int points_per_process = points.size() / size;\n\n // Compute the starting and ending indices for this process\n int start = rank * points_per_process;\n int end = (rank + 1) * points_per_process;\n\n // Compute the smallest area for this process\n double min_area = std::numeric_limits::max();\n for (int i = start; i < end; ++i) {\n for (int j = i + 1; j < end; ++j) {\n for (int k = j + 1; k < end; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < min_area) {\n min_area = area;\n }\n }\n }\n }\n\n // Reduce the minimum areas from all processes\n double global_min_area;\n MPI_Allreduce(&min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN);\n\n return global_min_area;\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n 
// Reduce the areas to find the smallest one\n double smallest;\n MPI_Allreduce(areas.data(), &smallest, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return smallest;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the global minimum\n double global_min;\n MPI_Allreduce(local_areas.data(), &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n // Find the minimum area triangle\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Return the minimum area\n return minArea;\n}\n\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(&local_areas[0], &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area from process 0\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1; // Other processes return -1 to avoid printing multiple times\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the areas to find the smallest one\n double smallestArea;\n MPI_Allreduce(areas.data(), &smallestArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return smallestArea;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each process computes the area of the smallest triangle formed by any 3 of its points\n double min_area = std::numeric_limits::max();\n for (size_t i = my_rank; i < points.size(); i += num_procs) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n min_area = std::min(min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the minimum area across all processes\n double global_min_area;\n MPI_Allreduce(&min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < 
points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all local areas into a single vector on process 0\n std::vector global_areas;\n if (rank == 0) {\n global_areas.resize(local_areas.size() * size);\n }\n MPI_Gather(local_areas.data(), local_areas.size(), MPI_DOUBLE,\n global_areas.data(), local_areas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double min_area = std::numeric_limits::max();\n if (rank == 0) {\n for (double area : global_areas) {\n min_area = std::min(min_area, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&min_area, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n for (size_t i = rank; i < points.size(); i += size) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "&" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * fabs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Compute the area of the smallest triangle that can be formed by any 3 points.\n Return the result in area.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\n__global__ void smallestArea(const Point *points, size_t N, double *area) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Each thread computes the area of a triangle formed by itself and the next two points.\n // The minimum area is stored in the shared memory.\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N - 2) {\n double a = triangleArea(points[tid], points[tid + 1], points[tid + 2]);\n if (a < minArea) {\n minArea = a;\n }\n }\n __syncthreads();\n\n // The thread with the minimum area writes the result to the output pointer.\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid >= N) return;\n\n double minArea = DBL_MAX;\n\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double area = triangleArea(points[gid], points[i], points[j]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[gid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n double minArea = triangleArea(points[0], points[1], points[2]);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) 
{\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Each thread computes the area of a triangle formed by 3 points.\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[tid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n // Compute the area of the smallest triangle formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Store the result in the output array.\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n __shared__ Point minA, minB, minC;\n\n // Compute the area of all triangles\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n minA = points[i];\n minB = points[j];\n minC = points[k];\n }\n }\n }\n }\n\n // Reduce the minimum area across all threads\n __syncthreads();\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n // Return the result in area.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[tid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < minArea) {\n minArea = a;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Initialize the area to a large value\n *area = DBL_MAX;\n\n // Loop over all possible combinations of 3 points\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n // Compute the area of the triangle formed by the 3 points\n double triangle_area = triangleArea(points[i], points[j], points[k]);\n\n // Update the area if the triangle area is smaller than the current minimum\n if (triangle_area < *area) {\n *area = triangle_area;\n }\n }\n }\n }\n}\n", + "\n\n *area = DBL_MAX;\n for 
(size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < minArea) {\n minArea = a;\n }\n }\n }\n }\n *area = minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int num_points = points.size();\n int points_per_proc = num_points / num_procs;\n int start_index = my_rank * points_per_proc;\n int end_index = (my_rank + 1) * points_per_proc;\n if (my_rank == num_procs - 1) {\n end_index = num_points;\n }\n\n double local_area = 0.0;\n for (int i = start_index; i < end_index; i++) {\n for (int j = i + 1; j < num_points; j++) {\n for (int k = j + 1; k < num_points; k++) {\n local_area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n\n double global_area = 0.0;\n MPI_Reduce(&local_area, &global_area, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_area;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n // Use OpenMP to parallelize the loop over all possible triples of points\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], 
points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(&local_areas[0], &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1;\n }\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Allreduce(local_areas.data(), &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector localAreas;\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n localAreas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double globalMinArea;\n MPI_Allreduce(localAreas.data(), &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = points.size();\n double local_min = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: 
local_min)\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = points.size();\n double local_min = std::numeric_limits::max();\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n local_min = std::min(local_min, area);\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = points.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each process calculates the area of all possible triangles\n std::vector local_areas(n * (n - 1) * (n - 2) / 6);\n int count = 0;\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n local_areas[count++] = triangleArea(points[i], points[j], points[k]);\n }\n }\n }\n\n // Reduce the local areas to find the global minimum\n double global_min;\n MPI_Allreduce(local_areas.data(), &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = rank; i < points.size(); i += size) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(local_areas.data(), &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n int n = points.size();\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n minArea = std::min(minArea, area);\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN);\n return globalMinArea;\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int num_points = points.size();\n double min_area = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: min_area)\n for (int i = 0; i < num_points; i++) {\n for (int j = i + 1; j < num_points; j++) {\n for (int k = j + 1; k < num_points; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n min_area = std::min(min_area, area);\n }\n }\n }\n\n return min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &minArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "kokkos", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Create a team policy for parallel execution\n TeamPolicy policy(x.extent(0));\n\n // Define the kernel lambda\n auto kernel = KOKKOS_LAMBDA(const TeamPolicy<> &policy, int i) {\n double min_dist = std::numeric_limits::max();\n for (int j = i + 1; j < x.extent(0); ++j) {\n min_dist = std::min(min_dist, distance(x(i), x(j)));\n }\n return min_dist;\n };\n\n // Execute the kernel in parallel\n auto min_dists = parallel_reduce(policy, kernel);\n\n // Find the minimum distance among all pairs\n double min_dist = std::numeric_limits::max();\n for (int i = 0; i < min_dists.extent(0); ++i) {\n min_dist = std::min(min_dist, min_dists(i));\n }\n\n return min_dist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist[0] = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist[0]) {\n\t\t\t\tmin_dist[0] = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist[0];\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double &min_dist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist(0);\n}\n", + "\n\n\tconst int n = x.extent(0);\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(\"closestPair\", Kokkos::RangePolicy(0, n - 1), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing policy = Kok\u043a\u043e\u0441::Range\u653f\u7b56;\n\tusing view_type = Kok\u043a\u043e\u0441::View;\n\n\tview_type min_dist(\"min_dist\", std::numeric_limits::max());\n\n\tauto lambda = KOKKOS_LAMBDA(const double x1, const double x2) {\n\t\tdouble dist = ::abs(x1 - x2);\n\t\tif (dist < min_dist()) {\n\t\t\tmin_dist() = dist;\n\t\t}\n\t};\n\n\t// Iterate over all possible un-sorted pair of elements\n\t// and find the minimum distance between them.\n\t// Note that we are iterating over all the elements in the vector\n\t// multiple times. 
This is not efficient.\n\t// A more efficient approach would be to sort the vector first\n\t// and then only consider the minimum distance between each element\n\t// and its immediate neighbor.\n\t// However, sorting is not supported by Kok\u043a\u043e\u0441.\n\t// So, we are using this simple approach for now.\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < x.size(); ++j) {\n\t\t\tlambda(x[i], x[j]);\n\t\t}\n\t}\n\n\treturn min_dist();\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int i, double &min_dist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist(0);\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\tKok\u30b3\u30af::TaskGraph tg;\n\tKok\u30b3\u30af::Task task([&](Kok\u30b3\u30af::TaskContext ctx) -> double {\n\t\tdouble min = std::abs(x[0] - x[1]);\n\t\tfor (size_t i = 0; i < x.size(); i++) {\n\t\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\t\tmin = std::min(min, distance(x[i], x[j]))\n\t\t\t}\n\t\t}\n\t\treturn min;\n\t});\n\ttg.run(task);\n\treturn task.get();\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::RangePolicy;\n\tusing tag_type = Kokkos::pair;\n\n\tauto min_distance = Kokkos::parallel_reduce(\n\t\tpolicy_type(0, x.size() * x.size()),\n\t\tKOKKOS_LAMBDA(const tag_type &idx, double &min_distance) {\n\t\t\tint i = idx.first / x.size();\n\t\t\tint j = idx.first % x.size();\n\t\t\tif (i != j) {\n\t\t\t\tmin_distance = std::min(min_distance, distance(x(i), x(j)));\n\t\t\t}\n\t\t},\n\t\tmin_distance);\n\n\treturn min_distance;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in x.\n\tKokkos::View distances(\"distances\", x.size() * x.size());\n\tKokkos::parallel_for(Kokkos::RangePolicy(0, x.size() * x.size()), KOKKOS_LAMBDA(const int i) {\n\t\tdistances(i) = distance(x(i / x.size()), x(i % x.size()));\n\t});\n\n\t// Find the minimum distance.\n\tdouble minDistance = Kokkos::min_reduce(distances, Kokkos::Min());\n\n\treturn minDistance;\n}\n", + "\n\n\tKokkos::View min_dist(\"min_dist\", 1);\n\tmin_dist[0] = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) 
{\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist[0]) {\n\t\t\t\tmin_dist[0] = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist[0];\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist(0) = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tdouble d = distance(x(i), x(j));\n\t\t\tif (d < min_dist(0)) {\n\t\t\t\tmin_dist(0) = d;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist(0);\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int &i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\n\treturn minDist;\n}\n", + "\n\n\tconst int n = x.extent(0);\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tauto min_distance = Kokkos::parallel_reduce(\n\t\tx.extent(0), KOKKOS_LAMBDA(const int i, double min) {\n\t\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\t\tmin = std::min(min, distance(x(i), x(j)));\n\t\t\t}\n\t\t\treturn min;\n\t\t},\n\t\tstd::numeric_limits::max());\n\n\treturn min_distance;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist(0) = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tdouble dist = distance(x(i), x(j));\n\t\t\tif (dist < min_dist(0)) {\n\t\t\t\tmin_dist(0) = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist(0);\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "omp", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use OpenMP to compute in parallel.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor 
(int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble min_distance = std::numeric_limits::max();\n\tint n = x.size();\n\t#pragma omp parallel for reduction(min: min_distance)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_distance = std::min(min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "&", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_distance = std::abs(x[0] - x[1]);\n#pragma openmp parallel for reduction(min:min_distance)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_distance = std::min(min_distance, distance(x[i], x[j]);\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n#pragma omp parallel for reduction(min: min_dist)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = 
std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n - 1; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "serial", + "prompt": "double distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 1; i < x.size(); ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i - 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn -1;\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = distance(x[0], x[1]);\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn 
std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tminDist = std::min(minDist, distance(x[i], x[i + 1]));\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = distance(x[0], x[1]);\n\tfor (size_t i = 1; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn -1;\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble minDist = distance(x[0], x[1]);\n\tfor (int i = 1; i < x.size() - 1; ++i) {\n\t\tminDist = std::min(minDist, distance(x[i], x[i + 1]));\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 1; i < x.size(); ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i - 1]));\n\t}\n\treturn min_dist;\n}\n", + "(", + "\n\n double min_dist = std::numeric_limits::max();\n for (size_t i = 0; i < 
x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n min_dist = std::min(min_dist, distance(x[i], x[j]));\n }\n }\n return min_dist;\n}\n", + "\n\n double min_distance = std::numeric_limits::max();\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n min_distance = std::min(min_distance, distance(x[i], x[j]));\n }\n }\n return min_distance;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "hip", + "prompt": "__device__ double distanceBetweenPoints(double x1, double x2) {\n\treturn fabs(x1 - x2);\n}\n\n/* Compute the distance between the closest two elements in the vector x.\n Store the result in distance.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\n__global__ void closestPair(const double *x, size_t N, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tdouble minDist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tfor (int i = tid + 2; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\tdistance[tid] = minDist;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= N) {\n\t\treturn;\n\t}\n\n\tdouble minDist = distanceBetweenPoints(x[gid], x[gid + 1]);\n\tfor (int i = gid + 2; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[gid], x[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "losest", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tdouble minDistance = DBL_MAX;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDistance) {\n\t\t\t\tminDistance = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDistance;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint grid_size = gridDim.x;\n\tint i = bid * bid_size + tid;\n\tif (i >= N) {\n\t\treturn;\n\t}\n\tdouble min_dist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t}\n\tdistance[tid] = 
min_dist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\nfor (size_t i = 0; i < N; ++i) {\n\t\tdistance[i] = distance(x[i], x[i + 1]);\n\t}\n}\n", + "\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint bid_size = hipBlockDim_x;\n\n\tdouble min_dist = DBL_MAX;\n\n\tfor (int i = bid * bid_size + tid; i < N; i += bid_size) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tif (tid == 0 || dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble min_dist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = min_dist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "losest", + "\n\n\tint tid = threadIdx.x;\n\tint blockDim = blockDim.x;\n\tint blockIdx = blockIdx.x;\n\tint i = blockIdx * blockDim + tid;\n\n\tdouble min_dist = DBL_MAX;\n\twhile (i < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[i + 1]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t\ti += blockDim * gridDim.x;\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble min_dist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = min_dist;\n}\n" + ] + 
}, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "mpi", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process finds the closest pair in its local part of the vector.\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j += size) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between its elements\n\tfor (int i = my_rank; i < n; i += num_procs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = n / size;\n\tint start = rank * chunk_size;\n\tint end = start + chunk_size;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn globalMinDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = (n + size - 1) / size;\n\tint start = rank * chunk_size;\n\tint end = std::min(start + chunk_size, n);\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tstd::vector local_x(x.begin() + start, x.begin() + end);\n\tstd::sort(local_x.begin(), local_x.end());\n\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < local_n - 1; i++) {\n\t\tlocal_min_dist = std::min(local_min_dist, distance(local_x[i], local_x[i + 1]));\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tif (rank == size - 1) {\n\t\tend = x.size();\n\t}\n\tdouble local_min_dist = 
std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\tint n = x.size();\n\tint local_n = n / num_procs;\n\tint start = my_rank * local_n;\n\tint end = (my_rank + 1) * local_n;\n\tif (my_rank == num_procs - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between its elements\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn globalMinDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the distance between its elements and the elements of the next process.\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = (rank + 1) * local_size;\n\tif (rank == size - 1) {\n\t\tend = x.size();\n\t}\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble 
global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = n / size;\n\tint start = rank * chunk_size;\n\tint end = (rank + 1) * chunk_size;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the distance between its elements\n\tstd::vector local_distances;\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tfor (size_t j = i + 1; j < x.size(); j += size) {\n\t\t\tlocal_distances.push_back(distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Gather all distances to the root process\n\tstd::vector global_distances;\n\tMPI_Gather(local_distances.data(), local_distances.size(), MPI_DOUBLE,\n\t\tglobal_distances.data(), local_distances.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the minimum distance\n\tdouble min_distance = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double d : global_distances) {\n\t\t\tmin_distance = std::min(min_distance, d);\n\t\t}\n\t}\n\n\t// Broadcast the minimum distance to all processes\n\tMPI_Bcast(&min_distance, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn min_distance;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "cuda", + "prompt": "__device__ double distanceBetweenPoints(double x1, double x2) {\n\treturn fabs(x1 - x2);\n}\n\n/* Compute the distance between the closest two elements in the vector x.\n Store the result in distance.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\n__global__ void closestPair(const double *x, size_t N, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= N) return;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = bid * blockDim.x + tid;\n\n\tif (i >= N) {\n\t\treturn;\n\t}\n\n\tdouble min_dist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint blockId = blockIdx.x;\n\tint blockDim = blockDim.x;\n\tint gridDim = gridDim.x;\n\n\t// Each thread computes the distance between its element and the next element.\n\tif (tid < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tif (tid == 0 || dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n\n\t// Each block reduces the distances computed by its threads.\n\t__shared__ double blockDistance;\n\tif (tid == 0) {\n\t\tblockDistance = *distance;\n\t}\n\t__syncthreads();\n\tif (tid < blockDim - 1) {\n\t\tdouble dist = distanceBetweenPoints(blockDistance, x[blockId * blockDim + tid + 1]);\n\t\tif (dist < blockDistance) {\n\t\t\tblockDistance = dist;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*distance = blockDistance;\n\t}\n\n\t// The block with the smallest distance broadcasts it to all other blocks.\n\t__shared__ double gridDistance;\n\tif (blockId == 0) {\n\t\tgridDistance = *distance;\n\t}\n\t__syncthreads();\n\tif (blockId < gridDim - 1) {\n\t\tdouble dist = distanceBetweenPoints(gridDistance, x[(blockId + 1) * blockDim]);\n\t\tif (dist < gridDistance) {\n\t\t\tgridDistance = dist;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (blockId == 0) {\n\t\t*distance = gridDistance;\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\t__shared__ double minDistance;\n\tminDistance = DBL_MAX;\n\n\tfor (size_t i = threadIdx.x; i < N - 1; i += blockDim.x) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDistance) {\n\t\t\t\tminDistance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (threadIdx.x == 0) {\n\t\t*distance = minDistance;\n\t}\n}\n", + "\n\n\t// Each thread computes the distance between its element and all subsequent elements.\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\t// Compute the distance between the current element and all subsequent elements.\n\tfor (int i = tid + bid * stride; i < N - 1; i += stride) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[i + 1]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = 
DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\tdouble min_dist = DBL_MAX;\n\twhile (i < N) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t\ti += blockDim.x * gridDim.x;\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = min_dist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x * gridDim.x;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tdouble minDist = DBL_MAX;\n\twhile (i < N) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\ti += blockDim.x * gridDim.x;\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (i == tid + 1 || dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= N) {\n\t\treturn;\n\t}\n\tdouble minDist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) 
{\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\tdistance[tid] = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i < N - 1) {\n\t\tdistance[i] = distance(x[i], x[i + 1]);\n\t}\n}\n", + "\n\n\t// Each thread computes the distance between its element and all elements to its right.\n\tint tid = threadIdx.x;\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min : local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tint size;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = my_rank * local_n;\n\tint end = (my_rank + 1) * local_n;\n\tif (my_rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min : local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between any two elements in its local subvector.\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all processes to find the global minimum distance.\n\tMPI_Allreduce(&min_dist, 
&min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\t// Each process computes the distance between all pairs of elements in its local subvector.\n\tstd::vector local_min_distances(n / num_procs);\n\t#pragma omp parallel for\n\tfor (int i = my_rank * (n / num_procs); i < (my_rank + 1) * (n / num_procs); i++) {\n\t\tfor (int j = i + 1; j < (my_rank + 1) * (n / num_procs); j++) {\n\t\t\tlocal_min_distances[i - my_rank * (n / num_procs)] = std::min(local_min_distances[i - my_rank * (n / num_procs)], distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Each process sends its local minimum distance to the root process.\n\tstd::vector global_min_distances(num_procs);\n\tMPI_Gather(local_min_distances.data(), n / num_procs, MPI_DOUBLE, global_min_distances.data(), n / num_procs, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// The root process finds the minimum distance among all the local minimum distances.\n\tdouble global_min_distance = std::numeric_limits::max();\n\tif (my_rank == 0) {\n\t\tfor (int i = 0; i < num_procs; i++) {\n\t\t\tglobal_min_distance = std::min(global_min_distance, global_min_distances[i]);\n\t\t}\n\t}\n\n\t// Return the global minimum distance.\n\treturn global_min_distance;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tint n = x.size();\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint my_rank, nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the closest pair in its local subvector\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = my_rank; i < n; i += nprocs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, 
&min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min_dist)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\t// Get the number of processes and the rank of the current process\n\tint num_procs, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\t// Divide the vector into chunks and assign each chunk to a process\n\tint chunk_size = x.size() / num_procs;\n\tstd::vector local_x(x.begin() + rank * chunk_size, x.begin() + (rank + 1) * chunk_size);\n\n\t// Find the closest pair in the local chunk\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min_dist)\n\tfor (int i = 0; i < local_x.size(); i++) {\n\t\tfor (int j = i + 1; j < local_x.size(); j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(local_x[i], local_x[j]));\n\t\t}\n\t}\n\n\t// Gather the local minimum distances from all processes\n\tstd::vector global_min_dists(num_procs);\n\tMPI_Gather(&local_min_dist, 1, MPI_DOUBLE, global_min_dists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the smallest global minimum distance\n\tdouble global_min_dist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double dist : global_min_dists) {\n\t\t\tglobal_min_dist = std::min(global_min_dist, dist);\n\t\t}\n\t}\n\n\t// Broadcast the global minimum distance to all processes\n\tMPI_Bcast(&global_min_dist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tif (n <= 1) {\n\t\treturn -1;\n\t}\n\n\t// Sort the vector in ascending order\n\tstd::sort(x.begin(), x.end());\n\n\t// Find the closest pair in each half of the vector\n\tdouble minDist = distance(x[0], x[1]);\n\t#pragma omp parallel\n\t{\n\t\tint tid = omp_get_thread_num();\n\t\tint numThreads = omp_get_num_threads();\n\t\tint start = tid * n / numThreads;\n\t\tint end = (tid + 1) * n / numThreads;\n\t\tdouble localMinDist = minDist;\n\t\tfor (int i = start; i < end; i++) {\n\t\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\t\tif (dist < localMinDist) {\n\t\t\t\t\tlocalMinDist = dist;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t#pragma omp critical\n\t\t{\n\t\t\tif (localMinDist < minDist) {\n\t\t\t\tminDist = localMinDist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint my_rank, 
nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\t// Each process computes the closest pair in its local subvector.\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = my_rank; i < n; i += nprocs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum.\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the closest pair in its local subvector\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint n = x.size();\n\tint n_local = n / omp_get_num_threads();\n\tint tid = omp_get_thread_num();\n\tdouble min_local = std::numeric_limits::max();\n\t\n\t#pragma omp parallel reduction(min:min_local)\n\tfor (int i = tid * n_local; i < (tid + 1) * n_local; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_local = std::min(min_local, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble min_global;\n\tMPI_Allreduce(&min_local, &min_global, 1, MPI_DOUBLE, MPI_MIN);\n\treturn min_global;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Use OpenMP to parallelize the computation of distances between pairs of elements\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Reduce(&local_min_dist, 1, MPI_DOUBLE, &global_min_dist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance.\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t}\n\treturn -1;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n double min_dist = std::numeric_limits::max();\n\n // Compute the distance between each pair of points\n parallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int& i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n double dist = distance(points(i), points(j));\n if (dist < min_dist) {\n min_dist = dist;\n }\n }\n });\n\n return min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View distances(\"Distances\", points.size());\n\tKokkos::parallel_for(\"Compute distances\", points.size(), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tdistances[i] = std::min(distances[i], distance(points[i], points[j]));\n\t\t}\n\t});\n\treturn *Kokkos::min_element(distances);\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(\"Closest Pair\", points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tparallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tparallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int j) {\n\t\t\tif (i != j) {\n\t\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\t\tif (dist < minDist) {\n\t\t\t\t\tminDist = dist;\n\t\t\t\t}\n\t\t\t}\n\t\t});\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn 
minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tminDist = std::min(minDist, dist);\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing TeamPolicy = Kokkos::TeamPolicy>;\n\tusing TeamMember = Kokkos::TeamMember;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const TeamMember& team) {\n\t\tfor (int i = team.league_rank(); i < points.extent(0); i += team.league_size()) {\n\t\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\t\tminDist = std::min(minDist, distance(points(i), points(j)));\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing namespace std;\n\n\t// Initialize the minimum distance to a large value\n\tdouble minDist = numeric_limits::max();\n\n\t// Create a team policy for parallel execution\n\tTeamPolicy policy(Kokkos::DefaultExecutionSpace(), points.extent(0));\n\n\t// Define the kernel function to compute the distance between each pair of points\n\tKokkos::parallel_for(policy, KOKKOS_LAMBDA(const int& i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::RangePolicy;\n\tusing view_type = Kokkos::View;\n\n\tconst int num_points = points.extent(0);\n\tview_type distances(\"Distances\", num_points * num_points);\n\n\tKokkos::parallel_for(policy_type(0, num_points * num_points), KOKKOS_LAMBDA(const int i) {\n\t\tconst int p1_idx = i / num_points;\n\t\tconst int p2_idx = i % num_points;\n\t\tif (p1_idx != p2_idx) {\n\t\t\tdistances(i) = distance(points(p1_idx), points(p2_idx));\n\t\t}\n\t\telse {\n\t\t\tdistances(i) = std::numeric_limits::max();\n\t\t}\n\t});\n\n\tdouble min_distance = std::numeric_limits::max();\n\tfor (int i = 0; i < num_points * num_points; i++) {\n\t\tmin_distance = std::min(min_distance, distances(i));\n\t}\n\n\treturn min_distance;\n}\n", + "\n\n\tKok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::make_span(points.begin(), points.end()), 0.0,\n\t\t[](double d, const Point* p) {\n\t\t\treturn d + distance(*p, {0, 0});\n\t\t},\n\t\t[](double a, double b) { return a + b; });\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Initialize the minimum distance to a large value\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\t// Calculate the distance between the two points\n\t\t\tdouble dist = distance(points(i), points(j));\n\n\t\t\t// Update the minimum distance if necessary\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\t// Return the minimum distance\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all points\n\tfor (size_t i 
= 0; i < points.size(); ++i) {\n\t\t// Iterate over all points after i\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\t// Calculate the distance between the two points\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\t// Check if the distance is less than the current minimum distance\n\t\t\tif (dist < minDist) {\n\t\t\t\t// Update the minimum distance\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Return the minimum distance\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View distances(\"Distances\", points.extent(0));\n\tKokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdistances(i) = std::min(distances(i), distance(points(i), points(j)));\n\t\t}\n\t});\n\treturn Kokkos::min_reduce(Kokkos::RangePolicy(0, points.extent(0)), distances, distances(0));\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use OpenMP to compute in parallel.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j 
< points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = points.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) 
{\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_distance = std::numeric_limits::max();\n#pragma omp parallel for reduction(min:min_distance)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tmin_distance = std::min(min_distance, dist);\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "\n\n\t// Check if the vector is empty or has only one point\n\tif (points.empty() || points.size() == 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\t// Sort the points by x-coordinate\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Initialize the minimum distance to the maximum possible value\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble 
min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn -1;\n\t}\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = 
distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], 
points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tauto compareX = [](Point const& p1, Point const& p2) { return p1.x < p2.x; };\n\tauto compareY = [](Point const& p1, Point const& p2) { return p1.y < p2.y; };\n\n\tstd::sort(points.begin(), points.end(), compareX);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tauto compareX = [](Point const& p1, Point const& p2) { return p1.x < p2.x; };\n\tauto compareY = [](Point const& p1, Point const& p2) { return p1.y < p2.y; };\n\tstd::sort(points.begin(), points.end(), compareX);\n\tdouble minDist = distance(points[0], points[1]);\n\tstd::sort(points.begin(), points.end(), compareY);\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tif (points[j].y - points[i].y >= minDist) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn 0.0;\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = distance(points[0], points[1]);\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distanceBetweenPoints(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the distance between the closest two points in the vector points.\n Store the result in distance.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\n__global__ void closestPair(const Point *points, size_t numPoints, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Initialize the distance to a large value\n\t*distance = DBL_MAX;\n\n\t// Iterate over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid < numPoints) {\n\t\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < num) {\n\t\tdistance[i] = distance(points[i], points[(i + 1) % num]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < num) {\n\t\tdistance[idx] = distance(points[idx], points[(idx + 1) % num]);\n\t}\n}\n", + "\n\n\t// Initialize the distance to a large value\n\t*distance = DBL_MAX;\n\n\t// Loop over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) return;\n\tdouble minDist = distanceBetweenPoints(points[tid], points[(tid+1)%numPoints]);\n\tfor (int i = tid+1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\tatomicMin(distance, minDist);\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid < numPoints) {\n\t\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = gid; i < numPoints; i += hipBlockDim_x) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = 
distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint idx = hipGetGlobalIndex();\n\tif (idx < num) {\n\t distance[idx] = distance(points[idx], points[(idx + 1) % num]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\t// Initialize the distance to a large value\n\t*distance = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "(1.21,12.12.1222.12.12.", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = distanceBetweenPoints(points[gid], points[(gid + 1) % numPoints]);\n\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Compute the distance between each pair of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tdouble minDist = distanceBetweenPoints(points[tid], points[(tid + 1) % numPoints]);\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\t__shared__ double minDist;\n\t__shared__ Point p1, p2;\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t\tp1 = points[tid];\n\t\t\t\tp2 = points[i];\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, 
y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local copy of points.\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0.\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\t// Find the closest pair of points on this rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_distance = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); ++i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_distance) {\n\t\t\t\tmin_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_distance;\n\tMPI_Allreduce(&min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_distance;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t 
j = i + 1; j < points.size(); ++j) {\n\t\t\tmin_dist = std::min(min_dist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t}\n\n\treturn -1;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\tif (rank == 0) {\n\t\treturn min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the closest pair on this rank\n\tdouble minDist = 
std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_distance) {\n\t\t\t\tlocal_min_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_distance;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its subset of points\n\tstd::vector> closestPairs;\n\tfor (size_t i = rank; i < points.size(); i += size) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tclosestPairs.push_back({points[i], points[j]});\n\t\t}\n\t}\n\n\t// Each rank sends its closest pairs to rank 0\n\tstd::vector> allClosestPairs;\n\tMPI_Gather(&closestPairs, closestPairs.size(), MPI_DOUBLE, &allClosestPairs, allClosestPairs.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 finds the closest pair of points among all the closest pairs\n\tdouble minDist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (auto const& pair : allClosestPairs) {\n\t\t\tdouble dist = distance(pair.first, pair.second);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Rank 0 broadcasts the minimum distance to all other ranks\n\tMPI_Bcast(&minDist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// 
Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its subset of points.\n\tstd::vector local_points = points;\n\tstd::sort(local_points.begin(), local_points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (size_t i = 0; i < local_points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < local_points.size(); ++j) {\n\t\t\tdouble dist = distance(local_points[i], local_points[j]);\n\t\t\tif (dist < local_min_distance) {\n\t\t\t\tlocal_min_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Each rank sends its local minimum distance to rank 0.\n\tMPI_Gather(&local_min_distance, 1, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 computes the global minimum distance.\n\tdouble global_min_distance = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (int i = 0; i < size; ++i) {\n\t\t\tdouble recv_min_distance;\n\t\t\tMPI_Gather(&recv_min_distance, 1, MPI_DOUBLE, &recv_min_distance, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\t\t\tif (recv_min_distance < global_min_distance) {\n\t\t\t\tglobal_min_distance = recv_min_distance;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the closest pair on each rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Reduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn 
-1;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distanceBetweenPoints(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the distance between the closest two points in the vector points.\n Store the result in distance.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\n__global__ void closestPair(const Point *points, size_t numPoints, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) return;\n\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint num_blocks = gridDim.x;\n\n\tdouble min_dist = DBL_MAX;\n\tfor (int i = tid; i < numPoints; i += bid_size * num_blocks) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = min_dist;\n}\n", + "\n\n\tint idx = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (idx >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDistance = DBL_MAX;\n\tfor (int i = idx + 1; i < numPoints; i++) {\n\t\tdouble distance = distanceBetweenPoints(points[idx], points[i]);\n\t\tif (distance < minDistance) {\n\t\t\tminDistance = distance;\n\t\t}\n\t}\n\n\tif (idx == 0) {\n\t\t*distance = minDistance;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], 
points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tdouble minDist = DBL_MAX;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int j = i + 1; j < numPoints; j++) {\n\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tdouble minDist = DBL_MAX;\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\tdistance[tid] = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int j = i + 1; j < numPoints; j++) {\n\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble 
dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all ranks\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_distance = DBL_MAX;\n\t#pragma omp parallel for reduction(min: min_distance)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t for (size_t j = i + 1; j < points.size(); ++j) {\n\t min_distance = std::min(min_distance, distance(points[i], points[j]));\n\t }\n\t}\n\n\tif (rank == 0) {\n\t return min_distance;\n\t}\n\treturn 0.0;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Gather all the local minimum distances on rank 0\n\tstd::vector allMinDists(size);\n\tMPI_Gather(&minDist, 1, MPI_DOUBLE, allMinDists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the global minimum distance\n\tif (rank == 0) {\n\t\tfor 
(double dist : allMinDists) {\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocal_min_dist = std::min(local_min_dist, dist);\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all ranks\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local subvector\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (int i = 0; i < points.size(); ++i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tlocalMinDist = std::min(localMinDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t} else {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the closest pair on this rank\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist 
< min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN);\n\n\t// Return the global minimum distance\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma opm parallel for reduction(min:min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i+1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\treturn min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\telse {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t} else {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocalMinDist = std::min(localMinDist, dist);\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, 
size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min_dist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocal_min_dist = std::min(local_min_dist, dist);\n\t\t}\n\t}\n\n\t// Gather all local minima on rank 0\n\tstd::vector all_min_dists(size);\n\tMPI_Gather(&local_min_dist, 1, MPI_DOUBLE, all_min_dists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 finds the global minimum\n\tdouble global_min_dist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double dist : all_min_dists) {\n\t\t\tglobal_min_dist = std::min(global_min_dist, dist);\n\t\t}\n\t}\n\n\t// Return the global minimum on rank 0\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "(1.41421, 1.41421)", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tlocalMinDist = std::min(localMinDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn 
minDist;\n\t}\n\treturn -1;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(Kokkos::View &points, Kokkos::View &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Sort points by x-coordinate\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return points_sorted(i)->x < points_sorted(j)->x;\n });\n\n // Find the upper and lower hulls\n auto upper_hull = hull.slice(0, hull.extent(0) / 2);\n auto lower_hull = hull.slice(hull.extent(0) / 2, hull.extent(0));\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n while (upper_hull_size >= 2 &&\n (upper_hull(upper_hull_size - 1)->y - upper_hull(upper_hull_size - 2)->y) *\n (points_sorted(i)->x - upper_hull(upper_hull_size - 1)->x) <\n (upper_hull(upper_hull_size - 1)->x - upper_hull(upper_hull_size - 2)->x) *\n (points_sorted(i)->y - upper_hull(upper_hull_size - 1)->y)) {\n upper_hull_size--;\n }\n upper_hull(upper_hull_size++) = points_sorted(i);\n }\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = points_sorted.extent(0) - 1; i >= 0; i--) {\n while (lower_hull_size >= 2 &&\n (lower_hull(lower_hull_size - 1)->y - lower_hull(lower_hull_size - 2)->y) *\n (points_sorted(i)->x - lower_hull(lower_hull_size - 1)->x) <\n (lower_hull(lower_hull_size - 1)->x - lower_hull(lower_hull_size - 2)->x) *\n (points_sorted(i)->y - lower_hull(lower_hull_size - 1)->y)) {\n lower_hull_size--;\n }\n lower_hull(lower_hull_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_hull_size + lower_hull_size - 1;\n for (int i = 0; i < lower_hull_size - 1; i++) {\n hull(hull_size--) = lower_hull(i);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int &i) {\n points_sorted(i) = points(i);\n });\n\n // Find the upper and lower hulls\n std::vector upper_hull;\n std::vector lower_hull;\n\n for (int i = 0; i < points_sorted.extent(0); i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upper_hull.size() >= 2 &&\n (upper_hull[upper_hull.size() - 2]->y - upper_hull[upper_hull.size() - 1]->y) *\n (points_sorted(i)->x - upper_hull[upper_hull.size() - 1]->x) <=\n (upper_hull[upper_hull.size() - 1]->y - points_sorted(i)->y) *\n (upper_hull[upper_hull.size() - 2]->x - upper_hull[upper_hull.size() - 1]->x)) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lower_hull.size() >= 2 &&\n (lower_hull[lower_hull.size() - 2]->y - lower_hull[lower_hull.size() - 1]->y) *\n (points_sorted(i)->x - lower_hull[lower_hull.size() - 1]->x) >=\n (lower_hull[lower_hull.size() - 1]->y - 
points_sorted(i)->y) *\n (lower_hull[lower_hull.size() - 2]->x - lower_hull[lower_hull.size() - 1]->x)) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 2);\n int j = 0;\n for (int i = 0; i < upper_hull.size() - 1; i++) {\n hull(j++) = upper_hull[i];\n }\n for (int i = lower_hull.size() - 2; i >= 0; i--) {\n hull(j++) = lower_hull[i];\n }\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_dimensions = points.extent(1);\n\n // Find the point with the minimum y-coordinate.\n int min_y_index = 0;\n for (int i = 1; i < num_points; ++i) {\n if (points(i, 1) < points(min_y_index, 1)) {\n min_y_index = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points(0, 0);\n points(0, 0) = points(min_y_index, 0);\n points(min_y_index, 0) = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double angle_a = atan2(a.y - points(0, 1), a.x - points(0, 0));\n double angle_b = atan2(b.y - points(0, 1), b.x - points(0, 0));\n return angle_a < angle_b;\n };\n parallel_sort(Kokkos::RangePolicy(1, num_points), points, compare_polar_angle);\n\n // Find the convex hull using Graham's scan.\n int hull_size = 0;\n for (int i = 0; i < num_points; ++i) {\n while (hull_size >= 2 && (points(hull_size - 1, 0) - points(hull_size - 2, 0)) * (points(i, 1) - points(hull_size - 2, 1)) - (points(hull_size - 1, 1) - points(hull_size - 2, 1)) * (points(i, 0) - points(hull_size - 2, 0)) < 0) {\n hull_size--;\n }\n hull(hull_size, 0) = points(i, 0);\n hull_size++;\n }\n\n // Resize the hull to its final size.\n hull.resize(hull_size, num_dimensions);\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the upper and lower hulls\n Kokkos::View upperHull(\"Upper Hull\", points.extent(0));\n Kokkos::View lowerHull(\"Lower Hull\", points.extent(0));\n int upperHullSize = 0;\n int lowerHullSize = 0;\n for (int i = 0; i < points.extent(0); i++) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHullSize >= 2 &&\n (upperHull(upperHullSize - 1)->y - upperHull(upperHullSize - 2)->y) *\n (points(i)->x - upperHull(upperHullSize - 2)->x) <\n (upperHull(upperHullSize - 1)->x - upperHull(upperHullSize - 2)->x) *\n (points(i)->y - upperHull(upperHullSize - 2)->y)) {\n upperHullSize--;\n }\n upperHull(upperHullSize++) = points(i);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while (lowerHullSize >= 2 &&\n (lowerHull(lowerHullSize - 1)->y - lowerHull(lowerHullSize - 2)->y) *\n (points(i)->x - lowerHull(lowerHullSize - 2)->x) >\n (lowerHull(lowerHullSize - 1)->x - lowerHull(lowerHullSize - 2)->x) *\n (points(i)->y - lowerHull(lowerHullSize - 2)->y)) {\n lowerHullSize--;\n }\n lowerHull(lowerHullSize++) = points(i);\n }\n\n // Merge the upper and lower hulls into the convex hull\n int hullSize = upperHullSize + lowerHullSize - 2;\n for (int i = 0; i < upperHullSize; i++) {\n hull(i) = upperHull(i);\n }\n for (int i = upperHullSize; i < hullSize; i++) {\n hull(i) = lowerHull(i - upperHullSize + 1);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n 
Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the upper and lower hulls\n Kokkos::View upperHull(\"upperHull\", points.extent(0));\n Kokkos::View lowerHull(\"lowerHull\", points.extent(0));\n\n int upperHullSize = 0;\n int lowerHullSize = 0;\n\n for (int i = 0; i < points.extent(0); i++) {\n // Add the point to the upper hull if it is to the right of the current rightmost point\n while (upperHullSize >= 2 &&\n (upperHull(upperHullSize - 1)->y - upperHull(upperHullSize - 2)->y) *\n (points(i)->x - upperHull(upperHullSize - 2)->x) <=\n (upperHull(upperHullSize - 1)->x - upperHull(upperHullSize - 2)->x) *\n (points(i)->y - upperHull(upperHullSize - 2)->y)) {\n upperHullSize--;\n }\n upperHull(upperHullSize++) = points(i);\n\n // Add the point to the lower hull if it is to the left of the current leftmost point\n while (lowerHullSize >= 2 &&\n (lowerHull(lowerHullSize - 1)->y - lowerHull(lowerHullSize - 2)->y) *\n (points(i)->x - lowerHull(lowerHullSize - 2)->x) >=\n (lowerHull(lowerHullSize - 1)->x - lowerHull(lowerHullSize - 2)->x) *\n (points(i)->y - lowerHull(lowerHullSize - 2)->y)) {\n lowerHullSize--;\n }\n lowerHull(lowerHullSize++) = points(i);\n }\n\n // Merge the upper and lower hulls\n int hullSize = upperHullSize + lowerHullSize - 2;\n for (int i = 0; i < upperHullSize; i++) {\n hull(i) = upperHull(i);\n }\n for (int i = upperHullSize; i < hullSize; i++) {\n hull(i) = lowerHull(i - upperHullSize + 1);\n }\n\n // Return the hull\n hull.resize(hullSize);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points_sorted.extent(0)), points_sorted, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n while (upper_hull_size >= 2 && cross(upper_hull[upper_hull_size - 2], upper_hull[upper_hull_size - 1], points_sorted[i]) <= 0) {\n upper_hull_size--;\n }\n upper_hull[upper_hull_size++] = points_sorted[i];\n }\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = points_sorted.extent(0) - 1; i >= 0; i--) {\n while (lower_hull_size >= 2 && cross(lower_hull[lower_hull_size - 2], lower_hull[lower_hull_size - 1], points_sorted[i]) <= 0) {\n lower_hull_size--;\n }\n lower_hull[lower_hull_size++] = points_sorted[i];\n }\n\n // Merge the upper and lower hulls\n int hull_size = 0;\n for (int i = 0; i < upper_hull_size; i++) {\n hull[hull_size++] = upper_hull[i];\n }\n for (int i = lower_hull_size - 2; i >= 0; i--) {\n hull[hull_size++] = lower_hull[i];\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_size);\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_hull_points = 4;\n\n // Find the leftmost point\n int leftmost_index = 0;\n for (int i = 1; i < num_points; ++i) {\n if (points(i).x < points(leftmost_index).x) {\n leftmost_index = i;\n }\n }\n\n // Sort the points by their polar angle with respect to the leftmost point\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double dx1 = a.x - points(leftmost_index).x;\n double dy1 = a.y - points(leftmost_index).y;\n double dx2 = b.x - points(leftmost_index).x;\n double dy2 = b.y - points(leftmost_index).y;\n return (atan2(dy1, dx1) < 
atan2(dy2, dx2));\n };\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, num_points), points_sorted, compare_polar_angle);\n\n // Find the convex hull using Graham's scan\n int hull_index = 0;\n for (int i = 0; i < num_points; ++i) {\n while (hull_index >= 2 && (points_sorted(i).x - hull(hull_index - 1).x) * (hull(hull_index - 2).y - hull(hull_index - 1).y) - (points_sorted(i).y - hull(hull_index - 1).y) * (hull(hull_index - 2).x - hull(hull_index - 1).x) < 0) {\n hull_index--;\n }\n hull(hull_index++) = points_sorted(i);\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_index);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, points_sorted.extent(0)),\n KOKKOS_LAMBDA(const int &i, const auto &upper_hull) {\n auto &hull = upper_hull.access();\n if (i == 0) {\n hull[0] = points_sorted(i);\n return hull;\n }\n while (hull.extent(0) >= 2 &&\n (hull(hull.extent(0) - 2).y - hull(hull.extent(0) - 1).y) *\n (points_sorted(i).x - hull(hull.extent(0) - 1).x) <\n (hull(hull.extent(0) - 1).y - points_sorted(i).y) *\n (hull(hull.extent(0) - 2).x - hull(hull.extent(0) - 1).x)) {\n hull.resize(hull.extent(0) - 1);\n }\n hull.resize(hull.extent(0) + 1);\n hull(hull.extent(0) - 1) = points_sorted(i);\n return hull;\n },\n Kokkos::View(nullptr, 0));\n\n auto lower_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1),\n KOKKOS_LAMBDA(const int &i, const auto &lower_hull) {\n auto &hull = lower_hull.access();\n if (i == points_sorted.extent(0) - 1) {\n hull[0] = points_sorted(i);\n return hull;\n }\n while (hull.extent(0) >= 2 &&\n (hull(hull.extent(0) - 2).y - hull(hull.extent(0) - 1).y) *\n (points_sorted(i).x - hull(hull.extent(0) - 1).x) <\n (hull(hull.extent(0) - 1).y - points_sorted(i).y) *\n (hull(hull.extent(0) - 2).x - hull(hull.extent(0) - 1).x)) {\n hull.resize(hull.extent(0) - 1);\n }\n hull.resize(hull.extent(0) + 1);\n hull(hull.extent(0) - 1) = points_sorted(i);\n return hull;\n },\n Kokkos::View(nullptr, 0));\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.extent(0) + lower_hull.extent(0) - 2);\n Kokkos::parallel_for(\n Kokkos::RangePolicy(0, upper_hull.extent(0)),\n KOKKOS_LAMBDA(const int &i) { hull(i) = upper_hull(i); });\n Kokkos::parallel_for(\n Kokkos::RangePolicy(0, lower_hull.extent(0) - 1),\n KOKKOS_LAMBDA(const int &i) { hull(upper_hull.extent(0) + i) = lower_hull(i); });\n}\n", + "\n\n using Kok\u30b3\u30af::parallel_for;\n using Kok\u30b3\u30af::sort;\n\n // Sort the points by their x-coordinates.\n sort(Kok\u30b3\u30af::begin(points), Kok\u30b3\u30af::end(points), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper hull.\n int upperHullSize = 0;\n for (int i = 0; i < points.size(); ++i) {\n while (upperHullSize >= 2 && Kok\u30b3\u30af::cross(hull[upperHullSize - 2], hull[upperHullSize - 1], points[i]) <= 0) {\n --upperHullSize;\n }\n hull[upperHullSize++] = points[i];\n }\n\n // Find the lower hull.\n int lowerHullSize = 0;\n for (int i = points.size() - 1; i >= 0; --i) {\n while (lowerHullSize >= 2 && Kok\u30b3\u30af::cross(hull[lowerHullSize - 2], hull[lowerHullSize - 1], points[i]) <= 0) {\n --lowerHullSize;\n }\n hull[lowerHullSize++] = points[i];\n }\n\n // Remove the duplicate points from the lower hull.\n while 
(lowerHullSize > 0 && hull[lowerHullSize - 1] == hull[lowerHullSize - 2]) {\n --lowerHullSize;\n }\n\n // Combine the upper and lower hulls.\n hull.resize(upperHullSize + lowerHullSize - 1);\n std::copy(hull.begin(), hull.begin() + upperHullSize, hull.begin());\n std::copy(hull.begin() + upperHullSize, hull.begin() + upperHullSize + lowerHullSize - 1, hull.begin() + upperHullSize);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points_sorted.extent(0)), KOKKOS_LAMBDA(const int &i, auto &upper_hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n upper_hull.push_back(points_sorted(i));\n } else {\n while (upper_hull.size() >= 2 && cross(upper_hull[upper_hull.size() - 2], upper_hull[upper_hull.size() - 1], points_sorted(i)) <= 0) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n }\n });\n\n auto lower_hull = Kokkos::parallel_reduce(Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1), KOKKOS_LAMBDA(const int &i, auto &lower_hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n lower_hull.push_back(points_sorted(i));\n } else {\n while (lower_hull.size() >= 2 && cross(lower_hull[lower_hull.size() - 2], lower_hull[lower_hull.size() - 1], points_sorted(i)) <= 0) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n });\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 2);\n std::copy(upper_hull.begin(), upper_hull.end() - 1, hull.begin());\n std::copy(lower_hull.begin() + 1, lower_hull.end(), hull.begin() + upper_hull.size() - 1);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points_sorted(i)->x < points_sorted(j)->x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n int upper_hull_size = 0;\n int lower_hull_size = 0;\n\n // Add the first point to the upper hull\n upper_hull(upper_hull_size++) = points_sorted(0);\n\n // Add the last point to the lower hull\n lower_hull(lower_hull_size++) = points_sorted(points.extent(0) - 1);\n\n // Iterate over the remaining points\n for (int i = 1; i < points.extent(0) - 1; i++) {\n // Add the point to the upper hull\n while (upper_hull_size >= 2 &&\n (points_sorted(i)->y - upper_hull(upper_hull_size - 2)->y) *\n (upper_hull(upper_hull_size - 1)->x - upper_hull(upper_hull_size - 2)->x) <=\n (upper_hull(upper_hull_size - 1)->y - upper_hull(upper_hull_size - 2)->y) *\n (points_sorted(i)->x - upper_hull(upper_hull_size - 2)->x)) {\n upper_hull_size--;\n }\n upper_hull(upper_hull_size++) = points_sorted(i);\n\n // Add the point to the lower hull\n while (lower_hull_size >= 2 &&\n (points_sorted(i)->y - lower_hull(lower_hull_size - 2)->y) *\n (lower_hull(lower_hull_size - 1)->x - lower_hull(lower_hull_size - 2)->x) >=\n (lower_hull(lower_hull_size - 1)->y - lower_hull(lower_hull_size - 2)->y) *\n (points_sorted(i)->x - lower_hull(lower_hull_size - 2)->x)) {\n lower_hull_size--;\n }\n lower_hull(lower_hull_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_hull_size + lower_hull_size - 2;\n for (int i = 0; i < upper_hull_size; i++) {\n hull(i) = 
upper_hull(i);\n }\n for (int i = upper_hull_size; i < hull_size; i++) {\n hull(i) = lower_hull(i - upper_hull_size + 1);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinate\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int &i) {\n points_sorted(i) = points(i);\n });\n\n // Find the upper and lower hulls\n std::vector upper_hull, lower_hull;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upper_hull.size() >= 2 && cross(upper_hull[upper_hull.size() - 2], upper_hull[upper_hull.size() - 1], points_sorted(i)) <= 0) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lower_hull.size() >= 2 && cross(lower_hull[lower_hull.size() - 2], lower_hull[lower_hull.size() - 1], points_sorted(i)) >= 0) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n\n // Merge the upper and lower hulls\n for (int i = lower_hull.size() - 2; i >= 0; i--) {\n upper_hull.push_back(lower_hull[i]);\n }\n\n // Copy the convex hull to the output view\n Kokkos::parallel_for(Kokkos::RangePolicy(0, upper_hull.size()), KOKKOS_LAMBDA(const int &i) {\n hull(i) = upper_hull[i];\n });\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, points_sorted.extent(0)),\n KOKKOS_LAMBDA(const int &i, auto &hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n hull.push_back(points_sorted(i));\n } else {\n while (hull.size() >= 2 &&\n (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points_sorted(i).x - hull[hull.size() - 1].x) <\n (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points_sorted(i).y - hull[hull.size() - 1].y)) {\n hull.pop_back();\n }\n hull.push_back(points_sorted(i));\n }\n },\n hull);\n\n auto lower_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1),\n KOKKOS_LAMBDA(const int &i, auto &hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n hull.push_back(points_sorted(i));\n } else {\n while (hull.size() >= 2 &&\n (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points_sorted(i).x - hull[hull.size() - 1].x) <\n (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points_sorted(i).y - hull[hull.size() - 1].y)) {\n hull.pop_back();\n }\n hull.push_back(points_sorted(i));\n }\n },\n hull);\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 1);\n std::copy(upper_hull.begin(), upper_hull.end(), hull.begin());\n std::copy(lower_hull.begin() + 1, lower_hull.end(), hull.begin() + upper_hull.size());\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_dimensions = points.extent(1);\n const int num_hull_points = 0;\n\n // Find the point with the minimum y-coordinate.\n int min_y_index = 0;\n for (int i = 1; i < num_points; i++) {\n if (points(i, 1) < points(min_y_index, 1)) {\n min_y_index = i;\n }\n }\n\n // Swap the point with the minimum y-coordinate to the first position.\n Point temp = points(0, 0);\n points(0, 0) = points(min_y_index, 0);\n points(min_y_index, 0) = temp;\n\n // Sort the points by their polar 
angle with respect to the first point.\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double angle_a = atan2(a.y - points(0, 1), a.x - points(0, 0));\n double angle_b = atan2(b.y - points(0, 1), b.x - points(0, 0));\n return angle_a < angle_b;\n };\n Kokkos::sort(Kokkos::RangePolicy(1, num_points), points, compare_polar_angle);\n\n // Find the convex hull using Graham's scan.\n int hull_size = 0;\n for (int i = 0; i < num_points; i++) {\n while (hull_size >= 2 && (points(hull[hull_size - 2], 0) - points(hull[hull_size - 1], 0)) * (points(i, 1) - points(hull[hull_size - 1], 1)) - (points(i, 0) - points(hull[hull_size - 1], 0)) * (points(hull[hull_size - 2], 1) - points(hull[hull_size - 1], 1)) < 0) {\n hull_size--;\n }\n hull[hull_size++] = i;\n }\n\n // Resize the hull to the correct size.\n hull = Kokkos::View(hull, hull_size);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)), points_sorted, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n int upper_size = 0;\n int lower_size = 0;\n\n // Add the first point to the upper hull\n upper_hull(upper_size++) = points_sorted(0);\n\n // Add the last point to the lower hull\n lower_hull(lower_size++) = points_sorted(points_sorted.extent(0) - 1);\n\n // Iterate over the remaining points\n for (int i = 1; i < points_sorted.extent(0) - 1; i++) {\n // Add the point to the upper hull\n while (upper_size >= 2 && (upper_hull(upper_size - 2).y - upper_hull(upper_size - 1).y) * (points_sorted(i).x - upper_hull(upper_size - 1).x) < (upper_hull(upper_size - 1).y - points_sorted(i).y) * (upper_hull(upper_size - 2).x - upper_hull(upper_size - 1).x)) {\n upper_size--;\n }\n upper_hull(upper_size++) = points_sorted(i);\n\n // Add the point to the lower hull\n while (lower_size >= 2 && (lower_hull(lower_size - 2).y - lower_hull(lower_size - 1).y) * (points_sorted(i).x - lower_hull(lower_size - 1).x) < (lower_hull(lower_size - 1).y - points_sorted(i).y) * (lower_hull(lower_size - 2).x - lower_hull(lower_size - 1).x)) {\n lower_size--;\n }\n lower_hull(lower_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_size + lower_size - 2;\n for (int i = 0; i < upper_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_size; i < hull_size; i++) {\n hull(i) = lower_hull(lower_size - (i - upper_size + 1));\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_size);\n}\n", + "\n\n // Find the leftmost point\n auto leftmost = points.reduce(\n KOKKOS_LAMBDA(const Point &a, const Point &b) { return a.x < b.x ? 
a : b; },\n points.extent(0));\n\n // Sort the points by the angle they make with the leftmost point\n Kokkos::View sorted_points(\"sorted_points\", points.extent(0));\n Kokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i) {\n sorted_points[i] = &points[i];\n });\n\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n sorted_points,\n KOKKOS_LAMBDA(const Point *&a, const Point *&b) {\n return std::atan2(a->y - leftmost->y, a->x - leftmost->x) <\n std::atan2(b->y - leftmost->y, b->x - leftmost->x);\n });\n\n // Find the convex hull using Graham scan\n Kokkos::View stack(\"stack\", points.extent(0));\n int top = 0;\n for (int i = 0; i < points.extent(0); i++) {\n Point *p = sorted_points[i];\n while (top >= 2 &&\n (stack[top - 1]->y - stack[top - 2]->y) * (p->x - stack[top - 2]->x) <\n (stack[top - 1]->x - stack[top - 2]->x) * (p->y - stack[top - 2]->y))\n top--;\n stack[top++] = p;\n }\n\n // Copy the convex hull to the output view\n Kokkos::parallel_for(Kokkos::RangePolicy(0, top),\n KOKKOS_LAMBDA(const int &i) { hull[i] = stack[i]; });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int N = points.extent(0);\n const int HULL_SIZE = 2 * N;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n parallel_sort(Kokkos::Cuda(), points_sorted, [=](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the lower and upper hulls\n auto lower_hull = hull.slice(0, N);\n auto upper_hull = hull.slice(N, HULL_SIZE);\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = 0; i < N; i++) {\n while (lower_hull_size >= 2 && cross(lower_hull[lower_hull_size - 2], lower_hull[lower_hull_size - 1], points_sorted[i]) <= 0) {\n lower_hull_size--;\n }\n lower_hull[lower_hull_size++] = points_sorted[i];\n }\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = N - 1; i >= 0; i--) {\n while (upper_hull_size >= 2 && cross(upper_hull[upper_hull_size - 2], upper_hull[upper_hull_size - 1], points_sorted[i]) <= 0) {\n upper_hull_size--;\n }\n upper_hull[upper_hull_size++] = points_sorted[i];\n }\n\n // Merge the lower and upper hulls\n int hull_size = lower_hull_size + upper_hull_size - 1;\n for (int i = 0; i < hull_size; i++) {\n hull[i] = (i < lower_hull_size) ? 
lower_hull[i] : upper_hull[i - lower_hull_size];\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort points by x-coordinate\n auto points_view = Kokkos::subview(points, Kokkos::ALL(), Kokkos::ALL(), Kokkos::Rank<2>{0, 1});\n auto sorted_points = points_view;\n Kokkos::sort(Kokkos::RangePolicy(0, points_view.extent(0)),\n KOKKOS_LAMBDA(const int i, const int j) {\n return sorted_points(i, 0).x < sorted_points(j, 0).x;\n });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::View(Kokkos::ALL(), \"Upper Hull\");\n auto lower_hull = Kokkos::View(Kokkos::ALL(), \"Lower Hull\");\n int upper_size = 0, lower_size = 0;\n\n // Find the upper hull\n for (int i = 0; i < points_view.extent(0); i++) {\n while (upper_size >= 2 &&\n (upper_hull(upper_size - 1).y - upper_hull(upper_size - 2).y) *\n (sorted_points(i, 0).x - upper_hull(upper_size - 2).x) <\n (upper_hull(upper_size - 1).x - upper_hull(upper_size - 2).x) *\n (sorted_points(i, 0).y - upper_hull(upper_size - 2).y)) {\n upper_size--;\n }\n upper_hull(upper_size++) = sorted_points(i, 0);\n }\n\n // Find the lower hull\n for (int i = points_view.extent(0) - 1; i >= 0; i--) {\n while (lower_size >= 2 &&\n (lower_hull(lower_size - 1).y - lower_hull(lower_size - 2).y) *\n (sorted_points(i, 0).x - lower_hull(lower_size - 2).x) <\n (lower_hull(lower_size - 1).x - lower_hull(lower_size - 2).x) *\n (sorted_points(i, 0).y - lower_hull(lower_size - 2).y)) {\n lower_size--;\n }\n lower_hull(lower_size++) = sorted_points(i, 0);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_size + lower_size - 2;\n hull = Kokkos::View(hull_size, \"Convex Hull\");\n for (int i = 0; i < upper_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_size; i < hull_size; i++) {\n hull(i) = lower_hull(i - upper_size + lower_size - 1);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n const int N = points.extent(0);\n const int M = hull.extent(0);\n\n auto points_d = create_mirror_view(points);\n auto hull_d = create_mirror_view(hull);\n\n parallel_for(N, KOKKOS_LAMBDA(const int i) {\n points_d(i) = points(i);\n });\n\n parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &min_index) {\n if (points_d(i).y < points_d(min_index).y || (points_d(i).y == points_d(min_index).y && points_d(i).x < points_d(min_index).x)) {\n min_index = i;\n }\n }, 0);\n\n int p = 0;\n hull_d(p) = points_d(min_index);\n\n int q;\n do {\n q = (p + 1) % N;\n for (int i = 0; i < N; i++) {\n if (orientation(points_d(p), points_d(i), points_d(q)) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != min_index);\n\n int count = 0;\n do {\n hull_d(count) = points_d(p);\n count++;\n q = (p + 1) % N;\n for (int i = 0; i < N; i++) {\n if (orientation(points_d(p), points_d(i), points_d(q)) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != min_index);\n\n parallel_for(count, KOKKOS_LAMBDA(const int i) {\n hull(i) = hull_d(i);\n });\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the leftmost and rightmost points\n const Point *leftmost = points(0);\n const Point *rightmost = points(points.extent(0) - 1);\n\n // Create a stack to store the points on the hull\n std::stack stack;\n\n // Add the leftmost and rightmost points to the stack\n stack.push(leftmost);\n stack.push(rightmost);\n\n // Iterate over the remaining points\n for (int i = 1; i < points.extent(0) - 1; i++) {\n const Point *p = points(i);\n\n // 
Check if the point is to the left of the line defined by the last two points on the stack\n if (orientation(stack.top(), stack.top()->next, p) == LEFT) {\n // If it is, pop the last point from the stack\n stack.pop();\n // And add the new point to the stack\n stack.push(p);\n }\n }\n\n // Copy the points from the stack to the hull\n int hull_size = stack.size();\n for (int i = 0; i < hull_size; i++) {\n hull(i) = stack.top();\n stack.pop();\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Find the point with the smallest y-coordinate.\n int min_index = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].y < points[min_index].y) {\n min_index = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n std::swap(points[0], points[min_index]);\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points.begin() + 1, points.end(), [&](const Point& a, const Point& b) {\n return (atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x));\n });\n\n // Initialize the convex hull with the first three points.\n hull.push_back(points[0]);\n hull.push_back(points[1]);\n hull.push_back(points[2]);\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < points.size(); i++) {\n while (hull.size() >= 2 && (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y) - (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) >= 0) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Find the first point in the upper hull that is to the right of the current point\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Find the first point in the lower hull that is to the left of the current point\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), 
upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n Point leftmost = points[0];\n for (const Point& p : points) {\n if (p.x < leftmost.x) {\n leftmost = p;\n }\n }\n\n // Sort the points by their angle with the leftmost point\n std::sort(points.begin(), points.end(), [&leftmost](const Point& a, const Point& b) {\n return std::atan2(a.y - leftmost.y, a.x - leftmost.x) < std::atan2(b.y - leftmost.y, b.x - leftmost.x);\n });\n\n // Find the convex hull using Graham's scan\n std::stack convex_hull;\n convex_hull.push(leftmost);\n convex_hull.push(points[1]);\n for (size_t i = 2; i < points.size(); i++) {\n Point top = convex_hull.top();\n while (convex_hull.size() >= 2 && (top.x - convex_hull.at(convex_hull.size() - 2).x) * (points[i].y - top.y) - (top.y - convex_hull.at(convex_hull.size() - 2).y) * (points[i].x - top.x) < 0) {\n convex_hull.pop();\n top = convex_hull.top();\n }\n convex_hull.push(points[i]);\n }\n\n // Copy the convex hull to the output vector\n while (!convex_hull.empty()) {\n hull.push_back(convex_hull.top());\n convex_hull.pop();\n }\n}\n", + ".", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the leftmost and rightmost points\n Point leftmost = points[0];\n Point rightmost = points[points.size() - 1];\n\n // Find the points that are on the upper and lower hulls\n std::vector upperHull;\n std::vector lowerHull;\n\n #pragma omp parallel sections\n {\n #pragma omp section\n {\n upperHull.push_back(leftmost);\n for (int i = 1; i < points.size(); i++) {\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n }\n }\n\n #pragma omp section\n {\n lowerHull.push_back(rightmost);\n for (int i = points.size() - 2; i >= 0; i--) {\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) <= 0) {\n 
lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n }\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size());\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.begin(), lowerHull.end());\n\n // Remove the duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the lower hull\n for (const Point& point : points) {\n while (hull.size() >= 2 && (hull.back() - hull[hull.size() - 2]).cross(point - hull.back()) <= 0) {\n hull.pop_back();\n }\n hull.push_back(point);\n }\n\n // Find the upper hull\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() >= 2 && (hull.back() - hull[hull.size() - 2]).cross(points[i] - hull.back()) <= 0) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n int n = points.size();\n if (n < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n int l = 0, r = 0;\n hull.push_back(points[l]);\n for (int i = 1; i < n; i++) {\n while (l >= 1 && (points[i].y - hull[l - 1].y) * (hull[l].x - hull[l - 1].x) <= (points[i].x - hull[l - 1].x) * (hull[l].y - hull[l - 1].y))\n l--;\n while (r >= 1 && (points[i].y - hull[r - 1].y) * (hull[r].x - hull[r - 1].x) <= (points[i].x - hull[r - 1].x) * (hull[r].y - hull[r - 1].y))\n r--;\n hull.push_back(points[i]);\n l++;\n r++;\n }\n\n hull.push_back(points[n - 1]);\n for (int i = n - 2; i >= 0; i--) {\n while (l >= 1 && (points[i].y - hull[l - 1].y) * (hull[l].x - hull[l - 1].x) <= (points[i].x - hull[l - 1].x) * (hull[l].y - hull[l - 1].y))\n l--;\n while (r >= 1 && (points[i].y - hull[r - 1].y) * (hull[r].x - hull[r - 1].x) <= (points[i].x - hull[r - 1].x) * (hull[r].y - hull[r - 1].y))\n r--;\n hull.push_back(points[i]);\n l++;\n r++;\n }\n\n hull.pop_back();\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n hull.pop_back();\n}\n", + "\n\n // Find the leftmost 
point\n int l = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[l].x) {\n l = i;\n }\n }\n\n int p = l, q;\n do {\n hull.push_back(points[p]);\n q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != l);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) *\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) *\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Combine the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * 
(point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n int l = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[l].x) {\n l = i;\n }\n }\n\n int p = l;\n int q;\n do {\n hull.push_back(points[p]);\n q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == -1) {\n q = i;\n }\n }\n p = q;\n } while (p != l);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n int leftMost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftMost].x) {\n leftMost = i;\n }\n }\n\n // Start from leftmost point, keep moving counterclockwise until we reach the start point again\n int p = leftMost;\n do {\n // Add current point to result\n hull.push_back(points[p]);\n\n // Search for a point 'q' such that the vector 'p-q' has the smallest positive orientation.\n int q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == 2) {\n q = i;\n }\n }\n\n // If 'q' is not 'p', then update 'p' and continue\n p = q;\n\n } while (p != leftMost);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (point.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - point.x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n 
while (lowerHull.size() >= 2 &&\n (point.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - point.x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n Point p = points[leftmost];\n hull.push_back(p);\n points.erase(points.begin() + leftmost);\n\n while (points.size() > 0) {\n int next = -1;\n for (int i = 0; i < points.size(); i++) {\n if (next == -1 || (orientation(p, points[i], points[next]) == 2)) {\n next = i;\n }\n }\n p = points[next];\n hull.push_back(p);\n points.erase(points.begin() + next);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) *\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) *\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower 
hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "serial", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Find the leftmost point\n int leftMost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftMost].x) {\n leftMost = i;\n }\n }\n\n // Start from the leftmost point and keep moving counter-clockwise until we reach the leftmost point again\n Point p = points[leftMost];\n do {\n hull.push_back(p);\n int next = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i] != p && orientation(p, points[i], points[next]) == 2) {\n next = i;\n }\n }\n p = points[next];\n } while (p != points[leftMost]);\n}\n", + "\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n int n = points.size();\n std::vector up(n), down(n);\n int m = 0, k = 0;\n\n for (int i = 0; i < n; i++) {\n if ((i == 0) || ((i == 1) && (points[i].x == points[i - 1].x) && (points[i].y == points[i - 1].y)))\n continue;\n down[m++] = points[i];\n }\n\n for (int i = n - 1; i >= 0; i--) {\n if ((i == n - 1) || ((i == n - 2) && (points[i].x == points[i + 1].x) && (points[i].y == points[i + 1].y)))\n continue;\n up[k++] = points[i];\n }\n\n hull.clear();\n int t[m + k + 1];\n int q = 0;\n for (int i = 0; i < m; i++)\n t[q++] = i;\n for (int i = k - 1; i >= 0; i--)\n t[q++] = i + m;\n t[q] = t[0];\n t[q + 1] = t[1];\n\n for (int i = 0; i <= q; i++) {\n int a = t[i], b = t[i + 1];\n if (a == b)\n continue;\n int c = t[i + 2];\n if (a != c) {\n hull.push_back(down[a]);\n continue;\n }\n int x1 = down[a].x, y1 = down[a].y;\n int x2 = down[b].x, y2 = down[b].y;\n int x3 = down[c].x, y3 = down[c].y;\n int dx1 = x2 - x1, dy1 = y2 - y1;\n int dx2 = x3 - x2, dy2 = y3 - y2;\n int cross = dx1 * dy2 - dx2 * dy1;\n if (cross < 0)\n hull.push_back(down[a]);\n else if (cross > 0)\n hull.push_back(down[b]);\n else {\n hull.push_back(down[a]);\n hull.push_back(down[b]);\n }\n }\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n int n = points.size();\n std::vector upper(n);\n std::vector lower(n);\n\n int m = 0;\n for (int i = 0; i < n; i++) {\n while (m >= 2 && (upper[m - 1].y - upper[m - 2].y) * (points[i].x - upper[m - 2].x) <=\n (upper[m - 1].x - upper[m - 2].x) * (points[i].y - upper[m - 2].y)) {\n m--;\n }\n upper[m++] = points[i];\n }\n\n m = 0;\n for (int i = n - 1; i >= 0; i--) {\n while (m >= 2 && (lower[m - 1].y - lower[m - 2].y) * (points[i].x - lower[m - 2].x) <=\n (lower[m - 1].x - lower[m - 2].x) * (points[i].y - lower[m - 2].y)) {\n m--;\n }\n lower[m++] = points[i];\n }\n\n for (int i = 0; i < m; i++) {\n hull.push_back(upper[i]);\n }\n for (int i = m 
- 2; i >= 0; i--) {\n hull.push_back(lower[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x;\n });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Find the first point in the upper hull that is to the right of the current point\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x) *\n (point.y - upperHull[upperHull.size() - 1].y) -\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Find the first point in the lower hull that is to the left of the current point\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x) *\n (point.y - lowerHull[lowerHull.size() - 1].y) -\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n hull.pop_back();\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n hull.push_back(points[0]);\n hull.push_back(points[1]);\n\n for (size_t i = 2; i < points.size(); i++) {\n while (hull.size() >= 2 &&\n (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <\n (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n int t = hull.size();\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() > t &&\n (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <\n (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n hull.pop_back();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n 
// Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n std::vector upper;\n for (auto const& point : points) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper.back(), point) <= 0) {\n upper.pop_back();\n }\n upper.push_back(point);\n }\n upper.pop_back();\n\n std::vector lower;\n for (auto const& point : points) {\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower.back(), point) >= 0) {\n lower.pop_back();\n }\n lower.push_back(point);\n }\n lower.pop_back();\n\n hull.insert(hull.end(), upper.begin(), upper.end());\n hull.insert(hull.end(), lower.rbegin(), lower.rend());\n}\n", + "\n\n if (points.size() <= 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n std::vector upper;\n for (const Point& p : points) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], p) <= 0)\n upper.pop_back();\n upper.push_back(p);\n }\n upper.pop_back();\n\n std::vector lower;\n for (int i = points.size() - 1; i >= 0; i--) {\n const Point& p = points[i];\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], p) <= 0)\n lower.pop_back();\n lower.push_back(p);\n }\n lower.pop_back();\n\n hull.insert(hull.end(), upper.begin(), upper.end());\n hull.insert(hull.end(), lower.rbegin(), lower.rend());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) == -1) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) == 1) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) 
{ return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Combine the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n int n = points.size();\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n std::vector upper;\n for (int i = 0; i < n; i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], points[i]) <= 0)\n upper.pop_back();\n upper.push_back(points[i]);\n }\n\n std::vector lower;\n for (int i = n - 1; i >= 0; i--) {\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], points[i]) <= 0)\n lower.pop_back();\n lower.push_back(points[i]);\n }\n\n for (int i = 1; i < upper.size() - 1; i++)\n hull.push_back(upper[i]);\n for (int i = lower.size() - 2; i >= 1; i--)\n hull.push_back(lower[i]);\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the 
points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull.back(), point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull.back(), point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper hull\n for (const Point& point : points) {\n while (hull.size() >= 2 && (hull.back().y - hull[hull.size() - 2].y) * (point.x - hull[hull.size() - 2].x) <=\n (hull.back().x - hull[hull.size() - 2].x) * (point.y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(point);\n }\n\n // Find the lower hull\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() >= 2 && (hull.back().y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) <=\n (hull.back().x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinate\n std::sort(points.begin(), points.end(), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper convex hull\n for (const Point &p : points) {\n while (hull.size() >= 2 && cross(hull[hull.size() - 2], hull[hull.size() - 1], p) <= 0) {\n hull.pop_back();\n }\n hull.push_back(p);\n }\n\n // Find the lower convex hull\n std::reverse(points.begin(), points.end());\n for (const Point &p : points) {\n while (hull.size() >= 2 && cross(hull[hull.size() - 2], hull[hull.size() - 1], p) <= 0) {\n hull.pop_back();\n }\n hull.push_back(p);\n }\n\n // Remove the last point from the lower convex hull\n 
hull.pop_back();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x;\n });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point a, Point b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point p : points) {\n // Build the lower hull\n while (lowerHull.size() >= 2 &&\n orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], p) <= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(p);\n\n // Build the upper hull\n while (upperHull.size() >= 2 &&\n orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], p) >= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(p);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), lowerHull.begin(), lowerHull.end());\n for (int i = upperHull.size() - 2; i >= 0; i--) {\n hull.push_back(upperHull[i]);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "hip", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\n__global__ void convexHull(const Point *points, size_t numPoints, Point *hull, size_t hullSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int numThreads = hipGetNumThreads();\n int threadIdx = hipGetThreadIdx();\n int blockIdx = hipGetBlockIdx();\n int blockDim = hipGetBlockDim();\n int gridDim = hipGetGridDim();\n\n // Each thread computes the convex hull of a subset of points\n int start = threadIdx.x + blockIdx.x * blockDim.x;\n int end = fmin(start + numThreads, numPoints);\n\n // Compute the convex hull of the subset of points\n vector subset(end - start);\n for (int i = start; i < end; i++) {\n subset[i - start] = points[i];\n }\n vector hullSubset = convexHull(subset);\n\n // Merge the convex hull of the subset of points with the overall convex hull\n for (int i = 0; i < hullSubset.size(); i++) {\n hull[hullSize + i] = hullSubset[i];\n }\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize > 2 && crossProduct(hull[hullSize - 2], hull[hullSize - 1], points[i]) <= 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size() - 2;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[i] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[upperHull.size() + i - 1] = lowerHull[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by polar angle with respect to the 
leftmost point\n std::sort(points, points + numPoints, [leftmost](const Point &a, const Point &b) {\n return atan2(a.y - points[leftmost].y, a.x - points[leftmost].x) < atan2(b.y - points[leftmost].y, b.x - points[leftmost].x);\n });\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last two points in the hull, add it to the hull\n while (hullSize >= 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < numPoints) {\n hull[gid] = points[gid];\n }\n\n __syncthreads();\n\n if (tid == 0) {\n std::sort(hull, hull + numPoints, [](const Point &a, const Point &b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n\n hullSize = m;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < numPoints) {\n hull[gid] = points[gid];\n }\n __syncthreads();\n\n if (tid == 0) {\n sort(hull, hull + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n for (int i = numPoints - 2, t = m + 1; i >= 0; i--) {\n while (t >= m + 1 && (hull[t - 1].y - hull[t - 2].y) * (hull[i].x - hull[t - 1].x) <\n (hull[t - 1].x - hull[t - 2].x) * (hull[i].y - hull[t - 1].y)) {\n t--;\n }\n hull[t++] = hull[i];\n }\n hullSize = t - m;\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n thrust::convex_hull(thrust::device, points, points + numPoints,\n std::back_inserter(upperHull), std::back_inserter(lowerHull));\n\n // Merge the upper and lower hulls into the final hull\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n\n // Return the size of the hull\n *hullSize = upperHull.size() + lowerHull.size();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while 
(lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Combine the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size() - 2;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[i] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[i + upperHull.size() - 1] = lowerHull[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the minimum point with the first point.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the remaining points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [&](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize > 2 && crossProduct(hull[hullSize - 2], hull[hullSize - 1], points[i]) <= 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by their polar angle with respect to the leftmost point\n auto cmp = [&](const Point &a, const Point &b) {\n return (a.y - points[leftmost].y) * (b.x - points[leftmost].x) - (a.x - points[leftmost].x) * (b.y - points[leftmost].y);\n };\n std::sort(points + leftmost + 1, points + numPoints, cmp);\n\n // Build the convex hull using Graham's scan\n int hullIndex = 0;\n for (int i = leftmost + 1; i < numPoints; i++) {\n while (hullIndex >= 2 && cmp(hull[hullIndex - 1], hull[hullIndex - 2], points[i])) {\n hullIndex--;\n }\n hull[hullIndex++] = points[i];\n }\n\n // Copy the convex hull to the output buffer\n std::copy(hull, hull + hullIndex, hull);\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((hull[hullSize - 1].x - points[i].x) * (hull[hullSize - 1].y - points[leftmost].y) -\n (hull[hullSize - 1].y - points[i].y) * (hull[hullSize - 1].x - points[leftmost].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n\n // If the current point is to the right of the last point in the hull, remove the last point from the hull\n else if ((hull[hullSize - 1].x - points[i].x) * (hull[hullSize - 1].y - points[leftmost].y) -\n (hull[hullSize - 1].y - points[i].y) * (hull[hullSize - 1].x - points[leftmost].x) >\n 0) {\n hullSize--;\n }\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n 
}\n\n // Initialize the hull\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the point is to the left of the line defined by the last two points in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hull[hullSize++] = points[i];\n }\n }\n\n // If the last point is not the same as the first point, add it to the hull\n if (hull[hullSize - 1] != hull[0]) {\n hull[hullSize++] = hull[0];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n\n // Find the convex hull using Graham's scan\n int hullSize = 1;\n for (int i = 1; i < numPoints; i++) {\n // Find the next point that is not on the same line as the previous two points\n while (hullSize > 1 &&\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 2].x - hull[hullSize - 1].x) <=\n (points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 2].y - hull[hullSize - 1].y)) {\n hullSize--;\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n }\n\n // If the last point in the hull is not the leftmost point, add it to the hull\n if (hull[hullSize - 1] != points[leftmost]) {\n hull[hullSize] = points[leftmost];\n hullSize++;\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Find the convex hull using Graham's scan\n int p = leftmost;\n do {\n // Find the next point that is not on the same line as the previous two points\n int q = (p + 1) % numPoints;\n for (int i = 0; i < numPoints; i++) {\n if (i != p && i != q &&\n (points[i].y - points[p].y) * (points[q].x - points[p].x) >\n (points[i].x - points[p].x) * (points[q].y - points[p].y)) {\n q = i;\n }\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[q];\n p = q;\n\n } while (p != leftmost);\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the remaining points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [&](const Point &a, const Point &b) {\n double 
angleA = atan2(a.y - points[0].y, a.x - points[0].x);\n double angleB = atan2(b.y - points[0].y, b.x - points[0].x);\n return angleA < angleB;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // TODO: Implement the convex hull algorithm\n // The kernel is launched with at least as many threads as points.\n // Store the result in `hull`.\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hullSize = 0;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[hullSize++] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[hullSize++] = lowerHull[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the minimum point with the first point.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize > 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n 
points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of points. The final hull is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n 
convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Send the leftmost point to all other ranks\n MPI_Bcast(&leftmost, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the convex hull on each rank\n std::vector localHull;\n grahamScan(points, localHull, leftmost);\n\n // Gather all the local hulls on rank 0\n if (rank == 0) {\n hull = localHull;\n for (int i = 1; i < size; i++) {\n std::vector recvHull;\n MPI_Recv(recvHull.data(), recvHull.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n hull.insert(hull.end(), recvHull.begin(), recvHull.end());\n }\n } else {\n MPI_Send(localHull.data(), localHull.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n\n // Remove duplicate points from the hull\n std::sort(hull.begin(), hull.end());\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Find the leftmost point\n int 
leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull.push_back(points[leftmost]);\n\n // Find the convex hull using Graham's scan\n int p = leftmost;\n do {\n // Find the next point that is not on the same line as the previous two points\n int q = (p + 1) % points.size();\n while (orientation(points[p], points[q], points[(q + 1) % points.size()]) == 0) {\n q = (q + 1) % points.size();\n }\n\n // Add the next point to the hull\n hull.push_back(points[q]);\n\n // Update the current point\n p = q;\n } while (p != leftmost);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the points by x-coordinate\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n\n // Return the hull on rank 0\n if (rank == 0) {\n return hull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n if (rank == 0) {\n hull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, hull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(hull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Start from the leftmost point, keep moving counter-clockwise\n // until we reach the start point again\n int p = leftmost;\n do {\n hull.push_back(points[p]);\n int q = (p + 1) % points.size();\n\n // Find the most counter-clockwise point from p\n for (int i = 0; i < points.size(); i++) {\n // Skip if i is the same as p or q\n if (i == p || i == q) {\n continue;\n }\n\n // Calculate the cross product of (q - p) x (i - p)\n double cross = (points[q].x - points[p].x) * (points[i].y - points[p].y) -\n (points[q].y - points[p].y) * (points[i].x - points[p].x);\n\n // If the cross product is positive, then i is more counter-clockwise than q\n if (cross > 0) {\n q = i;\n }\n }\n\n // Update p to be q\n p = q;\n } while (p != leftmost);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n 
MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the leftmost point\n Point leftmost = points[0];\n for (size_t i = 1; i < points.size(); i++) {\n if (points[i].x < leftmost.x) {\n leftmost = points[i];\n }\n }\n\n // Broadcast the leftmost point to all ranks\n MPI_Bcast(&leftmost, sizeof(Point), MPI_BYTE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull on each rank\n std::vector localHull;\n convexHull(points, leftmost, localHull);\n\n // Gather the local convex hull from all ranks\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "cuda", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\n__global__ void convexHull(const Point *points, size_t numPoints, Point *hull, size_t hullSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n if (tid == 0) {\n qsort(hull, numPoints, sizeof(Point), [](const void *a, const void *b) {\n Point *pa = (Point *)a;\n Point *pb = (Point *)b;\n if (pa->x != pb->x) {\n return pa->x < pb->x ? -1 : 1;\n }\n return pa->y < pb->y ? 
-1 : 1;\n });\n int m = 0;\n for (int i = 0; i < numPoints; ++i) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m].y - hull[m - 1].y) * (hull[m - 1].x - hull[m - 2].x)) {\n --m;\n }\n hull[m++] = hull[i];\n }\n hullSize = m;\n }\n __syncthreads();\n if (tid < hullSize) {\n hull[tid] = hull[tid];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(b.y - points[0].y, b.x - points[0].x) < atan2(a.y - points[0].y, a.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHull(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.begin(), lowerHull.end(), hull + upperHull.size());\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the point with the 
minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n double angleA = atan2(a.y - points[0].y, a.x - points[0].x);\n double angleB = atan2(b.y - points[0].y, b.x - points[0].x);\n return angleA < angleB;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n\n if (numPoints <= 1) {\n return;\n }\n\n // Sort the points by their x-coordinates\n thrust::sort(hull, hull + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the leftmost and rightmost points\n Point leftmost = hull[0];\n Point rightmost = hull[numPoints - 1];\n\n // Find the convex hull of the points to the left of the leftmost point\n int leftHullSize = 0;\n convexHull(hull, numPoints - 1, hull, leftHullSize);\n\n // Find the convex hull of the points to the right of the rightmost point\n int rightHullSize = 0;\n convexHull(hull + 1, numPoints - 1, hull + leftHullSize, rightHullSize);\n\n // Merge the two convex hulls\n int hullIndex = 0;\n int leftIndex = 0;\n int rightIndex = 0;\n while (leftIndex < leftHullSize && rightIndex < rightHullSize) {\n if (hull[leftIndex].y < hull[rightIndex].y) {\n hull[hullIndex++] = hull[leftIndex++];\n } else {\n hull[hullIndex++] = hull[rightIndex++];\n }\n }\n while (leftIndex < leftHullSize) {\n hull[hullIndex++] = hull[leftIndex++];\n }\n while (rightIndex < rightHullSize) {\n hull[hullIndex++] = hull[rightIndex++];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device_ptr(points), thrust::device_ptr(points + numPoints),\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(thrust::device_ptr(points), numPoints, &upperHull, &lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), thrust::device_ptr(hull));\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), thrust::device_ptr(hull + upperHull.size()));\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first 
point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current convex hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n double angle1 = atan2(a.y - points[0].y, a.x - points[0].x);\n double angle2 = atan2(b.y - points[0].y, b.x - points[0].x);\n return angle1 < angle2;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Start from the leftmost point and keep moving counter-clockwise\n int hullSize = 0;\n int current = leftmost;\n do {\n // Add the current point to the hull\n hull[hullSize] = points[current];\n hullSize++;\n\n // Find the next point that is most counter-clockwise from the current point\n int next = (current + 1) % numPoints;\n for (int i = 0; i < numPoints; i++) {\n if (orientation(points[current], points[next], points[i]) == 2) {\n next = i;\n }\n }\n\n // Set the current point to the next point\n current = next;\n } while (current != leftmost);\n\n // Return the size of the hull\n return hullSize;\n}\n", + "\n\n // Sort the points by their x-coordinate\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, 
points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), hull);\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= numPoints) {\n return;\n }\n\n // Find the point with the smallest y-coordinate\n int minIdx = idx;\n for (int i = idx + 1; i < numPoints; i++) {\n if (points[i].y < points[minIdx].y) {\n minIdx = i;\n }\n }\n\n // Swap the point with the smallest y-coordinate to the front of the array\n Point temp = points[idx];\n points[idx] = points[minIdx];\n points[minIdx] = temp;\n\n // Sort the points by their angle with respect to the point with the smallest y-coordinate\n std::sort(points + idx + 1, points + numPoints, [points, idx](const Point &a, const Point &b) {\n return (a.x - points[idx].x) * (b.y - points[idx].y) > (b.x - points[idx].x) * (a.y - points[idx].y);\n });\n\n // Initialize the hull with the first three points\n hull[0] = points[idx];\n hull[1] = points[idx + 1];\n hull[2] = points[idx + 2];\n int hullSize = 3;\n\n // Iterate over the remaining points\n for (int i = idx + 3; i < numPoints; i++) {\n // Check if the current point is inside the hull\n while (hullSize > 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) <=\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x)) {\n hullSize--;\n }\n\n // Add the current point to the hull\n hull[hullSize] = points[i];\n hullSize++;\n }\n\n // Return the hull\n hullSize = hullSize;\n}\n\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device_ptr(points), thrust::device_ptr(points + numPoints),\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), thrust::device_ptr(hull));\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), thrust::device_ptr(hull + upperHull.size()));\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n\n // Find the convex hull using Graham's scan\n int hullSize = 1;\n for (int i = 1; i < numPoints; i++) {\n // Find the next point that is not on the same line as the previous two points\n while (hullSize >= 2 &&\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 2].x - hull[hullSize - 1].x) 
<=\n (points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 2].y - hull[hullSize - 1].y)) {\n hullSize--;\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates.\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the leftmost point.\n Point leftmostPoint = points[0];\n\n // Find the rightmost point.\n Point rightmostPoint = points[numPoints - 1];\n\n // Find the points that are above the line defined by the leftmost and rightmost points.\n thrust::device_vector pointsAboveLine;\n thrust::copy_if(thrust::device, points, points + numPoints,\n thrust::make_device_vector(pointsAboveLine),\n [leftmostPoint, rightmostPoint](const Point &point) {\n return point.y > leftmostPoint.y + (rightmostPoint.y - leftmostPoint.y) *\n (point.x - leftmostPoint.x) /\n (rightmostPoint.x - leftmostPoint.x);\n });\n\n // Find the points that are below the line defined by the leftmost and rightmost points.\n thrust::device_vector pointsBelowLine;\n thrust::copy_if(thrust::device, points, points + numPoints,\n thrust::make_device_vector(pointsBelowLine),\n [leftmostPoint, rightmostPoint](const Point &point) {\n return point.y < leftmostPoint.y + (rightmostPoint.y - leftmostPoint.y) *\n (point.x - leftmostPoint.x) /\n (rightmostPoint.x - leftmostPoint.x);\n });\n\n // Find the convex hull of the points above the line.\n thrust::device_vector hullAboveLine;\n convexHull(thrust::device_ptr(pointsAboveLine.data()), pointsAboveLine.size(),\n thrust::device_ptr(hullAboveLine.data()), hullAboveLine.size());\n\n // Find the convex hull of the points below the line.\n thrust::device_vector hullBelowLine;\n convexHull(thrust::device_ptr(pointsBelowLine.data()), pointsBelowLine.size(),\n thrust::device_ptr(hullBelowLine.data()), hullBelowLine.size());\n\n // Merge the two convex hulls.\n thrust::copy(thrust::device, hullAboveLine.begin(), hullAboveLine.end(), hull);\n thrust::copy(thrust::device, hullBelowLine.begin(), hullBelowLine.end(), hull + hullAboveLine.size());\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(hull, hull + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <= (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n hullSize = m;\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < 
numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n }\n\n // If the hull has more than 2 points, remove the last point\n if (hullSize > 2) {\n hullSize--;\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of points. The final hull is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n\n // Return the hull on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return hull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = points.size();\n std::vector localHull;\n // Find the convex hull of the local points\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n std::vector upper, lower;\n for (int i = 0; i < n; i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], points[i]) <= 0)\n upper.pop_back();\n upper.push_back(points[i]);\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], points[i]) >= 0)\n lower.pop_back();\n lower.push_back(points[i]);\n }\n localHull.insert(localHull.end(), upper.begin(), upper.end());\n for (int i = lower.size() - 2; i >= 0; i--)\n localHull.push_back(lower[i]);\n }\n }\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n // Find the convex hull of the global points\n if (rank == 0) {\n std::sort(globalHull.begin(), globalHull.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n std::vector upper, lower;\n for (int i = 0; i < globalHull.size(); i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], globalHull[i]) <= 0)\n upper.pop_back();\n upper.push_back(globalHull[i]);\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], globalHull[i]) >= 0)\n lower.pop_back();\n lower.push_back(globalHull[i]);\n }\n hull.insert(hull.end(), upper.begin(), upper.end());\n for (int i = lower.size() - 2; i >= 0; i--)\n hull.push_back(lower[i]);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of 
the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by polar angle with respect to the leftmost point\n std::sort(points.begin(), points.end(), [leftmost](Point a, Point b) {\n double dx1 = a.x - points[leftmost].x;\n double dy1 = a.y - points[leftmost].y;\n double dx2 = b.x - points[leftmost].x;\n double dy2 = b.y - points[leftmost].y;\n return (atan2(dy1, dx1) < atan2(dy2, dx2));\n });\n\n // Use Graham's scan to find the convex hull\n hull.push_back(points[leftmost]);\n int m = 1;\n for (int i = 1; i < points.size(); i++) {\n while (m >= 2 && (points[i].x - hull[m - 1].x) * (hull[m].y - hull[m - 1].y) - (points[i].y - hull[m - 1].y) * (hull[m].x - hull[m - 1].x) <= 0) {\n hull.pop_back();\n m--;\n }\n hull.push_back(points[i]);\n m++;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local 
hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Reduce the local hulls to a single global hull\n std::vector globalHull;\n MPI_Reduce(&localHull[0], &globalHull[0], localHull.size() * sizeof(Point), MPI_BYTE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // If this is rank 0, copy the global hull to the output hull\n if (rank == 0) {\n hull = globalHull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(points.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Find the convex hull of the points on each rank\n std::vector local_hull;\n convexHull(points, local_hull);\n\n // Gather all the local hulls on rank 0\n std::vector global_hull;\n if (my_rank == 0) {\n global_hull.resize(local_hull.size() * num_procs);\n }\n MPI_Gather(local_hull.data(), local_hull.size(), MPI_DOUBLE, global_hull.data(), local_hull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (my_rank == 0) {\n convexHull(global_hull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n if (rank == 
0) {\n hull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, hull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(hull, hull);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(*convexHull[i], *convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n\t\tKOKKOS_LAMBDA(const int& i, const int& j) {\n\t\t\treturn points(i)->x < points(j)->x;\n\t\t});\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\thull.push_back(points(0));\n\thull.push_back(points(1));\n\tfor (int i = 2; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(hull[hull.size() - 2]->x - hull[hull.size() - 1]->x) * (points(i)->y - hull[hull.size() - 1]->y) -\n\t\t\t(hull[hull.size() - 2]->y - hull[hull.size() - 1]->y) * (points(i)->x - hull[hull.size() - 1]->x) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::TeamPolicy>;\n\tusing team_member_type = typename policy_type::member_type;\n\n\tconst size_t num_points = points.extent(0);\n\tconst size_t num_teams = (num_points + policy_type::team_size - 1) / policy_type::team_size;\n\n\t// Find the leftmost point\n\tteam_member_type team_member;\n\tPoint leftmost_point = points(0);\n\tfor (size_t i = 1; i < num_points; ++i) {\n\t\tif (points(i).x < leftmost_point.x) {\n\t\t\tleftmost_point = points(i);\n\t\t}\n\t}\n\n\t// Sort the points by polar angle with respect to the leftmost point\n\tstd::vector sorted_points(num_points);\n\tfor (size_t i = 0; i < num_points; ++i) {\n\t\tsorted_points[i] = points(i);\n\t}\n\tstd::sort(sorted_points.begin(), sorted_points.end(), 
[leftmost_point](Point const& p1, Point const& p2) {\n\t\tdouble angle1 = std::atan2(p1.y - leftmost_point.y, p1.x - leftmost_point.x);\n\t\tdouble angle2 = std::atan2(p2.y - leftmost_point.y, p2.x - leftmost_point.x);\n\t\treturn angle1 < angle2;\n\t});\n\n\t// Compute the perimeter of the convex hull using Graham's scan\n\tstd::vector convex_hull;\n\tconvex_hull.push_back(sorted_points[0]);\n\tconvex_hull.push_back(sorted_points[1]);\n\tfor (size_t i = 2; i < num_points; ++i) {\n\t\twhile (convex_hull.size() >= 2 &&\n\t\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t (sorted_points[i].y - convex_hull[convex_hull.size() - 1].y) -\n\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) *\n\t\t\t\t (sorted_points[i].x - convex_hull[convex_hull.size() - 1].x) < 0) {\n\t\t\tconvex_hull.pop_back();\n\t\t}\n\t\tconvex_hull.push_back(sorted_points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convex_hull.size(); ++i) {\n\t\tperimeter += distance(convex_hull[i], convex_hull[(i + 1) % convex_hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.data(), points.data() + points.extent(0), [](const Point* p1, const Point* p2) { return p1->x < p2->x; });\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector hull;\n\tfor (int i = 0; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tconst Point* p1 = hull[i];\n\t\t\tconst Point* p2 = hull[i + 1];\n\t\t\tconst Point* p3 = points[i];\n\t\t\tdouble crossProduct = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\tsum += crossProduct;\n\t\t}, Kokkos::Sum(0.0)) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tfor (int i = points.extent(0) - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tconst Point* p1 = hull[i];\n\t\t\tconst Point* p2 = hull[i + 1];\n\t\t\tconst Point* p3 = points[i];\n\t\t\tdouble crossProduct = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\tsum += crossProduct;\n\t\t}, Kokkos::Sum(0.0)) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points\n\tstd::vector hull;\n\tconvex_hull(points.data(), points.size(), std::back_inserter(hull));\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n\t\tKOKKOS_LAMBDA(const int& i, const int& j) {\n\t\t\treturn points(i)->x < points(j)->x;\n\t\t});\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points(0));\n\tconvexHull.push_back(points(points.extent(0) - 1));\n\tfor (int i = 1; i < points.extent(0) - 
1; i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2]->y - convexHull[convexHull.size() - 1]->y) *\n\t\t\t(convexHull[convexHull.size() - 1]->x - points(i)->x) <\n\t\t\t(convexHull[convexHull.size() - 1]->y - points(i)->y) *\n\t\t\t(convexHull[convexHull.size() - 2]->x - convexHull[convexHull.size() - 1]->x)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(*convexHull[i], *convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tgrahamScan(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Find the convex hull of the points\n\tstd::vector hull_points;\n\tconvex_hull(points, hull_points);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull_points.size(); ++i) {\n\t\tsize_t j = (i + 1) % hull_points.size();\n\t\tperimeter += distance(hull_points[i], hull_points[j]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points\n\tstd::vector hull;\n\tconvex_hull(points.data(), points.extent(0), std::back_inserter(hull));\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.extent(0));\n\tif (points.extent(0) < 3) {\n\t\tfor (int i = 0; i < points.extent(0); ++i) {\n\t\t\tconvexHull.push_back(points[i]);\n\t\t}\n\t} else {\n\t\t// Find the leftmost point.\n\t\tint leftmost = 0;\n\t\tfor (int i = 1; i < points.extent(0); ++i) {\n\t\t\tif (points[i].x < points[leftmost].x) {\n\t\t\t\tleftmost = i;\n\t\t\t}\n\t\t}\n\n\t\t// Start from the leftmost point and keep moving clockwise until we reach the leftmost point again.\n\t\tint current = leftmost;\n\t\tdo {\n\t\t\tconvexHull.push_back(points[current]);\n\t\t\tint next = (current + 1) % points.extent(0);\n\t\t\tfor (int i = 0; i < points.extent(0); ++i) {\n\t\t\t\tif (orientation(points[current], points[next], points[i]) == 2) {\n\t\t\t\t\tnext = i;\n\t\t\t\t}\n\t\t\t}\n\t\t\tcurrent = next;\n\t\t} while (current != 
leftmost);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int& i, const int& j) {\n\t\treturn points(i)->x < points(j)->x;\n\t});\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\tfor (int i = 0; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tPoint* p1 = hull[i];\n\t\t\tPoint* p2 = hull[i + 1];\n\t\t\tPoint* p3 = points(i);\n\t\t\tdouble cross = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\treturn sum + cross;\n\t\t}, 0.0) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "omp", + "prompt": 
"#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[points.size() - 1]);\n\n\tfor (int i = 1; i < points.size() - 1; i++) {\n\t\tPoint const& p = points[i];\n\t\tif (convexHull.size() < 2 || (convexHull.back().x - convexHull[convexHull.size() - 2].x) * (p.y - convexHull.back().y) - (convexHull.back().y - convexHull[convexHull.size() - 2].y) * (p.x - convexHull.back().x) > 0) {\n\t\t\tconvexHull.push_back(p);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && crossProduct(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper 
and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && (upperHull.back().y - upperHull[upperHull.size() - 2].y) * (point.x - upperHull.back().x) < (upperHull.back().x - upperHull[upperHull.size() - 2].x) * (point.y - upperHull.back().y)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\t\twhile (lowerHull.size() >= 2 && (lowerHull.back().y - lowerHull[lowerHull.size() - 2].y) * (point.x - lowerHull.back().x) > (lowerHull.back().x - lowerHull[lowerHull.size() - 2].x) * (point.y - lowerHull.back().y)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 2; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\tfor (int i = 2; i < n; i++) {\n\t\twhile (hull.size() >= 2 && crossProduct(hull[hull.size() - 2], hull[hull.size() - 1], points[i]) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t(points[i].y - convexHull[convexHull.size() - 1].y) 
-\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t(points[i].x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tstd::vector hull;\n\tstd::vector visited(points.size(), false);\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\tint current = leftmost;\n\tdo {\n\t\thull.push_back(points[current]);\n\t\tvisited[current] = true;\n\n\t\t// Find the next point on the hull\n\t\tint next = -1;\n\t\tfor (int i = 0; i < points.size(); i++) {\n\t\t\tif (!visited[i] && (next == -1 || points[i].x * hull.back().y - points[i].y * hull.back().x > points[next].x * hull.back().y - points[next].y * hull.back().x)) {\n\t\t\t\tnext = i;\n\t\t\t}\n\t\t}\n\t\tcurrent = next;\n\t} while (current != leftmost);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[i + 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) 
{\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (point.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) >\n\t\t\t\t (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - point.x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (point.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) <\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - point.x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n\t\t\t\t (point.x - upperHull[upperHull.size() - 1].x) <=\n\t\t\t\t (upperHull[upperHull.size() - 1].y - point.y) *\n\t\t\t\t\t (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n\t\t\t\t (point.x - lowerHull[lowerHull.size() - 1].x) >=\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - point.y) *\n\t\t\t\t\t (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n < 3) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::vector visited(n, false);\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\tint p = leftmost;\n\tdo {\n\t\thull.push_back(points[p]);\n\t\tvisited[p] = true;\n\t\tint q = (p + 1) % n;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\tif (!visited[i] && (orientation(points[p], points[i], points[q]) == 2)) {\n\t\t\t\tq = i;\n\t\t\t}\n\t\t}\n\t\tp = q;\n\t} while (p != leftmost);\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % 
hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (Point const& p : points) {\n\t\twhile (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], p) <= 0)\n\t\t\tupperHull.pop_back();\n\t\tupperHull.push_back(p);\n\n\t\twhile (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], p) >= 0)\n\t\t\tlowerHull.pop_back();\n\t\tlowerHull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::vector visited(n, false);\n\tint p = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x < points[p].x) {\n\t\t\tp = i;\n\t\t}\n\t}\n\tint q = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x > points[q].x) {\n\t\t\tq = i;\n\t\t}\n\t}\n\thull.push_back(points[p]);\n\thull.push_back(points[q]);\n\tvisited[p] = true;\n\tvisited[q] = true;\n\twhile (p != q) {\n\t\tint next = -1;\n\t\tdouble max_dist = -1;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\tif (!visited[i]) {\n\t\t\t\tdouble dist = distance(points[i], hull.back());\n\t\t\t\tif (dist > max_dist) {\n\t\t\t\t\tmax_dist = dist;\n\t\t\t\t\tnext = i;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\thull.push_back(points[next]);\n\t\tvisited[next] = true;\n\t\tp = q;\n\t\tq = next;\n\t}\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && crossProduct(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinate\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && 
crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tupperHull.push_back(points[0]);\n\tupperHull.push_back(points[1]);\n\tlowerHull.push_back(points[0]);\n\tlowerHull.push_back(points[n - 1]);\n\n\tfor (int i = 2; i < n; i++) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - upperHull[upperHull.size() - 1].x) <=\n\t\t\t\t (upperHull[upperHull.size() - 1].y - points[i].y) *\n\t\t\t\t\t (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - lowerHull[lowerHull.size() - 1].x) >=\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - points[i].y) *\n\t\t\t\t\t (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (int i = 0; i < lowerHull.size() - 1; i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tperimeter += distance(upperHull[upperHull.size() - 1], lowerHull[lowerHull.size() - 1]);\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(points[hull[i]], points[hull[(i + 1) % hull.size()]]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the lower and upper hulls\n\tstd::vector lowerHull, upperHull;\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) == -1) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) == 1) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < lowerHull.size() - 1; ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tfor (size_t i = upperHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(upperHull[i], 
upperHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (auto& point : points) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t (point.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t\t (hull.back().y - hull[hull.size() - 2].y) * (point.x - hull.back().x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(point);\n\t}\n\n\tint t = hull.size() - 1;\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() > t &&\n\t\t\t (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t\t (hull.back().y - hull[hull.size() - 2].y) * (points[i].x - hull.back().x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <= (points[i].x - hull.back().x) * (hull[hull.size() - 2].y - hull.back().y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (p.x - hull[hull.size() - 2].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) 
{\n\t\treturn 0.0;\n\t}\n\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan algorithm\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && orientation(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.empty()) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t(points[i].y - hull.back().y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() < 3) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n\t\treturn a.x < b.x;\n\t});\n\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n\t\treturn a.y < b.y;\n\t});\n\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) 
{\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (p.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (p.y - hull[hull.size() - 2].y) * (hull.back().x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (p.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (p.y - hull[hull.size() - 2].y) * (hull.back().x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (points[i].x - hull.back().x) * (hull[hull.size() - 2].y - hull.back().y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the lower convex hull\n\tfor (Point const& point : points) {\n\t\twhile (hull.size() >= 2 && (point.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) > (point.y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - point.x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(point);\n\t}\n\n\t// Find the upper convex hull\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) > (points[i].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 
2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\n\tfor (Point const& p : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t(p.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) >\n\t\t\t(upperHull[upperHull.size() - 1].y - upperHull[upperHull.size() - 2].y) * (p.x - upperHull[upperHull.size() - 2].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(p);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t(p.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) <\n\t\t\t(lowerHull[lowerHull.size() - 1].y - lowerHull[lowerHull.size() - 2].y) * (p.x - lowerHull[lowerHull.size() - 2].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(p);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; i--) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y) - (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the leftmost and rightmost points\n\tPoint leftmost = points[0];\n\tPoint rightmost = points[points.size() - 1];\n\n\t// Add the leftmost and 
rightmost points to the hull\n\thull.push_back(leftmost);\n\thull.push_back(rightmost);\n\n\t// Find the upper and lower tangents of the convex hull\n\tstd::vector upperTangent;\n\tstd::vector lowerTangent;\n\n\t// Find the upper tangent\n\tfor (int i = 1; i < points.size(); i++) {\n\t\twhile (upperTangent.size() >= 2 && crossProduct(upperTangent[upperTangent.size() - 2], upperTangent[upperTangent.size() - 1], points[i]) <= 0) {\n\t\t\tupperTangent.pop_back();\n\t\t}\n\t\tupperTangent.push_back(points[i]);\n\t}\n\n\t// Find the lower tangent\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (lowerTangent.size() >= 2 && crossProduct(lowerTangent[lowerTangent.size() - 2], lowerTangent[lowerTangent.size() - 1], points[i]) <= 0) {\n\t\t\tlowerTangent.pop_back();\n\t\t}\n\t\tlowerTangent.push_back(points[i]);\n\t}\n\n\t// Merge the upper and lower tangents into the hull\n\tfor (int i = 0; i < lowerTangent.size(); i++) {\n\t\thull.push_back(lowerTangent[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y) - (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(points[i].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += 
distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the lower hull\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(p.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(hull[hull.size() - 1].y - p.y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Find the upper hull\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(p.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(hull[hull.size() - 1].y - p.y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) -\n\t\t\t(points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distance(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Store the result in 
perimeter.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\n__global__ void convexHullPerimeter(const Point *points, size_t numPoints, double *perimeter) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points.\n\tsize_t numVertices;\n\tPoint *vertices;\n\tconvexHull(points, numPoints, vertices, numVertices);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0.0;\n\tfor (size_t i = 0; i < numVertices; i++) {\n\t\t*perimeter += distance(vertices[i], vertices[i + 1]);\n\t}\n}\n", + "\n\n\tPoint p0 = points[0];\n\tdouble minX = p0.x;\n\tfor (size_t i = 1; i < numPoints; i++) {\n\t\tif (points[i].x < minX) {\n\t\t\tminX = points[i].x;\n\t\t\tp0 = points[i];\n\t\t}\n\t}\n\n\tstd::vector hull;\n\thull.push_back(p0);\n\n\tPoint next = p0;\n\tdo {\n\t\tPoint bestPoint = points[0];\n\t\tdouble maxAngle = -1;\n\t\tfor (size_t i = 1; i < numPoints; i++) {\n\t\t\tif (points[i] == next) continue;\n\t\t\tdouble angle = atan2(points[i].y - next.y, points[i].x - next.x);\n\t\t\tif (angle > maxAngle) {\n\t\t\t\tmaxAngle = angle;\n\t\t\t\tbestPoint = points[i];\n\t\t\t}\n\t\t}\n\t\thull.push_back(bestPoint);\n\t\tnext = bestPoint;\n\t} while (next != p0);\n\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tstd::vector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tPoint p0 = points[0];\n\tdouble minX = p0.x, maxX = p0.x;\n\tdouble minY = p0.y, maxY = p0.y;\n\tfor (size_t i = 1; i < numPoints; i++) {\n\t\tPoint p = points[i];\n\t\tminX = min(minX, p.x);\n\t\tmaxX = max(maxX, p.x);\n\t\tminY = min(minY, p.y);\n\t\tmaxY = max(maxY, p.y);\n\t}\n\n\tvector upper, lower;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tPoint p = points[i];\n\t\tif (p.y == minY) {\n\t\t\tlower.push_back(p);\n\t\t}\n\t\tif (p.y == maxY) {\n\t\t\tupper.push_back(p);\n\t\t}\n\t}\n\n\tsort(lower.begin(), lower.end(), 
[](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tsort(upper.begin(), upper.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\tvector ch;\n\tch.push_back(lower[0]);\n\tfor (size_t i = 1; i < lower.size(); i++) {\n\t\twhile (ch.size() >= 2 && (ch.back().y - ch[ch.size() - 2].y) * (lower[i].x - ch.back().x) < (ch.back().x - ch[ch.size() - 2].x) * (lower[i].y - ch.back().y)) {\n\t\t\tch.pop_back();\n\t\t}\n\t\tch.push_back(lower[i]);\n\t}\n\n\tfor (size_t i = upper.size() - 1; i >= 0; i--) {\n\t\twhile (ch.size() >= 2 && (ch.back().y - ch[ch.size() - 2].y) * (upper[i].x - ch.back().x) < (ch.back().x - ch[ch.size() - 2].x) * (upper[i].y - ch.back().y)) {\n\t\t\tch.pop_back();\n\t\t}\n\t\tch.push_back(upper[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < ch.size(); i++) {\n\t\tperimeter += distance(ch[i], ch[(i + 1) % ch.size()]);\n\t}\n\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint numBlocks = hipGridDim_x;\n\tint numThreads = hipBlockDim_x;\n\n\tPoint p1 = points[tid];\n\tdouble minDist = DBL_MAX;\n\tPoint p2;\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tif (i == tid) continue;\n\t\tPoint p3 = points[i];\n\t\tdouble dist = distance(p1, p3);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tp2 = p3;\n\t\t}\n\t}\n\n\t__shared__ double sharedMinDist[numBlocks];\n\t__shared__ Point sharedP2[numBlocks];\n\n\tsharedMinDist[bid] = minDist;\n\tsharedP2[bid] = p2;\n\n\t__syncthreads();\n\n\tdouble globalMinDist = DBL_MAX;\n\tPoint globalP2;\n\tfor (int i = 0; i < numBlocks; i++) {\n\t\tif (sharedMinDist[i] < globalMinDist) {\n\t\t\tglobalMinDist = sharedMinDist[i];\n\t\t\tglobalP2 = sharedP2[i];\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*perimeter = 2 * globalMinDist + distance(p1, globalP2);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < numPoints; ++i) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\t// Initialize the convex hull with the leftmost point\n\tstd::vector hull;\n\thull.push_back(points[leftmost]);\n\n\t// Find the convex hull using Graham's scan\n\tint p = leftmost;\n\tdo {\n\t\tint q = (p + 1) % numPoints;\n\t\tfor (int i = 0; i < numPoints; ++i) {\n\t\t\tif (i == p || i == q) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tif (orientation(points[p], points[q], points[i]) == -1) {\n\t\t\t\tq = i;\n\t\t\t}\n\t\t}\n\t\thull.push_back(points[q]);\n\t\tp = q;\n\t} while (p != leftmost);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (int i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < numPoints; i++) {\n\t\tif (points[i].x < points[leftmost].x) 
{\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\t// Initialize the convex hull with the leftmost point\n\tvector hull = { points[leftmost] };\n\n\t// Find the next point to add to the convex hull\n\twhile (true) {\n\t\t// Find the point that is farthest from the line connecting the last two points in the convex hull\n\t\tint farthest = -1;\n\t\tdouble maxDistance = 0;\n\t\tfor (int i = 0; i < numPoints; i++) {\n\t\t\t// Skip points that are already in the convex hull\n\t\t\tif (find(hull.begin(), hull.end(), points[i]) != hull.end()) {\n\t\t\t\tcontinue;\n\t\t\t}\n\n\t\t\t// Compute the distance from the point to the line connecting the last two points in the convex hull\n\t\t\tPoint p1 = hull[hull.size() - 2];\n\t\t\tPoint p2 = hull[hull.size() - 1];\n\t\t\tdouble distance = fabs((p2.y - p1.y) * points[i].x - (p2.x - p1.x) * points[i].y + p2.x * p1.y - p2.y * p1.x) / sqrt(pow(p2.y - p1.y, 2) + pow(p2.x - p1.x, 2));\n\n\t\t\t// If the distance is greater than the maximum distance so far, update the farthest point and the maximum distance\n\t\t\tif (distance > maxDistance) {\n\t\t\t\tfarthest = i;\n\t\t\t\tmaxDistance = distance;\n\t\t\t}\n\t\t}\n\n\t\t// If no point is farther from the line connecting the last two points in the convex hull, the convex hull is complete\n\t\tif (farthest == -1) {\n\t\t\tbreak;\n\t\t}\n\n\t\t// Add the farthest point to the convex hull\n\t\thull.push_back(points[farthest]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[i + 1]);\n\t}\n\n\t// Store the perimeter in the output variable\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points, points + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && (upperHull[upperHull.size() - 1].y - upperHull[upperHull.size() - 2].y) * (points[i].x - upperHull[upperHull.size() - 2].x) < (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) * (points[i].y - upperHull[upperHull.size() - 2].y)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && (lowerHull[lowerHull.size() - 1].y - lowerHull[lowerHull.size() - 2].y) * (points[i].x - lowerHull[lowerHull.size() - 2].x) > (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) * (points[i].y - lowerHull[lowerHull.size() - 2].y)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\t*perimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\t*perimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = 
perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tvector hull;\n\tfor (size_t i = 0; i < numPoints; ++i) {\n\t Point p = points[i];\n\t while (hull.size() >= 2 && crossProduct(hull[hull.size() - 2], hull.back(), p) <= 0) {\n\t hull.pop_back();\n\t }\n\t hull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinate\n\tstd::sort(points, points + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\t*perimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\t*perimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points\n\tPoint *hull = new Point[numPoints];\n\tint hullSize = convexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hullSize; i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hullSize]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn 
std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[points.size() - 1]);\n\tfor (int i = 1; i < points.size() - 1; i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::vector hullIndices;\n\thullIndices.reserve(points.size());\n\tgrahamScan(points, convexHull, hullIndices);\n\n\t// Compute the perimeter of the convex 
hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint nextIndex = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[nextIndex]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2) {\n\t\t\tPoint p1 = convexHull[convexHull.size() - 2];\n\t\t\tPoint p2 = convexHull[convexHull.size() - 1];\n\t\t\tPoint p3 = points[i];\n\t\t\tdouble cross = (p2.x - p1.x) * (p3.y - p2.y) - (p2.y - p1.y) * (p3.x - p2.x);\n\t\t\tif (cross > 0) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint left = 0, right = points.size() - 1;\n\twhile (left < right) {\n\t\tconvexHull.push_back(points[left++]);\n\t\twhile (left < right && (convexHull.back().y - points[left].y) * (convexHull.back().x - points[right].x) <= (convexHull.back().y - points[right].y) * (convexHull.back().x - points[left].x)) {\n\t\t\tright--;\n\t\t}\n\t\tconvexHull.push_back(points[right--]);\n\t\twhile (left < right && (convexHull.back().y - points[left].y) * (convexHull.back().x - points[right].x) <= (convexHull.back().y - points[right].y) * (convexHull.back().x - points[left].x)) {\n\t\t\tleft++;\n\t\t}\n\t}\n\tconvexHull.push_back(points[left]);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif 
(convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Send the convex hull to rank 0\n\tstd::vector globalConvexHull;\n\tMPI_Gather(&convexHull[0], convexHull.size(), MPI_DOUBLE, &globalConvexHull[0], globalConvexHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Compute the perimeter of the convex hull on rank 0\n\tif (rank == 0) {\n\t\tdouble perimeter = 0;\n\t\tfor (size_t i = 0; i < globalConvexHull.size(); i++) {\n\t\t\tperimeter += distance(globalConvexHull[i], globalConvexHull[(i + 1) % globalConvexHull.size()]);\n\t\t}\n\t\treturn perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint const rank = MPI::COMM_WORLD.Get_rank();\n\tint const size = MPI::COMM_WORLD.Get_size();\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tconvexHull.push_back(points[0]);\n\tfor (size_t i = 1; i < points.size(); ++i) {\n\t\t// Find the first point in the convex hull that is to the left of the line through the last two points\n\t\tsize_t j = convexHull.size() - 1;\n\t\twhile (j > 0 && (convexHull[j].x - convexHull[j-1].x) * (points[i].y - convexHull[j].y) -\n\t\t\t(convexHull[j].y - convexHull[j-1].y) * (points[i].x - convexHull[j].x) < 0) {\n\t\t\t--j;\n\t\t}\n\t\t// Add the point to the convex hull\n\t\tconvexHull.insert(convexHull.begin() + j + 1, points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI::COMM_WORLD.Reduce(&perimeter, &globalPerimeter, 1, MPI::DOUBLE, MPI::SUM, 0);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeters from all ranks to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the global perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the total perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} 
else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (auto const& point : points) {\n\t\tif (convexHull.empty()) {\n\t\t\tconvexHull.push_back(point);\n\t\t} else {\n\t\t\twhile (convexHull.size() >= 2 &&\n\t\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t\t(point.y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t(point.x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\t\tconvexHull.pop_back();\n\t\t\t}\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2 && (points[next] - convexHull.back()).cross(points[i] - convexHull.back()) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || convexHull.back() != point) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t\twhile (convexHull.size() >= 3 &&\n\t\t\t (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 3].x) *\n\t\t\t\t\t (point.y - convexHull[convexHull.size() - 2].y) -\n\t\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 3].y) *\n\t\t\t\t\t (point.x - convexHull[convexHull.size() - 2].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return 
the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Find the total perimeter of the convex hulls on all ranks\n\tdouble totalPerimeter = 0;\n\tfor (int i = 0; i < size; i++) {\n\t\tif (i == rank) {\n\t\t\tfor (int j = 0; j < convexHull.size(); j++) {\n\t\t\t\ttotalPerimeter += distance(convexHull[j], convexHull[(j + 1) % convexHull.size()]);\n\t\t\t}\n\t\t}\n\t\tMPI_Reduce(&totalPerimeter, &totalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\t}\n\n\t// Return the total perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn totalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (auto const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Reduce the convex hulls from all ranks to rank 0\n\tstd::vector globalConvexHull;\n\tMPI_Reduce(&convexHull[0], &globalConvexHull[0], convexHull.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Compute the perimeter of the global convex hull on rank 0\n\tif (rank == 0) {\n\t\tdouble perimeter = 0;\n\t\tfor (size_t i = 0; i < globalConvexHull.size(); i++) {\n\t\t\tperimeter += distance(globalConvexHull[i], globalConvexHull[(i + 1) % globalConvexHull.size()]);\n\t\t}\n\t\treturn perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (auto it = points.begin(); it != points.end(); ++it) {\n\t\twhile (convexHull.size() >= 2 && (it->y - convexHull[convexHull.size() - 2].y) * (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) < (it->x - convexHull[convexHull.size() - 2].x) * (convexHull[convexHull.size() - 1].y - convexHull[convexHull.size() - 2].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(*it);\n\t}\n\tconvexHull.pop_back();\n\tfor (auto it = points.rbegin(); it != points.rend(); ++it) {\n\t\twhile (convexHull.size() >= 2 && (it->y - convexHull[convexHull.size() - 2].y) * (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) < (it->x - convexHull[convexHull.size() - 2].x) * (convexHull[convexHull.size() - 1].y - convexHull[convexHull.size() - 2].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(*it);\n\t}\n\tconvexHull.pop_back();\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble 
globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2 && (points[next] - convexHull.back()).cross(points[i] - convexHull.back()) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& p : points) {\n\t}\n\n\t// Compute the total perimeter\n\tdouble totalPerimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t}\n\n\t// Reduce the total perimeter to rank 0\n\tdouble finalPerimeter;\n\tMPI_Reduce(&totalPerimeter, &finalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the final perimeter on rank 0\n\tif (rank == 0) {\n\t}\n}\n", + "Hull: 1. 
Synag", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || convexHull.back() != point) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t\twhile (convexHull.size() >= 3 &&\n\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 3].x) *\n\t\t\t\t(point.y - convexHull[convexHull.size() - 2].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 3].y) *\n\t\t\t\t(point.x - convexHull[convexHull.size() - 2].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distance(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Store the result in perimeter.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\n__global__ void convexHullPerimeter(const Point *points, size_t numPoints, double *perimeter) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint numBlocks = gridDim.x;\n\tint numThreads = blockDim.x;\n\tint start = tid + bid * numThreads;\n\tdouble localPerimeter = 0;\n\tfor (int i = start; i < numPoints - 1; i++) {\n\t\tlocalPerimeter += distance(points[i], points[i + 1]);\n\t}\n\tlocalPerimeter += distance(points[numPoints - 1], points[0]);\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tdouble globalPerimeter = 0;\n\t\tfor (int i = 0; i < numBlocks; i++) {\n\t\t\tdouble blockPerimeter;\n\t\t\tcudaMemcpy(&blockPerimeter, &localPerimeter, sizeof(double), cudaMemcpyDeviceToHost);\n\t\t\tglobalPerimeter += blockPerimeter;\n\t\t}\n\t\t*perimeter = globalPerimeter;\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\tPoint *convexHull;\n\tsize_t convexHullSize;\n\tconvexHull2(points, numPoints, &convexHull, &convexHullSize);\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < convexHullSize; i++) {\n\t\t*perimeter += distance(convexHull[i], convexHull[(i + 1) % convexHullSize]);\n\t}\n\tfree(convexHull);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tPoint p = points[tid];\n\tdouble minDist = DBL_MAX;\n\tPoint closestPoint;\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tif (i == tid) {\n\t\t\tcontinue;\n\t\t}\n\t\tPoint q = points[i];\n\t\tdouble dist = distance(p, q);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tclosestPoint = q;\n\t\t}\n\t}\n\t*perimeter += minDist;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % 
hull.size()]);\n\t}\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tPoint *sortedPoints = new Point[numPoints];\n\tthrust::copy(points, points + numPoints, sortedPoints);\n\tthrust::sort(sortedPoints, sortedPoints + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\twhile (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], sortedPoints[i]) <= 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(sortedPoints[i]);\n\t\twhile (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], sortedPoints[i]) >= 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(sortedPoints[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= numPoints) return;\n\n\tPoint p = points[i];\n\tdouble minDist = DBL_MAX;\n\tPoint minPoint;\n\tfor (int j = 0; j < numPoints; j++) {\n\t\tif (i == j) continue;\n\t\tPoint q = points[j];\n\t\tdouble dist = distance(p, q);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tminPoint = q;\n\t\t}\n\t}\n\t*perimeter += minDist;\n}\n", + "Hull.2)", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tPoint *sortedPoints = new Point[numPoints];\n\tmemcpy(sortedPoints, points, numPoints * sizeof(Point));\n\tstd::sort(sortedPoints, sortedPoints + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], sortedPoints[i]) <= 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(sortedPoints[i]);\n\t\twhile (lowerHull.size() >= 2 && 
crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], sortedPoints[i]) >= 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(sortedPoints[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size() - 1; i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tperimeter += distance(upperHull[upperHull.size() - 1], lowerHull[lowerHull.size() - 1]);\n\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tPoint p = points[tid];\n\t\tdouble minDist = DBL_MAX;\n\t\tPoint closestPoint;\n\t\tfor (int i = 0; i < numPoints; i++) {\n\t\t\tif (i == tid) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tPoint q = points[i];\n\t\t\tdouble dist = distance(p, q);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t\tclosestPoint = q;\n\t\t\t}\n\t\t}\n\t\tatomicAdd(perimeter, minDist);\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. 
The final result is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (int i = 0; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\tint next = i + 1;\n\t\tif (next == points.size()) {\n\t\t\tnext = 0;\n\t\t}\n\t\tint prev = i - 1;\n\t\tif (prev == -1) {\n\t\t\tprev = points.size() - 1;\n\t\t}\n\t\tdouble area = (points[next].x - points[prev].x) * (points[i].y - points[prev].y) - (points[next].x - points[prev].x) * (points[i].y - points[next].y);\n\t\tif (area >= 0) {\n\t\t\tconvexHull.push_back(points[i]);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = i + 1;\n\t\tif (next == convexHull.size()) {\n\t\t\tnext = 0;\n\t\t}\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < 
p2.x; });\n\tfor (int i = 0; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint n = points.size();\n\tif (n <= 2) {\n\t\tconvexHull = points;\n\t} else {\n\t\tint m = 0;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\twhile (m >= 2 && (points[i].x - convexHull[m-1].x) * (convexHull[m-2].y - convexHull[m-1].y) - (points[i].y - convexHull[m-1].y) * (convexHull[m-2].x - convexHull[m-1].x) < 0) {\n\t\t\t\tm--;\n\t\t\t}\n\t\t\tconvexHull[m++] = points[i];\n\t\t}\n\t\tfor (int i = n-2, t = m+1; i >= 0; i--) {\n\t\t\twhile (m >= t && (points[i].x - convexHull[m-1].x) * (convexHull[m-2].y - convexHull[m-1].y) - (points[i].y - convexHull[m-1].y) * (convexHull[m-2].x - convexHull[m-1].x) < 0) {\n\t\t\t\tm--;\n\t\t\t}\n\t\t\tconvexHull[m++] = points[i];\n\t\t}\n\t\tconvexHull.resize(m);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham scan algorithm\n\tstd::vector convexHull = grahamScan(points);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else 
{\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t\t (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) *\n\t\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t(convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham scan\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y); });\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size()-2].y - hull[hull.size()-1].y) * (p.x - hull[hull.size()-1].x) < (hull[hull.size()-1].x - hull[hull.size()-2].x) * (p.y - hull[hull.size()-1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\tint n = hull.size();\n\tfor (int i = n-2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size()-2].y - hull[hull.size()-1].y) * (p.x - hull[hull.size()-1].x) < (hull[hull.size()-1].x - hull[hull.size()-2].x) * (p.y - hull[hull.size()-1].y)) 
{\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < n; i++) {\n\t\tperimeter += distance(hull[i], hull[(i+1)%n]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the hull in parallel\n\tdouble perimeter = 0.0;\n\t#pragma omp parallel for reduction(+:perimeter)\n\tfor (int i = 0; i < hull.size(); ++i) {\n\t\tint next = (i + 1) % hull.size();\n\t\tperimeter += distance(hull[i], hull[next]);\n\t}\n\n\t// Return the perimeter\n\treturn perimeter;\n}\n\n", + "\n\n\tint const rank = 0;\n\tint const size = MPI::COMM_WORLD.Get_size();\n\tint const points_per_rank = points.size() / size;\n\tint const remainder = points.size() % size;\n\n\tstd::vector local_points = points;\n\tlocal_points.resize(points_per_rank + (rank < remainder));\n\n\tstd::vector convex_hull;\n\tconvexHull(local_points, convex_hull);\n\n\tdouble local_perimeter = 0;\n\tfor (size_t i = 0; i < convex_hull.size(); ++i) {\n\t\tlocal_perimeter += distance(convex_hull[i], convex_hull[(i+1)%convex_hull.size()]);\n\t}\n\n\tdouble global_perimeter;\n\tMPI::COMM_WORLD.Reduce(&local_perimeter, &global_perimeter, 1, MPI_DOUBLE, MPI_SUM, rank);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint n = points.size();\n\tfor (int i = 0; i < n; i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <= (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = n - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <= (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of 
the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint left = 0, right = 0;\n\twhile (left < points.size()) {\n\t\twhile (right < points.size() && points[right].x == points[left].x) {\n\t\t\tright++;\n\t\t}\n\t\tint n = right - left;\n\t\tif (n > 1) {\n\t\t\tstd::sort(points.begin() + left, points.begin() + right, [](Point const& p1, Point const& p2) { return p1.y < p2.y; });\n\t\t\tfor (int i = 0; i < n; i++) {\n\t\t\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[left + i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) * (points[left + i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\t\t\tconvexHull.pop_back();\n\t\t\t\t}\n\t\t\t\tconvexHull.push_back(points[left + i]);\n\t\t\t}\n\t\t}\n\t\tleft = right;\n\t}\n\tconvexHull.push_back(convexHull[0]);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size() - 1; i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[i + 1]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = points.size();\n\tstd::vector hull;\n\n\t// Find the convex hull of the points using Graham's scan.\n\tfor (int i = 0; i < n; i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points[i].x - hull[hull.size() - 1].x) <\n\t\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tint m = hull.size();\n\tfor (int i = n - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= m &&\n\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points[i].x - hull[hull.size() - 1].x) <\n\t\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\thull.pop_back();\n\n\t// Compute the perimeter of the convex hull in parallel.\n\tdouble perimeter = 0;\n\t#pragma omp parallel for reduction(+ : perimeter)\n\tfor (int i = 0; i < m; i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % m]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble global_perimeter;\n\tMPI_Reduce(&perimeter, &global_perimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 
-1;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (Point const& p : points) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size()-2].y - convexHull[convexHull.size()-1].y) * (p.x - convexHull[convexHull.size()-1].x) < (convexHull[convexHull.size()-1].x - convexHull[convexHull.size()-2].x) * (p.y - convexHull[convexHull.size()-1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(p);\n\t}\n\tconvexHull.pop_back();\n\tfor (int i = points.size()-2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size()-2].y - convexHull[convexHull.size()-1].y) * (points[i].x - convexHull[convexHull.size()-1].x) < (convexHull[convexHull.size()-1].x - convexHull[convexHull.size()-2].x) * (points[i].y - convexHull[convexHull.size()-1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tconvexHull.pop_back();\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t\t (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) *\n\t\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 
0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint const rank = MPI::COMM_WORLD.Get_rank();\n\tint const size = MPI::COMM_WORLD.Get_size();\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convex_hull;\n\tconvex_hull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tconvex_hull.push_back(points[0]);\n\tconvex_hull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (convex_hull.size() >= 2 &&\n\t\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t\t (convex_hull[convex_hull.size() - 1].y - points[i].y) -\n\t\t\t\t\t (convex_hull[convex_hull.size() - 1].x - points[i].x) *\n\t\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) <\n\t\t\t\t 0) {\n\t\t\tconvex_hull.pop_back();\n\t\t}\n\t\tconvex_hull.push_back(points[i]);\n\t}\n\tint const convex_hull_size = convex_hull.size();\n\twhile (convex_hull.size() >= 2 &&\n\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t (convex_hull[convex_hull.size() - 1].y - convex_hull[0].y) -\n\t\t\t\t (convex_hull[convex_hull.size() - 1].x - convex_hull[0].x) *\n\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) <\n\t\t\t 0) {\n\t\tconvex_hull.pop_back();\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convex_hull_size; ++i) {\n\t\tperimeter += distance(convex_hull[i], convex_hull[(i + 1) % convex_hull_size]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble global_perimeter;\n\tMPI::COMM_WORLD.Allreduce(&perimeter, &global_perimeter, 1, MPI::DOUBLE, MPI::SUM);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. 
A is an undirected graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &size) {\n if (visited(i) == 0) {\n size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n });\n }\n }, size);\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices.\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices.\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0.\n int largestComponentSize = 0;\n\n // Iterate over all vertices.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a depth-first search to find the size of its component.\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n\n // Update the largest component size if necessary.\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n // Return the largest component size.\n return largestComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int componentSize = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &componentSize) {\n if (visited[i] == 0) {\n componentSize++;\n visited[i] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[i][j] == 1 && visited[j] == 0) {\n visited[j] = 1;\n }\n }\n }\n });\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n return maxComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n depthFirstSearch(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a 
depth-first search to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n\n // Update the largest component size if necessary\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited(i) == 0) {\n component_size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n });\n }\n }, component_size);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, false);\n int largest = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int size = 0;\n std::stack s;\n s.push(i);\n while (!s.empty()) {\n size_t v = s.top();\n s.pop();\n if (!visited[v]) {\n visited[v] = true;\n size++;\n for (size_t j = 0; j < N; j++) {\n if (A(i, j) && !visited[j]) {\n s.push(j);\n }\n }\n }\n }\n largest = std::max(largest, size);\n }\n }\n return largest;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited[i] == 0) {\n component_size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n visited[j] = 1;\n }\n });\n }\n }, component_size);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n using namespace Kokkos;\n\n // Initialize the visited array to false for all vertices\n View visited(\"visited\", N);\n visited() = false;\n\n // Initialize the component size array to 0 for all vertices\n View componentSize(\"componentSize\", N);\n componentSize() = 0;\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component containing it\n if (!visited(i)) {\n int componentSize = 0;\n DFS(A, i, visited, componentSize);\n\n // Update the largest component size if necessary\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n return largestComponentSize;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n int max_component = 0;\n Kokkos::parallel_reduce(\n \"largestComponent\",\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_component) {\n // Perform a depth-first search to count the number of vertices in the\n // connected component containing vertex i.\n std::vector visited(N, false);\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n int v = stack.top();\n stack.pop();\n if (!visited[v]) {\n visited[v] = true;\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A(i, j) && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n },\n max_component);\n return 
max_component;\n}\n", + "\n\n // Initialize the visited array to false\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n\n // Initialize the component size array to 0\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize() = 0;\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited(i)) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n\n // Update the largest component size\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n return largestComponentSize;\n}\n", + "Kokok", + "\n\n // Create a vector to store the size of each component.\n Kokkos::View component_sizes(\"component_sizes\", N);\n\n // Initialize the component sizes to 1.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n component_sizes[i] = 1;\n });\n\n // Iterate over the adjacency matrix.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) {\n // If there is an edge between i and j, then they are in the same component.\n if (A(i, j) == 1) {\n // Find the smaller component size.\n int smaller_component_size = std::min(component_sizes[i], component_sizes[j]);\n\n // Update the component sizes.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int k) {\n if (component_sizes[k] == smaller_component_size) {\n component_sizes[k] = std::max(component_sizes[i], component_sizes[j]);\n }\n });\n }\n }\n });\n\n // Find the largest component size.\n int largest_component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &largest_component_size) {\n largest_component_size = std::max(largest_component_size, component_sizes[i]);\n });\n\n return largest_component_size;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component_size = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int &j, int &local_component_size) {\n if (A(i, j) == 1) {\n local_component_size += 1;\n }\n }, component_size);\n max_component_size = std::max(max_component_size, component_size);\n visited(i) = 1;\n }\n });\n return max_component_size;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t j, int &component_size) {\n if (visited(j) == 0 && A(i, j) == 1) {\n component_size++;\n visited(j) = 1;\n }\n });\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n int max_component = 0;\n Kokkos::parallel_reduce(\n \"Largest Component\",\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_component) {\n int component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n int current = q.front();\n q.pop();\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A(current, j) && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n },\n max_component);\n\n return max_component;\n}\n", + 
"\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited(i) == 0) {\n component_size += 1;\n visited(i) = 1;\n for (size_t j = 0; j < N; ++j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n }\n }\n });\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::TeamPolicy;\n using Kokkos::Work;\n\n // Initialize the visited array\n Kokkos::View visited(\"visited\", N, N);\n parallel_for(TeamPolicy(Kokkos::DefaultExecutionSpace(), N), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) { visited(i, j) = false; }\n });\n\n // Perform DFS on each unvisited node\n int max_component = 0;\n parallel_for(TeamPolicy(Kokkos::DefaultExecutionSpace(), N), KOKKOS_LAMBDA(const int i) {\n if (!visited(i, i)) {\n int component_size = 0;\n dfs(i, i, visited, A, component_size);\n max_component = std::max(max_component, component_size);\n }\n });\n\n return max_component;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::View componentSize(\"componentSize\", N);\n Kokkos::View componentId(\"componentId\", N);\n Kokkos::View queue(\"queue\", N);\n\n int maxComponentSize = 0;\n int componentCount = 0;\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (visited(i) == 0) {\n int start = 0;\n int end = 0;\n queue(end++) = i;\n visited(i) = 1;\n componentId(i) = componentCount;\n while (start != end) {\n int current = queue(start++);\n componentSize(componentId(current))++;\n for (int j = 0; j < N; j++) {\n if (A(current, j) == 1 && visited(j) == 0) {\n queue(end++) = j;\n visited(j) = 1;\n componentId(j) = componentCount;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize(componentCount));\n componentCount++;\n }\n });\n\n return maxComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n int max_component_size = 0;\n\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(Kokkos::View &A, size_t i, Kokkos::View &visited, int &component_size) {\n visited[i] = true;\n component_size++;\n\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(i, j) && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n\n // If the vertex has not been visited, perform a depth-first search to find the size of its component\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit in the stack\n while (!stack.empty()) {\n // Pop the next vertex from the stack\n size_t vertex = stack.top();\n stack.pop();\n\n // If the vertex has not been visited, mark it as visited and add it to the component\n if (!visited[vertex]) {\n visited[vertex] = true;\n componentSize++;\n\n // Push all of the vertex's neighbors onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the largest component size if the current component is larger\n if (componentSize > largestComponentSize) {\n largestComponentSize = componentSize;\n }\n\n // Update the component size array with the size of the current component\n componentSize[i] = componentSize;\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n int max_component_size = 0;\n int component_size = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n #pragma omp parallel for reduction(max:max_component_size)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n delete[] visited;\n return max_component_size;\n}\n", + "\n\n // Create a vector to store the visited nodes.\n std::vector visited(N, false);\n\n // Create a vector to store the component sizes.\n std::vector componentSizes(N, 0);\n\n // Create a parallel region.\n #pragma omp parallel\n {\n // Loop over the nodes.\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, perform a depth-first search to find all the nodes in its component.\n if (! visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n while (! stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (! 
visited[node]) {\n componentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[node * N + j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n componentSizes[i] = componentSize;\n }\n }\n }\n\n // Find the largest component size.\n int largestComponentSize = *std::max_element(componentSizes.begin(), componentSizes.end());\n\n // Return the largest component size.\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, 0);\n int largest_component_size = 0;\n #pragma omp parallel\n {\n int local_largest_component_size = 0;\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n ++component_size;\n visited[vertex] = 1;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n local_largest_component_size = std::max(local_largest_component_size, component_size);\n }\n }\n #pragma omp critical\n largest_component_size = std::max(largest_component_size, local_largest_component_size);\n }\n return largest_component_size;\n}\n", + "\n\n int* visited = new int[N];\n int* componentSize = new int[N];\n int maxComponentSize = 0;\n int componentCount = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = 0;\n dfs(A, i, visited, componentSize, size);\n componentCount++;\n maxComponentSize = std::max(maxComponentSize, size);\n }\n }\n delete[] visited;\n delete[] componentSize;\n return maxComponentSize;\n}\n", + "\n\n int max_size = 0;\n int max_component = -1;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = dfs(A, i, visited);\n if (size > max_size) {\n max_size = size;\n max_component = i;\n }\n }\n }\n delete[] visited;\n return max_size;\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n", + "\n\n int max_component = 0;\n int visited[N];\n memset(visited, 0, sizeof(visited));\n\n #pragma omp parallel for reduction(max:max_component)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n 
component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n\n return max_component;\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n", + "\n\n int N = A.size();\n std::vector component(N, -1);\n int maxComponent = 0;\n int currentComponent = 0;\n #pragma omp parallel for reduction(max:maxComponent) reduction(+:currentComponent)\n for (int i = 0; i < N; i++) {\n if (component[i] == -1) {\n currentComponent = 0;\n dfs(A, i, component, currentComponent);\n maxComponent = std::max(maxComponent, currentComponent);\n }\n }\n return maxComponent;\n}\n\n", + "\n\n int max_component_size = 0;\n int component_size = 0;\n int visited[N];\n memset(visited, 0, sizeof(visited));\n\n #pragma omp parallel for reduction(max: max_component_size)\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0) {\n component_size = 0;\n dfs(i, A, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(int i, std::vector const& A, int visited[], int& component_size) {\n visited[i] = 1;\n component_size++;\n for (int j = 0; j < A.size(); j++) {\n if (A[i][j] == 1 && visited[j] == 0) {\n dfs(j, A, visited, component_size);\n }\n }\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n int max_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n visited[i] = true;\n component_size++;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a 
depth-first search to find the size of the component containing the vertex\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // While the stack is not empty, pop a vertex from the stack and visit all its unvisited neighbors\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n componentSize++;\n\n // Iterate over all neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is unvisited and there is an edge between the vertex and the neighbor, push the neighbor onto the stack and mark it as visited\n if (!visited[j] && A[vertex][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Update the largest component size if the current component size is larger\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Create a vector to store the size of each component.\n std::vector componentSize(N, 0);\n\n // Create a vector to store the component ID for each vertex.\n std::vector componentID(N, -1);\n\n // Initialize the component ID for each vertex.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n componentID[i] = i;\n }\n\n // Iterate over the adjacency matrix.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n // If there is an edge between vertex i and vertex j, then they are in the same component.\n if (A[i][j] == 1) {\n // Find the component ID for vertex i.\n int iComponentID = componentID[i];\n\n // Find the component ID for vertex j.\n int jComponentID = componentID[j];\n\n // If the component IDs are different, then merge the two components.\n if (iComponentID != jComponentID) {\n // Find the smaller component ID.\n int smallerComponentID = std::min(iComponentID, jComponentID);\n\n // Find the larger component ID.\n int largerComponentID = std::max(iComponentID, jComponentID);\n\n // Update the component ID for each vertex in the smaller component.\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n if (componentID[k] == smallerComponentID) {\n componentID[k] = largerComponentID;\n }\n }\n\n // Update the size of the larger component.\n componentSize[largerComponentID] += componentSize[smallerComponentID];\n }\n }\n }\n }\n\n // Find the largest component.\n int largestComponentSize = 0;\n for (size_t i = 0; i < N; i++) {\n largestComponentSize = std::max(largestComponentSize, componentSize[i]);\n }\n\n // Return the size of the largest component.\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component containing the vertex\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // While there are vertices in the stack, pop a vertex and visit all its unvisited neighbors\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n componentSize++;\n\n // Iterate over all neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is unvisited and 
there is an edge between the vertex and the neighbor, push the neighbor onto the stack and mark it as visited\n if (!visited[j] && A[vertex][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Update the largest component size if the current component size is larger\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n int nComponents = 0;\n std::vector visited(N, 0);\n std::vector componentSize(N, 0);\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int size = dfs(A, i, visited, componentSize);\n nComponents++;\n if (size > componentSize[0]) {\n componentSize[0] = size;\n }\n }\n }\n return componentSize[0];\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "serial", + "prompt": "/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector parent(N);\n std::vector size(N);\n for (size_t i = 0; i < N; ++i) {\n parent[i] = i;\n size[i] = 1;\n }\n\n auto find = [&](int x) {\n if (parent[x] != x) {\n parent[x] = find(parent[x]);\n }\n return parent[x];\n };\n\n auto union_ = [&](int x, int y) {\n int rootX = find(x);\n int rootY = find(y);\n if (rootX != rootY) {\n if (size[rootX] < size[rootY]) {\n parent[rootX] = rootY;\n size[rootY] += size[rootX];\n } else {\n parent[rootY] = rootX;\n size[rootX] += size[rootY];\n }\n }\n };\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n if (A[i][j] == 1) {\n union_(i, j);\n }\n }\n }\n\n int maxSize = 0;\n for (size_t i = 0; i < N; ++i) {\n maxSize = std::max(maxSize, size[find(i)]);\n }\n\n return maxSize;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n std::vector parent(N);\n std::vector size(N);\n for (size_t i = 0; i < N; i++) {\n parent[i] = i;\n size[i] = 1;\n }\n\n auto find = [&](int x) {\n if (parent[x] != x) {\n parent[x] = find(parent[x]);\n }\n return parent[x];\n };\n\n auto union_ = [&](int a, int b) {\n int 
rootA = find(a);\n int rootB = find(b);\n if (rootA != rootB) {\n if (size[rootA] > size[rootB]) {\n parent[rootB] = rootA;\n size[rootA] += size[rootB];\n } else {\n parent[rootA] = rootB;\n size[rootB] += size[rootA];\n }\n }\n };\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i][j] == 1) {\n union_(i, j);\n }\n }\n }\n\n int maxSize = 0;\n for (size_t i = 0; i < N; i++) {\n maxSize = std::max(maxSize, size[find(i)]);\n }\n\n return maxSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int max_component_size = 0;\n\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n\n visited[i] = true;\n component_size++;\n\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; i++) {\n int size = 0;\n std::queue q;\n if (visited[i] == 0) {\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 0) {\n visited[v] = 1;\n size++;\n for (size_t j = 0; j < N; j++) {\n if (A[v][j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int max_component_size = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component containing it\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\n// Perform a DFS to find the size of the component containing vertex i\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n // Mark the vertex as visited\n visited[i] = true;\n component_size++;\n\n // Iterate over all adjacent vertices\n for (size_t j = 0; j < A.size(); j++) {\n // If there is an edge between vertex i and vertex j and vertex j has not been visited, perform a DFS on vertex j\n if (A[i][j] == 1 && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = dfs(i, A, visited);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n\nint dfs(size_t i, std::vector const& A, std::vector& visited) {\n if (visited[i] != 0) {\n return 0;\n }\n visited[i] = 1;\n int component_size = 1;\n for (size_t j = 0; j < A.size(); 
++j) {\n if (A[i][j] == 1) {\n component_size += dfs(j, A, visited);\n }\n }\n return component_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int largest = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int size = dfs(A, i, visited);\n largest = std::max(largest, size);\n }\n }\n\n return largest;\n}\n", + "\n\n // Create a vector to store the size of each component.\n std::vector componentSize(N, 0);\n\n // Create a vector to store the component ID for each vertex.\n std::vector componentID(N, -1);\n\n // Initialize the component ID for each vertex.\n for (size_t i = 0; i < N; ++i) {\n componentID[i] = i;\n }\n\n // Iterate over each vertex.\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, perform a DFS to find all vertices in the same component.\n if (componentID[i] == i) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit in the stack, perform a DFS.\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n\n // If the vertex has not been visited, mark it as visited and add it to the stack.\n if (componentID[vertex] == i) {\n componentSize++;\n componentID[vertex] = i;\n\n // Iterate over the neighbors of the vertex.\n for (size_t j = 0; j < N; ++j) {\n // If the neighbor is connected to the vertex and has not been visited, add it to the stack.\n if (A[i][j] == 1 && componentID[j] == i) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the size of the component.\n componentSize[i] = componentSize;\n }\n }\n\n // Return the size of the largest component.\n return *std::max_element(componentSize.begin(), componentSize.end());\n}\n", + "\n\n std::vector visited(N, 0);\n int largest = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = 0;\n dfs(A, visited, i, size);\n largest = std::max(largest, size);\n }\n }\n return largest;\n}\n\nvoid dfs(std::vector const& A, std::vector& visited, size_t i, int& size) {\n if (visited[i] == 1) {\n return;\n }\n visited[i] = 1;\n size++;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] == 1) {\n dfs(A, visited, j, size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n component_size++;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n component_size++;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n dfs(A, i, visited, size);\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited,\n int& size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i * A.size() + j] == 1) {\n dfs(A, j, visited, size);\n }\n }\n}\n", + "\n\n // Perform DFS on each vertex to find all connected vertices\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if 
(!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++component_size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) {\n continue;\n }\n visited[u] = 1;\n size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (int i = 0; i < N; i++) {\n int size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 1) continue;\n visited[v] = 1;\n size++;\n for (int j = 0; j < N; j++) {\n if (A[v][j] == 1 && visited[j] == 0) q.push(j);\n }\n }\n max_size = std::max(max_size, size);\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n if (visited[current] == 0) {\n visited[current] = 1;\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n ++size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++component_size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "hip", + "prompt": "/* Compute the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n Store the result in 
largestComponentSize.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void largestComponent(const int *A, size_t N, int *largestComponentSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n int count = 0;\n if (tid == 0) {\n count = 1;\n for (int i = 1; i < N; i++) {\n if (A[bid * N + i] == 1) {\n count++;\n }\n }\n }\n __syncthreads();\n if (tid == 0) {\n *largestComponentSize = count;\n }\n}\n", + "\n\n\tint tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint local_max = 0;\n\tint global_max = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[tid_y] == 0) {\n\t\tint count = 0;\n\t\tvisited[tid_y] = 1;\n\t\tcount++;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] != 0 && visited[i] == 0) {\n\t\t\t\tcount += dfs(A, N, i, visited);\n\t\t\t}\n\t\t}\n\t\tlocal_max = count;\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (local_max > global_max) {\n\t\t\t\tglobal_max = local_max;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = global_max;\n\t}\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tidy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tif (tidx >= N || tidy >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[tidx] == 0) {\n\t\tcount++;\n\t\tvisited[tidx] = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tidx * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcount += bfs(A, N, i, visited);\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tidx == 0 && tidy == 0) {\n\t\t*largestComponentSize = count;\n\t}\n}\n", + "\n\n\tint tidx = blockIdx.x + blockIdx.y * blockDim.x;\n\tint tidx_x = tidx % N;\n\tint tidx_y = tidx / N;\n\tint componentSize = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tidx * N + i] == 1) {\n\t\t\tcomponentSize++;\n\t\t}\n\t}\n\tatomicAdd(largestComponentSize, componentSize);\n}\n", + "\n\n int localComponentSize = 0;\n int globalComponentSize = 0;\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int start = tid;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (int i = start; i < N; i += stride) {\n if (A[i * N + i] == 0) {\n continue;\n }\n localComponentSize++;\n for (int j = i + 1; j < N; j++) {\n if (A[i * N + j] == 1) {\n localComponentSize++;\n }\n }\n }\n hipAtomicMax(&globalComponentSize, &localComponentSize);\n *largestComponentSize = globalComponentSize;\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tidy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tint size = 0;\n\tif (tidx < N && tidy < N) {\n\t\tif (A[tidx * N + tidy] == 1) {\n\t\t\tsize = 1;\n\t\t\tint stack[N];\n\t\t\tint top = 0;\n\t\t\tstack[top++] = tidx;\n\t\t\twhile (top > 0) {\n\t\t\t\tint v = stack[--top];\n\t\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\t\tif (A[v * N + i] == 1 && i != v) {\n\t\t\t\t\t\tif (i == tidy) {\n\t\t\t\t\t\t\tsize++;\n\t\t\t\t\t\t} else {\n\t\t\t\t\t\t\tstack[top++] = 
i;\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tint max = 0;\n\thipReduceBlockMax(&max, &size, 1);\n\tif (tidx == 0 && tidy == 0) {\n\t\t*largestComponentSize = max;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint blockId = hipBlockIdx_x;\n\tint numBlocks = hipGridDim_x;\n\tint numThreads = hipBlockDim_x * hipGridDim_x;\n\tint visited[N];\n\tint componentSize[N];\n\tint maxComponentSize = 0;\n\tint maxComponentId = -1;\n\tint componentId = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t\tcomponentSize[i] = 0;\n\t}\n\twhile (componentId < N) {\n\t\tint startVertex = -1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (!visited[i]) {\n\t\t\t\tstartVertex = i;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tif (startVertex == -1) {\n\t\t\tbreak;\n\t\t}\n\t\tint queue[N];\n\t\tint head = 0;\n\t\tint tail = 0;\n\t\tqueue[tail++] = startVertex;\n\t\tvisited[startVertex] = 1;\n\t\tcomponentSize[componentId] = 1;\n\t\twhile (head != tail) {\n\t\t\tint vertex = queue[head++];\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tif (A[vertex * N + i] && !visited[i]) {\n\t\t\t\t\tqueue[tail++] = i;\n\t\t\t\t\tvisited[i] = 1;\n\t\t\t\t\tcomponentSize[componentId]++;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tif (componentSize[componentId] > maxComponentSize) {\n\t\t\tmaxComponentSize = componentSize[componentId];\n\t\t\tmaxComponentId = componentId;\n\t\t}\n\t\tcomponentId++;\n\t}\n\t*largestComponentSize = maxComponentSize;\n}\n", + "\n\n int local_max = 0;\n int global_max = 0;\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (A[i * N + j] == 1) {\n local_max = 1;\n for (int k = 0; k < N; k++) {\n if (A[i * N + k] == 1) {\n local_max++;\n }\n if (A[k * N + j] == 1) {\n local_max++;\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx.x == 0 && threadIdx.y == 0) {\n for (int k = 0; k < N * N; k++) {\n if (local_max > global_max) {\n global_max = local_max;\n }\n local_max = 0;\n }\n *largestComponentSize = global_max;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint numThreads = hipGridDim_x * hipBlockDim_x;\n\tint visited[N];\n\tint componentSize = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[tid] == 0) {\n\t\tvisited[tid] = 1;\n\t\tcomponentSize = 1;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tif (A[tid * N + j] == 1 && visited[j] == 0) {\n\t\t\t\tcomponentSize += largestComponent(A, N, &visited[j]);\n\t\t\t}\n\t\t}\n\t}\n\tint maxComponentSize = 0;\n\thipMemcpy(&maxComponentSize, largestComponentSize, sizeof(int), hipMemcpyDeviceToHost);\n\tif (componentSize > maxComponentSize) {\n\t\t*largestComponentSize = componentSize;\n\t}\n\treturn;\n}\n", + "\n\n int *visited = (int *)malloc(N * N * sizeof(int));\n int *componentSize = (int *)malloc(N * N * sizeof(int));\n int maxComponentSize = 0;\n int componentId = 0;\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n visited[i * N + j] = 0;\n componentSize[i * N + j] = 0;\n }\n }\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (visited[i * N + j] == 0 && A[i * N + j] == 1) {\n componentId++;\n int currentComponentSize = 0;\n __syncthreads();\n dfs(A, i, j, N, visited, componentSize, &currentComponentSize, componentId);\n __syncthreads();\n maxComponentSize = max(maxComponentSize, currentComponentSize);\n }\n }\n }\n\n *largestComponentSize = maxComponentSize;\n free(visited);\n free(componentSize);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + 
hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = max(*largestComponentSize, count);\n\t}\n}\n", + "\n\n int myRow = blockIdx.x * blockDim.x + threadIdx.x;\n int myCol = blockIdx.y * blockDim.y + threadIdx.y;\n int myComponentSize = 0;\n if (myRow < N && myCol < N) {\n if (A[myRow * N + myCol] == 1) {\n myComponentSize = 1;\n for (int i = 0; i < N; i++) {\n if (A[myRow * N + i] == 1) {\n myComponentSize++;\n }\n if (A[i * N + myCol] == 1) {\n myComponentSize++;\n }\n }\n }\n }\n atomicMax(largestComponentSize, myComponentSize);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == 0) {\n *largestComponentSize = 0;\n }\n __shared__ int shared_component_size;\n shared_component_size = 0;\n bool visited = false;\n int component_size = 0;\n while (!visited) {\n visited = true;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (A[bid * N + i] == 1) {\n visited = false;\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n A[i * N + j] = 0;\n A[j * N + i] = 0;\n }\n }\n }\n }\n __syncthreads();\n }\n atomicAdd(largestComponentSize, component_size);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i, j, k, count = 0;\n\tif (tid < N) {\n\t\tbool visited[N] = {false};\n\t\tvisited[tid] = true;\n\t\tfor (i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1 && !visited[i]) {\n\t\t\t\tcount++;\n\t\t\t\tfor (j = 0; j < N; j++) {\n\t\t\t\t\tif (A[i * N + j] == 1) {\n\t\t\t\t\t\tfor (k = 0; k < N; k++) {\n\t\t\t\t\t\t\tif (A[j * N + k] == 1 && !visited[k]) {\n\t\t\t\t\t\t\t\tcount++;\n\t\t\t\t\t\t\t\tvisited[k] = true;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == 0) {\n *largestComponentSize = 0;\n }\n __syncthreads();\n int localComponentSize = 0;\n for (int i = 0; i < N; i++) {\n if (A[tid + i * N] == 1) {\n localComponentSize++;\n }\n }\n __syncthreads();\n if (tid == 0) {\n if (localComponentSize > *largestComponentSize) {\n *largestComponentSize = localComponentSize;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int size = 0;\n if (tid < N && tid_y < N) {\n if (A[tid * N + tid_y] == 1) {\n size = 1;\n int *visited = new int[N];\n for (int i = 0; i < N; i++) {\n visited[i] = 0;\n }\n visited[tid] = 1;\n int *stack = new int[N];\n stack[0] = tid;\n while (stack[0] != -1) {\n int v = stack[0];\n stack[0] = -1;\n for (int i = 0; i < N; i++) {\n if (A[v * N + i] == 1 && visited[i] == 0) {\n visited[i] = 1;\n size++;\n stack[0] = i;\n }\n }\n }\n delete[] visited;\n delete[] stack;\n }\n }\n atomicAdd(largestComponentSize, size);\n}\n", + "\n\n\tint myComponentSize = 0;\n\tint 
myId = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (A[N * myId + myId] == 1) {\n\t\tmyComponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[N * myId + i] == 1 && i != myId) {\n\t\t\t\tmyComponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(&*largestComponentSize, myComponentSize);\n}\n", + "(", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_x;\n\tint componentSize = 0;\n\tif (tid == 0) {\n\t\tcomponentSize = 1;\n\t\tlargestComponentSize[bid] = 0;\n\t}\n\t__syncthreads();\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tif (A[bid * N + i] == 1) {\n\t\t\tatomicAdd(&componentSize, 1);\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tif (componentSize > largestComponentSize[bid]) {\n\t\t\tlargestComponentSize[bid] = componentSize;\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Perform a breadth-first search (BFS) from each vertex to find the connected components.\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the max_component_size across all ranks to find the global maximum.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "_t.", + "\n\n // Perform a breadth-first search from each vertex to find the size of each component.\n std::vector visited(N, false);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++componentSize;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Allreduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank 
computes the size of its own component\n std::vector visited(N, 0);\n std::queue q;\n int componentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n q.push(i);\n visited[i] = 1;\n componentSize = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize++;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest one\n int globalComponentSize;\n MPI_Reduce(&componentSize, &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(my_rank);\n visited[my_rank] = true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n local_component_size++;\n for (size_t v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return global_component_size;\n }\n return -1;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector component_size(num_ranks, 0);\n\n // Perform a DFS from each rank\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int local_component_size = 0;\n std::stack s;\n s.push(i);\n visited[i] = true;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n local_component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n visited[j] = true;\n }\n }\n }\n component_size[my_rank] = std::max(component_size[my_rank], local_component_size);\n }\n }\n\n // Reduce the component size array to find the largest component\n int global_component_size = 0;\n MPI_Reduce(&component_size[0], &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (my_rank == 0) {\n return 
global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_max = 0;\n int global_max = 0;\n\n // Perform a DFS on the graph to find the size of the largest component\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n local_max = std::max(local_max, component_size);\n }\n }\n\n // Reduce the local maximum values to find the global maximum\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = true;\n my_component_size = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n ++my_component_size;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&my_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int max_component = 0;\n int my_component = 0;\n int* visited = new int[N];\n for (size_t i = 0; i < N; i++) {\n visited[i] = 0;\n }\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n my_component = 0;\n dfs(i, A, N, visited, my_component);\n max_component = std::max(max_component, my_component);\n }\n }\n delete[] visited;\n int global_max;\n MPI_Allreduce(&max_component, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n // Perform a breadth-first search from each vertex to find the connected components.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 0) {\n visited[v] = 1;\n componentSize++;\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Allreduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(rank);\n visited[rank] = true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n local_component_size++;\n for (int v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n\n // Reduce the local component sizes 
to find the largest component\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of vertices in its component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n local_component_size = std::max(local_component_size, dfs(A, i, visited));\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n\n// Depth-first search to compute the number of vertices in a component\nint dfs(std::vector const& A, size_t i, std::vector& visited) {\n if (visited[i]) {\n return 0;\n }\n\n visited[i] = true;\n int component_size = 1;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n component_size += dfs(A, j, visited);\n }\n }\n\n return component_size;\n}\n", + "_t.", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n int local_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_component_size = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n ++local_component_size;\n }\n }\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n int local_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_component_size = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n local_component_size++;\n }\n }\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its component\n std::vector visited(N, 0);\n std::queue q;\n int componentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n q.push(i);\n visited[i] = 1;\n componentSize = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize++;\n 
}\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest one\n int globalComponentSize;\n MPI_Reduce(&componentSize, &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, false);\n int largest_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t v = stack.top();\n stack.pop();\n if (!visited[v]) {\n ++component_size;\n visited[v] = true;\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n largest_component_size = std::max(largest_component_size, component_size);\n }\n }\n\n // Reduce the largest component size across all ranks.\n int global_largest_component_size;\n MPI_Allreduce(&largest_component_size, &global_largest_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the largest component size\n int largestComponentSize = 0;\n\n // Loop over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "cuda", + "prompt": "/* Compute the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n Store the result in largestComponentSize.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void largestComponent(const int *A, size_t N, int *largestComponentSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myComponentSize = 0;\n int myId = blockIdx.x * blockDim.x + threadIdx.x;\n if (A[myId * N + myId] == 0) {\n *largestComponentSize = 0;\n return;\n }\n int *visited = new int[N];\n for (int i = 0; i < N; i++) { visited[i] = 0; }\n visited[myId] = 1;\n myComponentSize++;\n for (int i = 0; i < N; i++) {\n if (A[myId * N + i] == 1 && visited[i] == 0) {\n myComponentSize += dfs(A, N, visited, i);\n }\n }\n *largestComponentSize = max(*largestComponentSize, myComponentSize);\n}\n", + "\n\n int localComponentSize = 0;\n int globalComponentSize = 0;\n int threadIdx = threadIdx.x + blockIdx.x * blockDim.x;\n int threadIdy = threadIdx.y + blockIdx.y * blockDim.y;\n if (threadIdx < N && threadIdy < N) {\n if (A[threadIdx * N + threadIdy] == 1) {\n localComponentSize++;\n for (int i = 0; i < N; i++) {\n if (A[threadIdx * N + i] == 1) {\n localComponentSize++;\n }\n if (A[i * N + threadIdy] == 1) {\n localComponentSize++;\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx == 0 && threadIdy == 0) {\n globalComponentSize = localComponentSize;\n }\n for (int i = 1; i < N; i++) {\n if (localComponentSize > globalComponentSize) {\n globalComponentSize = localComponentSize;\n }\n localComponentSize = 0;\n if (threadIdx < N && threadIdy + i < N) {\n if (A[threadIdx * N + threadIdy + i] == 1) {\n localComponentSize++;\n for (int j = 0; j < N; j++) {\n if (A[threadIdx * N + j] == 1) {\n localComponentSize++;\n }\n if (A[(threadIdy + i) * N + j] == 1) {\n localComponentSize++;\n }\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx == 0 && threadIdy == 0) {\n if (localComponentSize > globalComponentSize) {\n globalComponentSize = localComponentSize;\n }\n *largestComponentSize = globalComponentSize;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int tid_row = tid / N;\n int tid_col = tid % N;\n int local_max = 0;\n int global_max = 0;\n\n // Count the number of adjacent vertices\n for (int i = 0; i < N; i++) {\n if (A[tid_row * N + i] == 1) {\n local_max++;\n }\n }\n\n // Find the maximum number of adjacent vertices across all threads\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (local_max > global_max) {\n global_max = local_max;\n }\n }\n\n *largestComponentSize = global_max;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tint count = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n // Each thread computes the size of its component\n int myComponentSize = 0;\n for (int i = 0; i < N; i++) {\n if (A[i * N + blockIdx.x] == 1) {\n myComponentSize++;\n }\n }\n\n // Reduce the component sizes across all threads\n __shared__ int sharedComponentSize[32];\n sharedComponentSize[threadIdx.x] = myComponentSize;\n __syncthreads();\n for (int i = 1; i < 32; i *= 2) {\n if (threadIdx.x % (2 * i) == 0) {\n sharedComponentSize[threadIdx.x] += sharedComponentSize[threadIdx.x + i];\n }\n __syncthreads();\n }\n\n // Store the largest component size in the output variable\n if (threadIdx.x == 0) {\n *largestComponentSize = sharedComponentSize[0];\n 
}\n}\n", + "\n\n\t// Initialize the largest component size to 0\n\t*largestComponentSize = 0;\n\n\t// Iterate over each vertex\n\tfor (int i = 0; i < N; i++) {\n\t\t// If the vertex is not visited, perform a depth-first search to find the size of the component\n\t\tif (A[i * N + i] == 0) {\n\t\t\tint componentSize = 0;\n\t\t\tdepthFirstSearch(A, N, i, &componentSize);\n\n\t\t\t// Update the largest component size if necessary\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\n\tint count = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcount = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*largestComponentSize = max(*largestComponentSize, count);\n\t}\n}\n", + "\n\n // Each thread checks if it is in the largest component.\n int myComponentSize = 0;\n for (int i = blockIdx.x; i < N; i += blockDim.x) {\n for (int j = blockIdx.y; j < N; j += blockDim.y) {\n if (A[i * N + j] == 1) {\n myComponentSize++;\n }\n }\n }\n\n // Each thread writes its component size to shared memory.\n __shared__ int sharedComponentSize[blockDim.x * blockDim.y];\n sharedComponentSize[threadIdx.x * blockDim.y + threadIdx.y] = myComponentSize;\n\n // Each thread block finds the maximum component size in shared memory.\n __syncthreads();\n int maxComponentSize = 0;\n for (int i = 0; i < blockDim.x * blockDim.y; i++) {\n maxComponentSize = max(maxComponentSize, sharedComponentSize[i]);\n }\n\n // The thread with the maximum component size writes it to global memory.\n if (threadIdx.x == 0 && threadIdx.y == 0) {\n *largestComponentSize = maxComponentSize;\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 0) {\n\t\treturn;\n\t}\n\tint componentSize = 0;\n\tint visited[N];\n\tfor (int k = 0; k < N; k++) {\n\t\tvisited[k] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[i] == 0) {\n\t\tcomponentSize = dfs(A, N, i, visited);\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tif (componentSize > *largestComponentSize) {\n\t\t\t*largestComponentSize = componentSize;\n\t\t}\n\t}\n}\n", + "\n\n int myComponentSize = 0;\n int myComponentId = threadIdx.x + blockIdx.x * blockDim.x;\n if (myComponentId < N) {\n bool visited[N];\n for (int i = 0; i < N; i++) { visited[i] = false; }\n visited[myComponentId] = true;\n for (int i = 0; i < N; i++) {\n if (A[myComponentId * N + i] && !visited[i]) {\n myComponentSize += 1;\n dfs(A, N, visited, i);\n }\n }\n }\n __syncthreads();\n int maxComponentSize = 0;\n for (int i = 0; i < N; i++) {\n maxComponentSize = max(maxComponentSize, myComponentSize);\n }\n *largestComponentSize = maxComponentSize;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\n\tint row = tid / N;\n\tint col = tid % N;\n\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\n\tint componentSize = 1;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\n\tvisited[row] = 1;\n\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] == 1 && visited[i] == 0) {\n\t\t\tcomponentSize += dfs(A, N, i, visited);\n\t\t}\n\t}\n\n\tif (componentSize > *largestComponentSize) {\n\t\t*largestComponentSize 
= componentSize;\n\t}\n}\n", + "\n\n\tint myRow = blockIdx.x * blockDim.x + threadIdx.x;\n\tint myCol = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (myRow < N && myCol < N) {\n\t\tif (A[myRow * N + myCol] == 1) {\n\t\t\tint componentSize = 0;\n\t\t\tint visited[N];\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tvisited[i] = 0;\n\t\t\t}\n\t\t\tdfs(myRow, myCol, A, N, visited, &componentSize);\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint componentSize = 0;\n\tint maxComponentSize = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tfor (int i = 0; i < N; i++) {\n\t\tif (visited[i] == 0) {\n\t\t\tcomponentSize = 0;\n\t\t\tdfs(A, i, visited, &componentSize);\n\t\t\tmaxComponentSize = max(maxComponentSize, componentSize);\n\t\t}\n\t}\n\t*largestComponentSize = maxComponentSize;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid_row == tid_col) {\n\t\tint componentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_row * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = max(*largestComponentSize, componentSize);\n\t}\n}\n", + "\n\n\tint myComponentSize = 0;\n\tint myId = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (A[N * myId + myId] == 1) {\n\t\tmyComponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[N * myId + i] == 1 && i != myId) {\n\t\t\t\tmyComponentSize += 1;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, myComponentSize);\n}\n", + "\n\n\tint threadIdx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint threadIdy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (threadIdx < N && threadIdy < N) {\n\t\tif (A[threadIdx * N + threadIdy] == 1) {\n\t\t\t// Found a 1 in the adjacency matrix.\n\t\t\t// Perform a breadth-first search to find the size of the component.\n\t\t\tint componentSize = 0;\n\t\t\tstd::queue q;\n\t\t\tq.push(threadIdx);\n\t\t\twhile (!q.empty()) {\n\t\t\t\tint v = q.front();\n\t\t\t\tq.pop();\n\t\t\t\tcomponentSize++;\n\t\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\t\tif (A[v * N + i] == 1 && i != v) {\n\t\t\t\t\t\tq.push(i);\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t\t// Update the largest component size if necessary.\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, componentSize);\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, componentSize);\n}\n", + "\n\n\tint localLargestComponentSize = 0;\n\tint globalLargestComponentSize = 0;\n\tint myRow = blockIdx.y * blockDim.y + threadIdx.y;\n\tint myCol = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (myRow < N && 
myCol < N) {\n\t\tif (A[myRow * N + myCol] == 1) {\n\t\t\tlocalLargestComponentSize++;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (threadIdx.x == 0 && threadIdx.y == 0) {\n\t\tfor (int i = 1; i < gridDim.x * gridDim.y; i++) {\n\t\t\tif (localLargestComponentSize < globalLargestComponentSize) {\n\t\t\t\tlocalLargestComponentSize = globalLargestComponentSize;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = localLargestComponentSize;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\t*largestComponentSize = max(*largestComponentSize, componentSize);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce to find the largest component size\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform DFS on each rank\n std::vector visited(N, false);\n int max_component_size = 0;\n #pragma omp parallel\n {\n int local_max_component_size = 0;\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n std::stack s;\n s.push(i);\n int component_size = 0;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n if (!visited[u]) {\n visited[u] = true;\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n }\n }\n }\n }\n local_max_component_size = std::max(local_max_component_size, component_size);\n }\n }\n #pragma omp critical\n {\n max_component_size = std::max(max_component_size, local_max_component_size);\n }\n }\n\n // Gather the maximum component size from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int recv_max_component_size;\n 
MPI_Recv(&recv_max_component_size, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n max_component_size = std::max(max_component_size, recv_max_component_size);\n }\n } else {\n MPI_Send(&max_component_size, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n\n return max_component_size;\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int largest_component_size;\n MPI_Reduce(&my_component_size, &largest_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find the connected components.\n std::vector visited(N, 0);\n std::vector component_size(N, 0);\n int max_component_size = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n int component_size = 0;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n component_size[i] = component_size;\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n MPI_Allreduce(&max_component_size, &max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n 
std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int largest_component_size;\n MPI_Reduce(&my_component_size, &largest_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the size of its own component\n int local_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_size = std::max(local_size, size);\n }\n }\n\n // Reduce the local sizes to find the global maximum\n int global_size;\n MPI_Reduce(&local_size, &global_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n // Perform a breadth-first search from each vertex to find all the vertices in the same component.\n int max_component_size = 0;\n #pragma omp parallel for reduction(max : max_component_size)\n for (size_t i = 0; i < N; ++i) {\n if (A[i][i] == 0) {\n continue;\n }\n\n // Initialize the queue with the current vertex.\n std::queue queue;\n queue.push(i);\n\n // Mark the current vertex as visited.\n A[i][i] = 0;\n\n // Perform a breadth-first search to find all the vertices in the same component.\n int component_size = 0;\n while (!queue.empty()) {\n size_t current_vertex = queue.front();\n queue.pop();\n component_size++;\n\n // Add all the unvisited neighbors of the current vertex to the queue.\n for (size_t j = 0; j < N; ++j) {\n if (A[current_vertex][j] == 1 && A[j][j] != 0) {\n queue.push(j);\n A[j][j] = 0;\n }\n }\n }\n\n max_component_size = std::max(max_component_size, component_size);\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n componentSize++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across 
all ranks.\n int globalMaxComponentSize;\n MPI_Reduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (rank == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(size, 0);\n\n // Perform a DFS on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int localComponentSize = 0;\n std::stack s;\n s.push(i);\n visited[i] = true;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n localComponentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n visited[j] = true;\n }\n }\n }\n componentSize[rank] = std::max(componentSize[rank], localComponentSize);\n }\n }\n\n // Reduce the component size array to find the largest component\n int globalComponentSize;\n MPI_Reduce(&componentSize[0], &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform a breadth-first search (BFS) from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n componentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Reduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (rank == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n // Perform a breadth-first search to find the largest component.\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the max_component_size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the result on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the number of vertices in its largest component\n int local_max = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int count = 
0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n count++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_max = std::max(local_max, count);\n }\n }\n\n // Reduce the local maxima to find the global maximum\n int global_max;\n MPI_Reduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the global maximum on rank 0\n if (my_rank == 0) {\n return global_max;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size\n int largestComponentSize = 0;\n\n // Loop over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n\n // Update the largest component size\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Reduce the largest component size to rank 0\n int globalLargestComponentSize;\n MPI_Reduce(&largestComponentSize, &globalLargestComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size\n return globalLargestComponentSize;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(N, 0);\n\n // Initialize the maximum component size\n int maxComponentSize = 0;\n\n // Loop over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited\n if (!visited[i]) {\n // Perform a depth-first search to find the size of the component containing vertex i\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n componentSize++;\n // Push all adjacent vertices onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the maximum component size\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Return the maximum component size\n return maxComponentSize;\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int max_component_size = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int 
my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int global_component_size;\n MPI_Reduce(&my_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n std::vector componentSize(N, 0);\n int largest = 0;\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n componentSize[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize[i]++;\n }\n }\n }\n largest = std::max(largest, componentSize[i]);\n }\n }\n\n int globalLargest = 0;\n MPI_Reduce(&largest, &globalLargest, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalLargest;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector component_size(num_ranks, 0);\n\n // Perform a parallel depth-first search to find the largest component\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int local_component_size = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n ++local_component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex * N + j] && !visited[j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n component_size[my_rank] = std::max(component_size[my_rank], local_component_size);\n }\n }\n\n // Reduce the component size array to find the largest 
component\n int global_component_size = 0;\n MPI_Reduce(component_size.data(), &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n\t Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n if (visited[i] == 0) {\n visited[i] = 1;\n Kokkos::View stack(\"stack\", N);\n size_t stack_size = 0;\n stack[stack_size++] = i;\n while (stack_size > 0) {\n size_t j = stack[--stack_size];\n for (size_t k = 0; k < N; k++) {\n if (A(j, k) == 1 && visited[k] == 0) {\n visited[k] = 1;\n stack[stack_size++] = k;\n }\n }\n }\n }\n });\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n count += (visited[i] == 1);\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n componentCount(A, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n dfs(A, visited, j, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::atomic_fetch_add(&visited(j), 1);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t k) {\n if (A(j, k) == 1 && visited(k) == 0) {\n Kokkos::atomic_fetch_add(&visited(k), 1);\n }\n });\n }\n });\n }\n }\n return count;\n}\n", + "\n\n KokConfigService::getSingleton().setNumThreads(N);\n KokConfigService::getSingleton().setParallelMode(KokConfigService::ParallelMode::OMP);\n KokConfigService::getSingleton().setOmpNumThreads(N);\n\n Kok::View counts(N);\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) { counts[i] = i; });\n\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) {\n for (size_t j = 0; j < N; j++) {\n if (A(i, j) == 1 && i != j) { counts[j] = counts[i]; }\n }\n });\n\n Kok::View unique_counts(N);\n 
Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) { unique_counts[i] = counts[i]; });\n Kok::sort(unique_counts);\n int num_components = 0;\n for (size_t i = 0; i < N; i++) {\n if (i == 0 || unique_counts[i] != unique_counts[i - 1]) { num_components++; }\n }\n return num_components;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n count++;\n dfs(A, i, visited);\n }\n }\n\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n visit(A, i, visited);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited[k] == 0) { visited[k] = 1; }\n });\n visited[j] = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n ++count;\n dfs(A, visited, i, count);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View labels(\"labels\", N);\n int num_components = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (labels(i) == 0) {\n int new_label = ++num_components;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &j) {\n if (A(i, j) == 1 && labels(j) == 0) {\n labels(j) = new_label;\n }\n });\n }\n });\n return num_components;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n dfs(A, visited, j, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n dfs(A, visited, i, N);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited[k] == 0) { visited[k] = 1; }\n });\n visited[j] = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View 
visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::atomic_fetch_add(&visited(j), 1);\n componentCount(A, j);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n componentCount(A, N, j);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1) visited(k) = 1;\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int nComponents = 0;\n std::vector visited(N, 0);\n\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n nComponents++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t curr = q.front();\n q.pop();\n if (visited[curr] == 0) {\n visited[curr] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[curr][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return nComponents;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) continue;\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n count++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n bool visited = false;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n if (!visited) {\n count++;\n visited = true;\n }\n break;\n }\n }\n }\n return 
count;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t curr = q.front();\n q.pop();\n if (visited[curr] == 0) {\n visited[curr] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[curr][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false for all nodes\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all nodes\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all nodes in the component\n dfs(A, i, visited);\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int* visited = new int[N];\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n delete[] visited;\n return count;\n}\n", + "\n\n int nComponents = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n nComponents++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n delete[] visited;\n return nComponents;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n count++;\n }\n }\n return count;\n}\n", + "\n\n int num_components = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n visited[current] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[current * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n delete[] visited;\n return num_components;\n}\n", + "\n\n int num_components = 0;\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n bool is_new_component = true;\n for (size_t j = 0; j < i; j++) {\n if (A[i * N + j] == 1) {\n is_new_component = false;\n break;\n }\n }\n if (is_new_component) { num_components++; }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while 
(!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) { q.push(j); }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t v = stack.top();\n stack.pop();\n if (!visited[v]) {\n visited[v] = true;\n // Push all the unvisited neighbors of v onto the stack\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n num_components++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n num_components++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false for all nodes\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all nodes\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over all neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the number of connected components\n return componentCount;\n}\n", + "_", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * 
N);\n\n#pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t cur = q.front();\n q.pop();\n if (visited[cur] == 0) {\n visited[cur] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[cur][j] == 1) { q.push(j); }\n }\n }\n }\n }\n }\n delete[] visited;\n return num_components;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, false);\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n dfs(A, visited, i);\n count++;\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "serial", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(i, A, visited);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(size_t i, std::vector const& A, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(j, A, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, visited, i, N);\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, visited, i);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, std::vector& visited, size_t i) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, visited, j);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + ":", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); j++) 
{\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(i, A, visited);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(size_t i, std::vector const& A, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(j, A, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, visited, i);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n dfs(A, i, visited);\n count++;\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "hip", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void componentCount(const int *A, size_t N, int *numComponents) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tif (count == 0) {\n\t\tcount = 1;\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n\tint tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\tif (tid >= N || tid_y >= N) {\n\t\treturn;\n\t}\n\tint component_id = tid;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tcomponent_id = min(component_id, i);\n\t\t}\n\t}\n\tatomicAdd(&numComponents[component_id], 1);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tint root = tid;\n\twhile (A[root * N + bid] != root) {\n\t\troot = A[root * N + bid];\n\t}\n\tif (tid == 0) {\n\t\tA[bid * N + bid] = root;\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint numThreads = hipGridDim_x * hipBlockDim_x;\n\tint label = tid;\n\tint numLabels = numThreads;\n\tbool changed = true;\n\twhile (changed) {\n\t\tchanged = false;\n\t\tfor (int i = tid; i < N; i += numThreads) {\n\t\t\tfor (int j = 0; j < N; j++) {\n\t\t\t\tif (A[i * N + j] && label > j) {\n\t\t\t\t\tlabel = j;\n\t\t\t\t\tchanged = true;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t\tif (tid == 0) {\n\t\t\tif (label < numLabels) {\n\t\t\t\tnumLabels = label;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n\t*numComponents = numLabels;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] != 0) {\n count++;\n for (int j = i + 1; j < N; j++) {\n if (A[tid * N + j] != 0) {\n count++;\n }\n }\n break;\n }\n }\n atomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint component = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcomponent = 1;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tatomicAdd(numComponents, component);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\tif (A[i * N + j] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N || tid_y >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tbool visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = false;\n\t}\n\tvisited[tid] = true;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N 
+ i] == 1 && !visited[i]) {\n\t\t\tcount++;\n\t\t\t__syncblocks();\n\t\t\tDFS(A, i, visited, N);\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx.x + hipBlockIdx.x * hipBlockDim.x;\n\tint bid = hipBlockIdx.y + hipBlockIdx.z * hipGridDim.z;\n\tint i = tid + bid * hipBlockDim.x * hipGridDim.x;\n\tint count = 0;\n\twhile (i < N * N) {\n\t\tif (A[i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t\ti += N * N;\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint id = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (id >= N) return;\n\tint count = 0;\n\tbool visited[N];\n\tmemset(visited, 0, sizeof(visited));\n\tfor (int i = 0; i < N; i++) {\n\t if (visited[i]) continue;\n\t visited[i] = true;\n\t count++;\n\t for (int j = 0; j < N; j++) {\n\t if (A[i * N + j] == 0) continue;\n\t visited[j] = true;\n\t }\n\t}\n\tnumComponents[id] = count;\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tint count = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tint n = hipBlockDim_x * hipGridDim_x;\n\tint count = 0;\n\tif (tid == 0) {\n\t\tcount = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*numComponents = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The component count is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n int global_count = 0;\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n local_count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM);\n if (rank == 0) {\n return global_count;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its own subset of rows.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N];\n std::fill(visited, visited + N, false);\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && !visited[j]) {\n ++localCount;\n std::queue q;\n q.push(j);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n for (size_t v = 0; v < N; ++v) {\n if (A[u * N + v] && !visited[v]) {\n q.push(v);\n }\n }\n }\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int component_count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n component_count++;\n }\n }\n\n int global_component_count;\n MPI_Reduce(&component_count, &global_component_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_component_count;\n }\n return 0;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit\n while (!stack.empty()) {\n // Pop the top vertex from the stack\n size_t vertex = stack.top();\n stack.pop();\n\n // Mark the vertex as visited\n visited[vertex] = true;\n\n // Iterate over all adjacent vertices\n for (size_t j = 0; j < N; j++) {\n // If the adjacent vertex is connected and has not been visited, push it onto the stack\n if (A[vertex][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the 
visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n count++;\n std::queue q;\n q.push(i);\n\n // Perform a breadth-first search to visit all nodes in the component\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n\n // Iterate over the neighbors of u\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all the unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of connected components in its portion of the graph.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n localCount++;\n break;\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n 
MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n count++;\n while (!q.empty()) {\n int cur = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[cur * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) return global_count;\n return -1;\n}\n", + "\n\n int n_components = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n n_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n\n int global_n_components;\n MPI_Allreduce(&n_components, &global_n_components, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_n_components;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n\n int globalCount;\n MPI_Reduce(&count, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then push the neighbor onto the stack\n if (!visited[j] && A[i][j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Count the number of connected 
components\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) continue;\n visited[u] = true;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its local rows\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited = false;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] && !visited) {\n localCount++;\n visited = true;\n }\n }\n }\n\n // Reduce the local counts to get the global count\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t u = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[u] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then push the neighbor onto the stack\n if (!visited[j] && A[i][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n // Iterate over the neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its local part of the graph\n int local_count = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N] = {false};\n for (size_t j = 0; j < N; j++) {\n if (!visited[j] && A[i][j] == 1) {\n // Found a new component\n local_count++;\n std::queue q;\n q.push(j);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n for (size_t v = 0; v < N; v++) {\n if (!visited[v] && A[u][v] == 1) {\n q.push(v);\n }\n }\n }\n }\n }\n }\n\n // Reduce the local counts to get the global count\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is connected and has not been visited, then add it to the stack\n if (A[node][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of connected components in its own subset of rows.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N] = {false};\n for (size_t j = 0; j < N; j++) {\n if (!visited[j] && A[i][j] == 1) {\n localCount++;\n dfs(A, visited, j, 
N);\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "cuda", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void componentCount(const int *A, size_t N, int *numComponents) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tint row = tid / N;\n\tint col = tid % N;\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\tatomicAdd(numComponents, 1);\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] != 0) {\n\t\t\tA[row * N + i] = 0;\n\t\t\tA[i * N + row] = 0;\n\t\t\tcomponentCount<<<1, N>>>(A, N, numComponents);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = -1;\n\t\t} else {\n\t\t\tnumComponents[tid] = tid;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && numComponents[tid] != -1) {\n\t\t\tnumComponents[k] = numComponents[tid];\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid < N && tid_y < N) {\n\t if (A[tid * N + tid_y] == 1) {\n\t atomicAdd(numComponents, 1);\n\t }\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tint row = tid / N;\n\tint col = tid % N;\n\tif (A[tid] == 0) {\n\t\tnumComponents[tid] = 0;\n\t\treturn;\n\t}\n\tint component = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] == 1) {\n\t\t\tcomponent = i;\n\t\t\tbreak;\n\t\t}\n\t}\n\tnumComponents[tid] = component;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tbool visited = false;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tvisited = true;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tif (visited) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\t// Each thread checks if the current element is non-zero.\n\t// If so, it increments the component count.\n\tint row = blockIdx.x;\n\tint col = blockIdx.y;\n\tif (A[row * N + col] != 0) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) 
{\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint bid = blockIdx.y * blockDim.y + blockIdx.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[bid] == 0) {\n\t\tvisited[bid] = 1;\n\t\t(*numComponents)++;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcomponentCount(A, N, numComponents);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = 0;\n\t\t} else {\n\t\t\tnumComponents[tid] = 1;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tint numComponents_local = numComponents[tid];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && numComponents_local != -1) {\n\t\t\tnumComponents[k] = numComponents_local;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (i == 0) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (numComponents[k] != -1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t*numComponents = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint bid = blockIdx.y * blockDim.y + blockIdx.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[bid] == 0) {\n\t\tvisited[bid] = 1;\n\t\t*numComponents += 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcomponentCount(A, N, numComponents);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tidx = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tidy = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tidx >= N || tidy >= N) {\n\t\treturn;\n\t}\n\tif (A[tidx * N + tidy] == 0) {\n\t\treturn;\n\t}\n\tatomicAdd(numComponents, 1);\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[tid] == 0) {\n\t\tvisited[tid] = 1;\n\t\t__syncthreads();\n\t\tint count = 1;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tif (A[tid * N + j] == 1 && visited[j] == 0) {\n\t\t\t\tvisited[j] = 1;\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t\tatomicAdd(numComponents, count);\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\tint component = tid;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tcomponent = min(component, i);\n\t\t}\n\t}\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tA[tid_y * N + i] = component;\n\t\t}\n\t}\n\tif (tid == component) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid 
% N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = 0;\n\t\t} else {\n\t\t\tnumComponents[tid] = 1;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tint numComponents_local = numComponents[tid];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1) {\n\t\t\tnumComponents_local = min(numComponents_local, numComponents[k * N + j]);\n\t\t}\n\t}\n\tnumComponents[tid] = numComponents_local;\n\t__syncthreads();\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tcount += (numComponents[k * N + j] == numComponents_local);\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid_row == tid_col) {\n\t\tif (A[tid] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j && A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tidx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tidy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (tidx >= N || tidx >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tidx * N + i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The component count is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n local_count++;\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+: count)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top vertex from the stack\n size_t v = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and is connected to the vertex, add it to the stack\n if (!visited[j] && A[i][j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n count++;\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a visited array to keep track of which nodes have been visited\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over all the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then add the neighbor to the stack\n if (!visited[j] && A[node * N + j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) 
{\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int count = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (visited[u] == 0) {\n count++;\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) { q.push(j); }\n }\n }\n }\n local_count += count > 0;\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) { return global_count; }\n return -1;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If the neighbor has not been visited and is connected to the node, then visit the neighbor\n if (!visited[j] && A[i][j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n count++;\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n 
visited[node] = true;\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Visit all the unvisited neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && !visited[j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n MPI_Reduce(&componentCount, &componentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a DFS to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // Perform a DFS to visit all connected nodes\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the 
neighbor has not been visited, push the neighbor onto the stack and mark it as visited\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n while (!stack.empty()) {\n // Get the current node\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the current node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and is connected to the current node, then add it to the stack\n if (!visited[j] && A[i][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n MPI_Reduce(&componentCount, nullptr, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return componentCount;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n int global_count = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n int my_start = rank * (N / size);\n int my_end = (rank + 1) * (N / size);\n if (rank == size - 1) {\n my_end = N;\n }\n #pragma omp parallel for reduction(+ : local_count)\n for (int i = my_start; i < my_end; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_count++;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n delete[] visited;\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[j] = true;\n\n // Push all the unvisited neighbors of the node onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[i * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not 
been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all the neighbors of the node onto the stack\n for (size_t j = 0; j < N; ++j) {\n if (A[node * N + j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n // Push all the unvisited neighbors of the vertex onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a 
depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n count++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; k++) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count across all ranks\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the connected components for a subset of the rows\n int start_row = rank * (N / size);\n int end_row = (rank + 1) * (N / size);\n if (rank == size - 1) {\n end_row = N;\n }\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Count the number of connected components\n int count = 0;\n for (int i = start_row; i < end_row; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n\n // Reduce the count across all ranks\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "&" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A represents a directed graph.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) { count += A(i, j); });\n });\n return count;\n}\n", + "\n\n int edge_count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { edge_count += A(i, j); }\n });\n return edge_count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(\"edgeCount\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) {\n count += A(i, j);\n }\n });\n return count;\n}\n", + "\n\n Kokkos::View edgeCount(\"edgeCount\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n edgeCount(i) = 0;\n for (size_t j = 0; j < N; ++j) { edgeCount(i) += A(i, j); }\n });\n int totalEdgeCount = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &total) { total += edgeCount(i); }, totalEdgeCount);\n return totalEdgeCount;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, int &edgeCount) {\n for (size_t j = 0; j < N; ++j) {\n edgeCount += A(i, j);\n }\n },\n edgeCount);\n return edgeCount;\n}\n", + "\n\n int edge_count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n if (A(i, j) > 0) {\n ++edge_count;\n }\n }\n });\n return edge_count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n 
Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) { count += A(i, j); });\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n count += A(i, j);\n }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA (const int i) {\n for (int j = 0; j < N; j++) {\n if (A(i, j) != 0) {\n count++;\n }\n }\n });\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) { count += A[i * N + j] > 0; }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int edge_count = 0;\n#pragma omp parallel for reduction(+ : edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n edge_count += A[i * N + j];\n }\n }\n return edge_count;\n}\n", + "\n\n int count = 0;\n#pragma opm parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n 
for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i][j] > 0) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "serial", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n 
for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "hip", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n Store the result in numEdges. A represents a directed graph.\n A is an NxN adjacency matrix stored in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void edgeCount(const int *A, size_t N, int *numEdges) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int count = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n hipAtomicAdd(numEdges, count);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[i * N + j];\n }\n }\n __syncthreads();\n atomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint numEdgesLocal = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tnumEdgesLocal += A[tid * N + i];\n\t}\n\thipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[i * N + j] > 0;\n }\n }\n atomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n int tid = threadIdx.x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += (A[tid * N + i] != 0);\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n for (int j = 0; j < N; j++) {\n if (A[tid * N + j] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[tid * N + j];\n }\n hipAtomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n }\n}\n", + "\n\n int tid = 
hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int row = tid;\n int col = tid;\n int count = 0;\n while (row < N) {\n count += A[row * N + col];\n row += hipBlockDim_x;\n col += hipBlockDim_x;\n }\n hipAtomicAdd(numEdges, count);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) 
{\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + ", \u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2 Saxe\u51a2 Saxe\u51a2\u51a2\u51a2 Saxe\u51a2\u51a2\u51a2\u51a2", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] != 0) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + 
"\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "cuda", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n Store the result in numEdges. A represents a directed graph.\n A is an NxN adjacency matrix stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void edgeCount(const int *A, size_t N, int *numEdges) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x;\n int count = 0;\n for (int col = 0; col < N; col++) {\n count += A[row * N + col];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int rowStart = blockIdx.x * blockDim.x + threadIdx.x;\n if (rowStart >= N) {\n return;\n }\n int rowEnd = rowStart + blockDim.x;\n if (rowEnd > N) {\n rowEnd = N;\n }\n int count = 0;\n for (int i = rowStart; i < rowEnd; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] != 0) {\n count++;\n }\n }\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int j = 0; j < N; j++) {\n if (A[tid * N + j] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\nint rowStart = blockIdx.x * blockDim.x + threadIdx.x;\nif (rowStart >= N) {\n return;\n}\nint rowEnd = rowStart + blockDim.x;\nif (rowEnd > N) {\n rowEnd = N;\n}\nint count = 0;\nfor (int i = rowStart; i < rowEnd; ++i) {\n for (int j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n}\natomicAdd(numEdges, count);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; ++col) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x 
+ threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row < N && col < N) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] == 1) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row >= N) {\n return;\n }\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; ++col) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma parallel for reduction (+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Allgather(&local_count, 1, MPI_INT, &global_count, 1, MPI_INT, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n int nthreads;\n omp_set_num_threads(nthreads);\n int local_count = 0;\n int global_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (myrank == 0) {\n return global_count;\n } else {\n return 0;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n 
local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n int global_count;\n\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "\n\n int local_edge_count = 0;\n int global_edge_count;\n #pragma omp parallel for reduction(+: local_edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_edge_count += A[i * N + j];\n }\n }\n MPI_Reduce(&local_edge_count, &global_edge_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_edge_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_edge_count = 0;\n #pragma omp parallel for reduction(+: local_edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_edge_count += A[i * N + j];\n }\n }\n int global_edge_count;\n MPI_Reduce(&local_edge_count, &global_edge_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_edge_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n 
int global_count = 0;\n #pragma omp parallel for reduction(+: local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max = 0;\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; ++i) {\n // Iterate over the columns of the adjacency matrix\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between node i and node j, increment the degree of node i\n if (A(i, j) == 1) {\n ++max;\n }\n }\n }\n // Return the maximum degree\n return max;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n Kokkos::View maxDegree(\"maxDegree\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n maxDegree(i) = degree;\n });\n return *Kokkos::max_element(maxDegree);\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &max_degree) {\n int degree = 0;\n for (int j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n max_degree 
= Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A(i, j);\n }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &maxDegree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n maxDegree = Kokkos::max(maxDegree, degree);\n });\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::View;\n\n int max_degree = 0;\n Kokkos::parallel_reduce(\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_degree) {\n int degree = 0;\n for (int j = 0; j < N; j++) {\n degree += A(i, j);\n }\n max_degree = std::max(max_degree, degree);\n },\n max_degree);\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n if (degree > max_degree) { max_degree = degree; }\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n Kokkos::View maxDegree(\"maxDegree\", N);\n Kokkos::parallel_for(\"maxDegree\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A(i, j);\n }\n maxDegree(i) = degree;\n });\n int max = 0;\n for (size_t i = 0; i < N; i++) {\n if (maxDegree(i) > max) {\n max = maxDegree(i);\n }\n }\n return max;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "_t>", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n 
max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "serial", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_deg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n max_deg = std::max(max_deg, deg);\n }\n return max_deg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) { deg += A[i * N + j]; }\n maxDeg = std::max(maxDeg, deg);\n }\n return maxDeg;\n}\n", + "\n\n int max_deg = 0;\n for (int i = 0; i < N; i++) {\n int deg = 0;\n for (int j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n max_deg = std::max(max_deg, deg);\n }\n return max_deg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i][j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int 
max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) { deg += A[i * N + j]; }\n maxDeg = std::max(maxDeg, deg);\n }\n return maxDeg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "hip", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Store the result in maxDegree.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void maxDegree(const int *A, size_t N, int *maxDegree) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n localMax = max(localMax, *maxDegree);\n if (threadIdx.x == 0) {\n *maxDegree = localMax;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n atomicMax(&(*maxDegree), max);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n hipAtomicMax(&(*maxDegree), degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n hipAtomicMax(maxDegree, degree);\n}\n", + "\n\n int myDegree = 0;\n for (int i = blockIdx.x * blockDim.x + 
threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; ++i) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint row = bid * hipBlockDim_x + tid;\n\tint degree = 0;\n\tif (row < N) {\n\t\tfor (int col = 0; col < N; col++) {\n\t\t\tdegree += A[row * N + col];\n\t\t}\n\t}\n\thipAtomicMax(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n *maxDegree = hipAtomicMax(*maxDegree, degree);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n }\n int max_degree;\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "\n\n int my_max = 0;\n for (size_t i = 0; i < N; i++) {\n int row_sum = 0;\n for (size_t j = 0; j < N; j++) {\n row_sum += A[i * N + j];\n }\n my_max = std::max(my_max, row_sum);\n }\n\n int global_max;\n MPI_Allreduce(&my_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int rowSum = 0;\n for (size_t j = 0; j < N; j++) { rowSum += A[i * N + j]; }\n myDegree = std::max(myDegree, rowSum);\n }\n\n int globalDegree;\n MPI_Allreduce(&myDegree, &globalDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] != 0) {\n myDegree++;\n }\n }\n }\n int maxDegree = 0;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n maxDeg = std::max(maxDeg, deg);\n }\n int globalMaxDeg;\n MPI_Allreduce(&maxDeg, &globalMaxDeg, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDeg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n myDegree++;\n }\n }\n }\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int local_max = 0;\n for (size_t i = 0; i < N; i++) {\n int local_degree = 0;\n for (size_t j = 0; j < N; j++) {\n local_degree += A[i * N + j];\n }\n local_max = std::max(local_max, local_degree);\n }\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n if (degree > maxDegree) {\n maxDegree = degree;\n 
}\n }\n return maxDegree;\n}\n", + "\n\n int local_max = 0;\n for (size_t i = 0; i < N; i++) {\n int row_sum = 0;\n for (size_t j = 0; j < N; j++) {\n row_sum += A[i * N + j];\n }\n local_max = std::max(local_max, row_sum);\n }\n\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int my_max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n my_max_degree = std::max(my_max_degree, degree);\n }\n\n int max_degree;\n MPI_Allreduce(&my_max_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "\n\n int my_degree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n\n int max_degree;\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "Degree)", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n myDegree += A[i * N + j];\n }\n }\n\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n\n int globalMaxDegree;\n MPI_Allreduce(&myDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n myDegree += A[i * N + j];\n }\n }\n\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n degree++;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, °ree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n }\n return maxDegree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "cuda", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph. \n Store the result in maxDegree.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void maxDegree(const int *A, size_t N, int *maxDegree) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n if (localMax > *maxDegree) {\n *maxDegree = localMax;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n if (tid == 0) {\n *maxDegree = max;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, myDegree);\n}\n", + "\n\n int tid = threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n maxDegree[tid] = max;\n}\n", + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = max(*maxDegree, localMax);\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint max = 0;\n\tif (tid < N) {\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tmax += A[tid * N + i];\n\t\t}\n\t}\n\tmaxDegree[tid] = max;\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tint max_degree = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tmax_degree = max(max_degree, maxDegree[i]);\n\t\t}\n\t\t*maxDegree = max_degree;\n\t}\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, 
myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = myDegree;\n for (size_t i = 1; i < gridDim.x; i++) {\n int otherDegree;\n cudaMemcpy(&otherDegree, &A[i * blockDim.x * N], sizeof(int), cudaMemcpyDeviceToHost);\n *maxDegree = max(*maxDegree, otherDegree);\n }\n }\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = max(*maxDegree, myDegree);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "Degree,", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(&*maxDegree, myDegree);\n}\n", + "\n\n\tint myMaxDegree = 0;\n\tint rowStart = blockIdx.x * blockDim.x + threadIdx.x;\n\tfor (int i = rowStart; i < N; i += blockDim.x * gridDim.x) {\n\t\tint degree = 0;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tdegree += A[i * N + j];\n\t\t}\n\t\tmyMaxDegree = max(myMaxDegree, degree);\n\t}\n\tatomicAdd(maxDegree, myMaxDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n myDegree++;\n }\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n int my_degree = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk_size = N / nprocs;\n int start_row = rank * chunk_size;\n int end_row = start_row + chunk_size;\n if (rank == nprocs - 1) {\n end_row = N;\n }\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_degree;\n }\n return -1;\n}\n", + "\n\n int max_degree = 0;\n int my_max_degree = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel for reduction(max : my_max_degree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_max_degree += A[i * N + j];\n }\n }\n\n MPI_Allreduce(&my_max_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return max_degree;\n } else {\n return -1;\n }\n}\n", + "\n\n int maxDegree = 0;\n int localMaxDegree = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel for reduction(max : localMaxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n localMaxDegree += A[i * N + j];\n }\n }\n\n MPI_Allreduce(&localMaxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int local_max_degree = 0;\n int global_max_degree = 0;\n\n // Compute the local maximum degree\n #pragma omp parallel for reduction(max : local_max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n local_max_degree = std::max(local_max_degree, degree);\n }\n\n // Reduce the local maximum degrees to find the global maximum degree\n MPI_Allreduce(&local_max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max_degree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n\n #pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, my_degree);\n my_degree = 0;\n }\n\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int localMaxDegree = 0;\n #pragma omp parallel for reduction(max:localMaxDegree)\n for (size_t i = 0; i < N; i++) {\n 
int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n localMaxDegree = std::max(localMaxDegree, degree);\n }\n MPI_Reduce(&localMaxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int myDegree = 0;\n #pragma omp parallel for reduction(max:myDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n myDegree = std::max(myDegree, degree);\n }\n\n int globalDegree;\n MPI_Reduce(&myDegree, &globalDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return globalDegree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n #pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int local_max = 0;\n for (int i = 0; i < N; i++) {\n int local_count = 0;\n for (int j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n local_max = std::max(local_max, local_count);\n }\n\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n #pragma omp parallel for reduction(max: maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n #pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, my_degree);\n my_degree = 0;\n }\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n MPI_Reduce(&maxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n int max_degree = 
0;\n #pragma opm parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int num_threads;\n omp_set_num_threads(4);\n #pragma omp parallel\n {\n #pragma omp for reduction(+:my_degree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n }\n MPI_Reduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(Kokkos::View &A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n visited[i] = 0;\n });\n\n dist[source] = 0;\n visited[source] = 1;\n\n int minDist = INT_MAX;\n\n while (visited[dest] == 0) {\n int minDistNode = -1;\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0 && (minDistNode == -1 || dist[i] < dist[minDistNode])) {\n minDistNode = i;\n }\n }\n\n if (minDistNode == -1) {\n break;\n }\n\n visited[minDistNode] = 1;\n\n for (int i = 0; i < N; i++) {\n if (A(minDistNode, i) == 1 && visited[i] == 0) {\n if (dist[i] == -1 || dist[i] > dist[minDistNode] + 1) {\n dist[i] = dist[minDistNode] + 1;\n }\n }\n }\n }\n\n if (dist[dest] == -1) {\n return -1;\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n Kokkos::View visited(\"visited\", N);\n Kokkos::parallel_for(\"bfs\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n visited[i] = false;\n });\n visited[source] = true;\n Kokkos::View queue(\"queue\", N);\n int head = 0;\n int tail = 0;\n queue[tail++] = source;\n while (head < tail) {\n int u = queue[head++];\n for (int v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n queue[tail++] = v;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { distances[i] = std::numeric_limits::max(); });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int current = q.front();\n q.pop();\n\n // Iterate over all neighbors of the current node\n for (size_t i = 0; i < N; i++) {\n if (A(current, i) == 1 && distances[i] == std::numeric_limits::max()) {\n distances[i] = distances[current] + 1;\n q.push(i);\n }\n }\n }\n\n // Return the distance to the destination\n return distances[dest];\n}\n", + "\n\n 
Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n dist() = -1;\n dist(source) = 0;\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n visited(source) = true;\n Kokkos::View queue(\"queue\", N);\n queue() = -1;\n int head = 0;\n int tail = 0;\n queue(tail++) = source;\n while (head != tail) {\n int u = queue(head++);\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited(v)) {\n dist(v) = dist(u) + 1;\n visited(v) = true;\n queue(tail++) = v;\n }\n }\n }\n return dist(dest);\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) { distances[i] = INT_MAX; });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Queue of nodes to visit\n std::queue q;\n q.push(source);\n\n // While there are nodes to visit\n while (!q.empty()) {\n // Get the current node\n int current = q.front();\n q.pop();\n\n // If the current node is the destination, return the distance\n if (current == dest) {\n return distances[current];\n }\n\n // Visit all neighbors of the current node\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A(current, i) == 1) {\n // If the distance to the neighbor is greater than the distance to the current node plus 1\n if (distances[i] > distances[current] + 1) {\n // Update the distance to the neighbor\n distances[i] = distances[current] + 1;\n // Add the neighbor to the queue\n q.push(i);\n }\n }\n }\n }\n\n // If there is no path from the source to the destination, return -1\n return -1;\n}\n", + "\n\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = Kokkos::TeamMember;\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::View distance(\"distance\", N);\n Kokkos::View queue(\"queue\", N);\n\n visited[source] = 1;\n distance[source] = 0;\n queue[0] = source;\n\n int head = 0;\n int tail = 1;\n\n while (head < tail) {\n int u = queue[head++];\n for (int v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n visited[v] = 1;\n distance[v] = distance[u] + 1;\n queue[tail++] = v;\n }\n }\n }\n\n return distance[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) { distances[i] = std::numeric_limits::max(); });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Initialize the queue with the source\n std::queue q;\n q.push(source);\n\n // While the queue is not empty\n while (!q.empty()) {\n // Get the current vertex\n int current = q.front();\n q.pop();\n\n // If the current vertex is the destination, return the distance\n if (current == dest) {\n return distances[current];\n }\n\n // For each neighbor of the current vertex\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current vertex and the neighbor\n if (A(current, i) == 1) {\n // If the distance to the neighbor is greater than the distance to the current vertex plus 1\n if (distances[i] > distances[current] + 1) {\n // Update the distance to the neighbor\n distances[i] = distances[current] + 1;\n\n // Add the neighbor to the queue\n 
q.push(i);\n }\n }\n }\n }\n\n // If the destination is not reachable, return -1\n return -1;\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n size_t head = 0;\n size_t tail = 0;\n queue[tail++] = source;\n\n while (head != tail) {\n int u = queue[head++];\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n queue[tail++] = v;\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist(i) = -1;\n visited(i) = false;\n queue(i) = false;\n });\n\n dist(source) = 0;\n queue(source) = true;\n\n while (true) {\n int min_dist = -1;\n int min_idx = -1;\n for (int i = 0; i < N; i++) {\n if (queue(i) && (min_dist == -1 || dist(i) < min_dist)) {\n min_dist = dist(i);\n min_idx = i;\n }\n }\n\n if (min_idx == -1) {\n break;\n }\n\n queue(min_idx) = false;\n visited(min_idx) = true;\n\n for (int i = 0; i < N; i++) {\n if (A(min_idx, i) > 0 && !visited(i)) {\n if (dist(i) == -1 || dist(i) > dist(min_idx) + 1) {\n dist(i) = dist(min_idx) + 1;\n queue(i) = true;\n }\n }\n }\n }\n\n return dist(dest);\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t i = 0; i < N; i++) {\n if (A(u, i) && dist[i] == -1) {\n dist[i] = dist[u] + 1;\n q.push(i);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View distance = Kokkos::View(\"distance\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) { distance[i] = -1; });\n distance[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && distance[v] == -1) {\n distance[v] = distance[u] + 1;\n q.push(v);\n }\n }\n }\n return distance[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = INT_MAX;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n q.push(v);\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && dist[v] == -1) {\n 
dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n Kokkos::View queue_head(\"queue_head\", 1);\n Kokkos::View queue_tail(\"queue_tail\", 1);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n queue_head[0] = 0;\n queue_tail[0] = 1;\n queue[0] = source;\n\n while (queue_head[0] < queue_tail[0]) {\n int u = queue[queue_head[0]];\n queue_head[0]++;\n for (int v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n visited[v] = true;\n dist[v] = dist[u] + 1;\n queue[queue_tail[0]] = v;\n queue_tail[0]++;\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n dist() = -1;\n dist(source) = 0;\n\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n visited(source) = true;\n\n Kokkos::View queue(\"queue\", N);\n queue() = -1;\n int head = 0;\n int tail = 0;\n queue(tail++) = source;\n\n while (head != tail) {\n int u = queue(head++);\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && !visited(v)) {\n dist(v) = dist(u) + 1;\n visited(v) = true;\n queue(tail++) = v;\n }\n }\n }\n\n return dist(dest);\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n distances() = std::numeric_limits::max();\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n distances(source) = 0;\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t i = 0; i < N; i++) {\n if (A(u, i) == 1 && distances(i) == std::numeric_limits::max()) {\n q.push(i);\n distances(i) = distances(u) + 1;\n }\n }\n }\n\n // Return distance to destination\n return distances(dest);\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n distances() = std::numeric_limits::max();\n\n // Initialize queue with source node\n std::queue queue;\n queue.push(source);\n distances(source) = 0;\n\n // Perform BFS\n while (!queue.empty()) {\n int current = queue.front();\n queue.pop();\n\n // Iterate over neighbors of current node\n for (size_t i = 0; i < N; i++) {\n if (A(current, i) == 1 && distances(i) == std::numeric_limits::max()) {\n distances(i) = distances(current) + 1;\n queue.push(i);\n }\n }\n }\n\n // Return distance to destination\n return distances(dest);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector dist(N, -1);\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[u] + 1 < dist[v]) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n 
// Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] > dist[u] + 1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize the distance to all nodes to infinity\n std::vector distance(N, INT_MAX);\n\n // Set the distance to the source node to 0\n distance[source] = 0;\n\n // Create a queue to store the nodes to be visited\n std::queue queue;\n\n // Enqueue the source node\n queue.push(source);\n\n // While there are still nodes to be visited\n while (!queue.empty()) {\n // Dequeue the next node\n int current = queue.front();\n queue.pop();\n\n // Visit all the neighbors of the current node\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A[current][i] > 0) {\n // If the distance to the neighbor is greater than the distance to the current node plus the weight of the edge\n if (distance[i] > distance[current] + A[current][i]) {\n // Update the distance to the neighbor\n distance[i] = distance[current] + A[current][i];\n\n // Enqueue the neighbor\n queue.push(i);\n }\n }\n }\n }\n\n // Return the distance to the destination node\n return distance[dest];\n}\n", + "\n\n int dist[N];\n std::fill(dist, 
dist + N, -1);\n dist[source] = 0;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n if (u == dest) {\n return dist[u];\n }\n\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int min_dist = INT_MAX;\n std::vector dist(N, INT_MAX);\n std::vector visited(N, false);\n\n dist[source] = 0;\n visited[source] = true;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n if (A[source][i] == 1) {\n dist[i] = 1;\n visited[i] = true;\n }\n }\n\n #pragma omp for\n for (size_t k = 0; k < N; k++) {\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i][k] == 1 && A[k][j] == 1 && dist[i] != INT_MAX && dist[j] > dist[i] + 1) {\n dist[j] = dist[i] + 1;\n visited[j] = true;\n }\n }\n }\n }\n }\n\n min_dist = dist[dest];\n return min_dist == INT_MAX ? -1 : min_dist;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "serial", + "prompt": "/* 
Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, 0);\n std::queue> q;\n q.push({source, 0});\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n if (visited[node] == 1) {\n continue;\n }\n visited[node] = 1;\n for (size_t i = 0; i < N; i++) {\n if (A[node][i] == 1) {\n q.push({i, dist + 1});\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n visited[source] = true;\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n for (int i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n visited[i] = true;\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n visited[source] = 
true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n dist[v] = dist[u] + 1;\n visited[v] = true;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) return dist;\n if (visited[node]) continue;\n visited[node] = true;\n for (size_t i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n if (v == dest) {\n return dist[v];\n }\n }\n }\n }\n return -1;\n}\n", + "=", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n q.push(v);\n dist[v] = dist[u] + 1;\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == -1) {\n q.push(v);\n dist[v] = dist[u] + 1;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push(std::make_pair(source, 0));\n while (!q.empty()) {\n std::pair p = q.front();\n q.pop();\n if (p.first == dest) {\n return p.second;\n }\n if (visited[p.first]) {\n continue;\n }\n visited[p.first] = true;\n for (size_t i = 0; i < N; ++i) {\n if (A[p.first][i] == 1) {\n q.push(std::make_pair(i, p.second + 1));\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n visited[source] = true;\n\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n for (int i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n visited[i] = true;\n }\n }\n }\n\n return -1;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "hip", + "prompt": "/* Compute the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Store the result in pathLength.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\n__global__ void shortestPathLength(const int *A, size_t N, int source, int dest, int *pathLength) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\n if (tid == source) {\n pathLength[bid] = 0;\n } else {\n pathLength[bid] = -1;\n }\n\n __syncthreads();\n\n for (int i = 0; i < N; i++) {\n if (pathLength[bid] >= 0 && A[tid * N + i] == 1) {\n if (i == dest) {\n pathLength[bid] = pathLength[bid] + 1;\n break;\n } else if (pathLength[i * N + dest] == -1 ||\n pathLength[i * N + dest] > pathLength[bid] + 1) {\n pathLength[i * N + dest] = pathLength[bid] + 1;\n }\n }\n }\n}\n", + "\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint distance = INT_MAX;\n\tif (tid == source) {\n\t\tdistance = 0;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[tid * N + k] == 1) {\n\t\t\tif (distance + 1 < pathLength[k]) {\n\t\t\t\tpathLength[k] = distance + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n\tif (tid == dest) {\n\t\t*pathLength = distance;\n\t}\n}\n", + "\n\n int n = hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x;\n int m = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x;\n if (n == source) {\n pathLength[n] = 0;\n __syncthreads();\n for (int i = 0; i < N; ++i) {\n if (A[n * N + i] == 1) {\n pathLength[i] = 1;\n }\n }\n } else {\n __syncthreads();\n for (int i = 0; i < N; ++i) {\n if (A[n * N + i] == 1) {\n if (pathLength[i] != -1) {\n pathLength[n] = pathLength[i] + 1;\n }\n }\n }\n }\n __syncthreads();\n if (n == dest) {\n *pathLength = pathLength[n];\n }\n}\n", + "\n\n int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int k = 0; k < N; k++) {\n if (A[tid * N + k] == 1 && tid != k) {\n if (pathLength[tid] == -1 || pathLength[k] + 1 < pathLength[tid]) {\n pathLength[tid] = pathLength[k] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n if (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n pathLength[i] = pathLength[tid] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n if (pathLength[i] == -1 || pathLength[i] > 
pathLength[tid] + 1) {\n pathLength[i] = pathLength[tid] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == source) {\n pathLength[bid] = 0;\n } else {\n pathLength[bid] = -1;\n }\n __syncthreads();\n int minPath = pathLength[bid];\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] && pathLength[i] != -1) {\n minPath = min(minPath, pathLength[i] + 1);\n }\n }\n pathLength[bid] = minPath;\n __syncthreads();\n if (tid == dest) {\n *pathLength = minPath;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tif (tid == source && bid == 0) {\n\t\t*pathLength = 0;\n\t} else if (tid == dest && bid == N - 1) {\n\t\t*pathLength = -1;\n\t} else {\n\t\t*pathLength = INT_MAX;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tif (bid + 1 < N) {\n\t\t\t\tint next = i + (bid + 1) * N;\n\t\t\t\tatomicMin(&pathLength[next], *pathLength + 1);\n\t\t\t}\n\t\t\tif (bid - 1 >= 0) {\n\t\t\t\tint next = i + (bid - 1) * N;\n\t\t\t\tatomicMin(&pathLength[next], *pathLength + 1);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] == 1 && tid != i) {\n if (pathLength[tid] == -1) {\n pathLength[tid] = pathLength[i] + 1;\n } else {\n pathLength[tid] = min(pathLength[tid], pathLength[i] + 1);\n }\n }\n }\n __syncthreads();\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int i = tid / N;\n int j = tid % N;\n if (i == source) {\n pathLength[j] = 1;\n } else {\n pathLength[j] = -1;\n }\n __syncthreads();\n for (int k = 0; k < N; ++k) {\n if (pathLength[k] == -1) {\n continue;\n }\n if (A[i * N + k] == 1) {\n if (pathLength[j] == -1 || pathLength[j] > pathLength[k] + 1) {\n pathLength[j] = pathLength[k] + 1;\n }\n }\n }\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tridy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tint i = tidx;\n\tint j = tridy;\n\tif (i == source) {\n\t\tpathLength[i] = 0;\n\t} else {\n\t\tpathLength[i] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (tid == 
source && bid == dest) {\n *pathLength = 0;\n } else if (tid == dest && bid == source) {\n *pathLength = 0;\n } else if (A[bid * N + tid] == 1) {\n *pathLength = 1;\n } else {\n *pathLength = -1;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int distance = INT_MAX;\n if (tid == source) {\n distance = 0;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] && distance + 1 < pathLength[i]) {\n pathLength[i] = distance + 1;\n }\n }\n __syncthreads();\n if (tid == dest) {\n *pathLength = distance;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int distance = INT_MAX;\n if (tid == source) {\n distance = 0;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] == 1 && tid != i) {\n if (distance + 1 < distance) {\n distance = distance + 1;\n }\n }\n __syncthreads();\n }\n if (tid == dest) {\n *pathLength = distance;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "= Synagoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesogues Synag Synag Synag Synagoguesoguesoguesoguesogues", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The shortest path length is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform relaxation\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector gathered_dist(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, gathered_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, gathered_dist[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return shortest path length on rank 0\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 
1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length from source to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distance to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // If we have not reached the destination, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // If u is the destination, return the distance\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // If no path exists, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int dist[N];\n for (int i = 0; i < N; i++) {\n dist[i] = INT_MAX;\n }\n dist[source] = 0;\n\n int next[N];\n for (int i = 0; i < N; i++) {\n next[i] = -1;\n }\n\n int visited[N];\n for (int i = 0; i < N; i++) {\n visited[i] = false;\n }\n\n int u;\n while (true) {\n u = -1;\n for (int i = 0; i < N; i++) {\n if (!visited[i] && (u == -1 || dist[i] < dist[u])) {\n u = i;\n }\n }\n if (u == -1 || dist[u] == INT_MAX) {\n break;\n }\n visited[u] = true;\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] != 0 && dist[u] + 1 < dist[v]) {\n dist[v] = dist[u] + 1;\n next[v] = u;\n }\n }\n }\n\n if (rank == 0) {\n return dist[dest];\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n 
while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == std::numeric_limits::max()) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * num_ranks);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (my_rank == 0) {\n for (int i = 0; i < N * num_ranks; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector dist(N, -1);\n dist[source] = 0;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int curr = q.front();\n q.pop();\n\n if (curr == dest) {\n return dist[curr];\n }\n\n for (int i = 0; i < N; i++) {\n if (A[curr][i] == 1 && dist[i] == -1) {\n dist[i] = dist[curr] + 1;\n q.push(i);\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) 
{\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return shortest path length if destination is reachable\n if (dist[dest] != INT_MAX) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Iterate over all vertices\n for (int i = 0; i < N; ++i) {\n // Each rank computes the shortest path from source to all other vertices\n for (int j = 0; j < N; ++j) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n\n // Gather the distances from all ranks\n std::vector all_dist(N * num_ranks);\n MPI_Allgather(dist.data(), N, MPI_INT, all_dist.data(), N, MPI_INT, MPI_COMM_WORLD);\n\n // Update the distances on each rank\n for (int j = 0; j < N; ++j) {\n for (int k = 0; k < num_ranks; ++k) {\n if (all_dist[k * N + j] < dist[j]) {\n dist[j] = all_dist[k * N + j];\n }\n }\n }\n }\n\n // Return the shortest path length on rank 0\n if (my_rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == std::numeric_limits::max()) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "cuda", + "prompt": "/* Compute the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Store the result in pathLength.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\n__global__ void shortestPathLength(const int *A, size_t N, int source, int dest, int *pathLength) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\n\tint count = 0;\n\twhile (pathLength[dest] == -1) {\n\t\t__syncthreads();\n\t\tif (pathLength[tid] == count) {\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tif (A[tid * N + i] && pathLength[i] == -1) {\n\t\t\t\t\tpathLength[i] = count + 1;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tcount++;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tif (pathLength[j] == -1) {\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] && pathLength[k] != -1) {\n\t\t\t\tpathLength[j] = pathLength[k] + 1;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "_t = 0, 1, 2, 3", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\n\tif (i == source && j == dest) {\n\t\t*pathLength = 0;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 1;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = (A[i * N + j] == 1) ? 
1 : INT_MAX;\n\t}\n\telse {\n\t\tpathLength[j] = INT_MAX;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t}\n\telse if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t}\n\telse {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] != 0) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid_row == source) {\n\t\tpathLength[tid_col] = 1;\n\t} else {\n\t\tpathLength[tid_col] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid_row] > 0 && A[tid_row * N + i] > 0) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid_row] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_row * N + i] == 1 && tid_row != tid_col) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid_row] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1) {\n\t\t\tif (A[tid * N + i] == 1 && pathLength[i] == -1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = (A[i * N + j] == 1) ? 
1 : INT_MAX;\n\t} else {\n\t\tpathLength[j] = INT_MAX;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] != INT_MAX && A[k * N + j] == 1) {\n\t\t\tpathLength[j] = min(pathLength[j], pathLength[k] + 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\n\t__syncthreads();\n\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_row * N + i] == 1 && pathLength[tid_row] != -1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid_row] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\n\t__syncthreads();\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint distance = 0;\n\tif (tid == source) {\n\t\tdistance = 0;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tif (distance + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = distance + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == dest) {\n\t\t*pathLength = distance;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] == -1) {\n\t\t\tcontinue;\n\t\t}\n\t\tif (A[k * N + j] == 1) {\n\t\t\tif (pathLength[j] == -1 || pathLength[j] > pathLength[k] + 1) {\n\t\t\t\tpathLength[j] = pathLength[k] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid_row == source) {\n\t\tpathLength[tid_col] = 1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[tid_row * N + k] && pathLength[tid_row] > 0) {\n\t\t\tpathLength[tid_col] = min(pathLength[tid_col], pathLength[tid_row] + 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tidx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tidy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (tidx == source && tidx == tidy) {\n\t\tpathLength[tidx] = 0;\n\t} else if (tidx == tidy) {\n\t\tpathLength[tidx] = -1;\n\t} else if (A[tidx * N + tidy] == 1) {\n\t\tpathLength[tidx] = 1;\n\t} else {\n\t\tpathLength[tidx] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[tidx] == -1 || pathLength[tidy] == -1) {\n\t\t\tcontinue;\n\t\t}\n\t\tif (pathLength[tidx] + pathLength[tidy] + 1 < pathLength[k]) {\n\t\t\tpathLength[k] = pathLength[tidx] + pathLength[tidy] + 1;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = A[i * N + j];\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tint minPath = pathLength[j];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] >= 0 && A[j * N + k] > 0) {\n\t\t\tint newPath = pathLength[k] + A[j * N + k];\n\t\t\tif (newPath < minPath) {\n\t\t\t\tminPath = newPath;\n\t\t\t}\n\t\t}\n\t}\n\tpathLength[j] = minPath;\n\t__syncthreads();\n\tif (j == dest) {\n\t\t*pathLength = pathLength[j];\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in 
row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The shortest path length is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the distance array to all infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize the queue with the source node\n std::queue q;\n q.push(source);\n\n // Perform a breadth-first search to find the shortest path\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to the destination\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Use OpenMP for parallel BFS\n #pragma omp parallel\n {\n while (true) {\n bool updated = false;\n\n // Relax all edges in parallel\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n updated = true;\n }\n }\n }\n\n // Check if any rank has updated distances\n int global_updated;\n MPI_Allreduce(&updated, &global_updated, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n // If no rank has updated distances, break\n if (!global_updated) {\n break;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n 
#pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length on rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector recv_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &recv_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return recv_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Return the shortest path length\n return shortest_path_length;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n for (int i = tid; i < N; i += num_threads) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Use OpenMP for parallelization within each rank\n #pragma omp parallel\n {\n // Perform BFS using a queue\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // For each neighbor of u\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n }\n\n // Gather the distances from all ranks to rank 0\n std::vector all_dist(N * MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Allgather(&dist[0], N, MPI::INT, &all_dist[0], N, MPI::INT);\n\n // Find the minimum distance to dest\n int min_dist = INT_MAX;\n for (int i = 0; i < N * MPI::COMM_WORLD.Get_size(); ++i) {\n if (all_dist[i] != INT_MAX && i / N == dest) {\n min_dist = std::min(min_dist, all_dist[i]);\n }\n }\n\n // Return the minimum distance\n return min_dist;\n}\n", + "\n\n // Initialize the distance to the source to 0 and all other distances to infinity\n std::vector distance(N, std::numeric_limits::max());\n distance[source] = 0;\n\n // Initialize the queue with the source\n std::queue queue;\n queue.push(source);\n\n // While the queue is not empty\n while (!queue.empty()) {\n // Dequeue the next node\n int current = queue.front();\n queue.pop();\n\n // If the current node is the destination, return the distance\n if (current == dest) {\n return distance[current];\n }\n\n // For each neighbor of the current node\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A[current * N + i] == 1) {\n // If the distance to the neighbor is greater than the distance to the current node plus 1\n if (distance[i] > distance[current] + 1) {\n // Set the distance to the neighbor to the distance to the current node plus 1\n distance[i] = distance[current] + 1;\n // Enqueue the neighbor\n queue.push(i);\n }\n }\n }\n }\n\n // If the destination is not reachable, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n 
std::vector global_dist(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i++) {\n if (i % N == dest) {\n shortest_path_length = std::min(shortest_path_length, global_dist[i]);\n }\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n for (int i = tid; i < N; i += num_threads) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances from all ranks to rank 0\n std::vector global_dist(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length on rank 0\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, global_dist[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int curr = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (curr == dest) {\n return dist[curr];\n }\n\n // Relax neighbors\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n if (A[curr * N + i] && dist[i] > dist[curr] + 1) {\n dist[i] = dist[curr] + 1;\n q.push(i);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n // Initialize the distance array.\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation.\n for (int i = 0; i < N - 1; i++) {\n#pragma omp parallel for\n for (int j = 0; j < N; j++) {\n for (int k = 0; k < N; k++) {\n if (A[j * N + k] != 0 && dist[j] + A[j * N + 
k] < dist[k]) {\n dist[k] = dist[j] + A[j * N + k];\n }\n }\n }\n }\n\n // Check for negative-weight cycles.\n for (int i = 0; i < N - 1; i++) {\n#pragma omp parallel for\n for (int j = 0; j < N; j++) {\n for (int k = 0; k < N; k++) {\n if (A[j * N + k] != 0 && dist[j] + A[j * N + k] < dist[k]) {\n return -1;\n }\n }\n }\n }\n\n // Return the shortest path length.\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n if (A[source][i] == 1) {\n dist[i] = 1;\n }\n }\n\n #pragma omp barrier\n\n for (int j = 1; j < N; j++) {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n if (dist[i] == j) {\n #pragma omp for\n for (int k = 0; k < N; k++) {\n if (A[i][k] == 1 && dist[k] > j + 1) {\n dist[k] = j + 1;\n }\n }\n }\n }\n\n #pragma omp barrier\n }\n }\n\n // Gather distances on rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length on rank 0\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n int min_odd = INT_MAX;\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n\n return min_odd;\n}\n", + "&", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n },\n min_odd);\n return min_odd;\n}\n", + "\n\n int result = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& result) {\n if (x(i) % 2 == 1 && x(i) < result) {\n result = x(i);\n }\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n Kokkos::View y(\"y\", x.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n y(i) = (x(i) % 2 == 1) ? 
x(i) : std::numeric_limits::max();\n });\n int min = std::numeric_limits::max();\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, int& min) {\n min = std::min(min, y(i));\n });\n return min;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int n = x.extent(0);\n View> y(\"y\", n);\n\n parallel_for(\"smallestOdd\", n, KOKKOS_LAMBDA(const int i) {\n if (i % 2 == 1) {\n y(i) = x(i);\n } else {\n y(i) = std::numeric_limits::max();\n }\n });\n\n int min_odd = std::numeric_limits::max();\n for (int i = 0; i < n; ++i) {\n min_odd = std::min(min_odd, y(i));\n }\n\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n\n int min_odd = INT_MAX;\n\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n\n return min_odd;\n}\n", + "\n\n int min_odd = Kokkos::parallel_reduce(Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1) {\n min_odd = std::min(min_odd, x(i));\n }\n }, min_odd);\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto min_odd = INT_MAX;\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int smallest = Kokkos::Max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& smallest) {\n if (x(i) % 2 == 1 && x(i) < smallest) {\n smallest = x(i);\n }\n });\n return smallest;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Policy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) { min_odd = x(i); }\n }, min_odd);\n return min_odd;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Policy = Kokkos::RangePolicy;\n\n int smallest = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(Policy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& smallest) {\n if (x(i) % 2 == 1 && x(i) < smallest) {\n smallest = x(i);\n }\n });\n\n return smallest;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n int min_odd = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(\n policy_t(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n },\n min_odd);\n\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + 
"do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min:min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n min_odd = std::min(min_odd, x[i]);\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n#pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int minOdd = INT_MAX;\n #pragma omp parallel for reduction(min:minOdd)\n for (int i = 0; i < static_cast(x.size()); ++i) {\n if (x[i] % 2 != 0 && x[i] < minOdd) {\n minOdd = x[i];\n }\n }\n return minOdd;\n}\n", + ": 1", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n min_odd = std::min(min_odd, x[i]);\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n#pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel for reduction(min:smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = 
INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min:min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "serial", + "prompt": "/* Return the value of the smallest odd number in the vector x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 != 0) {\n return x[i];\n }\n }\n return -1;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest_odd) {\n smallest_odd = x[i];\n }\n }\n return smallest_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest 
= INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "smallest&", + "\n\n int smallest = -1;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && (smallest == -1 || x[i] < smallest)) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "hip", + "prompt": "/* Find the value of the smallest odd number in the vector x. Store it in smallest.\n Use AMD HIP to compute in parallel. The kernel is launched with the same number of threads as elements in x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\n__global__ void smallestOdd(const int *x, size_t N, int *smallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < 
smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "(", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n __syncthreads();\n if (smallest_local % 2 == 0) {\n return;\n }\n if (smallest_local < *smallest) {\n atomicMin(smallest, smallest_local);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n\n // Find the smallest odd number in the local workgroup\n __shared__ int smallest_shared;\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n // Reduce the local smallest values to a single global smallest value\n __syncthreads();\n if (tid == 0) {\n smallest_shared = smallest_local;\n for (int i = 1; i < hipBlockDim_x; i++) {\n if (smallest_shared > smallest_local) {\n smallest_shared = smallest_local;\n }\n }\n *smallest = smallest_shared;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < 
smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = std::numeric_limits::max();\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < my_smallest) {\n my_smallest = x[i];\n }\n }\n\n int smallest;\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n for (int 
i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (auto v : x) {\n if (v % 2 == 1) {\n my_smallest = std::min(my_smallest, v);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int min_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int n = x.size();\n int smallest = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "cuda", + "prompt": "/* Find the value of the smallest odd number in the vector x. Store it in smallest.\n Use CUDA to compute in parallel. 
The kernel is launched with the same number of threads as elements in x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\n__global__ void smallestOdd(const int *x, size_t N, int *smallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (threadIdx.x == 0) {\n *smallest = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "(", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (threadIdx.x == 0) {\n *smallest = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n atomicMin(smallest, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = start + block_size;\n int min = INT_MAX;\n for (int i = start; i < end && i < N; i++) {\n if (x[i] % 2 == 1 && x[i] < min) {\n min = x[i];\n }\n }\n smallest[tid] = min;\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < block_size; i++) {\n if (smallest[i] < smallest[0]) {\n smallest[0] = smallest[i];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] < smallest_local && x[i] % 2) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n if (tid == 0 || x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = block.thread.x;\n if (tid >= N) {\n return;\n }\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 
2 == 1) {\n atomicMin(smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1 && (x[tid] < *smallest || *smallest == -1)) {\n *smallest = x[tid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n #pragma omp parallel for reduction(min: my_smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n\n int smallest;\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on this rank\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int result = INT_MAX;\n #pragma omp parallel for reduction(min:result)\n 
for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < result) {\n result = x[i];\n }\n }\n return result;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel\n {\n int local_smallest = INT_MAX;\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_smallest) {\n local_smallest = x[i];\n }\n }\n #pragma omp critical\n if (local_smallest < smallest) {\n smallest = local_smallest;\n }\n }\n return smallest;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on this rank\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Reduce to find the global minimum\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n // Find the smallest odd number on each rank\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int smallest_odd = INT_MAX;\n #pragma omp parallel for reduction(min: smallest_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest_odd) {\n smallest_odd = x[i];\n }\n }\n return smallest_odd;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on each rank\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n #pragma omp parallel for reduction(min:my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = std::numeric_limits::max();\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 != 0) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // 
Find the smallest odd number on this rank\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min : local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n #pragma omp parallel for reduction(min: my_smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n int my_smallest = INT_MAX;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n #pragma omp parallel for reduction(min:my_smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return smallest;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use Kokkos to compute product in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n double product = 1.0;\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (i % 2 == 0) {\n product *= x(i);\n } else {\n product /= x(i);\n }\n });\n return product;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n\n double product = 1.0;\n\n parallel_reduce(\n RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n\n return product;\n}\n", + "\n\n // Create a Kokkos view for the output\n Kokkos::View product = Kokkos::View(1);\n\n // Initialize the product to 1\n Kokkos::parallel_for(\"initialize_product\", Kokkos::RangePolicy(0, 1), KOKKOS_LAMBDA(const int& i) {\n product[i] = 1.0;\n });\n\n // Compute the product in parallel\n Kokkos::parallel_for(\"compute_product\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n if (i % 2 == 1) {\n product[0] *= 1.0 / x[i];\n } else {\n product[0] *= x[i];\n }\n });\n\n // Return the product\n return product[0];\n}\n", + "\n\n using namespace Kokkos;\n\n double product = 1.0;\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double product = 1.0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& result) {\n if (i % 2 == 1) {\n result *= 1.0 / x[i];\n } else {\n result *= x[i];\n }\n });\n\n return result;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, 
double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n },\n product);\n return product;\n}\n", + "\n\n double result = 1.0;\n for (int i = 0; i < x.extent(0); i += 2) {\n result *= x(i);\n }\n for (int i = 1; i < x.extent(0); i += 2) {\n result /= x(i);\n }\n return result;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x(i);\n } else {\n product *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n return product;\n}\n", + "_", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n },\n product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x(i);\n } else {\n product *= x(i);\n }\n }, product);\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. 
x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use OpenMP to compute product in parallel.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n int n = x.size();\n #pragma omp parallel for reduction(*: product)\n for (int i = 0; i < n; i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*: product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n int n = x.size();\n #pragma omp parallel for reduction(*:product)\n for (int i = 0; i < n; i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1.0 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for 
reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i += 2) {\n product *= x[i] / x[i + 1];\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "serial", + "prompt": "/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); 
i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i += 2) {\n product = product * x[i] / (x[i + 1] + 0.000000001);\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n for (size_t i = 1; i < x.size(); i += 2) {\n product /= x[i];\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "&", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "hip", + "prompt": "/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Store the result in product.\n Use AMD HIP to compute product in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\n__global__ void productWithInverses(const double *x, size_t N, double *product) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 1) {\n product[0] *= 1.0 / x[tid];\n } else {\n product[0] *= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (idx % 2 == 0) {\n product[0] *= x[idx];\n } else {\n product[0] /= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n atomicAdd(product, x[gid]);\n } else {\n atomicAdd(product, 1.0 / x[gid]);\n }\n }\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n product[tid] = x[tid];\n } else {\n product[tid] = 1 / x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 0) {\n product[0] *= x[tid];\n } else {\n product[0] /= x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double prod = 1.0;\n if (gid % 2 == 0) {\n prod *= x[gid];\n } else {\n prod /= x[gid];\n }\n\n hipAtomicAdd(product, prod);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n 
product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 1) {\n product[0] *= 1.0 / x[tid];\n } else {\n product[0] *= x[tid];\n }\n }\n}\n", + ".", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 0) {\n atomicAdd(product, x[tid]);\n } else {\n atomicAdd(product, 1.0 / x[tid]);\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use MPI to compute product in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the product on all ranks.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a new vector to store the product\n std::vector product(n);\n\n // Initialize the product to 1\n product[0] = 1;\n\n // Compute the product in parallel\n for (int i = 1; i < n; i++) {\n if (i % 2 == 1) {\n product[i] = product[i - 1] * (1 / x[i]);\n } else {\n product[i] = product[i - 1] * x[i];\n }\n }\n\n // Return the product\n return product[n - 1];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD,\n MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, 
&global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Reduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 0, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "cuda", + "prompt": "/* Compute the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Store the result in product.\n Use CUDA to compute product in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\n__global__ void productWithInverses(const double *x, size_t N, double *product) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n *product *= x[gid];\n } else {\n *product /= x[gid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n product[0] *= x[gid];\n } else {\n product[0] /= x[gid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n product[0] *= x[gid];\n } else {\n product[0] /= x[gid];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n product[idx] = 1.0;\n for (int i = 0; i < N; i += 2) {\n product[idx] *= x[i] / x[i + 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n product[0] *= x[idx];\n } else {\n product[0] /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n if (i % 2 == 1) {\n product[0] *= 1.0 / x[i];\n } else {\n product[0] *= x[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int 
tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n *product *= x[gid];\n } else {\n *product /= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use MPI and OpenMP to compute product in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the product on all ranks.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*: local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n#pragma omp parallel for reduction(* : local_product)\n for (int i = start_index; i < end_index; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Reduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 0, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 
MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*: local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size();\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < local_size; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n #pragma omp parallel for reduction(* : local_product)\n for (int i = start_index; i < end_index; i++) {\n if ((i + 1) % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n double product = 1.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the product of the even indexed elements\n #pragma omp parallel for reduction(*: product)\n for (int i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n\n // Compute the product of the odd indexed elements\n double oddProduct = 1.0;\n #pragma omp parallel for reduction(*: oddProduct)\n for (int i = 1; i < x.size(); i += 2) {\n oddProduct *= 1.0 / x[i];\n }\n\n // Reduce the odd product across all ranks\n double globalOddProduct;\n MPI_Allreduce(&oddProduct, &globalOddProduct, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n // Multiply the even and odd products to get the final result\n product *= globalOddProduct;\n\n return product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for reduction(*:local_product)\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size();\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < local_size; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n double product = 1.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the product of the even indexed elements\n #pragma omp parallel for 
reduction(*:product)\n for (int i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n\n // Compute the product of the odd indexed elements\n double oddProduct = 1.0;\n #pragma omp parallel for reduction(*:oddProduct)\n for (int i = 1; i < x.size(); i += 2) {\n oddProduct *= 1.0 / x[i];\n }\n\n // Reduce the odd product across all ranks\n double globalOddProduct;\n MPI_Allreduce(&oddProduct, &globalOddProduct, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n // Multiply the even and odd products\n product *= globalOddProduct;\n\n return product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for reduction(*:local_product)\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use Kokkos to sum in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(Kokkos::View const& x, Kokkos::View const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Create a Kokkos view to store the minimum values at each index\n Kokkos::View min_values(\"min_values\", x.extent(0));\n\n // Compute the minimum values at each index in parallel\n Kokkos::parallel_for(\"min_values_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n min_values[i] = std::min(x[i], y[i]);\n });\n\n // Reduce the minimum values in parallel to get the sum\n double sum = Kokkos::parallel_reduce(\"sum_min_values_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += min_values[i];\n }, 0.0);\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Min;\n\n auto min_values = Kokkos::parallel_reduce(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, Reducer& red) {\n red.init(std::min(x(i), y(i)));\n });\n\n return min_values.get_value();\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMemberFunction = Kokkos::TeamMemberFunction;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::RangePolicy>(0, x.extent(0))),\n TeamMemberFunction([&](const int& i) -> double { return std::min(x(i), y(i)); }),\n 0.0);\n return sum;\n}\n", + "\n\n using execution_space = Kokkos::DefaultExecutionSpace;\n using l_double = Kokkos::LayoutRight;\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n\n Kokkos::View min_values(\"min_values\", x.extent(0));\n\n Kokkos::parallel_scan(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update = std::min(x(i), y(i));\n }, min_values);\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), 
KOKKOS_LAMBDA(const int& i, double& update) {\n update += min_values(i);\n }, sum);\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_left;\n using Reducer = Kokkos::Min;\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, Reducer& sum) {\n sum += std::min(x(i), y(i));\n });\n\n return sum.template get<0>();\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_left;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = typename TeamPolicy::member_type;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::DefaultExecutionSpace(), x.extent(0)), KOKKOS_LAMBDA(const TeamMember& team, double& sum) {\n const auto i = team.league_rank();\n sum += std::min(x(i), y(i));\n });\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = Kokkos::TeamMember;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::RangePolicy>(0, x.extent(0))),\n KOKKOS_LAMBDA(TeamMember, sum, x, y) {\n sum += std::min(x(TeamMember::team_rank()), y(TeamMember::team_rank()));\n });\n return sum;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n const auto min_xy = parallel_reduce(\n RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& sum) {\n sum += std::min(x(i), y(i));\n },\n 0.0);\n\n return min_xy;\n}\n", + "\n\n Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::make_span(x.data(), x.size()), 0.0,\n [](double a, double b) { return std::min(a, b); },\n [](double a, double b) { return a + b; });\n}\n", + "\n\n using namespace Kokkos;\n double sum = 0;\n auto min_lambda = KOKKOS_LAMBDA(const double& a, const double& b) { return std::min(a, b); };\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += min_lambda(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n // Create a Kokkos view of the sum\n Kokkos::View sum(\"sum\", x.extent(0));\n\n // Parallel loop to compute the sum of the minimum values\n Kokkos::parallel_for(\"sum_of_minimum_elements\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n sum(i) = std::min(x(i), y(i));\n });\n\n // Reduce the sum in parallel\n double sum_value = Kokkos::parallel_reduce(\n \"sum_of_minimum_elements_reduce\", Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& sum) { sum += sum(i); }, 0.0);\n\n return sum_value;\n}\n", + "Kokkoss", + "\n\n const int numElements = x.extent(0);\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::Range(0, numElements), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n auto min_xy = Kokkos::min(x, y);\n auto sum = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, min_xy.extent(0)),\n KOKKOS_LAMBDA(const int& i) -> double { return min_xy(i); },\n (double)0);\n return sum;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_lambda = 
KOKKOS_LAMBDA(const double& x, const double& y) { return std::min(x, y); };\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += min_lambda(x(i), y(i));\n }, sum);\n\n return sum;\n}\n", + "\n\n const int N = x.extent(0);\n double sum = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n auto min_values = min(x, y);\n double sum = 0;\n Kokkos::parallel_reduce(policy_type(0, min_values.extent(0)), KOKKOS_LAMBDA(int i, double& sum) {\n sum += min_values(i);\n }, sum);\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use OpenMP to sum in parallel.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n double sum = 0;\n\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + ".", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 
0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma openmp parallel for reduction (+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "serial", + "prompt": "/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i 
= 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "hip", + "prompt": "/* Compute the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Store the result in sum.\n Use AMD HIP to sum in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\n__global__ void sumOfMinimumElements(const double *x, const double *y, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipGetGlobalIndex();\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n sum[tid] = min(x[gid], y[gid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n sum[gid] = min(x[gid], y[gid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n sum[tid] = min(x[tid], y[tid]);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n 
}\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use MPI to sum in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Return the sum on all ranks.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double sum = 0;\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < 
x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "cuda", + "prompt": "/* Compute the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Store the result in sum.\n Use CUDA to sum in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\n__global__ void sumOfMinimumElements(const double *x, const double *y, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ", Synagogues_", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[0] += fmin(x[tid], y[tid]);\n }\n}\n", + "{", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n sum[tid] = fmin(x[tid], y[tid]);\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n sum[tid] = fmin(x[i], y[i]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n 
i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use MPI and OpenMP to sum in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Return the sum on all ranks.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+ : local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n double sum = 0.0;\n int n = x.size();\n #pragma opm parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n double sum = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int chunk = n / nprocs;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == nprocs - 1) {\n end = n;\n }\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+ : local_sum)\n for (int i = start; i < end; i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma 
omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n int n = x.size();\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < n; i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n double sum = 0.0;\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int num_elements = x.size();\n int elements_per_rank = num_elements / num_ranks;\n int start_index = my_rank * elements_per_rank;\n int end_index = start_index + elements_per_rank;\n\n for (int i = start_index; i < end_index; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n double sum = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double sum;\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 
MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::sum;\n\n const int N = x.extent(0);\n\n double sum_x = 0.0;\n parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) { sum += x(i); }, sum_x);\n\n return sum_x / N;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n double sum = 0;\n parallel_reduce(RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); }, sum);\n\n double count = x.extent(0);\n return sum / count;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n using namespace Kokkos;\n\n double sum = 0;\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n auto sum = Kokkos::parallel_reduce(Policy(0, x.extent(0)), Reducer(0.0),\n KOKKOS_LAMBDA(const int& i, Reducer& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n using namespace Kokkos;\n double sum = 0;\n auto policy = Kokkos::RangePolicy<>(0, x.size());\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.size();\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n using l_double = Kokkos::LDDouble;\n\n auto sum = Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, l_double& sum) {\n sum += x(i);\n },\n l_double(0.0));\n\n return static_cast(sum) / x.extent(0);\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n policy_type policy(x.extent(0));\n\n double sum = 0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n return sum / 
x.extent(0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n const auto sum = Kokkos::parallel_reduce(Policy(0, x.size()), Reducer(0.0),\n KOKKOS_LAMBDA(const int& i, const Reducer& sum) {\n return sum + x(i);\n });\n\n return sum / x.size();\n}\n", + "\n\n typedef Kokkos::RangePolicy policy_type;\n policy_type policy(x.extent(0));\n\n double sum = 0.0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n return Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::Kok\u30b3\u30af_EXEC_SPACE, x, 0.0, Kok\u30b3\u30af::plus(), Kok\u30b3\u30af::divide(x.size()));\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n auto sum = parallel_reduce(\n RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); }, 0.0);\n\n return sum / x.size();\n}\n", + "\n\n using execution_space = typename Kokkos::View::execution_space;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto sum = Kokkos::parallel_reduce(\n policy_type(x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n },\n (double)0.0, tag_type());\n\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.size();\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n double sum = 0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(int i, double& sum) { sum += x(i); }, sum);\n return sum / x.size();\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using reduction_type = Kokkos::Sum;\n\n auto sum = Kokkos::parallel_reduce(policy_type(0, x.size()), reduction_type(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); });\n\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use OpenMP to compute in parallel.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double 
sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "serial", + "prompt": "/* Return the average of the vector x.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (double n : x) {\n sum += n;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n 
double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double num : x) {\n sum += num;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "hip", + "prompt": "/* Compute the average of the vector x. Store the result in average.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\n__global__ void average(const double *x, size_t N, double *average) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (size_t i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double shared_sum;\n shared_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = shared_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (int i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double block_sum;\n block_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = block_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n atomicAdd(average, x[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n double sum = 0;\n size_t idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum += x[idx];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (int i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double sharedSum;\n sharedSum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = sharedSum / N;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n average[0] += x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = block.x;\n if (tid == 0) {\n *x = 0;\n for (int i = 0; i < N; i++) {\n *x += x[i];\n }\n *x = *x / N;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n double sum = 0;\n for (int i = tid; i < N; i += blockDim.x) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n" + ] + }, + { + "problem_type": 
"reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the average on all ranks.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg = sum / x.size();\n\n double global_avg;\n MPI_Allreduce(&avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n\n return global_avg;\n}\n", + "\n\n double sum = 0.0;\n for (double xi : x) {\n sum += xi;\n }\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0.0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / size;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (auto xi : x) {\n sum += xi;\n }\n double avg = sum / x.size();\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (auto v : x) {\n sum += v;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return 
avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg = sum / x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / x.size();\n double global_avg;\n MPI_Allreduce(&avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n return global_avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double local_sum = 0;\n for (double xi : x) {\n local_sum += xi;\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "cuda", + "prompt": "/* Compute the average of the vector x. Store the result in average.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Examples:\n \n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\n__global__ void average(const double *x, size_t N, double *average) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n average[0] += x[idx];\n }\n}\n", + "\n\n double sum = 0;\n for (size_t i = block_idx.x * block_dim.x + thread_idx.x; i < N; i += block_dim.x * grid_dim.x) {\n sum += x[i];\n }\n * average = sum / N;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n __shared__ double sum;\n sum = 0;\n if (i < N) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + 
threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n\n double sum = 0;\n for (int i = tid; i < N; i += bid_size * grid_size) {\n sum += x[i];\n }\n\n __shared__ double shared_sum;\n shared_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = shared_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n double sum = 0;\n for (int i = tid + bid * bid_size; i < N; i += grid_size * bid_size) {\n sum += x[i];\n }\n double avg = sum / N;\n *average = avg;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n __shared__ double sum;\n if (i < N) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n double sum = 0;\n if (tid < N) {\n sum = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n double sum_all = 0;\n for (int i = 0; i < gridDim.x; i++) {\n sum_all += sum;\n }\n *average = sum_all / N;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the average of the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the average on all ranks.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / size;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_avg = 0.0;\n #pragma omp parallel for reduction(+:local_avg)\n for (int i = 0; i < x.size(); ++i) {\n local_avg += x[i];\n }\n local_avg /= x.size();\n\n double global_avg;\n MPI_Allreduce(&local_avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n\n return global_avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a sum variable to store the sum of the elements in the vector\n double sum = 0.0;\n\n // Use OpenMP to parallelize the loop over the elements in the vector\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n // Add the current element to the sum\n sum += x[i];\n }\n\n // Get the average by dividing the sum by the number of elements\n double avg = sum / n;\n\n // Return the average\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n double local_avg = 0;\n int local_size = x.size();\n double sum = 0;\n\n#pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < local_size; i++) {\n sum += x[i];\n }\n\n local_avg = sum / local_size;\n\n double global_avg = 0;\n MPI_Allreduce(&local_avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= MPI_Comm_size(MPI_COMM_WORLD);\n\n return global_avg;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n", + "\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / x.size();\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n double global_sum = 0;\n\n #pragma omp parallel for reduction(+: local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n return sum / n;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use Kokkos to reduce in 
parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n auto xor_reduce = reduce(x, (bool)false, [](bool a, bool b) { return a ^ b; });\n return xor_reduce;\n}\n", + "\n\n // Create a Kokkos::TeamPolicy for the reduction.\n Kokkos::TeamPolicy policy(x.extent(0));\n\n // Define the reduction functor.\n struct XORFunctor {\n bool result = false;\n\n KOKKOS_INLINE_FUNCTION\n void operator()(const bool& x) const { result ^= x; }\n };\n\n // Create a Kokkos::ParallelReduce object to perform the reduction.\n Kokkos::ParallelReduce reduce(result);\n\n // Invoke the reduction.\n Kokkos::parallel_for(policy, reduce);\n\n // Return the result.\n return reduce.result;\n}\n", + "\n\n bool result = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& result) { result ^= x(i); }, result);\n return result;\n}\n", + "\n\n using namespace Kokkos;\n\n bool result = false;\n auto policy = Kokkos::RangePolicy(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n }, result);\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy>;\n using TeamMemberType = typename Policy::member_type;\n\n bool result = false;\n\n Kokkos::parallel_reduce(Policy(1), KOKKOS_LAMBDA(const TeamMemberType& team) {\n bool local_result = false;\n const bool* x_data = x.data();\n for (int i = team.league_rank(); i < x.size(); i += team.league_size()) {\n local_result ^= x_data[i];\n }\n team.team_reduce(KOKKOS_LAMBDA(const bool& a, const bool& b) { return a ^ b; }, local_result);\n }, result);\n\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x[i];\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x[i];\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy;\n using TeamMemberType = typename Policy::member_type;\n\n auto reduce = KOKKOS_LAMBDA(TeamMemberType team) {\n bool local_result = false;\n for (int i = team.league_rank(); i < x.extent(0); i += team.league_size()) {\n local_result ^= x(i);\n }\n return local_result;\n };\n\n bool global_result = Kokkos::parallel_reduce(Policy(x.extent(0)), reduce);\n return global_result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(x), Kokkos::Scan);\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i) {\n result ^= x(i);\n });\n return result;\n}\n", 
+ "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(false), x);\n}\n", + "\n\n using namespace Kokkos;\n bool result = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& result) {\n if (x(i)) {\n result = !result;\n }\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy;\n using Reducer = Kokkos::LogicalXor;\n\n return parallel_reduce(Policy(x.extent(0)), Reducer(false), x);\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(false),\n KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n}\n", + "\n\n using namespace Kokkos;\n\n auto reduce_functor = KOKKOS_LAMBDA(const bool& a, const bool& b) {\n return a ^ b;\n };\n\n auto reduce_result = reduce(x, reduce_functor, false);\n\n return reduce_result;\n}\n", + "\n\n bool result = x(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(1, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x(i);\n });\n return result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use OpenMP to reduce in parallel.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n {\n for (auto const& xi : x) {\n result ^= xi;\n }\n }\n return result;\n}\n", + ")", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(+:result)\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel 
reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel for reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "serial", + "prompt": "/* Return the logical XOR reduction of the vector of bools x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "(", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + ")", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return 
result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "hip", + "prompt": "/* Compute the logical XOR reduction of the vector of bools x. Store the result in output.\n Use AMD HIP to reduce in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\n__global__ void reduceLogicalXOR(const bool *x, size_t N, bool *output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n bool local_result = x[gid];\n __syncthreads();\n while (blockDim.x <= N) {\n if (tid == 0) {\n local_result ^= x[gid + blockDim.x];\n }\n __syncthreads();\n blockDim.x *= 2;\n __syncthreads();\n }\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int size = blockDim.x;\n bool local_result = x[bid * size + tid];\n __syncthreads();\n for (int i = size / 2; i > 0; i /= 2) {\n if (tid < i) {\n local_result ^= x[bid * size + tid + i];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int warp_size = warpSize;\n int num_warps = bid_size / warp_size;\n int warp_id = tid / warp_size;\n int lane_id = tid % warp_size;\n bool warp_result = x[bid * bid_size + tid];\n for (int i = 1; i < warp_size; i++) {\n warp_result ^= x[bid * bid_size + tid + i];\n }\n __shared__ bool shared_result[num_warps];\n shared_result[warp_id] = warp_result;\n __syncthreads();\n if (warp_id == 0) {\n bool result = shared_result[0];\n for (int i = 1; i < num_warps; i++) {\n result ^= shared_result[i];\n }\n *output = result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n bool local_result = x[tid];\n __syncthreads();\n while (blockDim.x <= N) {\n if (tid < N) { local_result ^= x[tid + blockDim.x]; }\n __syncthreads();\n blockDim.x *= 2;\n __syncthreads();\n }\n if (tid == 0) { *output = local_result; }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n bool local_result = x[gid];\n\n __syncthreads();\n\n while (bid > 0) {\n if (tid == 0) {\n local_result ^= x[gid + hipBlockDim_x];\n }\n bid >>= 1;\n gid >>= 1;\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n bool local_result = x[gid];\n\n while (gid < N) {\n local_result ^= x[gid];\n gid += hipBlockDim_x;\n }\n\n output[tid] = local_result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid 
< N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n bool result = false;\n while (gid < N) {\n result ^= x[gid];\n gid += hipBlockDim_x * hipGridDim_x;\n }\n output[tid] = result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n output[0] ^= output[i];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use MPI to reduce in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = false;\n for (bool b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int size, 
rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool result = x[rank];\n MPI_Allreduce(&result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n MPI_Allxor(&result, &result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = x[0];\n for (size_t i = 1; i < x.size(); i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "cuda", + "prompt": "/* Compute the logical XOR reduction of the vector of bools x. Store the result in output.\n Use CUDA to reduce in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\n__global__ void reduceLogicalXOR(const bool *x, size_t N, bool *output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Each thread reduces a single element of the output.\n int tid = threadIdx.x;\n output[tid] = x[tid];\n __syncthreads();\n\n // Reduce the output in place.\n while (tid < N) {\n output[tid] ^= output[tid + blockDim.x];\n tid += blockDim.x * 2;\n __syncthreads();\n }\n}\n", + "\n\n // Each thread reduces a single element of the output\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the output in place\n for (int stride = 1; stride < N; stride *= 2) {\n for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < N; tid += 2 * stride) {\n output[tid] ^= output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n // Each thread reduces a subset of the input.\n int tid = threadIdx.x;\n int stride = blockDim.x;\n bool local_result = false;\n for (int i = tid; i < N; i += stride) {\n local_result ^= x[i];\n }\n\n // Reduce the local results using a shared memory reduction.\n __shared__ bool shared_result[blockDim.x];\n shared_result[tid] = local_result;\n __syncthreads();\n\n // The first thread writes the final result to output.\n if (tid == 0) {\n bool final_result = false;\n for (int i = 0; i < blockDim.x; ++i) {\n final_result ^= shared_result[i];\n }\n *output = final_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n // Each thread reduces a single element of the output\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the output in place\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid] ^= output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncbarriers();\n while (N > 1) {\n if (tid < N / 2) {\n output[tid] ^= output[tid + N / 2];\n }\n __syncbarriers();\n N /= 2;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n bool result = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n result ^= x[i];\n }\n output[bid] = result;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n bool result = x[bid * stride + tid];\n __syncthreads();\n while (stride > 1) {\n if (tid < stride) {\n result ^= x[bid * stride + tid + stride];\n }\n stride /= 2;\n __syncthreads();\n }\n if (tid == 0) {\n *output = result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int num_elements = N;\n int num_blocks = (num_elements + bid_size - 1) / bid_size;\n\n bool local_result = x[tid];\n __syncthreads();\n while (bid_size > 1) {\n if (tid < bid_size / 2) {\n local_result ^= x[tid + bid_size / 2];\n }\n bid_size /= 2;\n __syncthreads();\n }\n if (bid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n 
__syncthreads();\n}\n", + "\n\n // Each thread reduces a single element of the vector\n int tid = threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the vector in parallel\n while (N > 1) {\n if (tid < N / 2) {\n output[tid] ^= output[tid + N / 2];\n }\n N /= 2;\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = blockDim.x; i < N; i += blockDim.x) {\n output[0] ^= output[i];\n }\n }\n}\n", + "\n\n // Each thread reduces a single element of the output.\n int idx = threadIdx.x;\n output[idx] = x[idx];\n\n // Each thread reduces a pair of elements of the output.\n while (idx < N) {\n output[idx] ^= x[idx + 1];\n idx += 2;\n }\n\n // Reduce the remaining elements of the output.\n for (int i = 1; i < N; i *= 2) {\n for (int j = 0; j < N; j += 2 * i) {\n output[j] ^= output[j + i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n bool local_result = x[tid];\n\n for (int i = tid + stride; i < N; i += stride) {\n local_result ^= x[i];\n }\n\n output[bid] = local_result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int num_blocks = gridDim.x;\n\n bool local_result = x[tid];\n\n __syncthreads();\n\n while (bid_size > 1) {\n if (tid < bid_size / 2) {\n local_result = local_result ^ x[tid + bid_size / 2];\n }\n bid_size /= 2;\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n bool result = x[bid * stride + tid];\n\n __syncthreads();\n\n for (int offset = stride; offset < N; offset *= 2) {\n if (tid < offset) {\n result ^= x[bid * stride + tid + offset];\n }\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x * gridDim.x;\n\n bool local_result = x[tid + bid * stride];\n\n for (int i = 2 * stride; i <= N; i *= 2) {\n if (tid < N / i) {\n local_result ^= x[tid + bid * i];\n }\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use MPI and OpenMP to reduce in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on all ranks.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(^: local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int nranks;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n\n bool local_result = false;\n for (auto b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Reduce on each rank\n bool local_result = false;\n #pragma omp parallel for reduction(^:local_result)\n for (size_t i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n // Reduce across all ranks\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n\n int nranks;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n\n bool local_result = false;\n for (bool b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR,\n MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i 
< x.size(); ++i) {\n local_result ^= x[i];\n }\n }\n\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int nranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(+:local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n #pragma omp parallel for reduction(^:result)\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool local_result = x[my_rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + ".", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool local_result = x[my_rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n\n bool result;\n MPI_Reduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, 0, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n bool result = x[0];\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (int i = 1; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = false;\n #pragma omp parallel for reduction(^:local_result)\n for (size_t i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n // Reduce the vector of bools using MPI_Reduce.\n bool result;\n MPI_Allreduce(MPI_IN_PLACE, &x[0], x.size(), MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n // Return the result.\n return result;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the array x into output.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(Kokkos::View const& x, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA (const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int i) {\n output(i) = 0;\n for (int j = i; j >= 0; j--) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n output(i) = 0;\n for (int j = i; j < n; ++j) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::View temp = output.slice(0, output.size() - 1);\n Kok\u30b3\u30af::View temp2 = output.slice(1, output.size());\n Kok\u30b3\u30af::View x_rev = x.reversed();\n Kok\u30b3\u30af::View temp3 = x_rev.slice(1, x_rev.size());\n Kok\u30b3\u30af::View temp4 = x_rev.slice(0, x_rev.size() - 1);\n Kok\u30b3\u30af::View temp5 = temp4 + temp3;\n temp = temp5;\n temp2 = temp5 + temp;\n\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n output(i) = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, i),\n KOKKOS_LAMBDA(const int& j, const int& val) { return val + x(i - j - 1); },\n 0);\n });\n}\n", + "(", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(\"reversePrefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i, int& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = x(i) + output(i - 1);\n }\n },\n output(n - 1));\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i) { output(i) = output(n - 1) - output(i); });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i, int& sum) {\n sum = (i == 0) ? 
x(i) : sum + x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n // Compute the reverse prefix sum\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n Kokkos::ScanSum(Kokkos::Reverse));\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n output(N - 1));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n output(i) = x(x.extent(0) - 1 - i);\n if (i > 0) output(i) += output(i - 1);\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA (const int i) {\n output(i) = 0;\n for (int j = n - 1; j >= i; j--) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n output(i) = (i == 0) ? x(i) : x(i) + output(i - 1);\n });\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Tag = Kokkos::TagType;\n\n Kokkos::parallel_for(Policy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n output(i) = x(x.extent(0) - 1 - i);\n if (i > 0) output(i) += output(i - 1);\n }, Tag());\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n 0);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use OpenMP to compute in parallel.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n 
for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n output[i] = x[i];\n } else {\n output[i] = output[i + 1] + x[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n output[i] = 0;\n for (int j = n - 1; j >= i; --j) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n output[i] = 0;\n for (int j = i; j < n; j++) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "serial", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 
1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output.back() = x.back();\n for (int i = x.size() - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n for (int i = x.size() - 1; i >= 0; --i) {\n if (i + 1 < x.size()) {\n output[i] = x[i] + output[i + 1];\n } else {\n output[i] = x[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output.back() = x.back();\n for (int i = x.size() - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\n__global__ void reversePrefixSum(const int *x, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n if (tid > 0) {\n output[tid] += output[tid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n output[gid] = x[N - 1 - gid];\n if (gid > 0) { output[gid] += output[gid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[N - 1 - gid] = x[gid];\n }\n __syncthreads();\n for (int i = N - 2; i >= 0; i--) {\n if (gid < N) {\n output[i] = output[i + 1] + x[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n int i = tid + bid * blockSize;\n if (i < N) {\n output[i] = 0;\n for (int j = N - 1; j >= i; j--) { output[i] += x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = blockDim.x / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int offset = bid * hipBlockDim_x + tid;\n if (offset < N) {\n output[offset] = x[N - 1 - offset];\n if (offset > 0) {\n output[offset] += output[offset - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n 
__syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum in reverse order\n std::vector prefix_sum(x.size());\n prefix_sum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n prefix_sum[i] = prefix_sum[i + 1] + x[i];\n }\n\n // Gather the prefix sums from all ranks\n std::vector gathered_prefix_sums(x.size() * size);\n MPI_Gather(prefix_sum.data(), x.size(), MPI_INT, gathered_prefix_sums.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gathered_prefix_sums[i * size + size - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_sum = 0;\n for (int i = n - 1; i >= 0; i--) {\n local_sum += x[i];\n if (rank == 0) {\n output[i] = local_sum;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n, 0);\n std::vector global_sums(n, 0);\n\n // Compute the partial sums on each rank\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n if (i > 0) {\n partial_sums[i] += partial_sums[i - 1];\n }\n }\n\n // Gather the partial sums from all ranks\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n global_sums[i] += global_sums[i + 1];\n }\n output[i] = global_sums[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.reserve(n);\n std::iota(std::execution::par, output.begin(), output.end(), 0);\n std::transform(std::execution::par, x.rbegin(), x.rend(), output.rbegin(),\n [](int a, int b) { return a + b; });\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n 
}\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n std::vector prefix_sum(n);\n MPI_Allreduce(x.data(), prefix_sum.data(), n, MPI_INT, MPI_SUM, comm);\n\n if (rank == 0) {\n output.resize(n);\n for (int i = 0; i < n; i++) {\n output[i] = prefix_sum[n - 1 - i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n local_output[i] = x[i];\n } else {\n local_output[i] = x[i] + local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n for (int i = 0; i < n; i++) {\n if (i == 0) {\n output[i] = global_output[i];\n } else {\n output[i] = global_output[i] - global_output[i - 1];\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n for (int i = n - 1; i >= 0; i--) {\n if (rank == 0) {\n output[i] = local_x[i];\n }\n MPI_Allreduce(MPI_IN_PLACE, &local_x[i], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n local_output[i] = output[i];\n }\n }\n if (rank == 0) {\n output = local_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_sum = 0;\n for (int i = x.size() - 1; i >= 0; i--) {\n local_sum += x[i];\n if (rank == 0) {\n output[i] = local_sum;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector global_sums(n);\n\n // Compute the partial sums on each rank\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n if (i > 0) {\n partial_sums[i] += partial_sums[i - 1];\n }\n }\n\n // Gather the partial sums from all ranks\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n global_sums[i] += global_sums[i + 1];\n }\n output[i] = global_sums[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector partial_sums(n);\n std::partial_sum(x.rbegin(), x.rend(), partial_sums.rbegin());\n\n std::vector global_sums(n);\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n local_output[i] = x[i];\n } else {\n local_output[i] = x[i] + local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n 
MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Reverse the global prefix sum\n for (int i = 0; i < n / 2; i++) {\n std::swap(global_output[i], global_output[n - i - 1]);\n }\n\n // Store the result in output on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the reverse prefix sum\n std::vector reverse_prefix_sum(x.size());\n reverse_prefix_sum[x.size() - 1] = prefix_sum[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] - x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n output = reverse_prefix_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefix_sum(n);\n std::vector reverse_prefix_sum(n);\n\n // Compute the prefix sum\n prefix_sum[0] = x[0];\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the reverse prefix sum\n reverse_prefix_sum[n - 1] = prefix_sum[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] + prefix_sum[i];\n }\n\n // Gather the reverse prefix sum on rank 0\n MPI_Gather(&reverse_prefix_sum[0], n, MPI_INT, &output[0], n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n local_output[i] = x[i];\n if (i + 1 < n) {\n local_output[i] += local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i + 1 < n) {\n global_output[i] -= global_output[i + 1];\n }\n }\n\n // Store the result on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector global_sums(n);\n\n // Compute the partial sums on each rank\n partial_sums[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n partial_sums[i] = partial_sums[i + 1] + x[i];\n }\n\n // Gather the partial sums into the global_sums vector on rank 0\n MPI_Gather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n output[0] = global_sums[0];\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + global_sums[n - i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x in reverse order\n std::vector reversePrefixSum(x.size());\n reversePrefixSum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reversePrefixSum[i] = reversePrefixSum[i + 1] + x[i];\n }\n\n // Gather the reverse prefix sums from all ranks\n std::vector 
gatheredReversePrefixSum(x.size() * size);\n MPI_Gather(reversePrefixSum.data(), x.size(), MPI_INT, gatheredReversePrefixSum.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the final reverse prefix sum\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gatheredReversePrefixSum[i * size];\n }\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Compute the prefix sum of the vector x in reverse order.\n std::vector reverse_prefix_sum(x.size());\n reverse_prefix_sum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] + x[i];\n }\n\n // Gather the reverse prefix sums from all ranks.\n std::vector gathered_reverse_prefix_sums(x.size() * num_ranks);\n MPI_Gather(reverse_prefix_sum.data(), x.size(), MPI_INT, gathered_reverse_prefix_sums.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the output vector.\n if (my_rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gathered_reverse_prefix_sums[i * num_ranks + my_rank];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n for (int i = 0; i < n; i++) {\n local_output[i] = x[n - i - 1];\n }\n\n std::vector global_output(n);\n MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector recvbuf(n);\n std::vector sendbuf(n);\n for (int i = 0; i < n; i++) {\n sendbuf[i] = x[n - i - 1];\n }\n MPI_Allgather(sendbuf.data(), n, MPI_INT, recvbuf.data(), n, MPI_INT, MPI_COMM_WORLD);\n int sum = 0;\n for (int i = 0; i < n; i++) {\n sum += recvbuf[i];\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\n__global__ void reversePrefixSum(const int *x, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (int j = block_size / 2; j > 0; j /= 2) {\n if (tid < j) {\n output[tid] += output[tid + j];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n for (int i = tid + 1; i < N; i++) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n int offset = bid * stride;\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i + offset];\n output[i + offset] = sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N - 1; i >= 0; i--) {\n if (tid == i) {\n output[i] += output[i + 1];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid == 0) {\n output[0] = x[0];\n } else {\n output[tid] = output[tid - 1] + x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int i = blockIdx.x * blockDim.x + tid;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (int j = i + blockDim.x; j < N; j += blockDim.x) {\n output[j] = output[j - blockDim.x] + x[j];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int sum = 0;\n for (int i = tid; i < N; i += stride) {\n sum += x[i];\n output[N - 1 - i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n output[i] = 0;\n for (int j = i + 1; j < N; j++) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n int offset = bid * stride;\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i + offset];\n output[i + offset] = sum;\n }\n}\n", + "\n\n 
int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n if (i > 0) { output[i] += output[i - 1]; }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n int offset = block * stride + tid;\n\n if (offset < N) {\n output[offset] = x[offset];\n }\n\n __syncthreads();\n\n for (int i = offset + stride; i < N; i += stride) {\n output[i] = output[i - stride] + x[i];\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n if (tid == 0) {\n output[0] = x[0];\n } else {\n output[tid] = output[tid - 1] + x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_size * block_id + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int stride = blockDim * gridDim;\n int start = tid + blockId * blockDim;\n int end = min(start + stride, N);\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector reverse_partial_sums(n);\n\n // Compute the partial sums in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n partial_sums[i] = x[i];\n for (int j = i + 1; j < n; ++j) {\n partial_sums[j] += partial_sums[j - 1];\n }\n }\n\n // Compute the reverse partial sums in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; --i) {\n reverse_partial_sums[i] = partial_sums[i];\n if (i > 0) {\n reverse_partial_sums[i] += reverse_partial_sums[i - 1];\n }\n }\n\n // Gather the reverse partial sums on rank 0\n MPI_Gather(&reverse_partial_sums[0], n, MPI_INT, &output[0], n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local prefix sum\n std::vector local_prefix_sum(n);\n local_prefix_sum[0] = x[0];\n for (int i = 1; i < n; i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n\n // Compute the global prefix sum\n std::vector global_prefix_sum(n);\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n if (rank == 0) {\n output.resize(n);\n output[n - 1] = global_prefix_sum[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = global_prefix_sum[i] - global_prefix_sum[i + 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n 
std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_output[i] = local_x[n - 1 - i];\n }\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n std::vector local_prefix_sum(x.size());\n local_prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n\n // Compute the global prefix sum\n std::vector global_prefix_sum(x.size());\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), x.size(), MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = global_prefix_sum[x.size() - 1] - global_prefix_sum[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int n = x.size();\n std::vector partial_sums(n);\n\n // Compute the partial sums in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n partial_sums[j] += partial_sums[j - 1];\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sums(n);\n MPI_Reduce(partial_sums.data(), global_sums.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum on rank 0\n if (my_rank == 0) {\n output.resize(n);\n output[n - 1] = global_sums[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = global_sums[i] - global_sums[i + 1];\n }\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local reverse prefix sum\n std::vector local_output(n);\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (my_rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n std::vector global_output(n);\n MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the prefix sum in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_output[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n local_output[i] += x[j];\n }\n }\n\n // Reduce the local prefix sums to the global prefix sum on rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Reverse the prefix sum on rank 0\n if (rank == 0) {\n for (int i = 0; i < n / 2; i++) {\n std::swap(output[i], output[n - i - 1]);\n }\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n prefix_sum[i] += prefix_sum[i + 1];\n }\n }\n\n // Gather the results on rank 0\n if (my_rank == 0) {\n output = prefix_sum;\n }\n}\n", + "\n\n int n = x.size();\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &n_procs);\n int local_n = n / n_procs;\n int remainder = n % n_procs;\n if (my_rank == 0) {\n output.resize(n);\n }\n std::vector local_x(local_n + (my_rank < remainder ? 1 : 0));\n MPI_Scatter(x.data(), local_n, MPI_INT, local_x.data(), local_n, MPI_INT, 0,\n MPI_COMM_WORLD);\n std::vector local_output(local_n + (my_rank < remainder ? 1 : 0));\n local_output[local_n - 1] = local_x[local_n - 1];\n #pragma omp parallel for\n for (int i = local_n - 2; i >= 0; --i) {\n local_output[i] = local_x[i] + local_output[i + 1];\n }\n MPI_Gather(local_output.data(), local_n, MPI_INT, output.data(), local_n, MPI_INT,\n 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Reverse the prefix sum in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n output[i] = prefix_sum[n - 1 - i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nranks; i++) {\n std::vector recv_buf(n);\n MPI_Recv(recv_buf.data(), n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n output[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; 
i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n prefix_sum[i] -= prefix_sum[i + 1];\n }\n output[i] = prefix_sum[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = x.size() - 1; i >= 0; i--) {\n if (i < x.size() - 1) {\n prefix_sum[i] -= prefix_sum[i + 1];\n }\n output[i] = prefix_sum[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n local_output[i] = local_x[i];\n if (i + 1 < n) {\n local_output[i] += local_output[i + 1];\n }\n }\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of the vector x\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum of the vector x\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n output[i] = prefix_sum[x.size() - 1] - prefix_sum[i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(&output[0], x.size(), MPI_INT, &output[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + 
"parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n using namespace Kokkos;\n\n const size_t n = x.extent(0);\n View y(\"y\", n);\n\n parallel_for(n, KOKKOS_LAMBDA(const size_t i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return sum(y);\n}\n", + "\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n using l_d_type = Kokkos::LayoutLeft;\n using l_d_view_type = Kokkos::View;\n\n const size_t n = x.extent(0);\n l_d_view_type x_copy(\"x_copy\", n);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::parallel_scan(policy_type(0, n), KOKKOS_LAMBDA(const size_t& i, l_d_view_type& x_copy) {\n if (i > 0) { x_copy(i) += x_copy(i - 1); }\n }, tag_type());\n double sum = 0;\n for (size_t i = 0; i < n; ++i) { sum += x_copy(i); }\n return sum;\n}\n", + "\n using namespace Kokkos;\n\n const int N = x.extent(0);\n View y(\"y\", N);\n\n parallel_for(N, KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return reduce(N, KOKKOS_LAMBDA(int i, double sum) { return sum + y(i); }, 0.0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_type;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = typename TeamPolicy::member_type;\n\n double sum = 0.0;\n Kokkos::parallel_reduce(TeamPolicy(Kokkos::Experimental::partition_by_coordinates(x)),\n KOKKOS_LAMBDA(TeamMember& team) {\n double local_sum = 0.0;\n for (auto i = team.league_rank(); i < x.extent(0);\n i += team.league_size()) {\n local_sum += x(i);\n }\n team.team_reduce(KOKKOS_LAMBDA(double& tsum,\n double& lsum) { tsum += lsum; });\n if (team.team_rank() == 0) { sum = team.team_reduce(); }\n });\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using execution_space = Kokkos::DefaultExecutionSpace;\n using policy_type = Kokkos::RangePolicy;\n\n // Compute the prefix sum array\n View prefix_sum(\"prefix_sum\", x.size());\n parallel_for(policy_type(0, x.size()), KOKKOS_LAMBDA(const int i) {\n prefix_sum(i) = (i == 0) ? 
x(i) : x(i) + prefix_sum(i - 1);\n });\n\n // Compute the sum of the prefix sum array\n double sum = 0;\n parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int i, double& sum) {\n sum += prefix_sum(i);\n }, sum);\n\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n using execution_space = Kokkos::DefaultExecutionSpace;\n\n const int n = x.extent(0);\n auto y = Kokkos::View(\"y\", n);\n\n Kokkos::parallel_for(\n \"prefix_sum\", policy_t(0, n), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return Kokkos::parallel_reduce(\n \"sum_prefix_sum\", policy_t(0, n), KOKKOS_LAMBDA(const double& a,\n const double& b) {\n return a + b;\n },\n y);\n}\n", + "\n using namespace Kokkos;\n\n using policy_type = Kokkos::TeamPolicy;\n policy_type policy(x.extent(0));\n\n double sum = 0.0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n double prefix_sum = 0.0;\n for (int j = 0; j <= i; ++j) { prefix_sum += x(j); }\n sum += prefix_sum;\n });\n\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n auto prefix_sum = Kokkos::scan(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n },\n Reducer(0));\n\n return Kokkos::parallel_reduce(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += prefix_sum(i);\n },\n Reducer(0));\n}\n", + "\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::TeamPolicy;\n\n double sum = 0;\n Kokkos::parallel_reduce(\n TeamPolicy<>(x.extent(0), Kokkos::AUTO),\n KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n sum += update;\n },\n sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a prefix sum view\n View prefix_sum = create_mirror_view(x);\n\n // Compute the prefix sum\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = prefix_sum(i - 1) + x(i);\n }\n prefix_sum(i) = sum;\n });\n\n // Compute the sum of the prefix sum array\n double sum_of_prefix_sum = 0.0;\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += prefix_sum(i);\n }, sum_of_prefix_sum);\n\n return sum_of_prefix_sum;\n}\n", + "\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using ViewType = Kokkos::View;\n\n const size_t n = x.extent(0);\n ViewType prefix_sum(\"prefix_sum\", n);\n\n parallel_for(\"prefix_sum\", Policy(0, n), KOKKOS_LAMBDA(const size_t& i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n\n return prefix_sum(n - 1);\n}\n", + "\n Kokkos::View prefixSum = Kokkos::View(x.extent(0));\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = prefixSum(i - 1) + x(i);\n }\n }, prefixSum);\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n update += prefixSum(i);\n }, sum);\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.extent(0));\n Kokkos::parallel_for(\"prefix_sum_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n return Kokkos::parallel_reduce(\"sum_prefix_sum\", 
Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double& sum) {\n sum += prefix_sum(i);\n }, 0.0);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto prefix_sum = Kokkos::scan(policy_type(0, x.size()), tag_type(), x,\n [=](const double& a, const double& b) { return a + b; });\n\n return Kokkos::parallel_reduce(policy_type(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += prefix_sum(i); },\n 0.0);\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n prefix_sum(i) = (i == 0) ? x(i) : x(i) + prefix_sum(i - 1);\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n double sum = 0;\n Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n sum += update;\n },\n tag_type());\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = prefix_sum(i - 1) + x(i);\n }\n prefix_sum(i) = sum;\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reduction = Kokkos::Sum;\n\n // Compute the prefix sum array\n View y = Kokkos::scan(Policy(0, x.size()), Reduction(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n // Compute the sum of the prefix sum array\n double sum = Kokkos::reduce(Policy(0, y.size()), Reduction(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += y(i);\n });\n\n return sum;\n}\n", + "\n using Kokkos::parallel_reduce;\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n }, sum);\n\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n std::vector prefixSum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefixSum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for 
reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = (i > 0 ? prefix_sum[i - 1] : 0) + x[i];\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n, 0);\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n sum += x[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) prefix_sum[i] += prefix_sum[i - 1];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n }\n for (double v : x) {\n sum += v;\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n }\n for (double v : x) {\n sum += v;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n #pragma omp parallel for reduction(+: sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n std::vector prefixSum(x.size(), 0.0);\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) { prefixSum[i] += prefixSum[i - 1]; }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 
1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n // Compute the prefix sum array\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sum array\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n return sum;\n}\n", + "\n\n // Create a vector to store the prefix sum\n std::vector prefix_sum(x.size());\n\n // Compute the prefix sum in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sum\n double sum = 0;\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n return sum;\n}\n", + "\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = (i > 0 ? prefix_sum[i - 1] : 0) + x[i];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "serial", + "prompt": "/* Compute the prefix sum array of the vector x and return its sum.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (auto xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "(", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double 
sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "hip", + "prompt": "/* Compute the prefix sum array of the vector x and compute its sum. Store the result in sum.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\n__global__ void sumOfPrefixSum(const double *x, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n sum[tid + stride] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n\n double sum_local = 0;\n for (int i = gid; i < N; i += bid_size) {\n sum_local += x[i];\n }\n\n double sum_global;\n hipReduce(&sum_global, &sum_local, 1, hipReduceSum, 0);\n *sum = sum_global;\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid < N) {\n sum[gid] = x[gid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < hipBlockDim_x; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n double prefix_sum = 0;\n for (int i = 0; i <= gid; i++) {\n prefix_sum += x[i];\n }\n sum[gid] = prefix_sum;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n sum[tid] += sum[tid + i];\n }\n __syncthreads();\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum_local = x[tid];\n __syncthreads();\n for (int i = 1; i < blockDim.x; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n sum_local += x[tid + i];\n }\n __syncthreads();\n }\n sum[tid] = sum_local;\n }\n __syncthreads();\n if (tid == 0) {\n double sum_global = 0;\n for (int i = 0; i < N; i++) {\n sum_global += sum[i];\n }\n *sum = sum_global;\n }\n}\n", + "\n\n int tid = blockIdx.x 
* blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n sum[bid] += x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n size_t stride = blockDim.x * gridDim.x;\n double localSum = 0;\n for (size_t i = tid; i < N; i += stride) {\n localSum += x[i];\n }\n __syncthreads();\n double globalSum = localSum;\n for (size_t i = 1; i < blockDim.x; i++) {\n globalSum += localSum;\n }\n __syncthreads();\n *sum = globalSum;\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n sum[tid + stride] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n if (tid > 0) { sum[tid] += sum[tid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n __syncthreads();\n if (tid > 0) { sum[tid] += sum[tid - 1]; }\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 0; i < N; i++) { sum[0] += sum[i]; }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "1\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sum\n double sum = 0;\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), 
prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = std::accumulate(prefix_sum.begin(), prefix_sum.end(), 0.0);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n\n double prefix_sum = 0;\n MPI_Allreduce(&sum, &prefix_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return prefix_sum;\n } else {\n return 0;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefixSum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum array\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sum array\n double sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n MPI_Allreduce(&sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n }\n return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "cuda", + "prompt": "/* Compute the prefix sum array of the vector x and compute its sum. Store the result in sum.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\n__global__ void sumOfPrefixSum(const double *x, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n double sum_local = 0;\n for (int i = tid; i < N; i += stride) {\n sum_local += x[i];\n }\n __shared__ double sum_shared[blockDim.x];\n sum_shared[tid] = sum_local;\n __syncthreads();\n if (tid == 0) {\n double sum_block = 0;\n for (int i = 0; i < blockDim.x; i++) {\n sum_block += sum_shared[i];\n }\n atomicAdd(sum, sum_block);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n const int bid = blockIdx.x;\n const int bid_size = blockDim.x;\n const int gid = bid * bid_size + tid;\n\n if (gid < N) {\n double partial_sum = x[gid];\n if (gid > 0) {\n partial_sum += x[gid - 1];\n }\n sum[gid] = partial_sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = tid + bid * bid_size;\n if (i < N) {\n sum[i] = x[i];\n }\n __syncthreads();\n for (int s = bid_size / 2; s > 0; s >>= 1) {\n if (i < N && tid < s) {\n sum[i] += sum[i + s];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum_prefix = 0;\n for (int i = 0; i <= tid; i++) {\n sum_prefix += x[i];\n }\n double sum_suffix = 0;\n for (int i = tid + 1; i < N; i++) {\n sum_suffix += x[i];\n }\n sum[tid] = sum_prefix + sum_suffix;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * 
blockDim.x + threadIdx.x;\n if (tid < N) {\n double partial_sum = 0;\n for (int i = 0; i <= tid; i++) { partial_sum += x[i]; }\n sum[tid] = partial_sum;\n }\n __syncthreads();\n if (tid == 0) {\n double total_sum = 0;\n for (int i = 0; i < N; i++) { total_sum += sum[i]; }\n *sum = total_sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = 1; i < N; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double prefixSum = 0;\n for (int i = 0; i <= tid; i++) {\n prefixSum += x[i];\n }\n sum[tid] = prefixSum;\n }\n __syncthreads();\n if (tid == 0) {\n double totalSum = 0;\n for (int i = 0; i < N; i++) {\n totalSum += sum[i];\n }\n *sum = totalSum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = bid * bid_size + tid;\n\n double prefix_sum = 0;\n while (i < N) {\n prefix_sum += x[i];\n x[i] = prefix_sum;\n i += grid_size * bid_size;\n }\n\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) { *sum += x[i]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n double s = 0;\n for (int i = 0; i <= gid; i++) {\n s += x[i];\n }\n sum[gid] = s;\n if (gid > 0) {\n sum[gid] += sum[gid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(sum, x[tid]);\n }\n __syncthreads();\n if (tid == 0) {\n double sum_temp = *sum;\n atomicAdd(sum, sum_temp);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(prefixSum.data(), &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) return sum;\n else return 0.0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[0], &sum, x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefix_sum[x.size() - 1], 1, MPI_DOUBLE, &sum, 1, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n double global_sum = 0.0;\n\n // Compute the prefix sum of the local vector\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 1; i < x.size(); i++) {\n x[i] += x[i - 1];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n 
MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum of the global prefix sum on rank 0\n if (rank == 0) {\n return global_sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n double sum = 0.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); i++) {\n double local_sum = 0.0;\n for (int j = 0; j <= i; j++) {\n local_sum += x[j];\n }\n prefix_sum[i] = local_sum;\n }\n\n MPI_Reduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < prefixSum.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) prefix_sum[i] += prefix_sum[i - 1];\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) return sum;\n else return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefixSum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < n; ++i) {\n sum += prefixSum[i];\n }\n\n if (rank == 0) return sum;\n return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sumOfPrefixSum = 0.0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sumOfPrefixSum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sumOfPrefixSum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(prefix_sum.data(), &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < n; ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of each rank's portion of x\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Reduce the prefix sums across all ranks\n std::vector global_prefix_sum(x.size());\n MPI_Allreduce(prefix_sum.data(), global_prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the sum of the prefix sum array\n double sum = 0.0;\n for (int i = 0; i < x.size(); i++) {\n sum += global_prefix_sum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefixSum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n double sum = 0;\n MPI_Reduce(&prefixSum[n - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n 
std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < prefixSum.size(); i++) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector prefix_sum(n);\n std::vector global_prefix_sum(n);\n\n // Compute the prefix sum on each rank\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Reduce the prefix sums from all ranks\n MPI_Allreduce(prefix_sum.data(), global_prefix_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum of the prefix sum on rank 0\n if (rank == 0) {\n return global_prefix_sum[n - 1];\n } else {\n return 0.0;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace the i-th element of the array x with the minimum value from indices 0 through i.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 0; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n\n Kokkos::parallel_for(\n Kokkos::RangePolicy>(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n x(i) = x(i);\n } else {\n x(i) = Kokkos::min(x(i), x(i - 1));\n }\n },\n Layout());\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n 
}\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = Kokkos::min(x(i), Kokkos::min(x, i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n float min = x[0];\n for (int j = 1; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = 
min;\n }\n}\n", + "", + "\n\n int n = x.size();\n float min = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for shared(min)\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n std::vector mins(n);\n mins[0] = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n mins[i] = std::min(mins[i - 1], x[i]);\n }\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = mins[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n std::vector min(n);\n min[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min[i] = std::min(min[i - 1], x[i]);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = min[i];\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "serial", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], *std::min_element(x.begin(), x.begin() + i));\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "<, <, 
\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:>\u51a2\u51a2\u51a28\u51a2\u51a2:>\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:>:>", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "s\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2cernecernecerne,\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "hip", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\n__global__ void partialMinimums(float *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n float min = x[0];\n for (int j = 1; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n if (x[i] < min) { min = x[i]; }\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int 
bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n x[i] = min;\n }\n if (rank == 0) {\n MPI_Reduce(&x[0], &x[0], x.size(), MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(&x[0], nullptr, x.size(), MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n } else {\n MPI_Send(&x[rank], 1, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = x[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < size; i++) {\n MPI_Send(&x[i - 1], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n float min;\n MPI_Recv(&min, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x[i - 1] = std::min(x[i - 1], min);\n }\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Bcast(&x[i - 1], 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks onto rank 0\n std::vector min_values(x.size());\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &min_values[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Update the values on rank 0\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], min_values[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = 
x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = x[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < size; i++) {\n MPI_Send(&x[i - 1], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n float min;\n MPI_Recv(&min, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x[i - 1] = std::min(x[i - 1], min);\n }\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Bcast(&x[i - 1], 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "cuda", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\n__global__ void partialMinimums(float *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n float min = x[0];\n for (size_t j = 1; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n __shared__ float min;\n\n if (tid == 0) {\n min = x[bid * bid_size];\n }\n\n __syncthreads();\n\n for (int i = bid * bid_size + tid; i < N; i += bid_size) {\n min = fminf(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < size) {\n float min = x[i];\n for (int j = i + 1; j < size; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, 
x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n if (tid == 0) {\n x[bid * bid_size] = x[bid * bid_size];\n } else {\n x[bid * bid_size + tid] = fminf(x[bid * bid_size + tid], x[bid * bid_size + tid - 1]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank], 1, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks to rank 0\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &x[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct partial minimums\n if (rank != 0) {\n x.clear();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks to rank 0\n MPI_Gather(&x[0], n, MPI_FLOAT, &x[0], n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete list of partial minimums\n if (rank == 0) {\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float *x_arr = &x[0];\n float *min_arr = new float[n];\n\n // Compute the minimums for each rank\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n float min = 
x_arr[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x_arr[j]);\n }\n min_arr[i] = min;\n }\n\n // Gather the minimums from all ranks\n MPI_Gather(min_arr, n, MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n x_arr[i] = min_arr[i];\n }\n }\n\n delete[] min_arr;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector min_values(n);\n#pragma omp parallel for\n for (int i = 0; i < n; i++) {\n min_values[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n min_values[i] = std::min(min_values[i], x[j]);\n }\n }\n MPI_Allgather(&min_values[0], n, MPI_FLOAT, &x[0], n, MPI_FLOAT, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min = x[0];\n\n #pragma omp parallel for schedule(static)\n for (int i = 1; i < n; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_FLOAT, x.data(), n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(x.data(), n, MPI_FLOAT, NULL, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min_val = x[0];\n\n #pragma omp parallel for reduction(min : min_val)\n for (int i = 1; i < n; i++) {\n min_val = std::min(min_val, x[i]);\n }\n\n MPI_Bcast(&min_val, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::min(x[i], min_val);\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_FLOAT, x.data(), n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(x.data(), n, MPI_FLOAT, NULL, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n std::vector globalMinimums(x.size());\n MPI_Allgather(x.data(), x.size(), MPI_FLOAT, globalMinimums.data(), x.size(), MPI_FLOAT, MPI_COMM_WORLD);\n\n // Update the minimums on rank 0\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], globalMinimums[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n std::vector global_min(n);\n\n // Compute local minimums\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n // Reduce local minimums to global minimums\n MPI_Allreduce(local_min.data(), global_min.data(), n, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n // Store global minimums on rank 0\n if (rank == 0) {\n x = global_min;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n MPI_Reduce(local_min.data(), x.data(), n, MPI_FLOAT, MPI_MIN, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector localMin(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n localMin[i] = x[i];\n for (int j = 0; j < i; j++) {\n localMin[i] = std::min(localMin[i], x[j]);\n }\n }\n\n MPI_Allgather(localMin.data(), n, MPI_FLOAT, x.data(), n, MPI_FLOAT, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n MPI_Reduce(local_min.data(), x.data(), n, MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &x[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank != 0) {\n x.clear();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float localMin = x[0];\n float globalMin;\n\n #pragma omp parallel for reduction(min : localMin)\n for (int i = 1; i < n; i++) {\n localMin = std::min(localMin, x[i]);\n }\n\n MPI_Allreduce(&localMin, &globalMin, 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n for (int i = 1; i < n; i++) {\n x[i] = globalMin;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of ranks\n int rank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the partial minimums in parallel\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n x[i] = min;\n }\n\n // Gather the partial minimums from all ranks\n std::vector partialMins(x.size() * numRanks);\n MPI_Gather(x.data(), x.size(), MPI_FLOAT, partialMins.data(), x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // If rank 0, store the result in x\n if (rank == 0) {\n x.clear();\n for (int i = 0; i < partialMins.size(); i += numRanks) {\n x.push_back(partialMins[i]);\n }\n }\n}\n", + "<, Synag", + "\n\n int n = x.size();\n int nthreads = omp_get_num_threads();\n int tid = omp_get_thread_num();\n int chunk = n / nthreads;\n int start = tid * chunk;\n int end = start + chunk;\n if (tid == nthreads - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n x[i] = std::min(x[i], *std::min_element(x.begin(), x.begin() + i));\n }\n omp_barrier();\n if (tid == 0) {\n for (int i = chunk; i < n; i += chunk) {\n for (int j = 0; j < chunk; j++) {\n x[i + j] = 
std::min(x[i + j], x[i + j - chunk]);\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector localMin(n);\n std::vector globalMin(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n localMin[i] = x[i];\n for (int j = 0; j <= i; j++) {\n localMin[i] = std::min(localMin[i], x[j]);\n }\n }\n\n MPI_Allreduce(localMin.data(), globalMin.data(), n, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n x = globalMin;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n std::vector min_values(x.size(), 0.0);\n MPI_Allgather(&x[0], x.size(), MPI_FLOAT, &min_values[0], x.size(), MPI_FLOAT, MPI_COMM_WORLD);\n\n // Update the values on rank 0\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::min(x[i], min_values[i]);\n }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n const int n = x.extent(0);\n View max_so_far = \"max_so_far\";\n View max_ending_here = \"max_ending_here\";\n Kokkos::parallel_for(\"max_subarray\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i) {\n if (i == 0) {\n max_ending_here[i] = x[i];\n max_so_far[i] = x[i];\n } else {\n max_ending_here[i] = std::max(x[i], max_ending_here[i - 1] + x[i]);\n max_so_far[i] = std::max(max_so_far[i - 1], max_ending_here[i]);\n }\n });\n return max_so_far[n - 1];\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::View dp = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n dp(i) = x(i);\n } else {\n dp(i) = std::max(dp(i - 1) + x(i), x(i));\n }\n });\n return *Kokkos::max_element(dp);\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n for (int i = 1; i < x.extent(0); ++i) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int max_sum = 0;\n int current_sum = 0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, int& max_sum) {\n current_sum = std::max(x[i], current_sum + x[i]);\n max_sum = std::max(max_sum, current_sum);\n }, max_sum, tag_type());\n return max_sum;\n}\n", + "\n\n using namespace Kokkos;\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n Kokkos::parallel_scan(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far, int& max_ending_here) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n });\n\n return 
max_so_far;\n}\n", + "\n\n // Initialize the maximum sum to the first element of the array.\n int max_sum = x(0);\n\n // Iterate over the remaining elements of the array.\n for (int i = 1; i < x.extent(0); i++) {\n // If the current element is greater than the sum of the previous element and\n // the maximum sum, then set the maximum sum to the current element.\n if (x(i) > max_sum + x(i - 1)) {\n max_sum = x(i);\n } else {\n // Otherwise, add the current element to the maximum sum.\n max_sum += x(i);\n }\n }\n\n // Return the maximum sum.\n return max_sum;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_for(\n \"maximumSubarray\", Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_so_far);\n\n return max_so_far;\n}\n", + "\n\n Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::Kok\u30b3\u30af_VectorRange(x.extent(0)),\n Kok\u30b3\u30af::plus(),\n Kok\u30b3\u30af::make_iota_view(x.extent(0)),\n [&](int i) {\n return Kok\u30b3\u30af::parallel_reduce(\n Kok\u30b3\u30af::Kok\u30b3\u30af_VectorRange(i, x.extent(0)),\n Kok\u30b3\u30af::plus(),\n x[i],\n [&](int j) { return x[j]; });\n });\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Max;\n using reduction_functor_type = Kokkos::Max;\n using reduction_policy_type = Kokkos::Scan;\n\n int max_so_far = 0;\n int max_ending_here = 0;\n\n Kokkos::parallel_reduce(\n reduction_policy_type(x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_so_far) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_so_far);\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_ending_here, int& max_so_far) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_ending_here, 
max_so_far);\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(int i, int& max_so_far, int& curr_max) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n },\n max_so_far, curr_max);\n\n return max_so_far;\n}\n", + "\n\n // Initialize the maximum sum to the first element of the array\n int max_sum = x(0);\n\n // Initialize the current sum to the first element of the array\n int current_sum = x(0);\n\n // Loop through the remaining elements of the array\n for (int i = 1; i < x.extent(0); i++) {\n // Update the current sum by adding the current element\n current_sum = std::max(x(i), current_sum + x(i));\n\n // Update the maximum sum if the current sum is greater\n max_sum = std::max(max_sum, current_sum);\n }\n\n // Return the maximum sum\n return max_sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. 
if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_sum = INT_MIN;\n int sum = 0;\n #pragma omp parallel for reduction(max:max_sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n max_sum = std::max(max_sum, sum);\n if (sum < 0) {\n sum = 0;\n }\n }\n return max_sum;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n max_ending_here = std::max(x[i], max_ending_here + x[i]);\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n int n = x.size();\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n max_ending_here = std::max(x[i], max_ending_here + x[i]);\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < 
max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (size_t i = 0; i < x.size(); ++i) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0, max_ending_here = 0;\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (size_t i = 0; i < x.size(); ++i) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_sum = INT_MIN;\n int current_sum = 0;\n #pragma omp parallel for reduction(max:max_sum)\n for (int i = 0; i < x.size(); ++i) {\n current_sum = std::max(x[i], current_sum + x[i]);\n max_sum = std::max(max_sum, current_sum);\n }\n return max_sum;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n int n = x.size();\n #pragma parallel for reduction(+:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n" + ] 
+ }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "serial", + "prompt": "/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN;\n int 
max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "ontiguous, Synag", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Store the result in sum.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\n__global__ void maximumSubarray(const int *x, size_t N, int *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = hipMax(x[i], max_ending_here + x[i]);\n max_so_far = hipMax(max_so_far, max_ending_here);\n }\n hipAtomicMax(sum, &max_so_far);\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid >= N) return;\n\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = block.thread.x;\n int stride = block.thread.x + block.thread.y * block.dim.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int stride = hipBlockDim_x * hipGridDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int s = 0;\n int max = 0;\n for (size_t i = tid; i < N; i += blockDim.x * gridDim.x) {\n s = max(x[i], s + x[i]);\n max = max(max, s);\n }\n atomicAdd(sum, max);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int stride = bid_size * hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = start; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int tid_offset = tid + bid * bid_size;\n int sum_local = 0;\n int max_sum = 0;\n if (tid_offset < N) {\n sum_local = x[tid_offset];\n }\n __syncthreads();\n for (int i = tid_offset + bid_size; i < N; i += bid_size) {\n sum_local = max(sum_local + x[i], x[i]);\n max_sum = max(max_sum, 
sum_local);\n }\n if (tid == 0) {\n *sum = max_sum;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n __syncthreads();\n if (tid == 0) *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (i < N) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n i += hipBlockDim_x * hipGridDim_x;\n }\n *sum = max_so_far;\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int stride = blockDim.x * gridDim.x;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = tid; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bsize = hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int stride = bsize * hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N && i < start + stride; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n if (tid == 0) *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipBlockDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n hipAtomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int 
max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n hipAtomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int stride = bid_size * hipGridDim_x;\n\n int local_max = 0;\n int global_max = 0;\n\n for (int i = tid; i < N; i += stride) {\n local_max = max(x[i], local_max + x[i]);\n global_max = max(global_max, local_max);\n }\n\n if (tid == 0) {\n *sum = global_max;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n int max_so_far = 0, max_ending_here = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_max;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_sum = x[0];\n int sum = x[0];\n for (int i = 1; i < n; i++) {\n sum = std::max(x[i], sum + x[i]);\n max_sum = std::max(max_sum, sum);\n }\n return max_sum;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n int global_max;\n MPI_Allreduce(&max_so_far, &global_max, 
1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "(4, -1, 2, 1) = 6.", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n int maxSum = INT_MIN;\n int sum = 0;\n for (int i = myRank; i < x.size(); i += numRanks) {\n sum += x[i];\n maxSum = std::max(maxSum, sum);\n if (sum < 0) {\n sum = 0;\n }\n }\n\n MPI_Allreduce(&maxSum, &maxSum, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxSum;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", 
+ "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n\n return max_so_far;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Store the result in sum.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\n__global__ void maximumSubarray(const int *x, size_t N, int *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Each thread computes the sum of its subarray\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int partialSum = 0;\n for (int i = tid; i < N; i += blockDim.x * gridDim.x) {\n partialSum += x[i];\n }\n\n // Each thread writes its partial sum to global memory\n __shared__ int partialSums[blockDim.x];\n partialSums[threadIdx.x] = partialSum;\n __syncthreads();\n\n // Each thread computes the maximum of its partial sum and the partial sums of the previous threads\n for (int i = blockDim.x / 2; i > 0; i >>= 1) {\n if (threadIdx.x < i) {\n partialSums[threadIdx.x] = max(partialSums[threadIdx.x], partialSums[threadIdx.x + i]);\n }\n __syncthreads();\n }\n\n // The first thread writes the maximum sum to global memory\n if (threadIdx.x == 0) {\n *sum = partialSums[0];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n\n int maxSoFar = x[tid];\n int maxEndingHere = x[tid];\n\n for (int i = tid + blockDim; i < N; i += blockDim * gridDim) {\n maxEndingHere = maxEndingHere + x[i];\n if (maxSoFar < maxEndingHere) {\n maxSoFar = maxEndingHere;\n }\n if (maxEndingHere < 0) {\n maxEndingHere = 0;\n }\n }\n\n int maxOverall = maxSoFar;\n\n if (blockId == 0) {\n for (int i = 1; i < gridDim; i++) {\n int temp = maxOverall;\n maxOverall = max(maxOverall, maxSoFar);\n maxSoFar = max(temp, maxSoFar);\n }\n }\n\n if (blockId == 0) {\n *sum = maxOverall;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n int max_sum = INT_MIN;\n int current_sum = 0;\n for (int i = start; i < end; i++) {\n current_sum = max(x[i], current_sum + x[i]);\n max_sum = max(max_sum, current_sum);\n }\n sum[tid] = max_sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int block_start = block_id * block_size;\n int block_end = min(block_start + block_size, N);\n int max_so_far = x[block_start];\n int max_ending_here = x[block_start];\n for (int i = block_start + 1; i < block_end; i++) {\n max_ending_here = max(x[i], 
max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n __shared__ int shared_max[block_size];\n shared_max[tid] = max_so_far;\n __syncthreads();\n if (tid == 0) {\n int max_overall = shared_max[0];\n for (int i = 1; i < block_size; i++) {\n max_overall = max(max_overall, shared_max[i]);\n }\n *sum = max_overall;\n }\n}\n", + "\n\n int max_ending_here = x[0];\n int max_so_far = x[0];\n for (size_t i = 1; i < N; ++i) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n int start = tid + block * stride;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (start < N) {\n max_ending_here = max_ending_here + x[start];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n start += stride;\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int sum = 0;\n int maxSum = -2147483648;\n for (int i = tid; i < N; i += blockIdx.x * blockDim.x) {\n sum += x[i];\n maxSum = max(maxSum, sum);\n if (sum < 0) sum = 0;\n }\n sum = maxSum;\n *sum = maxSum;\n}\n", + "\n\n int my_sum = 0;\n int max_sum = 0;\n int max_sum_so_far = 0;\n int start = 0;\n int end = 0;\n int s = threadIdx.x;\n int e = threadIdx.x + blockDim.x - 1;\n if (e >= N) e = N - 1;\n for (int i = s; i <= e; i++) {\n my_sum += x[i];\n if (my_sum > max_sum_so_far) {\n max_sum_so_far = my_sum;\n start = s;\n end = i;\n }\n if (my_sum < 0) {\n my_sum = 0;\n }\n }\n if (max_sum_so_far > max_sum) {\n max_sum = max_sum_so_far;\n *sum = max_sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = tid; i < N; i += blockDim.x) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; ++i) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int stride = blockDim * gridDim;\n int start = tid + blockId * blockDim;\n int max_so_far = 0;\n int max_ending_here = 0;\n while 
(start < N) {\n max_ending_here = max_ending_here + x[start];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n start += stride;\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n }\n *sum = max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n for (size_t i = 1; i < N; ++i) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int sum_so_far = 0;\n int max_sum = 0;\n if (tid < N) {\n sum_so_far = x[tid];\n max_sum = x[tid];\n }\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n sum_so_far = fmax(x[i], sum_so_far + x[i]);\n max_sum = fmax(max_sum, sum_so_far);\n __syncthreads();\n }\n if (tid == 0) { *sum = max_sum; }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int max_so_far = 0;\n int max_ending_here = 0;\n\n for (int i = tid; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n sum[bid] = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n int chunk_size = n / num_ranks;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == num_ranks - 1) {\n end = n;\n }\n#pragma omp parallel for reduction(max : max_ending_here)\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n MPI_Allreduce(&max_so_far, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_so_far;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk = n / nprocs;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == nprocs - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int max_global;\n MPI_Reduce(&max_so_far, &max_global, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_global;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk_size = n / nprocs;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == nprocs - 1) {\n end = n;\n }\n#pragma omp parallel\n {\n int max_so_far_local = 0, max_ending_here_local = 0;\n#pragma omp for\n for (int i = start; i < end; i++) {\n max_ending_here_local = max_ending_here_local + x[i];\n if (max_so_far_local < max_ending_here_local) {\n max_so_far_local = max_ending_here_local;\n }\n if (max_ending_here_local < 0) {\n max_ending_here_local = 0;\n }\n }\n#pragma omp critical\n {\n if (max_so_far < max_so_far_local) {\n max_so_far = max_so_far_local;\n }\n }\n }\n MPI_Reduce(&max_so_far, &max_ending_here, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_ending_here;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallel for\n#pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // MPI reduction\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel\n {\n int local_max_so_far = INT_MIN, local_max_ending_here = 0;\n #pragma omp for\n for (int i = rank; i < n; i += nprocs) {\n local_max_ending_here = local_max_ending_here + x[i];\n if (local_max_so_far < 
local_max_ending_here)\n local_max_so_far = local_max_ending_here;\n if (local_max_ending_here < 0)\n local_max_ending_here = 0;\n }\n #pragma omp critical\n {\n if (max_so_far < local_max_so_far)\n max_so_far = local_max_so_far;\n }\n }\n return max_so_far;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int max_local = 0;\n int max_global = 0;\n\n #pragma omp parallel for reduction(max : max_local)\n for (int i = 0; i < n; i++) {\n max_local = std::max(x[i], max_local + x[i]);\n }\n\n MPI_Allreduce(&max_local, &max_global, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return max_global;\n } else {\n return 0;\n }\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel num_threads(4)\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n\n int global_max = max_so_far;\n MPI_Allreduce(&global_max, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallelization\n #pragma omp parallel\n {\n // Each thread computes the maximum subarray sum for its portion of the array\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n\n // MPI reduction to find the maximum subarray sum across all ranks\n int global_max;\n MPI_Allreduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n int chunk_size = n / num_threads;\n int start = tid * chunk_size;\n int end = start + chunk_size;\n if (tid == num_threads - 1) {\n end = n;\n }\n\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n int chunk_size = n / num_ranks;\n int start = my_rank * chunk_size;\n int end = start + chunk_size;\n if (my_rank == num_ranks - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if 
(max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // Reduce the maximum subarray sum across all ranks\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallel for loop\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // MPI reduce to get the maximum subarray sum\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_max;\n } else {\n return 0;\n }\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = std::min(start + chunk_size, n);\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n 
}\n\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int nthreads = omp_get_num_threads();\n int chunk = n / nthreads;\n int start = tid * chunk;\n int end = (tid + 1) * chunk;\n if (tid == nthreads - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_max;\n }\n return 0;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(Kokkos::View const& x, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n const int N = x.extent(0);\n parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = sum + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output);\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = output(i - 1) + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i, double& update) {\n update += x(i);\n output(i) = update;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = sum + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output(0));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) 
{\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n Kokkos::parallel_for(\"prefix_sum\", Kokkos::RangePolicy(0, output.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = x(i);\n } else {\n output(i) = output(i - 1) + x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output(0));\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n const size_t n = x.extent(0);\n output(0) = x(0);\n parallel_for(policy_type(1, n), KOKKOS_LAMBDA(const size_t i) {\n output(i) = output(i - 1) + x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"prefixSum\", output.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "Kok", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefix_sum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n const size_t size = x.extent(0);\n output(0) = x(0);\n parallel_for(policy_type(1, size), KOKKOS_LAMBDA(const size_t i) {\n output(i) = output(i - 1) + x(i);\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::Range(0, N), KOKKOS_LAMBDA (const int i) {\n if (i == 0) {\n output(i) = x(i);\n } else {\n output(i) = output(i - 1) + x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use OpenMP to compute in parallel.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp 
parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "<", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n double sum = 0;\n for (int j = 0; j <= i; j++) {\n sum += x[j];\n }\n output[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "serial", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (std::size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "<", + "<", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for 
(size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\n__global__ void prefixSum(const double* x, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n if (tid > 0) { output[tid] += output[tid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n \n if (gid < N) {\n output[gid] = x[gid];\n }\n \n __syncthreads();\n \n for (int stride = 1; stride < N; stride *= 2) {\n if (gid < N && gid + stride < N) {\n output[gid + stride] += output[gid];\n }\n 
__syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bsize = blockDim.x;\n int stride = bsize * bid;\n int i = stride + tid;\n if (i < N) {\n output[i] = x[i];\n if (i > 0) { output[i] += output[i - 1]; }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (gid % (2 * i) == 0 && gid + i < N) {\n output[gid + i] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n output[idx] = x[idx];\n if (idx > 0) {\n output[idx] += output[idx - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n 
Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n double global_sum = 0;\n\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n output.push_back(global_sum);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n MPI_Allreduce(&local_sum, &output[i], 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += 
x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partialSum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n partialSum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partialSums(size);\n MPI_Gather(&partialSum, 1, MPI_DOUBLE, partialSums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partialSums[0];\n for (size_t i = 1; i < size; ++i) {\n output[i] = output[i - 1] + partialSums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum on each rank\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local sums to rank 0\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n for (int i = 1; i < size; ++i) {\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allgather(&local_sum, 1, MPI_DOUBLE, &global_sum, 1, MPI_DOUBLE, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector partial_sum(n);\n MPI_Allreduce(x.data(), partial_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n output = partial_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] - x[i - 1];\n }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\n__global__ void prefixSum(const double *x, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (gid % (2 * i) == 0 && gid + i < N) {\n output[gid] += output[gid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n output[idx] = 0;\n for (int i = 0; i <= idx; i++) {\n output[idx] += x[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 2; stride <= N; stride *= 2) {\n if (tid % stride == 0 && tid + stride / 2 < N) {\n output[tid] += output[tid + stride / 2];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n // Compute the prefix sum of the first bid_size elements in x\n double sum = 0;\n for (int i = tid; i < bid_size; i++) {\n sum += x[i];\n }\n\n // Store the prefix sum in the output array\n output[tid] = sum;\n\n // Synchronize the threads in the block\n __syncthreads();\n\n // Compute the prefix sum of the remaining elements in x\n for (int i = bid_size; i < N; i += bid_size) {\n // Load the prefix sum of the previous block\n sum = output[i - bid_size];\n\n // Compute the prefix sum of the current block\n for (int j = tid; j < bid_size; j++) {\n sum += x[i + j];\n }\n\n // Store the prefix sum in the output array\n output[i + tid] = sum;\n\n // Synchronize the threads in the block\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n // Compute the prefix sum\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n if (gid < N) {\n output[gid] = x[gid];\n }\n\n __syncthreads();\n\n for (int offset = 1; offset < N; offset *= 2) {\n if (gid + offset < N) {\n output[gid + offset] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n 
__syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n // Each thread computes the prefix sum of a subset of the elements in x.\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n // Compute the prefix sum of the current block\n double block_sum = 0;\n for (int i = tid; i < N; i += bid_size) {\n block_sum += x[i];\n output[i] = block_sum;\n }\n\n // Synchronize the threads in the block\n __syncthreads();\n\n // Compute the prefix sum of the previous blocks\n if (tid == 0) {\n for (int i = bid + 1; i < N; i += bid_size) {\n output[i] += output[i - 1];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "(", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int size = blockDim.x;\n int stride = size * gridDim.x;\n int start = tid + bid * stride;\n double sum = 0;\n for (int i = start; i < N; i += stride) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "_t", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the prefix sum of the vector x into output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Store the result in output on rank 0.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+: local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Gather the local prefix sums\n std::vector global_sum(x.size(), 0);\n MPI_Allreduce(output.data(), global_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM);\n\n // Store the global prefix sum on rank 0\n if (rank == 0) {\n output = global_sum;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n // Compute the local prefix sum\n std::vector local_prefix_sum(n);\n local_prefix_sum[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n // Compute the global prefix sum\n std::vector global_prefix_sum(n);\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n // Store the result on rank 0\n if (rank == 0) {\n output = global_prefix_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Reduce(partial_sum.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Bcast(output.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n #pragma omp parallel for\n for (int j = 1; j < n; j++) {\n output[j] += output[j - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes the sum of its local data\n double local_sum = 0;\n #pragma omp parallel for reduction(+: local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n // Gather the local sums into the output vector on rank 0\n MPI_Gather(&local_sum, 1, MPI_DOUBLE, output.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n \n // Rank 0 computes the prefix sum\n if (rank == 0) {\n for (size_t i = 1; i < size; ++i) {\n output[i] += output[i - 1];\n }\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the partial sum on each rank\n double partialSum = 0;\n #pragma omp parallel for reduction(+:partialSum)\n for (int i = 0; i < x.size(); i++) {\n partialSum += x[i];\n }\n\n // Gather the partial sums from all ranks\n std::vector partialSums(numRanks);\n MPI_Gather(&partialSum, 1, MPI_DOUBLE, partialSums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (myRank == 0) {\n output[0] = 0;\n for (int i = 1; i < numRanks; i++) {\n output[i] = output[i - 1] + partialSums[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce local prefix sums to get global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce local prefix sums to global prefix sum\n MPI_Reduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n, 0);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n if (i > 0) {\n partial_sum[i] += partial_sum[i - 1];\n }\n }\n\n std::vector global_sum(n, 0);\n MPI_Allreduce(partial_sum.data(), 
global_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0.0;\n #pragma omp parallel for reduction(+:partial_sum)\n for (int i = 0; i < x.size(); ++i) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; ++i) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n if (i > 0) {\n partial_sum[i] += partial_sum[i - 1];\n }\n }\n\n MPI_Reduce(partial_sum.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": 
"transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) *= -1;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n int &xi = x(i);\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n int &xi = x(i);\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n int val = x(i);\n if (val % 2 == 0) {\n x(i) = val / 2;\n } else {\n x(i) = -val;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) = x(i) / 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + 
"\n\n Kok koste::parallel_for(0, x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "omp", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + ":", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma\u00a0omp\u00a0parallel\u00a0for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] *= -1;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "serial", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "negate, Synag", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (auto &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "negate, Synag", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "hip", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\n__global__ void negateOddsAndHalveEvens(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipGetGlobalThreadIdx();\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0)\n x[tid] /= 2;\n else\n x[tid] = -x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n" + ] + }, + { + "problem_type": 
"transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "E", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0,\n MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0,\n MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[0], n / size, MPI_INT, &x[0], n / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Negate odd values\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n }\n }\n\n // Halve even values\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n }\n }\n\n // Gather results on rank 0\n if (rank == 0) {\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_INT, result.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n x = result;\n } else {\n MPI_Gather(nullptr, 0, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, 
MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), 
MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its portion of x.\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(x.data(), x.size(), MPI_INT, x.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data(), x.size(), MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its portion of the vector.\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "cuda", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\n__global__ void negateOddsAndHalveEvens(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0)\n x[tid] /= 2;\n else\n x[tid] = -x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int n_local = n / nprocs;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], n_local, MPI_INT, &x[0], n_local, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its part of the vector\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], end - start, MPI_INT, &x[0], end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int local_size = x.size() / size;\n int remain = x.size() % size;\n int local_start = rank * local_size;\n int local_end = local_start + local_size;\n if (rank < remain) {\n local_end++;\n }\n\n#pragma omp parallel for\n for (int i = local_start; i < local_end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(MPI_IN_PLACE, local_end - local_start, MPI_INT, x.data(), local_end - local_start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int myRank, nProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &nProcs);\n\n int n = x.size();\n int chunkSize = (n + nProcs - 1) / nProcs;\n int begin = myRank * chunkSize;\n int end = std::min(begin + chunkSize, n);\n\n #pragma omp parallel for\n for (int i = begin; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[begin], end - begin, MPI_INT, &x[0], end - begin, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] = x[i] / 2;\n }\n }\n MPI_Gather(&x[start], chunk, MPI_INT, &x[0], chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its part of the vector\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int local_n = n / nprocs;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if 
(x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, n_local, MPI_INT, x.data(), n_local, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], chunk, MPI_INT, &x[0], chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "kokkos", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(Kokkos::View const& x, Kokkos::View &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKok\u96f9::ParallelFor(Kok\u96f9::Range(x.size()), [&](int i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t});\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const int& i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t});\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", mask.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + ",", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "omp", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t#pragma omp parallel 
for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "serial", + "prompt": "bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t mask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tmask.reserve(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + 
"\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.resize(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tmask.reserve(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t mask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "hip", + "prompt": "__device__ bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\n__global__ void mapPowersOfTwo(const int *x, bool *mask, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = get_global_id(0);\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + ", 0, 9, 7, 15, 64, 3]", + ", 0, 9, 7, 15, 64, 3]", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "mpi", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored in mask on rank 0.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint chunk_size = (n + size - 1) / size;\n\tint start = rank * chunk_size;\n\tint end = std::min(start + chunk_size, n);\n\n\tstd::vector local_mask(end - start);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), end - start, MPI_BOOL, mask.data(), end - start, MPI_BOOL, 0);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tmask.resize(n);\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), n / size, MPI_BOOL, mask.data(), n / size, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_C_BOOL, mask.data(), local_n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tmask.resize(x.size());\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), mask.size(), MPI_C_BOOL, nullptr, 0, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_C_BOOL, mask.data(), local_n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tstd::vector localMask(x.size());\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tif (rank == 0) {\n\t\tmask = localMask;\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tstd::vector recvMask(x.size());\n\t\t\tMPI_Recv(&recvMask[0], x.size(), MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t\tfor (size_t j = 0; j < x.size(); j++) {\n\t\t\t\tmask[j] = mask[j] || recvMask[j];\n\t\t\t}\n\t\t}\n\t} else {\n\t\tMPI_Send(&localMask[0], x.size(), MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the result\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), mask.size(), MPI_BOOL, mask.data(), mask.size(), MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the mask\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t}\n\telse {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the mask\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint 
rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_BOOL, mask.data(), local_n, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "cuda", + "prompt": "__device__ bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\n__global__ void mapPowersOfTwo(const int *x, bool *mask, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\twhile (i < N) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t\ti += blockDim.x * gridDim.x;\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored in mask on rank 0.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own mask\n\tstd::vector local_mask(x.size());\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the local masks into the final mask on rank 0\n\tif (rank == 0) {\n\t\tmask.resize(x.size());\n\t}\n\tMPI_Gather(local_mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < static_cast(x.size()); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) 
{\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own mask\n\tstd::vector localMask(x.size());\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Rank 0 gathers all the local masks\n\tif (rank == 0) {\n\t\tmask.resize(x.size());\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tstd::vector recvMask(x.size());\n\t\t\tMPI_Recv(recvMask.data(), x.size(), MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t\tfor (int j = 0; j < x.size(); j++) {\n\t\t\t\tmask[j] |= recvMask[j];\n\t\t\t}\n\t\t}\n\t} else {\n\t\tMPI_Send(localMask.data(), x.size(), MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint n = x.size();\n\tmask.resize(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), n, MPI_BOOL, mask.data(), n, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its local part of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < 
n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the result\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the result\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * mask.size() / size], mask.size() / size, MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * mask.size() / size], mask.size() / size, MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "serial", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &element : x) {\n element = 1 - 1 / element;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &i : x) {\n i = 1 - 1 / i;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &val : x) {\n val = 1 - 1 / val;\n }\n}\n", + "\n\n for (double &i : x) {\n i = 1 - 1 / i;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": 
"57_transform_inverse_offset", + "parallelism_model": "hip", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\n__global__ void oneMinusInverse(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n x[gid] = 1 - 1 / x[gid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, gathered_x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], x.size() / size, MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i 
< n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = 1 - 1 / x[i];\n }\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE, NULL, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 has the final result\n if (rank == 0) {\n x.resize(x.size() * size);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], 
x.size() / size,\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the inverse of its elements\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], x.size() / size,\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its part of the result\n for (size_t i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "cuda", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\n__global__ void oneMinusInverse(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) { x[tid] = 1.0 - 1.0 / x[tid]; }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int local_size = x.size() / num_procs;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n if (rank == num_procs - 1) {\n end = x.size();\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n MPI_Reduce(x.data(), x.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank == size - 1) ? 
n : start + chunk_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * chunk_size;\n int end = (i == size - 1) ? n : start + chunk_size;\n MPI_Recv(&x[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = global_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n const int n = x.size();\n const int chunk = (n + size - 1) / size;\n const int start = rank * chunk;\n const int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; ++i) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. 
Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "serial", + "prompt": "/* Compute the ReLU function on every element of x. 
Elements less than zero become zero,\n while elements greater than zero stay the same.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "hip", + "prompt": "/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\n__global__ void relu(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = fmax(0.0, x[idx]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? 
x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + ": [-1.8, 24.0, 1.2, 0.0, 5.1, 0.2, 4.5]", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n std::vector recv_buf(x.size());\n for (int i = 1; i < MPI_COMM_WORLD.size(); i++) {\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector temp(x.size());\n MPI_Recv(temp.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += temp[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + 
"\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks to rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / 
size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk_size 
= n / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "cuda", + "prompt": "/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\n__global__ void relu(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = fmax(x[tid], 0.0);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = fmax(0, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? 
x[tid] : 0;\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = fmax(0, x[idx]);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = fmax(0.0, x[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] < 0 ? 0 : x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = fmax(0, x[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int index = blockIdx.x * blockDim.x + threadIdx.x;\n if (index < N) {\n x[index] = fmax(0, x[index]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int tid = threadId.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = fmax(0.0, x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma opm parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Reduce(&x[0], &x[0], x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int chunk_size = n / nprocs;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == nprocs - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, local_size, MPI_DOUBLE, x.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n #pragma omp parallel for\n for (int i = start_index; i < end_index; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start_index], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n if (rank == size - 1) {\n end = x.size();\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < 
x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n MPI_Gather(&x[start], chunk_size, MPI_DOUBLE, &x[0], chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the ReLU function on its portion of x\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each 
rank computes the ReLU function on its own copy of x\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks onto rank 0\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 has the final result\n if (rank == 0) {\n x.resize(x.size() * size);\n MPI_Gatherv(nullptr, 0, MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf;\n MPI_Recv(&recv_buf, x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv_buf.begin(), recv_buf.end());\n }\n } else {\n MPI_Send(&x, x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the ReLU function on every element of x\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < MPI_COMM_WORLD; i++) {\n std::vector recv_buf(n);\n MPI_Recv(recv_buf.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, x.extent(0)), KOK_LAMBDA(int i) {\n x[i] = x[i] * x[i];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n #pragma simd\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "serial", + "prompt": "/* Replace every element of x with the square of its value.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + ":", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &xi : x) {\n xi = xi * xi;\n }\n}\n", + "\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "hip", + "prompt": "/* Replace every element of x with the square of its value.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\n__global__ void squareEach(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data(), n / size, MPI_INT, x.data(), n / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_Comm_world, &rank);\n\n // Get the number of processes\n int size;\n MPI_Comm_size(MPI_Comm_world, &size);\n\n // Each process squares its own elements\n for (int i = rank; i < x.size(); i += size) {\n x[i] = x[i] * x[i];\n }\n\n // Gather all the squared elements on rank 0\n MPI_Allgather(MPI_IN_PLACE, 0, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_Comm_world);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank squares its elements\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the squared elements\n std::vector squared;\n if (rank == 0) {\n squared.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, squared.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Rank 0 stores the final result\n if (rank == 0) {\n x = squared;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Status status;\n MPI_Request request;\n MPI_Irecv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (int i = rank; i < x.size(); i += size) {\n x[i] *= x[i];\n }\n MPI_Gather(x.data(), x.size() / size, MPI_INT, x.data(), x.size() / size,\n MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Square each element of x\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Gather all the squared elements on rank 0\n std::vector squared_x;\n if (rank == 0) {\n squared_x.resize(x.size() * 
MPI::COMM_WORLD.Get_size());\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, squared_x.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Copy the squared elements back to x on rank 0\n if (rank == 0) {\n x = squared_x;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector temp(x.size());\n MPI_Recv(&temp[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += temp[j];\n }\n }\n }\n}\n", + "(", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n MPI_Gather(x.data(), n / size, MPI_INT, x.data(), n / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector tmp(n);\n MPI_Recv(tmp.data(), n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n 
}\n\n MPI_Gather(&x[0], n / size, MPI_INT, &x[0], n / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "cuda", + "prompt": "/* Replace every element of x with the square of its value.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\n__global__ void squareEach(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = thread_idx_1;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= 
x[idx];\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace every element of x with the square of its value.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank squares its own elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(n);\n MPI_Recv(&recv[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, chunk, MPI_INT, x.data(), chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = 
x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n int recv_buf[n];\n MPI_Recv(&recv_buf[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv_buf[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank squares its own elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector temp(n);\n MPI_Recv(&temp[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += temp[j];\n }\n }\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Each rank squares its elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < MPI_COMM_WORLD; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; 
i++) {\n std::vector recv(n);\n MPI_Recv(&recv[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Square each element of x in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Gather the squared elements from all ranks to rank 0\n std::vector squared_x(x.size());\n MPI_Gather(x.data(), x.size(), MPI_INT, squared_x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = squared_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_size, MPI_INT, &x[0], local_size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_size = x.size() / size;\n int start = my_rank * local_size;\n int end = start + local_size;\n if (my_rank == size - 1) {\n end = x.size();\n }\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n MPI_Gather(&x[start], local_size, MPI_INT, &x[0], local_size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use Kokkos to parallelize. 
Assume Kokkos has already been initialized.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, Kokkos::View const& A, Kokkos::View const& x, double beta, Kokkos::View &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(i); j < A.extent(i+1); ++j) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(i); j > 0; --j) {\n y(i) += alpha * A(i, j - 1).value * x(A(i, j - 1).column);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i, double &y_i) {\n y_i = beta * y_i;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n y_i += alpha * A(j).value * x(A(j).column);\n }\n }\n }, y);\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n auto spvmv_lambda = KOKKOS_LAMBDA (const COOElement *a, double *y) {\n y[a->row] += a->value * alpha * x[a->column];\n };\n\n KOKKOS_PARALLEL_EXEC(KOKKOS_RANGE_POLICY(0, A.size), spvmv_lambda, A.data, y);\n\n KOKKOS_PARALLEL_FOR(KOKKOS_RANGE_POLICY(0, M), KOKKOS_LAMBDA(size_t i) {\n y[i] = y[i] * alpha + 
x[i] * alpha;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", M, KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i, double &y_i) {\n y_i = beta * y_i;\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n y_i += alpha * A(j).value * x(A(j).column);\n }\n }\n }, y);\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.row_map(i); j < A.row_map(i + 1); ++j) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, M), [&](size_t i) {\n y[i] = alpha * Kok\u30b3\u30af::reduce(Kok\u30b3\u30af::Range(0, A.size()), 0.0, [&](size_t j, double sum) {\n if (A[j].row == i) {\n return sum + A[j].value * x[A[j].column];\n }\n return sum;\n }) + uyi;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use OpenMP to parallelize.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * 
x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for schedule(static)\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n 
}\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n if (e.row < M && e.column < N) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] = y[i] * alpha + y[i] *beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n if (e.row < M && e.column < N) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : 
A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format with sizeA elements.\n A has dimensions MxN, x has N values, and y has M values.\n Use AMD HIP to parallelize. The kernel will be launched with at least sizeA threads.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\n__global__ void spmv(double alpha, const COOElement *A, size_t sizeA, const double *x, double beta, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n size_t tid = hipThreadIdx_x;\n if (tid < M) {\n atomicMul(&y[tid], beta);\n }\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < M) {\n double dot = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == tid) {\n dot += A[i].value * x[A[i].column];\n }\n }\n y[tid] = alpha * dot + \u3079\u305f * y[tid];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[blockIdx.x], beta);\n}\n", + "\n\n size_t i = hipThreadIdx_x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n atomicAdd(&y[row], alpha * val * x[col]);\n }\n atomicMul(&y[hipThreadIdx_x], beta);\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double 
sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = beta * y[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += alpha * A[i].value * x[A[i].column];\n }\n }\n y[row] = sum;\n }\n}\n", + "\nfor (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {\n double dot = 0.0;\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == i) {\n dot += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * dot + \u3079\u305f * y[i];\n }\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[i], beta);\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double dot = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n dot += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * dot + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t block_size = hipBlockDim_x;\n size_t block_id = hipBlockIdx_x;\n size_t start = block_id * block_size + tid;\n size_t end = min(start + block_size, sizeA);\n\n for (size_t i = start; i < end; i++) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n\n y[tid] *= beta;\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": 
"47_sparse_la_spmv", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use MPI to parallelize. Assume MPI has already been initialized.\n Every rank has a complete copy of A, x, and y. Store the result in y on rank 0.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the result\n for (size_t i = rank; i < M; i += size) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[i * M], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[rank * M], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per rank\n size_t elements_per_rank = A.size() / size;\n\n // Compute the starting index for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = beta * y[i];\n }\n\n // Compute the local portion of A*x\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local contribution to y\n for (size_t i = 0; i < A.size(); ++i) {\n if (A[i].row == rank) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(MPI_IN_PLACE, y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n for (size_t i = rank; i < M; i += size) {\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to the result on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; 
i++) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (M + size - 1) / size;\n\n // Compute the start and end rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, M);\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (auto const& element : A) {\n if (element.row >= start_row && element.row < end_row) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + ":", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the size of the local part of y\n size_t local_M = M / MPI::COMM_WORLD.Get_size();\n size_t start_row = local_M * MPI::COMM_WORLD.Get_rank();\n size_t end_row = start_row + local_M;\n\n // Compute the local part of y\n for (size_t i = start_row; i < end_row; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Gather the local parts of y to rank 0\n MPI::COMM_WORLD.Gather(&y[start_row], local_M, MPI::DOUBLE, &y[0], local_M, MPI::DOUBLE, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own partial sum\n std::vector partial_y(M, 0.0);\n for (auto const& element : A) {\n if (element.row < M / size * rank || element.row >= M / size * (rank + 1)) {\n continue;\n }\n partial_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce partial sums to rank 0\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 scales the result by beta\n if (rank == 0) {\n for (auto& value : y) {\n value *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (COOElement const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n}\n", + "\n\n // Compute the local portion of the result\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the results from all ranks to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Scale the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (auto& value : y) {\n value *= beta;\n }\n }\n}\n", + "\n\n // COO format:\n // A[i] = {row, column, value}\n // where A[i] is the i-th non-zero element in the 
matrix\n\n // y = alpha*A*x + y\n std::iota(y.begin(), y.end(), 0);\n std::transform(y.begin(), y.end(), x.begin(), y.begin(),\n [alpha, &A](double y, double x) {\n double sum = 0;\n for (auto const& a : A) {\n sum += a.value * (alpha * x);\n }\n return y + sum;\n });\n\n // y = y + (beta*y)\n std::transform(y.begin(), y.end(), y.begin(),\n [beta](double y) {\n return y + (beta * y);\n });\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local contribution to y\n std::vector local_y(M, 0.0);\n for (auto const& element : A) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Add beta*y to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] += ampi * y[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements each rank is responsible for\n size_t elements_per_rank = A.size() / size;\n\n // Compute the starting index of the elements for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index of the elements for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n for (size_t i = rank; i < M; i += size) {\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to the result on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Compute the number of elements per rank\n size_t num_elements_per_rank = A.size() / MPI::COMM_WORLD.Get_size();\n\n // Compute the starting index for this rank\n size_t start_index = num_elements_per_rank * MPI::COMM_WORLD.Get_rank();\n\n // Compute the ending index for this rank\n size_t end_index = start_index + num_elements_per_rank;\n\n // Initialize the local portion of y to beta*y\n std::vector local_y(M, beta * y[MPI::COMM_WORLD.Get_rank()]);\n\n // Perform the sparse matrix-vector multiplication for this rank\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI::COMM_WORLD.Reduce(local_y.data(), y.data(), M, MPI::DOUBLE, MPI::SUM, 0);\n}\n", + "\n\n // Broadcast alpha, beta, M, and N to all ranks\n MPI_Bcast(&alpha, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&beta, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&M, 1, MPI_INT, 0, MPI_COMM_WORLD);\n MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the number of elements per rank\n int num_elements_per_rank = A.size() / 
MPI_COMM_SIZE;\n\n // Scatter A to all ranks\n std::vector local_A(num_elements_per_rank);\n MPI_Scatter(A.data(), num_elements_per_rank, MPI_BYTE, local_A.data(), num_elements_per_rank, MPI_BYTE, 0, MPI_COMM_WORLD);\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (const auto& element : local_A) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather the local contributions to y\n std::vector global_y(M);\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, global_y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Apply beta to y\n if (beta != 0.0) {\n for (auto& element : global_y) {\n element += beta * element;\n }\n }\n\n // Replace y with the global result\n y = std::move(global_y);\n}\n", + "\n\n // Initialize y on all ranks to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute the local contribution of each rank to the result\n for (auto const& element : A) {\n if (element.row < M) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n }\n\n // Reduce the local contributions to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format with sizeA elements.\n A has dimensions MxN, x has N values, and y has M values.\n Use CUDA to parallelize. The kernel will be launched with at least sizeA threads.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\n__global__ void spmv(double alpha, const COOElement *A, size_t sizeA, const double *x, double beta, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n y[blockIdx.x] *= beta;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n y[tid] *= beta;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (gid < sizeA) {\n size_t row = A[gid].row;\n size_t column = A[gid].column;\n double value = A[gid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n\n if (gid < M) {\n atomicAdd(&y[gid], beta * y[gid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n 
size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n const COOElement &e = A[tid];\n y[e.row] += alpha * e.value * x[e.column];\n }\n y[threadIdx.x] *= beta;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n atomicMul(&y[i], beta);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[threadIdx.x], beta);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[blockIdx.x], beta);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (gid < sizeA) {\n const COOElement &a = A[gid];\n y[a.row] += alpha * a.value * x[a.column];\n }\n\n if (gid < M) {\n y[gid] *= beta;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n y[row] += alpha * value * x[column];\n }\n y[threadIdx.x] *= beta;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n y[row] += alpha * value * x[column];\n }\n y[blockIdx.x] 
*= beta;\n}\n", + "{" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use MPI and OpenMP to parallelize. Assume MPI has already been initialized.\n Every rank has a complete copy of A, x, and y. Store the result in y on rank 0.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its own partial sum\n std::vector partial_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n partial_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the partial sums to rank 0\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 scales the result by beta\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the result\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (auto const& element : A) {\n if (element.row == i) {\n sum += element.value * x[element.column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&y[0], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[0], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the sparse matrix that each rank is responsible for\n size_t num_elements_per_rank = A.size() / size;\n\n // Compute the starting index of the elements in the sparse matrix that each rank is responsible for\n size_t start_index = rank * num_elements_per_rank;\n\n // Compute the ending index of the elements in the sparse matrix that each rank is responsible for\n size_t end_index = (rank + 1) * num_elements_per_rank;\n\n // Compute the local portion of y that each rank is responsible for\n std::vector local_y(M);\n\n // Compute the local portion of y that each rank is responsible for\n #pragma omp parallel for\n for (size_t i = start_index; i < end_index; i++) {\n const COOElement& element = A[i];\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the local portions of y into the global y vector on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, 
MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n}\n", + "\n\n // Initialize y on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::fill(y.begin(), y.end(), 0.0);\n }\n\n // Parallelize over rows of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n COOElement const& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n\n // Parallelize over elements of y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n\n // Gather y from all ranks to rank 0\n MPI::COMM_WORLD.Gather(y.data(), M, MPI::DOUBLE, y.data(), M, MPI::DOUBLE, 0);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of elements each rank is responsible for\n size_t elements_per_rank = A.size() / nprocs;\n\n // Compute the starting index for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local portion of y\n std::vector local_y(M, 0.0);\n\n // Perform the sparse matrix-vector multiplication in parallel\n #pragma omp parallel for\n for (size_t i = start_index; i < end_index; ++i) {\n COOElement element = A[i];\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Compute the local portion of the result\n std::vector local_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Calculate the number of rows per rank\n size_t num_rows_per_rank = (M + size - 1) / size;\n\n // Calculate the starting and ending rows for the current rank\n size_t start_row = rank * num_rows_per_rank;\n size_t end_row = std::min((rank + 1) * num_rows_per_rank, M);\n\n // Initialize the local portion of y to 
beta*y\n std::vector local_y(num_rows_per_rank, beta * y[rank]);\n\n // OpenMP parallelization\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n local_y[i - start_row] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n\n // Gather the local portions of y into the global y on rank 0\n MPI_Gather(local_y.data(), num_rows_per_rank, MPI_DOUBLE, y.data(), num_rows_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of rows per rank\n size_t num_rows_per_rank = (M + MPI_Comm_size(MPI_COMM_WORLD) - 1) / MPI_Comm_size(MPI_COMM_WORLD);\n\n // Compute the starting row for this rank\n size_t start_row = num_rows_per_rank * MPI_Comm_rank(MPI_COMM_WORLD);\n\n // Compute the ending row for this rank\n size_t end_row = std::min(num_rows_per_rank * (MPI_Comm_rank(MPI_COMM_WORLD) + 1), M);\n\n // Initialize the local portion of y to beta*y\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n y[i] = beta * y[i];\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n\n if (row >= start_row && row < end_row) {\n y[row] += alpha * value * x[column];\n }\n }\n\n // Reduce the results from all ranks to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(MPI_IN_PLACE, y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / nprocs; i < (rank + 1) * M / nprocs; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local contribution to y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double local_y = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n local_y += alpha * element.value * x[element.column];\n }\n }\n y[i] = local_y;\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 
1; i < size; ++i) {\n MPI_Recv(&y[0], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[0], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& element = A[i];\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather y from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_M = M / size;\n size_t start_row = rank * local_M;\n size_t end_row = start_row + local_M;\n\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[i * local_M], local_M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[start_row], local_M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(Kokkos::View &A, Kokkos::View &X, Kokkos::View &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; k++) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::Range(0, M), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.extent(0); ++l) {\n if (A(l).row == i && A(l).column == k) {\n for (size_t m = 0; m < X.extent(0); ++m) {\n if (X(m).row == k && X(m).column == j) {\n sum += A(l).value * X(m).value;\n }\n }\n }\n }\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n // Create a parallel reduction to sum up the sparse matrix multiplication.\n auto policy = KOKKOS_RANGE_POLICY(0, M);\n auto sum_lambda = KOKKOS_LAMBDA(const size_t i, double** sum) {\n for (size_t j = 0; j < N; ++j) {\n sum[i][j] = 0.0;\n }\n };\n auto reduce_lambda = KOKKOS_LAMBDA(const size_t i, const size_t j, const COOElement a, const COOElement b, double** sum) {\n sum[a.row][j] += a.value * b.value;\n };\n auto view = KokKOS::parallel_reduce(\n \"Sparse matrix multiplication\", policy, sum_lambda, reduce_lambda, A, X);\n view.wait();\n\n // Copy the result to the output matrix.\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = view(i, j);\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int &i) {\n for (size_t j = 0; j < N; j++) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; k++) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + ".", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, M), [&](size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i][j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i][j] += A[i][k].value * X[k][j].value;\n }\n }\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n 
#pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; k++) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + ",", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n sum += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n sum += A[l].value * X[m].value;\n break;\n }\n }\n break;\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (auto const& a : A) {\n if (a.row == i && a.column == k) {\n for (auto const& x : X) {\n if (x.row == k && x.column == j) {\n sum += a.value * x.value;\n }\n }\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; k++) {\n for (size_t l = 0; l < A.size(); l++) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); m++) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Y.resize(M * N, 0);\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n for (auto const& e : X) {\n if (A_map.count(e.row) > 0) {\n for (auto const& [column, value] : A_map[e.row]) {\n Y[e.column * M + column] += e.value * value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> row_map;\n for (auto const& element : A) {\n row_map[element.row].push_back(element.value);\n }\n for (auto const& element : X) {\n if (row_map.count(element.column)) {\n for (size_t i = 0; i < row_map[element.column].size(); ++i) {\n Y[element.row * N + i] += row_map[element.column][i] * element.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto& e : A) {\n for (auto& f : X) {\n if (e.column == f.row) {\n Y[e.row * N + f.column] += e.value * f.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M*N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n Y.resize(M * N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot = 0;\n for (size_t k = 0; k < K; ++k) {\n dot += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = dot;\n }\n }\n}\n", + "\n\n Y.resize(M*N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.assign(M * N, 0);\n for (COOElement e : A) {\n for (COOElement f : X) {\n if (e.column == f.row) {\n Y[e.row * N + f.column] += e.value * f.value;\n 
}\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n std::unordered_map> A_row_major;\n for (auto const& e : A) {\n A_row_major[e.row][e.column] = e.value;\n }\n\n std::unordered_map> X_col_major;\n for (auto const& e : X) {\n X_col_major[e.column][e.row] = e.value;\n }\n\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot_product = 0;\n for (size_t k = 0; k < K; ++k) {\n dot_product += A_row_major[i][k] * X_col_major[j][k];\n }\n Y[i * N + j] = dot_product;\n }\n }\n}\n", + "\n\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format with sizeA elements.\n X is a sparse KxN matrix in COO format with sizeX elements. Y is a dense MxN matrix in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\n__global__ void spmm(const COOElement *A, size_t sizeA, const COOElement *X, size_t sizeX, double *Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t k = 0; k < K; k++) {\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == k) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n int i = tid + bid * bdim;\n if (i < sizeA) {\n size_t rowA = A[i].row;\n size_t colA = A[i].column;\n double valA = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == colA) {\n Y[rowA * N + X[j].row] += valA * X[j].value;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n int i = tid + bid * bdim;\n if (i < sizeA) {\n size_t rowA = A[i].row;\n size_t colA = A[i].column;\n double valA = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == colA) {\n Y[rowA * N + X[j].row] += valA * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipGridDim_x;\n int tid_per_block = hipBlockDim_x;\n\n for (int i = tid; i < sizeA; i += tid_per_block * bid_size) {\n int row = A[i].row;\n int col = A[i].column;\n double val = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column < K) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x * bid_size;\n\n for (size_t i = tid; i < sizeA; i += num_threads) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = 
blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n Y[row * N + col] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t a = 0; a < sizeA; ++a) {\n if (A[a].row == row && A[a].column == k) {\n for (size_t x = 0; x < sizeX; ++x) {\n if (X[x].row == k && X[x].column == col) {\n Y[row * N + col] += A[a].value * X[x].value;\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].column) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n for (int i = tid + bid * bdim; i < sizeA; i += gdim * bdim) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n int rowA = A[i].row;\n int colA = A[i].column;\n double valA = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n int rowX = X[j].row;\n int colX = X[j].column;\n double valX = X[j].value;\n if (colA == rowX) {\n atomicAdd(&Y[rowA * N + colX], valA * valX);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x;\n\n for (size_t i = tid; i < sizeA; i += num_threads * bid_size) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == column) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; 
j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and X. Store the result in Y on rank 0.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recv;\n MPI::COMM_WORLD.Recv(recv, M * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < M * N; ++j) {\n Y[j] += recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(Y.data(), M * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0.\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its local contribution to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the local contributions to Y on rank 0.\n if (rank == 0) {\n Y.resize(M * N);\n MPI_Gather(localY.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(localY.data(), M * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions 
to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ",", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the local portion of Y\n std::vector localY(M*N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n \n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M*N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double dot_product = 0;\n for (size_t k = 0; k < K; k++) {\n dot_product += A[i * K + k].value * X[k * N + j].value;\n }\n Y[i * N + j] = dot_product;\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buffer(M * N);\n MPI_Recv(recv_buffer.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n Y[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(Y.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in each row of A and X\n std::vector A_row_counts(M, 0);\n std::vector X_row_counts(K, 0);\n for 
(auto const& element : A) {\n A_row_counts[element.row]++;\n }\n for (auto const& element : X) {\n X_row_counts[element.row]++;\n }\n\n // Compute the offsets for each row of A and X\n std::vector A_offsets(M, 0);\n std::vector X_offsets(K, 0);\n for (size_t i = 1; i < M; i++) {\n A_offsets[i] = A_offsets[i - 1] + A_row_counts[i - 1];\n }\n for (size_t i = 1; i < K; i++) {\n X_offsets[i] = X_offsets[i - 1] + X_row_counts[i - 1];\n }\n\n // Create a buffer to store the local results\n std::vector local_Y(M * N, 0.0);\n\n // Compute the local results\n for (auto const& element : A) {\n for (size_t j = X_offsets[element.column]; j < X_offsets[element.column] + X_row_counts[element.column]; j++) {\n local_Y[element.row * N + X[j].column] += element.value * X[j].value;\n }\n }\n\n // Reduce the local results to the global result\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of elements in the resulting matrix\n size_t num_elements = M * N;\n\n // Initialize the result matrix to zero\n Y.resize(num_elements, 0.0);\n\n // Scatter the result matrix to all ranks\n MPI_Scatter(Y.data(), num_elements, MPI_DOUBLE, Y.data(), num_elements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local part of the result matrix\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local results to the result matrix on rank 0\n MPI_Reduce(Y.data(), num_elements, MPI_DOUBLE, Y.data(), num_elements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double dot_product = 0;\n for (size_t k = 0; k < K; k++) {\n dot_product += A[i * K + k].value * X[k * N + j].value;\n }\n Y[i * N + j] = dot_product;\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buffer(M * N);\n MPI_Recv(recv_buffer.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n Y[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(Y.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.assign(M * N, 0);\n\n // Compute the partial sum on each rank\n std::vector partial_sum(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n partial_sum[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sum(M * N, 0);\n MPI_Allreduce(partial_sum.data(), global_sum.data(), M * N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n Y = global_sum;\n }\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n 
// Compute the matrix multiplication in parallel\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement& a = A[i];\n for (size_t j = 0; j < X.size(); j++) {\n const COOElement& x = X[j];\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0.\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format with sizeA elements.\n X is a sparse KxN matrix in COO format with sizeX elements. Y is a dense MxN matrix in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\n__global__ void spmm(const COOElement *A, size_t sizeA, const COOElement *X, size_t sizeX, double *Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n atomicAdd(&Y[row * N + X[j].column], A[i].value * X[j].value);\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= M) return;\n for (int i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (int j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < K; k++) {\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == row && A[j].column == k) {\n for (size_t l = 0; l < sizeX; l++) {\n if (X[l].row == k && X[l].column == i) {\n Y[row * N + i] += A[j].value * X[l].value;\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int tx = threadIdx.x;\n int bx = blockIdx.x;\n int tid = tx + bx * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t i = 0; i < sizeX; i++) {\n if (X[i].column == column) {\n Y[row * N + X[i].row] += value * X[i].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == A[k].column) {\n Y[row * N + X[j].row] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) 
{\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t k = 0; k < K; ++k) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (X[j].column == k) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t k = 0; k < K; ++k) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row && A[i].column == k) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (X[j].row == k && X[j].column == j) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + k] = sum;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n size_t rowA = A[blockIdx.x].row;\n size_t columnA = A[blockIdx.x].column;\n double valueA = A[blockIdx.x].value;\n\n for (size_t i = 0; i < N; i++) {\n if (columnA == X[threadIdx.x].row) {\n Y[rowA * N + i] += valueA * X[threadIdx.x].value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n atomicAdd(&Y[row * N + X[j].column], A[i].value * X[j].value);\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and X. Store the result in Y on rank 0.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n for (size_t j = 0; j < X.size(); ++j) {\n if (X[j].row == column) {\n localY[row * N + X[j].column] += value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n if (rank == 0) {\n Y.resize(M * N);\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(localY.data(), nullptr, M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n}\n", + ",", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localY[i * N + j] += A[i * K + k].value * X[k * N + j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "_t,", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 
0.0);\n\n // Compute the matrix multiplication in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local row range for this rank\n size_t local_m = M / nprocs;\n size_t start_row = rank * local_m;\n size_t end_row = (rank == nprocs - 1) ? M : start_row + local_m;\n\n // Initialize the local part of Y to zero\n std::vector local_Y(local_m * N, 0.0);\n\n // Perform the matrix multiplication in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_Y[i * N + j] += A[i * K + k].value * X[k * N + j].value;\n }\n }\n }\n\n // Gather the local results into Y on rank 0\n MPI_Gather(local_Y.data(), local_m * N, MPI_DOUBLE, Y.data(), local_m * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector local_Y(M * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n local_Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of rows of A and X owned by each rank\n size_t nrows_A = A.size() / nprocs;\n size_t nrows_X = X.size() / nprocs;\n\n // Compute the starting row index of A and X owned by each rank\n size_t start_row_A = rank * nrows_A;\n size_t start_row_X = rank * nrows_X;\n\n // Compute the local portion of Y owned by each rank\n std::vector local_Y(M * N, 0.0);\n\n // Compute the local portion of Y owned by each rank\n #pragma omp parallel for\n for (size_t i = 0; i < nrows_A; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n if (A[start_row_A + 
i].column == X[start_row_X + k].row) {\n local_Y[A[start_row_A + i].row * N + j] += A[start_row_A + i].value * X[start_row_X + k].value;\n }\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix-matrix product in parallel\n #pragma mpi_for_schedule(static)\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& a = A[i];\n for (size_t j = 0; j < X.size(); ++j) {\n const COOElement& x = X[j];\n if (a.column == x.row) {\n #pragma openmp atomic\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(MPI_IN_PLACE, Y.size(), MPI_DOUBLE, Y.data(), Y.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the partial sum for each row of Y on each rank\n std::vector partial_Y(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n partial_Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_Y(M * N, 0.0);\n MPI_Reduce(partial_Y.data(), global_Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Store the result in Y on rank 0\n if (rank == 0) {\n Y = global_Y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t k = 0; k < K; ++k) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * 
X[m].value;\n }\n }\n }\n }\n }\n }\n }\n\n // Reduce the result to rank 0\n MPI_Reduce(MPI_IN_PLACE, Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n for (size_t j = 0; j < X.size(); j++) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(Kokkos::View &A, Kokkos::View &b, Kokkos::View &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View r(\"r\", N);\n Kokkos::View z(\"z\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n r[i] = b[i];\n z[i] = 0.0;\n });\n\n for (size_t iter = 0; iter < N; ++iter) {\n for (size_t i = 0; i < A.extent(0); ++i) {\n const auto &e = A[i];\n r[e.row] -= e.value * z[e.column];\n }\n for (size_t i = 0; i < A.extent(0); ++i) {\n const auto &e = A[i];\n z[e.column] += e.value * r[e.row];\n }\n }\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) { x[i] = r[i]; });\n}\n", + "\n\n Kokkos::parallel_for(\"solve_linear_system\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::View r(\"r\", N);\n Kokkos::View z(\"z\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n r[i] = b[i];\n z[i] = 0.0;\n });\n for (size_t k = 0; k < A.extent(0); ++k) {\n const COOElement &e = A[k];\n r[e.column] -= e.value * z[e.row];\n }\n for (size_t k = 0; k < A.extent(0); ++k) {\n const COOElement &e = A[k];\n z[e.row] += e.value * r[e.column];\n }\n Kokkos::parallel_for(\"copy\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = z[i];\n });\n}\n", + "\n\n Kok\uf09e::View A_h(A.data(), A.extent(0));\n Kok\uf09e::View b_h(b.data(), b.extent(0));\n Kok\uf09e::View x_h(x.data(), x.extent(0));\n\n Kok\uf09e::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n x_h[i] = b_h[i];\n for (size_t j = A_h(i, 0); j < A_h(i, 1); ++j) {\n x_h[i] -= A_h(i, 2 + j) * x_h[A_h(i, 2 + j)];\n }\n x_h[i] /= A_h(i, 1);\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_coo\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, 
column_indices);\n\n // Solve the linear system\n Kokkos::View x_host(\"x_host\", N);\n Kokkos::parallel_for(\"solve\", N, KOKKOS_LAMBDA(size_t i) {\n x_host(i) = 0.0;\n });\n Kokkos::deep_copy(x, x_host);\n Kokkos::sparse_gmres(A_sparse, x, b, 1e-6, 100);\n\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View A_sparse(\"A_sparse\", N, N);\n Kokkos::parallel_for(\"init_A\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n A_sparse(i, j) = 0.0;\n }\n });\n Kokkos::parallel_for(\"fill_A\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n A_sparse(A(i).row, A(i).column) = A(i).value;\n });\n\n // Solve the linear system\n Kokkos::View x_temp(\"x_temp\", N);\n Kokkos::sparse::cg(A_sparse, b, x_temp, 1e-6, 100);\n\n // Copy the solution to x\n Kokkos::parallel_for(\"copy_x\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x(i) = x_temp(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * b(A(j).column);\n }\n }\n x(i) = sum;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_COO\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::View A_sparse(\"A_sparse\", N, N);\n Kokkos::parallel_for(\"initialize_A\", N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; j++) {\n A_sparse(i, j) = 0.0;\n }\n });\n Kokkos::parallel_for(\"populate_A\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n A_sparse(row_indices(i), column_indices(i)) = values(i);\n });\n\n // Solve the linear system\n Kokkos::View x_temp(\"x_temp\", N);\n Kokkos::sparse::CrsMatrix A_crs(N, N, A_sparse);\n Kokkos::sparse::solve(\"CrsMatrix\", A_crs, b, x_temp);\n\n // Copy the result to x\n Kokkos::parallel_for(\"copy_result\", N, KOKKOS_LAMBDA(size_t i) {\n x(i) = x_temp(i);\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values = Kokkos::View(A.data(), A.extent(0));\n Kokkos::View row_indices = Kokkos::View(A.data() + 1, A.extent(0));\n Kokkos::View column_indices = Kokkos::View(A.data() + 2, A.extent(0));\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::SparseSolveCG(A_sparse, x, b, 1e-6, 1000);\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x(i) = b(i);\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x(A(j).column) -= A(j).value * x(i);\n }\n }\n x(i) /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View col_indices(\"col_indices\", A.extent(0));\n Kokkos::parallel_for(A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values[i] = A(i).value;\n 
row_indices[i] = A(i).row;\n col_indices[i] = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, col_indices);\n\n // Solve the linear system\n Kokkos::View x_copy(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::SparseSolveCG>(A_sparse, x_copy, b, 1e-6, 1000);\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(j).row == i) {\n x[A(j).column] -= A(j).value * x[i];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n using team_member = KokKOS::TeamMember;\n using team_view = KokKOS::TeamView;\n using team_scheduler = KokKOS::TeamScheduler;\n\n team_scheduler team(N);\n\n team.parallel_for(\n \"solve_linear_system\",\n KOKKOS_LAMBDA(const team_member member) {\n const size_t row = member.league_rank();\n double sum = 0;\n for (size_t i = 0; i < A.size(); ++i) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_coo\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::View x_copy(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::sparse_solve(A_sparse, x_copy, b);\n\n // Copy the solution back to x\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. 
x and b are dense vectors with N elements.\n Use OpenMP to compute in parallel.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + ".", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n COOElement const& e = A[i];\n x[e.column] -= e.value * x[e.row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t k = 0; k < A.size(); ++k) {\n const COOElement& e = A[k];\n x[e.column] -= e.value * x[e.row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + 
"\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] -= element.value * x[element.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] -= element.value * x[element.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n // Initialize x to 0\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = 0;\n }\n\n // Iterate over the elements of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n // Add A[i].value * b[A[i].column] to x[A[i].row]\n x[A[i].row] += A[i].value * b[A[i].column];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. 
x and b are dense vectors with N elements.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector y(N, 0);\n for (auto const& e : A) {\n y[e.column] += e.value * b[e.row] / row_sums[e.row];\n }\n x = y;\n}\n", + "\n\n std::vector r(N, 0);\n std::vector row_start(N + 1, 0);\n for (auto const& e : A) {\n row_start[e.row + 1]++;\n }\n for (size_t i = 1; i <= N; i++) {\n row_start[i] += row_start[i - 1];\n }\n for (auto const& e : A) {\n r[row_start[e.row]++] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = row_start[i]; j < row_start[i + 1]; j++) {\n sum += r[j] * x[j];\n }\n x[i] = (b[i] - sum) / r[row_start[i]];\n }\n}\n", + "\n\n std::vector r(N, 0);\n for (auto const& e : A) {\n r[e.row] += e.value * x[e.column];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = (b[i] - r[i]) / A[i].value;\n }\n}\n", + "\n\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / rowSum[i];\n }\n}\n", + "\n\n std::vector r(N, 0);\n for (auto const& e : A) {\n r[e.row] += e.value * b[e.column];\n }\n x.resize(N);\n for (size_t i = 0; i < N; ++i) {\n x[i] = r[i] / A[i].value;\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (const COOElement& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector y(N, 0);\n for (const COOElement& e : A) {\n y[e.row] += e.value * b[e.column] / row_sum[e.row];\n }\n x = y;\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector diag(N, 0);\n std::vector offdiag(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] += e.value;\n } else {\n offdiag[e.row] += e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / (diag[i] - offdiag[i]);\n }\n for (auto const& e : A) {\n if (e.row != e.column) {\n x[e.column] -= e.value * x[e.row] / (diag[e.column] - offdiag[e.column]);\n }\n }\n}\n", + "\n\n std::vector> A_dense(N, std::vector(N, 0));\n for (auto const& e : A) {\n A_dense[e.row][e.column] = e.value;\n }\n std::vector L(N, 0), U(N, 0);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n L[i] += A_dense[i][j] * U[j];\n }\n U[i] = b[i] - L[i];\n }\n x[N - 1] = U[N - 1] / A_dense[N - 1][N - 1];\n for (size_t i = N - 2; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_dense[i][j] * x[j];\n }\n x[i] = (U[i] - sum) / A_dense[i][i];\n }\n}\n", + "\n\n std::vector diag(N, 0);\n std::vector lower(N, 0);\n std::vector upper(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n } else if (e.row < e.column) {\n lower[e.row] += e.value;\n } else {\n upper[e.row] += e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n double d = diag[i];\n if (d == 0) {\n throw std::runtime_error(\"Matrix is singular\");\n }\n x[i] = (b[i] - lower[i] - upper[i]) / d;\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sum[i];\n }\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector diag(N, 0);\n for 
(auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / diag[i];\n for (auto const& e : A) {\n if (e.row == i && e.column != i) {\n x[e.column] -= e.value * x[i] / row_sums[e.column];\n }\n }\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& element : A) {\n row_sum[element.row] += element.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sum[i];\n }\n}\n", + "\n\n std::vector diag(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n }\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n for (size_t j = e.column + 1; j < N; j++) {\n x[j] -= e.value * x[e.column];\n }\n }\n }\n x[i] /= diag[i];\n }\n for (size_t i = N - 1; i > 0; i--) {\n for (auto const& e : A) {\n if (e.column == i) {\n for (size_t j = 0; j < e.row; j++) {\n x[j] -= e.value * x[e.row];\n }\n }\n }\n }\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sums[i];\n }\n}\n", + "\n\n std::vector rowSum(N, 0.0);\n for (COOElement const& e : A) {\n rowSum[e.row] += e.value;\n }\n std::vector y(N, 0.0);\n for (COOElement const& e : A) {\n y[e.row] += e.value * b[e.column] / rowSum[e.row];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] - y[i];\n }\n}\n", + "\n\n std::vector> A_dense(N, std::vector(N, 0));\n for (auto const& e : A) {\n A_dense[e.row][e.column] = e.value;\n }\n std::vector x_dense(N, 0);\n solveLinearSystemDense(A_dense, b, x_dense);\n x = x_dense;\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n\n std::vector y(N, 0);\n for (auto const& e : A) {\n y[e.column] += e.value * b[e.row] / row_sums[e.row];\n }\n\n x = y;\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector diag_inv(N, 0);\n for (size_t i = 0; i < N; ++i) {\n diag_inv[i] = 1.0 / row_sums[i];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] * diag_inv[i];\n for (auto const& e : A) {\n if (e.row == i) continue;\n x[e.row] -= e.value * x[i] * diag_inv[e.row];\n }\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector x_tmp(N, 0);\n for (auto const& e : A) {\n x_tmp[e.column] += e.value * b[e.row] / row_sum[e.row];\n }\n x = x_tmp;\n}\n", + "\n\n std::vector diag(N, 0);\n std::vector offdiag(N, 0);\n std::vector rowptr(N + 1, 0);\n for (auto const& e : A) {\n rowptr[e.row + 1]++;\n if (e.row == e.column)\n diag[e.row] += e.value;\n else\n offdiag[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n rowptr[i + 1] += rowptr[i];\n }\n std::vector L(rowptr[N]);\n std::vector U(rowptr[N]);\n for (auto const& e : A) {\n if (e.row == e.column)\n continue;\n if (e.row < e.column)\n L[rowptr[e.row]++] = {e.row, e.column, e.value};\n else\n U[rowptr[e.column]++] = {e.row, e.column, e.value};\n }\n std::vector y(N, 0);\n for (size_t i = 0; i < N; i++) {\n y[i] = b[i];\n for (size_t j = rowptr[i]; j < rowptr[i + 1]; j++) {\n y[i] -= L[j].value * x[L[j].column];\n }\n y[i] /= diag[i];\n }\n for (size_t i = N - 1; i < N; i--) {\n x[i] = y[i];\n for (size_t j = rowptr[i + 1]; j < rowptr[i + 2]; j++) {\n x[i] -= U[j].value * x[U[j].column];\n }\n x[i] /= diag[i];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", 
+ "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format with sizeA elements. x and b are dense vectors with N elements.\n Use AMD HIP to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\n__global__ void solveLinearSystem(const COOElement *A, size_t sizeA, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t col = A[tid].column;\n double val = A[tid].value;\n atomicAdd(&x[col], val * b[row]);\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n x[i] = b[i];\n }\n __syncthreads();\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].column == i) {\n x[A[j].row] -= A[j].value * x[i];\n }\n }\n __syncthreads();\n if (i < N) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = b[i];\n }\n __syncthreads();\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == i) {\n x[A[k].column] -= A[k].value * x[i];\n }\n }\n __syncthreads();\n if (i < N) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int tid = 
threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&x[column], value * b[row]);\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + ".", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = 0;\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == i) {\n x[i] += A[j].value * b[A[j].column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n \n // Create a CSR matrix from the COO format\n std::vector row_offsets(N + 1, 0);\n std::vector column_indices;\n std::vector values;\n for (auto const& element : A) {\n row_offsets[element.row + 1]++;\n column_indices.push_back(element.column);\n values.push_back(element.value);\n }\n for (size_t i = 1; i < N + 1; i++) {\n row_offsets[i] += row_offsets[i - 1];\n }\n \n // Create a dense vector for the solution\n x.resize(N);\n \n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(values[row_offsets[j] + i]) > std::abs(values[row_offsets[pivot_row] + i])) {\n pivot_row = j;\n }\n }\n \n // Swap the pivot row with the current row\n if (pivot_row != i) {\n std::swap(row_offsets[pivot_row], row_offsets[i]);\n std::swap(column_indices[row_offsets[pivot_row] - 1], column_indices[row_offsets[i] - 1]);\n std::swap(values[row_offsets[pivot_row] - 1], values[row_offsets[i] - 1]);\n }\n \n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = values[row_offsets[j] + i] / values[row_offsets[i] + i];\n for (size_t k = i; k < N; k++) {\n values[row_offsets[j] + k] -= factor * values[row_offsets[i] + k];\n }\n b[j] -= factor * b[i];\n }\n }\n \n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= values[row_offsets[i] + j] * x[j];\n }\n x[i] /= values[row_offsets[i] + i];\n }\n \n // Gather the solution from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, comm);\n }\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a vector object from the dense vector\n Eigen::VectorXd vector_b(N);\n for (size_t i = 0; i < N; ++i) {\n vector_b(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd vector_x = matrix.ldlt().solve(vector_b);\n\n // Copy the result to the output vector\n for (size_t i = 0; i < N; ++i) {\n x[i] = vector_x(i);\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a dense vector from b\n Eigen::VectorXd b_vector(N);\n for (size_t i = 0; i < N; ++i) {\n b_vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_vector = matrix.ldlt().solve(b_vector);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_vector(i);\n }\n }\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n\n // Create a vector object 
from the dense vector\n Eigen::VectorXd vector_b(N);\n for (size_t i = 0; i < N; ++i) {\n vector_b(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd vector_x = matrix.ldlt().solve(vector_b);\n\n // Copy the result to the output vector\n for (size_t i = 0; i < N; ++i) {\n x[i] = vector_x(i);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a sparse matrix object\n MPI_Sparse_matrix A_mpi;\n MPI_Sparse_matrix_create_coo(MPI_COMM_WORLD, N, N, A.size(), &A[0], &A_mpi);\n\n // Create a dense vector object for b\n MPI_Vector b_mpi;\n MPI_Vector_create(MPI_COMM_WORLD, N, &b_mpi);\n MPI_Vector_set_values(b_mpi, N, &b[0]);\n\n // Create a dense vector object for x\n MPI_Vector x_mpi;\n MPI_Vector_create(MPI_COMM_WORLD, N, &x_mpi);\n\n // Solve the linear system\n MPI_Sparse_solve(A_mpi, b_mpi, x_mpi);\n\n // Get the result from rank 0\n if (rank == 0) {\n MPI_Vector_get_values(x_mpi, N, &x[0]);\n }\n\n // Free the MPI objects\n MPI_Sparse_matrix_free(&A_mpi);\n MPI_Vector_free(&b_mpi);\n MPI_Vector_free(&x_mpi);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Scatter b to all ranks\n std::vector local_b(b.begin() + rank * b.size() / size, b.begin() + (rank + 1) * b.size() / size);\n\n // Solve the local system Ax=b for x\n std::vector local_x(N, 0.0);\n for (auto const& e : A) {\n local_x[e.row] += e.value * local_b[e.column];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_x(N, 0.0);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n \n int *rows = new int[A.size()];\n int *cols = new int[A.size()];\n double *vals = new double[A.size()];\n for (size_t i = 0; i < A.size(); i++) {\n rows[i] = A[i].row;\n cols[i] = A[i].column;\n vals[i] = A[i].value;\n }\n \n // Create a sparse matrix object\n int nrows = N;\n int ncols = N;\n int nnz = A.size();\n int *ia = new int[nrows + 1];\n int *ja = new int[nnz];\n double *a = new double[nnz];\n \n // Convert the COO format to CSR format\n ia[0] = 0;\n for (int i = 0; i < nnz; i++) {\n ja[i] = cols[i];\n a[i] = vals[i];\n }\n for (int i = 1; i < nrows + 1; i++) {\n ia[i] = ia[i - 1];\n for (int j = ia[i - 1]; j < nnz; j++) {\n if (ja[j] >= i) {\n ia[i] = j;\n break;\n }\n }\n }\n \n // Create a parallel sparse matrix object\n Mat A_csr(nrows, ncols, nnz, ia, ja, a);\n \n // Create a parallel dense vector for b\n Vec b_vec(nrows);\n for (int i = 0; i < nrows; i++) {\n b_vec[i] = b[i];\n }\n \n // Create a parallel dense vector for x\n Vec x_vec(nrows);\n \n // Solve the linear system\n A_csr.solve(b_vec, x_vec);\n \n // Gather the solution from all ranks to rank 0\n if (rank == 0) {\n for (int i = 0; i < nrows; i++) {\n x[i] = x_vec[i];\n }\n }\n \n // Clean up\n delete[] rows;\n delete[] cols;\n delete[] vals;\n delete[] ia;\n delete[] ja;\n 
delete[] a;\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the number of elements in each row of A\n std::vector rowCounts(N, 0);\n for (auto const& e : A) {\n rowCounts[e.row]++;\n }\n\n // Compute the offsets of each row in the CSR format\n std::vector rowOffsets(N, 0);\n for (size_t i = 1; i < N; i++) {\n rowOffsets[i] = rowOffsets[i - 1] + rowCounts[i - 1];\n }\n\n // Convert A to CSR format\n std::vector colIndices(A.size());\n std::vector values(A.size());\n for (auto const& e : A) {\n colIndices[rowOffsets[e.row] + rowCounts[e.row] - 1] = e.column;\n values[rowOffsets[e.row] + rowCounts[e.row] - 1] = e.value;\n rowCounts[e.row]--;\n }\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivotRow = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(values[rowOffsets[j] + rowCounts[j] - 1]) > std::abs(values[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1])) {\n pivotRow = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivotRow != i) {\n std::swap(rowOffsets[i], rowOffsets[pivotRow]);\n std::swap(rowCounts[i], rowCounts[pivotRow]);\n std::swap(values[rowOffsets[i] + rowCounts[i] - 1], values[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1]);\n std::swap(colIndices[rowOffsets[i] + rowCounts[i] - 1], colIndices[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = values[rowOffsets[j] + rowCounts[j] - 1] / values[rowOffsets[i] + rowCounts[i] - 1];\n for (size_t k = rowOffsets[j] + rowCounts[j] - 1; k >= rowOffsets[j]; k--) {\n values[k] -= factor * values[rowOffsets[i] + rowCounts[i] - 1 + k - rowOffsets[j]];\n colIndices[k] = colIndices[rowOffsets[i] + rowCounts[i] - 1 + k - rowOffsets[j]];\n }\n rowCounts[j] = rowCounts[i] - 1;\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = rowOffsets[i] + rowCounts[i] - 1; j < rowOffsets[i] + rowCounts[i]; j++) {\n x[i] -= values[j] * x[colIndices[j]];\n }\n x[i] /= values[rowOffsets[i] + rowCounts[i] - 1];\n }\n\n // Gather the solution on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N / size], N / size, MPI_DOUBLE, i, 0, comm, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N / size], N / size, MPI_DOUBLE, 0, 0, comm);\n }\n}\n", + "\n\n // Create a new MPI datatype for COOElement\n MPI_Datatype coo_element_type;\n MPI_Type_contiguous(3, MPI_DOUBLE, &coo_element_type);\n MPI_Type_commit(&coo_element_type);\n\n // Scatter the COO matrix and b vector to all ranks\n std::vector local_A(A.size() / MPI::COMM_WORLD.Get_size());\n std::vector local_b(b.size() / MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Scatter(A.data(), A.size(), coo_element_type, local_A.data(), local_A.size(), coo_element_type, 0);\n MPI::COMM_WORLD.Scatter(b.data(), b.size(), MPI_DOUBLE, local_b.data(), local_b.size(), MPI_DOUBLE, 0);\n\n // Solve the local linear system\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (auto& element : local_A) {\n if (element.row == i) {\n sum += element.value * local_b[element.column];\n }\n }\n local_x[i] = sum;\n }\n\n // Gather the local solutions to rank 0\n MPI::COMM_WORLD.Gather(local_x.data(), local_x.size(), MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, 
size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the number of elements per rank\n size_t num_elements_per_rank = A.size() / size;\n // Compute the starting index of the elements for this rank\n size_t start_index = rank * num_elements_per_rank;\n // Compute the ending index of the elements for this rank\n size_t end_index = (rank + 1) * num_elements_per_rank;\n\n // Create a local copy of the elements for this rank\n std::vector local_A(A.begin() + start_index, A.begin() + end_index);\n\n // Create a local copy of the vector b for this rank\n std::vector local_b(b.begin() + start_index, b.begin() + end_index);\n\n // Create a local copy of the vector x for this rank\n std::vector local_x(N);\n\n // Solve the local linear system\n for (size_t i = 0; i < local_A.size(); i++) {\n const COOElement& element = local_A[i];\n local_x[element.row] += element.value * local_b[element.column];\n }\n\n // Reduce the local solutions to the global solution on rank 0\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local row indices and values for this rank\n std::vector local_rows;\n std::vector local_values;\n for (auto const& element : A) {\n if (element.row / N == rank) {\n local_rows.push_back(element.row % N);\n local_values.push_back(element.value);\n }\n }\n\n // Create a sparse matrix in CSR format\n std::vector csr_row_offsets(size + 1);\n std::vector csr_column_indices(A.size());\n std::vector csr_values(A.size());\n MPI_Allgather(&local_rows[0], local_rows.size(), MPI_UNSIGNED_LONG, &csr_column_indices[0], local_rows.size(), MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n MPI_Allgather(&local_values[0], local_values.size(), MPI_DOUBLE, &csr_values[0], local_values.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n for (size_t i = 0; i < size; ++i) {\n csr_row_offsets[i + 1] = csr_row_offsets[i] + local_rows.size();\n }\n\n // Solve the linear system using the CSR matrix\n std::vector local_x(N);\n if (rank == 0) {\n x.resize(N);\n }\n sparse_solve(csr_row_offsets, csr_column_indices, csr_values, b, local_x);\n\n // Gather the solution from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the local number of rows and the local row indices\n size_t local_N = N / size;\n size_t start_row = rank * local_N;\n size_t end_row = (rank + 1) * local_N;\n\n // Compute the local contribution to the solution\n std::vector local_x(local_N);\n for (size_t i = start_row; i < end_row; ++i) {\n local_x[i - start_row] = b[i];\n for (COOElement const& e : A) {\n if (e.row == i) {\n local_x[i - start_row] -= e.value * x[e.column];\n }\n }\n local_x[i - start_row] /= A[i].value;\n }\n\n // Gather the local solutions into the global solution\n MPI_Gather(local_x.data(), local_N, MPI_DOUBLE, x.data(), local_N, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a vector object from the dense vector\n Eigen::VectorXd vector(N);\n for (size_t i = 0; i < N; ++i) {\n vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd result = 
matrix.ldlt().solve(vector);\n\n // Copy the result to the output vector\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = result(i);\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n int nrows = N / size + (rank < N % size);\n std::vector localA(A.begin() + rank * (N / size), A.begin() + rank * (N / size) + nrows);\n std::vector localb(b.begin() + rank * (N / size), b.begin() + rank * (N / size) + nrows);\n std::vector localx(nrows);\n \n // Solve the local system\n for (size_t i = 0; i < nrows; i++) {\n double sum = 0;\n for (size_t j = 0; j < localA.size(); j++) {\n if (localA[j].row == i) {\n sum += localA[j].value * localx[localA[j].column];\n }\n }\n localx[i] = (localb[i] - sum) / localA[i].value;\n }\n \n // Gather the local solutions\n MPI_Gather(localx.data(), nrows, MPI_DOUBLE, x.data(), nrows, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the local part of the matrix\n size_t local_size = A.size() / size;\n\n // Compute the local part of the matrix\n std::vector local_A(A.begin() + rank * local_size, A.begin() + (rank + 1) * local_size);\n\n // Compute the local part of the vector b\n std::vector local_b(b.begin() + rank * local_size, b.begin() + (rank + 1) * local_size);\n\n // Compute the local part of the solution vector\n std::vector local_x(N);\n\n // Solve the local linear system\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (auto& element : local_A) {\n if (element.column == i) {\n sum += element.value * local_x[element.row];\n }\n }\n local_x[i] = (local_b[i] - sum) / local_A[i].value;\n }\n\n // Gather the local solutions into the global solution vector\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the number of elements per rank\n size_t elementsPerRank = A.size() / size;\n \n // Compute the starting index of the elements for this rank\n size_t startIndex = rank * elementsPerRank;\n \n // Compute the ending index of the elements for this rank\n size_t endIndex = (rank + 1) * elementsPerRank;\n \n // Create a local copy of the elements for this rank\n std::vector localA(A.begin() + startIndex, A.begin() + endIndex);\n \n // Create a local copy of the vector b for this rank\n std::vector localb(b.begin() + startIndex, b.begin() + endIndex);\n \n // Create a local copy of the vector x for this rank\n std::vector localx(N);\n \n // Solve the local linear system\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < localA.size(); j++) {\n if (localA[j].column == i) {\n sum += localA[j].value * localb[j];\n }\n }\n localx[i] = sum;\n }\n \n // Gather the local solutions into the global solution\n MPI_Gather(localx.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_x(N, 0.0);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_x.data(), x.data(), N, 
MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the local part of the matrix\n size_t local_size = A.size() / size;\n size_t start_index = rank * local_size;\n size_t end_index = (rank + 1) * local_size;\n\n // Compute the local part of the matrix-vector product\n std::vector local_Ax(N, 0.0);\n for (size_t i = start_index; i < end_index; ++i) {\n local_Ax[A[i].column] += A[i].value * b[A[i].row];\n }\n\n // Reduce the local results to the global result\n MPI_Allreduce(local_Ax.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Rank 0 stores the final result\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Each rank computes its part of the result\n std::vector local_x(N);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n \n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format with sizeA elements. x and b are dense vectors with N elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\n__global__ void solveLinearSystem(const COOElement *A, size_t sizeA, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = tid / N;\n int col = tid % N;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column == col) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[tid].value;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n // Each thread computes one element of x.\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = A[row].column[col]; k < A[row].column[col + 1]; ++k) {\n sum += A[row].value[k] * x[A[row].column[k]];\n }\n b[row] = (b[row] - sum) / A[row].value[A[row].column[col + 1] - 1];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n 
x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n double sum = 0;\n for (int k = 0; k < size; ++k) {\n if (A[k].row == row && A[k].column == col) {\n sum += A[k].value * x[A[k].column];\n }\n }\n x[row] = (b[row] - sum) / A[size - 1].value;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int tid_offset = tid + bid * blockDim.x;\n if (tid_offset < sizeA) {\n size_t row = A[tid_offset].row;\n size_t column = A[tid_offset].column;\n double value = A[tid_offset].value;\n atomicAdd(&x[column], value * b[row]);\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = tid + bid * blockDim.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "&", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the products of A and b for each row\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value * b[A[i].column];\n }\n\n // Gather the row sums to rank 0\n std::vector globalRowSums(N);\n MPI_Allgather(rowSums.data(), N, MPI_DOUBLE, globalRowSums.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = globalRowSums[i];\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a dense vector for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; ++i) {\n b_eigen(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_eigen = matrix.ldlt().solve(b_eigen);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_eigen(i);\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix A_sparse(N, N);\n for (auto const& e : A) {\n A_sparse.insert(e.row, e.column) = e.value;\n }\n A_sparse.makeCompressed();\n\n // Create dense vectors for x and b\n Eigen::VectorXd x_dense(N), b_dense(N);\n for (size_t i = 0; i < N; ++i) {\n x_dense[i] = x[i];\n b_dense[i] = b[i];\n }\n\n // Solve the linear system using Eigen\n x_dense = A_sparse.colPivHouseholderQr().solve(b_dense);\n\n // Copy the result back to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_dense[i];\n }\n }\n}\n", + "\n\n // Get the number of processes and the rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per process\n int elementsPerProcess = N / size;\n\n // Create a buffer to store the local part of A and b\n std::vector localA(elementsPerProcess);\n std::vector localb(elementsPerProcess);\n\n // Scatter A and b to all processes\n MPI_Scatter(A.data(), elementsPerProcess, MPI_DOUBLE, localA.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(b.data(), elementsPerProcess, MPI_DOUBLE, localb.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Create a local copy of x\n std::vector localx(N, 0.0);\n\n // Perform the Gaussian elimination in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n // Find the pivot element\n int pivotRow = i;\n double maxPivot = std::abs(localA[i].value);\n for (int j = i + 1; j < elementsPerProcess; j++) {\n if (std::abs(localA[j].value) > maxPivot) {\n pivotRow = j;\n maxPivot = std::abs(localA[j].value);\n }\n }\n\n // Swap the pivot row with the first row\n std::swap(localA[i], localA[pivotRow]);\n std::swap(localb[i], localb[pivotRow]);\n\n // Eliminate the non-zero elements below the pivot\n for (int j = i + 1; j < elementsPerProcess; j++) {\n double factor = localA[j].value / localA[i].value;\n localA[j].value -= 
factor * localA[i].value;\n localb[j] -= factor * localb[i];\n }\n }\n\n // Gather the local solutions into a global solution\n MPI_Gather(localx.data(), elementsPerProcess, MPI_DOUBLE, x.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform the back substitution to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (int j = i + 1; j < N; j++) {\n sum += localA[j].value * x[j];\n }\n x[i] = (localb[i] - sum) / localA[i].value;\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the local contribution of each rank to the solution\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column];\n }\n\n // Reduce the local contributions to the global solution on rank 0\n MPI_Reduce(x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0);\n\n // Compute the sum of the products of A and x for each row\n std::vector rowSums(N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value * x[A[i].column];\n }\n\n // Gather the row sums to rank 0\n std::vector globalRowSums(N);\n MPI_Allgather(rowSums.data(), N, MPI_DOUBLE, globalRowSums.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Solve the system of equations on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = (b[i] - globalRowSums[i]) / A[i].value;\n }\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Iterate over the elements of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement& element = A[i];\n x[element.row] += element.value * b[element.column];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(&rowSums[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n\n // Gather x from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "<}", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Step 1: Distribute the rows of A and b to each rank\n std::vector localA;\n std::vector localb;\n distributeRows(A, b, localA, localb, rank, size, N);\n\n // Step 2: Solve the local system on each rank\n std::vector localx(N, 0.0);\n for (auto& element : localA) {\n localx[element.row] += element.value * localb[element.column];\n }\n\n // Step 3: Gather the local solutions to rank 0\n MPI_Gather(localx.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(rowSums.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if 
(MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n}\n", + "\n\n // Initialize x to 0\n x.resize(N);\n std::fill(x.begin(), x.end(), 0.0);\n\n // Perform Gauss-Seidel iteration\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n x[i] = (b[i] - sum) / A[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.resize(N);\n std::fill(x.begin(), x.end(), 0.0);\n\n // Perform Gauss-Jordan elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i].value) > std::abs(A[pivot_row * N + i].value)) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the ith row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the ith column\n double pivot = A[i * N + i].value;\n for (size_t j = 0; j < N; ++j) {\n if (j != i) {\n double factor = A[j * N + i].value / pivot;\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k].value -= factor * A[i * N + k].value;\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j].value * x[j];\n }\n x[i] /= A[i * N + i].value;\n }\n\n // Gather the solution on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n MPI::COMM_WORLD.Gather(&x[0], N, MPI::DOUBLE, &x[0], N, MPI::DOUBLE, 0);\n } else {\n MPI::COMM_WORLD.Gather(&x[0], N, MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n\n // Create a dense vector for b\n Eigen::VectorXd b_vector(N);\n for (size_t i = 0; i < N; ++i) {\n b_vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_vector = matrix.ldlt().solve(b_vector);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_vector(i);\n }\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0);\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] += element.value * b[element.column];\n }\n }\n }\n\n // Gather the results to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the local contribution of each rank to the solution\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n x[e.row] += e.value * b[e.column];\n }\n\n // Reduce the local contributions to the global solution on rank 0\n MPI_Reduce(x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix A_sparse(N, N);\n for (auto const& element : A) {\n A_sparse.insert(element.row, 
element.column) = element.value;\n }\n\n // Create dense vectors for x and b\n Eigen::VectorXd x_dense(N), b_dense(N);\n for (size_t i = 0; i < N; ++i) {\n x_dense[i] = x[i];\n b_dense[i] = b[i];\n }\n\n // Solve the linear system\n Eigen::SimplicialLDLT> solver;\n solver.compute(A_sparse);\n x_dense = solver.solve(b_dense);\n\n // Copy the result back to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_dense[i];\n }\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(rowSums.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n\n // Gather x from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Perform Gauss-Jordan elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i].value) > std::abs(A[pivot_row * N + i].value)) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j].value, A[pivot_row * N + j].value);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n double pivot = A[i * N + i].value;\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j].value /= pivot;\n }\n b[i] /= pivot;\n for (size_t j = 0; j < N; ++j) {\n if (j != i) {\n double factor = A[j * N + i].value;\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k].value -= factor * A[i * N + k].value;\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(Kokkos::View &A, Kokkos::View &L, Kokkos::View &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Iterate over the rows of A\n Kokkos::parallel_for(A.extent(0), KOKKOS_LAMBDA(size_t i) {\n // Get the row of A\n const COOElement* row = &A(i);\n\n // Iterate over the columns of the row\n for (size_t j = 0; j < row->column; ++j) {\n // Get the value of the element\n double value = row->value;\n\n // Update L and U\n for (size_t k = 0; k < i; ++k) {\n L(i, k) -= value * U(k, j);\n }\n U(i, j) = value;\n }\n });\n}\n", + "\n\n // Create a parallel for loop to iterate over the rows of A\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n // Initialize the ith row of L and U to the identity matrix\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n\n // Iterate over the non-zero elements in the ith row of A\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n // Update the ith row of L and U\n L(i, A(j).column) = A(j).value;\n U(i, A(j).column) = A(j).value;\n\n // Iterate over the rows below the ith row\n for (size_t k = i + 1; k < N; ++k) {\n // Update the kth row of L and U\n L(k, A(j).column) -= A(j).value * L(k, i);\n U(k, A(j).column) -= A(j).value * U(k, i);\n }\n }\n }\n });\n}\n", + "\n\n // Initialize L and U\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = A(i, i).value;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n L(i, k) = A(i, k).value / U(k, k);\n for (size_t j = k + 1; j < N; ++j) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Iterate over the rows of A\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n // Iterate over the columns of A\n for (size_t j = 0; j < N; j++) {\n // If A(i, j) is non-zero, then update L and U\n if (A(i, j).value != 0.0) {\n // Update L\n L(i, j) = A(i, j).value / U(j, j);\n\n // Update U\n for (size_t k = j + 1; k < N; k++) {\n U(j, k) -= L(i, j) * A(i, k).value;\n }\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 
0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy>(0, N), KOKKOS_LAMBDA(const int &i, const int &j) {\n if (i == j) {\n L(i, j) = 1.0;\n U(i, j) = A(i, j);\n } else if (i > j) {\n L(i, j) = A(i, j) / U(j, j);\n U(i, j) = 0.0;\n } else {\n U(i, j) = A(i, j) / L(i, i);\n L(i, j) = 0.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n L(j, i) -= L(j, k) * U(k, i);\n }\n L(j, i) /= U(i, i);\n }\n });\n}\n", + "\n\n // Create a copy of A\n auto Acopy = KokKOS::make_view(A);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n Acopy(j, i) -= Acopy(j, i) * Acopy(i, i);\n }\n }\n\n // Store L and U\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n L(i, j) = 1.0;\n } else if (i < j) {\n L(i, j) = Acopy(i, j);\n } else {\n U(i, j) = Acopy(i, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0.0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n L(i, j) -= L(i, k) * U(k, j);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n U(i, j) /= L(i, i);\n }\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n U(i, j) += A(i, j).value;\n } else {\n L(i, j) += A(i, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0;\n U(i, j) = 0;\n }\n L(i, i) = 1;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), 
KOKKOS_LAMBDA(const Kokkos::TeamPolicy::member_type &team) {\n const size_t i = team.league_rank();\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < i; k++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n if (i == j) {\n L(i, i) = 1 / U(i, i);\n } else {\n L(i, j) = -U(i, j) * L(j, j);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L(i, k) = A(i, k) / U(k, k);\n for (size_t j = k + 1; j < N; j++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA(const size_t k) {\n const auto &e = A(k);\n const double a_kk = U(e.row, e.row);\n const double l_ik = L(e.column, e.row);\n U(e.row, e.column) = e.value - l_ik * a_kk;\n for (size_t i = e.row + 1; i < N; i++) {\n const double l_ik = L(e.column, e.row);\n const double a_ki = U(e.row, i);\n L(i, e.column) = (e.value - a_ki * a_kk) / l_ik;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t row) {\n double sumL = 0.0;\n double sumU = 0.0;\n for (size_t i = 0; i < row; i++) {\n sumL += L(row, i) * U(i, row);\n sumU += L(i, row) * U(row, i);\n }\n L(row, row) = 1.0;\n U(row, row) = A(row, row) - sumU;\n for (size_t col = row + 1; col < N; col++) {\n L(row, col) = (A(row, col) - sumL) / U(row, row);\n U(row, col) = A(row, col) - sumU;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA(const int& k) {\n const COOElement& element = A(k);\n const size_t i = element.row;\n const size_t j = element.column;\n const double value = element.value;\n U(i, j) = value;\n for (size_t p = 0; p < i; p++) {\n L(i, p) += value * L(j, p);\n U(j, p) -= value * U(i, p);\n }\n L(i, i) -= value * L(j, i);\n U(j, j) += value * U(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; ++j) {\n double sum = 0;\n for (size_t k = 0; k < j; ++k) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n 
Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U using the k-th row of A\n for (size_t i = k + 1; i < N; i++) {\n L(i, k) = A(i, k) / U(k, k);\n for (size_t j = k + 1; j < N; j++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sumL = 0.0, sumU = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization in parallel\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k].value / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * U[i * N + k];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n U[i * N + j] = A[i * N + j] - L[i * N + k] * U[k 
* N + j];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n #pragma opm parallel for\n for (size_t k = 0; k < A.size(); ++k) {\n size_t row = A[k].row;\n size_t column = A[k].column;\n double value = A[k].value;\n #pragma opm atomic\n L[row * N + column] += value;\n #pragma opm atomic\n U[column * N + row] += value;\n }\n #pragma opm parallel for\n for (size_t i = 0; i < N; ++i) {\n #pragma opm atomic\n L[i * N + i] = 1;\n }\n}\n", + "<=====,", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n if (row == column)\n L[row * N + column] = 1;\n if (row < column)\n U[row * N + column] = value;\n if (row > column)\n L[row * N + column] = value;\n }\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = 0; i < k; ++i) {\n U[k * N + j] -= L[k * N + i] * U[i * N + j];\n }\n }\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = 0; j < k; ++j) {\n L[i * N + k] -= U[i * N + j] * L[j * N + k];\n }\n L[i * N + k] /= U[k * N + k];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement &e = A[i];\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1.0;\n U[e.row * N + e.column] = e.value;\n } else if (e.row < e.column) {\n L[e.row * N + e.column] = e.value / U[e.row * N + e.row];\n U[e.column * N + e.row] = 0.0;\n } else {\n U[e.row * N + e.column] = e.value;\n L[e.column * N + e.row] = 0.0;\n }\n }\n\n #pragma omp parallel for\n for (size_t k = 1; k < N; k++) {\n for (size_t i = k; i < N; i++) {\n for (size_t j = 0; j < k; j++) {\n U[i * N + k] -= L[i * N + j] * U[j * N + k];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n L[k * N + j] -= L[k * N + k] * U[k * N + j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sumL = 0.0, sumU = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n U[i * N + j] = A[i * N + j] - L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L[i * N + k] * U[k * N + j];\n }\n if (i == j) {\n L[i * N + i] = 1;\n U[i * N + i] = A[i].value - sum;\n } else {\n L[i * N + j] = (A[i].value - sum) / U[j * N + j];\n U[i * N + j] = 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel 
for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform the factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n }\n\n // Update U\n for (size_t j = k + 1; j < N; j++) {\n for (size_t i = k + 1; i < N; i++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n A[i * N + j] -= U[k * N + j] * A[i * N + k];\n L[i * N + j] = A[i * N + j] / L[k * N + k];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n double sum = 0;\n for (size_t k = 0; k < j; ++k) {\n sum += L[i][k] * U[k][j];\n }\n L[i][j] = (A[i][j] - sum) / U[j][j];\n }\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[i][k] * U[k][i];\n }\n U[i][i] = A[i][i] - sum;\n for (size_t j = i + 1; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[j][k] * U[k][j];\n }\n U[i][j] = (A[i][j] - sum) / L[i][i];\n }\n }\n\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n A[i * N + j] -= U[k * N + j] * A[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + 
"do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.emplace_back(e.row, e.column, 1);\n U.emplace_back(e.row, e.column, e.value);\n } else {\n L.emplace_back(e.row, e.column, e.value / rowSum[e.row]);\n U.emplace_back(e.row, e.column, 0);\n }\n }\n for (size_t i = 1; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n for (size_t k = 0; k < N; k++) {\n U[i * N + k] -= L[i * N + j] * U[j * N + k];\n }\n }\n }\n}\n", + "\n\n std::vector row(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1;\n U[e.row * N + e.column] = e.value;\n } else {\n row[e.column] = e.value / U[e.row * N + e.row];\n for (size_t i = e.column; i < N; ++i) {\n L[e.row * N + i] = row[i];\n U[e.row * N + i] -= row[i] * U[e.column * N + i];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& e : A) {\n L[e.row * N + e.column] = (e.row == e.column) ? 1 : 0;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = 0; i < k; ++i) {\n U[k * N + j] -= L[k * N + i] * U[i * N + j];\n }\n L[j * N + k] = U[j * N + k] / U[k * N + k];\n for (size_t i = k + 1; i < N; ++i) {\n U[j * N + i] -= L[j * N + k] * U[k * N + i];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& element : A) {\n L[element.row * N + element.column] = element.value;\n U[element.row * N + element.column] = element.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (L[i * N + j] != 0) {\n double factor = L[i * N + j] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= factor * L[i * N + k];\n U[i * N + k] -= factor * U[j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n L[i * N + j] = 1;\n } else if (i > j) {\n L[i * N + j] = L_temp[i][j] / L_temp[j][j];\n for (size_t k = 0; k < N; k++) {\n U_temp[i][k] -= L[i * N + j] * U_temp[j][k];\n }\n } else {\n U[i * N + j] = U_temp[i][j];\n }\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.push_back(1);\n U.push_back(e.value / rowSum[e.row]);\n } else if (e.row < e.column) {\n L.push_back(0);\n U.push_back(e.value / rowSum[e.row]);\n } else {\n L.push_back(e.value / rowSum[e.column]);\n U.push_back(0);\n }\n }\n}\n", + "\n\n // Initialize L and U to be identity matrices\n L.resize(N * N);\n U.resize(N * N);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n U[i * N + i] = 1;\n }\n\n // Iterate over the elements of A\n for (auto const& element : A) {\n // Update L and U\n for (size_t i = 0; i < element.row; i++) {\n L[element.row * N + i] -= L[element.column * N + i] * element.value;\n }\n for (size_t j = element.column + 1; j < N; j++) {\n U[element.row * N + j] -= U[element.column * N + j] * element.value;\n }\n }\n}\n", + "\n\n std::vector> row_ptr(N + 1);\n std::vector col_idx(A.size());\n 
std::vector values(A.size());\n for (size_t i = 0; i < A.size(); i++) {\n row_ptr[A[i].row + 1]++;\n col_idx[i] = A[i].column;\n values[i] = A[i].value;\n }\n for (size_t i = 1; i <= N; i++) {\n row_ptr[i] += row_ptr[i - 1];\n }\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < A.size(); i++) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double value = A[i].value;\n U[row_ptr[row] + col - 1] = value;\n for (size_t j = row_ptr[row]; j < row_ptr[row + 1]; j++) {\n if (col_idx[j] < col) {\n L[row_ptr[col] + col_idx[j] - 1] += value * U[row_ptr[row] + col_idx[j] - 1];\n } else if (col_idx[j] > col) {\n U[row_ptr[row] + col_idx[j] - 1] -= value * L[row_ptr[col] + col_idx[j] - 1];\n }\n }\n }\n for (size_t i = 0; i < N; i++) {\n L[row_ptr[i] + i - 1] = 1;\n }\n}\n", + "\n\n std::vector L_row, L_col, U_row, U_col;\n std::vector L_val, U_val;\n L_row.reserve(A.size());\n L_col.reserve(A.size());\n L_val.reserve(A.size());\n U_row.reserve(A.size());\n U_col.reserve(A.size());\n U_val.reserve(A.size());\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < A.size(); k++) {\n if (A[k].row == i && A[k].column == j) {\n sumU = A[k].value;\n U_row.push_back(i);\n U_col.push_back(j);\n U_val.push_back(sumU);\n }\n if (A[k].row == i && A[k].column < j) {\n sumL += A[k].value;\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(sumL);\n }\n if (A[k].row < i && A[k].column == j) {\n sumL += A[k].value;\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(sumL);\n }\n }\n if (sumU == 0) {\n U_row.push_back(i);\n U_col.push_back(j);\n U_val.push_back(0);\n }\n if (sumL == 0) {\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(0);\n }\n }\n }\n L.resize(L_row.size());\n U.resize(U_row.size());\n for (size_t i = 0; i < L_row.size(); i++) {\n L[i].row = L_row[i];\n L[i].column = L_col[i];\n L[i].value = L_val[i];\n }\n for (size_t i = 0; i < U_row.size(); i++) {\n U[i].row = U_row[i];\n U[i].column = U_col[i];\n U[i].value = U_val[i];\n }\n}\n", + "\n\n std::vector> L_matrix(N, std::vector(N, 0.0));\n std::vector> U_matrix(N, std::vector(N, 0.0));\n\n for (auto const& element : A) {\n L_matrix[element.row][element.column] = element.value;\n U_matrix[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n L_matrix[i][j] = 1.0;\n } else if (i > j) {\n L_matrix[i][j] = 0.0;\n } else {\n U_matrix[i][j] = 0.0;\n }\n }\n }\n\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n double factor = L_matrix[i][k] / L_matrix[k][k];\n for (size_t j = k; j < N; ++j) {\n L_matrix[i][j] -= factor * U_matrix[k][j];\n U_matrix[i][j] -= factor * L_matrix[k][j];\n }\n }\n }\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n L[i * N + j] = L_matrix[i][j];\n U[i * N + j] = U_matrix[i][j];\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n\n for (size_t i = 0; i < A.size(); i++) {\n L[A[i].row * N + A[i].column] = A[i].value;\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = L[i * N + k] / L[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] = U[i * N + j] - L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector row(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) 
{\n L.emplace_back(COOElement{e.row, e.column, 1});\n row[e.column] = e.value;\n } else {\n U.emplace_back(COOElement{e.row, e.column, e.value});\n }\n }\n for (auto const& e : A) {\n if (e.row != e.column) {\n L.emplace_back(COOElement{e.row, e.column, e.value / row[e.column]});\n for (auto& u : U) {\n if (u.row == e.row && u.column == e.column) {\n u.value -= e.value * L.back().value;\n }\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n U[row * N + column] = value;\n for (size_t j = 0; j < row; ++j) {\n L[row * N + j] -= L[i * N + j] * value / U[j * N + j];\n }\n for (size_t j = row + 1; j < N; ++j) {\n U[row * N + j] -= L[i * N + j] * value / U[j * N + j];\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n for (size_t i = 0; i < N; ++i) {\n L_temp[i][i] = 1;\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n L_temp[i][k] = L_temp[i][k] / L_temp[k][k];\n for (size_t j = k + 1; j < N; ++j) {\n U_temp[k][j] = U_temp[k][j] - L_temp[i][k] * U_temp[i][j];\n }\n }\n }\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n L.push_back(L_temp[i][j]);\n U.push_back(U_temp[i][j]);\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n rowSum[e.row] += e.value;\n }\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.push_back({e.row, e.column, 1});\n U.push_back({e.row, e.column, e.value / rowSum[e.row]});\n } else if (e.row < e.column) {\n L.push_back({e.row, e.column, e.value / rowSum[e.row]});\n U.push_back({e.row, e.column, 0});\n } else {\n L.push_back({e.row, e.column, 0});\n U.push_back({e.row, e.column, e.value});\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n L_temp[i][j] = 1;\n } else if (i > j) {\n L_temp[i][j] = 0;\n }\n }\n }\n\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n double factor = U_temp[i][k] / U_temp[k][k];\n L_temp[i][k] = factor;\n for (size_t j = k; j < N; j++) {\n U_temp[i][j] -= factor * U_temp[k][j];\n }\n }\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n L.push_back(L_temp[i][j]);\n U.push_back(U_temp[i][j]);\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto &e : A) {\n L[e.row * N + e.column] = e.value;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (L[j * N + i] != 0) {\n double factor = L[j * N + i] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= factor * L[i * N + k];\n U[i * N + k] -= factor * U[j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n std::vector rowSum(N, 0);\n for (auto const& element : A) {\n rowSum[element.row] += element.value;\n }\n\n for (auto const& element : A) {\n if (element.row == 
element.column) {\n L.push_back(1);\n U.push_back(element.value / rowSum[element.row]);\n } else if (element.row < element.column) {\n L.push_back(0);\n U.push_back(element.value / rowSum[element.row]);\n } else {\n L.push_back(element.value / rowSum[element.column]);\n U.push_back(0);\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& element : A) {\n L[element.row * N + element.column] = element.value;\n U[element.row * N + element.column] = element.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = L[j * N + i] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= multiplier * L[i * N + k];\n U[i * N + k] -= multiplier * U[j * N + k];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (auto const& e : A) {\n L[e.row * N + e.column] = e.value;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n L[j * N + i] = U[j * N + i] / U[i * N + i];\n for (size_t k = i; k < N; k++) {\n U[j * N + k] -= L[j * N + i] * U[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format with sizeA elements.\n Use AMD HIP to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\n__global__ void luFactorize(const COOElement *A, size_t sizeA, double *L, double *U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[row * N + col];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n COOElement element = A[tid];\n int row = element.row;\n int col = element.column;\n double val = element.value;\n\n if (row == col) {\n L[row * N + row] = 1.0;\n U[row * N + col] = val;\n } else if (row < col) {\n L[row * N + col] = val / U[col * N + col];\n U[row * N + col] = 0.0;\n } else {\n U[row * N + col] -= L[row * N + row] * val;\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x;\n int start = tid + bid * num_threads;\n for (int i = start; i < sizeA; i += bid_size * num_threads) {\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = A[i].value / U[A[i].column];\n U[i] = 0.0;\n }\n }\n __syncthreads();\n for (int i = start; i < sizeA; i += bid_size * num_threads) {\n if (A[i].row > A[i].column) {\n U[i] -= L[i] * U[A[i].column];\n } else {\n L[i] -= L[i] * U[A[i].column];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n COOElement e = A[i];\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1.0;\n U[e.row * N + e.column] = e.value;\n for (int k = e.row + 1; k < N; k++) {\n L[k * N + e.column] = A[i].value / 
e.value;\n U[e.row * N + k] -= L[k * N + e.column] * e.value;\n }\n }\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1;\n } else {\n U[row * N + column] = value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n for (size_t i = tid; i < sizeA; i += stride) {\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = 0.0;\n U[i] = 0.0;\n }\n }\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n for (size_t j = tid; j < sizeA; j += stride) {\n if (A[j].row == i && A[j].column == k) {\n L[j] = A[j].value / U[k];\n U[j] = 0.0;\n }\n }\n for (size_t j = tid; j < sizeA; j += stride) {\n if (A[j].row == i && A[j].column > k) {\n U[j] -= L[j] * U[A[j].column];\n }\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else {\n L[tid] = 0.0;\n U[tid] = 0.0;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t i = A[tid].row;\n size_t j = A[tid].column;\n double v = A[tid].value;\n\n if (i == j) {\n L[i * N + i] = 1.0;\n U[i * N + j] = v;\n } else if (i < j) {\n L[i * N + j] = v;\n } else {\n U[i * N + j] = v;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[col * N + row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n\n if (row == column) {\n L[row * N + row] = 1.0;\n U[row * N + column] = value;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n U[row * N + column] = 0.0;\n } else {\n U[row * N + column] -= L[row * N + row] * value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n if (A[tid].row == A[tid].column) {\n L[tid] = 1;\n U[tid] = A[tid].value;\n } else if (A[tid].row < A[tid].column) {\n L[tid] = 0;\n U[tid] = A[tid].value;\n } else {\n L[tid] = A[tid].value / U[A[tid].column];\n U[tid] = 0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n const COOElement &a = A[tid];\n if (a.row == a.column) {\n L[a.row * N + a.column] = 1.0;\n U[a.row * N + a.column] = a.value;\n } else if (a.row < a.column) {\n L[a.row * N + a.column] = 0.0;\n U[a.row * N + a.column] = a.value;\n } else {\n L[a.row * N + a.column] = a.value / U[a.column * N + a.column];\n U[a.row * N + a.column] = 0.0;\n for (int k = a.column + 1; k < a.row; k++) {\n U[a.row * N + k] -= L[a.row * N + a.column] * U[a.column * N + k];\n }\n }\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n size_t col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row].value;\n } else if (row < col) {\n L[row * N + col] = 0;\n U[row * N + col] = A[row].value;\n } else {\n L[row * N + col] = A[row].value / U[col * N + col];\n U[row * N + col] = 0;\n }\n }\n}\n", + "\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= sizeA) return;\n if (A[i].row == A[i].column) {\n L[i] 
= 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = 0.0;\n U[i] = 0.0;\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n U[row * N + col] = 0.0;\n }\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column == col) {\n U[row * N + col] = A[i].value;\n } else if (A[i].row == row) {\n L[row * N + col] -= A[i].value * U[A[i].column * N + col];\n } else if (A[i].column == col) {\n U[row * N + col] -= A[i].value * L[row * N + A[i].row];\n }\n }\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n }\n U[row * N + col] = A[row * N + col].value;\n }\n __syncthreads();\n for (int k = 0; k < row; k++) {\n if (U[k * N + col] != 0.0) {\n L[row * N + k] = A[row * N + k].value / U[k * N + col];\n for (int j = col; j < N; j++) {\n U[row * N + j] -= L[row * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n if (A[tid].row == A[tid].column) {\n L[tid * N + A[tid].column] = 1;\n U[tid * N + A[tid].column] = A[tid].value;\n } else {\n L[tid * N + A[tid].column] = 0;\n U[tid * N + A[tid].column] = A[tid].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int i = A[tid].row;\n int j = A[tid].column;\n double v = A[tid].value;\n if (i == j) {\n L[i * N + i] = 1;\n U[i * N + i] = v;\n } else if (i < j) {\n L[i * N + j] = v / U[j * N + j];\n U[i * N + j] = 0;\n } else {\n U[i * N + j] -= L[i * N + i] * v;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) {\n return;\n }\n const COOElement &a = A[tid];\n if (a.row == a.column) {\n L[a.row * N + a.column] = 1.0;\n U[a.row * N + a.column] = a.value;\n } else if (a.row > a.column) {\n L[a.row * N + a.column] = a.value / U[a.column * N + a.column];\n U[a.row * N + a.column] = 0.0;\n } else {\n U[a.row * N + a.column] = a.value;\n L[a.row * N + a.column] = 0.0;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else if (row < column) {\n L[tid] = 0.0;\n U[tid] = value;\n } else {\n L[tid] = value / U[column * N + column];\n U[tid] = 0.0;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
Store the result in L and U on rank 0.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Find the pivot element in column k\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k * N + k].value);\n for (size_t i = k + 1; i < N; ++i) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows k and pivot_row in L and U\n for (size_t j = 0; j < N; ++j) {\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n std::swap(U[k * N + j], U[pivot_row * N + j]);\n }\n\n // Update the remaining rows in column k\n for (size_t i = k + 1; i < N; ++i) {\n double multiplier = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j].value -= multiplier * A[k * N + j].value;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector L_recv(N * N, 0.0);\n std::vector U_recv(N * N, 0.0);\n MPI::COMM_WORLD.Recv(L_recv.data(), N * N, MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), N * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; ++j) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), N * N, MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector L_recv, U_recv;\n MPI::COMM_WORLD.Recv(L_recv.data(), L_recv.size(), MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), U_recv.size(), MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; j++) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), L.size(), MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), U.size(), MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Compute L and U in parallel\n for (size_t i = 0; i < N; ++i) {\n // Compute L(i, :)\n for (size_t j = 0; j < i; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < j; ++k) {\n sum += L[i * N + k] * U[k * N + j];\n }\n L[i * N + j] = (A[i].value - sum) / U[j * N + j];\n }\n\n // Compute U(i, :)\n for (size_t j = i; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[i * N + k] * U[k * N + j];\n }\n U[i * N + j] = A[i].value - sum;\n }\n }\n\n // Gather L and U on rank 0\n if (MPI::COMM_WORLD.Get_rank() 
== 0) {\n MPI::COMM_WORLD.Gather(&L[0], N * N, MPI::DOUBLE, &L[0], N * N, MPI::DOUBLE, 0);\n MPI::COMM_WORLD.Gather(&U[0], N * N, MPI::DOUBLE, &U[0], N * N, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j].value, A[pivot_row * N + j].value);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Eliminate the elements below the pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = factor;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= factor * A[k * N + j].value;\n }\n }\n }\n\n // Copy the upper triangular matrix to U\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i; j < N; j++) {\n U[i * N + j] = A[i * N + j].value;\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A and store the result in L\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k; j < N; ++j) {\n L[i * N + k] -= U[i * N + j] * U[k * N + j];\n }\n }\n\n // For each column j > k, subtract the k-th column of L from the j-th column of A and store the result in U\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = k; i < N; ++i) {\n U[k * N + j] -= L[i * N + k] * U[i * N + j];\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A\n for (size_t i = k + 1; i < N; i++) {\n double factor = U[i * N + k] / U[k * N + k];\n for (size_t j = k; j < N; j++) {\n U[i * N + j] -= factor * U[k * N + j];\n }\n L[i * N + k] = factor;\n }\n }\n\n // Gather the L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n std::vector L(A);\n std::vector U(A);\n\n // Perform Gaussian elimination\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the pivot row\n MPI_Bcast(&A[k], sizeof(COOElement) * A[k].size(), MPI_BYTE, k, 
MPI_COMM_WORLD);\n\n // Eliminate the pivot column in the remaining rows\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n double factor = A[i][k].value / A[k][k].value;\n for (size_t j = k; j < N; ++j) {\n A[i][j].value -= factor * A[k][j].value;\n }\n }\n\n // Update L and U\n L[k].value = 1.0;\n for (size_t j = k + 1; j < N; ++j) {\n L[j].value = A[j][k].value;\n U[k].value = A[k][j].value;\n }\n }\n\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A\n for (size_t i = k + 1; i < N; ++i) {\n double factor = U[i * N + k] / U[k * N + k];\n for (size_t j = k; j < N; ++j) {\n U[i * N + j] -= factor * U[k * N + j];\n }\n L[i * N + k] = factor;\n }\n }\n\n // Gather the L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot row\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (A[i * N + k] > A[pivot_row * N + k]) {\n pivot_row = i;\n }\n }\n\n // Swap the pivot row with the k-th row in L and U\n for (size_t j = 0; j < N; j++) {\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n std::swap(U[k * N + j], U[pivot_row * N + j]);\n }\n\n // Eliminate the k-th column in the remaining rows\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n L[i * N + k] = factor;\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(A.size());\n U.resize(A.size());\n for (size_t i = 0; i < A.size(); ++i) {\n L[i].row = A[i].row;\n L[i].column = A[i].column;\n L[i].value = 0;\n U[i].row = A[i].row;\n U[i].column = A[i].column;\n U[i].value = 
0;\n }\n\n // Perform LU factorization\n for (size_t i = 0; i < N; ++i) {\n // Update L\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i && A[j].column < i) {\n L[j].value = A[j].value / U[j].value;\n }\n }\n\n // Update U\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row < i && A[j].column == i) {\n U[j].value = A[j].value / L[j].value;\n }\n }\n\n // Update A\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row >= i && A[j].column >= i) {\n A[j].value -= L[j].value * U[j].value;\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recvL(A.size());\n std::vector recvU(A.size());\n MPI::COMM_WORLD.Recv(recvL.data(), A.size(), MPI_DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(recvU.data(), A.size(), MPI_DOUBLE, i, 0);\n for (size_t j = 0; j < A.size(); ++j) {\n L[j].value += recvL[j].value;\n U[j].value += recvU[j].value;\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), A.size(), MPI_DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), A.size(), MPI_DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < A.size(); i++) {\n if (std::abs(A[i].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i].value);\n }\n }\n\n // Swap the pivot row with the k-th row in A\n std::swap(A[k], A[pivot_row]);\n\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n double multiplier = A[i].value / A[k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; j++) {\n U[k * N + j] -= multiplier * A[i].value;\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n\n // Initialize the diagonal of L to 1\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Iterate over the rows of A\n for (size_t k = 0; k < A.size(); k++) {\n // Get the row, column, and value of the current element\n size_t row = A[k].row;\n size_t column = A[k].column;\n double value = A[k].value;\n\n // Update the elements of L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = value / U[k * N + k];\n U[i * N + column] -= L[i * N + k] * U[k * N + column];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i > k\n for (size_t i = k + 1; i < N; i++) {\n // Calculate the multiplier\n double multiplier = A[i].value / U[k * N + k];\n\n // Update the i-th row of L\n for (size_t j = 0; j < N; j++) {\n L[i * N + j] += multiplier * U[k * N + j];\n }\n\n // Update the i-th row of U\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= multiplier * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform 
LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = A[k].value;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > std::abs(pivot_value)) {\n pivot_row = i;\n pivot_value = A[i * N + k].value;\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j].value, A[pivot_row * N + j].value);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Eliminate the elements below the pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k].value / pivot_value;\n L[i * N + k] = factor;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= factor * A[k * N + j].value;\n U[i * N + j] = A[i * N + j].value;\n }\n }\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector L_recv(N * N, 0.0);\n std::vector U_recv(N * N, 0.0);\n MPI::COMM_WORLD.Recv(L_recv.data(), N * N, MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), N * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; j++) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), N * N, MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each rank, update the k-th row of L and the remaining rows of U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n\n // Gather the updated L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each rank, update the k-th row of L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "<,", + "_t* 0." + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format with sizeA elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\n__global__ void luFactorize(const COOElement *A, size_t sizeA, double *L, double *U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[row * N + col].value;\n for (int k = 0; k < row; k++) {\n U[row * N + col] -= L[row * N + k] * U[k * N + col];\n }\n L[row * N + col] = U[row * N + col] / U[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < i; j++) {\n sum += L[row * N + j] * U[j * N + A[i].column];\n }\n U[row * N + A[i].column] = A[i].value - sum;\n }\n if (A[i].column == row) {\n L[row * N + A[i].row] = A[i].value;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else if (row < column) {\n L[tid] = 0.0;\n U[tid] = value;\n } else {\n L[tid] = value / U[column * N + column];\n U[tid] = 0.0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + column] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) {\n return;\n }\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n }\n }\n}\n", + "\n\n int row = block.y * block.x + thread.y * N + thread.x;\n if (row < sizeA) {\n if (A[row].row == A[row].column) {\n L[A[row].row * N + A[row].column] = 1.0;\n U[A[row].row * N + A[row].column] = A[row].value;\n } else {\n L[A[row].row * N + A[row].column] = A[row].value / U[A[row].column * N + A[row].column];\n U[A[row].row * N + A[row].column] = 0.0;\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n double sum = 0.0;\n for (int k = col; k < N; k++) {\n sum += L[row * N + k] * U[k * N + col];\n }\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < 
sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row < col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n for (size_t k = col + 1; k < N; k++) {\n U[row * N + k] -= L[row * N + col] * U[col * N + k];\n }\n }\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n double sum = 0.0;\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row && A[k].column == col) {\n sum += A[k].value * U[col * N + A[k].column];\n }\n }\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row >= N || col >= N) return;\n\n double sum = 0;\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row && A[k].column < col) {\n sum += L[row * N + A[k].column] * U[A[k].column * N + col];\n }\n }\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row * N + col] - sum;\n } else {\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n U[row * N + col] = 0;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n }\n U[row * N + col] = A[row * N + col].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n } else if (row < column) {\n L[tid] = 0.0;\n } else {\n U[tid] = value / L[column * N + column];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0;\n U[row * N + col] = A[row * N + 
col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n U[row * N + column] = value;\n } else if (row < column) {\n L[row * N + column] = value;\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < sizeA; ++i) {\n const auto row = A[i].row;\n const auto column = A[i].column;\n const auto value = A[i].value;\n if (row == column) {\n L[row * N + column] = 1;\n } else if (row < column) {\n U[row * N + column] = value;\n } else {\n L[row * N + column] = value;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. Store the result in L and U on rank 0.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n 
// Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Initialize the diagonal elements of L to 1\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update the k-th row of L and U\n #pragma omp parallel for\n for (size_t j = k + 1; j < N; j++) {\n L[k * N + j] = A[k].value / U[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n U[i * N + j] -= L[k * N + j] * A[i].value;\n }\n }\n\n // Update the remaining rows of L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n for (size_t j = k + 1; j < N; j++) {\n L[i * N + j] -= L[i * N + k] * L[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Factorize A\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n 
L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n A[i].value -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Update U\n #pragma omp parallel for\n for (size_t j = k + 1; j < N; ++j) {\n U[k * N + j] = A[k].value / L[k * N + k];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; ++j) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform Gauss elimination\n for (size_t k = 0; k < N; ++k) {\n // Rank 0 broadcasts the k-th row of U to all other ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 updates the k-th row of L\n for (size_t j = k + 1; j < N; ++j) {\n L[k * N + j] = A[j].value / U[k * N + k];\n }\n\n // All ranks update 
the remaining rows of U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[k * N + j] * U[k * N + i];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k * N + k].value);\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n double multiplier = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= multiplier * A[k * N + j].value;\n U[k * N + j].value += multiplier * A[i * N + j].value;\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < N; ++i) {\n if (std::abs(A[i].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i].value);\n }\n }\n\n // Swap the pivot row with the k-th row in A, L, and U\n if (pivot_row != k) {\n std::swap(A[k], A[pivot_row]);\n for (size_t i = 0; i < N; ++i) {\n std::swap(L[k * N + i], L[pivot_row * N + i]);\n std::swap(U[k * N + i], U[pivot_row * N + i]);\n }\n }\n\n // Eliminate the elements below the pivot element\n for (size_t i = k + 1; i < N; ++i) {\n double multiplier = A[i].value / A[k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; ++j) {\n A[i].value -= multiplier * A[k].value;\n U[i * N + j] -= multiplier * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 
0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, Kokkos::View &x, Kokkos::View &y, Kokkos::View &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x[j].index == i) sum += alpha * x[j].value;\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y[j].index == i) sum += y[j].value;\n }\n z[i] = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA (const int i) {\n z[i] = alpha * x[i].value + y[i].value;\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(\"sparse_py\", 0, z.extent(0), KOKKOK_LAMBDA(size_t i) {\n z[i] = alpha * x[i].value + y[i].value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n sum += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n sum += y[j].value;\n }\n }\n z[i] = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) sum += alpha * x(j).value;\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) sum += y(j).value;\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n // Create a Kokkos kernel to perform the sparse axpy operation\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum 
= 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x(j).index == i) sum += alpha * x(j).value;\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y(j).index == i) sum += y(j).value;\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy<>(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i, double &sum) {\n sum += alpha * x(i).value + y(i).value;\n }, z(0));\n}\n", + "\n\n const size_t num_elements_x = x.extent(0);\n const size_t num_elements_y = y.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, num_elements_x + num_elements_y),\n KOKKOS_LAMBDA(const size_t i) {\n if (i < num_elements_x) {\n const size_t index = x(i).index;\n z(index) += alpha * x(i).value;\n } else {\n const size_t index = y(i - num_elements_x).index;\n z(index) += y(i - num_elements_x).value;\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i, double &sum) {\n sum += alpha * x(i).value + y(i).value;\n }, z(0));\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. 
Store the result in z.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t n = x.size();\n size_t m = y.size();\n z.resize(std::max(n, m));\n #pragma omp parallel for\n for (size_t i = 0; i < std::max(n, m); i++) {\n z[i] = 0;\n }\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n z[x[i].index] += alpha * x[i].value;\n }\n #pragma omp parallel for\n for (size_t i = 0; i < m; i++) {\n z[y[i].index] += y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n 
}\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "serial", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. 
Store the result in z.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0, j = 0; i < x.size() || j < y.size();) {\n if (j == y.size() || (i < x.size() && x[i].index < y[j].index)) {\n z[x[i].index] += alpha * x[i].value;\n i++;\n } else if (i == x.size() || (j < y.size() && y[j].index < x[i].index)) {\n z[y[j].index] += y[j].value;\n j++;\n } else {\n z[x[i].index] += alpha * x[i].value + y[j].value;\n i++;\n j++;\n }\n }\n}\n", + "\n\n z.clear();\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z.push_back(alpha * x[i].value);\n i++;\n } else {\n z.push_back(y[j].value);\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(alpha * x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n z.clear();\n z.resize(x.size() + y.size());\n auto itx = x.begin();\n auto ity = y.begin();\n auto itz = z.begin();\n while (itx != x.end() && ity != y.end()) {\n if (itx->index < ity->index) {\n *itz = itx->value;\n ++itx;\n } else if (itx->index > ity->index) {\n *itz = ity->value;\n ++ity;\n } else {\n *itz = alpha * itx->value + ity->value;\n ++itx;\n ++ity;\n }\n ++itz;\n }\n while (itx != x.end()) {\n *itz = itx->value;\n ++itx;\n ++itz;\n }\n while (ity != y.end()) {\n *itz = ity->value;\n ++ity;\n ++itz;\n }\n}\n", + ".", + "\n\n // Initialize z with zeros\n z.assign(z.size(), 0);\n\n // Iterate over x and y\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n // If the indices match, add the values and increment both i and j\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n }\n // If the index of x is smaller, add the value to z and increment i\n else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n // If the index of y is smaller, add the value to z and increment j\n else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n\n // Add the remaining elements of x\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n\n // Add the remaining elements of y\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n z.clear();\n z.resize(x.size() + y.size());\n size_t i = 0, j = 0, k = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z[k].index = x[i].index;\n z[k].value = alpha * x[i].value;\n ++i;\n } else if (x[i].index > y[j].index) {\n z[k].index = y[j].index;\n z[k].value = y[j].value;\n ++j;\n } else {\n z[k].index = x[i].index;\n z[k].value = alpha * x[i].value + y[j].value;\n ++i;\n ++j;\n }\n ++k;\n }\n while (i < x.size()) {\n z[k].index = x[i].index;\n 
z[k].value = alpha * x[i].value;\n ++i;\n ++k;\n }\n while (j < y.size()) {\n z[k].index = y[j].index;\n z[k].value = y[j].value;\n ++j;\n ++k;\n }\n z.resize(k);\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n size_t m = z.size();\n for (size_t i = 0, j = 0, k = 0; i < m; ++i) {\n if (j < x.size() && x[j].index == i) {\n z[i] += alpha * x[j].value;\n ++j;\n }\n if (k < y.size() && y[k].index == i) {\n z[i] += y[k].value;\n ++k;\n }\n }\n}\n", + "\n\n for (size_t i = 0, j = 0; i < x.size() || j < y.size();) {\n if (j == y.size() || (i < x.size() && x[i].index < y[j].index)) {\n z[x[i].index] += alpha * x[i].value;\n ++i;\n } else if (i == x.size() || (j < y.size() && y[j].index < x[i].index)) {\n z[y[j].index] += y[j].value;\n ++j;\n } else {\n z[x[i].index] += alpha * x[i].value + y[j].value;\n ++i;\n ++j;\n }\n }\n}\n", + "\n\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n } else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n i++;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n j++;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n z.clear();\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z.push_back({x[i].index, alpha * x[i].value + y[j].value});\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z.push_back({x[i].index, alpha * x[i].value});\n i++;\n } else {\n z.push_back({y[j].index, y[j].value});\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back({x[i].index, alpha * x[i].value});\n i++;\n }\n while (j < y.size()) {\n z.push_back({y[j].index, y[j].value});\n j++;\n }\n}\n", + "\n\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n } else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n i++;\n } else if (x[i].index > y[j].index) {\n 
z.push_back(y[j].value);\n j++;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n size_t x_size = x.size();\n size_t y_size = y.size();\n size_t z_size = 0;\n size_t x_index = 0;\n size_t y_index = 0;\n while (x_index < x_size && y_index < y_size) {\n if (x[x_index].index == y[y_index].index) {\n z.push_back(alpha * x[x_index].value + y[y_index].value);\n z_size++;\n x_index++;\n y_index++;\n } else if (x[x_index].index < y[y_index].index) {\n z.push_back(alpha * x[x_index].value);\n z_size++;\n x_index++;\n } else {\n z.push_back(y[y_index].value);\n z_size++;\n y_index++;\n }\n }\n while (x_index < x_size) {\n z.push_back(alpha * x[x_index].value);\n z_size++;\n x_index++;\n }\n while (y_index < y_size) {\n z.push_back(y[y_index].value);\n z_size++;\n y_index++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n auto itx = x.begin();\n auto ity = y.begin();\n while (itx != x.end() && ity != y.end()) {\n if (itx->index == ity->index) {\n z.push_back(alpha * itx->value + ity->value);\n ++itx;\n ++ity;\n } else if (itx->index < ity->index) {\n z.push_back(alpha * itx->value);\n ++itx;\n } else {\n z.push_back(ity->value);\n ++ity;\n }\n }\n while (itx != x.end()) {\n z.push_back(alpha * itx->value);\n ++itx;\n }\n while (ity != y.end()) {\n z.push_back(ity->value);\n ++ity;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "hip", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors of size Nx and Ny. Store the result in z.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x or y.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\n__global__ void sparseAxpy(double alpha, const Element *x, const Element *y, double *z, size_t Nx, size_t Ny, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < Nx) {\n size_t index = x[tid].index;\n z[index] = alpha * x[tid].value + y[index];\n } else if (tid < Nx + Ny) {\n size_t index = y[tid - Nx].index;\n z[index] = alpha * 0 + y[tid - Nx].value;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = 0;\n\t\tfor (size_t j = 0; j < Nx; j++) {\n\t\t\tif (x[j].index == i) {\n\t\t\t\tz[i] += alpha * x[j].value;\n\t\t\t}\n\t\t}\n\t\tfor (size_t j = 0; j < Ny; j++) {\n\t\t\tif (y[j].index == i) {\n\t\t\t\tz[i] += y[j].value;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n int i;\n for (i = tid; i < N; i += blockDim.x * gridDim.x) {\n if (i < Nx) {\n z[x[i].index] += alpha * x[i].value;\n }\n if (i < Ny) {\n z[y[i].index] += y[i].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double val = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) val += alpha * x[i].value;\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) val += y[i].value;\n }\n z[tid] = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double val = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n val += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n val += y[i].value;\n }\n }\n z[tid] = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = alpha * x[i].value + y[i].value;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < Nx) {\n\t\tz[x[tid].index] += alpha * x[tid].value;\n\t}\n\telse if (tid < Nx + Ny) {\n\t\tz[y[tid - Nx].index] += y[tid - Nx].value;\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = 
alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) z[i] += alpha * x[j].value;\n }\n for (size_t j = 0; j < N; j++) {\n if (y[j].index == i) z[i] += y[j].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n\tsize_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tdouble sum = 0;\n\t\tfor (size_t i = 0; i < Nx; i++) {\n\t\t\tif (x[i].index == tid) {\n\t\t\t\tsum += alpha * x[i].value;\n\t\t\t}\n\t\t}\n\t\tfor (size_t i = 0; i < Ny; i++) {\n\t\t\tif (y[i].index == tid) {\n\t\t\t\tsum += y[i].value;\n\t\t\t}\n\t\t}\n\t\tz[tid] = sum;\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n \n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& e : x) {\n z_size = std::max(z_size, e.index + 1);\n }\n for (auto const& e : y) {\n z_size = std::max(z_size, e.index + 1);\n }\n \n // Allocate memory for the output vector\n z.resize(z_size);\n \n // Compute the output vector on rank 0\n if (rank == 0) {\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n }\n \n // Broadcast the output vector to all other ranks\n MPI_Bcast(&z[0], z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the partial sums\n std::vector partial_sums(z_size);\n for (auto const& element : x) {\n partial_sums[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_sums[element.index] += element.value;\n }\n\n // Reduce the partial sums\n std::vector global_sums(z_size);\n MPI_Allreduce(partial_sums.data(), global_sums.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (my_rank == 0) {\n z = global_sums;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the size of the result vector\n size_t zSize = 0;\n for (auto const& element : x) {\n zSize = std::max(zSize, element.index + 1);\n }\n for (auto const& element : y) {\n zSize = std::max(zSize, element.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(zSize);\n\n // Compute the result vector\n for (auto const& element : x) {\n z[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n z[element.index] += element.value;\n }\n\n // Gather the result vector on rank 0\n if (myRank == 0) {\n for (int i = 1; i < numRanks; ++i) {\n std::vector recvBuffer;\n MPI_Recv(&recvBuffer[0], recvBuffer.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recvBuffer.size(); ++j) {\n z[j] += recvBuffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n std::vector local_x, local_y;\n \n // Scatter x and y to all ranks\n MPI_Scatter(&x[0], x.size(), MPI_DOUBLE, &local_x[0], local_x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(&y[0], y.size(), MPI_DOUBLE, &local_y[0], local_y.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n \n // Compute local z\n std::vector local_z(z.size(), 0);\n for (size_t i = 0; i < local_x.size(); ++i) {\n local_z[local_x[i].index] += alpha * 
local_x[i].value;\n }\n for (size_t i = 0; i < local_y.size(); ++i) {\n local_z[local_y[i].index] += local_y[i].value;\n }\n \n // Gather local z to rank 0\n MPI_Gather(&local_z[0], local_z.size(), MPI_DOUBLE, &z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of elements in the sparse vector\n size_t num_elements = x.size();\n\n // Create a vector of zeros to store the result\n z.resize(num_elements);\n\n // Loop over the elements in the sparse vector\n for (size_t i = 0; i < num_elements; i++) {\n // Get the index and value of the element\n size_t index = x[i].index;\n double value = x[i].value;\n\n // Add the value of the element to the result\n z[index] += alpha * value;\n }\n\n // Loop over the elements in the sparse vector\n for (size_t i = 0; i < num_elements; i++) {\n // Get the index and value of the element\n size_t index = y[i].index;\n double value = y[i].value;\n\n // Add the value of the element to the result\n z[index] += value;\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements = x.size();\n\n // Create a buffer to store the result on each rank\n std::vector local_z(num_elements, 0.0);\n\n // Compute the local result\n for (size_t i = 0; i < num_elements; ++i) {\n local_z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_z.data(), z.data(), num_elements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n std::vector result;\n\n // Iterate over the elements in x and y\n for (size_t i = 0; i < num_elements_x; i++) {\n // Get the index and value of the current element in x\n size_t index_x = x[i].index;\n double value_x = x[i].value;\n\n // Iterate over the elements in y\n for (size_t j = 0; j < num_elements_y; j++) {\n // Get the index and value of the current element in y\n size_t index_y = y[j].index;\n double value_y = y[j].value;\n\n // If the indices of the current elements in x and y are equal, then add the values of the elements to the result\n if (index_x == index_y) {\n result.push_back({index_x, alpha * value_x + value_y});\n }\n }\n }\n\n // Sort the result vector by index\n std::sort(result.begin(), result.end(), [](Element const& a, Element const& b) { return a.index < b.index; });\n\n // Store the result in z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(result.size());\n for (size_t i = 0; i < result.size(); i++) {\n z[i] = result[i].value;\n }\n }\n}\n", + "\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the output vector\n z.resize(z_size, 0.0);\n\n // Compute the partial sums\n std::vector partial_sums(z_size, 0.0);\n for (auto const& element : x) {\n partial_sums[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_sums[element.index] += element.value;\n }\n\n // Reduce the partial sums\n std::vector global_sums(z_size, 0.0);\n MPI_Allreduce(partial_sums.data(), global_sums.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result in z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t resultSize = 0;\n for (auto const& element : x) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n for (auto const& element : y) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n\n // Create a vector to store the result on rank 0\n std::vector result(resultSize, 0);\n\n // Compute the result on each rank\n for (auto const& elementX : x) {\n result[elementX.index] += alpha * elementX.value;\n }\n for (auto const& elementY : y) {\n result[elementY.index] += elementY.value;\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recvBuffer(resultSize);\n MPI_Recv(recvBuffer.data(), resultSize, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < resultSize; j++) {\n result[j] += recvBuffer[j];\n }\n }\n z = result;\n } else {\n MPI_Send(result.data(), resultSize, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t result_size = 0;\n for (auto const& element : x) {\n result_size = std::max(result_size, element.index + 1);\n }\n for (auto const& element : y) {\n result_size = std::max(result_size, element.index + 1);\n }\n\n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(result_size, 0);\n }\n\n // Compute the partial result on each rank\n std::vector partial_result(result_size, 0);\n for (auto const& element : x) {\n partial_result[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_result[element.index] += element.value;\n }\n\n // Reduce the partial results to rank 0\n MPI_Reduce(partial_result.data(), z.data(), result_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the partial sum on each rank\n for (auto const& elem : x) {\n z[elem.index] += alpha * elem.value;\n }\n for (auto const& elem : y) {\n z[elem.index] += elem.value;\n }\n\n // Reduce the partial sums to rank 0\n MPI_Reduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the number of elements in the result vector\n size_t num_elements = x.size() + y.size();\n \n // Allocate memory for the result vector\n z.resize(num_elements);\n \n // Compute the local indices of the elements in the result vector\n std::vector local_indices(num_elements);\n for (size_t i = 0; i < num_elements; ++i) {\n local_indices[i] = i;\n }\n \n // Compute the local values of the elements in the result vector\n std::vector local_values(num_elements);\n for (size_t i = 0; i < num_elements; ++i) {\n local_values[i] = 0.0;\n }\n \n // Compute the global indices of the elements in the result vector\n std::vector global_indices(num_elements);\n MPI_Allgatherv(&local_indices[0], local_indices.size(), MPI_UNSIGNED_LONG, &global_indices[0], nullptr, nullptr, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n 
\n // Compute the global values of the elements in the result vector\n std::vector global_values(num_elements);\n MPI_Allgatherv(&local_values[0], local_values.size(), MPI_DOUBLE, &global_values[0], nullptr, nullptr, MPI_DOUBLE, MPI_COMM_WORLD);\n \n // Compute the final values of the elements in the result vector\n for (size_t i = 0; i < num_elements; ++i) {\n z[global_indices[i]] = alpha * x[global_indices[i]].value + y[global_indices[i]].value;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n \n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(z_size, 0);\n }\n \n // Compute the partial sums on each rank\n for (auto const& element : x) {\n if (rank == 0) {\n z[element.index] += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (rank == 0) {\n z[element.index] += element.value;\n }\n }\n \n // Reduce the partial sums to rank 0\n MPI_Reduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t result_size = 0;\n for (auto const& element : x) {\n result_size = std::max(result_size, element.index + 1);\n }\n for (auto const& element : y) {\n result_size = std::max(result_size, element.index + 1);\n }\n\n // Create a vector to store the result on rank 0\n std::vector result(result_size, 0.0);\n\n // Compute the result on each rank\n for (auto const& element : x) {\n result[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n result[element.index] += element.value;\n }\n\n // Gather the results from all ranks on rank 0\n MPI_Gather(result.data(), result_size, MPI_DOUBLE, z.data(), result_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n int num_elements = x.size();\n\n // Create a buffer to store the results\n std::vector z(num_elements);\n\n // Compute the results in parallel\n #pragma omp parallel for\n for (int i = 0; i < num_elements; i++) {\n z[i] = alpha * x[i].value * y[i].value;\n }\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = x.size() + y.size();\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the local size of the output vector\n size_t local_z_size = z_size / size;\n\n // Compute the starting index of the local output vector\n size_t local_z_start = rank * local_z_size;\n\n // Compute the local output vector\n for (size_t i = local_z_start; i < local_z_start + local_z_size; ++i) {\n z[i] = 0;\n }\n\n // Compute the local contribution to the output vector\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value;\n }\n\n for (size_t i = 0; i < y.size(); ++i) {\n z[y[i].index] += y[i].value;\n }\n\n // Reduce the local output vectors to the global output vector\n MPI_Allreduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 stores the result\n if (rank == 0) {\n // Sort the output vector by index\n std::sort(z.begin(), z.end(), [](const double& a, const 
double& b) { return a.index < b.index; });\n\n // Remove duplicate elements from the output vector\n z.erase(std::unique(z.begin(), z.end()), z.end());\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the size of the result vector\n size_t resultSize = 0;\n for (auto const& element : x) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n for (auto const& element : y) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n\n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(resultSize, 0);\n }\n\n // Compute the partial sums on each rank\n for (auto const& element : x) {\n if (rank == 0) {\n z[element.index] += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (rank == 0) {\n z[element.index] += element.value;\n }\n }\n\n // Reduce the partial sums to rank 0\n MPI_Allreduce(MPI_IN_PLACE, z.data(), resultSize, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a new vector z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(&z[0], z.size(), MPI::DOUBLE, &z[0], z.size(), MPI::DOUBLE, 0);\n}\n", + "\n\n // Create a new vector z with the same size as x\n z.resize(x.size());\n\n // Loop over the elements of x and y and perform the operation z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[i].index = x[i].index;\n z[i].value = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks into rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a new vector z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(&z[0], z.size(), MPI::DOUBLE, &z[0], z.size(), MPI::DOUBLE, 0);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "cuda", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors of size Nx and Ny. Store the result in z.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x or y.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\n__global__ void sparseAxpy(double alpha, const Element *x, const Element *y, double *z, size_t Nx, size_t Ny, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) sum += alpha * x[j].value;\n }\n for (size_t j = 0; j < Ny; j++) {\n if (y[j].index == i) sum += y[j].value;\n }\n z[i] = sum;\n }\n}\n", + "sparseSparseArray(1,", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) { sum += alpha * x[i].value; }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) { sum += y[i].value; }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = alpha * x[i].value + y[i].value;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n\tsize_t i = threadIdx.x;\n\tif (i < Nx) {\n\t\tz[x[i].index] += alpha * x[i].value;\n\t}\n\tif (i < Ny) {\n\t\tz[y[i].index] += y[i].value;\n\t}\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tz[tid].value = alpha * x[tid].value + y[tid].value;\n\t}\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n size_t i = tid;\n double sum = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) {\n sum += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < Ny; j++) {\n if (y[j].index == i) {\n sum += y[j].value;\n }\n }\n z[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + 
"\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) sum += alpha * x[i].value;\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) sum += y[i].value;\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == idx) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == idx) {\n sum += y[i].value;\n }\n }\n z[idx] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < Nx) {\n atomicAdd(&z[x[tid].index], alpha * x[tid].value);\n } else if (tid < Nx + Ny) {\n atomicAdd(&z[y[tid - Nx].index], y[tid - Nx].value);\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Store the result in z on rank 0.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the output vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the corresponding element in x\n for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n break;\n }\n }\n\n // Find the corresponding element in y\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n break;\n }\n }\n\n // Compute the output value\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the output vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buffer;\n MPI_Recv(&recv_buffer[0], recv_buffer.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_buffer.size(); ++j) {\n z[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n 
for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n }\n }\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n }\n }\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the result vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buffer;\n MPI_Recv(&recv_buffer[0], z_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < z_size; ++j) {\n z[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n z.resize(num_elements_x + num_elements_y);\n\n // Loop over the elements in x and y and add them to the result\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x + num_elements_y; i++) {\n if (i < num_elements_x) {\n z[i] += alpha * x[i].value;\n }\n if (i < num_elements_y) {\n z[i] += y[i].value;\n }\n }\n\n // Reduce the result to rank 0\n MPI_Reduce(&z[0], z.size(), MPI_DOUBLE, &z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize z on rank 0\n if (rank == 0) {\n z.resize(x.size());\n }\n\n // Compute the result in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (rank == 0) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n }\n\n // Gather the results to rank 0\n MPI_Gather(z.data(), x.size(), MPI_DOUBLE, z.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n z.resize(num_elements_x + num_elements_y);\n\n // Perform the sparse axpy operation in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x + num_elements_y; ++i) {\n // Initialize the result to 0\n z[i] = 0.0;\n\n // Add the corresponding elements from x and y\n for (size_t j = 0; j < num_elements_x; ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < num_elements_y; ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Initialize the output vector\n z.resize(z_size, 0.0);\n\n // Compute the partial sums\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value;\n }\n\n #pragma omp parallel for\n for (size_t i = 0; i < y.size(); ++i) {\n z[y[i].index] += y[i].value;\n }\n\n // Reduce the partial sums\n MPI_Allreduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a temporary vector to store the partial results on each rank\n std::vector partial_z(z.size(), 0.0);\n\n // Compute the partial results in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n for (auto& element : x) {\n if (element.index == i) {\n partial_z[i] += 
alpha * element.value;\n }\n }\n for (auto& element : y) {\n if (element.index == i) {\n partial_z[i] += element.value;\n }\n }\n }\n\n // Reduce the partial results to rank 0 using MPI\n MPI_Reduce(partial_z.data(), z.data(), z.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = x.size() + y.size();\n\n // Allocate the output vector on rank 0\n if (rank == 0) {\n z.resize(z_size);\n }\n\n // Compute the local size of the output vector\n size_t local_z_size = (z_size + size - 1) / size;\n\n // Allocate the local output vector\n std::vector local_z(local_z_size);\n\n // Compute the local indices of the output vector\n std::vector local_z_indices(local_z_size);\n for (size_t i = 0; i < local_z_size; i++) {\n local_z_indices[i] = i + rank * local_z_size;\n }\n\n // Compute the local contributions to the output vector\n #pragma omp parallel for\n for (size_t i = 0; i < local_z_size; i++) {\n size_t index = local_z_indices[i];\n double value = 0.0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == index) {\n value += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == index) {\n value += y[j].value;\n }\n }\n local_z[i] = value;\n }\n\n // Gather the local contributions to the output vector on rank 0\n MPI_Gather(local_z.data(), local_z_size, MPI_DOUBLE, z.data(), local_z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements = x.size() + y.size();\n\n // Create a vector to store the result\n std::vector result(num_elements);\n\n // Merge the two sparse vectors into the result vector\n size_t i = 0, j = 0, k = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n result[k++] = x[i++];\n } else if (x[i].index > y[j].index) {\n result[k++] = y[j++];\n } else {\n result[k++] = {x[i].index, x[i++].value + y[j++].value};\n }\n }\n while (i < x.size()) {\n result[k++] = x[i++];\n }\n while (j < y.size()) {\n result[k++] = y[j++];\n }\n\n // Reduce the result vector to rank 0\n MPI_Reduce(result.data(), z.data(), num_elements, MPI_DOUBLE_INT, MPI_SUM, 0);\n}\n", + "\n\n // Initialize z to 0\n z.assign(x.size(), 0);\n\n // Compute z = alpha*x+y in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t z_size = x.size() + y.size();\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma openmp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n // Find the index of the element in x that matches the index of the current element in z\n auto it_x = std::find_if(x.begin(), x.end(), [&](Element const& element) { return element.index == i; });\n\n // Find the index of the element in y that matches the index of the current element in z\n auto it_y = std::find_if(y.begin(), y.end(), [&](Element const& element) { return element.index == i; });\n\n // Compute the value of the current element in z\n z[i] = (it_x != x.end() ? 
alpha * it_x->value : 0.0) + (it_y != y.end() ? it_y->value : 0.0);\n }\n\n // Gather the result vector on rank 0\n MPI_Gather(&z[0], z_size, MPI_DOUBLE, &z[0], z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vectors\n size_t n = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(n);\n\n // Compute the results in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results on rank 0\n MPI_Gather(temp.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z on all ranks\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0);\n}\n", + "\n\n // Check if the rank is 0\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n if (rank == 0) {\n // Initialize z with zeros\n z.resize(x.size());\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n }\n }\n\n // Perform sparse axpy operation in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (rank == 0) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t numElements = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(numElements);\n\n // Compute the result in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < numElements; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(temp.data(), numElements, MPI_DOUBLE, z.data(), numElements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t numElements = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(numElements);\n\n // Compute the result in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < numElements; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results on rank 0\n MPI_Gather(&temp[0], numElements, MPI_DOUBLE, &z[0], numElements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast alpha to all ranks\n MPI_Bcast(&alpha, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the output vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(z_size);\n }\n\n // Compute the output vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double value = 0.0;\n for (auto const& element : x) {\n if (element.index == i) {\n value += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (element.index == i) {\n value += element.value;\n }\n }\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z[i] = value;\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n 
for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the value of x at index i\n for (auto const& elem : x) {\n if (elem.index == i) {\n x_value = elem.value;\n break;\n }\n }\n\n // Find the value of y at index i\n for (auto const& elem : y) {\n if (elem.index == i) {\n y_value = elem.value;\n break;\n }\n }\n\n // Compute the value of z at index i\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the result vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::vector recv_buf;\n MPI_Recv(&recv_buf[0], recv_buf.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_buf.size(); ++j) {\n z[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the result vector\n z.resize(z_size, 0.0);\n\n // Compute the partial result on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the value of x at index i\n for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n break;\n }\n }\n\n // Find the value of y at index i\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n break;\n }\n }\n\n // Compute the partial result\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the partial results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::vector partial_result(z_size);\n MPI_Recv(partial_result.data(), z_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < z_size; ++j) {\n z[j] += partial_result[j];\n }\n }\n } else {\n MPI_Send(z.data(), z_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n std::vector result(num_elements_x);\n\n // Compute z = alpha*x+y in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x; i++) {\n result[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(result.data(), num_elements_x, MPI_DOUBLE, z.data(), num_elements_x, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(Kokkos::View const& s, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < s.extent(0); ++j) {\n if (s(j)[0] == 'a' + i) {\n ++bins[i];\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < s.extent(0); ++j) {\n if (s(j)[0] == 'a' + i) {\n ++bins[i];\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(0, s.extent(0), [&](size_t i) {\n char first = s(i)[0];\n size_t index = first - 'a';\n ++output(index);\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[s[i][0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins[s[i][0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "Counts, 0 0, 0,0,0, 0,0, ~:~:", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n int idx = s(i)[0] - 'a';\n if (idx >= 0 && idx < 26) {\n ++bins[idx];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s[i][0] - 'a']++;\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": 
"23_histogram_first_letter_counts", + "parallelism_model": "omp", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < 26; ++i) {\n char c = 'a' + i;\n for (auto const& str : s) {\n if (str[0] == c) {\n ++counts[i];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n size_t index = static_cast(s[i][0] - 'a');\n #pragma omp atomic\n bins[index]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n size_t index = s[i][0] - 'a';\n ++ didik[index];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "serial", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. 
Store the output in `bins` array.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n ++bins[str[0] - 'a'];\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n bins[str[0] - 'a']++;\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n ++bins[str[0] - 'a'];\n }\n}\n", + "\n\n for (std::string const& str : s) {\n if (str.empty()) {\n continue;\n }\n char first = str[0];\n if (first >= 'a' && first <= 'z') {\n bins[first - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n bins[str[0] - 'a']++;\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "hip", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\n__global__ void firstLetterCounts(const char **s, size_t N, size_t bins[26]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n char first = s[tid][0];\n atomicAdd(&bins[first - 'a'], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid < N) {\n char c = s[gid][0];\n atomicAdd(&bins[c - 'a'], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int index = s[gid][0] - 'a';\n atomicAdd(&bins[index], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n char first = s[idx][0];\n atomicAdd(&bins[first - 'a'], 1);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int bin = s[gid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n if (idx >= 0 && idx < 26) {\n atomicAdd(&bins[idx], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (size_t i = tid; i < N; i += stride) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n int bin = s[tid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n bins[s[gid][0] - 'a']++;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int index = s[gid][0] - 'a';\n atomicAdd(&bins[index], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) return;\n char first = s[idx][0];\n int offset = first - 'a';\n atomicAdd(&firstCounts[offset], 1);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int bin = s[gid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = 
bid * hipBlockDim_x + tid;\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of s. The result is stored in bins on rank 0.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ": \u51a2, Synag\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2 Saxe\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2zernezerne.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (const auto& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, 
MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet.\n std::array local_bins;\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n local_bins.fill(0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t n = s.size();\n std::array local_bins;\n local_bins.fill(0);\n for (size_t i = 0; i < n; i++) {\n if (rank == i % size) {\n local_bins[s[i][0] - 'a']++;\n }\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 
0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n std::fill(local_bins.begin(), local_bins.end(), 0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings starting with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) continue;\n ++local_bins[str[0] - 'a'];\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = s.size() / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = s.size();\n }\n std::array local_bins = {};\n for (int i = start; i < end; i++) {\n local_bins[s[i][0] - 
'a']++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "cuda", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\n__global__ void firstLetterCounts(const char **s, size_t N, size_t bins[26]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int firstLetter = s[idx][0] - 'a';\n atomicAdd(&bins[firstLetter], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int start = tid * 26 / N;\n int end = (tid + 1) * 26 / N;\n for (int i = start; i < end; i++) {\n bins[i] = 0;\n }\n for (int i = 0; i < N; i++) {\n int idx = s[i][0] - 'a';\n if (idx >= 0 && idx < 26) {\n atomicAdd(&bins[idx], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int firstLetter = s[idx][0] - 'a';\n atomicAdd(&bins[firstLetter], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n char firstLetter = s[idx][0];\n atomicAdd(&bins[firstLetter - 'a'], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int bin = s[tid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n 
}\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx >= N) {\n return;\n }\n bins[s[idx][0] - 'a']++;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int idx = s[tid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of s. The result is stored in bins on rank 0.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n local_bins.fill(0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the first letter of its own strings\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n #pragma omp parallel for\n for 
(size_t i = 0; i < s.size(); i++) {\n local_bins[s[i][0] - 'a']++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk = s.size() / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = s.size();\n }\n std::array local_bins;\n local_bins.fill(0);\n for (int i = start; i < end; i++) {\n local_bins[s[i][0] - 'a']++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins;\n local_bins.fill(0);\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n local_bins[s[i][0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank counts the first letter of its own strings\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = my_rank; i < s.size(); i += num_ranks) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the counts for a subset of the alphabet\n int start = rank * 26 / size;\n int end = (rank + 1) * 26 / size;\n\n std::array local_bins;\n local_bins.fill(0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n int index = s[i][0] - 'a';\n if (index >= start && index < end) {\n local_bins[index]++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the first letters in its portion of the vector\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = rank; i < s.size(); i += size) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce 
the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(std::begin( bins ), std::end( bins ), 0);\n\n // Count the number of words starting with each letter in parallel\n #pragma openmp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n ++ bins[s[i][0] - 'a'];\n }\n\n // Gather the results on rank 0\n MPI_Gather(MPI_IN_PLACE, 26, MPI_UNSIGNED_LONG, bins.data(), 26, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the first letter of the strings in its portion of the vector\n std::array local_bins = {};\n #pragma omp parallel for\n for (size_t i = rank * s.size() / nprocs; i < (rank + 1) * s.size() / nprocs; ++i) {\n local_bins[s[i][0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings in s that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins = {};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": 
"21_histogram_bin_0-100", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(Kokkos::View const& x, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(i / 10) += (x(i) < 100) ? 1 : 0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[x(i) / 10]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA(const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[x(i) / 10]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i / 10] += (x(i) >= 10 * i / 10 && x(i) < 10 * (i / 10 + 1));\n });\n}\n", + "\n\n Kokkos::parallel_for(bins.size(), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j] >= 10 * i && x[j] < 10 * (i + 1)) { bins[i] += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n const double val = x(i);\n update += (val >= 0 && val < 10) + (val >= 10 && val < 20) + (val >= 20 && val < 30) +\n (val >= 30 && val < 40) + (val >= 40 && val < 50) + (val >= 50 && val < 60) +\n (val >= 60 && val < 70) + (val >= 70 && val < 80) + (val >= 80 && val < 90) +\n (val >= 90 && val < 100);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n const double val = x(i);\n bins(val / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n 
Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n const double val = x(i);\n bins(val / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i / 10] += 1;\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n size_t idx = static_cast(x[i] / 10);\n ++counts[idx];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n size_t bin = static_cast(x[i] / 10);\n bins[bin]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)] += 1;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n 
bins[static_cast(x[i] / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "serial", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n std::fill(\u5803.begin(), \u5803.end(), 0);\n for (double const val : x) {\n \u5803[static_cast(val / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double val : x) {\n bins[static_cast(val / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n int i = std::min((int)(v - bins.front()), (int)bins.size() - 1);\n bins[i]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "hip", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use AMD HIP to compute in parallel. 
The kernel is initialized with at least as many threads as values in x.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\n__global__ void binsBy10Count(const double *x, size_t N, size_t bins[10]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipGridDim_x;\n for (size_t i = idx; i < N; i += stride) {\n bins[x[i] / 10] += 1;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipGridDim_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n int bin = (int)(x[i] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n int bin = (int)(x[i] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n int bin = x[i] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n int bin = (int)x[tid] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n bins[(int)(x[i] / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use MPI to compute in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. The result is stored in bins on rank 0.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n bins[bin]++;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv;\n MPI_Recv(recv.data(), recv.size(), MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < bins.size(); j++) {\n bins[j] += recv[j];\n }\n }\n } else {\n MPI_Send(bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of values in each bin\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n\n // Reduce the bins across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local range.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n bins[bin]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of values in each bin on each rank\n std::array local_count = {};\n for (auto v : x) {\n local_count[static_cast(v / 10)]++;\n }\n\n // Gather the local counts into bins on rank 0\n MPI_Gather(local_count.data(), 10, MPI_UNSIGNED_LONG, bins.data(), 10, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its subset of x\n std::array local_bins = {};\n for (double xi : x) {\n local_bins[static_cast(xi / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double xi : x) {\n int bin = static_cast(xi / 10);\n bins[bin]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), 
MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n for (double val : x) {\n int bin = static_cast(val / 10);\n if (bin < 10) {\n bins[bin]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array localBins{};\n for (double v : x) {\n localBins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(localBins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (auto v : x) {\n bins[v / 10]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] >= 0 && x[i] < 10) {\n bins[0]++;\n } else if (x[i] >= 10 && x[i] < 20) {\n bins[1]++;\n } else if (x[i] >= 20 && x[i] < 30) {\n bins[2]++;\n } else if (x[i] >= 30 && x[i] < 40) {\n bins[3]++;\n } else if (x[i] >= 40 && x[i] < 50) {\n bins[4]++;\n } else if (x[i] >= 50 && x[i] < 60) {\n bins[5]++;\n } else if (x[i] >= 60 && x[i] < 70) {\n bins[6]++;\n } else if (x[i] >= 70 && x[i] < 80) {\n bins[7]++;\n } else if (x[i] >= 80 && x[i] < 90) {\n bins[8]++;\n } else if (x[i] >= 90 && x[i] < 100) {\n bins[9]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local part of x.\n std::array localBins = {};\n for (double v : x) {\n localBins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts.\n MPI_Reduce(localBins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its subset of x\n std::array local_bins = {};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), recv_bins.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < bins.size(); j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), local_bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local range.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int 
rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n if (bin < 10) {\n bins[bin]++;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array tmp;\n MPI_Recv(tmp.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; j++) {\n bins[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its own bins.\n std::array local_bins = {};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), recv_bins.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_bins.size(); j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), local_bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "cuda", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use CUDA to compute in parallel. 
The kernel is initialized with at least as many threads as values in x.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\n__global__ void binsBy10Count(const double *x, size_t N, size_t bins[10]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x;\nif (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = (int)floor(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of values in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of values in its local portion of x\n std::array local_bins;\n std::fill(local_bins.begin(), local_bins.end(), 0);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce local bins to global bins\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins = {};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the bin size\n const double bin_size = (x.back() - x.front()) / bins.size();\n\n // Compute the local histogram\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n const double value = x[i];\n const int bin_index = std::min(static_cast(bins.size() - 1),\n static_cast(std::floor((value - x.front()) / bin_size)));\n ++local_bins[bin_index];\n }\n\n // Reduce the local histograms to the global histogram\n MPI_Reduce(MPI_IN_PLACE, local_bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ", 10]", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double value : x) {\n local_bins[static_cast(value / 10)]++;\n }\n\n // Reduce the local counts into the global bins array on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n int bin = static_cast(x[i] / 10);\n if (bin < 10) {\n #pragma omp atomic\n ++bins[bin];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::array local_bins = {};\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n int bin = static_cast(x[i] / 10);\n local_bins[bin]++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the counts for a subset of the bins.\n size_t start_bin = rank * bins.size() / nprocs;\n size_t end_bin = (rank + 1) * bins.size() / nprocs;\n\n std::fill(bins.begin() + start_bin, bins.begin() + end_bin, 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n size_t bin = static_cast(x[i] / 10);\n if (bin >= start_bin && bin < end_bin) {\n ++bins[bin];\n }\n }\n\n // Gather the counts from all ranks into bins on rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n int bin = static_cast(x[i]) / 10;\n #pragma omp atomic\n ++bins[bin];\n }\n\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::array recv;\n MPI_Recv(recv.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; ++j) {\n bins[j] += recv[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its portion of x\n std::array local_bins = {0};\n #pragma omp parallel for\n for (size_t i = rank * x.size() / size; i < (rank + 1) * x.size() / size; ++i) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the local counts into the global bins on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; ++i) {\n std::array recv_bins;\n 
MPI_Recv(recv_bins.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of values in its local portion of x\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins = {};\n for (size_t i = rank; i < x.size(); i += nprocs) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x\n std::array local_bins = {0};\n #pragma omp parallel for\n for (size_t i = rank; i < x.size(); i += nprocs) {\n int bin = static_cast(x[i] / 10);\n local_bins[bin]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use Kokkos to count in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(Kokkos::View const& points, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_for;\n using Kokkos::Functor;\n\n struct CountQuadrants {\n Kokkos::View bins;\n Kokkos::View points;\n\n KOKKOS_FUNCTION\n void operator()(const int i) const {\n const auto& point = points[i];\n if (point.x > 0) {\n if (point.y > 0) {\n ++bins[0];\n } else {\n ++bins[2];\n }\n } else {\n if (point.y > 0) {\n ++bins[1];\n } else {\n ++bins[3];\n }\n }\n }\n };\n\n parallel_for(Kokkos::RangePolicy(0, points.size()), CountQuadrants{bins, points});\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA (const size_t& i, size_t& update) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++update[0];\n } else {\n ++update[3];\n }\n } else {\n if (points(i)->y > 0) {\n ++update[1];\n } else {\n ++update[2];\n }\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.size(), KOKKOS_LAMBDA (const size_t i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x >= 0 && y >= 0) {\n ++bins(0);\n } else if (x < 0 && y >= 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "{x=1.5,y=1.1}", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x >= 0) {\n if (y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n const Point& p = points(i);\n if (p.x > 0) {\n if (p.y > 0) {\n ++update[0];\n } else {\n ++update[3];\n }\n } else {\n if (p.y > 0) {\n ++update[1];\n } else {\n ++update[2];\n }\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n double x = points[i]->x;\n double y = points[i]->y;\n if (x >= 0 && y >= 0) {\n ++update;\n } 
else if (x < 0 && y >= 0) {\n ++update;\n } else if (x < 0 && y < 0) {\n ++update;\n } else {\n ++update;\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = points(i);\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", Kokkos::RangePolicy<>(0, points.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x > 0 && y > 0) {\n ++bins(0);\n } else if (x < 0 && y > 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else if (x > 0 && y < 0) {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy<>(0, points.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x > 0 && y > 0) {\n ++bins(0);\n } else if (x < 0 && y > 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else if (x > 0 && y < 0) {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n size_t idx = 0;\n if (x >= 0) {\n idx += 1;\n }\n if (y >= 0) {\n idx += 2;\n }\n ++bins(idx);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n size_t bin = 0;\n if (x > 0) {\n bin += 1;\n }\n if (y > 0) {\n bin += 2;\n }\n Kokkos::atomic_fetch_add(&bins(bin), 1);\n });\n}\n", + "\n\n using exec_space = KokKos::DefaultExecutionSpace;\n using league_space = KokKos::TeamVectorRange;\n\n league_space range(exec_space{}, points.size());\n\n auto policy = KokKos::TeamVectorRange\u653f\u7b56(range);\n\n auto lambda = KOKKOS_LAMBDA (const Point point, size_t* count) {\n if (point.x > 0) {\n if (point.y > 0) {\n ++count[0];\n } else {\n ++count[3];\n }\n } else {\n if (point.y > 0) {\n ++count[1];\n } else {\n ++count[2];\n }\n }\n };\n\n exec_space exec;\n exec.parallel_reduce(\n \"count_quadrants\", policy, lambda, Rcpp::as(count));\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy>(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = *points(i);\n if (p.x > 0) {\n if (p.y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. 
The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use OpenMP to count in parallel.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const Point& p = points[i];\n if (p.x >= 0 && p.y >= 0) {\n ++bins[0];\n } else if (p.x < 0 && p.y >= 0) {\n ++bins[1];\n } else if (p.x < 0 && p.y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); i++) {\n const auto& p = points[i];\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n 
if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++oient[0];\n } else {\n ++oient[3];\n }\n } else {\n if (points[i].y > 0) {\n ++oient[1];\n } else {\n ++oient[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n double x = points[i].x;\n double y = points[i].y;\n if (x > 0) {\n if (y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "serial", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. 
The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (const Point& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (const auto& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (point.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (Point const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (Point const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else if (p.x < 0) {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else if 
(p.y < 0) {\n bins[3]++;\n }\n } else if (p.x < 0) {\n if (p.y > 0) {\n bins[1]++;\n } else if (p.y < 0) {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& pt : points) {\n if (pt.x > 0) {\n if (pt.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (pt.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "(", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "hip", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use AMD HIP to count in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\n__global__ void countQuadrants(const Point *points, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n while (idx < N) {\n if (points[idx].x >= 0) {\n if (points[idx].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[idx].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n idx += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int quadrant = 0;\n if (points[gid].x > 0) {\n quadrant += 1;\n }\n if (points[gid].y > 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int quadrant = 0;\n if (points[tid].x > 0) {\n quadrant += 1;\n }\n if (points[tid].y > 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid >= N) return;\n const Point &p = points[tid];\n int quadrant = 0;\n if (p.x > 0) quadrant |= 1;\n if (p.y > 0) quadrant |= 2;\n atomicAdd(&bins[quadrant], 1);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n const Point &p = points[gid];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n Point p = points[tid];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if 
(points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "(4)", + "(4)", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n if (idx >= N) {\n return;\n }\n\n const Point &p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int blockId = hipBlockIdx_x;\n int blockDim = hipBlockDim_x;\n\n for (int i = tid; i < N; i += blockDim) {\n if (points[i].x > 0 && points[i].y > 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y > 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[i].x > 0 && points[i].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n const Point p = points[gid];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 
1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[i].x >= 0 && points[i].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n Point p = points[tid];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = tid + bid * hipBlockDim_x;\n if (idx >= N) {\n return;\n }\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use MPI to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The result is stored in bins on rank 0.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its own quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (const auto& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : 
points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0 && point.y >= 0) {\n local_bins[0]++;\n } else if (point.x < 0 && point.y >= 0) {\n local_bins[1]++;\n } else if (point.x >= 0 && point.y < 0) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (auto const& p : points) {\n int quadrant = 0;\n if (p.x > 0) quadrant += 1;\n if (p.y > 0) quadrant += 2;\n bins[quadrant]++;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array tmp;\n MPI_Recv(tmp.data(), tmp.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(bins.data(), bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_bins[4] = {0};\n for (Point const& p : points) {\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for 
(size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y > 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y > 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 
0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant.\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array localBins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n localBins[0]++;\n } else {\n localBins[3]++;\n }\n } else {\n if (point.y > 0) {\n localBins[1]++;\n } else {\n localBins[2]++;\n }\n }\n }\n\n MPI_Reduce(localBins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "cuda", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use CUDA to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\n__global__ void countQuadrants(const Point *points, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int bin = 0;\n if (points[tid].x > 0) bin += 1;\n if (points[tid].y > 0) bin += 2;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int bin = 0;\n if (points[tid].x >= 0) {\n bin += 1;\n }\n if (points[tid].y >= 0) {\n bin += 2;\n }\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += gridDim.x * blockDim.x;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n while (idx < N) {\n if (points[idx].x >= 0 && points[idx].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[idx].x < 0 && points[idx].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[idx].x < 0 && points[idx].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[idx].x >= 0 && points[idx].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n idx += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n int quadrant = 0;\n if (p.x > 0) {\n quadrant |= 1;\n }\n if (p.y > 0) {\n quadrant |= 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n Point p = points[idx];\n int quadrant = 0;\n if (p.x >= 0) {\n quadrant += 2;\n }\n if (p.y >= 0) {\n quadrant += 1;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else 
{\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx >= N) {\n return;\n }\n\n const Point &p = points[idx];\n int quadrant = 0;\n if (p.x >= 0) {\n quadrant += 1;\n }\n if (p.y >= 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant - 1], 1);\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[1], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n const Point p = points[tid];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n if (idx >= N) {\n return;\n }\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n while (i < N) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[i].y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += gridDim.x * blockDim.x;\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int count = 0;\n for (int i = idx; i < N; i += blockDim.x) {\n if (points[i].x > 0 && points[i].y > 0) {\n count++;\n } else if (points[i].x < 0 && points[i].y > 0) {\n count++;\n } else if (points[i].x < 0 && points[i].y < 0) {\n count++;\n } else if (points[i].x > 0 && points[i].y < 0) {\n count++;\n }\n }\n bins[idx] = count;\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n 
atomicAdd(&bins[3], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use MPI and OpenMP to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. The result is stored in bins on rank 0.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); i++) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to zero\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n ++bins[0];\n } else if (points[i].x < 0 && points[i].y >= 0) {\n ++bins[1];\n } else if (points[i].x < 0 && points[i].y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the vector\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n 
local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& point = points[i];\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+:local_bins)\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n ++local_bins[0];\n } else if (points[i].x < 0 && points[i].y >= 0) {\n ++local_bins[1];\n } else if (points[i].x < 0 && points[i].y < 0) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n Point p = points[i];\n if (p.x >= 0 && p.y >= 0) {\n local_bins[0]++;\n } else if (p.x < 0 && p.y >= 0) {\n local_bins[1]++;\n } else if (p.x < 0 && p.y < 0) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to zero\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const Point& point = points[i];\n if (point.x >= 0 && point.y >= 0) {\n ++bins[0];\n } else if (point.x < 0 && point.y >= 0) {\n ++bins[1];\n } else if (point.x < 0 && point.y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local subset of points\n std::array local_bins = {0, 0, 0, 0};\n #pragma opm parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n if (points[i].x > 0 && points[i].y > 0) {\n ++local_bins[0];\n } else if (points[i].x < 0 && points[i].y > 0) {\n ++local_bins[1];\n } else if (points[i].x < 0 && points[i].y 
< 0) {\n ++local_bins[2];\n } else if (points[i].x > 0 && points[i].y < 0) {\n ++local_bins[3];\n }\n }\n\n // Gather the local counts into bins on rank 0\n MPI_Gather(local_bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for reduction(+: bins)\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the points in its portion of the image\n #pragma omp parallel for reduction(+: bins)\n for (size_t i = rank; i < points.size(); i += nprocs) {\n const Point& point = points[i];\n if (point.x >= 0) {\n if (point.y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the array\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Use Kokkos to compute in parallel. Assume that Kokkos has already been initialized.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(Kokkos::View const& x, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n const size_t num_elements = x.extent(0);\n Kokkos::parallel_for(\"countQuartiles\", Kokkos::RangePolicy(0, num_elements), KOKKOS_LAMBDA(const size_t i) {\n const double frac = x(i) - floor(x(i));\n if (frac < 0.25) {\n ++quart[0];\n } else if (frac < 0.5) {\n ++quart[1];\n } else if (frac < 0.75) {\n ++quart[2];\n } else {\n ++quart[3];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n\n parallel_for(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i) {\n double frac = x(i) - floor(x(i));\n if (frac < 0.25) {\n ++quart[0];\n } else if (frac < 0.5) {\n ++quart[1];\n } else if (frac < 0.75) {\n ++quart[2];\n } else {\n ++quart[3];\n }\n });\n}\n", + "\n\n const size_t n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n auto exec_space = Kokkos::DefaultExecutionSpace::instance();\n auto policy = Kokkos::RangePolicy<>(exec_space, 0, x.extent(0));\n\n 
Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const size_t& i) {\n double frac = std::modf(x(i), &x(i));\n if (frac < 0.25)\n ++bins(0);\n else if (frac < 0.5)\n ++bins(1);\n else if (frac < 0.75)\n ++bins(2);\n else\n ++bins(3);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double fract = std::fmod(x[i], 1.0);\n if (fract < 0.25) {\n ++bins[0];\n } else if (fract < 0.5) {\n ++bins[1];\n } else if (fract < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "<,\u51a2~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma simd\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = std::fmod(x[i], 1.0);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++(std::get<0>(std::ref(x)));\n } else if (frac < 0.5) {\n ++(std::get<1>(std::ref(x)));\n } else if (frac < 0.75) {\n ++(std::get<2>(std::ref(x)));\n } else {\n ++(std::get<3>(std::ref(x)));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n #pragma omp parallel for\n for (size_t 
i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = std::fmod(x[i], 1.0);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "serial", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double const& xi : x) {\n if (xi < 0.25) {\n ++\u0648\u064a\u0627\u062a[0];\n } else if (xi < 0.5) {\n ++\u0648\u064a\u0627\u062a[1];\n } else if (xi < 0.75) {\n ++\u0648\u064a\u0627\u062a[2];\n } else {\n ++\u0648\u064a\u0627\u062a[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (auto v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n bins[0]++;\n else if (frac < 0.5)\n bins[1]++;\n else if (frac < 0.75)\n bins[2]++;\n else\n bins[3]++;\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n bins[0]++;\n else if (frac < 0.5)\n bins[1]++;\n else if (frac < 0.75)\n 
bins[2]++;\n else\n bins[3]++;\n }\n}\n", + "\n\n bins.fill(0);\n for (const auto &xi : x) {\n const auto frac = std::modf(xi, nullptr);\n if (frac >= 0.25 && frac < 0.5) {\n ++bins[0];\n } else if (frac >= 0.5 && frac < 0.75) {\n ++bins[1];\n } else if (frac >= 0.75 && frac < 1.0) {\n ++bins[2];\n } else if (frac == 0.0) {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (auto const& v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "hip", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\n__global__ void countQuartiles(const double *x, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = hipGetGlobalIndex();\n if (idx >= N) return;\n\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&quartiles[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&quartiles[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&quartiles[2], 1);\n } else {\n atomicAdd(&quartiles[3], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n double frac = fmod(x[i], 1);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n 
} else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n 
atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n double f = fmod(x[idx], 1.0);\n if (f < 0.25)\n atomicAdd(&bins[0], 1);\n else if (f < 0.5)\n atomicAdd(&bins[1], 1);\n else if (f < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid >= N) {\n return;\n }\n\n double frac = x[gid] - floor(x[gid]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use MPI to compute in parallel. Assume that MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double d : x) {\n double frac = fmod(d, 1.0);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Count the number of doubles in each quartile.\n for (double v : x) {\n if (v < 0.25) {\n ++bins[0];\n } else if (v < 0.5) {\n ++bins[1];\n } else if (v < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Gather the counts from all ranks on rank 0.\n MPI_Gather(bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each quartile.\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n for (auto v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of the vector\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n 
bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(&recv_bins, 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(&local_bins, 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each quartile\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the local bins\n std::array local_bins = {0, 0, 0, 0};\n\n // Count the number of doubles in each quartile\n for (double d : x) {\n double frac = fmod(d, 1.0);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local counts\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_bins[4] = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 
0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n MPI_Reduce(local_bins, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n size_t n = x.size();\n std::array local_bins = {0, 0, 0, 0};\n\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n\n for (double val : x) {\n double frac = val - std::floor(val);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, 
MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (auto xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "cuda", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\n__global__ void countQuartiles(const double *x, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n double frac = x[idx] - floor(x[idx]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double frac = x[tid] - floor(x[tid]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n 
atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + ", 4]", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n for (size_t i = tid + bid * stride; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double f = fmod(x[idx], 1.0);\n if (f < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (f < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (f < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use MPI and OpenMP to compute in parallel. Assume that MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each quartile\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n size_t const rank = ::rank();\n size_t const size = x.size();\n std::fill(counts.begin(), counts.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = rank; i < size; i += ::size()) {\n double const value = x[i];\n if (value < 0.25) {\n ++counts[0];\n } else if (value < 0.5) {\n ++counts[1];\n } else if (value < 0.75) {\n ++counts[2];\n } else {\n ++counts[3];\n }\n }\n\n #pragma omp parallel sections\n {\n #pragma omp section\n {\n if (rank == 0) {\n std::fill(counts.begin(), \n counts.end(), \n 0);\n }\n }\n #pragma omp section\n {\n if (rank == 0) {\n std::transform(counts.begin(), \n counts.end(), \n counts.begin(), \n [](size_t const a, size_t const b) {\n return a + b;\n });\n }\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of doubles in its portion of x that have a fractional part in each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < x.size(); i += size) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Rank 0 gathers the local counts from all ranks and stores the result in bins\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n 
++bins[3];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each bin\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each quartile\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the counts to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of elements in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of 
elements in its local vector that fall into each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its local copy of x that fall into each quartile.\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+:local_bins)\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Rank 0 gathers the local counts from all ranks and stores the result in bins.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; ++i) {\n std::array recv_bins;\n MPI_Recv(&recv_bins, 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(&local_bins, 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n 
MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of elements in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the counts from all ranks to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local counts\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Gather the local counts into bins on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of ranks\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < nranks; ++i) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use Kokkos to count in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(Kokkos::View const& image, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.size(), KOKKOS_LAMBDA (const size_t i) {\n ++bins(image(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const size_t& i, size_t& count) {\n count += Kokkos::count(image, i);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n bins[image[i]]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256),\n KOKKOS_LAMBDA(const int& i, size_t& update) {\n update += Kokkos::count(Kokkos::View(image.data(), image.size()), i);\n },\n bins);\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA (const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA(const int& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA(const int& i) {\n ++bins[image(i)];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy<>(0, 256), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy<>(0, image.extent(0)), KOKKOS_LAMBDA (const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = Kokkos::atomic_fetch_add(&bins[image[i]], 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixel_count\", Kokkos::RangePolicy<>(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixel_count\", Kokkos::RangePolicy<>(0, image.size()), KOKKOS_LAMBDA(const int& 
i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i, size_t& sum) {\n sum += Kokkos::count(image, i);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA (const int& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA(const int& i) {\n ++bins(image(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n bins[image[i]]++;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const size_t& i, size_t& count) {\n count += Kokkos::count(Kokkos::ViewConstRange(image, image.extent(0)), i);\n }, bins);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use OpenMP to count in parallel.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); 
i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "serial", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int pixel : image) {\n ++Counts[pixel];\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (auto const& pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "hip", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use AMD HIP to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n__global__ void pixelCounts(const int *image, size_t N, size_t bins[256]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx.x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = image[idx];\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx >= N) return;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n ++bins[image[idx]];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = image[idx];\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) { bins[image[idx]]++; }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (size_t i = idx; i < N; i += stride) {\n atomicAdd(&bins[image[i]], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use MPI to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. 
The result is stored in bins on rank 0.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array localBins{};\n for (int pixel : image) {\n localBins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = localBins;\n for (int i = 1; i < size; i++) {\n std::array recvBins;\n MPI_Recv(recvBins.data(), 256, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recvBins[j];\n }\n }\n } else {\n MPI_Send(localBins.data(), 256, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n for (int pixel : image) {\n bins[pixel]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array localBins;\n for (size_t i = 0; i < image.size(); ++i) {\n localBins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(localBins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = rank; i < image.size(); i += size) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t chunk_size = image.size() / size;\n size_t start = rank * chunk_size;\n size_t end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = image.size();\n }\n std::array local_bins;\n local_bins.fill(0);\n for (size_t i = start; i < end; i++) {\n local_bins[image[i]]++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n size_t n = image.size();\n size_t local_bins[256] = {0};\n for (size_t i = rank; i < n; i += size) {\n local_bins[image[i]]++;\n }\n MPI_Reduce(local_bins, bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = rank; i < image.size(); i += size) {\n bins[image[i]]++;\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "cuda", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use CUDA to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n__global__ void pixelCounts(const int *image, size_t N, size_t bins[256]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n ++bins[image[idx]];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use MPI and OpenMP to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. 
The result is stored in bins on rank 0.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the results from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (size_t i = rank; i < image.size(); i += nprocs) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t const n = image.size();\n size_t const chunk = (n + size - 1) / size;\n size_t const start = rank * chunk;\n size_t const end = std::min(start + chunk, n);\n std::array localBins = {};\n for (size_t i = start; i < end; ++i) {\n ++localBins[image[i]];\n }\n MPI_Reduce(localBins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels with each intensity in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int i = rank; i < image.size(); i += size) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n size_t local_bins[256] = {0};\n #pragma omp parallel for\n for (size_t i = rank * image.size() 
/ nprocs; i < (rank + 1) * image.size() / nprocs; ++i) {\n ++local_bins[image[i]];\n }\n\n // Gather the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins, bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n\n // Reduce the bins from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to all zeros\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of pixels in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the bins across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in each bin on each rank.\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++local_bins[image[i]];\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ".", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n // Use OpenMP to count in parallel.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int i = rank; i < image.size(); i += nprocs) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the results from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image\n std::array local_bins;\n 
local_bins.fill(0);\n #pragma omp parallel for\n for (size_t i = rank * image.size() / nprocs; i < (rank + 1) * image.size() / nprocs; ++i) {\n ++local_bins[image[i]];\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use Kokkos to parallelize the search. Assume Kokkos has already been initialized.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n const size_t n = x.extent(0);\n size_t result = n;\n Kokkos::parallel_reduce(\n RangePolicy<>(0, n),\n KOKKOS_LAMBDA(const size_t i) {\n if (x(i) % 2 == 0) {\n result = std::min(result, i);\n }\n },\n result);\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag, int>;\n auto result = Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, int& result) {\n if (x(i) % 2 == 0) {\n result = i;\n }\n }, result, tag_type());\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even(i) = x(i) % 2 == 0;\n });\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even(i)) {\n firstEven = i;\n }\n }, firstEven);\n return firstEven;\n}\n", + "_", + "\n\n size_t result = x.size();\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(size_t i, Kokkos::Experimental::Automic& atom) {\n if (x(i) % 2 == 0) {\n atom.fetch_min(result, i);\n }\n });\n return result;\n}\n", + "\n\n size_t result = -1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n Kokkos::fence();\n }\n },\n result);\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even(i) = (x(i) % 2 == 0);\n });\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even(i)) {\n firstEven = i;\n }\n }, firstEven);\n return firstEven;\n}\n", + "\n\n size_t result = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n Kokkos::parallel_scan(Kokkos::RangePolicy(i + 1, x.size()), KOKKOS_LAMBDA(const size_t& j, size_t& result) {\n result = 0;\n });\n }\n });\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t result = n;\n Kokkos::parallel_for(\"findFirstEven\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) { result = i; }\n });\n return result;\n}\n", + "\n\n Kokkos::parallel_for(\"findFirstEven\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n Kokkos::atomic_fetch_add(&findFirstEven_result, 1);\n Kokkos::fence();\n if (findFirstEven_result == 1) {\n findFirstEven_index = i;\n Kokkos::fence();\n 
Kokkos::parallel_scan(\"findFirstEven_break\", x.extent(0),\n KOKKOS_LAMBDA(const size_t& i, const size_t& my_result) {\n if (findFirstEven_index == i) {\n return my_result + 1;\n }\n return my_result;\n },\n 0);\n Kokkos::fence();\n Kokkos::parallel_for(\"findFirstEven_break\", x.extent(0),\n KOKKOS_LAMBDA(const size_t& i) {\n if (findFirstEven_index == i) {\n Kokkos::fence();\n Kokkos::break_();\n }\n });\n }\n }\n });\n return findFirstEven_index;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t result = n;\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n\n return result;\n}\n", + "&", + "\n\n size_t N = x.extent(0);\n size_t result = N;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n size_t result = Kokkos::parallel_reduce(\n policy_type(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n },\n x.size());\n return result;\n}\n", + "\n\n size_t N = x.size();\n size_t result = N;\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even[i] = (x[i] % 2 == 0);\n });\n\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even[i] && i < firstEven) {\n firstEven = i;\n }\n }, firstEven);\n\n return firstEven;\n}\n", + "\n\n size_t result = Kokkos::parallel_reduce(\n x.extent(0), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n },\n result);\n return result;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n result = Kokkos::atomic_fetch_add(&result, 1);\n }\n },\n result);\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t firstEven = n;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n firstEven = i;\n return;\n }\n });\n return firstEven;\n}\n", + "\n\n size_t N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n Kokkos::atomic_fetch_max(&findFirstEven_result, i);\n }\n });\n return findFirstEven_result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use OpenMP to parallelize the search.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 
0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n#pragma omp cancel for\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t result = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0 && i < result) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t first_even = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n return first_even;\n}\n", + "\n\n size_t firstEven = -1;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (firstEven == -1) {\n firstEven = i;\n }\n }\n }\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t firstEven = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0 && i < firstEven) {\n#pragma omp critical\n firstEven = i;\n }\n }\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t firstEven = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < firstEven) {\n firstEven = i;\n }\n }\n }\n return firstEven;\n}\n", + "\n\n size_t n = x.size();\n size_t first_even = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n return first_even;\n}\n", + "\n\n size_t index = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < index) {\n index = i;\n }\n }\n }\n return index;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t result = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < result) {\n result = i;\n }\n 
}\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "serial", + "prompt": "/* Return the index of the first even number in the vector x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "hip", + "prompt": "/* Find the index of the first even number in the vector x. Store it in firstEvenIndex.\n Use AMD HIP to parallelize the search. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\n__global__ void findFirstEven(const int *x, size_t N, size_t *firstEvenIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int myIndex = hipGetGlobalIndex();\n if (myIndex < N && x[myIndex] % 2 == 0) {\n *firstEvenIndex = myIndex;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N && x[i] % 2 == 0) {\n *firstEvenIndex = i;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int blockId = hipBlockIdx_x;\n int blockDim = hipBlockDim_x;\n int gridDim = hipGridDim_x;\n\n *firstEvenIndex = N;\n\n if (x[tid] % 2 == 0) {\n atomicMin(firstEvenIndex, tid);\n }\n\n __syncthreads();\n\n if (tid == 0) {\n if (*firstEvenIndex == N) {\n *firstEvenIndex = -1;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = block.thread.x;\n if (tid == 0) {\n *firstEvenIndex = N;\n }\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = min(*firstEvenIndex, tid);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use MPI to parallelize the search. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t n = x.size();\n size_t result = n;\n for (size_t i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, &result, 1, MPI::INT, MPI::MINLOC);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = -1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result = -1;\n MPI_Allgather(&result, 1, MPI_INT, &global_result, 1, MPI_INT, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n for (size_t i = 0; i < x.size(); ++i) {\n if (global_result != -1 && global_result <= i) {\n return global_result;\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return n;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t n = x.size();\n size_t firstEven = n;\n for (size_t i = rank; i < n; i += size) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &firstEven, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t result = -1;\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n size_t firstEven = x.size();\n for (size_t i = myRank; i < x.size(); i += numRanks) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = -1;\n for (size_t i = my_rank; i < x.size(); i += my_size) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t firstEven = -1;\n for (size_t i = rank; i < 
x.size(); i += size) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_result;\n } else {\n return x.size();\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_first_even = -1;\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n local_first_even = i;\n break;\n }\n }\n int global_first_even = -1;\n MPI_Allreduce(&local_first_even, &global_first_even, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_first_even;\n}\n", + "\n\n size_t result = x.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n size_t n = x.size();\n size_t firstEven = n;\n for (size_t i = myRank; i < n; i += MPI::procs()) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_MINLOC, MPI_INT);\n return firstEven;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank searches for the first even number in its portion of the vector.\n size_t my_first_even = x.size();\n for (size_t i = my_rank; i < x.size(); i += num_ranks) {\n if (x[i] % 2 == 0) {\n my_first_even = i;\n break;\n }\n }\n\n // Gather the results from all ranks.\n std::vector first_evens(num_ranks);\n MPI_Gather(&my_first_even, 1, MPI_UNSIGNED_LONG, first_evens.data(), 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Find the smallest index among the gathered results.\n size_t first_even = x.size();\n if (my_rank == 0) {\n for (size_t i = 0; i < num_ranks; ++i) {\n if (first_evens[i] < first_even) {\n first_even = first_evens[i];\n }\n }\n }\n\n return first_even;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t n = x.size();\n size_t local_first_even = n;\n for (size_t i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n local_first_even = i;\n break;\n }\n }\n size_t first_even;\n MPI_Allreduce(&local_first_even, &first_even, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return first_even;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_result;\n } else {\n return x.size();\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = x.size();\n for (size_t i = my_rank; i < x.size(); i += my_size) {\n if (x[i] % 
2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t local_min = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n local_min = i;\n break;\n }\n }\n size_t global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_UNSIGNED_LONG, MPI_MIN,\n MPI_COMM_WORLD);\n return global_min;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "cuda", + "prompt": "/* Find the index of the first even number in the vector x. Store it in firstEvenIndex.\n Use CUDA to parallelize the search. The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\n__global__ void findFirstEven(const int *x, size_t N, size_t *firstEvenIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "(", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "mpi+omp", + "prompt": "#include 
\n#include \n\n/* Return the index of the first even number in the vector x.\n Use MPI and OpenMP to parallelize the search. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n #pragma openmp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma openmp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n int result = -1;\n #pragma omp parallel for reduction(min:result)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (i < result) {\n result = i;\n }\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_INT, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_result = -1;\n int global_result = -1;\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (my_result == -1) {\n my_result = i;\n }\n }\n }\n }\n\n MPI_Allreduce(&my_result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int my_rank, n_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);\n\n size_t first_even = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n }\n\n if (my_rank == 0) {\n MPI_Reduce(&first_even, &first_even, 1, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(&first_even, nullptr, 1, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n\n return first_even;\n}\n", + "\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int my_result = -1;\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n my_result = i;\n break;\n }\n }\n if (my_result != -1) {\n #pragma omp critical\n {\n if 
(result == -1 || my_result < result) {\n result = my_result;\n }\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t result = -1;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (result == -1) {\n result = i;\n }\n }\n break;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "_", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n #pragma omp cancel for\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n size_t first_even = x.size();\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n first_even = i;\n break;\n }\n }\n }\n\n MPI_Allreduce(&first_even, &first_even, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return first_even;\n } else {\n return x.size();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t firstEven = -1;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (firstEven == -1) {\n firstEven = i;\n }\n }\n }\n\n if (rank == 0) {\n MPI_Bcast(&firstEven, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel\n {\n size_t local_result = x.size();\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n local_result = i;\n break;\n }\n }\n #pragma omp critical\n result = std::min(result, local_result);\n }\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = x.size();\n #pragma omp parallel for reduction(min : result)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n\n if (my_rank == 0) {\n return result;\n } else {\n return x.size();\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": 
"cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use Kokkos to search in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(Kokkos::View const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto contains_lambda = KOKKOS_LAMBDA(const int& v) { return v == target; };\n return any(policy_t(x.extent(0)), contains_lambda, x);\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n auto contains = Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) {\n contains |= (x(i) == target);\n },\n false);\n return contains;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto contains_target = Kokkos::scan(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, const bool& contains_target) {\n return contains_target || (x(i) == target);\n },\n false, tag_type());\n\n return contains_target;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, false);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n Kokkos::Scan(policy_type(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& result) {\n if (x(i) == target) { result = 1; }\n }, tag_type());\n\n return Kokkos::Max(Kokkos::View(1)) != 0;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n auto contains_lambda = KOKKOS_LAMBDA(const int& value) { return value == target; };\n return 
Kokkos::any(policy_type(0, x.extent(0)), contains_lambda, x);\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto result = Kokkos::parallel_reduce(\n policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n },\n false, tag_type());\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto contains = Kokkos::scan(policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, const bool& contains_so_far) {\n return contains_so_far || (x(i) == target);\n }, false, tag_type());\n\n return contains;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(\n x.extent(0), KOKKOS_LAMBDA(const int& i, bool& found) {\n if (x(i) == target) { found = true; }\n },\n found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use OpenMP to search in parallel.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n 
bool found = false;\n #pragma omp parallel for shared(found)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n #pragma omp critical\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "serial", + "prompt": "/* Return true if the vector x contains the value `target`. 
Return false otherwise.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "<", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "&", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "hip", + "prompt": "/* Set `found` to true if the vector x contains the value `target`. Set it to false otherwise.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\n__global__ void contains(const int *x, size_t N, int target, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + ", 8, 2, 6, 4, 6", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int offset = bid * hipBlockDim_x + tid;\n if (offset < N && x[offset] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n bool found = false;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (found) {\n return true;\n }\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n found = true;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (found) {\n return true;\n }\n }\n }\n return false;\n}\n", + "\n\n int result = 0;\n for (int i = 0; i < x.size(); i++) {\n result |= (x[i] == target);\n }\n int global_result;\n MPI_Allreduce(&result, 1, MPI_INT, &global_result, 1, MPI_INT, MPI_BOR, MPI_COMM_WORLD);\n if (global_result) {\n return true;\n }\n return false;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int has_target = (std::find(x.begin(), x.end(), target) != x.end());\n int global_has_target;\n MPI_Allreduce(&has_target, &global_has_target, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n return global_has_target;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return global_found;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_contains = 0;\n for (int i = rank; i < n; i += size) {\n if (x[i] == target) {\n local_contains = 
1;\n break;\n }\n }\n\n int global_contains;\n MPI_Allreduce(&local_contains, &global_contains, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_contains;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n bool found = false;\n for (int i = rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n MPI::COMM_WORLD.Allreduce(MPI::IN_PLACE, &found, 1, MPI::BOOL, MPI::LOR);\n return found;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n bool local_contains = false;\n for (int i = start; i < end; i++) {\n if (x[i] == target) {\n local_contains = true;\n break;\n }\n }\n\n bool global_contains;\n MPI_Allreduce(&local_contains, &global_contains, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_contains;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n bool found = false;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool found = std::find(x.begin(), x.end(), target) != x.end();\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "cuda", + "prompt": "/* Set `found` to true if the vector x contains the value `target`. Set it to false otherwise.\n Use CUDA to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\n__global__ void contains(const int *x, size_t N, int target, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n found[0] = true;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n int local_result = 0;\n #pragma omp parallel for reduction(+:local_result)\n for (int i = rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n local_result = 1;\n }\n }\n int global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_result > 0;\n }\n return false;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "(", + "\n\n int rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel shared(result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n result = true;\n break;\n }\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n MPI_Reduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, 
MPI_COMM_WORLD);\n if (rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int found = 0;\n #pragma omp parallel for reduction(+:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = 1;\n break;\n }\n }\n int result;\n MPI_Allreduce(&found, &result, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int found = 0;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = 1;\n }\n }\n int result;\n MPI_Allreduce(&found, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n bool found = false;\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n }\n }\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_result = 0;\n #pragma omp parallel for reduction(||:local_result)\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n local_result = 1;\n }\n }\n int result;\n MPI_Reduce(&local_result, &result, 1, MPI_INT, MPI_LOR, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_size = n / 
MPI_COMM_SIZE;\n int start = my_rank * local_size;\n int end = start + local_size;\n bool found = false;\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_found;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use Kokkos to search in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(Kokkos::View const& x, Kokkos::View const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View found_x(\"found_x\", x.extent(0));\n Kokkos::View found_y(\"found_y\", y.extent(0));\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n found_x(i) = (x(i) == val);\n });\n\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(const int& i) {\n found_y(i) = (y(i) == val);\n });\n\n Kokkos::View found_both(\"found_both\", 1);\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& found_both) {\n found_both |= (found_x(i) && found_y(i));\n });\n\n return !found_both(0);\n}\n", + "\n\n return Kok \u86cb\u7cd5::parallel_reduce(Kok\u86cb\u7cd5::make_range(x.size()),\n Kok\u86cb\u7cd5::plus(),\n [x, y, val](int i) { return (x[i] == val) ^ (y[i] == val); },\n 0);\n}\n", + "\n\n // Create a Kokkos::View of the union of the two input vectors\n Kokkos::View union_xy(\"union_xy\", x.extent(0) + y.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(int i) {\n union_xy(i) = x(i);\n });\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(int i) {\n union_xy(x.extent(0) + i) = y(i);\n });\n\n // Create a Kokkos::View of the intersection of the two input vectors\n Kokkos::View intersection_xy(\"intersection_xy\", x.extent(0) + y.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(int i) {\n for (int j = 0; j < y.extent(0); j++) {\n if (x(i) == y(j)) {\n intersection_xy(i) = x(i);\n break;\n }\n }\n });\n\n // Check if the value is in the union but not in the intersection\n bool found_in_union = false;\n bool found_in_intersection = false;\n Kokkos::parallel_for(union_xy.extent(0), KOKKOS_LAMBDA(int i) {\n if (union_xy(i) == val) {\n found_in_union = true;\n }\n if (intersection_xy(i) == val) {\n found_in_intersection = true;\n }\n });\n\n return found_in_union && !found_in_intersection;\n}\n", + "\n\n using namespace Kokkos;\n const int N = x.extent(0);\n const int M = y.extent(0);\n\n auto lambda = KOKKOS_LAMBDA(const int i) {\n bool found_x = false;\n bool found_y = false;\n for (int j = 0; j < N; ++j) {\n if (x(j) == val) { found_x = true; }\n }\n for (int j = 0; j < M; ++j) {\n if (y(j) == val) { found_y = true; }\n }\n return found_x != found_y;\n };\n\n bool result = parallel_reduce(N, lambda, (bool)false);\n return result;\n}\n", + "\n\n using namespace Kokkos;\n\n auto lambda = KOKKOS_LAMBDA(int i) {\n bool found_x = false;\n bool found_y = false;\n for (int j = 0; j < x.extent(0); ++j) {\n if (x(j) == val) {\n found_x = true;\n }\n }\n for (int j = 0; j < y.extent(0); ++j) {\n if (y(j) == val) 
{\n found_y = true;\n }\n }\n return found_x != found_y;\n };\n\n return parallel_reduce(x.extent(0), lambda, (bool)false);\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = find(x, val) != x.end();\n auto y_contains = find(y, val) != y.end();\n return x_contains ^ y_contains;\n}\n", + "\n\n // Create a Kokkos::View of the result\n Kokkos::View result(\"result\", x.extent(0));\n\n // Use Kokkos to search for `val` in `x` and `y` in parallel\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n result(i) = (x(i) == val) ^ (y(i) == val);\n });\n\n // Reduce the result to a single boolean value\n bool xorContains = Kokkos::reduce(result, Kokkos::And(), true);\n\n return xorContains;\n}\n", + "\n\n Kokkos::View result(\"result\", x.size());\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int i) {\n result[i] = (x[i] == val) ^ (y[i] == val);\n });\n return Kokkos::any(result);\n}\n", + "\n bool found_x = false;\n bool found_y = false;\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (x(i) == val) {\n found_x = true;\n }\n });\n\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (y(i) == val) {\n found_y = true;\n }\n });\n\n return found_x != found_y;\n}\n", + "\n\n Kokkos::View result(\"result\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(int i) {\n result[i] = (x[i] == val) ^ (y[i] == val);\n });\n return Kokkos::any(result);\n}\n", + "\n\n Kokkos::View found_x(\"found_x\", x.extent(0));\n Kokkos::View found_y(\"found_y\", y.extent(0));\n\n Kokkos::parallel_for(\"xorContains_x\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n found_x(i) = (x(i) == val);\n });\n\n Kokkos::parallel_for(\"xorContains_y\", y.extent(0), KOKKOS_LAMBDA(const int i) {\n found_y(i) = (y(i) == val);\n });\n\n bool found_in_x = false;\n bool found_in_y = false;\n\n Kokkos::parallel_reduce(\"xorContains_reduce\", x.extent(0), KOKKOS_LAMBDA(const int i, bool& update) {\n update = update || found_x(i);\n }, found_in_x);\n\n Kokkos::parallel_reduce(\"xorContains_reduce\", y.extent(0), KOKKOS_LAMBDA(const int i, bool& update) {\n update = update || found_y(i);\n }, found_in_y);\n\n return (found_in_x != found_in_y);\n}\n", + "\n\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = Kokkos::parallel_reduce(\n policy_t(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) { contains |= (x(i) == val); },\n false);\n auto y_contains = Kokkos::parallel_reduce(\n policy_t(0, y.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) { contains |= (y(i) == val); },\n false);\n return x_contains ^ y_contains;\n}\n", + "\n\n using namespace Kokkos;\n\n auto lambda = KOKKOS_LAMBDA(const int& x_i, const int& y_i) {\n return (x_i == val) ^ (y_i == val);\n };\n\n auto result = reduce(x.extent(0), lambda, (bool)false);\n\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n auto x_contains = find(x, val, policy_type(0, x.extent(0)));\n auto y_contains = find(y, val, policy_type(0, y.extent(0)));\n return (x_contains != x.extent(0)) ^ (y_contains != y.extent(0));\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& contains) {\n contains |= (x(i) == val);\n }, (int)0);\n\n auto y_contains = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, y.extent(0)), KOKKOS_LAMBDA(int i, int& contains) {\n contains |= (y(i) == val);\n }, (int)0);\n\n return 
(x_contains && !y_contains) || (!x_contains && y_contains);\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int sum) {\n return sum + (x(i) == val);\n }, Kokkos::Sum(0));\n\n auto y_contains = Kokkos::parallel_reduce(y.extent(0), KOKKOS_LAMBDA(int i, int sum) {\n return sum + (y(i) == val);\n }, Kokkos::Sum(0));\n\n return x_contains != y_contains;\n}\n", + "\n\n // Create a Kokkos::View to store the results of the search\n Kokkos::View result(\"result\", x.extent(0));\n\n // Perform the search in parallel\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i) {\n result(i) = (x(i) == val) ^ (y(i) == val);\n });\n\n // Reduce the results to a single boolean value\n bool found = false;\n Kokkos::parallel_reduce(\"xorContains_reduce\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i, bool& found) {\n found |= result(i);\n }, found);\n\n return found;\n}\n", + "\n\n // Create a Kokkos view for the result\n Kokkos::View result(\"result\", x.extent(0));\n\n // Parallel for loop to check if val is in only one of x or y\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n bool found_in_x = false;\n bool found_in_y = false;\n for (int j = 0; j < x.extent(0); ++j) {\n if (x(j) == val) {\n found_in_x = true;\n }\n if (y(j) == val) {\n found_in_y = true;\n }\n }\n result(i) = (found_in_x != found_in_y);\n });\n\n // Reduce the result to a single boolean value\n bool final_result = Kokkos::reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(bool a, bool b) { return a || b; }, result);\n\n return final_result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = find(x, val, policy_t(0, x.extent(0)));\n auto y_contains = find(y, val, policy_t(0, y.extent(0)));\n return (x_contains != x.extent(0)) ^ (y_contains != y.extent(0));\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& contains) {\n if (x(i) == val) { contains = 1; }\n }, Kokkos::Scan::plus());\n\n auto y_contains = Kokkos::parallel_reduce(y.extent(0), KOKKOS_LAMBDA(int i, int& contains) {\n if (y(i) == val) { contains = 1; }\n }, Kokkos::Scan::plus());\n\n return (x_contains == 1) ^ (y_contains == 1);\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use OpenMP to search in parallel.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n #pragma omp critical\n {\n found_x = true;\n }\n }\n }\n #pragma omp for\n for (int i = 0; i < 
y.size(); ++i) {\n if (y[i] == val) {\n #pragma omp critical\n {\n found_y = true;\n }\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n #pragma omp critical\n found_x = true;\n }\n }\n\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n #pragma omp critical\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n #pragma omp critical\n {\n found_x = true;\n }\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n #pragma omp critical\n {\n found_y = true;\n }\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n return x_count != y_count;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return (x_count + y_count) == 1;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == 
val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool in_x = false;\n bool in_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n in_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n in_y = true;\n }\n }\n }\n\n return in_x ^ in_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return x_count != y_count;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n } else {\n found = false;\n #pragma omp cancel for\n }\n }\n }\n }\n #pragma omp parallel for\n for (size_t i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n } else {\n found = false;\n #pragma omp cancel for\n }\n }\n }\n }\n return found;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (size_t i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return x_count != y_count;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n return x_count != y_count;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "serial", + "prompt": "/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n return std::find(x.begin(), x.end(), val) 
!= x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) !=\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) !=\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n bool in_x = false;\n bool in_y = false;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n in_x = true;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n in_y = true;\n }\n }\n return in_x != in_y;\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "hip", + "prompt": "/* Set `found` to true if `val` is only in one of vectors x or y.\n Set it to false if it is in both or neither.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\n__global__ void xorContains(const int *x, const int *y, size_t N, int val, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = tid + block * blockDim;\n int end = min(start + blockDim, N);\n bool found_x = false;\n bool found_y = false;\n for (int i = start; i < end; i++) {\n found_x |= (x[i] == val);\n found_y |= (y[i] == val);\n }\n found[tid] = found_x ^ found_y;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n *found = found_x ^ found_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipGridSize_x) {\n if (x[i] == val) found_x = true;\n if (y[i] == val) found_y = true;\n }\n found[tid] = found_x ^ found_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool x_contains = x[tid] == val;\n bool y_contains = y[tid] == val;\n found[tid] = x_contains ^ y_contains;\n }\n}\n", + ".", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool x_has = (x[tid] == val);\n bool y_has = (y[tid] == val);\n found[tid] = (x_has ^ y_has);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int block_size = hipBlockDim_x;\n int block_id = hipBlockIdx_x;\n int i = block_id * block_size + tid;\n\n if (i < N) {\n bool x_contains = x[i] == val;\n bool y_contains = y[i] == val;\n found[i] = x_contains ^ y_contains;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n *found = inX ^ inY;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n bool found_x = (x[i] == val);\n bool found_y = (y[i] == val);\n found[i] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool found_x = (x[tid] == val);\n bool found_y = (y[tid] == val);\n found[tid] = (found_x ^ found_y);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int idx = tid + N;\n if (idx < 2 * N) {\n *found = (x[tid] == val) || (y[idx - N] == val);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n found[tid] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n *found = (x[gid] == val) ^ (y[gid] == val);\n }\n}\n", 
+ "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = (x_has_val != y_has_val);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool found_x = (x[tid] == val);\n bool found_y = (y[tid] == val);\n found[tid] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = (in_x != in_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n *found = (x[gid] == val) ^ (y[gid] == val);\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Return the result on rank 0.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int myRank, nRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &nRanks);\n\n bool xContains = std::find(x.begin(), x.end(), val) != x.end();\n bool yContains = std::find(y.begin(), y.end(), val) != y.end();\n\n int xContainsInt = xContains ? 1 : 0;\n int yContainsInt = yContains ? 
1 : 0;\n\n int xorContainsInt = xContainsInt ^ yContainsInt;\n\n int globalXorContainsInt;\n MPI_Reduce(&xorContainsInt, &globalXorContainsInt, 1, MPI_INT, MPI_LOR, 0, MPI_COMM_WORLD);\n\n bool globalXorContains = globalXorContainsInt != 0;\n\n if (myRank == 0) {\n return globalXorContains;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int sum;\n MPI_Allreduce(&x_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return sum == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int total_count;\n MPI_Allreduce(&x_count, &total_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &total_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return total_count == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_contains = std::find(x.begin(), x.end(), val) != x.end();\n int y_contains = std::find(y.begin(), y.end(), val) != y.end();\n\n int result = x_contains ^ y_contains;\n 
MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int sum;\n MPI_Allreduce(&x_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return sum == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n\n int global_x_count;\n int global_y_count;\n\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) x_count++;\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) y_count++;\n }\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (global_x_count == 0 && global_y_count > 0) return true;\n if (global_x_count > 0 && global_y_count == 0) return true;\n return false;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) x_count++;\n if (y[i] == val) y_count++;\n }\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n\n int x_count_sum;\n int y_count_sum;\n\n MPI_Allreduce(&x_count, &x_count_sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &y_count_sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (x_count_sum + y_count_sum) % 2 == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int global_x_count;\n int global_y_count;\n\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (global_x_count == 1 || global_y_count == 1);\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, 
&global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "cuda", + "prompt": "/* Set `found` to true if `val` is only in one of vectors x or y.\n Set it to false if it is in both or neither.\n Use CUDA to search in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\n__global__ void xorContains(const int *x, const int *y, size_t N, int val, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool x_has = false;\n bool y_has = false;\n for (int i = 0; i < N; i++) {\n if (x[i] == val) x_has = true;\n if (y[i] == val) y_has = true;\n }\n found[tid] = x_has != y_has;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *found = (*found) ^ (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n found[tid] = (inX ^ inY);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n *found = *found || (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *found = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = x_has_val ^ y_has_val;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + 
"=cernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecerne,cernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecerne)cerne)cerne)cerne)", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = x_has_val ^ y_has_val;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = (in_x ^ in_y);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = (x_has_val != y_has_val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = 0; i < N; i++) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n found[tid] = (found_x != found_y);\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Return the result on rank 0.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int xor_count = x_count + y_count;\n\n int result;\n MPI_Allreduce(&xor_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (global_x_count == 1 || global_y_count == 1) {\n return true;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count;\n int global_y_count;\n MPI_Reduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_x_count != global_y_count;\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int result;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < n; i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int j = 0; j < m; j++) {\n if (y[j] == val) {\n y_count++;\n }\n }\n }\n\n int result = x_count + y_count;\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return result == 1;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel for reduction(+:x_count)\n for (int i = 0; i < n; i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = 0; i < m; i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result = x_count + y_count;\n bool ans = (result == 1);\n\n 
MPI_Reduce(&ans, &ans, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n return ans;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int local_x_count = 0;\n int local_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n local_x_count++;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n local_y_count++;\n }\n }\n #pragma omp critical\n {\n x_count += local_x_count;\n y_count += local_y_count;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(MPI_IN_PLACE, &y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return (x_count == 1 || y_count == 1);\n }\n return false;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n#pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int nt = omp_get_num_threads();\n for (int i = tid; i < n; i += nt) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = tid; i < m; i += nt) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int x_count_global;\n int y_count_global;\n MPI_Allreduce(&x_count, &x_count_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &y_count_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (x_count_global == 1 || y_count_global == 1) {\n return true;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int n_threads = omp_get_num_threads();\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel for\n for (int i = rank; i < n; i += n_threads) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found_x, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n MPI_Allreduce(&found_y, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank searches for `val` in its portion of x and y.\n int my_start = my_rank * n / nprocs;\n int my_end = (my_rank + 1) * n / nprocs;\n bool found_in_x = false;\n bool found_in_y = false;\n #pragma omp parallel for\n for (int i = my_start; i < my_end; i++) {\n if (x[i] == val) {\n found_in_x = true;\n }\n if (y[i] == val) {\n found_in_y = true;\n }\n }\n\n // Reduce the results from all ranks to rank 0.\n bool result;\n MPI_Reduce(&found_in_x, &result, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n MPI_Reduce(&found_in_y, &result, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n\n // Rank 0 returns the final result.\n if (my_rank == 0) {\n return result != (found_in_x && found_in_y);\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n ++x_count;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n ++y_count;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n 
x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int count_x = 0;\n int count_y = 0;\n\n #pragma omp parallel for reduction(+:count_x)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n count_x++;\n }\n }\n\n #pragma omp parallel for reduction(+:count_y)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n count_y++;\n }\n }\n\n int result = count_x + count_y;\n bool ans = (result == 1);\n\n MPI_Allreduce(&ans, &ans, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n return ans;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n #pragma omp parallel\n {\n int my_x_count = 0, my_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) my_x_count++;\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) my_y_count++;\n }\n #pragma omp critical\n {\n x_count += my_x_count;\n y_count += my_y_count;\n }\n }\n int result;\n MPI_Reduce(&x_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (result == 1) return true;\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result = x_count + y_count;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result == 1;\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel reduction(+:x_count, y_count)\n {\n int tid = omp_get_thread_num();\n int nthreads = omp_get_num_threads();\n for (int i = tid; i < x.size(); i += nthreads) {\n if (x[i] == val) {\n x_count++;\n }\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count, global_y_count;\n MPI_Reduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return (global_x_count == 1 || global_y_count == 1);\n }\n return false;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n bool result = false;\n MPI_Allreduce(&found_x, &result, 1, MPI_C_BOOL, MPI_BXOR, MPI_COMM_WORLD);\n MPI_Allreduce(&found_y, &result, 1, MPI_C_BOOL, MPI_BXOR, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return result;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel for 
reduction(+:x_count)\n for (int i = rank; i < x.size(); i += nprocs) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = rank; i < y.size(); i += nprocs) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result;\n MPI_Reduce(&x_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result == 1;\n }\n return false;\n}\n", + "\n\n int x_contains = 0;\n int y_contains = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_contains)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_contains = 1;\n }\n }\n\n #pragma omp for reduction(+:y_contains)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_contains = 1;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_contains, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n MPI_Allreduce(&y_contains, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int local_x_count = 0;\n int local_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n local_x_count++;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n local_y_count++;\n }\n }\n #pragma omp critical\n {\n x_count += local_x_count;\n y_count += local_y_count;\n }\n }\n if (my_rank == 0) {\n return x_count != y_count;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Book {\n const char *title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use Kokkos to search in parallel. 
Assume Kokkos is already initialized.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(Kokkos::View const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using policy_t = Kokkos::RangePolicy;\n using range_t = Kokkos::Range;\n\n size_t result = books.size();\n Kokkos::parallel_reduce(range_t(0, books.size()), KOKKOS_LAMBDA(size_t i, size_t& result) {\n if (books(i)->pages < 100) {\n result = i;\n }\n });\n return result;\n}\n", + "\n\n size_t last_short_book_index = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_short_book_index = books.size() - 1;\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& last_short_book_index) {\n if (books[i]->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_for(books.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t result = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& result) {\n if (books(i)->pages < 100) {\n result = i;\n }\n });\n return result;\n}\n", + "\n\n size_t last_short_book = books.size() - 1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book) {\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n });\n return last_short_book;\n}\n", + "\n\n Kokkos::View lastShortBook(\"lastShortBook\", books.extent(0));\n Kokkos::parallel_for(\"findLastShortBook\", books.extent(0), KOKKOS_LAMBDA(const size_t i) {\n if (books(i).pages < 100) {\n lastShortBook(i) = i;\n }\n });\n size_t result = books.extent(0);\n for (size_t i = books.extent(0) - 1; i >= 0; i--) {\n if (lastShortBook(i) != 0) {\n result = lastShortBook(i);\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t last_short_book = books.size();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& last) {\n if (books[i].pages < 100) {\n last = i;\n }\n }, last_short_book);\n return last_short_book;\n}\n", + "\n\n size_t last_idx = books.size();\n Kokkos::parallel_for(books.size(), KOKKOS_LAMBDA(const size_t& i) {\n if (books[i]->pages < 100) {\n last_idx = i;\n }\n });\n return last_idx;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const int i, const size_t& last) {\n if (books(i).pages < 100) {\n return i;\n }\n return last;\n }, 0);\n}\n", + "\n\n size_t last_index = books.size() - 1;\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books(i).pages >= 100) {\n last_index = i - 1;\n }\n }\n return last_index;\n}\n", + "\n\n size_t last_short_book = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last) {\n if (books(i)->pages < 100) {\n last = i;\n }\n }, last_short_book);\n return last_short_book;\n}\n", + "\n\n size_t last_short_book = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& last_short_book) 
{\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n });\n return last_short_book;\n}\n", + "\n\n size_t last_short_book = books.size();\n\n Kokkos::parallel_reduce(books.extent(0), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book) {\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n }, last_short_book);\n\n return last_short_book;\n}\n", + "\n\n size_t lastShortBook = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& lastShortBook) {\n if (books[i]->pages < 100) {\n lastShortBook = i;\n }\n });\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBook = books.size() - 1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& lastShortBook) {\n if (books(i)->pages < 100) {\n lastShortBook = i;\n }\n });\n return lastShortBook;\n}\n", + "\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_index) {\n if (books(i)->pages < 100) {\n last_index = i;\n }\n });\n return last_index;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n using l_type = long;\n\n auto last_short_book = Kokkos::parallel_scan(\n policy_type(0, books.size()),\n KOKKOS_LAMBDA(const size_t& i, l_type& last) {\n if (books[i]->pages < 100) {\n last = i;\n }\n },\n last_short_book);\n\n return last_short_book;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_scan(policy_type(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n }, tag_type());\n\n return last_short_book_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use OpenMP to search in parallel.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for 
(size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t last_short_book_index = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n last_short_book_index = i;\n }\n }\n return last_short_book_index;\n}\n", + "\n\n size_t last_short = books.size();\n #pragma omp parallel for reduction(max:last_short)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n last_short = i;\n }\n }\n return last_short;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n#pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "_", + "\n\n size_t last_idx = books.size() - 1;\n#pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n last_idx = i - 1;\n break;\n }\n }\n return last_idx;\n}\n", + "\n\n size_t last_short_book_index = books.size() - 1;\n#pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages >= 100) {\n#pragma omp critical\n last_short_book_index = i - 1;\n }\n }\n return last_short_book_index;\n}\n", + "\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n#pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n" + ] + }, + { + "problem_type": "search", + 
"language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "serial", + "prompt": "struct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "hip", + "prompt": "struct Book {\n const char* title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books 
where Book.pages is less than 100.\n Store the result in lastShortBookIndex.\n Use AMD HIP to search in parallel. The kernel is launched with one thread for every book element.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\n__global__ void findLastShortBook(const Book *books, size_t N, size_t *lastShortBookIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use MPI to search in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of books. 
Return the result on rank 0.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return lastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "_", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = -1;\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return lastShortBook;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n lastShortBook = i - 1;\n break;\n }\n }\n\n size_t globalLastShortBook;\n MPI_Allreduce(&lastShortBook, &globalLastShortBook, 1, MPI_UNSIGNED_LONG, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalLastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n for (size_t i = result + 1; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n result = i - 1;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = 0;\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n int globalResult;\n MPI_Allreduce(&result, &globalResult, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalResult;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = 
books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n if (result == books.size()) {\n result = -1;\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "cuda", + "prompt": "struct Book {\n const char* title;\n int pages;\n};\n\n/* Find the index of the last Book item in the vector books where Book.pages is less than 100.\n Store the result in lastShortBookIndex.\n Use CUDA to search in parallel. The kernel is launched with one thread for every book element.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\n__global__ void findLastShortBook(const Book *books, size_t N, size_t *lastShortBookIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n if (books[idx].pages < 100) {\n *lastShortBookIndex = idx;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 
100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid >= N) return;\n\tif (books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx >= N) {\n return;\n }\n if (books[idx].pages < 100) {\n *lastShortBookIndex = idx;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use MPI and OpenMP to search in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of books. Return the result on rank 0.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = 0;\n #pragma omp parallel for reduction(max:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = 0;\n #pragma omp parallel for reduction(max:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n if (rank == 0) {\n return lastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 
100) {\n result = i - 1;\n }\n }\n return result;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_UNSIGNED_LONG, MPI_MAX, MPI_COMM_WORLD);\n return lastShortBook;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n size_t local_result = books.size();\n\n #pragma omp parallel reduction(min: local_result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n local_result = i;\n }\n }\n\n MPI_Allreduce(&local_result, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min: result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n int myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n size_t result = books.size();\n #pragma omp parallel for reduction(min: result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n if (myRank == 0) {\n return result;\n }\n return books.size();\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n size_t rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n\n if (rank == 0) {\n return result;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "_", + "\n\n size_t result = books.size() - 1;\n size_t myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n #pragma openmp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n MPI_Allgather(&result, 1, MPI_UNSIGNED_LONG, &result, 1, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n if (myRank == 0) {\n result = books.size() - 1;\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use Kokkos to search in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n auto min_distance = Kokkos::Max();\n size_t min_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n },\n min_distance, min_index);\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n size_t N = x.extent(0);\n size_t closest_idx = 0;\n double closest_diff = std::abs(x(0) - M_PI);\n Kokkos::parallel_for(\n policy_type(0, N), KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n });\n return closest_idx;\n}\n", + "\n\n const double pi = M_PI;\n Kokkos::View closest_distance = Kokkos::View(1);\n Kokkos::View closest_index = Kokkos::View(1);\n closest_distance[0] = std::numeric_limits::max();\n closest_index[0] = 0;\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double distance = std::abs(x[i] - pi);\n if (distance < closest_distance[0]) {\n closest_distance[0] = distance;\n closest_index[0] = i;\n }\n });\n\n return closest_index[0];\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n size_t closest_index = 0;\n double closest_distance = std::abs(x(0) - M_PI);\n Kokkos::parallel_reduce(policy_t(0, x.size()), KOKKOS_LAMBDA(size_t i, double& closest_distance, size_t& closest_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }, closest_distance, closest_index);\n return closest_index;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n const double pi = M_PI;\n size_t N = x.extent(0);\n size_t closest_idx = 0;\n double min_dist = std::abs(x(0) - pi);\n\n Kokkos::parallel_for(\n policy_t(0, N), KOKKOS_LAMBDA(const size_t& i) {\n double dist = std::abs(x(i) - pi);\n if (dist < min_dist) {\n closest_idx = i;\n min_dist = dist;\n }\n });\n\n return closest_idx;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::Functor;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n struct findToPiFunctor {\n Kokkos::View const x;\n double min_diff;\n int min_index;\n\n findToPiFunctor(Kokkos::View const x) : x(x), min_diff(std::numeric_limits::max()), min_index(-1) {}\n\n KOKKOS_INLINE_FUNCTION\n void operator()(const Kokkos::Work& work) const {\n const int i = work.league_rank();\n const double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n };\n\n findToPiFunctor f(x);\n\n Kokkos::parallel_for(RangePolicy<>(0, x.size()), f);\n\n return f.min_index;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View min_diff = Kokkos::View(1);\n Kokkos::View min_index = Kokkos::View(1);\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff(0) || (diff == min_diff(0) && x(i) > x(min_index(0)))) {\n min_diff(0) = diff;\n min_index(0) = i;\n }\n });\n\n return min_index(0);\n}\n", + "\n\n using namespace Kokkos;\n\n auto closest_idx = 0;\n auto min_diff 
= std::numeric_limits::max();\n auto pi = M_PI;\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& closest_idx) {\n auto diff = std::abs(x(i) - pi);\n if (diff < min_diff) {\n min_diff = diff;\n closest_idx = i;\n }\n }, min_diff, closest_idx);\n\n return closest_idx;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Policy = Kokkos::RangePolicy;\n\n auto min_distance = Kokkos::Max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(Policy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, double& min_distance_local) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < min_distance_local) {\n min_distance_local = distance;\n min_index = i;\n }\n }, min_distance);\n\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n using namespace std;\n\n const double pi = M_PI;\n const size_t n = x.extent(0);\n double min_diff = numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(const size_t& i, double& min_diff_local, size_t& min_idx_local) {\n double diff = abs(x(i) - pi);\n if (diff < min_diff_local) {\n min_diff_local = diff;\n min_idx_local = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n\n size_t N = x.extent(0);\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& min_idx) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n\n size_t N = x.extent(0);\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& min_idx) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x(0) - M_PI);\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < closest_diff) {\n closest_idx = i;\n closest_diff = diff;\n }\n });\n\n return closest_idx;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n auto min_distance = std::numeric_limits::max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(policy_t(0, x.size()), KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n }, min_distance, min_index);\n\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_distance = std::numeric_limits::max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n },\n min_distance, min_index);\n\n return min_index;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::value_type;\n\n size_t N = x.size();\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const size_t& i, value_type& min_diff_local, value_type& min_idx_local) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff_local) 
{\n min_diff_local = diff;\n min_idx_local = i;\n }\n },\n min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n size_t n = x.extent(0);\n size_t closest_index = 0;\n double closest_distance = std::abs(x(0) - M_PI);\n\n Kokkos::parallel_for(\n policy_type(0, n), KOKKOS_LAMBDA(const size_t& i) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n });\n\n return closest_index;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Min;\n\n auto min_dist = Reducer(x.data(), x.size());\n double min_val = min_dist.template reduce();\n\n auto dist_from_pi = Reducer(x.data(), x.size(),\n [min_val](double& a, const double& b) {\n a = std::min(a, std::abs(b - M_PI));\n });\n double closest_dist = dist_from_pi.template reduce();\n\n auto closest_idx = Reducer(x.data(), x.size(),\n [closest_dist, min_val](size_t& a, const double& b) {\n if (std::abs(b - M_PI) == closest_dist &&\n std::abs(b - M_PI) < std::abs(min_val - M_PI)) {\n a = a + 1;\n }\n });\n size_t idx = closest_idx.template reduce();\n\n return idx - 1;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_dist = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(size_t i, size_t& min_idx, double& min_dist) {\n double dist = std::abs(x(i) - M_PI);\n if (dist < min_dist) {\n min_dist = dist;\n min_idx = i;\n }\n }, min_idx, min_dist);\n\n return min_idx;\n}\n", + "\n\n auto closest = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(size_t i, size_t closest) {\n auto diff = std::abs(x(i) - M_PI);\n if (closest == -1 || diff < std::abs(x(closest) - M_PI)) {\n closest = i;\n }\n return closest;\n },\n -1);\n return closest;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use OpenMP to search in parallel.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n#pragma omp critical\n {\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n #pragma omp parallel for reduction(min:min_diff) reduction(min:min_idx)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n\n return min_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::numeric_limits::max();\n\n#pragma omp parallel for\n 
for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n#pragma omp critical\n {\n if (diff < closest_diff) {\n closest_idx = i;\n closest_diff = diff;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_dist = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double dist = std::abs(x[i] - M_PI);\n if (dist < closest_dist) {\n #pragma omp critical\n {\n if (dist < closest_dist) {\n closest_dist = dist;\n closest_idx = i;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma for reduction(min:closest_diff)\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n // Find the index of the value in the vector x that is closest to the math constant PI\n // Use M_PI for the value of PI\n // Use OpenMP to search in parallel\n\n // Get the number of threads to use\n int num_threads = omp_get_num_threads();\n\n // Create a reduction variable to store the index of the closest value\n int closest_index = 0;\n\n // Create a reduction variable to store the smallest difference between a value in x and PI\n double smallest_difference = std::numeric_limits::max();\n\n // Parallel for loop to search for the closest value\n #pragma omp parallel for reduction(min : smallest_difference) reduction(min : closest_index)\n for (int i = 0; i < static_cast(x.size()); i++) {\n // Calculate the difference between the current value and PI\n double difference = std::abs(x[i] - M_PI);\n\n // If the difference is smaller than the smallest difference found so far, update the closest index and smallest difference\n if (difference < smallest_difference) {\n closest_index = i;\n smallest_difference = difference;\n }\n }\n\n // Return the index of the closest value\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n\n#pragma omp parallel for reduction(min : min_diff) reduction(min : min_index)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n#pragma omp parallel for reduction(min : min_diff) reduction(min : min_idx)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp 
parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n #pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double min_diff = std::abs(x[0] - M_PI);\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n min_diff = diff;\n closest_index = i;\n }\n }\n }\n }\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n }\n }\n\n return min_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n #pragma omp critical\n {\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n }\n }\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double min_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n#pragma omp critical\n {\n if (diff < min_diff) {\n closest_index = i;\n min_diff = diff;\n }\n }\n }\n }\n\n return closest_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "serial", + "prompt": "/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t 
findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double minDiff = std::numeric_limits::max();\n size_t minDiffIndex = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n return minDiffIndex;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + ": 2", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_dist = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double dist = std::abs(x[i] - M_PI);\n if (dist < min_dist) {\n min_dist = dist;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", 
+ "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double closest = std::abs(x[0] - M_PI);\n size_t closest_index = 0;\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n return closest_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "hip", + "prompt": "/* Find the index of the value in the vector x that is closest to the math constant PI. Store the index in closestToPiIndex.\n Use M_PI for the value of PI.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\n__global__ void findClosestToPi(const double *x, size_t N, size_t *closestToPiIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n double minDiff = abs(x[tid] - M_PI);\n int minDiffIndex = tid;\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n double diff = abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n\n if (tid == 0) {\n *closestToPiIndex = minDiffIndex;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n double diff = fabs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < minDiff) {\n minDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "losest", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double diff = abs(x[gid] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = gid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n 
if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n double diff = abs(x[gid] - M_PI);\n if (gid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = gid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double diff = fabs(x[gid] - M_PI);\n if (gid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = gid;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n double closest = std::numeric_limits::max();\n size_t closest_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n\n // Reduce the closest values to rank 0\n MPI_Reduce(&closest_index, &closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return closest_index;\n } else {\n return 0;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n size_t closest_index = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_index = i;\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(size);\n MPI_Gather(&closest_index, 1, MPI_UNSIGNED_LONG, closest_indices.data(), 1, MPI_UNSIGNED_LONG, 0);\n\n // Find the overall closest index\n size_t overall_closest_index = 0;\n double overall_closest_diff = std::abs(x[closest_indices[0]] - M_PI);\n for (size_t i = 1; i < size; ++i) {\n double diff = std::abs(x[closest_indices[i]] - M_PI);\n if (diff < overall_closest_diff) {\n overall_closest_diff = diff;\n overall_closest_index = closest_indices[i];\n }\n }\n\n // Return the overall closest index\n return overall_closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n\n int global_min_diff_index;\n MPI_Allreduce(&min_diff_index, &global_min_diff_index, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n\n return global_min_diff_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - 
M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n size_t n = x.size();\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n for (size_t i = 1; i < n; ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t n = x.size();\n size_t closest = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < n; i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest = i;\n }\n }\n return closest;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n int result;\n MPI_Reduce(&min_index, &result, 1, MPI_INT, MPI_MINLOC, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Find the index of the value in the vector that is closest to PI on each rank\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n // Reduce the closest indices to find the global closest index\n size_t globalClosestIndex;\n MPI_Reduce(&closestIndex, &globalClosestIndex, 1, MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the global closest index on rank 0\n if (rank == 0) {\n return globalClosestIndex;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n return result;\n}\n", + "\n\n double closest = std::numeric_limits::max();\n size_t closest_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_idx = i;\n }\n }\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::numeric_limits::max();\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t closest = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest = i;\n }\n }\n return closest;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < 
x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n int global_min_idx;\n MPI_Allreduce(&min_idx, &global_min_idx, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return global_min_idx;\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n double closest = std::numeric_limits::max();\n size_t closest_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n\n // Reduce the closest values to rank 0\n MPI_Reduce(&closest_index, &closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return closest_index;\n } else {\n return 0;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "cuda", + "prompt": "/* Find the index of the value in the vector x that is closest to the math constant PI. Store the index in closestToPiIndex.\n Use M_PI for the value of PI.\n Use CUDA to search in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\n__global__ void findClosestToPi(const double *x, size_t N, size_t *closestToPiIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + 
"\n\n int tid = threadIdx.x;\n double minDiff = abs(x[tid] - M_PI);\n int minDiffIndex = tid;\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n double diff = abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n if (tid == 0) {\n *closestToPiIndex = minDiffIndex;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = fabs(x[i] - M_PI);\n if (i == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < minDiff) {\n minDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double diff = abs(x[idx] - M_PI);\n if (idx == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = idx;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestDistance = distance;\n closestIndex = i;\n }\n }\n\n int globalClosestIndex;\n MPI_Allreduce(&closestIndex, &globalClosestIndex, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return globalClosestIndex;\n}\n", + "\n\n // Find the closest value to PI in parallel\n size_t closest_index = 0;\n double min_diff = std::numeric_limits::max();\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest_index = i;\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Allgather(&closest_index, 1, MPI::UNSIGNED_LONG, &closest_indices[0], 1, MPI::UNSIGNED_LONG);\n\n // Find the global closest index\n size_t global_closest_index = 0;\n min_diff = std::numeric_limits::max();\n for (size_t i = 0; i < closest_indices.size(); ++i) {\n double diff = std::abs(x[closest_indices[i]] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n global_closest_index = closest_indices[i];\n }\n }\n\n return global_closest_index;\n}\n", + "\n\n // Find the closest value to PI on each rank\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_index = i;\n closest_distance = distance;\n }\n }\n\n // Reduce the closest index to rank 0\n size_t global_closest_index;\n MPI_Reduce(&closest_index, &global_closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the global closest index on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double min_diff = std::numeric_limits::max();\n int min_diff_index = -1;\n\n #pragma omp parallel for reduction(min : min_diff) reduction(min : min_diff_index)\n for (int i = 0; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n\n return min_diff_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_distance)\n for (size_t i = 0; i < x.size(); ++i) {\n double distance 
= std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n\n return result;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the index of the value in the vector x that is closest to the math constant PI.\n size_t index = 0;\n double min_diff = std::numeric_limits::max();\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n index = i;\n }\n }\n\n // Return the result on rank 0.\n if (rank == 0) {\n return index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n // Get the number of ranks\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Find the closest value to PI on this rank\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_index = i;\n closest_distance = distance;\n }\n }\n\n // Reduce the closest indices and distances to find the overall closest value\n size_t global_closest_index;\n double global_closest_distance;\n MPI_Reduce(&closest_index, &global_closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n MPI_Reduce(&closest_distance, &global_closest_distance, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_distance)\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double minDiff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < minDiff) {\n 
#pragma omp critical\n if (diff < minDiff) {\n closestIndex = i;\n minDiff = diff;\n }\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t closest_idx = 0;\n double min_diff = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Initialize the index of the closest value to -1\n size_t closest_index = -1;\n\n // Initialize the minimum distance to a large value\n double min_distance = std::numeric_limits::max();\n\n // Loop over the elements of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n // Calculate the distance between the current element and PI\n double distance = std::abs(x[i] - M_PI);\n\n // Update the closest index and minimum distance if the current distance is smaller\n #pragma omp critical\n {\n if (distance < min_distance) {\n closest_index = i;\n min_distance = distance;\n }\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(num_ranks);\n MPI_Gather(&closest_index, 1, MPI_UNSIGNED_LONG, closest_indices.data(), 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Find the index with the smallest distance\n if (rank == 0) {\n size_t global_closest_index = -1;\n double global_min_distance = std::numeric_limits::max();\n for (size_t i = 0; i < num_ranks; ++i) {\n if (closest_indices[i] != -1 && x[closest_indices[i]] < global_min_distance) {\n global_closest_index = closest_indices[i];\n global_min_distance = x[closest_indices[i]];\n }\n }\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t closest_idx = 0;\n double min_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n closest_idx = i;\n min_diff = diff;\n }\n }\n }\n }\n\n return closest_idx;\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(Kokkos::View*> const& x, Kokkos::View &r, Kokkos::View &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n // Compute the Fourier transform of x\n auto y = fft_transform(x);\n\n // Extract the real and imaginary parts of the result\n r = real(y);\n i = imag(y);\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& n) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double angle = 2 * M_PI * k * n / x.extent(0);\n real += x(k).real() * cos(angle) + x(k).imag() * sin(angle);\n imag += x(k).imag() * cos(angle) - x(k).real() * sin(angle);\n }\n r(n) = real;\n i(n) = imag;\n });\n}\n", + "\n\n // Compute the FFT of x\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2.0 * M_PI * j * i / x.extent(0);\n sum += x(j) * std::exp(-std::complex(0.0, angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n real += x(j).real() * cos(angle) - x(j).imag() * sin(angle);\n imag += x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA (const int& n) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double angle = 2 * M_PI * k * n / x.extent(0);\n real += x(k).real() * cos(angle) - x(k).imag() * sin(angle);\n imag += x(k).real() * sin(angle) + x(k).imag() * cos(angle);\n }\n r(n) = real;\n i(n) = imag;\n });\n}\n", + "\n\n // Compute the DFT of x\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& n) {\n double re = 0.0;\n double im = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double theta = 2 * M_PI * k * n / x.extent(0);\n re += x(k).real() * cos(theta) + x(k).imag() * sin(theta);\n im += x(k).real() * sin(theta) - x(k).imag() * cos(theta);\n }\n r(n) = re;\n i(n) = im;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double re = 0.0;\n double im = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n re += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n im += x(j).real() * 
sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = re;\n i(i) = im;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += -x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n\n const int n = x.extent(0);\n const int n_half = n / 2;\n\n // Create a plan for the FFT\n FFTPlan plan(n);\n\n // Create a view of the input and output arrays\n View x_real = create_view(x, 0);\n View x_imag = create_view(x, 1);\n View r_view = create_view(r);\n View i_view = create_view(i);\n\n // Execute the FFT\n plan.execute(x_real, x_imag, r_view, i_view);\n\n // Normalize the output\n r_view /= n;\n i_view /= n;\n}\n", + "\nKokkos::parallel_for(Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA (const int i) {\n r(i) = 0;\n i(i) = 0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n r(i) += x(j) * cos(angle);\n i(i) -= x(j) * sin(angle);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += -x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n\n // Create a plan for the FFT\n auto plan = FFTPlan(x.extent(0), true);\n\n // Create a workspace for the FFT\n auto workspace = plan.get_workspace();\n\n // Execute the FFT\n plan.execute(x, workspace);\n\n // Extract the real and imaginary parts of the results\n r = plan.get_output_real();\n i = plan.get_output_imag();\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n // Create a team policy for parallel execution\n TeamPolicy policy(x.extent(0));\n\n // Define the kernel lambda to compute the FFT\n KernelFunctor kernel;\n kernel.template run(policy, [=](const int& i) {\n 
std::complex sum = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sum += x(j) * std::exp(-std::complex(0, angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) - x[t].imag() * sin(angle);\n sumi += x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int N = x.size();\n r.resize(N);\n i.resize(N);\n\n #pragma omp parallel for\n for (int k = 0; k < N; ++k) {\n std::complex sum = 0;\n for (int n = 0; n < N; ++n) {\n double angle = 2 * M_PI * n * k / N;\n sum += x[n] * std::polar(1.0, -angle);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const size_t n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (size_t j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const size_t n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute the fourier transform of x.\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (size_t j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = 
sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::exp(-std::complex(0, 2 * M_PI * k * t / n));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n int num_threads = omp_get_max_threads();\n int chunk_size = n / num_threads;\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int start = tid * chunk_size;\n int end = start + chunk_size;\n if (tid == num_threads - 1) {\n end = n;\n }\n\n for (int j = start; j < end; j++) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < n; k++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[k].real() * cos(angle) + x[k].imag() * sin(angle);\n imag += -x[k].real() * sin(angle) + x[k].imag() * cos(angle);\n }\n r[j] = real;\n i[j] = imag;\n }\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n std::complex omega = std::polar(1.0, -2 * M_PI * t * k / n);\n sum += x[t] * omega;\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, -angle);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumi += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute the DFT of x\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::exp(-2 * M_PI * std::complex(0, 1) * k * t / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::exp(std::complex(0, -angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n 
}\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n 
return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumi += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sumr += x[j].real() * cos(angle) + x[j].imag() * sin(angle);\n sumi += -x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int k = 0; k < n / 2; k++) {\n even[k] = x[2 * k];\n odd[k] = x[2 * k + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int k = 0; k < n / 2; k++) {\n std::complex t = w * odd[k];\n r[k] = r[k] + t.real();\n i[k] = i[k] + t.imag();\n r[k + n / 2] = r[k] - t.real();\n i[k + n / 2] = i[k] - t.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n 
if (n == 0) {\n return;\n }\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2);\n std::vector> odd(n / 2);\n for (int k = 0; k < n / 2; k++) {\n even[k] = x[2 * k];\n odd[k] = x[2 * k + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double pi = std::acos(-1);\n for (int k = 0; k < n / 2; k++) {\n double t = r[k];\n r[k] = t + std::cos(2 * pi * k / n) * r[k + n / 2] - std::sin(2 * pi * k / n) * i[k + n / 2];\n i[k] = t - std::cos(2 * pi * k / n) * r[k + n / 2] + std::sin(2 * pi * k / n) * i[k + n / 2];\n }\n}\n", + "\n\n int n = x.size();\n if (n == 0) {\n return;\n }\n r.resize(n);\n i.resize(n);\n for (int k = 0; k < n; k++) {\n double real = 0;\n double imag = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[j].real() * cos(angle) - x[j].imag() * sin(angle);\n imag += x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = real;\n i[k] = imag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "fft, 4,", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double real = 0;\n double imag = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[j].real() * cos(angle) - x[j].imag() * sin(angle);\n imag += x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r.push_back(real);\n i.push_back(imag);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sumr += x[j].real() * cos(angle) + x[j].imag() * sin(angle);\n sumi += -x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n std::vector> X(n);\n for (int k = 0; k < n; ++k) {\n for (int j = 0; j < n; ++j) {\n X[k] += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n }\n for (int i = 0; i < n; ++i) {\n r[i] = X[i].real();\n i[i] = X[i].imag();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\n__global__ void fft(const hipDoubleComplex *x, double *r, double *i, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n hipDoubleComplex sum = {0, 0};\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n if (i < N) {\n hipDoubleComplex sum = hipCreateDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex rot = hipCreateDoubleComplex(cos(angle), sin(angle));\n sum = hipCadd(sum, hipCmul(x[j], rot));\n }\n r[i] = hipCreal(sum);\n i[i] = hipCimag(sum);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; k++) {\n double arg = -2 * M_PI * k * gid / N;\n hipDoubleComplex w = {cos(arg), sin(arg)};\n sum += x[k] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n}\n", + "fft:", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[i] = sum.x;\n i[i] = sum.y;\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n size_t N2 = N / 2;\n if (tid < N2) {\n r[tid] = x[tid].x + x[tid + N2].x;\n i[tid] = x[tid].y + x[tid + N2].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * gid / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[i] = sum.x;\n i[i] = sum.y;\n}\n", + "\n\n int id = blockIdx.x * blockDim.x + threadIdx.x;\n if (id < N) {\n r[id] = x[id].real();\n i[id] = x[id].imag();\n }\n __syncbarriers();\n for (size_t n = 2; n <= N; n *= 2) {\n double a = 2 * M_PI / n;\n for (size_t k = 0; k < N; k += n) {\n double c = cos(a * k);\n double s = sin(a * k);\n for (size_t j = 0; j < n / 2; j++) {\n size_t p = k + j;\n size_t q = p + n / 2;\n double t = r[q] * c - i[q] * s;\n i[q] = r[q] * s + i[q] * c;\n r[q] = r[p] - t;\n i[p] = i[p] - t;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * i / N;\n hipDoubleComplex w = hipMakeDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = 
hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n if (i < N) {\n r[i] = x[i].x;\n i[i] = x[i].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n r[gid] = x[gid].x;\n i[gid] = x[gid].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * gid * j / N});\n sum += x[j] * w;\n }\n r[gid] = hipCreal(sum);\n i[gid] = hipCimag(sum);\n }\n}\n", + "\n\n size_t idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx >= N) return;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * idx * k / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[k] * w;\n }\n r[idx] = sum.x;\n i[idx] = sum.y;\n}\n", + "\n\n size_t k = blockIdx.x * blockDim.x + threadIdx.x;\n if (k < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0, -2 * M_PI * j * k / N));\n sum += x[j] * w;\n }\n r[k] = hipCreal(sum);\n i[k] = hipCimag(sum);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx >= N) return;\n hipDoubleComplex sum = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * idx * k / N;\n hipDoubleComplex w = hipMakeDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[idx] = creal(sum);\n i[idx] = cimag(sum);\n}\n", + "\n\n // Compute the fourier transform of x.\n hipDoubleComplex *X = hipMalloc(N);\n hipMemcpy(X, x, N * sizeof(hipDoubleComplex), hipMemcpyHostToDevice);\n hipfftPlan plan;\n hipfftPlanCreate(&plan, N, HIPFFT_C2C, HIPFFT_FORWARD);\n hipfftExecC2C(plan, X, X, HIPFFT_FORWARD);\n hipfftDestroyPlan(plan);\n hipMemcpy(r, X, N * sizeof(double), hipMemcpyDeviceToHost);\n hipMemcpy(i, X + 1, N * sizeof(double), hipMemcpyDeviceToHost);\n hipFree(X);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n int stride = hipBlockDim_x * hipGridDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * k * gid / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[k] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n}\n", + "fft," + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y);\n\n // Gather the results on rank 0\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT of z on rank 0\n if (rank == 0) {\n fft_local(z, y);\n }\n\n // Scatter the results to all ranks\n MPI_Scatter(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the results\n for (int j = 0; j < n; j++) {\n r[j] = z[j].real();\n i[j] = z[j].imag();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n y[k] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Send the DFT of x to rank 0\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse DFT of z\n if (rank == 0) {\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n x[k] += z[j] * std::exp(2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n x[k] /= n;\n r[k] = x[k].real();\n i[k] = x[k].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the global result\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n 
i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector r_local(n), i_local(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y, r_local, i_local);\n\n // Gather the results on rank 0\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < n; i++) {\n r[i] = r_local[i];\n i[i] = i_local[i];\n }\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n), i_recv(n);\n MPI_Recv(r_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n r[j] += r_recv[j];\n i[j] += i_recv[j];\n }\n }\n } else {\n MPI_Send(r_local.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i_local.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_fft(n);\n fft_local(x, local_fft);\n\n // Gather local FFTs\n std::vector> global_fft(n * size);\n MPI_Gather(local_fft.data(), n, MPI_DOUBLE, global_fft.data(), n, MPI_DOUBLE, 0);\n\n // Compute global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter global FFT\n MPI_Scatter(global_fft.data(), n, MPI_DOUBLE, local_fft.data(), n, MPI_DOUBLE, 0);\n\n // Extract real and imaginary parts\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = local_fft[j].real();\n i[j] = local_fft[j].imag();\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n);\n std::vector> local_r(local_n), local_i(local_n);\n fft_recursive(local_x, local_r, local_i);\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].real();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n std::vector> Z(n);\n\n for (int j = 0; j < n; j++) {\n X[j] = x[j];\n }\n\n MPI_Alltoall(X.data(), 1, MPI_DOUBLE_INT, Y.data(), 1, MPI_DOUBLE_INT, MPI_COMM_WORLD);\n\n for (int j = 0; j < n; 
j++) {\n Z[j] = Y[j] * std::exp(-2.0 * M_PI * std::complex(0.0, 1.0) * j / n);\n }\n\n MPI_Alltoall(Z.data(), 1, MPI_DOUBLE_INT, Y.data(), 1, MPI_DOUBLE_INT, MPI_COMM_WORLD);\n\n for (int j = 0; j < n; j++) {\n std::complex sum = std::complex(0.0, 0.0);\n for (int k = 0; k < n; k++) {\n sum += Y[k] * std::exp(2.0 * M_PI * std::complex(0.0, 1.0) * j * k / n);\n }\n X[j] = sum;\n }\n\n for (int j = 0; j < n; j++) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n if (rank == 0) {\n for (int j = 1; j < size; j++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Status status;\n MPI_Request request;\n MPI_Irecv(r_recv.data(), n, MPI_DOUBLE, j, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n MPI_Irecv(i_recv.data(), n, MPI_DOUBLE, j, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_impl(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> r_all(x.size());\n std::vector> i_all(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_all.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_all.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r = std::vector(r_all.size());\n i = std::vector(i_all.size());\n for (size_t j = 0; j < r_all.size(); ++j) {\n r[j] = r_all[j].real();\n i[j] = r_all[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y);\n\n // Gather the results on rank 0\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < size; i++) {\n std::vector> y_i(n);\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &y_i[0], n, MPI_DOUBLE_COMPLEX, i, MPI_COMM_WORLD);\n for (int j = 0; j < n; j++) {\n r[j] += y_i[j].real();\n i[j] += y_i[j].imag();\n }\n }\n } else {\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); j++) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n if (n != (1 << (int)log2(n))) {\n throw std::invalid_argument(\"x must have a power of 2 length\");\n }\n\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = x[i];\n }\n\n for (int s = 1; s <= (int)log2(n); s++) {\n int m = 1 << s;\n int m2 = m / 2;\n double theta = 2 * M_PI / m;\n std::complex w(cos(theta), sin(theta));\n std::complex wn(1, 0);\n for (int j = 0; j < m2; j++) {\n for (int k = j; k < n; k += m) {\n std::complex t = wn * y[k + m2];\n y[k + m2] = y[k] - t;\n y[k] = y[k] + t;\n }\n wn = wn * w;\n }\n }\n\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n r[i] = y[i].real();\n i[i] = y[i].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_impl(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_r(n_local);\n std::vector> local_i(n_local);\n\n // Compute local FFT\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(n);\n std::vector> global_i(n);\n MPI_Gather(local_r.data(), n_local, MPI_DOUBLE, global_r.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n_local, MPI_DOUBLE, global_i.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n 
// Compute the local part of the transform\n std::vector> local_r(n), local_i(n);\n for (int j = 0; j < n; ++j) {\n std::complex sum_r(0, 0), sum_i(0, 0);\n for (int k = 0; k < n; ++k) {\n double angle = -2 * M_PI * j * k / n;\n sum_r += std::exp(std::complex(0, angle)) * x[k];\n sum_i += std::exp(std::complex(0, angle)) * x[k] * std::complex(0, 1);\n }\n local_r[j] = sum_r;\n local_i[j] = sum_i;\n }\n\n // Gather the local transforms into a global array\n std::vector> global_r(n), global_i(n);\n MPI_Allgather(local_r.data(), n, MPI_DOUBLE_COMPLEX, global_r.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n MPI_Allgather(local_i.data(), n, MPI_DOUBLE_COMPLEX, global_i.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the global transform\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * n / size, x.begin() + (rank + 1) * n / size);\n std::vector> local_r(n / size), local_i(n / size);\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(n), i_gathered(n);\n MPI_Gather(local_r.data(), n / size, MPI_DOUBLE, r_gathered.data(), n / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n / size, MPI_DOUBLE, i_gathered.data(), n / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(n);\n i = std::vector(n);\n for (int j = 0; j < n; j++) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n\n // Compute the local Fourier transform\n std::vector> local_x(n);\n std::vector> local_r(n);\n std::vector> local_i(n);\n for (int j = 0; j < n; ++j) {\n local_x[j] = x[j];\n }\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(n * size);\n std::vector> i_gathered(n * size);\n MPI_Gather(local_r.data(), n, MPI_DOUBLE_COMPLEX, r_gathered.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n, MPI_DOUBLE_COMPLEX, i_gathered.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n for (int j = 0; j < n; ++j) {\n r[j] = r_gathered[j].real();\n i[j] = i_gathered[j].real();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Allgather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), local_r.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n MPI_Allgather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), local_i.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].real();\n }\n }\n}\n" + ] + }, + { + 
"problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\n__global__ void fft(const cuDoubleComplex *x, double *r, double *i, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) {\n return;\n }\n double sum_r = 0.0;\n double sum_i = 0.0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * idx / N;\n sum_r += x[k].x * cos(angle) + x[k].y * sin(angle);\n sum_i += -x[k].x * sin(angle) + x[k].y * cos(angle);\n }\n r[idx] = sum_r;\n i[idx] = sum_i;\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t k = blockIdx.x * blockDim.x + threadIdx.x;\n if (k < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * k * j / N;\n cuDoubleComplex rot = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * rot;\n }\n r[k] = creal(sum);\n i[k] = cimag(sum);\n }\n}\n", + "\n\n // Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n // Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n // Example:\n\n // input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n // output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[gid] = creal(sum);\n i[gid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = block.thread.x + block.block.x * block.grid.x;\n if (tid >= N) {\n return;\n }\n double re = 0, im = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * tid * k / N;\n re += x[k] * cos(angle);\n im += x[k] * sin(angle);\n }\n r[tid] = re;\n i[tid] = im;\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i0 = tid + bid * blockDim.x;\n if (i0 < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i0 * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i0] = creal(sum);\n i[i0] = cimag(sum);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n int stride = blockDim.x * gridDim.x;\n for (int k = tid; k < N; k += stride) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * k / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(x[j], term));\n }\n r[k] = sum.x;\n i[k] = sum.y;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * k * tid / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx >= N) return;\n\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * idx * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[idx] = creal(sum);\n i[idx] = cimag(sum);\n}\n", + "\n\n size_t tid = 
threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = sum.x;\n i[tid] = sum.y;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n size_t offset = tid % N;\n double sumr = 0, sumi = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * offset / N;\n sumr += real(x[k]) * cos(angle) + imag(x[k]) * sin(angle);\n sumi += -real(x[k]) * sin(angle) + imag(x[k]) * cos(angle);\n }\n r[tid] = sumr;\n i[tid] = sumi;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid >= N) return;\n\n double re = 0, im = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n re += x[k].x * cos(angle) - x[k].y * sin(angle);\n im += x[k].x * sin(angle) + x[k].y * cos(angle);\n }\n r[tid] = re;\n i[tid] = im;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i0 = tid + bid * blockDim.x;\n if (i0 < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * i0 / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[i0] = creal(sum);\n i[i0] = cimag(sum);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> x_local(x.begin() + start, x.begin() + end);\n std::vector> y_local(n_local);\n std::vector> y(n);\n\n // Compute the DFT of the local data\n fft_local(x_local, y_local);\n\n // Gather the results from all ranks\n MPI_Allgather(y_local.data(), n_local, MPI_DOUBLE_COMPLEX, y.data(), n_local, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Compute the global DFT\n fft_global(y, n);\n\n // Extract the real and imaginary parts of the result\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = y[j].real();\n i[j] = y[j].imag();\n }\n\n // Send the result to rank 0\n if (rank != 0) {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n // Receive the results from all ranks\n for (int i = 1; i < size; i++) {\n MPI_Recv(r.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_fft(n);\n fftw_plan plan = fftw_plan_dft_1d(n, reinterpret_cast(&x[0]), reinterpret_cast(&local_fft[0]), FFTW_FORWARD, FFTW_ESTIMATE);\n fftw_execute(plan);\n fftw_destroy_plan(plan);\n\n // Gather local FFT results\n std::vector> global_fft(n * size);\n MPI_Gather(local_fft.data(), n, MPI_DOUBLE, global_fft.data(), n, MPI_DOUBLE, 0);\n\n // Compute global FFT\n fftw_plan plan2 = fftw_plan_dft_1d(n * size, reinterpret_cast(&global_fft[0]), reinterpret_cast(&global_fft[0]), FFTW_BACKWARD, FFTW_ESTIMATE);\n fftw_execute(plan2);\n fftw_destroy_plan(plan2);\n\n // Extract real and imaginary parts\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = real(global_fft[j] / n);\n i[j] = imag(global_fft[j] / n);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(local_n);\n std::vector> local_r(local_n);\n std::vector> local_i(local_n);\n std::copy(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n, local_x.begin());\n fft_serial(local_x, local_r, local_i);\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int n_local_padded = n_local * 2;\n\n std::vector> x_local(n_local);\n std::vector> x_local_padded(n_local_padded);\n std::vector> y_local(n_local_padded);\n std::vector> y(n);\n\n MPI_Scatter(x.data(), n_local, MPI_DOUBLE_COMPLEX, x_local.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Pad with zeros\n for (int i = 0; i < n_local; i++) {\n x_local_padded[i] = x_local[i];\n x_local_padded[i + n_local] = 0;\n }\n\n // Compute FFT in parallel\n fft_recursive(x_local_padded, y_local);\n\n // Gather results\n MPI_Gather(y_local.data(), n_local_padded, MPI_DOUBLE_COMPLEX, y.data(), n_local_padded, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < n; i++) {\n r[i] = y[i].real();\n i[i] = y[i].imag();\n }\n }\n}\n", + "\n\n // Compute the FFT of x.\n std::vector> X(x);\n fft_mpi(X);\n\n // Store the real and imaginary parts of the FFT in r and i.\n r.resize(X.size());\n i.resize(X.size());\n for (size_t j = 0; j < X.size(); ++j) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n // Gather the results on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector r_all(X.size() * MPI::COMM_WORLD.Get_size());\n std::vector i_all(X.size() * MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Gather(&r[0], X.size(), MPI::DOUBLE, &r_all[0], X.size(), MPI::DOUBLE, 0);\n MPI::COMM_WORLD.Gather(&i[0], X.size(), MPI::DOUBLE, &i_all[0], X.size(), MPI::DOUBLE, 0);\n r = r_all;\n i = i_all;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_all(x.size());\n std::vector> 
i_all(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_all.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_all.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); j++) {\n r[j] = r_all[j].real();\n i[j] = r_all[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n std::complex sum = std::complex(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n X[k] = sum;\n }\n\n // Compute the inverse DFT of X\n for (int k = 0; k < n; k++) {\n std::complex sum = std::complex(0, 0);\n for (int j = 0; j < n; j++) {\n sum += X[j] * std::exp(2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n Y[k] = sum / n;\n }\n\n // Store the real and imaginary parts of Y in r and i\n for (int k = 0; k < n; k++) {\n r[k] = Y[k].real();\n i[k] = Y[k].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Recv(&r_recv[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(&i_recv[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(&r[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(&i[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n y[k] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j] += y[k] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Store the real and imaginary parts of the DFT in r and i\n for (int j = 0; j < n; j++) {\n r[j] = z[j].real();\n i[j] = z[j].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_i(n);\n std::vector i_i(n);\n MPI_Recv(r_i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n r[j] += r_i[j];\n i[j] += i_i[j];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute local FFTs\n std::vector> local_r(n);\n std::vector> local_i(n);\n fft_local(x, local_r, local_i);\n\n // Gather local FFTs\n std::vector> global_r(n * size);\n std::vector> global_i(n * size);\n MPI_Gather(local_r.data(), n, MPI_DOUBLE_COMPLEX, global_r.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n, MPI_DOUBLE_COMPLEX, global_i.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute global FFT\n if (rank == 0) {\n 
fft_global(global_r, global_i);\n }\n\n // Scatter global FFT\n MPI_Scatter(global_r.data(), n, MPI_DOUBLE_COMPLEX, r.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Scatter(global_i.data(), n, MPI_DOUBLE_COMPLEX, i.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local partial sums\n std::vector> local_r(n, 0.0);\n std::vector> local_i(n, 0.0);\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n double angle = -2 * M_PI * j * k / n;\n local_r[k] += x[j] * std::cos(angle);\n local_i[k] += x[j] * std::sin(angle);\n }\n }\n\n // Reduce partial sums to rank 0\n std::vector> global_r(n, 0.0);\n std::vector> global_i(n, 0.0);\n MPI_Reduce(local_r.data(), n, MPI_DOUBLE, global_r.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Reduce(local_i.data(), n, MPI_DOUBLE, global_i.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts on rank 0\n if (rank == 0) {\n for (int k = 0; k < n; k++) {\n r[k] = global_r[k].real();\n i[k] = global_i[k].imag();\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n\n // Compute the FFT of x\n #pragma opm parallel for\n for (int j = 0; j < n; j++) {\n X[j] = x[j];\n }\n fft(X);\n\n // Compute the real and imaginary parts of the FFT\n #pragma opm parallel for\n for (int j = 0; j < n; j++) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int j = 1; j < size; j++) {\n std::vector r_j(n);\n std::vector i_j(n);\n MPI_Gather(&r[0], n, MPI_DOUBLE, &r_j[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&i[0], n, MPI_DOUBLE, &i_j[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (int k = 0; k < n; k++) {\n r[k] += r_j[k];\n i[k] += i_j[k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Allgather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n 
MPI_Allgather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_r(n_local);\n std::vector> local_i(n_local);\n\n // Compute the local FFT\n fft_local(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> r_gathered(n);\n std::vector> i_gathered(n);\n MPI_Gather(local_r.data(), n_local, MPI_DOUBLE, r_gathered.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n_local, MPI_DOUBLE, i_gathered.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r = std::vector(n);\n i = std::vector(n);\n for (int j = 0; j < n; j++) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(local_n);\n std::vector> local_r(local_n);\n std::vector> local_i(local_n);\n\n std::copy(x.begin() + local_start, x.begin() + local_end, local_x.begin());\n\n #pragma omp parallel for\n for (int j = 0; j < local_n; j++) {\n double sum_r = 0.0;\n double sum_i = 0.0;\n for (int k = 0; k < n; k++) {\n double theta = 2 * M_PI * k / n;\n sum_r += x[k] * cos(theta * j);\n sum_i += x[k] * sin(theta * j);\n }\n local_r[j] = sum_r;\n local_i[j] = sum_i;\n }\n\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> 
i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n y[k] = sum;\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j * n + k] = y[k * n + j];\n }\n }\n\n // Compute the DFT of z\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += z[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n y[k] = sum;\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j * n + k] = y[k * n + j];\n }\n }\n\n // Store the real and imaginary parts of the DFT in r and i\n for (int k = 0; k < n; k++) {\n r[k] = z[k].real();\n i[k] = z[k].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Recv(r_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const size_t n = x.extent(0);\n const size_t n_half = n / 2;\n\n // Compute the FFT of the first half of x.\n auto x_half = x.slice(0, n_half);\n fft(x_half);\n\n // Compute the FFT of the second half of x.\n auto x_half_plus = x.slice(n_half, n_half);\n fft(x_half_plus);\n\n // Compute the imaginary conjugate of the second half of x.\n for (size_t i = 0; i < n_half; ++i) {\n x_half_plus(i) = conj(x_half_plus(i));\n }\n\n // Swap the first and second halves of x.\n auto temp = x_half;\n x_half = x_half_plus;\n x_half_plus = temp;\n\n // Compute the inverse FFT of x.\n fft(x, Inverse);\n\n // Normalize the FFT by dividing by n.\n for (size_t i = 0; i < n; ++i) {\n x(i) /= n;\n }\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n 
Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const size_t n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n std::vector> w(n);\n std::vector> u(n);\n std::vector> t(n);\n\n#pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n y[i] = x[i];\n }\n\n for (size_t i = 0; i < n; i++) {\n for (size_t j = 0; j < n; j++) {\n w[j] = std::exp(std::complex(0, -2 * M_PI * i * j / n));\n }\n for (size_t j = 0; j < n; j++) {\n z[j] = y[j] * w[j];\n }\n for (size_t j = 0; j < n; j++) {\n u[j] = z[j];\n }\n for (size_t j = 0; j < n / 2; j++) {\n t[j] = u[j] + std::conj(u[n - j - 1]);\n t[n - j - 1] = std::complex(0, 1) * (u[j] - std::conj(u[n - j - 1]));\n }\n for (size_t j = 0; j < n; j++) {\n y[j] = t[j];\n }\n }\n\n#pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n x[i] = y[i];\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // Compute the fourier transform of x in-place.\n fft(x);\n\n // Compute the imaginary conjugate of each value.\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n int size = 1 << log_n;\n std::vector> y(size);\n for (int i = 0; i < n; i++) {\n y[i] = x[i];\n }\n for (int i = n; i < size; i++) {\n y[i] = 0;\n }\n std::vector> roots(size / 2);\n for (int i = 0; i < size / 2; i++) {\n double angle = 2 * M_PI * i / size;\n roots[i] = std::complex(cos(angle), sin(angle));\n }\n for (int i = 0; i < log_n; i++) {\n int step = 1 << i;\n int half_step = step / 2;\n for (int j = 0; j < size; j += step) {\n for (int k = 0; k < half_step; k++) {\n std::complex u = y[j + k];\n std::complex v = y[j + k + half_step] * roots[size / (2 * step) * k];\n y[j + k] = u + v;\n y[j + k + half_step] = u - v;\n }\n }\n }\n for (int i = 0; i < size; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int numThreads = omp_get_max_threads();\n const int chunkSize = n / numThreads;\n\n #pragma omp parallel\n {\n #pragma omp for schedule(static, chunkSize)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n }\n}\n", + "\n\n const int n = x.size();\n const int lg = __lg(n);\n int i0 = 0;\n for (int l = 2; l <= n; l <<= 1) {\n for (int i = 0; i < n; i += l) {\n for (int j = 0; j < l / 2; j++) {\n std::complex u = x[i0 + j];\n std::complex v = x[i0 + j + l / 2];\n x[i0 + j] = u + v;\n x[i0 + j + l / 2] = std::complex(u.real() - v.real(), -v.imag());\n }\n i0 += l;\n }\n i0 = 0;\n }\n for (int i = 0; i < n; i++) {\n int j = __builtin_reverse(i, lg);\n if (i < j) {\n std::swap(x[i], x[j]);\n }\n }\n}\n", + "\n\n int n = x.size();\n std::vector> y(n);\n for (int i 
= 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n fft(y);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n // Compute the Fourier transform of x in-place.\n fft(x);\n\n // Compute the imaginary conjugate of each value.\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x\n fft(x, y);\n\n // Compute the conjugate of each value in y\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n\n // Copy the conjugate values back into x\n x = z;\n}\n", + "\n\n const int n = x.size();\n if (n == 1) return;\n std::vector> even(n / 2), odd(n / 2);\n#pragma omp parallel for\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int n_threads = omp_get_max_threads();\n const int n_threads_per_group = 4;\n const int n_groups = n_threads / n_threads_per_group;\n\n std::vector> y(n);\n std::vector> z(n);\n\n #pragma omp parallel num_threads(n_threads)\n {\n const int tid = omp_get_thread_num();\n const int group_id = tid / n_threads_per_group;\n const int local_id = tid % n_threads_per_group;\n\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n y[rev] = x[i];\n }\n\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const double omega = 2 * M_PI / m;\n const std::complex w(cos(omega), sin(omega));\n\n #pragma omp for schedule(static)\n for (int k = 0; k < n; k += m) {\n std::complex wn(1, 0);\n for (int j = 0; j < m / 2; j++) {\n const std::complex t1 = y[k + j];\n const std::complex t2 = wn * y[k + j + m / 2];\n y[k + j] = t1 + t2;\n y[k + j + m / 2] = t1 - t2;\n wn *= w;\n }\n }\n }\n\n #pragma omp for schedule(static)\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n }\n\n x = z;\n}\n", + "\n\n const int n = x.size();\n const int n_half = n / 2;\n const double pi = 3.14159265358979323846;\n\n // Reverse-bit permutation\n for (int i = 0; i < n; i++) {\n int j = 0;\n for (int k = 0; k < n_half; k++) {\n if (i & (1 << k)) {\n j |= (1 << (n_half - k - 1));\n }\n }\n if (i < j) {\n std::swap(x[i], x[j]);\n }\n }\n\n // Compute the FFT in-place\n for (int s = 1; s <= n_half; s *= 2) {\n const double theta = pi / s;\n const std::complex wn(std::cos(theta), std::sin(theta));\n for (int i = 0; i < n; i += 2 * s) {\n std::complex w(1, 0);\n for (int j = 0; j < s; j++) {\n const std::complex u = x[i + j];\n const std::complex v = x[i + j + s] * w;\n x[i + j] = u + v;\n x[i + j + s] = u - v;\n w *= wn;\n }\n }\n }\n\n // Conjugate the imaginary part of each value\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x\n fft(x, y);\n\n // Compute the imaginary conjugate of each value in y\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n\n // Copy the values of z back into x\n x = z;\n}\n", + "\n\n const int n = x.size();\n const int num_threads = omp_get_max_threads();\n const int 
chunk_size = n / num_threads;\n\n #pragma omp parallel for schedule(static, chunk_size)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int nthreads = omp_get_max_threads();\n const int chunksize = n / nthreads;\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n if (i < rev) {\n std::swap(x[i], x[rev]);\n }\n }\n\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const double wn = std::polar(1.0, -2.0 * M_PI / m);\n const double w = 1.0;\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int j = 0; j < n; j += m) {\n double wr = w;\n double wi = 0.0;\n for (int k = 0; k < m / 2; k++) {\n const std::complex t = x[j + k + m / 2] * std::complex(wr, wi);\n x[j + k + m / 2] = x[j + k] - t;\n x[j + k] += t;\n const double newwr = wr * w.real() - wi * w.imag();\n const double newwi = wr * w.imag() + wi * w.real();\n wr = newwr;\n wi = newwi;\n }\n }\n }\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n std::vector> y(n);\n std::vector> w(n);\n for (int i = 0; i < n; i++) {\n w[i] = std::polar(1.0, -2 * M_PI * i / n);\n }\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * w[j * k % n];\n }\n y[k] = sum;\n }\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Take the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int n_threads = omp_get_max_threads();\n\n // Reverse bits\n #pragma omp parallel for schedule(static)\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n if (i < rev) {\n std::swap(x[i], x[rev]);\n }\n }\n\n // Compute the FFT\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const int m2 = m >> 1;\n const double theta = 2 * M_PI / m;\n std::complex w(1, 0);\n std::complex wn(std::cos(theta), std::sin(theta));\n\n #pragma omp parallel for schedule(static)\n for (int k = 0; k < n; k += m) {\n for (int j = 0; j < m2; j++) {\n std::complex t = w * x[k + j + m2];\n x[k + j + m2] = x[k + j] - t;\n x[k + j] += t;\n }\n w *= wn;\n }\n }\n\n // Conjugate the imaginary part\n #pragma omp parallel for schedule(static)\n for (int i = 0; i < n; i++) {\n x[i].imag(-x[i].imag());\n }\n}\n", + "\n\n // Compute the FFT in-place\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int num_threads = omp_get_max_threads();\n const int chunk_size = n / num_threads;\n\n #pragma omp parallel for schedule(static, chunk_size)\n for (int i = 0; i < n; ++i) {\n x[i] = std::conj(x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x in-place. 
Return the imaginary conjugate of each value.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) return;\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::polar(1.0, ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n x[i][1] = -x[i][1];\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex tmp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(tmp);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n return;\n }\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n fft(y);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) return;\n std::vector> even = x;\n std::vector> odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const size_t n = x.size();\n if (n <= 1) return;\n\n std::vector> even = x[std::slice(0, n / 2, 2)];\n std::vector> odd = x[std::slice(1, n / 2, 2)];\n\n fftConjugate(even);\n fftConjugate(odd);\n\n for (size_t i = 0; i < n / 2; 
++i) {\n std::complex t = std::exp(std::complex(0, -2 * M_PI * i / n)) * odd[i];\n x[i] = even[i] + t;\n x[i + n / 2] = even[i] - t;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n for (auto &v : x) {\n v = std::conj(v);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fftConjugate(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[N - 1 - i].x;\n x[i].y = -x[N - 1 - i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = hipThreadIdx.x;\n if (i < size) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[N - 1 - i].x;\n x[i].y = -x[N - 1 - i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * 
hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); ++i) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector> y_recv(x.size());\n MPI_Recv(y_recv.data(), y_recv.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < y_recv.size(); ++j) {\n y[j] += y_recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT of x in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int log2n = 0;\n while ((1 << log2n) < n) log2n++;\n\n std::vector> y(n);\n for (int i = 0; i < n; i++) y[i] = x[i];\n\n for (int s = 1; s <= log2n; s++) {\n int m = 1 << s;\n int m2 = m / 2;\n double theta = 2 * M_PI / m;\n std::complex w(cos(theta), sin(theta));\n std::complex wn(1, 0);\n for (int j = 0; j < m2; j++) {\n for (int k = j; k < n; k += m) {\n std::complex t = wn * y[k + m2];\n y[k + m2] = y[k] - t;\n y[k] = y[k] + t;\n }\n wn = wn * w;\n }\n }\n\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n }\n}\n", + ".", + "\n\n size_t n = x.size();\n std::transform(x.begin(), x.end(), x.begin(), [](std::compelex z) { return std::compelex(z.re, -z.im); });\n 
std::sort(x.begin(), x.end(), [](std::compelex a, std::compelex b) { return std::im(a) < std::im(b); });\n std::transform(x.begin(), x.end(), x.begin(), [](std::compelex z) { return std::compelex(z.re, -z.im); });\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the global FFT\n if (rank == 0) {\n fft(global_fft, x);\n }\n\n // Scatter the global FFT\n MPI_Scatter(x.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the conjugate of the local FFT\n for (size_t i = 0; i < local_fft.size(); i++) {\n local_fft[i] = std::conj(local_fft[i]);\n }\n\n // Gather the conjugate FFTs\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the inverse global FFT\n if (rank == 0) {\n ifft(global_fft, x);\n }\n\n // Scatter the inverse global FFT\n MPI_Scatter(x.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_fft = fft(x);\n\n // Gather the local Fourier transforms on rank 0\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (auto &value : global_fft) {\n value = std::conj(value);\n }\n }\n\n // Scatter the imaginary conjugate Fourier transforms to all ranks\n MPI_Scatter(global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse Fourier transform\n x = ifft(local_fft);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv(x.size());\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n y[j] += recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of the result\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results from all ranks\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, 
MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place on each rank\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x, y);\n\n // Conjugate the local FFT\n for (int i = 0; i < n_local; i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the local FFTs\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the result to the input vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place on each rank\n std::fft(x.begin(), x.end());\n\n // Conjugate the imaginary part of each value\n for (auto &v : x) {\n v.imag() = -v.imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, y.data(), x.size(), MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n x = std::move(y);\n } else {\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, nullptr, 0, MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft(x);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (auto &value : global_fft) {\n value = std::conj(value);\n }\n }\n\n // Scatter the global FFT back to each rank\n MPI_Scatter(global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the local FFT on rank 0\n if (rank == 0) {\n x = local_fft;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> y_recv(x.size());\n MPI_Recv(y_recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < y.size(); j++) {\n y[j] += y_recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, 
size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n MPI_FFT(x.data(), 1, x.size(), MPI_COMPLEX, MPI_FORWARD, MPI_COMM_WORLD, y.data(), &y.size());\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n x = y;\n for (int i = 1; i < size; i++) {\n std::vector> tmp(x.size());\n MPI_Recv(tmp.data(), tmp.size(), MPI_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(y.data(), y.size(), MPI_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv(x.size());\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n y[j] += recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT of x in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of the local data\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the local data to rank 0\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the gathered data to the output vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft(x);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (int i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the global FFT back to each rank\n MPI_Scatter(global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), 
MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = local_fft;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fftConjugate(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[N - i - 1].x;\n x[i].y = -x[N - i - 1].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n // Compute the FFT\n fft(x, N);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < N; i++) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "fft{4,0}", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid].x = x[tid].x;\n x[tid].y = -x[tid].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = cuConjf(x[i]);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[N - i - 1].x;\n x[i].y = -x[N - i - 1].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = make_cuDoubleComplex(creal(x[i]), -cimag(x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use MPI and OpenMP to compute in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data in parallel\n fft(x, start, end);\n\n // Conjugate the imaginary part of the local data\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i].imag() = -x[i].imag();\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[start], n_local, MPI_DOUBLE_COMPLEX, &x[0], n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + ".4.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x.begin() + rank * n_local, x.begin() + rank * n_local + n_local, y.begin());\n\n // Conjugate the local FFT\n for (int i = 0; i < n_local; i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the local FFTs\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the result to rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel using MPI and OpenMP\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag() = -x[i].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform local FFT on each rank\n std::vector> localX = x;\n fft(localX);\n\n // Gather all the local FFT results on rank 0\n std::vector> globalX(x.size());\n MPI_Gather(localX.data(), localX.size(), MPI_DOUBLE_COMPLEX, globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value on rank 0\n if (rank == 0) {\n for (auto &value : globalX) {\n value = std::conj(value);\n }\n }\n\n // Scatter the imaginary conjugate results back to each rank\n 
MPI_Scatter(globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform local FFT on each rank\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather all the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (int i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the imaginary conjugate FFTs back to each rank\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Perform inverse FFT on each rank\n ifft(local_fft, x);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of the global FFT\n if (rank == 0) {\n for (size_t i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the imaginary conjugate of the global FFT to all ranks\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT\n ifft(local_fft, x);\n\n // Gather the local FFTs on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n 
// Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results on rank 0\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the results to the input vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n MPI_Gather(y.data(), n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> localX = x;\n fft(localX);\n\n // Conjugate the local FFT\n for (auto &v : localX) {\n v = std::conj(v);\n }\n\n // Gather the local FFTs on rank 0\n std::vector> globalX(x.size());\n MPI_Gather(localX.data(), localX.size(), MPI_DOUBLE_COMPLEX, globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, store the global FFT in x\n if (rank == 0) {\n x = globalX;\n }\n}\n", + "\n\n // Compute the FFT in parallel using MPI and OpenMP\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(&recv, x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x, x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int 
i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT on each rank\n std::vector> y(x.size());\n fft(x, y);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < y.size(); i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n x.resize(y.size());\n MPI_Gather(y.data(), y.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the conjugate of the global FFT\n if (rank == 0) {\n for (size_t i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the conjugate global FFT\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT\n ifft(local_fft, x);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary parts\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(Kokkos::View*> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x[i] = x[i] / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x[i] = std::conj(x[i]);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / static_cast(x.extent(0));\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n // compute fourier transform\n fft(x);\n\n // compute inverse fourier transform\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", 
x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use OpenMP to compute in parallel.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n fft(x);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugated input\n fft(x);\n\n // compute the conjugate of the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "-", + "\n\n // Compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated input\n fft(x);\n\n // Conjugate the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Divide the output by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n // conjugate the complex numbers\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute forward FFT\n fft(x);\n\n // conjugate the complex numbers again\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide by the size of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n fft(x);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n #pragma openmp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n fft(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n fft(x);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // divide by n to normalize\n const double n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] /= n;\n }\n}\n", + "\n\n // compute the forward FFT\n fft(x);\n\n // compute the inverse FFT\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute the forward FFT again to get the inverse FFT\n fft(x);\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n #pragma omp 
parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n // compute the conjugate of each element\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugated vector\n fft(x);\n\n // divide each element by the size of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // compute the conjugate of each element\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n const size_t n = x.size();\n std::vector> y(n);\n std::copy(x.begin(), x.end(), y.begin());\n fft(y);\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n y[i] = std::conj(y[i]) / n;\n }\n std::reverse(y.begin(), y.end());\n fft(y);\n std::copy(y.begin(), y.end(), x.begin());\n}\n", + "\n\n fft(x);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute forward fft\n fft(x);\n\n // compute inverse fft\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute forward fft again to get inverse fft\n fft(x);\n}\n", + "\n\n // compute the conjugate of x\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugate of x\n fft(x);\n\n // compute the conjugate of x again\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide each element of x by the size of x\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n fft(x);\n const double n = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // Compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the forward FFT\n fft(x);\n\n // Divide each element by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Compute the conjugate of the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "serial", + "prompt": "/* forward declare fft. 
computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n for (auto &xi : x) {\n xi /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v) / x.size();\n }\n std::reverse(x.begin(), x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi) / x.size();\n }\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v = std::conj(v) / x.size();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "hip", + "prompt": "/* Compute the inverse fourier transform of x in-place.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\n__global__ void ifft(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i >= N) return;\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipDoubleComplex(cos(2 * M_PI * i * j / N),\n -sin(2 * M_PI * i * j / N));\n sum += w * x[j];\n }\n x[i] = sum / N;\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n hipDoubleComplex c = {cos(angle), sin(angle)};\n sum = hipCadd(sum, hipCmul(x[j], c));\n }\n x[i] = hipCdiv(sum, N);\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (int j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0.0, 0.0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex c = {cos(angle), sin(angle)};\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i >= N) return;\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex t = {cos(angle), sin(angle)};\n sum += x[j] * t;\n }\n x[i] = sum / N;\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n hipDoubleComplex w = hipDoubleComplex(cos(2 * M_PI * i * j / N),\n sin(2 * M_PI * i * j / N));\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0.0, 0.0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0.0, -2.0 * M_PI * i * j / N));\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 
* M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex t = {cos(angle), sin(angle)};\n sum = hipCadd(sum, hipCmul(x[j], t));\n }\n x[i] = hipCdiv(sum, N);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex term = hipExp(hipCreateDoubleComplex(0, -2 * M_PI * i * j / N));\n sum = hipAdd(sum, hipMul(x[j], term));\n }\n x[i] = hipDiv(sum, hipCreateDoubleComplex(N, 0));\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0, -2 * M_PI * i * j / N));\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex c = hipDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n size_t j = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (i < N && j < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double theta = 2 * M_PI * (i * k / N);\n sum.x += x[k].x * cos(theta) + x[k].y * sin(theta);\n sum.y += -x[k].x * sin(theta) + x[k].y * cos(theta);\n }\n sum.x /= N;\n sum.y /= N;\n x[i * N + j] = sum;\n }\n}\n", + "\n\n size_t i = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(recv_buffer.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&recv_buf[0], x.size(), 
MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, y.data(), x.size(),\n MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, nullptr, 0,\n MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the FFT\n fft(x);\n\n // compute the inverse FFT\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n\n // normalize the result\n int n = x.size();\n for (auto &xi : x) {\n xi /= n;\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(n);\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, y.data(), n, MPI::DOUBLE, 0);\n x = y;\n } else {\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, y.data(), x.size(),\n MPI::DOUBLE, 0);\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n 
std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= num_ranks;\n }\n\n // gather the results on rank 0\n if (my_rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n\n // normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> result(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, result.data(),\n x.size(), MPI_DOUBLE_COMPLEX, 0);\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&recv_buffer[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&x[0], 
x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute the inverse fft in parallel\n fft(x);\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fourier transform\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fourier transform\n fft(x);\n\n // normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(n);\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, y.data(), n, MPI::DOUBLE, 0);\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "cuda", + "prompt": "/* Compute the inverse fourier transform of x in-place.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\n__global__ void ifft(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n sum += x[j] * cuCexp(make_cuDoubleComplex(0, -2 * M_PI * i * j / N));\n }\n x[i] = sum / make_cuDoubleComplex(N, 0);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = cuCdiv(x[i], N);\n x[i] = cuCmulf(x[i], cuCexp(cuCmulf(cuCmul(cuCimagf(x[i]), -2 * M_PI), i / N)));\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= size) {\n return;\n }\n cuDoubleComplex result = cuCexp(cuCmul(cuDoubleComplex(0, -2 * M_PI * idx / size), x[idx]));\n x[idx] = result;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n cuDoubleComplex term = x[j] * cexp(-2 * M_PI * i * j / N);\n sum = cuCadd(sum, term);\n }\n x[i] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= N) return;\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double re = x[i].x;\n double im = x[i].y;\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n x[i].x += re * cos(angle) - im * sin(angle);\n x[i].y += re * 
sin(angle) + im * cos(angle);\n }\n x[i].x /= N;\n x[i].y /= N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = x[i] / N;\n x[i] = cuCmul(x[i], make_cuDoubleComplex(1.0, 0.0));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * term;\n }\n x[i] = sum / N;\n }\n}\n", + "_t_ = 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= N) return;\n cuDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = {cos(angle), sin(angle)};\n sum += c * x[j];\n }\n x[i] = sum / N;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform FFT on each rank\n fft(x);\n\n // Gather results from all ranks to rank 0\n std::vector> gathered_x;\n if (rank == 0) {\n gathered_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n // Compute the conjugate of x\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated x\n fft(x);\n\n // Normalize the result\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double factor = 1.0 / size;\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= factor;\n }\n\n // If rank 0, gather the results from all ranks\n if (rank == 0) {\n std::vector> recv_buffer(x.size() * size);\n MPI_Allgather(x.data(), x.size(), MPI_COMPLEX, recv_buffer.data(), x.size(), MPI_COMPLEX, MPI_COMM_WORLD);\n x = recv_buffer;\n }\n\n // Compute the inverse FFT of the normalized result\n fft(x);\n\n // Conjugate the result again to get the original values\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // compute the fourier transform\n fft(x);\n\n // compute the inverse fourier transform\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, y.data(), x.size(), MPI::DOUBLE, 0);\n x = y;\n } else {\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide the result among the ranks\n std::vector> recv(x.size() / size);\n MPI_Allgather(&x[rank * x.size() / size], x.size() / size, MPI_COMPLEX, &recv[0], x.size() / size, MPI_COMPLEX, MPI_COMM_WORLD);\n\n // only rank 0 has the complete result\n if (rank == 0) {\n x = recv;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide by the number of ranks to normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buffer[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the inverse fourier transform of x in-place.\n // Use MPI and OpenMP to compute in parallel.\n // Assume MPI has already been 
initialized.\n // Every rank has a complete copy of x.\n // The final result is stored on rank 0.\n\n // Get the number of ranks and the rank of the current process.\n int num_procs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local size of the vector.\n int local_size = x.size() / num_procs;\n\n // Compute the local inverse fourier transform.\n fft(x);\n\n // Reduce the local inverse fourier transforms to the root process.\n std::vector> reduced_x(x.size());\n MPI_Reduce(x.data(), reduced_x.data(), x.size(), MPI_COMPLEX, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // If the current process is the root process, store the reduced inverse fourier transform in the output vector.\n if (rank == 0) {\n x = reduced_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform\n fft(x);\n\n // divide by the number of ranks to normalize\n for (auto &xi : x) {\n xi /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // normalize the result\n std::transform(x.begin(), x.end(), x.begin(), [size](std::complex c) { return c / size; });\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the 
fft in-place\n fft(x);\n\n // compute the inverse fft in-place\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide the result by the number of ranks\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results from all ranks to rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += y[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n // Compute the conjugate of x\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated x\n fft(x);\n\n // Normalize the result\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double factor = 1.0 / size;\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= factor;\n }\n\n // If rank 0, gather the results from all ranks\n if (rank == 0) {\n std::vector> recv_buffer(x.size() * size);\n MPI_Allgather(x.data(), x.size(), MPI_COMPLEX, recv_buffer.data(), x.size(), MPI_COMPLEX, MPI_COMM_WORLD);\n x = recv_buffer;\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv(x.size() * size);\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, recv.data(), x.size(), MPI_DOUBLE_COMPLEX, 0);\n x = recv;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft of x\n fft(x);\n\n // compute the inverse fft of x\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) 
{\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT of the conjugated result\n fft(x);\n\n // Normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv(n);\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n MPI::COMM_WORLD.Recv(&recv[0], n, MPI::DOUBLE_COMPLEX, i, 0);\n for (int j = 0; j < n; ++j) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], n, MPI::DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the inverse Fourier transform in-place\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fourier transform\n fft(x);\n\n // compute the inverse fourier transform\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size() * size);\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = recv_buf;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(Kokkos::View &x, Kokkos::View*> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * std::exp(Kokkos::complex(0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * Kokkos::Math::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(std::cos(angle), std::sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const double PI = 3.14159265358979323846;\n\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n sum += x(j) * std::exp(-2 * PI * std::complex(0, 1) * i * j / N);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = -2 * Kokkos::Math::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(std::cos(angle), std::sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const double PI = 3.14159265358979323846;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * PI * j * i / N;\n sum += x[j] * Kokkos::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::Range(0, N), [=](const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * j * i / N;\n sum += x(j) * std::exp(std::complex(0.0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n const int n = x.size();\n Kokkos::parallel_for(\"DFT\", Kokkos::RangePolicy<>(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 
0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < N; j++) {\n sum += x(j) * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / N);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n const int N = x.extent(0);\n Kokkos::parallel_for(\"DFT\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const auto n = x.size();\n const auto pi = 3.14159265358979323846;\n Kok\u30b3\u30af::Range range(0, n);\n Kok\u30b3\u30af::ParallelFor(\"dft\", range, KOKKOK_LAMBDA(const int i) {\n Kok\u30b3\u30af::\u8907 sum(0.0, 0.0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2.0 * pi * i * j / n);\n }\n output[i] = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n const double pi = 3.14159265358979323846;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x(j) * Kokkos::complex(cos(-2 * pi * i * j / n), sin(-2 * pi * i * j / n));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * j * i / N;\n sum += x(j) * std::polar(1.0, angle);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "(121:1:1:1:1:1:1:", + "\n\n const int n = x.extent(0);\n const double two_pi = 2.0 * Kokkos::Pi;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < n; ++j) {\n sum += x(j) * std::exp(-two_pi * Kokkos::complex(0.0, 1.0) * i * j / n);\n }\n output(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. 
Store the result in output.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n output.resize(n);\n #pragma opm parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (size_t j = 0; j < n; ++j) {\n const double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp 
parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (size_t j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + ")", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "serial", + "prompt": "/* Compute the discrete fourier transform of x. 
Store the result in output.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "<", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "<", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::polar(1.0, -2 * M_PI * k * t / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, -angle);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::polar(1.0, -2 * M_PI * k * t / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int i = 0; i < n; ++i) {\n for (int j = 0; j < n; ++j) {\n output[i] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, angle);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n 
output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "hip", + "prompt": "/* Compute the discrete fourier transform of x. Store the result in output.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\n__global__ void dft(const double *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "(x,y)", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum_real = 0;\n double sum_imag = 0;\n\n for (size_t i = 0; i < N; i++) {\n double angle = 2 * M_PI * gid * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n\n output[gid] = make_cuDoubleComplex(sum_real, sum_imag);\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCadd(0, 0);\n for (int k = 0; k < N; k++) 
{\n double angle = -2 * M_PI * i * k / N;\n output[i] = cuCadd(output[i], cuCmulf(x[k], cuCexp(cuCmulf(0, angle))));\n }\n }\n}\n", + "\n\n int id = blockIdx.x * blockDim.x + threadIdx.x;\n if (id < N) {\n double sum_real = 0, sum_imag = 0;\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * id * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n output[id] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double real = 0;\n double imag = 0;\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n real += x[i] * cos(angle);\n imag += x[i] * sin(angle);\n }\n output[tid] = make_cuDoubleComplex(real, imag);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleReal sum_real = 0;\n cuDoubleReal sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplexf(sum_real, sum_imag);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid >= N) {\n return;\n }\n\n double real = 0;\n double imag = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * gid / N;\n real += x[k] * cos(angle);\n imag += x[k] * sin(angle);\n }\n\n output[gid] = make_cuDoubleComplex(real, imag);\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum_real = 0;\n double sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double real = 0;\n double imag = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * gid / 
N;\n real += x[k] * cos(angle);\n imag += x[k] * sin(angle);\n }\n output[gid] = make_cuDoubleComplex(real, imag);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n double angle = 2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[i] * term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n local_output[i] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / x.size());\n }\n }\n\n // Gather the local DFTs\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes a part of the output\n int start = rank * n / size;\n int end = (rank + 1) * n / size;\n\n for (int i = start; i < end; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * i * j / n;\n sum += std::complex(x[j], 0) * std::exp(std::complex(0, angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT for each rank\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n\n // Gather the results from all ranks\n std::vector> all_output(n * size);\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, all_output.data(), 
n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the gathered results to output\n if (rank == 0) {\n output = all_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / 
n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local contribution of each process\n for (int i = rank; i < n; i += size) {\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n output[j] += x[i] * std::complex(std::cos(angle), std::sin(angle));\n }\n }\n\n // Reduce the local contributions to the global result\n MPI_Allreduce(MPI_IN_PLACE, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local DFT\n for (int i = rank; i < n; i += size) {\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n output[i] += x[j] * std::complex(cos(angle), sin(angle));\n }\n }\n\n // Reduce the local DFTs to the global DFT\n MPI_Allreduce(MPI_IN_PLACE, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "cuda", + "prompt": "/* Compute the discrete fourier transform of x. Store the result in output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\n__global__ void dft(const double *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[i];\n sum += term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[i];\n sum += term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int k = 0; k < N; ++k) {\n double angle = -2 * M_PI * i * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t kBlockSize = 
256;\n const size_t kGridSize = (N + kBlockSize - 1) / kBlockSize;\n const size_t kSharedMemorySize = 2 * kBlockSize;\n\n dim3 blockDim(kBlockSize, 1, 1);\n dim3 gridDim(kGridSize, 1, 1);\n shared double sdata[2 * kBlockSize];\n\n for (size_t i = 0; i < N; ++i) {\n output[i] = make_cuDoubleComplex(0, 0);\n }\n\n for (size_t i = 0; i < N; ++i) {\n sdata[2 * threadIdx.x + 0] = x[i];\n sdata[2 * threadIdx.x + 1] = 0;\n }\n\n __syncthreads();\n\n for (size_t k = 0; k < N; ++k) {\n double e = -2 * M_PI * k / N;\n double c = cos(e);\n double s = sin(e);\n double t0 = 1;\n double t1 = 0;\n for (size_t i = 0; i < N / 2; ++i) {\n double a = sdata[2 * i + 0];\n double b = sdata[2 * i + 1];\n double ta = t0 * a - t1 * b;\n double tb = t0 * b + t1 * a;\n sdata[2 * i + 0] = ta;\n sdata[2 * i + 1] = tb;\n t0 = c;\n t1 = s;\n c = t0 * c - t1 * s;\n s = t0 * s + t1 * c;\n }\n __syncthreads();\n output[k] = make_cuDoubleComplex(sdata[0], sdata[1]);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = threadIdx.x;\n double sum_real = 0;\n double sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplex(sum_real, sum_imag);\n}\n", + "\n const int tid = threadIdx.x + blockIdx.x * blockDim.x;\n const int stride = blockDim.x * gridDim.x;\n for (size_t i = tid; i < N; i += stride) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n const double angle = 2 * M_PI * j * i / N;\n const cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(x[j], term));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n if (gid < N) {\n double sum_real = 0;\n double sum_imag = 0;\n for (size_t i = 0; i < N; i++) {\n double angle = -2 * M_PI * gid * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n output[gid] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += w * x[j];\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(c, make_cuDoubleComplex(x[j], 0)));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n 
cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(local_n);\n std::vector> local_output(local_n);\n std::copy(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n, local_x.begin());\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n sum += local_x[j] * std::exp(-2 * M_PI * i * j / local_n);\n }\n local_output[i] = sum;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_REAL, output.data(), local_n, MPI_DOUBLE_REAL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT for each frequency\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = rank; i < n; i += nprocs) {\n for (int j = 0; j < n; j++) {\n std::complex sum(0, 0);\n for (int k = 0; k < n; k++) {\n sum += x[k] * std::exp(-2 * M_PI * std::complex(0, 1) * i * k / n);\n }\n local_output[i] = sum;\n }\n }\n\n // Gather the local results into the output vector\n MPI_Allgather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the DFT of each element of x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex 
sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial DFT for this rank\n std::vector> partial_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n partial_output[i] = sum;\n }\n\n // Gather the partial results into the output vector\n MPI_Gather(partial_output.data(), n, MPI_DOUBLE_COMPLEX,\n output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Allgather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the DFT of each element in x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n MPI_Allgather(MPI_IN_PLACE, n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element of x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[0], n, MPI_DOUBLE_COMPLEX, &output[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n std::vector> global_output(n * size);\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is the root process, copy the results to the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX,\n output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store the result in output.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(Kokkos::View*> &x, Kokkos::View*> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n using namespace Kokkos::Impl;\n\n const int n = x.extent(0);\n const int n_half = n / 2;\n const int n_quarter = n / 4;\n const int n_eighth = n / 8;\n\n // Create a plan for the FFT\n auto plan = FFTPlan::create(n);\n\n // Execute the FFT\n plan->execute(x, output);\n\n // Reorder the output to be in the correct order\n parallel_for(n, KOKKOS_LAMBDA(int i) {\n if (i < n_half) {\n output(i) = output(i);\n } else {\n output(i) = output(n - i);\n }\n });\n\n // Compute the magnitude of the output\n parallel_for(n, KOKKOS_LAMBDA(int i) { output(i) = abs(output(i)); });\n\n // Normalize the output\n parallel_for(n, KOKKOS_LAMBDA(int i) { output(i) /= n; });\n}\n", + "\n\n int n = x.extent(0);\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n int m = 1 << log_n;\n Kokkos::View*> w(m);\n for (int i = 0; i < m; i++) {\n w(i) = Kokkos::complex(cos(2 * M_PI * i / m), sin(2 * M_PI * i / m));\n }\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int &i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < m; j++) {\n sum += x((i + j) % m) * w(j);\n }\n output(i) = sum;\n });\n}\n", + "\n using namespace Kokkos;\n using namespace std;\n const int N = x.extent(0);\n const int M = output.extent(0);\n const double pi = 3.14159265358979323846;\n parallel_for(\"Fourier Transform\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const int i) {\n output(i) = {0.0, 0.0};\n for (int j = 0; j < N; j++) {\n output(i) += x(j) * exp(complex(0, -2 * pi * i * j / N));\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n\n Kokkos::View*> temp(\"temp\", n);\n\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log_n; j++) {\n rev |= ((i >> j) & 1) << (log_n - j - 1);\n }\n temp(rev) = x(i);\n }\n\n for (int s = 1; s <= log_n; s++) {\n int m = 1 << s;\n int m2 = m >> 1;\n Kokkos::complex w(cos(2 * Kokkos::Pi / m), sin(2 * Kokkos::Pi / m));\n for (int k = 0; k < n; k += m) {\n Kokkos::complex wm(1, 0);\n for (int j = 0; j < m2; j++) {\n Kokkos::complex t = temp(k + j);\n Kokkos::complex u = temp(k + j + m2) * wm;\n temp(k + j) = t + u;\n temp(k + j + m2) = t - u;\n wm *= w;\n }\n }\n }\n\n output = temp;\n}\n", + "\n\n const int N = x.extent(0);\n const int logN = std::log2(N);\n const int halfN = N / 2;\n\n // Reverse bits\n Kokkos::View reversed = Kokkos::View(\"reversed\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) {\n int rev = 0;\n int bit = 1;\n for (int j = 0; j < logN; j++) {\n 
if (i & bit) {\n rev |= 1 << (logN - j - 1);\n }\n bit <<= 1;\n }\n reversed(i) = rev;\n });\n\n // Compute the FFT\n Kokkos::View*> y = Kokkos::View*>(\"y\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { y(i) = x(reversed(i)); });\n\n for (int s = 1; s <= logN; s++) {\n const int m = 1 << s;\n const int m2 = m / 2;\n const double theta = 2 * M_PI / m;\n Kokkos::complex w(std::cos(theta), std::sin(theta));\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) {\n if (i % m < m2) {\n const Kokkos::complex t = w * y(i + m2);\n y(i + m2) = y(i) - t;\n y(i) = y(i) + t;\n }\n });\n }\n\n // Copy the result to the output\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { output(i) = y(i); });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const int logN = std::log2(N);\n\n // Reverse bits\n Kokkos::View reversed(N);\n Kokkos::parallel_for(\"fft_reverse_bits\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int reversed_i = 0;\n int j = i;\n for (int k = 0; k < logN; ++k) {\n reversed_i |= ((j & 1) << (logN - k - 1));\n j >>= 1;\n }\n reversed(i) = reversed_i;\n });\n\n // Compute the FFT\n Kokkos::parallel_for(\"fft_compute\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int j = reversed(i);\n output(i) = x(j);\n });\n\n // Compute the twiddle factors\n Kokkos::View*> twiddle(N);\n Kokkos::parallel_for(\"fft_twiddle\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n double angle = -2 * M_PI * i / N;\n twiddle(i) = Kokkos::complex(std::cos(angle), std::sin(angle));\n });\n\n // Compute the FFT\n for (int s = 1; s <= logN; ++s) {\n const int m = 1 << s;\n const int m2 = m / 2;\n for (int k = 0; k < m2; ++k) {\n Kokkos::parallel_for(\"fft_compute_s\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int j = i & (m - 1);\n if (j < m2) {\n const Kokkos::complex t = twiddle(k * N / m) * output(i + m2);\n output(i + m2) = output(i) - t;\n output(i) += t;\n }\n });\n }\n }\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int N = x.extent(0);\n const int logN = Kokkos::ilog2(N);\n\n auto bit_reverse = [&](int i) {\n int r = 0;\n for (int j = 0; j < logN; ++j) {\n r |= ((i >> j) & 1) << (logN - 1 - j);\n }\n return r;\n };\n\n auto twiddle = [&](int k) {\n return Kokkos::complex(cos(2 * M_PI * k / N), sin(2 * M_PI * k / N));\n };\n\n auto fft_rec = [&](View*> x, int n) {\n if (n == 1) {\n return;\n }\n auto x_even = View*>(x, Range(0, n / 2));\n auto x_odd = View*>(x, Range(n / 2, n));\n fft_rec(x_even, n / 2);\n fft_rec(x_odd, n / 2);\n for (int k = 0; k < n / 2; ++k) {\n auto t = x_odd(k) * twiddle(k);\n x_odd(k) = x_even(k) - t;\n x_even(k) += t;\n }\n };\n\n auto x_bit_reversed = View*>(x.extent(0));\n for (int i = 0; i < N; ++i) {\n x_bit_reversed(bit_reverse(i)) = x(i);\n }\n fft_rec(x_bit_reversed, N);\n output = x_bit_reversed;\n}\n", + "\n\n int n = x.extent(0);\n output.resize(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n if (n == 1) {\n output(0) = x(0);\n return;\n }\n Kokkos::View*> even = Kokkos::subview(output, 
Kokkos::ALL(), Kokkos::Slice(0, n / 2));\n Kokkos::View*> odd = Kokkos::subview(output, Kokkos::ALL(), Kokkos::Slice(n / 2, n));\n Kokkos::parallel_for(n / 2, KOKKOS_LAMBDA(int i) {\n even(i) = x(2 * i);\n odd(i) = x(2 * i + 1);\n });\n fft(even, even);\n fft(odd, odd);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex w = Kokkos::complex(cos(2 * M_PI * i / n), sin(2 * M_PI * i / n));\n output(i) = even(i % (n / 2)) + w * odd(i % (n / 2));\n });\n}\n", + "\n\n // Compute the size of the input\n const int n = x.extent(0);\n\n // Create a scratchpad for the intermediate results\n Kokkos::View*> scratchpad(\"scratchpad\", n);\n\n // Compute the FFT in parallel\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n // Compute the FFT of the i-th element of x\n Kokkos::complex result = std::complex(0.0, 0.0);\n for (int j = 0; j < n; ++j) {\n result += x(j) * std::exp(-2.0 * M_PI * std::complex(0.0, 1.0) * i * j / n);\n }\n\n // Store the result in the scratchpad\n scratchpad(i) = result;\n });\n\n // Copy the results from the scratchpad to the output\n Kokkos::deep_copy(output, scratchpad);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n output(i) = Kokkos::complex(real, imag);\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n output[i] = Kokkos::complex(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n output[i] += x[j] * Kokkos::exp(Kokkos::complex(0.0, -2.0 * Kokkos::Pi * i * j / n));\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n output(0) = x(0);\n for (int k = 1; k < n; ++k) {\n Kokkos::complex w = std::polar(1.0, -2 * Kokkos::Math::pi * k / n);\n Kokkos::complex t = x(0);\n for (int j = 1; j < n; ++j) {\n t += w * x(j);\n w *= w;\n output(j) = t;\n }\n }\n}\n", + "\n\n int n = x.extent(0);\n output.resize(n);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * Kokkos::Impl::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * std::exp(Kokkos::complex(0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n // Get the size of the input\n int n = x.extent(0);\n\n // Create a Kokkos view for the output\n Kokkos::View*> output_view(\"output\", n);\n\n // Compute the FFT in parallel\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) {\n output_view(i) = fft_helper(x, i, n);\n });\n\n // Copy the output view to the output array\n Kokkos::deep_copy(output, output_view);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x(j) * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy<>(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * 
Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n output.resize(n);\n\n Kokkos::parallel_for(\"fft\", n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store the result in output.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n output.resize(n);\n #pragma openmp parallel for\n for (int i = 0; i < n; ++i) {\n double real = 0, imag = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n real += x[j].first * cos(angle) + x[j].second * sin(angle);\n imag += x[j].first * sin(angle) - x[j].second * cos(angle);\n }\n output[i] = {real, imag};\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex arg(2 * M_PI * i * j / n, 0);\n sum += x[j] * std::exp(-arg);\n }\n output[i] = sum;\n }\n}\n", + "<", + "\n\n const int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::polar(1.0, angle);\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n const double pi = std::acos(-1);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = {0, 0};\n for (size_t t = 0; t < n; ++t) {\n double angle = 2 * pi * t * k / n;\n sum += x[t] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex omega = std::polar(1.0, -2 * M_PI * i * j / n);\n sum += x[j] * omega;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 
0);\n for (int j = 0; j < n; ++j) {\n std::complex arg(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * arg;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of x\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex omega = std::polar(1.0, -2 * M_PI * i * j / n);\n sum += x[j] * omega;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x. 
Store the result in output.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double pi = std::acos(-1);\n std::complex w(1), wn(std::cos(2 * pi / n), std::sin(2 * pi / n));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(cos(angle), sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n 
int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2);\n std::vector> odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n 
double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int i = 0; i < n; ++i) {\n for (int j = 0; j < n; ++j) {\n output[i] += x[j] * std::exp(-2 * std::complex(0, 1) * M_PI * i * j / n);\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x. Store the result in output.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fft(const hipDoubleComplex *x, hipDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid >= N) {\n return;\n }\n double arg = -2 * M_PI * gid / N;\n hipDoubleComplex w = hipDoubleComplex(cos(arg), sin(arg));\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n sum += x[i] * pow(w, i * gid);\n }\n output[gid] = sum;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(omega), sin(omega)};\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(theta), sin(theta));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex wp = hipDoubleComplex(1, 0);\n for (size_t k = j; k < j + m; k++) {\n hipDoubleComplex t = wp * output[k + m];\n output[k + m] = output[k] - t;\n output[k] += t;\n wp *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = make_hipDoubleComplex(cos(theta), sin(theta));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex t = make_hipDoubleComplex(1.0, 0.0);\n for 
(size_t k = 0; k < m; k++) {\n hipDoubleComplex u = output[j + k];\n hipDoubleComplex v = output[j + k + m] * t;\n output[j + k] = u + v;\n output[j + k + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / s), sin(M_PI / s));\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex(1.0, 0.0);\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / m), sin(M_PI / m));\n hipDoubleComplex wm = hipDoubleComplex(1.0, 0.0);\n for (size_t j = 0; j < m; j++) {\n for (size_t k = j; k < N; k += s) {\n hipDoubleComplex t = wm * output[k + m];\n output[k + m] = output[k] - t;\n output[k] += t;\n }\n wm *= w;\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n hipDoubleComplex wm = hipDoubleComplex(1, 0);\n for (size_t k = 0; k < m; k++) {\n for (size_t j = 0; j < N / s; j++) {\n size_t index = j * s + k;\n hipDoubleComplex t = wm * output[index + m];\n output[index + m] = output[index] - t;\n output[index] += t;\n }\n wm *= w;\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t half_s = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / half_s), sin(M_PI / half_s));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t k = 0; k < half_s; k++) {\n hipDoubleComplex u = output[j + k];\n hipDoubleComplex v = output[j + k + half_s] * t;\n output[j + k] = u + v;\n output[j + k + half_s] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t tid = hipGetGlobalThreadIdx();\n const size_t N_2 = N / 2;\n const size_t N_4 = N / 4;\n const double arg = -2 * M_PI / N;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n hipDoubleComplex w = {cos(arg * k), sin(arg * k)};\n hipDoubleComplex term = hipCMul(x[k], w);\n sum = hipCAdd(sum, term);\n }\n\n const size_t index = tid;\n if (tid >= N_2) {\n index -= N_2;\n index = N_4 - index - 1;\n index += N_2;\n }\n\n output[index] = sum;\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n double omega = M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(omega), sin(omega)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n 
hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + s] * t;\n output[k + j] = u + v;\n output[k + j + s] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(theta), sin(theta)};\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 1; s <= N; s *= 2) {\n for (size_t k = 0; k < N; k += 2 * s) {\n for (size_t j = k; j < k + s; j++) {\n hipDoubleComplex twiddle = hipDoubleComplex{cos(M_PI * j / s),\n -sin(M_PI * j / s)};\n hipDoubleComplex temp = output[j + s] * twiddle;\n output[j + s] = output[j] - temp;\n output[j] += temp;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / m), sin(M_PI / m));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the 
fourier transform of x. Store the result in output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFT results\n std::vector> global_output(n);\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, global_output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the result in output\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_output(n_local);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), n_local, MPI_DOUBLE_COMPLEX, output.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(n / size);\n fft_local(x, local_fft);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(n);\n MPI_Gather(local_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, global_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter the global FFT to the local FFTs\n MPI_Scatter(global_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, local_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Convert the local FFTs to the output format\n output.resize(n);\n for (int i = 0; i < n; i++) {\n output[i] = local_fft[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = (rank + 1) * local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> 
local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i / x); });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the output vector\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFT results\n std::vector> global_output(n);\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, global_output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the global FFT result to the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(),\n [](std::complex const& x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) {\n return std::exp(std::complex(0, -2 * M_PI * x.real()));\n });\n\n // Gather the local FFTs into 
the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft_local(x, rank, size);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(x.size());\n MPI_Allgather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the global FFT in the output vector\n output = global_fft;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) {\n return std::exp(std::complex(0, -2 * M_PI * x.real()));\n });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform of x.\n std::vector> local_fft = fft(x, rank, size);\n\n // Gather the local Fourier transforms into output.\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX,\n output.data(), output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int n_padded = n_local + 2;\n std::vector> padded_x(n_padded, 0.0);\n std::copy(x.begin() + rank * n_local, x.begin() + (rank + 1) * n_local, padded_x.begin() + 1);\n\n std::vector> padded_output(n_padded, 0.0);\n fft(padded_x, padded_output);\n\n std::vector> output_local(n_local, 0.0);\n std::copy(padded_output.begin() + 1, padded_output.begin() + n_local + 1, output_local.begin());\n\n MPI_Allgather(output_local.data(), n_local, MPI_DOUBLE_PRECISION, output.data(), n_local, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft(local_x, local_output);\n\n // Gather the results\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, 
MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If we are not rank 0, we can return now\n if (rank != 0) {\n return;\n }\n\n // If we are rank 0, we need to combine the results\n for (int i = 1; i < size; i++) {\n int start = i * local_n;\n int end = start + local_n;\n for (int j = start; j < end; j++) {\n output[j] += output[j - start];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local FFT\n std::vector> local_fft(n / size);\n MPI_Scatter(x.data(), n / size, MPI_DOUBLE_COMPLEX, local_fft.data(), n / size, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n fft_local(local_fft);\n\n // Gather the local FFTs\n MPI_Gather(local_fft.data(), n / size, MPI_DOUBLE_COMPLEX, output.data(), n / size, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(output);\n }\n\n // Broadcast the global FFT\n MPI_Bcast(output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft_local(x, rank, size);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(x.size());\n MPI_Allgather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the global FFT in the output vector\n output = global_fft;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x. Store the result in output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fft(const cuDoubleComplex *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmul(2 * M_PI * i / N, cuCmulf(I, x[i])), -1));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(-2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmulf(2 * M_PI * i / N, cuCmulf(0, 1)), x[i]));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuDoubleComplex(0, -2 * M_PI * i / N), x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuDoubleComplex{0, -2 * M_PI * i / N}, x[i]));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmulf(2 * M_PI * i / N, cuDoubleComplex(0, 1)), x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(-2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex y = cuCexp(cuCmul(cuCbrt(-1.0), 2.0 * M_PI * i / N));\n output[i] = cuCmulf(x[i], y);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCmul(x[i], cuCexp(cuCmulf(-2 * M_PI * i / N, cuDoubleComplex{0, 1})));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + ":", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "mpi+omp", + "prompt": 
"#include \n#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local FFT results\n std::vector> global_output(x.size());\n MPI_Allgather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n global_output.data(), global_output.size(), MPI_DOUBLE_COMPLEX,\n MPI_COMM_WORLD);\n\n // Store the result in output\n output = global_output;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_C_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the global DFT into the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output.\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT of the gathered local DFTs.\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local results\n std::vector> global_output(x.size());\n MPI_Allgather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, global_output.data(), global_output.size(), 
MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the result in output\n output = global_output;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(n * size);\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is the root process, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n // Get the size of the input vector.\n int n = x.size();\n\n // Create a MPI communicator for the FFT.\n MPI_Comm fft_comm;\n MPI_Comm_split(MPI_COMM_WORLD, 0, MPI_THREAD_SINGLE, &fft_comm);\n\n // Create an OpenMP parallel region for the FFT.\n#pragma openmp parallel\n {\n // Get the rank of the current thread.\n int tid = openmp_get_thread_num();\n\n // Compute the local FFT of the input vector.\n std::vector> local_fft(n);\n fft(x, local_fft);\n\n // Gather the local FFTs into the output vector.\n#pragma openmp critical\n {\n for (int i = 0; i < n; ++i) {\n output[i] += local_fft[i];\n }\n }\n }\n\n // Free the MPI communicator.\n MPI_Comm_free(&fft_comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_output(n_local);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < n_local; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n_local; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), n_local, MPI_DOUBLE_COMPLEX, output.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = (rank + 1) * local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n std::complex w(cos(2 * M_PI * i * j / local_n), sin(2 * M_PI * i * j / local_n));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local FFT results\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // 
Compute the local DFT of x\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output.\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output\n MPI_Allgather(&local_output[0], n, MPI_DOUBLE_COMPLEX, &output[0], n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n double angle = 2 * M_PI * i * j / local_n;\n std::complex w(cos(angle), sin(angle));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX,\n output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft;\n fft_local(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX,\n global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, 0,\n MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter the global FFT\n MPI_ Scatter(global_fft.data(), global_fft.size() / size, MPI_DOUBLE_COMPLEX,\n local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0,\n MPI_COMM_WORLD);\n\n // Convert the local FFT to the output format\n output.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n output[i] = {local_fft[i].real(), local_fft[i].imag()};\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n local_output[i] = std::complex(0, 0);\n for (int j = 0; j < local_n; j++) {\n double angle = -2 * M_PI * i * j / local_n;\n local_output[i] += local_x[j] * std::exp(std::complex(0, angle));\n }\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), local_n, 
MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(n * size);\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is rank 0, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, store the global DFT in output\n if (rank == 0) {\n output = global_output;\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, Kokkos::View &x, Kokkos::View &y, Kokkos::View &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", Kokkos::RangePolicy<>(0, z.extent(0)), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.size(), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int &i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z[i] = alpha * x[i] + y[i];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int &i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"axpy\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. 
Store the result in z.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma simd\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "serial", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. 
Store the result in z.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (std::size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (std::size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "hip", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\n__global__ void axpy(double alpha, const double *x, const double *y, double *z, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "(x,y,z,t)", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n z[i] = alpha * x[i] + y[i];\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of z\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n z.resize(x.size());\n for (size_t i = 0; i < x.size(); i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Barrier(MPI_COMM_WORLD);\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n\n // Compute the local portion of z\n std::vector local_z(n);\n for (int i = 0; i < n; ++i) {\n local_z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the local portions of z into the global vector z on 
rank 0\n if (rank == 0) {\n z.resize(n);\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its part of the result\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of the vectors\n int local_size = x.size() / size;\n\n // Allocate memory for the local portion of the vectors\n std::vector local_x(local_size);\n std::vector local_y(local_size);\n std::vector local_z(local_size);\n\n // Scatter the vectors to the ranks\n MPI_Scatter(&x[0], local_size, MPI_DOUBLE, &local_x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(&y[0], local_size, MPI_DOUBLE, &local_y[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local portion of the result\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * local_x[i] + local_y[i];\n }\n\n // Gather the results from the ranks\n MPI_Gather(&local_z[0], local_size, MPI_DOUBLE, &z[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n int n = x.size();\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n \n if (rank != 0) {\n MPI_Send(&z[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector tmp(n);\n MPI_Recv(&tmp[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n z[j] += tmp[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of z\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n 
MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "cuda", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\n__global__ void axpy(double alpha, const double *x, const double *y, double *z, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "{,", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, n_threads;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n #pragma omp parallel\n {\n n_threads = omp_get_num_threads();\n }\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "(", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector local_z(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n local_z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n z.resize(n);\n\n#pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the vector\n int local_size = x.size() / size;\n\n // Compute the local start index\n int local_start = rank * local_size;\n\n // Compute the local end index\n int local_end = (rank + 1) * local_size;\n\n // Compute the local part of the result\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_z[i] = alpha * x[local_start + i] + y[local_start + i];\n }\n\n // Gather the local results into the global result\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank == 0) {\n z.resize(x.size());\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the local portion of x and y\n int local_size = x.size() / nprocs;\n\n // Compute the starting index of the local portion of x and y\n int start_index = rank * local_size;\n\n // Compute the local portion of z\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_z[i] = alpha * x[start_index + i] + y[start_index + i];\n }\n\n // Gather the local portions of z into the global vector z on rank 0\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, 
size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of the vector\n int local_size = x.size() / size;\n\n // Compute the starting index of the local portion of the vector\n int start_index = rank * local_size;\n\n // Compute the local portion of the vector\n std::vector local_x(x.begin() + start_index, x.begin() + start_index + local_size);\n std::vector local_y(y.begin() + start_index, y.begin() + start_index + local_size);\n\n // Compute the local portion of the result\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * local_x[i] + local_y[i];\n }\n\n // Gather the local portions of the result into the global result\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of x and y\n int local_size = x.size() / size;\n\n // Compute the starting index of the local portion of x and y\n int start_index = rank * local_size;\n\n // Compute the local portion of z\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * x[start_index + i] + y[start_index + i];\n }\n\n // Gather the local portions of z into the global vector z on rank 0\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of each block\n int block_size = x.size() / size;\n\n // Compute the starting index of each block\n int 
start_index = rank * block_size;\n\n // Compute the ending index of each block\n int end_index = start_index + block_size;\n\n // If the rank is the last rank, then the ending index is the size of the vector\n if (rank == size - 1) {\n end_index = x.size();\n }\n\n // Compute the local result\n std::vector local_z(block_size);\n for (int i = start_index; i < end_index; i++) {\n local_z[i - start_index] = alpha * x[i] + y[i];\n }\n\n // Reduce the local results to the global result\n MPI_Allreduce(local_z.data(), z.data(), block_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n z.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of each chunk of data that each rank will process\n int chunk_size = x.size() / size;\n\n // Compute the starting index of the data that each rank will process\n int start_index = rank * chunk_size;\n\n // Compute the ending index of the data that each rank will process\n int end_index = (rank + 1) * chunk_size;\n\n // If the rank is the last rank, then the ending index is the size of the data\n if (rank == size - 1) {\n end_index = x.size();\n }\n\n // Compute the local result for this rank\n std::vector local_result(chunk_size);\n #pragma omp parallel for\n for (int i = start_index; i < end_index; i++) {\n local_result[i - start_index] = alpha * x[i] + y[i];\n }\n\n // Gather the local results from all ranks into the global result on rank 0\n MPI_Gather(local_result.data(), chunk_size, MPI_DOUBLE, z.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix. x and b have N elements.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(Kokkos::View &A, Kokkos::View &b, Kokkos::View &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::Range(0, N), [=](const int i) {\n double sum = 0;\n for (int j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n KokConfigService config;\n config.set_num_threads(KokConfigService::num_hardware_threads());\n KokConfigService::set_configuration(config);\n KokConfigService::print_configuration();\n\n KokConfigService::set_configuration(config);\n KokConfigService::print_configuration();\n\n KokKok::parallel_for(N, KOK_LAMBDA(size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A[i][j] * x[j]; }\n x[i] = (b[i] - sum) / A[i][i];\n });\n}\n", + "\n\n // Create a Kokkos view for the matrix A.\n Kokkos::View A_view = Kokkos::View(\"A\", N, N);\n Kokkos::deep_copy(A_view, A);\n\n // Create a Kokkos view for the vector b.\n Kokkos::View b_view = Kokkos::View(\"b\", N);\n Kokkos::deep_copy(b_view, b);\n\n // Create a Kokkos view for the vector x.\n Kokkos::View x_view = Kokkos::View(\"x\", N);\n\n // Solve the linear system using the Kokkos parallel backend.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A_view(i, j) * x_view(j);\n }\n x_view(i) = (b_view(i) - sum) / A_view(i, i);\n });\n\n // Copy the solution vector x back to the output view.\n Kokkos::deep_copy(x, x_view);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution.\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define a Kokkos kernel functor to perform the computation.\n Kokkos::KernelFunctor kernelFunctor([&](const int i) {\n // Initialize the solution vector x.\n x(i) = 0.0;\n\n // Perform forward substitution to solve the linear system.\n for (int j = 0; j < i; ++j) {\n x(i) -= A(i, j) * x(j);\n }\n\n // Perform backward substitution to solve the linear system.\n for (int j = i + 1; j < N; ++j) {\n x(i) -= A(i, j) * x(j);\n }\n\n // Divide by the diagonal element to get the final solution.\n x(i) /= A(i, i);\n });\n\n // Execute the kernel functor using the Kokkos team policy.\n Kokkos::parallel_for(policy, kernelFunctor);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution.\n Kokkos::TeamPolicy policy(N);\n\n // Create a Kokkos parallel loop to solve the linear system.\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Solve the ith equation of the linear system.\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy 
policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int& i) {\n // Initialize the solution vector x\n x[i] = 0.0;\n\n // Compute the dot product of the ith row of A and x\n double dot_product = 0.0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A(i, j) * x[j];\n }\n\n // Update the solution vector x\n x[i] = (b[i] - dot_product) / A(i, i);\n });\n}\n", + "\n\n Kokkos::View A_copy = Kokkos::View(\"A_copy\", N, N);\n Kokkos::deep_copy(A_copy, A);\n Kokkos::View b_copy = Kokkos::View(\"b_copy\", N);\n Kokkos::deep_copy(b_copy, b);\n Kokkos::View x_copy = Kokkos::View(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n\n for (size_t i = 0; i < N; i++) {\n double max_val = std::abs(A_copy(i, i));\n size_t max_idx = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A_copy(j, i)) > max_val) {\n max_val = std::abs(A_copy(j, i));\n max_idx = j;\n }\n }\n if (max_val == 0) {\n std::cout << \"The matrix is singular and cannot be inverted.\" << std::endl;\n return;\n }\n for (size_t j = 0; j < N; j++) {\n std::swap(A_copy(i, j), A_copy(max_idx, j));\n }\n std::swap(b_copy(i), b_copy(max_idx));\n for (size_t j = 0; j < N; j++) {\n if (i != j) {\n double factor = A_copy(j, i) / A_copy(i, i);\n for (size_t k = 0; k < N; k++) {\n A_copy(j, k) -= factor * A_copy(i, k);\n }\n b_copy(j) -= factor * b_copy(i);\n }\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A_copy(i, j) * x_copy(j);\n }\n x_copy(i) = (b_copy(i) - sum) / A_copy(i, i);\n }\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int& i) {\n // Initialize the solution vector x\n x[i] = 0.0;\n\n // Iterate over the rows of the matrix A\n for (size_t j = 0; j < N; ++j) {\n // Compute the dot product of the ith row of A and the solution vector x\n double dot_product = 0.0;\n for (size_t k = 0; k < N; ++k) {\n dot_product += A(i, k) * x[k];\n }\n\n // Update the ith element of the solution vector x\n x[i] = (b[i] - dot_product) / A(i, i);\n }\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to be executed in parallel\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Initialize the solution vector x\n x(i) = 0.0;\n\n // Compute the dot product of the ith row of A and x\n for (int j = 0; j < N; ++j) {\n x(i) += A(i, j) * x(j);\n }\n\n // Subtract the dot product from b(i) to get the solution\n x(i) = b(i) - x(i);\n });\n}\n", + "\n\n // Create a copy of A for in-place LU decomposition\n auto A_copy = Kokkos::subview(A, Kokkos::ALL(), Kokkos::ALL(), Kokkos::NO_OFFSET, Kokkos::NO_OFFSET);\n\n // Perform LU decomposition\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n for (size_t j = 0; j < N; ++j) {\n A_copy(i, j) /= A_copy(i, i);\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = 0; j < N; ++j) {\n if (k > i) {\n A_copy(k, j) -= A_copy(k, i) * A_copy(i, j);\n } else if (k < i) {\n A_copy(k, j) -= A_copy(k, i) * A_copy(i, j);\n }\n }\n }\n });\n\n // Solve the system\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n x(i) = b(i) / A_copy(i, i);\n for (size_t j = 0; 
j < N; ++j) {\n if (j > i) {\n x(j) -= x(i) * A_copy(j, i);\n }\n }\n });\n\n // Backward substitution\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n for (size_t j = 0; j < N; ++j) {\n if (j < i) {\n x(j) -= x(i) * A_copy(i, j);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Initialize the solution vector x\n x(i) = 0.0;\n\n // Compute the dot product of the ith row of A and x\n double dot_product = 0.0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A(i, j) * x(j);\n }\n\n // Update the solution vector x\n x(i) = (b(i) - dot_product) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "_t", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. 
x and b have N elements.\n Use OpenMP to compute in parallel.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n #pragma omp parallel for\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Forward elimination\n for (size_t j = i + 1; j < N; j++) {\n double factor = A_copy[j][i] / A_copy[i][i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j][k] -= factor * A_copy[i][k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Back substitution\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A_copy[i][j] * x[j];\n }\n x[i] /= A_copy[i][i];\n }\n}\n", + "\n\n // Solve the system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n x[N - 1] = b[N - 1] / A[(N - 1) * N + (N - 1)];\n for (int i = N - 2; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Solve the system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double inv_aii = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] *= inv_aii;\n }\n b[i] *= inv_aii;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n x[N - 1] = b[N - 1];\n for (int i = N - 2; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = b[i] - sum;\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Perform Gaussian elimination in parallel\n 
#pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A_local[i * N + i];\n for (size_t j = i; j < N; ++j) {\n A_local[i * N + j] *= factor;\n }\n b_local[i] *= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A_local[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A_local[j * N + k] -= multiplier * A_local[i * N + k];\n }\n b_local[j] -= multiplier * b_local[i];\n }\n }\n\n // Perform back substitution in parallel\n #pragma omp parallel for\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_local[i * N + j] * x[j];\n }\n x[i] = (b_local[i] - sum) / A_local[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = i; j < N; j++) {\n A_copy[i * N + j] *= factor;\n }\n b_copy[i] *= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A_copy[j * N + i];\n for (size_t k = i; k < N; k++) {\n A_copy[j * N + k] -= multiplier * A_copy[i * N + k];\n }\n b_copy[j] -= multiplier * b_copy[i];\n }\n }\n\n // Perform back substitution to solve for x\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (int j = i + 1; j < N; j++) {\n x[i] -= A_copy[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an NxN matrix in row-major. x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a copy of A and b for each thread.\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Solve the linear system for each thread.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Forward elimination.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_local[j][i] / A_local[i][i];\n for (size_t k = 0; k < N; ++k) {\n A_local[j][k] -= factor * A_local[i][k];\n }\n b_local[j] -= factor * b_local[i];\n }\n\n // Back substitution.\n x[i] = b_local[i] / A_local[i][i];\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A_local[i * N + j] * x[j];\n }\n x[i] = (b_local[i] - sum) / A_local[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b 
for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A_copy[i][i];\n for (size_t j = i; j < N; ++j) {\n A_copy[i][j] /= factor;\n }\n b_copy[i] /= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A_copy[j][i];\n for (size_t k = i; k < N; ++k) {\n A_copy[j][k] -= multiplier * A_copy[i][k];\n }\n b_copy[j] -= multiplier * b_copy[i];\n }\n }\n\n // Perform back substitution in parallel\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_copy[i][j] * x[j];\n }\n x[i] = (b_copy[i] - sum) / A_copy[i][i];\n }\n}\n", + "\n\n // Create a copy of A and b to avoid modifying the input\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A_copy[i * N + j] *= factor;\n }\n b_copy[i] *= factor;\n\n // Subtract the ith row from all other rows\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_copy[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A_copy[j * N + k] -= factor * A_copy[i * N + k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Perform back substitution\n x.resize(N);\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_copy[i * N + j] * x[j];\n }\n x[i] = (b_copy[i] - sum) / A_copy[i * N + i];\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an N*N matrix in row-major. 
x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a vector to store the solution.\n x.resize(N);\n\n // Create a vector to store the residual.\n std::vector r(N);\n\n // Initialize the residual.\n for (size_t i = 0; i < N; i++) {\n r[i] = b[i];\n }\n\n // Solve the linear system using the conjugate gradient method.\n for (size_t k = 0; k < N; k++) {\n // Compute the dot product of the residual and the residual.\n double r_dot_r = 0.0;\n #pragma omp parallel for reduction(+:r_dot_r)\n for (size_t i = 0; i < N; i++) {\n r_dot_r += r[i] * r[i];\n }\n\n // Check if the residual is small enough.\n if (r_dot_r < 1e-10) {\n break;\n }\n\n // Compute the dot product of the residual and the matrix-vector product.\n double r_dot_Ax = 0.0;\n #pragma omp parallel for reduction(+:r_dot_Ax)\n for (size_t i = 0; i < N; i++) {\n double tmp = 0.0;\n for (size_t j = 0; j < N; j++) {\n tmp += A[i * N + j] * x[j];\n }\n r_dot_Ax += r[i] * tmp;\n }\n\n // Compute the step size.\n double alpha = r_dot_r / r_dot_Ax;\n\n // Update the solution.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] += alpha * r[i];\n }\n\n // Update the residual.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n double tmp = 0.0;\n for (size_t j = 0; j < N; j++) {\n tmp += A[i * N + j] * x[j];\n }\n r[i] -= alpha * tmp;\n }\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Forward elimination\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_copy[j][i] / A_copy[i][i];\n for (size_t k = 0; k < N; ++k) {\n A_copy[j][k] -= factor * A_copy[i][k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Back substitution\n x.resize(N);\n for (int i = N - 1; i >= 0; --i) {\n x[i] = b_copy[i];\n for (int j = i + 1; j < N; ++j) {\n x[i] -= A_copy[i][j] * x[j];\n }\n x[i] /= A_copy[i][i];\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n #pragma omp parallel for\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel\n {\n // Each thread works on a different row of A\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A_local[i][i];\n for (size_t j = i; j < N; ++j) {\n A_local[i][j] /= factor;\n }\n b_local[i] /= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_local[j][i];\n for (size_t k = i; k < N; ++k) {\n A_local[j][k] -= factor * A_local[i][k];\n }\n b_local[j] -= factor * b_local[i];\n }\n }\n }\n\n // Back substitution to solve for x\n for (int 
i = N - 1; i >= 0; --i) {\n x[i] = b_local[i];\n for (int j = i + 1; j < N; ++j) {\n x[i] -= A_local[i][j] * x[j];\n }\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an NxN matrix in row-major. x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a vector to store the solution.\n x.resize(N);\n\n // Create a temporary vector to store the intermediate results.\n std::vector tmp(N);\n\n // Solve the linear system in parallel.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Compute the ith row of the solution.\n tmp[i] = b[i];\n for (size_t j = 0; j < i; ++j) {\n tmp[i] -= A[i * N + j] * x[j];\n }\n tmp[i] /= A[i * N + i];\n }\n\n // Copy the solution to the output vector.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = tmp[i];\n }\n}\n", + "\n\n // Create OpenMP parallel region\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Initialize x to 0.\n x.assign(N, 0);\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Forward substitution\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < i; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Backward substitution\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] -= sum;\n }\n}\n", + "\n\n // Create a copy of A and b for each thread\n std::vector> A_local(omp_get_num_threads(), std::vector(N * N));\n std::vector> b_local(omp_get_num_threads(), std::vector(N));\n\n // Copy A and b to the local arrays\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n A_local[tid][i * N + j] = A[i * N + j];\n }\n b_local[tid][i] = b[i];\n }\n }\n\n // Solve the linear system for each thread\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n std::vector x_local(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < i; j++) {\n sum += A_local[tid][i * N + j] * x_local[j];\n }\n x_local[i] = (b_local[tid][i] - sum) / A_local[tid][i * N + i];\n }\n\n // Copy the local solution to the global solution\n #pragma omp critical\n {\n for (size_t i = 0; i < N; i++) {\n x[i] += x_local[i];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "serial", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. 
x and b have N elements.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n x.resize(N);\n std::vector y(N);\n for (size_t i = 0; i < N; i++) {\n y[i] = b[i];\n }\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] -= sum;\n x[i] = y[i] / A[i * N + i];\n }\n for (size_t i = N - 1; i > 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] -= sum;\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(Ab[j * (N + 1) + i]) > std::abs(Ab[pivot_row * (N + 1) + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N + 1; ++j) {\n std::swap(Ab[i * (N + 1) + j], Ab[pivot_row * (N + 1) + j]);\n }\n }\n\n // Eliminate the current column below the pivot row\n for (size_t j = i + 1; j < N; ++j) {\n double factor = Ab[j * (N + 1) + i] / Ab[i * (N + 1) + i];\n for (size_t k = 0; k < N + 1; ++k) {\n Ab[j * (N + 1) + k] -= factor * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Check if the system is consistent\n for (size_t i = 0; i < N; ++i) {\n if (std::abs(Ab[i * (N + 1) + N]) < 1e-9) {\n 
throw std::runtime_error(\"The system is inconsistent.\");\n }\n }\n\n // Solve the system by back substitution\n x.resize(N);\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = Ab[i * (N + 1) + N];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= Ab[i * (N + 1) + j] * x[j];\n }\n x[i] /= Ab[i * (N + 1) + i];\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n std::vector a = A;\n std::vector bb = b;\n for (size_t i = 0; i < N; i++) {\n size_t pivot = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(a[j * N + i]) > std::abs(a[pivot * N + i])) {\n pivot = j;\n }\n }\n if (pivot != i) {\n std::swap(a[i * N + i], a[pivot * N + i]);\n std::swap(bb[i], bb[pivot]);\n }\n for (size_t j = i + 1; j < N; j++) {\n double factor = a[j * N + i] / a[i * N + i];\n for (size_t k = i; k < N; k++) {\n a[j * N + k] -= factor * a[i * N + k];\n }\n bb[j] -= factor * bb[i];\n }\n }\n for (size_t i = N - 1; i < N; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += a[i * N + j] * x[j];\n }\n x[i] = (bb[i] - sum) / a[i * N + i];\n }\n}\n", + "\n\n std::vector c(N);\n for (size_t i = 0; i < N; i++) {\n c[i] = b[i];\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double t = A[j][i] / A[i][i];\n for (size_t k = 0; k < N; k++) {\n A[j][k] -= t * A[i][k];\n }\n c[j] -= t * c[i];\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n double t = c[i];\n for (size_t j = i + 1; j < N; j++) {\n t -= A[i][j] * x[j];\n }\n x[i] = t / A[i][i];\n }\n}\n", + "\n\n // Create a copy of A\n std::vector A_copy(A);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A_copy[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A_copy[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j * N + k] -= factor * A_copy[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Perform back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A_copy[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A_copy[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find pivot\n size_t pivot = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot * N + i])) {\n pivot = j;\n }\n }\n // Swap rows\n if (pivot != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot * N + j]);\n }\n std::swap(b[i], b[pivot]);\n }\n // Eliminate\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] 
* x[j];\n }\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector> aug(N, std::vector(N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n aug[i][j] = A[i * N + j];\n }\n aug[i][N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double div = aug[i][i];\n for (size_t j = 0; j <= N; ++j) {\n aug[i][j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows\n for (size_t j = i + 1; j < N; ++j) {\n double mult = aug[j][i];\n for (size_t k = 0; k <= N; ++k) {\n aug[j][k] -= mult * aug[i][k];\n }\n }\n }\n\n // Extract the solution from the augmented matrix\n for (size_t i = 0; i < N; ++i) {\n x[i] = aug[i][N];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gauss-Jordan elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows to zero out the ith column\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; i++) {\n x[i] = Ab[i * (N + 1) + N];\n }\n}\n", + "\n\n std::vector A_copy = A;\n std::vector b_copy = b;\n for (size_t i = 0; i < N; i++) {\n double factor = A_copy[i][i];\n for (size_t j = 0; j < N; j++) {\n A_copy[i][j] /= factor;\n }\n b_copy[i] /= factor;\n for (size_t j = i + 1; j < N; j++) {\n double factor2 = A_copy[j][i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j][k] -= factor2 * A_copy[i][k];\n }\n b_copy[j] -= factor2 * b_copy[i];\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A_copy[i][j] * x[j];\n }\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = b[i] - sum;\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; i++) {\n x[i] = 
Ab[i * (N + 1) + N];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap pivot row with current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector augmentedMatrix(N * (N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n augmentedMatrix[i * (N + 1) + j] = A[i * N + j];\n }\n augmentedMatrix[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = augmentedMatrix[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; ++j) {\n augmentedMatrix[i * (N + 1) + j] /= factor;\n }\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = 0; j < N; ++j) {\n if (i == j) continue;\n double factor = augmentedMatrix[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; ++k) {\n augmentedMatrix[j * (N + 1) + k] -= factor * augmentedMatrix[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; ++i) {\n x[i] = augmentedMatrix[i * (N + 1) + N];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i 
* N + i];\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from all other rows\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution from the augmented matrix\n for (size_t i = 0; i < N; i++) {\n x[i] = Ab[i * (N + 1) + N];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "hip", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\n__global__ void solveLinearSystem(const double *A, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] -= sum * A[row * N + col] / A[col * N + col];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x + hipGridDim_x * hipBlockIdx_y;\n if (tid < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[bid * N + i] * x[i];\n }\n x[tid] = (b[bid] - sum) / A[bid * N + tid];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n x[i] += A[i * N + j] * b[j];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n x[row] = b[row] / A[row * N + col];\n } else {\n x[row] -= A[row * N + col] * x[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n int col = hipBlockIdx_x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n 
sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n }\n __syncthreads();\n for (int k = row + 1; k < N; k++) {\n if (A[k * N + col] != 0) {\n double factor = A[k * N + col] / A[row * N + col];\n for (int j = 0; j < N; j++) {\n A[k * N + j] -= factor * A[row * N + j];\n }\n b[k] -= factor * b[row];\n }\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n }\n __syncthreads();\n for (int i = row + 1; i < N; i++) {\n double factor = A[i * N + col] / A[row * N + col];\n for (int k = 0; k < N; k++) {\n A[i * N + k] -= factor * A[row * N + k];\n }\n b[i] -= factor * b[row];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; ++k) {\n sum += A[row * N + k] * x[k];\n }\n A[row * N + col] -= A[col * N + col] * sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[col] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n x[i] += A[i * N + j] * b[j];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row 
= blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. Store the result in x on rank 0.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the linear system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solutions from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n // Create a matrix object\n Eigen::MatrixXd A_eigen(N, N);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n A_eigen(i, j) = A[i * N + j];\n }\n }\n\n // Create a vector object for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; i++) {\n b_eigen(i) = b[i];\n }\n\n // Solve the system\n Eigen::VectorXd x_eigen = A_eigen.colPivHouseholderQr().solve(b_eigen);\n\n // Copy the solution to x\n for (size_t i = 0; i < N; i++) {\n x[i] = x_eigen(i);\n }\n }\n\n // Broadcast x to all ranks\n MPI_Bcast(x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "_t*", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector local_x(N, 0);\n std::vector local_b(N, 0);\n for (size_t i = 0; i < N; i++) {\n local_b[i] = b[i];\n }\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_x[i] += A[i * N + j] * local_b[j];\n }\n }\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct solution\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves for a portion of x\n size_t chunk_size = (N + size - 
1) / size;\n size_t start_idx = rank * chunk_size;\n size_t end_idx = std::min(start_idx + chunk_size, N);\n\n std::vector local_x(chunk_size);\n for (size_t i = start_idx; i < end_idx; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n local_x[i - start_idx] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solutions from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Rank 0 gathers the solution\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n x.resize(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n }\n\n // Gather the solution from rank 0 to all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Solve the linear system on rank 0\n if (rank == 0) {\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i > 0; --i) {\n for (size_t j = i - 1; j >= 0; --j) {\n b[j] -= A[j * N + i] * b[i];\n }\n }\n\n // Store the solution in x\n x = b;\n }\n\n // Broadcast the solution to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system locally on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the solution\n std::vector local_x(N);\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on rank 0\n if (rank == 0) {\n // Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (int j = i + 1; j < N; j++) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n }\n\n // Broadcast the solution to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Each rank solves a part of the system\n for (size_t i = rank; i < N; i += size) {\n double factor = A[i * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Broadcast the updated row\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, rank, comm);\n MPI_Bcast(&b[i], 1, MPI_DOUBLE, rank, comm);\n\n // Update the remaining rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Gather the solution from all ranks\n MPI_Gather(&b[rank], 1, MPI_DOUBLE, &x[0], 1, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system locally\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions to rank 0\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system on rank 0\n if (rank == 0) {\n // Create a matrix object\n Eigen::MatrixXd A_eigen(N, N);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n 
A_eigen(i, j) = A[i * N + j];\n }\n }\n\n // Create a vector for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; i++) {\n b_eigen(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_eigen = A_eigen.colPivHouseholderQr().solve(b_eigen);\n\n // Copy the result to x\n for (size_t i = 0; i < N; i++) {\n x[i] = x_eigen(i);\n }\n }\n\n // Broadcast the result to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Check if the input is valid\n if (A.size() != N * N || b.size() != N || x.size() != N) {\n throw std::invalid_argument(\"Invalid input\");\n }\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (int j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "cuda", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\n__global__ void solveLinearSystem(const double *A, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + ":", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * 
N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n for (int k = 0; k < N; k++) {\n x[row] -= A[row * N + k] * b[k];\n }\n x[row] /= A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int i = 0; i < col; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n } else if (row > col) {\n double sum = 0;\n for (int i = 0; i < col; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n for (int i = col + 1; i < row; i++) {\n A[row * N + i] -= A[row * N + col] * A[i * N + col] / A[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n } else {\n x[row] -= A[row * N + col] * x[col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n x[row] = b[row] / A[row * N + col];\n }\n for (int i = col + 1; i < N; i++) {\n x[row] -= A[row * N + i] * x[i] / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Each rank solves for a portion of x\n #pragma omp parallel for\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data() + rank * N / size, N / size, MPI_DOUBLE, x.data(), N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Compute the inverse of A on rank 0.\n std::vector invA;\n if (my_rank == 0) {\n invA.resize(N * N);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n invA[i * N + j] = (i == j) ? 
1.0 : 0.0;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n double inv = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n invA[i * N + j] *= inv;\n A[i * N + j] *= inv;\n }\n b[i] *= inv;\n }\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n invA[j * N + k] -= factor * invA[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n for (size_t i = N - 1; i > 0; --i) {\n for (size_t j = i - 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n invA[j * N + k] -= factor * invA[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Broadcast the inverse of A to all ranks.\n MPI_Bcast(invA.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on all ranks.\n x.resize(N);\n for (size_t i = 0; i < N; ++i) {\n x[i] = 0.0;\n for (size_t j = 0; j < N; ++j) {\n x[i] += invA[i * N + j] * b[j];\n }\n }\n\n // Gather the results to rank 0.\n if (my_rank == 0) {\n for (size_t i = 1; i < num_ranks; ++i) {\n std::vector recv_x(N);\n MPI_Recv(recv_x.data(), N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N; ++j) {\n x[j] += recv_x[j];\n }\n }\n } else {\n MPI_Send(x.data(), N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Rank 0 gathers the solution from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the solution\n std::vector local_x(N);\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_x[i] += A[i * N + j] * b[j];\n }\n }\n\n // Gather the local solutions into the global solution\n MPI_Allgather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete solution\n if (rank == 0) {\n for (size_t i = 1; i < size; i++) {\n for (size_t j = 0; j < N; j++) {\n x[j] += local_x[i * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves the system for a subset of the rows\n size_t rowsPerRank = (N + size - 1) / size;\n size_t startRow = rank * 
rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n // Solve the system for the subset of rows\n std::vector localX(endRow - startRow);\n for (size_t i = startRow; i < endRow; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n localX[i - startRow] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(localX.data(), endRow - startRow, MPI_DOUBLE, x.data(), endRow - startRow, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Each rank computes its part of the solution\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n \n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system on each rank\n std::vector local_x(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row.\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row.\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n\n // Eliminate the current column.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n std::vector local_x(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Check if the input is valid\n if (A.size() != N * N || b.size() != N || x.size() != N) {\n throw std::invalid_argument(\"Invalid input\");\n }\n\n // Initialize x to 0\n std::fill(x.begin(), x.end(), 0.0);\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for 
(size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(x.data(), N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the local portion of the solution\n std::vector local_x(N, 0.0);\n for (size_t i = my_rank; i < N; i += num_procs) {\n for (size_t j = 0; j < N; ++j) {\n local_x[i] += A[i * N + j] * b[j];\n }\n }\n\n // Reduce the local solutions to get the global solution\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves for a subset of the rows of x.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Solve for the subset of rows of x on this rank.\n for (size_t i = start_row; i < end_row; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks.\n MPI_Gather(x.data() + start_row, end_row - start_row, MPI_DOUBLE,\n x.data(), end_row - start_row, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the matrix\n size_t local_N = (N + size - 1) / size;\n\n // Allocate memory for the local matrix and vector\n std::vector local_A(local_N * N);\n std::vector local_b(local_N);\n\n // Scatter the matrix and vector to the local arrays\n MPI_Scatter(A.data(), N * N, MPI_DOUBLE, local_A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(b.data(), N, MPI_DOUBLE, local_b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the local linear system\n std::vector 
local_x(local_N);\n for (size_t i = 0; i < local_N; i++) {\n double sum = 0.0;\n for (size_t j = 0; j < N; j++) {\n sum += local_A[i * N + j] * local_x[j];\n }\n local_x[i] = (local_b[i] - sum) / local_A[i * N + i];\n }\n\n // Gather the local solutions to the global vector\n MPI_Gather(local_x.data(), local_N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Normalize the global solution\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n x[i] /= size;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n local_x[i] = b[i];\n for (size_t j = 0; j < i; ++j) {\n local_x[i] -= A[i * N + j] * x[j];\n }\n }\n for (size_t i = N - 1; i >= 0; --i) {\n for (size_t j = i + 1; j < N; ++j) {\n local_x[i] -= A[i * N + j] * local_x[j];\n }\n local_x[i] /= A[i * N + i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the linear system in parallel\n #pragma omp parallel\n {\n // Each thread solves a part of the system\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row.\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row.\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n\n // Eliminate the current column.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(Kokkos::View &A, Kokkos::View &B, Kokkos::View &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; ++j) {\n double sum = 0.0;\n for (int k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k) * B(k, j);\n }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n 
});\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. 
The matrices are stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma opm parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (int i = 0; i < M; ++i) {\n for (int j = 0; j < N; ++j) {\n C[i * N + j] = 0;\n for (int k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n 
}\n}\n", + "\n\n C.resize(M * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n C.resize(M * N);\n omp_set_num_threads(omp_get_max_threads());\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "serial", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. 
The matrices are stored in row-major.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n C[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A[i * K + k] * B[k * N + j]; }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 
0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "hip", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an MxN grid of threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\n__global__ void gemm(const double *A, const double *B, double *C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) { sum += A[row * K + k] * B[k * N + col]; }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n size_t row = blockIdx.x;\n size_t col = blockIdx.y;\n C[row * N + col] = 0;\n for (size_t k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = 
blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n C[row * N + col] = 0;\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0.0;\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n size_t col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and B. 
Store the result in C on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the local portion of C.\n std::vector local_C(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to the global C on rank 0.\n MPI_Reduce(local_C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of processes and the rank of the current process\n int num_procs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the number of rows each process will compute\n int rows_per_proc = M / num_procs;\n\n // Compute the starting row for each process\n int start_row = rank * rows_per_proc;\n\n // Compute the ending row for each process\n int end_row = start_row + rows_per_proc;\n\n // Initialize the result matrix on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n\n // Compute the partial result on each process\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n double dot_product = 0;\n for (int k = 0; k < K; ++k) {\n dot_product += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = dot_product;\n }\n }\n\n // Gather the partial results on rank 0\n if (rank != 0) {\n MPI_Gather(&C[start_row * N], rows_per_proc * N, MPI_DOUBLE, &C[0], rows_per_proc * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast M, K, and N to all ranks.\n MPI_Bcast(&M, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n MPI_Bcast(&K, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n MPI_Bcast(&N, 1, MPI_UNSIGNED_LONG, 0, 
MPI_COMM_WORLD);\n\n // Compute the local portion of C.\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Reduce the local portions of C to rank 0.\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast A and B to all ranks\n MPI_Bcast(&A[0], M * K, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&B[0], K * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute C on each rank\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Gather C from all ranks to rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local part of C\n std::vector local_C(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n MPI_Gather(local_C.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_C.data(), M * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and the number of processes\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the local size of the matrix C\n size_t local_M = M / num_procs;\n if (rank == num_procs - 1) {\n local_M += M % num_procs;\n }\n\n // Compute the local part of the matrix C\n std::vector local_C(local_M * N);\n for (size_t i = 0; i < local_M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N + j] = sum;\n }\n }\n\n // Gather the local parts of the matrix C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 0; i < num_procs; ++i) {\n size_t local_M = M / num_procs;\n if (i == num_procs - 1) {\n local_M += M % num_procs;\n }\n MPI_Recv(&C[i * local_M * N], local_M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_C[0], local_M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the local portion of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to the root rank.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N, 0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * 
B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(M * N);\n MPI_Recv(recv_buf.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n C[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(C.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C\n std::vector localC(M * N, 0);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Each rank has a complete copy of A and B.\n // Store the result in C on rank 0.\n if (rank == 0) {\n C.clear();\n C.reserve(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C.push_back(sum);\n }\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result matrix\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local results into the global result matrix\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the matrix C\n size_t local_M = M / size;\n size_t local_N = N;\n\n // Compute the local part of the matrix C\n std::vector local_C(local_M * local_N);\n for (size_t i = 0; i < local_M; ++i) {\n for (size_t j = 0; j < local_N; ++j) 
{\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * local_N + j] = sum;\n }\n }\n\n // Gather the local parts of the matrix C into the global matrix C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(local_M * local_N);\n MPI_Recv(recv_buf.data(), local_M * local_N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < local_M * local_N; ++j) {\n C[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(local_C.data(), local_M * local_N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and the number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N / size);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N / size + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "cuda", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an MxN grid of threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\n__global__ void gemm(const double *A, const double *B, double *C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t row = blockIdx.x;\n size_t col = blockIdx.y;\n size_t i = threadIdx.x;\n size_t j = threadIdx.y;\n C[row * N + col] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * 
N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n double sum = 0;\n for (k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "(", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.y * blockDim.y + threadIdx.y;\n int j = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M && j < N) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and B. 
Store the result in C on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C to rank 0\n MPI_Gather(localC.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to zero\n C.assign(M * N, 0.0);\n\n // Compute the product of A and B in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(C.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the product of A and B\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n // Initialize C to zero\n C.assign(M * N, 0.0);\n\n // Compute C using MPI and OpenMP\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce C to rank 0\n MPI_Reduce(C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the local part of C\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the partial sum for each element of C\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the partial sums from all ranks to rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.resize(M * N);\n std::fill(C.begin(), C.end(), 0);\n\n // Compute the local part of C\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the 
local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; i++) {\n std::vector recvC(M * N / size);\n MPI_Recv(recvC.data(), M * N / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N / size; j++) {\n C[j] += recvC[j];\n }\n }\n } else {\n MPI_Send(localC.data(), M * N / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; ++i) {\n std::vector recvC(M * N / size);\n MPI_Recv(recvC.data(), M * N / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n std::copy(recvC.begin(), recvC.end(), C.begin() + i * M * N / size);\n }\n } else {\n MPI_Send(localC.data(), M * N / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n // Initialize C to 0\n C.assign(M * N, 0.0);\n\n // Perform the matrix multiplication in parallel\n #pragma opm parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result matrix\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local results into the global result matrix\n MPI_Gather(&local_C[0], M * N / size, MPI_DOUBLE, &C[0], M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot_product = 0.0;\n for (size_t k = 0; k < K; ++k) {\n dot_product += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = dot_product;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local portion of C\n std::vector local_C(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to rank 0\n MPI_Reduce(local_C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n 
double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix, x has N elements, and y has M elements.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(Kokkos::View &A, Kokkos::View &x, Kokkos::View &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) { y[i] += A(i, j) * x[j]; }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M),\n KOKKOS_LAMBDA(const size_t i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "mv", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n double dot = 0.0;\n for (size_t j = 0; j < N; ++j) { dot += A(i, j) * x(j); }\n y(i) = dot;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", 
+ "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA (const int i) {\n y(i) = 0;\n for (int j = 0; j < N; ++j) {\n y(i) += A(i, j) * x(j);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma openmp parallel for\n for (size_t i = 
0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n // Initialize y to 0\n y.assign(M, 0);\n\n // Compute the matrix-vector product in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double dot_product = 0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A[i * N + j] * x[j];\n }\n y[i] = dot_product;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0.0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "serial", + "prompt": "/* Multiply the matrix A by the vector x. 
Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) { dot += A[i * N + j] * x[j]; }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) {\n dot += A[i * N + j] * x[j];\n }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) {\n dot += A[i * N + j] * x[j];\n }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "hip", + "prompt": "/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least M threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\n__global__ void gemv(const double *A, const double *x, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int block_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = bid * block_size + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int i = tid / N;\n int j = tid % N;\n if (i < M && j < N) {\n y[i] += A[i * N + j] * x[j];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = bid * blockDim.x + tid;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; ++col) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) {\n y[tid] += A[tid * N + j] * x[j];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; ++col) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0.0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n int i = tid + bid * blockSize;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n size_t i = blockIdx.x * 
blockDim.x + threadIdx.x;\n if (i < M) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) { sum += A[i * N + j] * x[j]; }\n y[i] = sum;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n\n double sum = 0;\n for (int i = tid; i < M; i += blockSize) {\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n }\n y[tid] = sum;\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n\n y[tid] = 0;\n for (int j = 0; j < N; j++) {\n y[tid] += A[tid + bid * blockSize + j * M] * x[j];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and x. Store the result in y on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector multiplication\n std::vector local_y(M / size);\n for (size_t i = rank * (M / size); i < (rank + 1) * (M / size); ++i) {\n local_y[i - rank * (M / size)] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i - rank * (M / size)] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the local results into the global vector y on rank 0\n MPI_Gather(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial dot product on each rank\n std::vector partial_y(M, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n partial_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the partial dot products to get the final result\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank != 0) {\n y.clear();\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product.\n std::vector local_y(M);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the local results into the global vector y on rank 0.\n MPI_Gather(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, 
size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M / size);\n for (size_t i = 0; i < M / size; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Each rank computes a partial sum of the dot product\n std::vector partial_sums(M, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n partial_sums[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sums(M, 0.0);\n MPI_Reduce(partial_sums.data(), global_sums.data(), M, MPI_DOUBLE, MPI_SUM, 0, comm);\n\n // Store the result in y on rank 0\n if (rank == 0) {\n y = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n y = local_y;\n for 
(int i = 1; i < size; ++i) {\n std::vector recv_y(M);\n MPI_Recv(recv_y.data(), M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M; ++j) {\n y[j] += recv_y[j];\n }\n }\n } else {\n MPI_Send(local_y.data(), M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) { local_y[i] += A[i * N + j] * x[j]; }\n }\n\n MPI_Reduce(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the dot product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local dot products into the global vector y on rank 0\n MPI_Reduce(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "cuda", + "prompt": "/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least M threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\n__global__ void gemv(const double *A, const double *x, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n int tid = block.thread.x;\n int row = block.block.x;\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) {\n y[row] += A[row * N + col] * x[col];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int i = 0; i < N; i++) {\n y[tid] += A[tid * N + i] * x[i];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n for (int i = bid * bid_size + tid; i < M; i += bid_size) {\n double sum = 0;\n for (int j = 0; j < N; j++) { sum += A[i * N + j] * x[j]; }\n y[i] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n y[row] = 0;\n for (int k = 0; k < N; k++) {\n y[row] += A[row * N + k] * x[k];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n y[row] = 0;\n for (int i = 0; i < N; i++) {\n y[row] += A[row * N + i] * x[i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n\n double sum = 0;\n\n for (int i = bid * bid_size + tid; i < M; i += grid_size * bid_size) {\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n }\n\n y[tid] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * 
x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = bid * blockDim.x + tid;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and x. Store the result in y on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n\n // Gather the results on rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M, 0.0);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion 
of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root rank\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute local dot products\n std::vector local_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce local dot products to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n 
#pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n\n // Gather the results on rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize y to 0 on all ranks\n y.assign(M, 0.0);\n\n // Compute the local portion of y on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A(k, j) -= A(k, i) * A(i, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) -= A(j, i) * A(i, i);\n }\n });\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n // Iterate over the columns of the matrix\n for (size_t j = 0; j < N; ++j) {\n // Divide the elements of the current column by the diagonal element\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n A(i, j) /= A(j, j);\n });\n\n // Subtract the current column from the remaining columns\n for (size_t k = j + 1; k < N; ++k) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n A(i, k) -= A(i, j) * A(j, k);\n });\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A(i, j) /= A(i, i);\n } else if (i > j) {\n A(i, j) -= A(i, j) * A(j, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = 0; k < i; k++) {\n A(j, k) -= multiplier * A(i, k);\n }\n A(j, i) = multiplier;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = 
i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; ++k) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j][i] /= A[i][i];\n }\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::RangePolicy(i + 1, N),\n KOK_LAMBDA(const Kok\u30b3\u30af::IndexType &j) {\n for (size_t k = 0; k < i; k++) {\n A[j][i] -= A[j][k] * A[i][k];\n }\n });\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) = A(j, i) / A(i, i);\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A(k, j) = A(k, j) - A(k, i) * A(i, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n double m = A(j, i) / A(i, i);\n for (size_t k = i; k < N; k++) {\n A(j, k) -= m * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; ++k) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += A(i, k) * A(k, j);\n }\n A(i, j) -= sum;\n } else if (i > j) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A(i, k) * A(k, j);\n }\n A(i, j) = (A(i, j) - sum) / A(j, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. 
\n A is an NxN matrix stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; k++) {\n // Normalize the kth row\n double norm = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k; j < N; j++) {\n A[k * N + j] /= norm;\n }\n\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k];\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; ++k) {\n // Normalize the k-th row\n double norm = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k; j < N; ++j) {\n A[k * N + j] /= norm;\n }\n\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n double factor = A[i * N + k];\n for (size_t j = k; j < N; ++j) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n double factor = A[i * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= A[i * N + k] * factor;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n // Divide the current row by the diagonal element\n double factor = A[i * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n\n // Subtract multiples of the current row from all the rows below it\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + ".", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < 
N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n#pragma omp parallel for\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = i; j < N; j++) {\n#pragma omp parallel for\n for (size_t k = 0; k < i; k++) {\n A[j * N + i] -= A[j * N + k] * A[k * N + i];\n }\n A[j * N + i] /= A[i * N + i];\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; ++k) {\n // Compute the multiplier for row k\n double multiplier = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k + 1; j < N; ++j) {\n A[j * N + k] /= multiplier;\n }\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "serial", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. 
\n A is an NxN matrix stored in row-major.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + i] * A[j * N + j];\n } else {\n A[i * N + j] /= A[j * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; ++k) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] = A[j * N + i] / A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] = A[j * N + k] - A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + j];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + j] * A[j * N + j];\n } else {\n A[i * N + j] /= A[j * N + j];\n A[i * N + j] *= -1;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) 
{\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i * N + k] * A[k * N + j];\n }\n A[i * N + j] = (A[i * N + j] - sum) / A[j * N + j];\n }\n for (size_t j = i; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += A[i * N + k] * A[k * N + j];\n }\n A[i * N + j] = A[i * N + j] - sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A[j][i] / A[i][i];\n for (size_t k = i; k < N; k++) {\n A[j][k] -= multiplier * A[i][k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1;\n } else if (i > j) {\n A[i * N + j] = 0;\n }\n }\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "hip", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\n__global__ void luFactorize(double *A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x + blockIdx.y * gridDim.x;\n int col = threadIdx.x + threadIdx.y * blockDim.x;\n if (row < N && col < N) {\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double temp = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= temp * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = A[row * N + col] - sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x + blockIdx.y * gridDim.x;\n int col = threadIdx.x + threadIdx.y * blockDim.x;\n if (row < col) {\n A[row * N + col] = 0.0;\n } else if (row == col) {\n A[row * N + col] = 1.0;\n } else {\n A[row * N + col] = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= A[row * N + col] * A[col * N + k];\n }\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * gridDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.x;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double temp = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= temp * A[j * N + k];\n }\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double temp = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= temp * 
A[j * N + k];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[col * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[col * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int tid = blockIdx.x + blockIdx.y * gridDim.x;\n int i = tid / N;\n int j = tid % N;\n if (i < j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; ++k) { A[k * N + j] -= A[k * N + i] * A[i * N + j]; }\n } else if (i == j) {\n A[i * N + j] = 1;\n for (int k = i + 1; k < N; ++k) { A[k * N + j] /= A[i * N + i]; }\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n\n if (row == col) {\n double pivot = A[row * N + col];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] /= pivot;\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col + 1; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int k = row + 1; k < N; k++) {\n A[k * N + col] -= A[k * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; 
k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double factor = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= factor * A[j * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. Store the result in A on rank 0.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Forward elimination\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row to all other ranks\n MPI_Bcast(&A[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row below the k-th row\n for (size_t i = k + 1; i < N; i++) {\n // Compute the multiplier\n double multiplier = A[i * N + k] / A[k * N + k];\n\n // Subtract the k-th row from the i-th row\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= multiplier * A[k * N + j];\n }\n }\n }\n\n // Gather the result to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // For each column j in row i\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in the lower triangular matrix L\n if (i > j) {\n // Subtract the product of the ith row of L and the jth column of U from the ith row of A\n for (size_t k = 0; k < j; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n // If i <= j, then A[i][j] is in the upper triangular matrix U\n else {\n // Divide the ith row of A by the jth column of L\n for (size_t k = 0; k < j; k++) {\n A[i * N + j] /= A[j * N + k];\n }\n\n // Subtract the product of the ith row of L and the jth column of U from the ith row of A\n for (size_t k = j + 1; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations to eliminate elements below the diagonal\n for (size_t j = i + 1; j < N; j++) {\n double factor 
= A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // For each column j\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in the lower triangular matrix L\n if (i > j) {\n // Compute A[i][j] as A[i][j] - A[i][0] * A[0][j]\n A[i * N + j] -= A[i * N] * A[j];\n }\n // Otherwise, A[i][j] is in the upper triangular matrix U\n else {\n // Compute A[i][j] as A[i][j] - A[i][0] * A[0][j] / A[0][0]\n A[i * N + j] -= A[i * N] * A[j] / A[0];\n }\n }\n\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N + i], 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // For each column j\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in L\n if (i > j) {\n // Compute A[i][j]\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i][k] * A[k][j];\n }\n A[i][j] = (A[i][j] - sum) / A[j][j];\n }\n // If i <= j, then A[i][j] is in U\n else {\n // Compute A[i][j]\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i][k] * A[k][j];\n }\n A[i][j] = A[i][j] - sum;\n }\n }\n\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * 
N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization on rank 0\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n\n // Gather the result from rank 0 to all other ranks\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Normalize the k-th row of A\n double pivot = A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[k * N + j] /= pivot;\n }\n\n // Update the remaining rows of A\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all ranks\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix A from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Forward elimination\n for (size_t i = 0; i < N; i++) {\n // Broadcast the pivot row to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all other ranks\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 
0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // Rank 0 broadcasts the ith row of L to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each column j\n for (size_t j = i; j < N; j++) {\n // Compute A[i][j] -= A[i][k] * A[k][j] for k < i\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n\n // Divide A[i][j] by A[i][i]\n A[i * N + j] /= A[i * N + i];\n }\n }\n\n // Gather the result from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + j];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + j] * A[j * N + j];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i in A\n for (size_t i = 1; i < N; i++) {\n // If rank 0, compute the multiplier for row i\n double multiplier = 0;\n if (rank == 0) {\n multiplier = A[i * N] / A[0];\n }\n\n // Broadcast the multiplier to all ranks\n MPI_Bcast(&multiplier, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each element in row i, subtract the corresponding element in row 0 multiplied by the multiplier\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] -= multiplier * A[j];\n }\n }\n\n // If rank 0, divide each element in row 0 by A[0][0]\n if (rank == 0) {\n for (size_t j = 0; j < N; j++) {\n A[j] /= A[0];\n }\n }\n\n // For each row i in A\n for (size_t i = 1; i < N; i++) {\n // For each element in row i, divide by A[i][i]\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n\n // For each row k after row i, subtract the corresponding element in row i multiplied by A[k][i]\n for (size_t k = i + 1; k < N; k++) {\n double multiplier = A[k * N + i];\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= multiplier * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(A.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Each rank computes its own row of L and U\n for (size_t i = 1; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A[i * N + j] /= A[j * N + j];\n }\n for (size_t j = i; j < N; j++) {\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks into rank 0\n MPI_Gather(A.data(), N * N, MPI_DOUBLE, A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "cuda", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. 
\n A is an NxN matrix stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\n__global__ void luFactorize(double *A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double temp = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= temp * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row < col) {\n A[row * N + col] = A[row * N + col] / A[row * N + row];\n } else if (row == col) {\n for (int k = 0; k < row; k++) {\n A[row * N + col] -= A[row * N + k] * A[k * N + col];\n }\n } else {\n for (int k = 0; k < col; k++) {\n A[row * N + col] -= A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] /= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i == j) {\n A[i * N + i] /= A[i * N + j];\n for (int k = i + 
1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n for (int k = j; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "(", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) return;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else if (row > col) {\n double factor = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= factor * A[col * N + k];\n }\n A[row * N + col] = factor;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else {\n double factor = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= factor * A[col * N + k];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == 
col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col + 1; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
Store the result in A on rank 0.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "(5.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n // Divide the ith row by A[i][i]\n double div = A[i * N + i];\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= div;\n }\n\n // Subtract the ith row from all rows below it\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n }\n\n // Broadcast the result to all ranks\n MPI_Bcast(A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization using Gaussian elimination\n for (size_t k = 0; k < N; k++) {\n // Pivot if necessary\n size_t max_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > std::abs(A[max_row * N + k])) {\n max_row = i;\n }\n }\n if (max_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[max_row * N + j]);\n }\n }\n\n // Eliminate the k-th column below the k-th row\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&A[i * N * N], N * N, MPI::DOUBLE, i, 0);\n }\n } else {\n MPI::COMM_WORLD.Send(&A[0], N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // MPI rank\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // MPI size\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Number of rows per rank\n int rows_per_rank = N / size;\n\n // Local matrix\n std::vector local_A(rows_per_rank * N);\n\n // Scatter the matrix to all ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Local lu factorization\n for (size_t i = 0; i < rows_per_rank; i++) {\n for (size_t j = i + 1; j < N; j++) {\n local_A[i * N + j] /= local_A[i * N + i];\n }\n for (size_t k = i + 1; k < rows_per_rank; k++) {\n for (size_t j = i + 1; j < N; j++) {\n local_A[k * N + j] -= local_A[k * N + i] * local_A[i * N + j];\n }\n }\n }\n\n // Gather the results back to rank 0\n MPI_Gather(&local_A[0], rows_per_rank * N, MPI_DOUBLE, &A[0], rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n 
if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= multiplier * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the rank and number of threads\n int rank = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n // Compute the block size for each thread\n size_t block_size = N / num_threads;\n\n // Each thread factorizes its own block of rows\n for (size_t i = rank * block_size; i < (rank + 1) * block_size; i++) {\n // Pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap pivot row with current row\n if (pivot_row != i) {\n for (size_t k = 0; k < N; k++) {\n std::swap(A[i * N + k], A[pivot_row * N + k]);\n }\n }\n\n // Eliminate lower triangular elements\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n }\n\n // Gather the factorized matrix from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix A from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Divide the current row by the pivot element\n double pivot = A[i * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= pivot;\n }\n\n // Subtract multiples of the current row from the remaining rows\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (MPI_Comm_rank(MPI_COMM_WORLD) == 0) {\n for (int i = 1; i < 
MPI_Comm_size(MPI_COMM_WORLD); i++) {\n MPI_Recv(&A[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the rank and number of threads\n int rank = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n // Compute the block size for each thread\n size_t block_size = N / num_threads;\n\n // Each thread factorizes its own block of the matrix\n for (size_t i = rank * block_size; i < (rank + 1) * block_size; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n }\n\n // Gather the results from all ranks and store them in A on rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform Gaussian elimination\n for (size_t k = 0; k < N; k++) {\n // Pivot row\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > std::abs(A[pivot_row * N + k])) {\n pivot_row = i;\n }\n }\n\n // Swap pivot row with row k\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n }\n }\n\n // Eliminate non-zero elements below pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Get the number of processes\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes a part of the matrix\n size_t chunk_size = N / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = start_row + chunk_size;\n\n // Perform LU factorization on the assigned part of the matrix\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < i; j++) {\n double multiplier = A[i][j] / A[j][j];\n for (size_t k = j; k < N; k++) {\n A[i][k] -= multiplier * A[j][k];\n }\n }\n for (size_t j = i; j < N; j++) {\n A[i][j] /= A[i][i];\n }\n }\n\n // Gather the results from all processes\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * chunk_size * N], chunk_size * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[start_row * N], chunk_size * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows per process\n int 
rowsPerProcess = N / size;\n\n // Compute the starting row index for this process\n int startRow = rank * rowsPerProcess;\n\n // Compute the ending row index for this process\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Factorize the submatrix of A owned by this process\n for (int i = startRow; i < endRow; ++i) {\n // Compute the pivot element\n double pivot = A[i][i];\n\n // Broadcast the pivot element to all other processes\n MPI_Bcast(&pivot, 1, MPI_DOUBLE, rank, MPI_COMM_WORLD);\n\n // Eliminate the elements below the pivot element\n for (int j = i + 1; j < N; ++j) {\n A[j][i] -= pivot * A[j][i] / A[i][i];\n }\n }\n\n // Reduce the matrix A to upper triangular form\n for (int k = 0; k < N - 1; ++k) {\n // Compute the pivot element\n double pivot = A[k][k];\n\n // Broadcast the pivot element to all other processes\n MPI_Bcast(&pivot, 1, MPI_DOUBLE, rank, MPI_COMM_WORLD);\n\n // Eliminate the elements above the pivot element\n for (int i = 0; i < k; ++i) {\n A[i][k] -= pivot * A[i][k] / A[k][k];\n }\n }\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Broadcast the pivot row to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all ranks\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 performs the row reduction on the ith row\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int myRank, numProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numProcs);\n\n // Each rank computes its own part of the matrix\n #pragma omp parallel for\n for (size_t i = myRank; i < N; i += numProcs) {\n for (size_t j = 0; j < i; j++) {\n A[i * N + j] /= A[j * N + j];\n for (size_t k = j + 1; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n }\n\n // Gather the 
results to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize L and U\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n double max = std::abs(A[k * N + k]);\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > max) {\n max = std::abs(A[i * N + k]);\n pivot_row = i;\n }\n }\n\n // Swap the pivot row with the k-th row\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n }\n }\n\n // Divide the pivot row by the pivot element\n double pivot = A[k * N + k];\n for (size_t j = 0; j < N; j++) {\n A[k * N + j] /= pivot;\n }\n\n // Subtract multiples of the pivot row from the other rows\n for (size_t i = 0; i < N; i++) {\n if (i != k) {\n double factor = A[i * N + k];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n }\n\n // Gather the result to rank 0\n if (rank != 0) {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(N * N);\n MPI_Recv(&recv_buf[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n A[j] += recv_buf[j];\n }\n }\n }\n}\n", + "\n\n // Copy the matrix A to rank 0\n MPI_Allgather(MPI_IN_PLACE, 0, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_WORLD);\n\n // Perform LU factorization on rank 0\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] -= A[i * N + i] * A[j * N + i];\n }\n\n for (size_t k = i + 1; k < N; k++) {\n for (size_t l = i + 1; l < N; l++) {\n A[k * N + l] -= A[k * N + i] * A[l * N + i];\n }\n }\n }\n\n // Scatter the factorized matrix A from rank 0 to all other processes\n MPI_Scatter(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_WORLD);\n}\n", + "\n for (size_t k = 0; k < N; ++k) {\n // MPI_Bcast the k-th row of A to all ranks.\n MPI_Bcast(&A[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Update the k-th row of A on all ranks.\n for (size_t j = k + 1; j < N; ++j) {\n A[k * N + j] /= A[k * N + k];\n }\n\n // Update the remaining rows of A on all ranks.\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n\n // MPI_Reduce the updated A from all ranks to rank 0.\n MPI_Reduce(&A[0], &A[0], N * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* For each value in the array x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(Kokkos::View const& x, Kokkos::View &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n auto rank = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n auto sorted_x = x;\n Kokkos::sort(sorted_x.data(), sorted_x.extent(0));\n ranks(i) = Kokkos::find(sorted_x, x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::find(sorted_x, x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n using policy_t = Kokkos::RangePolicy;\n Kokkos::parallel_for(policy_t(x.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n auto rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using rank_functor = Kokkos::Functor>;\n\n Kokkos::parallel_for(\n \"ranks\", Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = Kokkos::rank(x, x(i), Kokkos::greater());\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n 
Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) > x(j)) { ranks(i) += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) { ranks(i) += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) {\n ranks(i) += 1;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x);\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n std::vector sorted_x(x.data(), x.data() + x.extent(0));\n std::sort(sorted_x.begin(), sorted_x.end());\n ranks(i) = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x(i)));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = x;\n std::sort(sorted_x.data(), sorted_x.data() + sorted_x.extent(0));\n ranks(i) = std::distance(sorted_x.data(),\n std::lower_bound(sorted_x.data(), sorted_x.data() + sorted_x.extent(0), x(i)));\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "omp", + "prompt": "#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs;\n pairs.reserve(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs.push_back(std::make_pair(x[i], i));\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j] < x[i]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n for (size_t j = 0; j < n; ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> y(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n y[i] = std::make_pair(x[i], i);\n }\n std::sort(y.begin(), y.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[y[i].second] = i;\n }\n}\n", + "\n\n // Create a copy of the input vector\n std::vector sorted_x = x;\n\n // Sort the copy\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks in parallel\n #pragma omp 
parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "(", + "\n\n size_t n = x.size();\n std::vector> pairs(n);\n for (size_t i = 0; i < n; ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n ranks.resize(n);\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "ranks", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n std::vector idx(n);\n std::iota(idx.begin(), idx.end(), 0);\n std::sort(idx.begin(), idx.end(), [&x](size_t i1, size_t i2) { return x[i1] < x[i2]; });\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n ranks[i] = idx[i];\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] == x[j]) {\n ranks[i] = j;\n break;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector indices(x.size());\n std::iota(indices.begin(), indices.end(), 0);\n std::sort(indices.begin(), indices.end(), [&x](size_t i1, size_t i2) { return x[i1] < x[i2]; });\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = indices[i];\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "serial", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n // Sort the vector x\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Create a map from the sorted values to their original indices\n std::map 
sorted_index_map;\n for (size_t i = 0; i < sorted_x.size(); ++i) {\n sorted_index_map[sorted_x[i]] = i;\n }\n\n // Iterate over the original vector and store the rank of each value\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = sorted_index_map[x[i]];\n }\n}\n", + "\n\n std::map value_to_rank;\n for (size_t i = 0; i < x.size(); ++i) {\n auto it = value_to_rank.find(x[i]);\n if (it == value_to_rank.end()) {\n value_to_rank[x[i]] = i;\n }\n }\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = value_to_rank[x[i]] + 1;\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks.push_back(std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i])));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(), std::find(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(), std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector x_sorted(x);\n std::sort(x_sorted.begin(), x_sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::lower_bound(x_sorted.begin(), 
x_sorted.end(), x[i]) - x_sorted.begin();\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n std::iota(ranks.begin(), ranks.end(), 0);\n std::sort(ranks.begin(), ranks.end(), [&](size_t i, size_t j) { return x[i] < x[j]; });\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "hip", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use AMD HIP to compute in parallel. The kernel will be launched with at least as many threads as elements in x.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\n__global__ void ranks(const float *x, size_t *ranks, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t tid = hipThreadIdx_x;\n if (tid < N) {\n float v = x[tid];\n size_t r = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < v) r++;\n }\n ranks[tid] = r;\n }\n}\n", + "\n\n // Get the index of the current thread\n const size_t tid = hipThreadIdx_x;\n\n // Compute the rank of the current element\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n float *ptr = x;\n size_t rank = 0;\n while (ptr < x + N && *ptr <= val) {\n if (*ptr == val) {\n rank++;\n }\n ptr++;\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float v = x[tid];\n int r = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < v) {\n ++r;\n }\n }\n ranks[tid] = r;\n }\n}\n", + "\n\n int idx = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (idx >= N) {\n return;\n }\n\n float my_value = x[idx];\n int my_rank = 1;\n for (int i = 0; i < N; i++) 
{\n if (x[i] < my_value) {\n my_rank++;\n }\n }\n ranks[idx] = my_rank;\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] < v) r++;\n }\n ranks[i] = r;\n }\n}\n", + "\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] < v) r++;\n }\n ranks[i] = r;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n float *sorted_x = (float *)hipMalloc(N * sizeof(float));\n hipMemcpy(sorted_x, x, N * sizeof(float), hipMemcpyHostToDevice);\n hipSort(sorted_x, sorted_x + N, hipSortAscending);\n for (int i = 0; i < N; i++) {\n if (sorted_x[i] == val) {\n ranks[gid] = i;\n break;\n }\n }\n hipFree(sorted_x);\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = hipThreadIdx_x;\n\n // Compute the rank of the current element\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n size_t idx = hipThreadIdx_x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n ranks[tid] = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < x[tid]) {\n ranks[tid]++;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n float val = x[idx];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every process has a complete copy of x. 
Store the result in ranks on process 0.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), ranks.size(), MPI_SIZE_T, MPI_MAX, 0,\n MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, ranks.size(), MPI_SIZE_T, MPI_MAX, 0,\n MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; i++) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Create a vector of ranks\n ranks.resize(n);\n\n // Get the rank of each value in x\n for (size_t i = 0; i < n; i++) {\n ranks[i] = std::find(indices.begin(), indices.end(), i) - indices.begin();\n }\n\n // Gather the ranks on process 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector ranks_i;\n MPI::COMM_WORLD.Recv(ranks_i.data(), ranks_i.size(), MPI::UNSIGNED_LONG, i, 0);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI::COMM_WORLD.Send(ranks.data(), ranks.size(), MPI::UNSIGNED_LONG, 0, 0);\n }\n}\n", + "\n\n // sort a copy of x on each process\n auto x_sorted = std::vector(x);\n std::sort(x_sorted.begin(), x_sorted.end());\n\n // find the rank of each value in x on each process\n std::vector> local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n auto it = std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]);\n local_ranks[i].push_back(std:: distance(x_sorted.begin(), it));\n }\n\n // gather the local_ranks on process 0\n std::gather(local_ranks, std::back_inserter(ranks), 0);\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; ++i) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Get the rank of each value in x\n ranks.resize(n);\n for (size_t i = 0; i < n; ++i) {\n ranks[i] = std::find(indices.begin(), indices.end(), i) - indices.begin();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, ranks.size(), MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n local_ranks[i]++;\n }\n }\n }\n\n std::vector global_ranks(x.size());\n MPI_Allgather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG,\n global_ranks.data(), x.size(), MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n ranks = global_ranks;\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; ++i) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Get the rank of each value in x\n ranks.resize(n);\n for (size_t i = 0; i < n; ++i) {\n ranks[indices[i]] = i;\n }\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n ranks = local_ranks;\n }\n\n MPI_Gather(&local_ranks[0], local_ranks.size(), MPI_UNSIGNED_LONG, &ranks[0],\n ranks.size(), MPI_UNSIGNED_LONG, 0, comm);\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the rank of each element in the sorted vector\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; ++i) {\n std::vector ranks_i;\n MPI_Recv(&ranks_i[0], ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Recv(recv_ranks.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n ranks[j] += recv_ranks[j];\n }\n }\n } else {\n MPI_Send(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = i;\n }\n\n std::sort(local_ranks.begin(), local_ranks.end(),\n [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n if (rank == 0) {\n ranks.resize(x.size());\n }\n\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(),\n MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector ranks_i;\n MPI_Recv(ranks_i.data(), ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // compute the ranks on each process\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // gather the ranks from all processes on process 0\n if (my_rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(),\n MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n 
MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG,\n 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the rank of each element in the sorted vector\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; ++i) {\n std::vector ranks_i;\n MPI_Recv(&ranks_i[0], ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector local_x = x;\n std::sort(local_x.begin(), local_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(local_x.begin(),\n std::lower_bound(local_x.begin(), local_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Recv(recv_ranks.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n ranks[j] += recv_ranks[j];\n }\n }\n } else {\n MPI_Send(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n size_t n = x.size();\n std::vector local_ranks(n);\n std::sort(x.begin(), x.end());\n for (size_t i = 0; i < n; ++i) {\n local_ranks[i] = std::lower_bound(x.begin(), x.end(), x[i]) - x.begin();\n }\n MPI_Gather(local_ranks.data(), n, MPI_UNSIGNED_LONG, ranks.data(), n, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "cuda", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use CUDA to compute in parallel. 
The kernel will be launched with at least as many threads as elements in x.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\n__global__ void ranks(const float *x, size_t *ranks, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n float v = x[i];\n int r = 0;\n for (size_t j = 0; j < N; ++j) {\n if (x[j] < v) {\n ++r;\n }\n }\n ranks[i] = r;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n ranks[i] = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n ranks[tid] = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ranks[tid]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float value = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n float *d_x = (float *)x;\n for (size_t i = 0; i < N; i++) {\n if (d_x[i] < val) {\n ranks[idx]++;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) rank++;\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t i = block.thread_id.x + block.block_id.x * block.block_dim.x;\n if (i < N) {\n float val = x[i];\n std::vector y(x, x + N);\n std::sort(y.begin(), y.end());\n ranks[i] = std::lower_bound(y.begin(), y.end(), val) - y.begin();\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] <= v) {\n r++;\n }\n }\n ranks[i] = r - 1;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n float *sorted_x = new float[N];\n thrust::copy(x, x + N, sorted_x);\n thrust::sort(sorted_x, sorted_x + N);\n ranks[idx] = thrust::lower_bound(sorted_x, sorted_x + N, val) - sorted_x;\n delete[] sorted_x;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n float *d_x = (float *)x;\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (d_x[i] < val) {\n 
++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = threadIdx.x;\n\n // Compute the rank of the current element\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n ranks[idx] = 0;\n for (int i = 0; i < N; ++i) {\n if (x[i] < x[idx]) {\n ranks[idx]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = threadIdx.x;\n\n // Compute the rank of the current element\n float current_element = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < current_element) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every process has a complete copy of x. 
Store the result in ranks on process 0.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the number of processes\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Create a vector of ranks on each process\n std::vector local_ranks(x.size());\n\n // Compute the ranks on each process\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n // Sort the local copy of x\n std::vector local_x = x;\n std::sort(local_x.begin(), local_x.end());\n\n // Find the rank of x[i] in the sorted vector\n auto it = std::lower_bound(local_x.begin(), local_x.end(), x[i]);\n local_ranks[i] = std::distance(local_x.begin(), it);\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG,\n ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]) - sorted_x.begin();\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector to store the ranks\n ranks.resize(n);\n\n // Sort the vector\n std::sort(x.begin(), x.end());\n\n // Find the rank of each element in the sorted vector\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n ranks[i] = std::distance(x.begin(), std::find(x.begin(), x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (ranks.size() > 0) {\n MPI_Gather(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG,\n &ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int world_size;\n MPI_Comm_size(MPI_COMM_WORLD, &world_size);\n\n std::vector local_x(x.begin() + my_rank * x.size() / world_size,\n x.begin() + (my_rank + 1) * x.size() / world_size);\n std::vector local_ranks(local_x.size());\n\n #pragma omp parallel for\n for (size_t i = 0; i < local_x.size(); 
++i) {\n for (size_t j = 0; j < local_x.size(); ++j) {\n if (local_x[i] < local_x[j]) {\n ++local_ranks[i];\n }\n }\n }\n\n MPI_Reduce(local_ranks.data(), ranks.data(), local_ranks.size(), MPI_SIZE_T, MPI_SUM, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector x_sorted = x;\n std::sort(x_sorted.begin(), x_sorted.end());\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n ranks[i] = std::distance(x_sorted.begin(), std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]));\n }\n if (my_rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), n, MPI_SIZE_T, MPI_MAX, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, n, MPI_SIZE_T, MPI_MAX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the ranks of each element in the sorted vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_ranks;\n MPI_Recv(&recv_ranks[0], x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), recv_ranks.begin(), recv_ranks.end());\n }\n } else {\n MPI_Send(&ranks[0], x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::sort(local_x.begin(), local_x.end());\n\n std::vector local_ranks(local_x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < local_x.size(); ++i) {\n local_ranks[i] = std::distance(x.begin(), std::lower_bound(x.begin(), x.end(), local_x[i]));\n }\n\n MPI_Gather(local_ranks.data(), local_ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), local_ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a copy of x on each process\n std::vector local_x = x;\n\n // Sort the local copy of x\n std::sort(local_x.begin(), local_x.end());\n\n // Create a vector to store the ranks\n std::vector local_ranks(x.size());\n\n // Compute the ranks of the elements in the local copy of x\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(local_x.begin(), std::find(local_x.begin(), local_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Only process 0 has the complete list of ranks\n if (rank == 0) {\n ranks = std::move(ranks);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]) - sorted_x.begin();\n }\n\n // Gather the ranks 
from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n ranks.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n if (my_rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), n, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, n, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n local_ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector local_ranks(x.size());\n #pragma opm parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(x.begin(), std::min_element(x.begin(), x.end()));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Status status;\n MPI_Probe(i, 0, MPI_COMM_WORLD, &status);\n MPI_Get_count(&status, MPI_BYTE, &ranks.size());\n MPI_Send(&local_ranks, ranks.size(), MPI_BYTE, i, 0, MPI_COMM_WORLD);\n }\n } else {\n MPI_Send(&local_ranks, local_ranks.size(), MPI_BYTE, 0, 0, 
MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t n = x.size();\n std::vector local_ranks(n);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n local_ranks[i] = std::distance(x.begin(), std::min_element(x.begin(), x.end()));\n }\n\n MPI_Gather(local_ranks.data(), n, MPI_UNSIGNED_LONG, ranks.data(), n, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int world_size;\n MPI_Comm_size(MPI_COMM_WORLD, &world_size);\n int local_size = x.size() / world_size;\n int start = my_rank * local_size;\n int end = start + local_size;\n if (my_rank == world_size - 1) {\n end = x.size();\n }\n std::vector local_ranks(local_size);\n std::vector local_x(local_size);\n std::copy(x.begin() + start, x.begin() + end, local_x.begin());\n std::sort(local_x.begin(), local_x.end());\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_ranks[i] = std::distance(local_x.begin(), std::lower_bound(local_x.begin(), local_x.end(), x[start + i]));\n }\n if (my_rank == 0) {\n ranks.resize(x.size());\n }\n MPI_Gather(local_ranks.data(), local_size, MPI_UNSIGNED_LONG, ranks.data(), local_size, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), x.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "(", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n // Each process sorts its own copy of x\n std::vector x_sorted = x;\n std::sort(x_sorted.begin(), x_sorted.end());\n\n // Each process computes the ranks of its own elements\n std::vector ranks_local(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks_local[i] = std::distance(x_sorted.begin(), std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]));\n }\n\n // Process 0 gathers the ranks from all processes\n if (my_rank == 0) {\n ranks = ranks_local;\n for (int i = 1; i < MPI_COMM_WORLD->size; ++i) {\n std::vector ranks_recv(x.size());\n MPI_Recv(ranks_recv.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_recv.begin(), ranks_recv.end());\n }\n } else {\n MPI_Send(ranks_local.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use Kokkos to sort in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Create a view of the input array that ignores zero valued elements\n auto x_nz = view(\"x_nz\", x, LayoutRight(), {x.extent(0), 0});\n // Sort the non-zero elements in ascending order\n parallel_sort(x_nz.extent(0), KOKKOS_LAMBDA(const int i) {\n x_nz(i) = x(i);\n });\n // Copy the sorted non-zero elements back to the input array\n parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x_nz(i);\n });\n}\n", + "\n\n auto exec_space = x.execution_space();\n auto view_size = x.extent(0);\n auto lambda = KOKKOS_LAMBDA(const int& i) {\n return x(i) != 0;\n };\n auto mask = Kokkos::create_mask(exec_space, view_size, lambda);\n auto sorted = Kokkos::sort(exec_space, mask, x);\n Kokkos::deep_copy(x, sorted);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = std::numeric_limits::max();\n }\n });\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i == std::numeric_limits::max()) {\n x_i = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sortIgnoreZero\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n // Create a mask to identify non-zero elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"init_mask\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the non-zero elements\n Kokkos::sort(mask, x);\n\n // Shift the non-zero elements to the beginning of the array\n int non_zero_count = 0;\n Kokkos::parallel_for(\"shift_non_zero\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (mask(i)) {\n x(non_zero_count++) = x(i);\n }\n });\n\n // Fill the remaining elements with zeros\n Kokkos::parallel_for(\"fill_zeros\", Kokkos::RangePolicy(non_zero_count, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = 0;\n });\n}\n", + "\n\n // Create a mask to identify non-zero elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"mask_init\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the non-zero elements\n Kokkos::sort(mask, x);\n\n // Shift the non-zero elements to the beginning of the array\n Kokkos::parallel_for(\"shift_non_zero\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (mask(i)) {\n Kokkos::atomic_fetch_add(&x(0), x(i));\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i != 0) {\n x_i = std::numeric_limits::max();\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i == 
std::numeric_limits::max()) {\n x_i = 0;\n }\n });\n}\n", + "\n\n using namespace Kokkos;\n\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n auto lambda = KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n x(i) = x(i) < 0 ? -x(i) : x(i);\n }\n };\n parallel_for(\"sortIgnoreZero\", policy, lambda);\n parallel_sort(policy, x.data(), x.data() + x.extent(0));\n lambda = KOKKOS_LAMBDA(const int &i) {\n if (x(i) < 0) {\n x(i) = -x(i);\n }\n };\n parallel_for(\"sortIgnoreZero\", policy, lambda);\n}\n", + "\n\n Kokkos::parallel_for(\"sortIgnoreZero\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::View y = x.filter([](int x) { return x != 0; });\n Kok\u30b3\u30af::sort(y);\n Kok\u30b3\u30af::View z = x.map([](int x) { return x == 0 ? 0 : y[0]; });\n y.copy_to(z);\n}\n", + "\n\n // Create a mask to ignore zero valued elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"mask\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the array using the mask\n Kokkos::sort(mask, x);\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i, int &count) {\n if (x(i) != 0) { count++; }\n });\n\n // Create a temporary array to store the non-zero elements\n Kokkos::View y(\"y\", count);\n\n // Copy the non-zero elements to the temporary array\n int j = 0;\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) { y(j++) = x(i); }\n });\n\n // Sort the temporary array\n Kokkos::sort(y);\n\n // Copy the sorted non-zero elements back to the original array\n j = 0;\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) { x(i) = y(j++); }\n });\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n using exec_space = Kokkos::DefaultExecutionSpace;\n using view_t = Kokkos::View;\n\n auto x_nonzero = view_t(\"x_nonzero\", x.extent(0));\n auto x_nonzero_count = 0;\n\n Kokkos::parallel_for(\n \"sortIgnoreZero\", policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) {\n x_nonzero(x_nonzero_count++) = x(i);\n }\n });\n\n Kokkos::sort(x_nonzero, x_nonzero_count);\n\n x_nonzero_count = 0;\n Kokkos::parallel_for(\n \"sortIgnoreZero\", policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) {\n x(i) = x_nonzero(x_nonzero_count++);\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i == 0) {\n x_i = x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), 
KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i == 0) {\n x_i = x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use OpenMP to sort x in parallel. \n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n ++count;\n }\n }\n\n // Create a vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j];\n j++;\n }\n }\n}\n", + "\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(non_zero.begin(), non_zero.end());\n }\n }\n\n // Insert the non-zero elements back into x\n int j = 0;\n for (int i = 
0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + ".", + "\n\n std::vector temp;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n temp.push_back(x[i]);\n }\n }\n std::sort(temp.begin(), temp.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = temp[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int n = 0;\n #pragma omp parallel for reduction(+:n)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n n++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(n);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[index++] = x[i];\n }\n }\n\n // Sort the new vector\n 
std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[index++];\n }\n }\n}\n", + "\n\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n std::sort(non_zero.begin(), non_zero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count non-zero elements\n int n = 0;\n #pragma omp parallel for reduction(+:n)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n n++;\n }\n }\n\n // Create a vector to store non-zero elements\n std::vector y(n);\n\n // Copy non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the non-zero elements\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n ++count;\n }\n }\n\n // Create a new vector to store the non-zero 
elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Create a vector to store the non-zero elements\n std::vector non_zero;\n\n // Iterate over the input vector and copy non-zero elements to the new vector\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Copy the sorted non-zero elements back to the input vector\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j++];\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "serial", + "prompt": "/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place.\n Example:\n\n\t input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a == 0 && b != 0) || a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return a != 0 && (b == 0 || a < b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a != 0 && b != 0) ? a < b : a == 0;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::stable_partition(x.begin(), x.end(),\n [](int a) { return a != 0; });\n std::sort(x.begin(), x.end());\n\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a == 0 && b != 0) || (a != 0 && b == 0) ? 
a > b : a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(),\n [](int a, int b) { return a != 0 && (a < b || b == 0); });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) {\n return false;\n } else if (b == 0) {\n return true;\n } else {\n return a < b;\n }\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::vector y;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) { y.push_back(x[i]); }\n }\n std::sort(y.begin(), y.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector sorted;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted.push_back(x[i]);\n }\n }\n std::sort(sorted.begin(), sorted.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sorted[j];\n j++;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "hip", + "prompt": "/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use AMD HIP to compute in parallel. The kernel will be launched with 1 thread per element.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\n__global__ void sortIgnoreZero(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int temp = x[tid];\n int i = tid - 1;\n while (i >= 0 && x[i] > temp) {\n x[i + 1] = x[i];\n i--;\n }\n x[i + 1] = temp;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n // sort non-zero elements\n __shared__ int sdata[256];\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n int stride = blockDim.x * gridDim.x;\n int offset = bid * stride;\n int count = 0;\n int val = x[gid];\n if (val != 0) {\n sdata[tid] = val;\n count++;\n }\n __syncthreads();\n int n = count;\n int i = tid;\n while (i < n) {\n sdata[i] = sdata[tid];\n i += stride;\n }\n __syncthreads();\n i = tid;\n while (i < n) {\n int j = i + 1;\n while (j < n) {\n if (sdata[i] > sdata[j]) {\n int tmp = sdata[i];\n sdata[i] = sdata[j];\n sdata[j] = tmp;\n }\n j += stride;\n }\n i += stride;\n }\n __syncthreads();\n i = tid;\n while (i < n) {\n x[offset + i] = sdata[i];\n i += stride;\n }\n}\n", + "\n\n // Find the number of non-zero elements\n int num_non_zero = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n num_non_zero++;\n }\n }\n\n // Create a temporary array to store the non-zero elements\n int *temp = new int[num_non_zero];\n\n // Copy the non-zero elements to the temporary array\n int j = 0;\n for (size_t i = 
0; i < N; i++) {\n if (x[i] != 0) {\n temp[j] = x[i];\n j++;\n }\n }\n\n // Sort the temporary array\n std::sort(temp, temp + num_non_zero);\n\n // Copy the sorted non-zero elements back to the original array\n j = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n x[i] = temp[j];\n j++;\n }\n }\n\n // Delete the temporary array\n delete[] temp;\n}\n", + "\n\n\t// Find the number of non-zero elements\n\tint count = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tcount++;\n\t\t}\n\t}\n\n\t// Create an array to store the non-zero elements\n\tint *y = new int[count];\n\n\t// Copy the non-zero elements to the new array\n\tint j = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\ty[j] = x[i];\n\t\t\tj++;\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tstd::sort(y, y + count);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tj = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = y[j];\n\t\t\tj++;\n\t\t}\n\t}\n\n\t// Delete the temporary array\n\tdelete[] y;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i - 1] > x[i]) {\n int temp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = temp;\n i--;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\tint j = tid + 1;\n\t\twhile (j < N && x[j] == 0) {\n\t\t\tj++;\n\t\t}\n\t\tif (j < N) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[j];\n\t\t\tx[j] = tmp;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N && x[i] != 0) {\n int j = i + 1;\n while (j < N && x[j] == 0) {\n j++;\n }\n if (j < N) {\n int temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i - 1] > x[i]) {\n int tmp = x[i - 1];\n x[i - 1] = x[i];\n x[i] = tmp;\n i--;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = tmp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint temp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n for (int i = tid + 1; i < N; i++) {\n if (x[i] != 0 && x[i] < x[tid]) {\n int tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n 
i--;\n }\n }\n }\n}\n", + "\n\n // Sort the non-zero elements\n thrust::sort(thrust::device_ptr(x), thrust::device_ptr(x + N), thrust::greater());\n\n // Count the number of non-zero elements\n int num_non_zero = thrust::count(thrust::device_ptr(x), thrust::device_ptr(x + N), thrust::not_equal_to(0));\n\n // Move the non-zero elements to the beginning of the array\n thrust::copy(thrust::device_ptr(x), thrust::device_ptr(x + num_non_zero), thrust::device_ptr(x));\n\n // Fill the remaining elements with zeros\n thrust::fill(thrust::device_ptr(x + num_non_zero), thrust::device_ptr(x + N), 0);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tif (x[tid] != 0) {\n\t\t\tint i = tid;\n\t\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\t\tint temp = x[i];\n\t\t\t\tx[i] = x[i - 1];\n\t\t\t\tx[i - 1] = temp;\n\t\t\t\ti--;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Sort the array in ascending order ignoring elements with value 0.\n\t// Leave zero valued elements in-place.\n\tint my_rank = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (my_rank < N && x[my_rank] != 0) {\n\t\tfor (int i = my_rank + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[my_rank]) {\n\t\t\t\tint temp = x[my_rank];\n\t\t\t\tx[my_rank] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use MPI to sort x in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Store the result in x on rank 0.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n for (int i = count; i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n // Gather all non-zero elements to rank 0\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(nonZero.data(), nonZero.size(), MPI_INT, x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Get the size of the communicator\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the sorted elements\n std::vector sorted_x;\n\n // Gather all the non-zero elements from all the ranks\n std::vector non_zero_elements;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_elements.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n\n // Scatter the sorted non-zero elements to all the ranks\n MPI_Scatter(non_zero_elements.data(), non_zero_elements.size() / size, MPI_INT,\n sorted_x.data(), non_zero_elements.size() / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into the original vector\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sorted_x[index++];\n }\n }\n}\n", + "\n\n // Get the size and rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size() * size);\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n if (rank != 0) {\n non_zero.resize(sorted_non_zero.size() / size);\n }\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into the original vector\n int index = 0;\n for (int i = 0; i < 
x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[index++];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size());\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, x.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with zeros\n for (int i = non_zero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int n = x.size();\n std::vector y(n);\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n y[count] = x[i];\n count++;\n }\n }\n std::sort(y.begin(), y.begin() + count);\n int index = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = y[index];\n index++;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_elements[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(non_zero_elements.data(), count / size, MPI_INT, x.data(), count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n for (int i = count / size; i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector x_non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x_non_zero.push_back(x[i]);\n }\n }\n std::sort(x_non_zero.begin(), x_non_zero.end());\n int count = x_non_zero.size();\n MPI_Gather(&count, 1, MPI_INT, NULL, 0, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n int total_count = 0;\n for (int i = 0; i < size; i++) {\n int count_i;\n 
MPI_Gather(&count_i, 1, MPI_INT, &count_i, 1, MPI_INT, i, MPI_COMM_WORLD);\n total_count += count_i;\n }\n x.resize(total_count);\n }\n MPI_Gatherv(x_non_zero.data(), x_non_zero.size(), MPI_INT, x.data(), NULL, NULL, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sortedNonZero;\n if (rank == 0) {\n sortedNonZero.resize(nonZero.size() * size);\n }\n MPI_Gather(nonZero.data(), nonZero.size(), MPI_INT, sortedNonZero.data(), nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements to all ranks\n if (rank != 0) {\n sortedNonZero.resize(nonZero.size());\n }\n MPI_Scatter(sortedNonZero.data(), nonZero.size(), MPI_INT, sortedNonZero.data(), nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into x\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sortedNonZero[index++];\n }\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Insert the non-zero elements back into the original vector\n int j = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Gather the non-zero elements from all ranks\n MPI_Gather(&x[0], x.size(), MPI_INT, &y[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements\n if (rank == 0) {\n std::sort(y.begin(), y.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&y[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); 
i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size());\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, x.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with zeros\n for (int i = non_zero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sortedNonZero;\n MPI_Gather(&nonZero[0], nonZero.size(), MPI_INT, &sortedNonZero[0], nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements to all ranks\n MPI_Scatter(&sortedNonZero[0], nonZero.size(), MPI_INT, &x[0], nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n for (int i = nonZero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int myrank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (myrank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // 
Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero_x.begin(), non_zero_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(non_zero_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n MPI_Gather(&non_zero[0], non_zero.size(), MPI_INT, &sorted_non_zero[0], non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&sorted_non_zero[0], non_zero.size(), MPI_INT, &x[0], non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the zero elements back into the vector\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = sorted_non_zero[index++];\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "cuda", + "prompt": "/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use CUDA to compute in parallel. 
The kernel will be launched with 1 thread per element.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\n__global__ void sortIgnoreZero(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the number of non-zero elements\n\tint num_non_zero = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnum_non_zero++;\n\t\t}\n\t}\n\n\t// Allocate memory for the non-zero elements\n\tint *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n\t// Copy the non-zero elements to the new array\n\tint non_zero_index = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnon_zero_elements[non_zero_index] = x[i];\n\t\t\tnon_zero_index++;\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tthrust::sort(non_zero_elements, non_zero_elements + num_non_zero);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tnon_zero_index = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = non_zero_elements[non_zero_index];\n\t\t\tnon_zero_index++;\n\t\t}\n\t}\n\n\t// Free the memory allocated for the non-zero elements\n\tfree(non_zero_elements);\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid >= N) return;\n\tif (x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[tid] > x[i]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = tmp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + ".", + "\n\n\tint tid = threadIdx.x;\n\tif (x[tid] != 0) {\n\t\tx[tid] = x[tid] > 0 ? 
x[tid] : -x[tid];\n\t}\n}\n", + "\n\n\t// Each thread sorts its own element\n\tif (x[threadIdx.x] != 0) {\n\t\tfor (int i = threadIdx.x + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[threadIdx.x] > x[i]) {\n\t\t\t\tint temp = x[threadIdx.x];\n\t\t\t\tx[threadIdx.x] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Find the number of non-zero elements\n\tint num_non_zero = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnum_non_zero++;\n\t\t}\n\t}\n\n\t// Allocate memory for the non-zero elements\n\tint *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n\t// Copy the non-zero elements to the new array\n\tint idx = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnon_zero_elements[idx++] = x[i];\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tthrust::sort(non_zero_elements, non_zero_elements + num_non_zero);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tidx = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = non_zero_elements[idx++];\n\t\t}\n\t}\n\n\t// Free the memory allocated for the non-zero elements\n\tfree(non_zero_elements);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = bid * blockDim.x + tid;\n\n\tif (i < N && x[i] != 0) {\n\t\tint j = i + 1;\n\t\twhile (j < N && x[j] == 0) {\n\t\t\tj++;\n\t\t}\n\t\tif (j < N) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[j];\n\t\t\tx[j] = tmp;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i - 1] > x[i]) {\n\t\t\tint temp = x[i - 1];\n\t\t\tx[i - 1] = x[i];\n\t\t\tx[i] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "Ignore the zero value in the list.", + "\n\n\t// Get the thread index\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\n\t// If the element is not zero, sort it\n\tif (x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint start = bid * bid_size + tid;\n\tint end = start + bid_size;\n\tif (start >= N) return;\n\tif (end > N) end = N;\n\tint count = 0;\n\tfor (int i = start; i < end; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[count++] = x[i];\n\t\t}\n\t}\n\tfor (int i = count; i < end; i++) {\n\t\tx[i] = 0;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tswap(x[i], x[i - 1]);\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint temp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\t// Sort the non-zero elements\n\tint *d_sorted = (int *)malloc(N * sizeof(int));\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\td_sorted[count] = x[i];\n\t\t\tcount++;\n\t\t}\n\t}\n\tthrust::sort(d_sorted, d_sorted + count);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tcount = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = d_sorted[count];\n\t\t\tcount++;\n\t\t}\n\t}\n\tfree(d_sorted);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint val = x[tid];\n\tif (val == 0) {\n\t\treturn;\n\t}\n\tint i = tid;\n\twhile (i > 0 && x[i - 1] > val) 
{\n\t\tx[i] = x[i - 1];\n\t\ti--;\n\t}\n\tx[i] = val;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tif (x[tid] != 0) {\n\t\t\tint i = tid;\n\t\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\t\tswap(x[i], x[i - 1]);\n\t\t\t\ti--;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n // Find the number of non-zero elements\n int num_non_zero = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n num_non_zero++;\n }\n }\n\n // Create an array to store the non-zero elements\n int *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n // Copy the non-zero elements to the new array\n int index = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n non_zero_elements[index] = x[i];\n index++;\n }\n }\n\n // Sort the non-zero elements\n qsort(non_zero_elements, num_non_zero, sizeof(int), compare);\n\n // Copy the sorted non-zero elements back to the original array\n index = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n x[i] = non_zero_elements[index];\n index++;\n }\n }\n\n // Free the memory allocated for the non-zero elements array\n free(non_zero_elements);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use MPI and OpenMP to sort x in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the number of processes\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Create a vector to store the sorted values\n std::vector sorted(x.size());\n\n // Gather the sorted values from each rank\n MPI_Gather(x.data(), x.size(), MPI_INT, sorted.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // If the current rank is 0, store the sorted values in x\n if (rank == 0) {\n x = sorted;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort x on each rank\n std::sort(x.begin(), x.end());\n\n // Gather the sorted x from all ranks to rank 0\n std::vector gathered_x;\n if (rank == 0) {\n gathered_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, gathered_x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the gathered x on rank 0\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end());\n x = gathered_x;\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Scatter the sorted non-zero elements to all ranks\n MPI_Scatter(non_zero.data(), non_zero.size() / nprocs, MPI_INT,\n x.data(), non_zero.size() / nprocs, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Gather the sorted non-zero elements from all ranks to rank 0\n MPI_Gather(x.data(), non_zero.size() / nprocs, MPI_INT,\n non_zero.data(), non_zero.size() / nprocs, MPI_INT, 0, MPI_COMM_WORLD);\n\n // If rank is 0, insert the sorted non-zero elements into x\n if (rank == 0) {\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n continue;\n }\n x[i] = non_zero[index++];\n }\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector x_nonzero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x_nonzero.push_back(x[i]);\n }\n }\n std::sort(x_nonzero.begin(), x_nonzero.end());\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = x_nonzero[count];\n count++;\n }\n }\n if (my_rank == 0) {\n x.resize(n);\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n std::vector x_local(n);\n std::vector x_sorted(n);\n std::copy(x.begin(), x.end(), x_local.begin());\n std::sort(x_local.begin(), x_local.end(), [](int a, int b) { return a != 0 && (a < b); });\n MPI_Gather(x_local.data(), n, MPI_INT, x_sorted.data(), n, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n x = x_sorted;\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector non_zero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n std::sort(non_zero.begin(), non_zero.end());\n int non_zero_size = non_zero.size();\n int recv_count[MPI_COMM_SIZE];\n int recv_displs[MPI_COMM_SIZE];\n MPI_Gather(&non_zero_size, 1, MPI_INT, recv_count, 1, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n recv_displs[0] = 0;\n for (int i = 1; i < MPI_COMM_SIZE; i++) {\n recv_displs[i] = recv_displs[i - 1] + recv_count[i - 1];\n }\n }\n MPI_Gatherv(non_zero.data(), non_zero_size, MPI_INT, x.data(), recv_count, recv_displs, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n int j = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] == 0) {\n continue;\n }\n x[j] = x[i];\n j++;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Allocate memory for the non-zero elements\n std::vector 
y(count);\n\n // Copy the non-zero elements to y\n int index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[index++] = x[i];\n }\n }\n\n // Sort y\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to x on rank 0\n if (rank == 0) {\n index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[index++];\n }\n }\n }\n}\n", + "\n\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n int n = x.size();\n std::vector x_nonzero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x_nonzero.push_back(x[i]);\n }\n }\n std::sort(x_nonzero.begin(), x_nonzero.end());\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = x_nonzero[count];\n count++;\n }\n }\n if (myrank == 0) {\n x.resize(n);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the gathered non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector nonZero;\n if (rank == 0) {\n nonZero.resize(count);\n }\n MPI_Gather(&x[0], x.size(), MPI_INT, &nonZero[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(nonZero.begin(), nonZero.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&nonZero[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted.begin(), sorted.end());\n\n // Scatter the 
sorted non-zero elements to rank 0\n MPI_Scatter(sorted.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Allocate memory for the sorted array\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n MPI_Gather(&x[0], x.size(), MPI_INT, &sorted_x[0], global_count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the gathered array on rank 0\n if (my_rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end());\n }\n\n // Scatter the sorted array back to all ranks\n MPI_Scatter(&sorted_x[0], global_count / nprocs, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 fills in the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Create a vector of size x.size() to store the sorted values\n std::vector sorted_x(x.size());\n\n // Create a vector of size x.size() to store the indices of the sorted values\n 
std::vector sorted_indices(x.size());\n\n // Sort the indices of x in ascending order\n std::iota(sorted_indices.begin(), sorted_indices.end(), 0);\n std::sort(sorted_indices.begin(), sorted_indices.end(),\n [&x](int i, int j) { return x[i] < x[j]; });\n\n // Create a vector of size x.size() to store the sorted values\n std::vector sorted_x(x.size());\n\n // Copy the sorted values from x into sorted_x\n for (int i = 0; i < x.size(); i++) {\n sorted_x[i] = x[sorted_indices[i]];\n }\n\n // Broadcast the sorted values to all ranks\n MPI_Bcast(&sorted_x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Copy the sorted values into x on rank 0\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to their original positions\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = i;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted.begin(), sorted.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) 
{\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) count++;\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) std::sort(non_zero.begin(), non_zero.end());\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) x[i] = -1;\n }\n\n // Sort the elements in-place on each rank\n std::sort(x.begin(), x.end());\n\n // Replace -1 with 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == -1) x[i] = 0;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(Kokkos::View const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View sorted_x(\"sorted_x\", x.size());\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i) {\n sorted_x(i) = x(i);\n });\n Kokkos::sort(sorted_x.data(), sorted_x.data() + sorted_x.size());\n return sorted_x(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a Kokkos view of the input vector\n auto x_view = Kokkos::create_mirror_view(x);\n\n // Create a Kokkos view of the output vector\n auto y_view = Kokkos::create_mirror_view(1);\n\n // Create a Kokkos kernel to find the k-th smallest element\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x_view.extent(0)),\n KOKKOS_LAMBDA(const int i, int& y) {\n if (i < k) {\n y = std::min(y, x_view(i));\n }\n },\n y_view);\n\n // Copy the output vector back to the host\n int y;\n Kokkos::deep_copy(y, y_view);\n\n return y;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n\n // Find the k-th smallest element using a parallel reduction.\n int kth_smallest = parallel_reduce(\n Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i < k) {\n kth_smallest = std::min(kth_smallest, x(i));\n }\n },\n k);\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n auto x_copy = Kokkos::View(n);\n Kokkos::deep_copy(x_copy, x);\n\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& result) {\n if (i == k - 1) {\n result = x_copy[i];\n }\n }, tag_type());\n\n return x_copy[k - 1];\n}\n", + "\n\n Kokkos::View y(\"y\", x.size());\n Kokkos::parallel_for(\"copy\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i) {\n y(i) = x(i);\n });\n Kokkos::sort(y);\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n using l_d_type = 
Kokkos::LayoutLeft;\n using l_d_view_type = Kokkos::View;\n\n const int n = x.extent(0);\n l_d_view_type y(\"y\", n);\n\n parallel_for(policy_type(0, n), KOKKOS_LAMBDA(const int& i) { y(i) = x(i); }, tag_type());\n\n Kokkos::sort(y);\n\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n auto x_copy = Kokkos::View(n);\n Kokkos::deep_copy(x_copy, x);\n\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& result) {\n if (i == k - 1) {\n result = x_copy(i);\n }\n }, tag_type());\n\n return x_copy(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n int n = x.extent(0);\n if (k > n) {\n throw std::invalid_argument(\"k cannot be greater than the size of the vector\");\n }\n\n auto kth_smallest = parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(const int& i, int& kth) {\n if (i < k) {\n kth = std::min(kth, x(i));\n }\n },\n std::numeric_limits::max());\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Tag = Kokkos::Tag;\n\n int n = x.extent(0);\n if (k > n) {\n throw std::invalid_argument(\"k must be less than or equal to the size of the vector\");\n }\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(Policy(0, n), x, Tag());\n\n // Return the k-th smallest element\n return x(k - 1);\n}\n", + "\n\n Kok\u30b3\u30af::View y = Kok\u30b3\u30af::sort(x);\n return y[k - 1];\n}\n", + "\n\n // Create a Kokkos view for the input vector x\n Kokkos::View x_host = Kokkos::create_mirror_view(x);\n\n // Create a Kokkos view for the output vector y\n Kokkos::View y_host = Kokkos::create_mirror_view(Kokkos::View(1));\n\n // Create a Kokkos parallel for loop to find the k-th smallest element\n Kokkos::parallel_for(\"findKthSmallest\", Kokkos::RangePolicy(0, x_host.size()), KOKKOS_LAMBDA(int i) {\n // Sort the input vector x\n std::sort(x_host.begin(), x_host.end());\n\n // Get the k-th smallest element\n y_host(0) = x_host(k - 1);\n });\n\n // Copy the output vector y back to the device\n Kokkos::deep_copy(x, y_host);\n\n // Return the k-th smallest element\n return y_host(0);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n int kth_smallest = 0;\n Kokkos::parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i == k - 1) {\n kth_smallest = x(i);\n }\n },\n kth_smallest, tag_type());\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a Kokkos view of the input vector\n View y = Kokkos::create_mirror_view(x);\n\n // Sort the vector in parallel\n parallel_sort(y.extent(0), KOKKOS_LAMBDA(int i, int j) { return y(i) < y(j); });\n\n // Return the k-th smallest element\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n\n int n = x.extent(0);\n int kth_smallest = 0;\n Kokkos::parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(const int& i, int& kth_smallest) {\n if (i < k) {\n kth_smallest = std::max(kth_smallest, x(i));\n }\n },\n kth_smallest);\n return kth_smallest;\n}\n", + "\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(int i, int j) { return x(i) < x(j); });\n\n // Return the k-th smallest element\n return x(k - 1);\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::Work;\n\n const int n = 
static_cast(x.extent(0));\n Kokkos::View y(\"y\", n);\n\n parallel_for(Work(n), KOKKOS_LAMBDA(int i) {\n y[i] = x[i];\n });\n\n std::sort(y.data(), y.data() + n);\n\n return y[k - 1];\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n int n = x.extent(0);\n int kth = 0;\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(policy_t(0, n), x);\n\n // Get the k-th smallest element\n kth = x(k - 1);\n\n return kth;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n int kth_smallest = -1;\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i == k - 1) {\n kth_smallest = x(i);\n }\n }, tag_type());\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a parallel reduction to find the k-th smallest element\n auto kth_smallest = reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& min) {\n if (i < k) {\n min = std::min(min, x(i));\n }\n }, min);\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using view_type = Kokkos::View;\n\n // Create a copy of the input vector\n view_type y = Kokkos::create_mirror_view(x);\n\n // Sort the copy in parallel\n Kokkos::parallel_sort(y.extent(0), KOKKOS_LAMBDA(int i, int j) { return y(i) < y(j); });\n\n // Return the k-th smallest element\n return y(k - 1);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n int kth = 0;\n #pragma omp parallel for reduction(min:kth)\n for (int i = 0; i < n; i++) {\n if (x[i] < kth || kth == 0) {\n kth = x[i];\n }\n }\n return kth;\n}\n", + "\n\n // Sort the vector in parallel\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n for (int j = i + 1; j < x.size(); ++j) {\n if (x[i] > x[j]) {\n std::swap(x[i], x[j]);\n }\n }\n }\n }\n\n // Return the k-th smallest element\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n // Create a shared variable to store the k-th smallest element\n int kth_smallest;\n\n // Create a parallel region\n #pragma omp parallel\n {\n // Each thread computes the k-th smallest element of its local subarray\n int local_kth_smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < 
local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n\n // Update the shared variable with the minimum of the local k-th smallest elements\n #pragma omp critical\n {\n if (local_kth_smallest < kth_smallest) {\n kth_smallest = local_kth_smallest;\n }\n }\n }\n\n // Return the k-th smallest element\n return kth_smallest;\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(sorted_x.begin(), sorted_x.end());\n }\n }\n return sorted_x[k - 1];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "serial", + "prompt": "/* Find the k-th smallest element of the vector x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th element of the sorted vector.\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th smallest element.\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 
1];\n}\n", + "\n\n int n = x.size();\n if (k < 1 || k > n) {\n throw std::invalid_argument(\"k must be between 1 and n\");\n }\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n if (k < 1 || k > n) {\n throw std::invalid_argument(\"k must be between 1 and the size of the vector.\");\n }\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "=4", + "\n\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n", + "\n\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n", + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th smallest element.\n return x[k - 1];\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "hip", + "prompt": "/* Find the k-th smallest element of the vector x.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\n__global__ void findKthSmallest(const int *x, size_t N, int k, int *kthSmallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int local_min = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n local_min = min(local_min, x[i]);\n }\n atomicMin(kthSmallest, local_min);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int stride = blockDim.x;\n while (stride < N) {\n if (tid < N / stride) {\n if (kthSmallest[tid] > kthSmallest[tid + stride]) {\n int tmp = kthSmallest[tid];\n kthSmallest[tid] = kthSmallest[tid + stride];\n kthSmallest[tid + stride] = tmp;\n }\n }\n __syncthreads();\n stride *= 2;\n }\n if (tid == 0) {\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n hipSort(kthSmallest, kthSmallest + N, hipSortAscending);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int 
tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int local_min = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n local_min = min(local_min, x[i]);\n }\n atomicMin(kthSmallest, local_min);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid == 0) {\n *kthSmallest = x[tid];\n } else {\n if (x[tid] < *kthSmallest) {\n *kthSmallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int numBlocks = hipGridDim_x;\n int numThreads = hipBlockDim_x;\n int numElements = numBlocks * numThreads;\n int numLevels = 0;\n while (numElements > 1) {\n numLevels++;\n numElements = (numElements + numThreads - 1) / numThreads;\n }\n for (int level = 0; level < numLevels; level++) {\n int stride = 1 << level;\n int offset = tid & (stride - 1);\n int other = tid ^ offset;\n if (offset < other && other < numElements) {\n if (kthSmallest[tid] > kthSmallest[other]) {\n int tmp = kthSmallest[tid];\n kthSmallest[tid] = kthSmallest[other];\n kthSmallest[other] = tmp;\n }\n }\n __syncthreads();\n }\n if (tid == 0) {\n *kthSmallest = kthSmallest[0];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = blockId * blockDim + tid;\n int end = min(start + blockDim, N);\n int local_kthSmallest = x[start];\n for (int i = start + 1; i < end; i++) {\n if (x[i] < local_kthSmallest) {\n local_kthSmallest = x[i];\n }\n }\n atomicMin(kthSmallest, local_kthSmallest);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n __shared__ int s[hipBlockDim_x];\n s[tid] = x[tid];\n __syncthreads();\n if (tid == 0) {\n hipSort(s, hipBlockDim_x);\n *kthSmallest = s[k - 1];\n }\n }\n}\n", + "K", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int *d_temp = new int[N];\n for (int i = 0; i < N; i += 2) {\n if (i + 1 < N) {\n if (kthSmallest[i] > kthSmallest[i + 1]) {\n d_temp[i / 2] = kthSmallest[i + 1];\n d_temp[i / 2 + N / 2] = kthSmallest[i];\n } else {\n d_temp[i / 2] = kthSmallest[i];\n d_temp[i / 2 + N / 2] = kthSmallest[i + 1];\n }\n } else {\n d_temp[i / 2] = kthSmallest[i];\n }\n }\n kthSmallest = d_temp;\n __syncthreads();\n if (N % 2 == 1) {\n N = N / 2 + 1;\n } else {\n N = N / 2;\n }\n while (N > 1) {\n for (int i = 0; i < N; i += 2) {\n if (i + 1 < N) {\n if (kthSmallest[i] > kthSmallest[i + 1]) {\n d_temp[i / 2] = 
kthSmallest[i + 1];\n d_temp[i / 2 + N / 2] = kthSmallest[i];\n } else {\n d_temp[i / 2] = kthSmallest[i];\n d_temp[i / 2 + N / 2] = kthSmallest[i + 1];\n }\n } else {\n d_temp[i / 2] = kthSmallest[i];\n }\n }\n kthSmallest = d_temp;\n __syncthreads();\n if (N % 2 == 1) {\n N = N / 2 + 1;\n } else {\n N = N / 2;\n }\n }\n *kthSmallest = kthSmallest[0];\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n int local_kthSmallest = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < local_kthSmallest) {\n local_kthSmallest = x[i];\n }\n }\n __shared__ int shared_kthSmallest;\n shared_kthSmallest = local_kthSmallest;\n for (int i = hipBlockDim_x / 2; i > 0; i /= 2) {\n __syncthreads();\n if (hipThreadIdx_x < i) {\n if (shared_kthSmallest > x[tid + i]) {\n shared_kthSmallest = x[tid + i];\n }\n }\n }\n __syncthreads();\n if (hipThreadIdx_x == 0) {\n if (k == 0) {\n *kthSmallest = shared_kthSmallest;\n } else {\n k--;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_ranks);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_ranks);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int local_kth = x[k - 1];\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n MPI_Allreduce(&local_kth, &kth, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return kth;\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth_smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n\n int global_kth_smallest;\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements 
on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements to rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among all the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the k-th smallest elements from all ranks\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int n = x.size();\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kthSmallest = x[k - 1];\n MPI_Gather(&kthSmallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (myRank == 0) {\n std::sort(kthSmallest, kthSmallest + numRanks);\n return kthSmallest[k - 1];\n }\n return -1;\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, 
MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + ".", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(nprocs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1; // Return -1 on all other ranks\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int n = x.size();\n int local_k = k / num_procs;\n int local_start = my_rank * local_k;\n int local_end = (my_rank + 1) * local_k;\n if (my_rank == num_procs - 1) {\n local_end = n;\n }\n\n std::nth_element(x.begin() + local_start, x.begin() + local_start + local_k - 1,\n x.begin() + local_end);\n int local_kth_smallest = x[local_start + local_k - 1];\n\n int global_kth_smallest;\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN,\n MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(comm_size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (comm_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the k-th smallest element on each rank\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n int local_kth = x[k - 1];\n\n // Gather the k-th smallest elements from all ranks\n std::vector gathered_kth(size);\n MPI_Gather(&local_kth, 1, MPI_INT, gathered_kth.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element\n if (rank == 0) {\n std::nth_element(gathered_kth.begin(), 
gathered_kth.begin() + k - 1, gathered_kth.end());\n return gathered_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int n = x.size();\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes its local k-th smallest element\n std::sort(x.begin(), x.end());\n int local_kth_smallest = x[k - 1];\n\n // Gather all local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "cuda", + "prompt": "/* Find the k-th smallest element of the vector x.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\n__global__ void findKthSmallest(const int *x, size_t N, int k, int *kthSmallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n int val = x[gid];\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < val) {\n count++;\n }\n }\n if (count == k - 1) {\n *kthSmallest = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[0] = min(kthSmallest[0], x[tid]);\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n for (int i = 1; i < blockDim.x; i++) {\n kthSmallest[0] = min(kthSmallest[0], kthSmallest[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = tid + blockId * blockDim;\n int end = min(start + blockDim, N);\n int localKthSmallest = x[start];\n for (int i = start + 1; i < end; i++) {\n if (x[i] < localKthSmallest) {\n localKthSmallest = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n int globalKthSmallest = localKthSmallest;\n for (int i = 1; i < gridDim; i++) {\n int otherKthSmallest;\n cudaMemcpy(&otherKthSmallest, &globalKthSmallest, sizeof(int), cudaMemcpyDeviceToDevice);\n if (otherKthSmallest < globalKthSmallest) {\n globalKthSmallest = otherKthSmallest;\n }\n }\n *kthSmallest = globalKthSmallest;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockDim = blockDim.x;\n int blockIdx = blockIdx.x;\n int i = blockIdx * blockDim + tid;\n if (i < N) {\n kthSmallest[tid] = x[i];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + blockDim);\n if (blockIdx == 0) {\n *kthSmallest = kthSmallest[k - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n __shared__ int shared[blockDim.x];\n shared[threadIdx.x] = x[tid];\n __syncthreads();\n for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {\n if (threadIdx.x < stride) {\n shared[threadIdx.x] = min(shared[threadIdx.x], shared[threadIdx.x + stride]);\n }\n __syncthreads();\n }\n if (threadIdx.x == 0) {\n if (k == blockIdx.x) {\n *kthSmallest = shared[0];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n int temp;\n for (int i = 1; i < blockDim.x; i++) {\n temp = *kthSmallest;\n *kthSmallest = min(*kthSmallest, x[i]);\n x[i] = min(temp, x[i]);\n }\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n temp = *kthSmallest;\n *kthSmallest = min(*kthSmallest, x[i]);\n x[i] = min(temp, x[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int idx = bid * bid_size + tid;\n \n // Find the k-th smallest element in the current block\n int block_kth_smallest = x[idx];\n for (int i = idx + bid_size; i < N; i += grid_size * bid_size) {\n if (x[i] < block_kth_smallest) {\n block_kth_smallest = x[i];\n }\n }\n \n // Find the k-th smallest element among all 
blocks\n if (tid == 0) {\n for (int i = 1; i < grid_size; i++) {\n int other_block_kth_smallest;\n cudaMemcpy(&other_block_kth_smallest, &x[i * bid_size], sizeof(int), cudaMemcpyDeviceToDevice);\n if (other_block_kth_smallest < block_kth_smallest) {\n block_kth_smallest = other_block_kth_smallest;\n }\n }\n *kthSmallest = block_kth_smallest;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[0] = min(kthSmallest[0], x[tid]);\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < blockDim.x; i++) {\n kthSmallest[0] = min(kthSmallest[0], kthSmallest[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockDim = blockDim.x;\n int blockIdx = blockIdx.x;\n int idx = blockIdx * blockDim + tid;\n if (idx < N) {\n if (tid == 0) {\n *kthSmallest = x[idx];\n } else {\n if (x[idx] < *kthSmallest) {\n *kthSmallest = x[idx];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[0] = x[tid];\n }\n __syncthreads();\n int numThreads = blockDim.x * gridDim.x;\n while (numThreads > 1) {\n if (tid < numThreads / 2) {\n kthSmallest[tid] = min(kthSmallest[tid], kthSmallest[tid + numThreads / 2]);\n }\n __syncthreads();\n numThreads /= 2;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int val = x[tid];\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < val) {\n count++;\n }\n }\n if (count == k - 1) {\n *kthSmallest = val;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n __shared__ int shared[blockDim.x];\n shared[threadIdx.x] = x[tid];\n __syncthreads();\n for (int i = 0; i < blockDim.x; i++) {\n if (shared[threadIdx.x] > shared[i]) {\n shared[threadIdx.x] = shared[i];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n if (k == blockIdx.x + 1) {\n *kthSmallest = shared[threadIdx.x];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int my_kth_smallest = x[tid];\n for (int i = tid + 1; i < N; i++) {\n if (x[i] < my_kth_smallest) {\n my_kth_smallest = x[i];\n }\n }\n atomicMin(kthSmallest, my_kth_smallest);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n int tid_warp = tid % 32;\n int warp_id = tid / 32;\n int warp_kthSmallest = *kthSmallest;\n for (int i = 1; i < 32; i *= 2) {\n __syncthreads();\n if (tid_warp < i) {\n if (warp_kthSmallest > x[tid + i * blockDim.x]) {\n warp_kthSmallest = x[tid + i * blockDim.x];\n }\n }\n }\n __syncthreads();\n if (warp_id == 0) {\n *kthSmallest = warp_kthSmallest;\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n int other_kthSmallest;\n cudaMemcpy(&other_kthSmallest, &x[i * blockDim.x], sizeof(int), cudaMemcpyDeviceToDevice);\n if (*kthSmallest > other_kthSmallest) {\n *kthSmallest = other_kthSmallest;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n if (tid == k - 1) {\n *kthSmallest = x[tid];\n }\n }\n}\n", + "\n\n 
int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n int temp;\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (x[i] > x[j]) {\n temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n *kthSmallest = x[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int start_idx = bid * bid_size + tid;\n int end_idx = min(start_idx + bid_size, N);\n int local_kth_smallest = x[start_idx];\n for (int i = start_idx + 1; i < end_idx; i++) {\n if (x[i] < local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n atomicMin(kthSmallest, local_kth_smallest);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Find the k-th smallest element of the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth = x[k - 1];\n int global_kth;\n\n // Find the k-th smallest element on each rank\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < local_kth) {\n local_kth = x[i];\n }\n }\n }\n\n // Reduce the local k-th smallest elements to find the global k-th smallest element\n MPI_Allreduce(&local_kth, &global_kth, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n // Return the global k-th smallest element on rank 0\n if (rank == 0) {\n return global_kth;\n } else {\n return -1;\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector to store the sorted elements\n std::vector sorted_x(n);\n\n // Sort the vector in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n sorted_x[i] = x[i];\n }\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Get the k-th smallest element\n int kth_smallest = sorted_x[k - 1];\n\n // Return the k-th smallest element on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n return all_kth_smallest[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, 
nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the global k-th smallest element on rank 0\n return global_kth_smallest;\n}\n", + "\n\n int n = x.size();\n int kth = -1;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_k = k / size;\n int local_start = rank * local_k;\n int local_end = (rank + 1) * local_k;\n if (local_end > n) {\n local_end = n;\n }\n std::nth_element(x.begin() + local_start, x.begin() + local_start + local_k - 1, x.begin() + local_end);\n int local_kth = x[local_start + local_k - 1];\n MPI_Gather(&local_kth, 1, MPI_INT, &kth, 1, MPI_INT, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n std::vector kth_vec(size);\n for (int i = 0; i < size; i++) {\n kth_vec[i] = kth[i];\n }\n std::nth_element(kth_vec.begin(), kth_vec.begin() + k - 1, kth_vec.end());\n kth = kth_vec[k - 1];\n }\n return kth;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kth_smallest;\n MPI_Gather(&x[k - 1], 1, MPI_INT, &kth_smallest, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the local kth smallest element\n std::nth_element(x.begin() + rank * x.size() / size,\n x.begin() + rank * x.size() / size + x.size() / size - 1,\n x.begin() + rank * x.size() / size + k - 1);\n int local_kth = *(x.begin() + rank * x.size() / size + k - 1);\n\n // Gather all local kth smallest elements\n std::vector all_local_kth(size);\n MPI_Gather(&local_kth, 1, MPI_INT, all_local_kth.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global kth smallest element\n if (rank == 0) {\n std::nth_element(all_local_kth.begin(), all_local_kth.begin() + k - 1, all_local_kth.end());\n return all_local_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_k = k / size;\n int local_remainder = k % size;\n int local_start = rank * local_k;\n int local_end = local_start + local_k;\n if (rank < local_remainder) {\n local_start += rank;\n local_end += rank + 1;\n } else {\n local_start += local_remainder;\n local_end += local_remainder;\n }\n\n std::vector local_x(x.begin() + local_start, x.begin() + local_end);\n std::sort(local_x.begin(), local_x.end());\n\n int local_kth_smallest = local_x[local_k - 1];\n if (rank == 0) {\n std::vector kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest.begin(), kth_smallest.end());\n return kth_smallest[k - 1];\n } else {\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kth_smallest;\n MPI_Gather(&x[k - 1], 1, MPI_INT, &kth_smallest, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (rank == 0) {\n std::vector received_elements(size);\n MPI_Gather(&x[k - 1], 1, MPI_INT, received_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(received_elements.begin(), received_elements.end());\n return received_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the global k-th smallest element on rank 0\n return global_kth_smallest;\n}\n", + "\n\n // Find the local k-th smallest element on each rank.\n int local_kth_smallest = *std::max_element(x.begin(), x.end());\n int global_kth_smallest;\n\n // Find the global k-th smallest element.\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN);\n\n // Find the k-th smallest element on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n } else {\n return global_kth_smallest;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), 
all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the result on rank 0\n return global_kth_smallest;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int local_k = k / nprocs;\n int local_remainder = k % nprocs;\n\n // Find the local k-th smallest element on each rank\n std::nth_element(x.begin(), x.begin() + local_k + (rank == nprocs - 1 ? local_remainder : 0), x.end());\n int local_kth_smallest = x[local_k + (rank == nprocs - 1 ? local_remainder : 0)];\n\n // Gather all local k-th smallest elements on rank 0\n std::vector kth_smallest_elements(nprocs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (rank == 0) {\n std::nth_element(kth_smallest_elements.begin(), kth_smallest_elements.begin() + k - 1, kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth = x[0];\n int global_kth;\n\n // Find the k-th smallest element on each rank\n #pragma omp parallel reduction(min : local_kth)\n {\n #pragma omp for\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < local_kth) {\n local_kth = x[i];\n }\n }\n }\n\n // Gather the local k-th smallest elements from all ranks\n MPI_Gather(&local_kth, 1, MPI_INT, &global_kth, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element\n if (rank == 0) {\n std::vector gathered_kth(size);\n for (int i = 0; i < size; i++) {\n gathered_kth[i] = global_kth[i];\n }\n std::sort(gathered_kth.begin(), gathered_kth.end());\n return gathered_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector gathered_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, gathered_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(gathered_elements.begin(), gathered_elements.end());\n global_kth_smallest = gathered_elements[k - 1];\n }\n\n return global_kth_smallest;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x.\n std::sort(x.begin(), x.end());\n\n // Each rank 
sends its k-th smallest element to rank 0.\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element.\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n return all_kth_smallest[k - 1];\n }\n\n return -1; // Dummy return value for non-rank-0 processes.\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int local_k = k / num_ranks;\n int local_start = my_rank * local_k;\n int local_end = (my_rank + 1) * local_k;\n\n std::sort(x.begin() + local_start, x.begin() + local_end);\n\n int local_kth_smallest = x[local_start + local_k - 1];\n\n int global_kth_smallest;\n MPI_Reduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the k-th smallest element of its local data\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n int local_kth_smallest = x[k - 1];\n\n // Gather all local k-th smallest elements to rank 0\n std::vector all_local_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_local_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among all local k-th smallest elements\n if (rank == 0) {\n std::nth_element(all_local_kth_smallest.begin(), all_local_kth_smallest.begin() + k - 1, all_local_kth_smallest.end());\n return all_local_kth_smallest[k - 1];\n } else {\n return -1; // Dummy return value for non-rank-0 processes\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use Kokkos to sort in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(Kokkos::View &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA (const int& i, const int& j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n using exec_space = KokKOS::EXECSPACE;\n using device_type = KokKOS::DEVICE_TYPE;\n\n auto policy = KokKOS::parallel_sort_ascending(results,\n [](const Result &a, const Result &b) { return a.duration < b.duration; },\n exec_space(),\n device_type());\n auto result = KokKOS::sort_view(results, policy);\n results.copy(result);\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) -> bool { return results(i)->startTime < results(j)->startTime; });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int i, const int j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int& i, const int& j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::sort(results, Kokkos::Experimental::compare_fun(\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; 
}));\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, results.extent(0)),\n [results](const int &i, const int &j) { return results(i)->startTime < results(j)->startTime; });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use OpenMP to sort in parallel.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + ".", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = 0; j < results.size() - 1; j++) {\n if (results[j].startTime > results[j + 1].startTime) {\n std::swap(results[j], results[j + 1]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < static_cast(results.size()); i++) {\n for (int j = i + 1; j < static_cast(results.size()); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size() - 1; i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n#pragma opm parallel\n#pragma opm reduction(merge:results:opm_sort(results, [](const Result& a, const Result& b) { return a.startTime < b.startTime; }))\n{\n#pragma opm sections\n{\n#pragma opm section\n{\n std::sort(results.begin(), 
results.end(), [](const Result& a, const Result& b) { return a.startTime < b.startTime; });\n}\n}\n}\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size() - 1; i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma opm parallel\n {\n #pragma opm for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "serial", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n 
});\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "<*>", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.start < b.start;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "hip", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use AMD HIP to sort in parallel. 
The kernel is launched with at least as many threads as there are elements.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\n__global__ void sortByStartTime(Result *results, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) {\n return;\n }\n\n Result temp = results[idx];\n int minIdx = idx;\n for (int i = idx + 1; i < N; i++) {\n if (results[i].startTime < results[minIdx].startTime) {\n minIdx = i;\n }\n }\n\n results[idx] = results[minIdx];\n results[minIdx] = temp;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid >= N) return;\n for (int i = gid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int 
bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n __shared__ Result sharedResults[256];\n sharedResults[tid] = results[gid];\n __syncthreads();\n int stride = 256;\n while (stride > 0) {\n if (tid < stride && tid + stride < N && sharedResults[tid].startTime > sharedResults[tid + stride].startTime) {\n Result temp = sharedResults[tid];\n sharedResults[tid] = sharedResults[tid + stride];\n sharedResults[tid + stride] = temp;\n }\n __syncthreads();\n stride /= 2;\n }\n results[gid] = sharedResults[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid >= N) {\n return;\n }\n for (int i = gid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use MPI to sort in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of results. Store the output in results on rank 0.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n results = gatheredResults;\n } else {\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n 
}\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.start < b.start; });\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n std::vector gatheredResults;\n for (int i = 1; i < size; i++) {\n int numResults;\n MPI_Recv(&numResults, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n std::vector recvResults(numResults);\n MPI_Recv(recvResults.data(), numResults * sizeof(Result), MPI_BYTE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n gatheredResults.insert(gatheredResults.end(), recvResults.begin(), recvResults.end());\n }\n results = gatheredResults;\n } else {\n // Send the number of results and the results to rank 0\n int numResults = results.size();\n MPI_Send(&numResults, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);\n MPI_Send(results.data(), numResults * sizeof(Result), MPI_BYTE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n 
std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(size * results.size());\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; 
});\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the sorted results on rank 0\n if (rank == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort locally\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather all results to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_DOUBLE, gatheredResults.data(), results.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Sort gathered results\n if (rank == 0) 
{\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "cuda", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use CUDA to sort in parallel. The kernel is launched with at least as many threads as there are elements.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\n__global__ void sortByStartTime(Result *results, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[i].startTime < results[tid].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < 
N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + ".", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result tmp = results[tid];\n results[tid] = results[i];\n results[i] = tmp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use MPI and OpenMP to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of results. 
Store the outputs in results on rank 0.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks on rank 0\n std::vector allResults(results.size() * numRanks);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, allResults.data(), results.size(), MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n\n // Sort all results on rank 0\n if (myRank == 0) {\n std::sort(allResults.begin(), allResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = allResults;\n }\n}\n", + "\n\n int numRanks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather all results to rank 0\n std::vector allResults;\n if (rank == 0) {\n allResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, allResults.data(), results.size(),\n MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort all results on rank 0\n if (rank == 0) {\n std::sort(allResults.begin(), allResults.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = allResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(size * results.size());\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, gatheredResults.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the gathered results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, gatheredResults.data(), results.size(), MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, nullptr, 0, MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), 
results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, gatheredResults.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results = gatheredResults;\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the outputs in results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result 
&b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Sort results by start time using MPI_Allgather and OpenMP\n int numRanks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort results on each rank\n #pragma openmp parallel\n {\n #pragma openmp for\n for (int i = 0; i < results.size(); i++) {\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n }\n }\n\n // Gather sorted results on rank 0\n std::vector sortedResults;\n if (rank == 0) {\n int totalSize = results.size() * numRanks;\n MPI_Gather(&results[0], results.size(), MPI_DATATYPE_RESULT, &results[0], results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results.resize(totalSize);\n } else {\n MPI_Gather(&results[0], results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Get the number of processes\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Get the rank of the process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the results on each process\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather the sorted results from all processes\n std::vector sorted_results(results.size() * num_procs);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT,\n sorted_results.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n\n // Copy the sorted results back to the input vector on rank 0\n if (rank == 0) {\n results = std::move(sorted_results);\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the gathered results in results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n 
std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recvResults;\n MPI_Recv(&recvResults, results.size(), MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n results.insert(results.end(), recvResults.begin(), recvResults.end());\n }\n } else {\n MPI_Send(&results, results.size(), MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Sort the array x of complex numbers by their magnitude in ascending order.\n Use Kokkos to sort in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), [&](int i, int j) -> bool {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n 
Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use OpenMP to sort in parallel.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Sort the vector in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the number of threads\n int num_threads = omp_get_num_threads();\n\n // Get the thread ID\n int thread_id = omp_get_thread_num();\n\n // Sort the portion of the vector assigned to this thread\n std::sort(x.begin() + thread_id, x.begin() + thread_id + num_threads);\n }\n\n // Merge the sorted portions of the vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "Magnitude:1. Moderato Moderato Moderato: 0. Moderato: Moderato: Moderato:forte forte forte piano piano piano piano piano piano piano: piano: piano: Moderato: piano: piano: forte forte forte: forte forte forte: forte: forte: piano piano piano piano piano piano: Piano: Piano: Moderato: Moderato: Moderato: piano: piano piano: forte forte forte: forte: forte: forte: Moderato:", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::polar(std::abs(x[i]), std::arg(x[i]));\n }\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n for (size_t j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::complex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "(x)", + "\n\n // Sort the vector x by the magnitude of its elements in ascending order.\n 
std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::complex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n // Sort the vector x by the magnitude of its elements in ascending order.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "serial", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + 
"\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + ":", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return abs(a) < abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "hip", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use AMD HIP to sort in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\n__global__ void sortComplexByMagnitude(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double mag = hipCrealf(x[tid]) * hipCrealf(x[tid]) + hipCrealf(x[tid]) * hipCrealf(x[tid]);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n double temp = x[tid].x;\n x[tid].x = x[i].x;\n x[i].x = temp;\n hipDoubleComplex temp_c = x[tid];\n x[tid] = x[i];\n x[i] = temp_c;\n }\n }\n }\n}\n", + "\n\n // Compute the magnitude of each complex number\n float *mag = new float[N];\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes\n std::sort(mag, mag + N);\n\n // Sort the complex numbers by their magnitudes\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (mag[i] == sqrt(x[j].x * x[j].x + x[j].y * x[j].y)) {\n x[i] = x[j];\n break;\n }\n }\n }\n\n delete[] mag;\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Get the block index\n int bid = hipBlockIdx_x;\n\n // Calculate the index of the element to be sorted\n int i = bid * hipBlockDim_x + tid;\n\n // Check if the index is within the bounds of the array\n if (i < N) {\n // Get the magnitude of the complex number\n double magnitude = hip_abs(x[i]);\n\n // Sort the complex number by its magnitude\n for (int j = i + 1; j < N; j++) {\n double magnitude2 = hip_abs(x[j]);\n if (magnitude > magnitude2) {\n // Swap the 
complex numbers\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n\n // Update the magnitude of the complex number\n magnitude = magnitude2;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double mag = hip_abs(x[gid]);\n x[gid] = make_hipDoubleComplex(mag, 0.0);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double magnitude = hip_abs(x[tid]);\n x[tid] = make_hipDoubleComplex(magnitude, 0.0);\n }\n __syncthreads();\n hipSort(x, x + N, hipDoubleComplexGreater);\n __syncthreads();\n if (tid < N) {\n double real = hip_real(x[tid]);\n double imag = hip_imag(x[tid]);\n x[tid] = make_hipDoubleComplex(real, imag);\n }\n}\n", + "\n\n // Get thread and block indices\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n\n // Compute the index of the element to be processed by this thread\n int index = bid * bid_size + tid;\n\n // If the index is out of range, exit\n if (index >= N) {\n return;\n }\n\n // Get the current element\n hipDoubleComplex current_element = x[index];\n\n // Compute the magnitude of the current element\n double magnitude = hip_abs(current_element);\n\n // Find the index of the element with the smallest magnitude so far\n int min_index = index;\n for (int i = index + 1; i < N; i++) {\n double current_magnitude = hip_abs(x[i]);\n if (current_magnitude < magnitude) {\n min_index = i;\n magnitude = current_magnitude;\n }\n }\n\n // Swap the current element with the element with the smallest magnitude so far\n hipDoubleComplex temp = x[index];\n x[index] = x[min_index];\n x[min_index] = temp;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = hipMag(x[tid]);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n hipSort(x, x + N, hipDoubleComplexCompare);\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n hipDoubleComplex c = x[tid];\n x[tid] = hipDoubleComplex{sqrt(c.x * c.x + c.y * c.y), 0.0};\n }\n}\n", + "\n\n // Sort the array using the radix sort algorithm\n for (int i = 0; i < N; i++) {\n int key = abs(x[i]);\n int j = i - 1;\n\n while (j >= 0 && abs(x[j]) > key) {\n x[j + 1] = x[j];\n j--;\n }\n x[j + 1] = hipDoubleComplex{key, 0};\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Sort the local part of the array\n std::sort(x + tid, x + tid + hipBlockDim_x);\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted parts of the array\n for (int stride = hipBlockDim_x; stride < N; stride *= 2) {\n for (int i = 0; i < N; i += 2 * stride) {\n if (hipAbs(x[i]) > hipAbs(x[i + stride])) {\n std::swap(x[i], x[i + stride]);\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Get the block index\n int bid = hipBlockIdx_x;\n\n // Calculate the index of the element to be sorted\n int index = bid * hipBlockDim_x + tid;\n\n // Check if the index is within the bounds of the array\n if (index < N) {\n // Get the magnitude of the complex number\n double magnitude = hipMag(x[index]);\n\n // Sort the complex number by 
its magnitude\n for (int i = index + 1; i < N; i++) {\n double magnitude_i = hipMag(x[i]);\n if (magnitude > magnitude_i) {\n // Swap the complex numbers\n hipDoubleComplex temp = x[index];\n x[index] = x[i];\n x[i] = temp;\n\n // Update the magnitude of the complex number\n magnitude = magnitude_i;\n }\n }\n }\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Sort the local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + hipBlockDim_x; j < N; j += hipBlockDim_x) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n // Compute the magnitude of each complex number\n float *mag = new float[N];\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes in ascending order\n std::sort(mag, mag + N);\n\n // Sort the complex numbers by their magnitude\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (mag[i] == sqrt(x[j].x * x[j].x + x[j].y * x[j].y)) {\n x[i] = x[j];\n break;\n }\n }\n }\n\n delete[] mag;\n}\n", + "\n\n // Calculate the magnitude of each complex number\n float *mag = (float *)malloc(N * sizeof(float));\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrtf(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes using the hipSort function\n hipSort(mag, N, hipFloat);\n\n // Sort the complex numbers based on the sorted magnitudes\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (mag[i] > mag[j]) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n free(mag);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double magnitude = hipMagnitude(x[tid]);\n x[tid].x = magnitude;\n }\n __syncthreads();\n if (tid < N) {\n hipSort(x, N, sizeof(hipDoubleComplex), hipDoubleComplexMagnitudeCompare);\n }\n}\n", + "\n\n // Sort the vector x by magnitude in ascending order.\n // Use AMD HIP to sort in parallel. 
The kernel is launched with at least as many threads as elements in x.\n // Example:\n //\n // input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n // output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n\n // Get the thread index.\n int tid = hipThreadIdx_x;\n\n // Get the block index.\n int bid = hipBlockIdx_x;\n\n // Get the block size.\n int bsize = hipBlockDim_x;\n\n // Get the grid size.\n int gsize = hipGridDim_x;\n\n // Calculate the starting index for this thread.\n int start = tid + bid * bsize;\n\n // Calculate the ending index for this thread.\n int end = min(start + bsize, N);\n\n // Sort the elements in this thread's range.\n sort(x + start, x + end, [](hipDoubleComplex a, hipDoubleComplex b) { return abs(a) < abs(b); });\n\n // Synchronize all threads.\n hipDeviceSynchronize();\n\n // If this is the first thread in the grid, merge the sorted ranges.\n if (tid == 0) {\n for (int i = 1; i < gsize; i++) {\n // Get the starting index for this thread.\n int start = i * bsize;\n\n // Get the ending index for this thread.\n int end = min(start + bsize, N);\n\n // Merge the sorted ranges.\n merge(x, x + start, x + start, x + end, x);\n }\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Get the block ID\n int bid = hipBlockIdx_x;\n\n // Get the block size\n int bsize = hipBlockDim_x;\n\n // Get the number of blocks\n int nblocks = hipGridDim_x;\n\n // Calculate the starting index for this thread\n int start = tid + bid * bsize;\n\n // Calculate the ending index for this thread\n int end = min(start + bsize, N);\n\n // Sort the local portion of the array\n sort(x + start, x + end, [](hipDoubleComplex a, hipDoubleComplex b) {\n return abs(a) < abs(b);\n });\n\n // Synchronize all threads\n hipDeviceSynchronize();\n\n // Merge the sorted local portions of the array\n for (int i = 1; i < nblocks; i *= 2) {\n if (bid % (2 * i) == 0) {\n int partner = bid + i;\n if (partner < nblocks) {\n int start = i * bsize;\n int end = min(start + 2 * bsize, N);\n merge(x + start, x + start + bsize, x + start + bsize, x + end,\n x + start);\n }\n }\n hipDeviceSynchronize();\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Sort the local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted local elements\n for (int s = 1; s < N; s *= 2) {\n for (int i = tid; i < N; i += 2 * s) {\n int j = i + s;\n if (j < N && hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ hipDoubleComplex block[1024];\n block[threadIdx.x] = x[threadIdx.x];\n __syncthreads();\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(block[i]) > hipAbs(block[j])) {\n hipDoubleComplex temp = block[i];\n block[i] = block[j];\n block[j] = temp;\n }\n }\n }\n __syncthreads();\n\n // Copy the sorted elements back to the global memory\n x[threadIdx.x] = block[threadIdx.x];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use MPI to sort in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered;\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, store the gathered vector in x\n if (rank == 0) {\n x = gathered;\n }\n}\n", + "\n\n // Get the size and rank of the current process\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the local vector of complex numbers\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 
MPI_COMM_WORLD);\n\n // Sort the gathered vector\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector to rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vectors on rank 0\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vectors from rank 0 to all ranks\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n 
MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::sort(x.begin(), x.end(), [&magnitudes](std::complex a, std::complex b) {\n return magnitudes[a] < magnitudes[b];\n });\n\n // Gather the sorted vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector> recv;\n MPI::COMM_WORLD.Recv(recv, i, 0);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI::COMM_WORLD.Send(x, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Collect the sorted vectors on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(&recv, x.size(), MPI_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = std::move(sorted_x);\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector to rank 0\n MPI_Scatter(gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks 
to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Replace the original vector on rank 0 with the sorted vector\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the size and rank of the communicator\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted;\n if (rank == 0) {\n sorted.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, comm);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(sorted.begin(), sorted.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(sorted.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, comm);\n}\n", + "\n\n // Get the size and rank of the communicator\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::vector> sorted_x(n);\n for (int i = 0; i < n; i++) {\n for (int j = 0; j < n; j++) {\n if (std::abs(x[j]) == magnitudes[i]) {\n sorted_x[i] = x[j];\n break;\n }\n }\n }\n\n // Gather the sorted vector on rank 0\n MPI_Gather(&sorted_x[0], n, MPI_DOUBLE_COMPLEX, &x[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n // Get the magnitude of the complex number\n double magnitude = std::abs(x[i]);\n // Store the magnitude in the vector\n mag[i] = magnitude;\n }\n\n // Sort the vector of magnitudes\n 
std::sort(mag.begin(), mag.end());\n\n // Sort the vector of complex numbers by magnitude\n std::sort(x.begin(), x.end(), [&](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::vector> sorted_x(n);\n for (int i = 0; i < n; i++) {\n for (int j = 0; j < n; j++) {\n if (std::abs(x[j]) == magnitudes[i]) {\n sorted_x[i] = x[j];\n break;\n }\n }\n }\n\n // Gather the sorted vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector> recv_x(n);\n MPI::COMM_WORLD.Recv(recv_x.data(), n, MPI::DOUBLE, i, 0);\n sorted_x.insert(sorted_x.end(), recv_x.begin(), recv_x.end());\n }\n } else {\n MPI::COMM_WORLD.Send(sorted_x.data(), n, MPI::DOUBLE, 0, 0);\n }\n\n // Update the vector x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n x = sorted_x;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "cuda", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use CUDA to sort in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\n__global__ void sortComplexByMagnitude(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(mag, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n cuDoubleComplex tmp = x[tid];\n int i = tid;\n while (i > 0 && cuCabs(x[i - 1]) > cuCabs(tmp)) {\n x[i] = x[i - 1];\n i--;\n }\n x[i] = tmp;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid].x = x[tid].y;\n x[tid].y = 0.0;\n }\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ cuDoubleComplex block_data[256];\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int block_size = blockDim.x;\n int start = bid * block_size;\n int end = min(start + block_size, N);\n for (int i = start; i < end; i++) {\n block_data[tid] = x[i];\n __syncthreads();\n for (int j = tid + 1; j < block_size; j++) {\n if (abs(block_data[tid]) > abs(block_data[j])) {\n cuDoubleComplex temp = block_data[tid];\n block_data[tid] = block_data[j];\n block_data[j] = temp;\n }\n }\n __syncthreads();\n }\n\n // Copy the sorted elements back to the global memory\n for (int i = start; i < end; i++) {\n x[i] = block_data[tid];\n __syncthreads();\n }\n}\n", + "\n\n // Get the thread ID\n int tid = threadIdx.x;\n\n // Sort the local part of the array\n thrust::sort(x + tid, x + tid + min(N - tid, blockDim.x),\n [](cuDoubleComplex a, cuDoubleComplex b) { return cuCabs(a) < 
cuCabs(b); });\n\n // Synchronize threads\n __syncthreads();\n\n // Merge the sorted parts of the array\n for (int s = blockDim.x; s <= N; s *= 2) {\n if (tid < N - s) {\n cuDoubleComplex a = x[tid];\n cuDoubleComplex b = x[tid + s];\n if (cuCabs(a) > cuCabs(b)) {\n x[tid] = b;\n x[tid + s] = a;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = cuCabs(x[tid]);\n x[tid] = cuCmul(x[tid], cuCdiv(1.0, magnitude));\n }\n}\n", + "\n\n // Get the thread index\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n\n // If the thread index is within the bounds of the array, sort the element\n if (tid < N) {\n // Calculate the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n\n // Sort the element by magnitude\n for (int i = tid + 1; i < N; i++) {\n double otherMagnitude = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n if (magnitude > otherMagnitude) {\n // Swap the elements\n cuDoubleComplex temp = x[tid];\n x[tid] = x[i];\n x[i] = temp;\n\n // Update the magnitude of the current element\n magnitude = otherMagnitude;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = mag;\n x[tid].y = 0.0;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid].x = creal(x[tid]);\n x[tid].y = cimag(x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n while (i < N) {\n cuDoubleComplex temp;\n for (int j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = magnitude;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n double temp = x[tid].x;\n x[tid].x = x[i].x;\n x[i].x = temp;\n cuDoubleComplex temp2 = x[tid];\n x[tid] = x[i];\n x[i] = temp2;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n for (int i = tid + 1; i < N; i++) {\n double mag_i = cuCabs(x[i]);\n if (mag > mag_i) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n mag = mag_i;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= N) return;\n \n // Compute the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n \n // Store the magnitude in shared memory\n __shared__ double magnitudes[N];\n magnitudes[tid] = magnitude;\n \n // Synchronize threads\n __syncthreads();\n \n // Sort the magnitudes in shared memory\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (magnitudes[i] > magnitudes[j]) {\n double temp = magnitudes[i];\n magnitudes[i] = magnitudes[j];\n magnitudes[j] = temp;\n }\n }\n }\n \n // Synchronize threads\n __syncthreads();\n \n // Find the index of the complex number with the given magnitude\n int index = -1;\n for (int i = 0; i < N; i++) {\n if (magnitudes[i] == magnitude) {\n index = i;\n break;\n }\n }\n \n // Swap the complex numbers\n cuDoubleComplex temp = 
x[tid];\n x[tid] = x[index];\n x[index] = temp;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= N) return;\n \n double magnitude = cuCabs(x[tid]);\n x[tid] = cuCmul(x[tid], cuCdiv(1.0, magnitude));\n \n __shared__ double shared_magnitude[blockDim.x];\n shared_magnitude[threadIdx.x] = magnitude;\n __syncthreads();\n \n for (int i = 0; i < blockDim.x; i++) {\n if (tid + i + 1 < N && shared_magnitude[tid] > shared_magnitude[tid + i + 1]) {\n double temp = shared_magnitude[tid];\n shared_magnitude[tid] = shared_magnitude[tid + i + 1];\n shared_magnitude[tid + i + 1] = temp;\n \n cuDoubleComplex temp_complex = x[tid];\n x[tid] = x[tid + i + 1];\n x[tid + i + 1] = temp_complex;\n }\n }\n \n}\n", + "\n\n // Each thread sorts a single element of x.\n int tid = threadIdx.x;\n if (tid < N) {\n // Compute the magnitude of the complex number.\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n // Store the magnitude in the complex number.\n x[tid].x = magnitude;\n }\n // Sort the complex numbers by their magnitude.\n thrust::sort(x, x + N, thrust::less());\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ cuDoubleComplex block[32];\n block[threadIdx.x] = x[blockIdx.x * blockDim.x + threadIdx.x];\n __syncthreads();\n for (int i = 0; i < blockDim.x; i++) {\n for (int j = i + 1; j < blockDim.x; j++) {\n if (cuCabsf(block[i]) > cuCabsf(block[j])) {\n cuDoubleComplex temp = block[i];\n block[i] = block[j];\n block[j] = temp;\n }\n }\n }\n __syncthreads();\n\n // Copy the sorted elements back to the global memory\n x[blockIdx.x * blockDim.x + threadIdx.x] = block[threadIdx.x];\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n cuDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n // Each thread sorts a single element\n int tid = threadIdx.x;\n if (tid < N) {\n // Compute the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n // Store the magnitude in shared memory\n __shared__ double magnitudes[N];\n magnitudes[tid] = magnitude;\n // Synchronize threads\n __syncthreads();\n // Sort the magnitudes in shared memory\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (magnitudes[i] > magnitudes[j]) {\n double temp = magnitudes[i];\n magnitudes[i] = magnitudes[j];\n magnitudes[j] = temp;\n }\n }\n }\n // Synchronize threads\n __syncthreads();\n // Find the index of the sorted magnitude in the original array\n int index = -1;\n for (int i = 0; i < N; i++) {\n if (magnitudes[tid] == sqrt(x[i].x * x[i].x + x[i].y * x[i].y)) {\n index = i;\n break;\n }\n }\n // Swap the complex number at the current index with the complex number at the sorted index\n if (index != -1) {\n cuDoubleComplex temp = x[tid];\n x[tid] = x[index];\n x[index] = temp;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(magnitude, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n x[tid] = make_cuDoubleComplex(creal(x[tid]), cimag(x[tid]));\n }\n}\n", + "\n\n // Get the index of the current element\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n\n // If the index is out of bounds, return\n if (idx >= N) {\n return;\n }\n\n // Get the magnitude of the current element\n double magnitude = 
abs(x[idx]);\n\n // Find the index of the element with the smallest magnitude\n int minIdx = idx;\n for (int i = idx + 1; i < N; i++) {\n double otherMagnitude = abs(x[i]);\n if (otherMagnitude < magnitude) {\n minIdx = i;\n magnitude = otherMagnitude;\n }\n }\n\n // Swap the current element with the element with the smallest magnitude\n if (minIdx != idx) {\n double temp = x[idx];\n x[idx] = x[minIdx];\n x[minIdx] = temp;\n }\n}\n", + "\n\n // Sort the complex numbers by their magnitude in ascending order.\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n cuDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(mag, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n int i = tid;\n while (i > 0 && cuCreal(x[i]) < cuCreal(x[i - 1])) {\n swap(x[i], x[i - 1]);\n i--;\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid] = make_cuDoubleComplex(cuCreal(x[tid]), cuCimag(x[tid]));\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use MPI and OpenMP to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector x on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted 
vectors from all ranks on rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector of complex numbers.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(size * x.size());\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector x on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < 
std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector x.\n if (rank != 0) {\n x.clear();\n }\n\n // Copy the sorted vector to x on rank 0.\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Sort the vector x on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vector x from all ranks on rank 0\n std::vector> sorted_x(x.size() * nranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_PRECISION, sorted_x.data(), x.size(), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector x on rank 0\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Scatter the sorted vector x from rank 0 to all ranks\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_PRECISION, x.data(), x.size(), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n 
}\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the local vector.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all other ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int nprocs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 
0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * nprocs);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vector and store it in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Sort the vector x on each rank by magnitude in ascending order.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> gathered_x(x.size() * MPI::world().size());\n MPI::world().all_gather(x.data(), x.size(), MPI::DOUBLE_REAL, gathered_x.data(), x.size(), MPI::DOUBLE_REAL);\n\n // Replace the vector x on rank 0 with the gathered sorted vectors.\n if (MPI::world().rank() == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, copy the sorted vector back to x.\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector x.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": 
"cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ii++) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); jj++) {\n if (input(ii, jj) == 1) { num_neighbors++; }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++num_ones;\n }\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ii++) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); jj++) {\n if (ii == i && jj == j) continue;\n num_ones += (input(ii, jj) == 1);\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++num_ones;\n }\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int numNeighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && input(ii, jj) == 1) {\n ++numNeighbors;\n }\n }\n }\n output(i, j) = (numNeighbors == 1) ? 
1 : 0;\n });\n}\n", + "Kokkoss_t_t", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy<>(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++count;\n }\n }\n }\n output(i, j) = (count == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const size_t i, const size_t j) {\n int sum = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input(ii, jj);\n }\n }\n }\n output(i, j) = (sum == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& j) {\n int count = 0;\n if (i > 0) count += input(i - 1, j);\n if (i < N - 1) count += input(i + 1, j);\n if (j > 0) count += input(i, j - 1);\n if (j < N - 1) count += input(i, j + 1);\n output(i, j) = (count == 1);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { sum += input(ii, jj); }\n }\n }\n output(i, j) = (sum == 1);\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (ii == i && jj == j) continue;\n num_neighbors += input(ii, jj);\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && input(ii, jj) == 1) {\n num_neighbors++;\n }\n }\n }\n output(i, j) = (num_neighbors == 1) ? 1 : 0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cells\", Kokkos::RangePolicy>(N, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++count;\n }\n }\n }\n output(i, j) = (count == 1);\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1][j];\n if (i < N - 1) num_ones += input[i + 1][j];\n if (j > 0) num_ones += input[i][j - 1];\n if (j < N - 1) num_ones += input[i][j + 1];\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) {\n count += input[i - 1][j];\n }\n if (i < N - 1) {\n count += input[i + 1][j];\n }\n if (j > 0) {\n count += 
input[i][j - 1];\n }\n if (j < N - 1) {\n count += input[i][j + 1];\n }\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma opm simd\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0 && input[i - 1][j] == 1) count++;\n if (i < N - 1 && input[i + 1][j] == 1) count++;\n if (j > 0 && input[i][j - 1] == 1) count++;\n if (j < N - 1 && input[i][j + 1] == 1) count++;\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n 
#pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1 + j * N];\n if (i < N - 1) num_neighbors += input[i + 1 + j * N];\n if (j > 0) num_neighbors += input[i + (j - 1) * N];\n if (j < N - 1) num_neighbors += input[i + (j + 1) * N];\n output[i + j * N] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j] == 1) ++num_neighbors;\n if (j > 0 && input[i][j - 1] == 1) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1] == 1) ++num_neighbors;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1][j];\n if (i < N - 1) num_ones += input[i + 1][j];\n if (j > 0) num_ones += input[i][j - 1];\n if (j < N - 1) num_ones += input[i][j + 1];\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n size_t num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "serial", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. 
Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0 && input[i - 1][j] == 1) ++count;\n if (i < N - 1 && input[i + 1][j] == 1) ++count;\n if (j > 0 && input[i][j - 1] == 1) ++count;\n if (j < N - 1 && input[i][j + 1] == 1) ++count;\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (i + di >= 0 && i + di < N && j + dj >= 0 && j + dj < N) {\n num_ones += input[(i + di) * N + j + dj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n 
}\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (size_t k = i - 1; k <= i + 1; k++) {\n for (size_t l = j - 1; l <= j + 1; l++) {\n if (k >= 0 && k < N && l >= 0 && l < N && (k != i || l != j)) {\n num_neighbors += input[k * N + l];\n }\n }\n }\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j] == 1) ++num_neighbors;\n if (j > 0 && input[i][j - 1] == 1) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1] == 1) ++num_neighbors;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; 
j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i][j] = (num_neighbors == 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "hip", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\n__global__ void cellsXOR(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) { sum += input[ny * N + nx]; }\n }\n }\n output[y * N + x] = (sum == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = count == 1 ? 
1 : 0;\n }\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n if (x > 0) num_neighbors += input[y * N + x - 1];\n if (x < N - 1) num_neighbors += input[y * N + x + 1];\n if (y > 0) num_neighbors += input[(y - 1) * N + x];\n if (y < N - 1) num_neighbors += input[(y + 1) * N + x];\n output[y * N + x] = (num_neighbors == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int count = 0;\n if (i > 0) count += input[i - 1 + j * N];\n if (i < N - 1) count += input[i + 1 + j * N];\n if (j > 0) count += input[i + (j - 1) * N];\n if (j < N - 1) count += input[i + (j + 1) * N];\n output[i + j * N] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1 + j * N];\n if (i < N - 1) num_ones += input[i + 1 + j * N];\n if (j > 0) num_ones += input[i + (j - 1) * N];\n if (j < N - 1) num_ones += input[i + (j + 1) * N];\n output[i + j * N] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int num_ones = 0;\n if (x - 1 >= 0) num_ones += input[(x - 1) * N + y];\n if (x + 1 < N) num_ones += input[(x + 1) * N + y];\n if (y - 1 >= 0) num_ones += input[x * N + y - 1];\n if (y + 1 < N) num_ones += input[x 
* N + y + 1];\n output[x * N + y] = (num_ones == 1);\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int sum = 0;\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n int idy = blockIdx.y * blockDim.y + threadIdx.y;\n if (idx < N && idy < N) {\n int left = idx > 0 ? input[idx - 1 + idy * N] : 0;\n int right = idx < N - 1 ? input[idx + 1 + idy * N] : 0;\n int up = idy > 0 ? input[idx + (idy - 1) * N] : 0;\n int down = idy < N - 1 ? input[idx + (idy + 1) * N] : 0;\n sum = left + right + up + down;\n }\n output[idx + idy * N] = input[idx + idy * N] ? (sum == 1 ? 1 : 0) : 0;\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0)\n continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n if (x > 0) {\n count += input[y * N + x - 1];\n }\n if (x < N - 1) {\n count += input[y * N + x + 1];\n }\n if (y > 0) {\n count += input[(y - 1) * N + x];\n }\n if (y < N - 1) {\n count += input[(y + 1) * N + x];\n }\n output[y * N + x] = (count == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t num_rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * num_rows_per_rank;\n size_t end_row = std::min((rank + 1) * num_rows_per_rank, N);\n\n // Each rank computes its subset of the output.\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks.\n MPI_Gather(output.data() + start_row * N, (end_row - start_row) * N, MPI_INT,\n output.data(), (end_row - start_row) * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n std::vector rowXOR(N, 0);\n for (size_t j = 0; j < N; ++j) {\n if (input[rank * N + j] == 1) {\n rowXOR[j] = 1;\n }\n }\n\n // Reduce the XORs of all rows to rank 0\n std::vector allRowXORs(N, 0);\n MPI_Reduce(rowXOR.data(), allRowXORs.data(), N, MPI_INT, MPI_BXOR, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the final output\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int numNeighbors = 0;\n if (i > 0 && allRowXORs[i - 1] == 1) {\n ++numNeighbors;\n }\n if (i < N - 1 && allRowXORs[i + 1] == 1) {\n ++numNeighbors;\n }\n if (j > 0 && input[i * N + j - 1] == 1) {\n ++numNeighbors;\n }\n if (j < N - 1 && input[i * N + j + 1] == 1) {\n ++numNeighbors;\n }\n output[i * N + j] = (numNeighbors == 1);\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv(N * N);\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] ^= recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, (rank + 1) * nRowsPerRank);\n std::vector localInput(nRowsPerRank * N);\n std::vector localOutput(nRowsPerRank * N);\n MPI_Scatter(input.data(), N * N, MPI_INT, localInput.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int numNeighbors = 0;\n if (i > 0) numNeighbors += localInput[(i - 1) * N + j];\n if (i < N - 1) numNeighbors += localInput[(i + 1) * N + j];\n if (j > 0) numNeighbors += localInput[i * N + j - 1];\n if (j < N - 1) numNeighbors += localInput[i * N + j + 1];\n localOutput[i * N + j] = (numNeighbors == 1);\n }\n }\n MPI_Gather(localOutput.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Every rank computes the XOR of its own cells\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (my_rank == 0) {\n for (int i = 1; i < num_ranks; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] |= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], 
(end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n // Check the neighbors\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] ^= recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n std::vector local_output(N, 0);\n for (size_t i = 0; i < N; ++i) {\n int count = 0;\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n continue;\n }\n count += input[i * N + j];\n }\n local_output[i] = count == 1 ? 
1 : 0;\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_output.data(), N, MPI_INT, output.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of the output\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Each rank computes its subset of output.\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int rows_per_rank = (N + size - 1) / size;\n int start_row = i * rows_per_rank;\n int end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min((rank + 1) * chunk_size, N);\n\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min((i + 1) * chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n 
int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Gather all the results to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] ^= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own rows\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] ^= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, 
MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of neighbors that are 1.\n std::vector num_neighbors(N * N, 0);\n MPI_Allreduce(MPI_IN_PLACE, num_neighbors.data(), num_neighbors.size(), MPI_INT, MPI_SUM, comm);\n\n // Set the output to 1 if there is exactly one neighbor that is 1.\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n output[i * N + j] = (num_neighbors[i * N + j] == 1);\n }\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes its local part of the output.\n for (size_t i = my_rank; i < N; i += num_procs) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; i++) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] |= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Each rank computes its subset of the output.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks onto rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "cuda", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\n__global__ void cellsXOR(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0 && input[y * N + x - 1]) num_ones++;\n if (x < N - 1 && input[y * N + x + 1]) num_ones++;\n if (y > 0 && input[(y - 1) * N + x]) num_ones++;\n if (y < N - 1 && input[(y + 1) * N + x]) num_ones++;\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int sum = 0;\n if (x > 0) {\n sum += input[y * N + x - 1];\n }\n if (x < N - 1) {\n sum += input[y * N + x + 1];\n }\n if (y > 0) {\n sum += input[(y - 1) * N + x];\n }\n if (y < N - 1) {\n sum += input[(y + 1) * N + x];\n }\n output[y * N + x] = (sum == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[(y * N) + (x - 1)];\n if (x < N - 1) sum += input[(y * N) + (x + 1)];\n if (y > 0) sum += input[((y - 1) * N) + x];\n if (y < N - 1) sum += input[((y + 1) * N) + x];\n output[(y * N) + x] = (sum == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[(y - 1) * N + x - 1];\n if (x < N - 1) sum += input[(y - 1) * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n if (y > 0 && x > 0) sum += input[(y - 1) * N + x - 1];\n if (y > 0 && x < N - 1) sum += input[(y - 1) * 
N + x + 1];\n if (y < N - 1 && x > 0) sum += input[(y + 1) * N + x - 1];\n if (y < N - 1 && x < N - 1) sum += input[(y + 1) * N + x + 1];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n sum += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n int num_ones = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n num_ones += input[r * N + c];\n }\n }\n }\n output[row * N + col] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1] == 1) count++;\n if (x < N - 1 && input[y * N + x + 1] == 1) count++;\n if (y > 0 && input[(y - 1) * N + x] == 1) count++;\n if (y < N - 1 && input[(y + 1) * N + x] == 1) count++;\n output[y * N + x] = (count == 1) ? 
1 : 0;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1]) count++;\n if (x < N - 1 && input[y * N + x + 1]) count++;\n if (y > 0 && input[(y - 1) * N + x]) count++;\n if (y < N - 1 && input[(y + 1) * N + x]) count++;\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1]) count++;\n if (x < N - 1 && input[y * N + x + 1]) count++;\n if (y > 0 && input[(y - 1) * N + x]) count++;\n if (y < N - 1 && input[(y + 1) * N + x]) count++;\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int count = 0;\n if (i > 0) count += input[i - 1 + j * N];\n if (i < N - 1) count += input[i + 1 + j * N];\n if (j > 0) count += input[i + (j - 1) * N];\n if (j < N - 1) count += input[i + (j + 1) * N];\n output[i + j * N] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t start = rank * N / size;\n size_t end = (rank + 1) * N / size;\n\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n // Check the neighbors of input_{i,j}.\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (count == 1);\n }\n }\n\n // Gather the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start = i * N / size;\n size_t end = (i + 1) * N / size;\n for (size_t j = start; j < end; ++j) {\n for (size_t k = 0; k < N; ++k) {\n output[j * N + k] += output[i * N * N + j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n int startRow = rank * nRows;\n int endRow = startRow + nRows;\n if (rank == size - 1) {\n endRow = N;\n }\n int nCols = N;\n int nNeighbors = 0;\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < nCols; j++) {\n nNeighbors = 0;\n if (i - 1 >= 0) {\n nNeighbors += input[(i - 1) * N + j];\n }\n if (i + 1 < N) {\n nNeighbors += input[(i + 1) * N + j];\n }\n if (j - 1 >= 0) {\n nNeighbors += input[i * N + j - 1];\n }\n if (j + 1 < N) {\n nNeighbors += input[i * N + j + 1];\n }\n if (nNeighbors == 1) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int nRowsRecv = N / size;\n int startRowRecv = i * nRowsRecv;\n int endRowRecv = startRowRecv + nRowsRecv;\n if (i == 
size - 1) {\n endRowRecv = N;\n }\n for (int j = startRowRecv; j < endRowRecv; j++) {\n MPI_Recv(&output[j * N], N * sizeof(int), MPI_INT, i, 0, MPI_STATUS_IGNORE);\n }\n }\n } else {\n MPI_Send(&output[startRow * N], (endRow - startRow) * N * sizeof(int), MPI_INT, 0, 0, MPI_STATUS_IGNORE);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nThreads = omp_get_max_threads();\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, startRow + nRowsPerRank);\n std::vector localOutput(N * nRowsPerRank);\n #pragma omp parallel num_threads(nThreads)\n {\n #pragma omp for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int numNeighbors = 0;\n if (i > 0) numNeighbors += input[i - 1][j];\n if (i < N - 1) numNeighbors += input[i + 1][j];\n if (j > 0) numNeighbors += input[i][j - 1];\n if (j < N - 1) numNeighbors += input[i][j + 1];\n localOutput[i * N + j] = (numNeighbors == 1);\n }\n }\n }\n MPI_Gather(&localOutput[0], N * nRowsPerRank, MPI_INT, &output[0], N * nRowsPerRank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + num_procs - 1) / num_procs;\n size_t start_row = my_rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of rows in parallel\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check neighbors\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row 
- start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its own part of the output\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += nprocs) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n std::vector recv(N * N);\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Each rank computes its subset of the output in parallel\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the XOR of its own rows\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += nprocs) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n MPI_Recv(&output[i * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different part of the output\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[(i - 1) * N + j];\n if (i < N - 1) num_neighbors += input[(i + 1) * N + j];\n if (j > 0) num_neighbors += input[i * N + j - 1];\n if (j < N - 1) num_neighbors += input[i * N + j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all other ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N], N * N, 
MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = (N + num_ranks - 1) / num_ranks;\n int start_row = my_rank * rows_per_rank;\n int end_row = std::min(N, (my_rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (my_rank == 0) {\n for (int i = 1; i < num_ranks; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = N / num_ranks;\n int start_row = my_rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j]) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j]) ++num_neighbors;\n if (j > 0 && input[i][j - 1]) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1]) ++num_neighbors;\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all other ranks.\n if (my_rank == 0) {\n for (int rank = 1; rank < num_ranks; ++rank) {\n int 
rows_per_rank = N / num_ranks;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row][0], rows_per_rank * N, MPI_INT, rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n // Other ranks send their results to rank 0.\n MPI_Send(&output[start_row][0], rows_per_rank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local part of the output.\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Each rank computes its subset of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows.\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of the rows in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all the other ranks.\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * 
rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n std::vector local_input(input.begin() + start_row * N, input.begin() + end_row * N);\n std::vector local_output(rows_per_rank * N, 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < rows_per_rank; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += local_input[(i - 1) * N + j];\n if (i < rows_per_rank - 1) count += local_input[(i + 1) * N + j];\n if (j > 0) count += local_input[i * N + j - 1];\n if (j < N - 1) count += local_input[i * N + j + 1];\n local_output[i * N + j] = (count == 1);\n }\n }\n\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows in output.\n size_t rows_per_rank = (N + num_procs - 1) / num_procs;\n size_t start_row = my_rank * rows_per_rank;\n size_t end_row = std::min(N, (my_rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of rows in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks into output on rank 0.\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // TODO: Implement cellsXOR\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows of output\n int rows_per_rank = N / num_procs;\n int start_row = my_rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes its subset of the rows of output in parallel\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. 
Otherwise set it to 0.\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) continue;\n num_neighbors += input(ii, jj);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output(i, j) = 0;\n else\n output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { count += input(ii, jj); }\n }\n }\n count -= input(i, j);\n if (input(i, j) == 1) {\n output(i, j) = (count == 2 || count == 3) ? 
1 : 0;\n } else {\n output(i, j) = (count == 3) ? 1 : 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int numNeighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n numNeighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (numNeighbors < 2 || numNeighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (numNeighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n count += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (count == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n 
}\n } else {\n if (count == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && !(ii == i && jj == j)) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (num_neighbors == 3) { output(i, j) = 1; } else { output(i, j) = 0; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) { num_neighbors += input(x, y); }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) output(i, j) = 0;\n else output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n else output(i, j) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (num_neighbors == 3) { output(i, j) 
= 1; } else { output(i, j) = 0; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n num_neighbors += input(x, y);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output(i, j) = 0;\n else\n output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x < 0 || x >= N || y < 0 || y >= N) continue;\n num_neighbors += input(x, y);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) { count += input(ii, jj); }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 
3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (count == 3) { output(i, j) = 1; } else { output(i, j) = 0; }\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n count += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n count += input[(i + ii) * N + j + jj];\n }\n }\n }\n count -= input[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (int)i + di;\n int jj = (int)j + dj;\n if (ii < 0 || ii >= (int)N || jj < 0 || jj >= (int)N) continue;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = num_neighbors == 2 || num_neighbors == 3;\n } else {\n output[i * N + j] = num_neighbors == 3;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } 
else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n neighbors += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = (i + dx + N) % N;\n int y = (j + dy + N) % N;\n num_neighbors += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "serial", + "prompt": "/* Simulate one generation of Game of Life on `input`. 
Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (size_t di = -1; di <= 1; di++) {\n for (size_t dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ni = i + di;\n int nj = j + dj;\n if (ni < 0 || ni >= N || nj < 0 || nj >= N) {\n continue;\n }\n num_neighbors += input[ni * N + nj];\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int x = -1; x <= 1; ++x) {\n for (int y = -1; y <= 1; ++y) {\n if (x == 0 && y == 0) continue;\n int nx = i + x;\n int ny = j + y;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[nx * N + ny];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if 
(ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = num_neighbors == 2 || num_neighbors == 3 ? 1 : 0;\n } else {\n output[i * N + j] = num_neighbors == 3 ? 1 : 0;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t x = i - 1; x <= i + 1; ++x) {\n for (size_t y = j - 1; y <= j + 1; ++y) {\n if (x >= 0 
&& x < N && y >= 0 && y < N && (x != i || y != j)) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n neighbors += input[(i + ii) * N + j + jj];\n }\n }\n }\n neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n size_t num_neighbors = 0;\n for (size_t x = i - 1; x <= i + 1; ++x) {\n for (size_t y = j - 1; y <= j + 1; ++y) {\n if (x >= 0 && x < N && y >= 0 && y < N && (x != i || y != j)) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int x = i + di;\n int y = j + dj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "hip", + "prompt": "/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\n__global__ void gameOfLife(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n num_neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && 
neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) return;\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n count -= input[y * N + x];\n if (input[y * N + x] == 1) {\n output[y * 
N + x] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[y * N + x] = (count == 3) ? 1 : 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0)\n continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output[y * N + x] = 0;\n else\n output[y * N + x] = 1;\n } else {\n if (num_neighbors == 3)\n output[y * N + x] = 1;\n else\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipThreadIdx().x;\n int y = hipThreadIdx().y;\n int neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = (x + i + N) % N;\n int neighbor_y = (y + j + N) % N;\n neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n if (input[y * N + x] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny 
< N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n if (x + i >= 0 && x + i < N && y + j >= 0 && y + j < N) {\n count += input[(x + i) * N + y + j];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (count < 2 || count > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (count == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. 
Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n_rows = N / size;\n int n_cols = N;\n int n_elements = n_rows * n_cols;\n std::vector local_input(n_elements);\n std::vector local_output(n_elements);\n MPI_Scatter(input.data(), n_elements, MPI_INT, local_input.data(), n_elements, MPI_INT, 0, MPI_COMM_WORLD);\n for (int i = 0; i < n_rows; i++) {\n for (int j = 0; j < n_cols; j++) {\n int n_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int ii_global = i + ii;\n int jj_global = j + jj;\n if (ii_global >= 0 && ii_global < n_rows && jj_global >= 0 && jj_global < n_cols) {\n n_neighbors += local_input[ii_global * n_cols + jj_global];\n }\n }\n }\n if (local_input[i * n_cols + j] == 1) {\n if (n_neighbors < 2 || n_neighbors > 3) {\n local_output[i * n_cols + j] = 0;\n } else {\n local_output[i * n_cols + j] = 1;\n }\n } else {\n if (n_neighbors == 3) {\n local_output[i * n_cols + j] = 1;\n } else {\n local_output[i * n_cols + j] = 0;\n }\n }\n }\n }\n MPI_Gather(local_output.data(), n_elements, MPI_INT, output.data(), n_elements, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of live neighbors for each cell.\n std::vector counts(N * N, 0);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n counts[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n }\n\n // Update the output based on the counts.\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = counts[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 
1 : 0;\n }\n }\n }\n\n // Gather the results on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recv_buf(N * N);\n MPI::COMM_WORLD.Recv(recv_buf.data(), N * N, MPI::INT, i, 0);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(output.data(), N * N, MPI::INT, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks.\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Compute the next generation for the subset of rows assigned to this rank\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = static_cast(i) + di;\n int jj = static_cast(j) + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = N / size;\n\n // Compute the start and end rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = (rank + 1) * rows_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(N * rows_per_rank);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n\n // Check the neighbors of the current cell\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n\n // Check if the neighbor is within the grid\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), N * rows_per_rank, MPI_INT,\n output.data(), N * rows_per_rank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of rows\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes the next generation for its subset of rows\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N && input[ni * N + nj]) {\n num_neighbors++;\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j]) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n for (size_t i = 
start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_input(input.begin() + rank * N * N, input.begin() + (rank + 1) * N * N);\n std::vector local_output(N * N, 0);\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += local_input[ii * N + jj];\n }\n }\n }\n if (local_input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n local_output[i * N + j] = 1;\n }\n }\n }\n }\n\n MPI_Gather(local_output.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output.\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the 8 neighbors of the cell.\n for 
(int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life.\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks to rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of the rows.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = static_cast(i) + di;\n int neighbor_j = static_cast(j) + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n if (rank != 0) {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n MPI_Recv(&output[start_row * N], rows_per_rank * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * N);\n\n // Compute the local results\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n\n // Check the neighbors of the current cell\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n\n // Check if the neighbor is within the grid\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT, output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (size_t ii = std::max(0, (int)i - 1); ii <= std::min((int)N - 1, (int)i + 1); ii++) {\n for (size_t jj = std::max(0, (int)j - 1); jj <= std::min((int)N - 1, (int)j + 1); jj++) {\n if (input[ii * N + jj] == 1) {\n num_neighbors++;\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of live neighbors for each cell.\n std::vector neighbor_counts(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t di = -1; di <= 1; di++) {\n for (size_t dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n neighbor_counts[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Update the output based on the neighbor counts.\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbor_counts[i * N + j] == 2 || neighbor_counts[i * N + j] == 3);\n } else {\n output[i * N + j] = (neighbor_counts[i * N + j] == 3);\n }\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbors == 2 || neighbors == 3);\n } else {\n output[i * N + j] = (neighbors == 3);\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n std::vector next_gen(N * N);\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j]) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of the output\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, 
MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = N / size;\n\n // Compute the starting and ending rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = (rank + 1) * rows_per_rank;\n\n // Compute the number of columns each rank is responsible for\n int cols_per_rank = N / size;\n\n // Compute the starting and ending columns for this rank\n int start_col = rank * cols_per_rank;\n int end_col = (rank + 1) * cols_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * N, 0);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = start_col; j < end_col; j++) {\n int num_neighbors = 0;\n // Check the neighbors of the current cell\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n\n // Apply the rules of the game of life\n if (input[i][j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT, output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "cuda", + "prompt": "/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use CUDA to compute in parallel. 
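The MPI entry above records outputs that mostly row-partition the grid, and several of them implicitly assume N divides evenly by the number of ranks. As a point of comparison only, here is a minimal sketch of one generation with row-wise partitioning that gathers ragged row blocks with MPI_Gatherv. The function name gameOfLifeRowBlocked and the blocking scheme are illustrative assumptions, not the benchmark's reference solution.

// Illustrative only: one Game of Life generation with row-wise MPI
// partitioning, written as commentary on the recorded outputs above.
// MPI_Gatherv lets N be any size relative to the rank count.
#include <mpi.h>
#include <algorithm>
#include <vector>

void gameOfLifeRowBlocked(std::vector<int> const &input, std::vector<int> &output, size_t N) {
    int rank = 0, size = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int n = static_cast<int>(N);
    const int rowsPerRank = (n + size - 1) / size;
    const int startRow = std::min(rank * rowsPerRank, n);
    const int endRow = std::min(startRow + rowsPerRank, n);

    // Every rank holds the full input (as the prompt states), so each rank
    // can apply the update rule to its own block of rows independently.
    std::vector<int> local((endRow - startRow) * n, 0);
    for (int i = startRow; i < endRow; ++i) {
        for (int j = 0; j < n; ++j) {
            int alive = 0;
            for (int di = -1; di <= 1; ++di) {
                for (int dj = -1; dj <= 1; ++dj) {
                    if (di == 0 && dj == 0) continue;
                    const int ni = i + di, nj = j + dj;
                    if (ni >= 0 && ni < n && nj >= 0 && nj < n) alive += input[ni * n + nj];
                }
            }
            const int self = input[i * n + j];
            local[(i - startRow) * n + j] = self ? (alive == 2 || alive == 3) : (alive == 3);
        }
    }

    // Gather the (possibly ragged) row blocks into output on rank 0.
    std::vector<int> counts(size), displs(size);
    for (int r = 0; r < size; ++r) {
        const int s = std::min(r * rowsPerRank, n);
        const int e = std::min(s + rowsPerRank, n);
        counts[r] = (e - s) * n;
        displs[r] = s * n;
    }
    MPI_Gatherv(local.data(), static_cast<int>(local.size()), MPI_INT,
                output.data(), counts.data(), displs.data(), MPI_INT, 0, MPI_COMM_WORLD);
}

After the gather, output on rank 0 holds the full next generation; the receive arguments are ignored on the other ranks.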
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\n__global__ void gameOfLife(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if 
(input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n 
}\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = threadIdx.x;\n int y = threadIdx.y;\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = (x + i + N) % N;\n int neighbor_y = (y + j + N) % N;\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "_", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n num_neighbors -= input[x * N + y];\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = 
blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) {\n return;\n }\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of input. 
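The CUDA prompt above states that the kernel is launched on an NxN grid of threads, but the recorded outputs contain only kernel bodies. The sketch below shows generic host code that could drive such a kernel under stated assumptions: the 16x16 block shape, the stepOnDevice wrapper name, and the omission of error checking are illustrative choices, not the benchmark driver's actual code. The bounds guard if (x < N && y < N) seen in most of the recorded kernels is what keeps the rounded-up grid safe when N is not a multiple of the block dimensions.

// Illustrative only: generic CUDA host code driving a gameOfLife kernel
// with the signature used in the prompt above. Block shape and wrapper
// name are assumptions; error checking is omitted for brevity.
#include <cuda_runtime.h>
#include <vector>

__global__ void gameOfLife(const int *input, int *output, size_t N);  // bodies recorded above

void stepOnDevice(const std::vector<int> &hostIn, std::vector<int> &hostOut, size_t N) {
    int *dIn = nullptr, *dOut = nullptr;
    const size_t bytes = N * N * sizeof(int);
    cudaMalloc(reinterpret_cast<void **>(&dIn), bytes);
    cudaMalloc(reinterpret_cast<void **>(&dOut), bytes);
    cudaMemcpy(dIn, hostIn.data(), bytes, cudaMemcpyHostToDevice);

    const unsigned bx = 16, by = 16;                       // assumed block shape
    const dim3 block(bx, by);
    const dim3 grid(static_cast<unsigned>((N + bx - 1) / bx),
                    static_cast<unsigned>((N + by - 1) / by));
    gameOfLife<<<grid, block>>>(dIn, dOut, N);             // one generation

    cudaMemcpy(hostOut.data(), dOut, bytes, cudaMemcpyDeviceToHost);
    cudaFree(dIn);
    cudaFree(dOut);
}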
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the rank of the current process.\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the number of rows that each process will handle.\n int rows_per_process = N / MPI_Comm_size(MPI_COMM_WORLD);\n\n // Compute the starting and ending rows for the current process.\n int start_row = rank * rows_per_process;\n int end_row = start_row + rows_per_process;\n\n // Compute the number of columns in the input grid.\n int num_cols = input.size() / N;\n\n // Create a buffer to store the results for the current process.\n std::vector buffer(N * num_cols);\n\n // Iterate over the rows assigned to the current process.\n for (int i = start_row; i < end_row; ++i) {\n // Iterate over the columns in the input grid.\n for (int j = 0; j < num_cols; ++j) {\n // Compute the number of live neighbors for the current cell.\n int num_live_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n // Skip the current cell.\n if (ii == i && jj == j) {\n continue;\n }\n\n // Check if the neighbor cell is alive.\n if (input[ii * num_cols + jj] == 1) {\n ++num_live_neighbors;\n }\n }\n }\n\n // Apply the rules of the Game of Life to the current cell.\n if (input[i * num_cols + j] == 1) {\n if (num_live_neighbors < 2 || num_live_neighbors > 3) {\n buffer[i * num_cols + j] = 0;\n } else {\n buffer[i * num_cols + j] = 1;\n }\n } else {\n if (num_live_neighbors == 3) {\n buffer[i * num_cols + j] = 1;\n } else {\n buffer[i * num_cols + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all processes.\n MPI_Gather(buffer.data(), N * num_cols, MPI_INT, output.data(), N * num_cols, MPI_INT, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = (N + size - 1) / size;\n int start = rank * local_size;\n int end = std::min(start + local_size, N);\n\n std::vector local_input(input.begin() + start * N, input.begin() + end * N);\n std::vector local_output(local_input.size());\n\n #pragma omp parallel for\n for (int i = 0; i < local_input.size(); i++) {\n int x = i / N;\n int y = i % N;\n int count = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int nx = x + dx;\n int ny = y + dy;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += local_input[nx * N + ny];\n }\n }\n }\n if (local_input[i] == 1) {\n if (count < 2 || count > 3) {\n local_output[i] = 0;\n } else {\n local_output[i] = 1;\n }\n } else {\n if (count == 3) {\n local_output[i] = 1;\n } else {\n local_output[i] = 0;\n }\n }\n }\n\n MPI_Gather(local_output.data(), local_output.size(), MPI_INT, output.data(), local_output.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of the rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute 
the next generation for the subset of rows\n std::vector next_gen(N * N, 0);\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Compute the next generation for the subset of rows\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n // Count the number of live neighbors\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N && input[x * N + y]) count++;\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j]) {\n if (count < 2 || count > 3) output[i * N + j] = 0;\n else output[i * N + j] = 1;\n } else {\n if (count == 3) output[i * N + j] = 1;\n else output[i * N + j] = 0;\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a chunk of the output\n size_t chunk_size = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min((rank + 1) * chunk_size, N);\n\n // Compute the chunk of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t ii = std::max(0, (int)i - 1); 
ii <= std::min(N - 1, (int)i + 1); ++ii) {\n for (size_t jj = std::max(0, (int)j - 1); jj <= std::min(N - 1, (int)j + 1); ++jj) {\n num_neighbors += input[ii * N + jj];\n }\n }\n num_neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min((i + 1) * chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of live neighbors for each cell.\n std::vector neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Update the state of each cell.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (input[i * N + j] == 1) {\n if (neighbors[i * N + j] < 2 || neighbors[i * N + j] > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors[i * N + j] == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = N / nprocs;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Compute the next generation for the subset of rows\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int ii = i - 
1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize output to 0\n output.assign(N * N, 0);\n\n // Get rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Divide the grid among the processes\n int rows_per_proc = N / nprocs;\n int start_row = rank * rows_per_proc;\n int end_row = start_row + rows_per_proc;\n\n // Compute the next generation for each row in parallel\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n // Count the number of live neighbors\n int num_neighbors = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n if (k == 0 && l == 0) continue;\n int neighbor_row = i + k;\n int neighbor_col = j + l;\n if (neighbor_row >= 0 && neighbor_row < N && neighbor_col >= 0 && neighbor_col < N) {\n num_neighbors += input[neighbor_row * N + neighbor_col];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n }\n }\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[start_row * N], rows_per_proc * N, MPI_INT,\n &output[0], rows_per_proc * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n #pragma omp parallel for\n for (int i = 0; i < N; ++i) {\n for (int j = 0; j < N; ++j) {\n int count = 0;\n // Count the number of live neighbors\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input[x * N + y];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n for (int di = -1; di <= 1; 
di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its subgrid's neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbors == 2 || neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (neighbors == 3) ? 1 : 0;\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n\n std::vector localInput(nRows * N);\n std::vector localOutput(nRows * N);\n\n MPI_Scatter(input.data(), nRows * N, MPI_INT, localInput.data(), nRows * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < nRows; i++) {\n for (int j = 0; j < N; j++) {\n int count = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int ni = i + ii;\n int nj = j + jj;\n if (ni >= 0 && ni < nRows && nj >= 0 && nj < N) {\n count += localInput[ni * N + nj];\n }\n }\n }\n if (localInput[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n localOutput[i * N + j] = 0;\n } else {\n localOutput[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n localOutput[i * N + j] = 1;\n } else {\n localOutput[i * N + j] = 0;\n }\n }\n }\n }\n\n MPI_Gather(localOutput.data(), nRows * N, MPI_INT, output.data(), nRows * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes the number of live neighbors for each cell in its subset of rows\n std::vector neighbor_counts(N * N, 0);\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n // Count the number of live neighbors for cell (i, j)\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n count += input[ii * N + jj];\n }\n }\n }\n neighbor_counts[i * N + j] = count;\n }\n }\n\n // Each rank updates its subset of rows of the output based on the neighbor counts\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int count = neighbor_counts[i * N + j];\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks into the output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, 
MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows assigned to this rank\n std::vector next_gen(N * N);\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = static_cast(i) + di;\n int neighbor_j = static_cast(j) + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks into the output on rank 0\n MPI_Gather(next_gen.data(), N * rows_per_rank, MPI_INT,\n output.data(), N * rows_per_rank, MPI_INT,\n 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its portion of the grid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Count the number of live neighbors for cell (i, j).\n int neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n neighbors += input[ii * N + jj];\n }\n }\n }\n local_neighbors[i * N + j] = neighbors;\n }\n }\n\n // Each rank sends its portion of the neighbor counts to rank 0.\n std::vector global_neighbors(N * N);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell based on the number of live neighbors.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n", + "_t", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a chunk of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Compute the next generation for the assigned rows\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || 
num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank != 0) {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the next generation for the subset of rows\n std::vector next_gen(N * N, 0);\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n next_gen[i * N + j] = 0;\n } else {\n next_gen[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n next_gen[i * N + j] = 1;\n } else {\n next_gen[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its local grid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Reduce(local_neighbors.data(), global_neighbors.data(), N * N, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(Kokkos::View &input, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n KokConfigService::instance().set_num_teams(1);\n KokConfigService::instance().set_num_nodes(1);\n KokConfigService::instance().set_num_ranks_per_node(1);\n KokConfigService::instance().set_num_devices_per_rank(1);\n KokConfigService::instance().set_device_type(\"CPU\");\n\n Kok\u30b3\u30af::TeamPolicy policy(Kok\u30b3\u30af::TeamThreadLayout::RowMajor);\n Kok\u30b3\u30af::Team team(policy);\n\n team.execute_task([&](Kok\u30b3\u30af::TeamMember rank) {\n const int num_elements = input.extent(0);\n for (int i = 1; i < num_elements - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n });\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n using Kokkos::TeamPolicy;\n using Kokkos::WorkSpace;\n\n const int width = input.extent(0);\n const int height = input.extent(1);\n\n Kokkos::TeamPolicy policy(Kokkos::RangePolicy(0, width * height));\n Kokkos::WorkSpace ws;\n\n parallel_for(policy, KOKKOS_LAMBDA(const int i) {\n const int x = i % width;\n const int y = i / width;\n\n if (x == 0 || x == width - 1 || y == 0 || y == height - 1) {\n output(x, y) = input(x, y);\n } else {\n output(x, y) = (input(x - 1, y) + input(x + 1, y) + input(x, y - 1) + input(x, y + 1)) / 4.0;\n }\n }, ws);\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kok c\u00f3c::parallel_for(\"jacobi1D\", 0, output.extent(0), KOK c\u00f3c::RangePolicy>(KOK c\u00f3c::get_num_threads()),\n KOK c\u00f3c::KOK c\u00f3c_LAMBDA(int64_t i) {\n if (i == 0) {\n output[i] = input[i] / 3;\n } else if (i == output.extent(0) - 1) {\n output[i] = input[i] / 3;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n const auto N = input.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == N - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy<>(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 
2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n const auto N = input.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = 0;\n } else if (i == N - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", input.size(), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n const auto exec_space = Kokkos::DefaultExecutionSpace();\n const auto exec_policy = Kokkos::RangePolicy(0, input.extent(0));\n Kokkos::parallel_for(exec_policy, KOKKOS_LAMBDA(const int &i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(input.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n double left = i > 0 ? input(i - 1) : 0;\n double center = input(i);\n double right = i < input.extent(0) - 1 ? input(i + 1) : 0;\n output(i) = (left + center + right) / 3;\n });\n}\n", + "Kokkoss vector space.", + "\n\n Kokkos::parallel_for(input.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n const int num_elements = input.extent(0);\n Kokkos::parallel_for(num_elements, KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == num_elements - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n });\n}\n", + "\n\n const int num_elements = input.extent(0);\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::Range(0, num_elements), [=](int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == num_elements - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", 
Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < static_cast(input.size()) - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n output[0] = (input[0] + input[1]) / 2;\n output[input.size() - 1] = (input[input.size() - 2] + input[input.size() - 1]) / 2;\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp 
parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < static_cast(input.size()) - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "serial", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 0; i < input.size(); i++) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i + 1 < input.size() ? input[i + 1] : 0);\n output[i] = output[i] / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n output[0] = (input[0] + input[1]) / 2;\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n output[input.size() - 1] = (input[input.size() - 2] + input[input.size() - 1]) / 2;\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < input.size() - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? 
input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = (i > 0) ? input[i - 1] : 0;\n double right = (i < input.size() - 1) ? input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "hip", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\n__global__ void jacobi1D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n output[idx] = (idx > 0 ? input[idx - 1] : 0) + input[idx] + (idx < N - 1 ? input[idx + 1] : 0);\n output[idx] /= 3;\n }\n}\n", + "\n int i = hipThreadIdx_x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int tid = hipGetGlobalThreadIdx().x;\n if (tid < N) {\n output[tid] = (input[tid - 1] + input[tid] + input[tid + 1]) / 3;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n\n if (i < N) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < N - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] +\n (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3.0;\n }\n}\n", + "\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = (tid > 0 ? input[tid - 1] : 0) + input[tid] +\n (tid < N - 1 ? input[tid + 1] : 0);\n output[tid] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? 
input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\nfor (size_t i = 1; i < N - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_output(local_n);\n\n for (int i = start; i < end; i++) {\n if (i == 0) {\n local_output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == n - 1) {\n local_output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n local_output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n std::vector local_input(input.begin() + start, 
input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * local_size], local_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], local_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n output.resize(n);\n\n MPI_Datatype vec_type;\n 
MPI_Type_vector(n, 1, 1, MPI_DOUBLE, &vec_type);\n MPI_Type_commit(&vec_type);\n\n MPI_Allgather(&input[0], 1, vec_type, &output[0], 1, vec_type, MPI_COMM_WORLD);\n\n for (int i = 1; i < n - 1; i++) {\n output[i] = (output[i - 1] + output[i] + output[i + 1]) / 3;\n }\n\n MPI_Type_free(&vec_type);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int remainder = input.size() % size;\n\n std::vector local_input(local_size + (rank < remainder ? 1 : 0));\n std::vector local_output(local_size);\n\n MPI_Scatter(input.data(), local_size + (rank < remainder ? 1 : 0), MPI_DOUBLE, local_input.data(), local_size + (rank < remainder ? 1 : 0), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n for (int i = 0; i < local_size; i++) {\n local_output[i] = (local_input[std::max(0, i - 1)] + local_input[std::min(i + 1, local_size - 1)] + local_input[i]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * chunk], chunk, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n for (int i = start; i < end; i++) {\n double left = (i > 0) ? input[i - 1] : 0;\n double right = (i < n - 1) ? input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * local_n;\n int end = start + local_n;\n MPI_Recv(&output[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], local_n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = (n + size - 1) / size;\n int start = rank * chunk_size;\n int end = std::min(start + chunk_size, n);\n\n for (int i = start; i < end; i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double right = i < n - 1 ? 
input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * chunk_size;\n int end = std::min(start + chunk_size, n);\n MPI_Recv(&output[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_size = n / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n output.resize(n);\n\n // Compute the local part of the output\n for (int i = 1; i < n - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n\n // Send the local part of the output to rank 0\n MPI_Gather(&output[1], n - 2, MPI_DOUBLE, &output[1], n - 2, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Replace the local part of the output with 0 on all ranks except rank 0\n if (rank != 0) {\n for (int i = 1; i < n - 1; i++) {\n output[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; ++i) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n 
std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n local_output[0] = (local_input[0] + local_input[1]) / 2;\n local_output[local_n - 1] = (local_input[local_n - 2] + local_input[local_n - 1]) / 2;\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "cuda", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\n__global__ void jacobi1D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n } else {\n output[i] = 0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = (tid > 0 ? input[tid - 1] : 0) + input[tid] + (tid < N - 1 ? 
input[tid + 1] : 0);\n output[tid] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n #pragma omp parallel for\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n std::vector local_output(local_n);\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (input[start + i - 1] + input[start + i] + input[start + i + 1]) / 3.0;\n }\n MPI_Gather(&local_output[0], local_n, MPI_DOUBLE, &output[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n\n std::vector local_output(chunk_size);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n local_output[i] = (input[i] + input[i + 1]) / 3;\n } else if (i == n - 1) {\n local_output[i] = (input[i - 1] + input[i]) / 3;\n } else {\n local_output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(local_output.data(), chunk_size, MPI_DOUBLE, output.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n\n #pragma parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == input.size() - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), chunk_size, MPI_DOUBLE, output.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per rank\n int elements_per_rank = input.size() / size;\n\n // Compute the start and end indices for this rank\n int start_index = rank * elements_per_rank;\n int end_index = 
(rank + 1) * elements_per_rank;\n\n // Compute the local output\n std::vector local_output(elements_per_rank);\n #pragma omp parallel for\n for (int i = 0; i < elements_per_rank; i++) {\n int global_index = start_index + i;\n if (global_index == 0 || global_index == input.size() - 1) {\n local_output[i] = 0;\n } else {\n local_output[i] = (input[global_index - 1] + input[global_index] + input[global_index + 1]) / 3;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(input.size());\n for (int i = 0; i < size; i++) {\n MPI_Recv(&output[i * elements_per_rank], elements_per_rank, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], elements_per_rank, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n std::vector local_output(local_n);\n\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (input[start + i - 1] + input[start + i] + input[start + i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n // Compute the local part of the output\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n // Compute the local part of the output\n #pragma omp parallel for\n for (int i = start; i < end; ++i) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n 
MPI_Gather(output.data() + start, chunk, MPI_DOUBLE, output.data(), chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = (n + size - 1) / size;\n int start = rank * local_n;\n int end = std::min(start + local_n, n);\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "kokkos", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images.\n Store the output of the computation in imageOut.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(Kokkos::View &imageIn, Kokkos::View &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(N, N), [=](int i, int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn(ii, jj) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::min(255, std::max(0, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_clamped = std::clamp(i + ii, 0, N - 1);\n int jj_clamped = std::clamp(j + jj, 0, N - 1);\n sum += imageIn(ii_clamped, jj_clamped) * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut(i, j) = std::clamp(sum, -128, 127);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_in = i + ii;\n int jj_in = j + jj;\n if (ii_in >= 0 && ii_in < N && jj_in >= 0 && jj_in < N) {\n sum += imageIn(ii_in, jj_in) * edgeKernel[ii + 1][jj + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n int val = 0;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n val = imageIn(ii, jj);\n }\n sum += edgeKernel[ki + 1][kj + 1] * val;\n }\n }\n imageOut(i, j) = std::max(0, std::min(sum, 255));\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::min(255, std::max(0, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_clamped = std::clamp(i + ii, 0, N - 1);\n int jj_clamped = std::clamp(j + jj, 0, N - 1);\n sum += imageIn(ii_clamped, jj_clamped) * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut(i, j) = std::clamp(sum, -128, 127);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n });\n}\n", + "\n\n Kok\uf09e::parallel_for(Kok\uf09e::Range(0, N), [&](const Kok\uf09e::Range &r) {\n for (size_t i = r.begin(); i < r.end(); ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n size_t ii = i + k - 1;\n size_t jj = j + l - 1;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn(ii, jj) * edge[k][l];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(0, N, [&](size_t i) {\n Kok\u30b3\u30af::parallel_for(0, N, [&](size_t j) {\n int sum = 0;\n for (int k = 0; k < 3; k++) {\n for (int l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edge[k][l];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "omp", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. 
Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "_", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n if (i + k < 0 || i + k >= N || j + l < 0 || j + l >= N) {\n sum += 0;\n } else {\n sum += imageIn[(i + k) * N + (j + l)] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x >= 0 && x < static_cast(N) && y >= 0 && y < static_cast(N)) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x >= 0 && x < static_cast(N) && y >= 0 && y < static_cast(N)) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += imageIn[(i + k) * N + (j + l)] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma opm parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + (j + l)] * 
edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "serial", + "prompt": "const int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (int i = 1; i < N - 1; 
i++) {\n for (int j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n sum += imageIn[(i + ii) * N + (j + jj)] * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = (int)i + k - 1;\n int y = (int)j + l - 1;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t x = 0; x < 3; x++) {\n for (size_t y = 0; y < 3; y++) {\n int ii = i + x - 1;\n int jj = j + y - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[x][y];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t y = 0; y < N; ++y) {\n for (size_t x = 0; x < N; ++x) {\n int sum = 0;\n for (int ky = -1; ky <= 1; ++ky) {\n for (int kx = -1; kx <= 1; ++kx) {\n int pixelX = x + kx;\n int pixelY = y + ky;\n if (pixelX >= 0 && pixelX < N && pixelY >= 0 && pixelY < N) {\n sum += imageIn[pixelY * N + pixelX] * edgeKernel[ky + 1][kx + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = (int)i + k - 1;\n int y = (int)j + l - 1;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t y = 0; y < N; ++y) {\n for (size_t x 
= 0; x < N; ++x) {\n int sum = 0;\n for (int ky = -1; ky <= 1; ++ky) {\n for (int kx = -1; kx <= 1; ++kx) {\n int inX = x + kx;\n int inY = y + ky;\n if (inX >= 0 && inX < N && inY >= 0 && inY < N) {\n sum += imageIn[inY * N + inX] * edgeKernel[ky + 1][kx + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii < 0 || ii >= static_cast(N) || jj < 0 || jj >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "hip", + "prompt": "__constant__ int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. 
Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\n__global__ void convolveKernel(const int *imageIn, int *imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\nint sum = 0;\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = blockIdx.x + i;\n int y = blockIdx.y + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n sum = max(0, min(255, sum));\n imageOut[blockIdx.y * N + blockIdx.x] = sum;\n}\n", + "\nint sum = 0;\nint index = hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x;\nint x = index % N;\nint y = index / N;\nfor (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (x + i >= 0 && x + i < N && y + j >= 0 && y + j < N) {\n sum += imageIn[(y + j) * N + x + i] * edgeKernel[i + 1][j + 1];\n }\n }\n}\nimageOut[index] = __clamp(sum, 0, 255);\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 
1; j++) {\n int x2 = x + i;\n int y2 = y + j;\n if (x2 >= 0 && x2 < N && y2 >= 0 && y2 < N) {\n sum += imageIn[y2 * N + x2] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = min(255, max(0, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX + neighborY * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = std::max(0, std::min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ < 0 || x_ >= N || y_ < 0 || y_ >= N)\n sum += 0;\n else\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < 
N && y < N) {\n int dotProduct = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n dotProduct += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = dotProduct > 255 ? 255 : (dotProduct < 0 ? 0 : dotProduct);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = 0; i < 3; i++) {\n for (int j = 0; j < 3; j++) {\n int x_offset = x + i - 1;\n int y_offset = y + j - 1;\n if (x_offset >= 0 && x_offset < N && y_offset >= 0 && y_offset < N) {\n sum += imageIn[y_offset * N + x_offset] * edge[i][j];\n }\n }\n }\n imageOut[y * N + x] = __clamp(sum, 0, 255);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_index = x + i;\n int y_index = y + j;\n if (x_index >= 0 && x_index < N && y_index >= 0 && y_index < N) {\n sum += imageIn[y_index * N + x_index] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[x_idx + y_idx * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = max(0, min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX + neighborY * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = hipMin(hipMax(sum, 0), 255);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "mpi", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of imageIn. 
The result is stored in imageOut on rank 0.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image.\n size_t chunkSize = N / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = (rank + 1) * chunkSize;\n\n // Compute the convolution for the assigned rows.\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = (i + 1) * chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n size_t chunkSize = N / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = (rank + 1) * chunkSize;\n\n // Compute the convolution for the assigned rows\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = (i + 1) * chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows and columns in the image\n size_t numRows = imageIn.size() / N;\n size_t numCols = N;\n\n // Compute the number of rows and columns in the output image\n size_t outNumRows = numRows - 2;\n size_t outNumCols = numCols - 2;\n\n // Compute the number of rows and columns in the output image for each rank\n size_t outNumRowsPerRank = outNumRows / size;\n size_t outNumColsPerRank = outNumCols;\n\n // Compute the starting row and column for each rank\n size_t startRow = rank * outNumRowsPerRank;\n size_t startCol = 0;\n\n // Compute the ending row and column for each rank\n size_t endRow = startRow + outNumRowsPerRank;\n size_t endCol = startCol + outNumColsPerRank;\n\n // Allocate memory for the 
output image\n std::vector outImage(outNumRowsPerRank * outNumColsPerRank);\n\n // Compute the convolution for each pixel in the output image\n for (size_t i = startRow; i < endRow; i++) {\n for (size_t j = startCol; j < endCol; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int row = i + k - 1;\n int col = j + l - 1;\n if (row >= 0 && row < numRows && col >= 0 && col < numCols) {\n sum += imageIn[row * numCols + col] * edgeKernel[k][l];\n }\n }\n }\n outImage[i * outNumColsPerRank + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the output images from all ranks\n MPI_Gather(outImage.data(), outNumRowsPerRank * outNumColsPerRank, MPI_INT,\n imageOut.data(), outNumRowsPerRank * outNumColsPerRank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the output image.\n std::vector localImageOut(N * N / size);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the local results into the output image on rank 0.\n if (rank == 0) {\n imageOut.resize(N * N);\n MPI_Gather(localImageOut.data(), N * N / size, MPI_INT, imageOut.data(), N * N / size, MPI_INT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(localImageOut.data(), N * N / size, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localN = N / size;\n int localStart = rank * localN;\n int localEnd = localStart + localN;\n\n std::vector localImageOut(localN * N);\n\n for (int i = localStart; i < localEnd; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), localN * N, MPI_INT, imageOut.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows in imageOut\n size_t rowsPerRank = N / size;\n size_t startRow = rank * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks and store them in imageOut on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n for (size_t j = startRow; j < endRow; ++j) {\n MPI_Recv(&imageOut[j * N], N, MPI_INT, i, 0, MPI_COMM_WORLD, 
MPI_STATUS_IGNORE);\n }\n }\n } else {\n MPI_Send(&imageOut[startRow * N], N * rowsPerRank, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image\n int rowsPerRank = (N + size - 1) / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min(startRow + rowsPerRank, N);\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = std::min(startRow + rowsPerRank, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the assigned rows\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "{", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n int chunkSize = N / size;\n int startRow = rank * chunkSize;\n int endRow = startRow + chunkSize;\n\n // Pad the input image with zeros to handle edge cases\n std::vector paddedImage(N * N);\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n paddedImage[i * N + j] = imageIn[i * N + j];\n }\n }\n\n // Compute the convolution for the assigned rows\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int neighborRow = i + k;\n int neighborCol = j + l;\n if (neighborRow >= 0 && neighborRow < N && neighborCol >= 0 && neighborCol < N) {\n sum += paddedImage[neighborRow * N + neighborCol] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * chunkSize;\n int endRow = 
startRow + chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel's neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = i + k;\n int neighborY = j + l;\n // Handle pixels outside the image boundaries\n if (neighborX < 0 || neighborX >= N || neighborY < 0 || neighborY >= N) {\n sum += 0;\n } else {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of imageOut\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks into imageOut on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localN = N / size;\n int localStart = rank * localN;\n int localEnd = localStart + localN;\n\n std::vector localImageIn(localN * N);\n std::vector localImageOut(localN * N);\n\n MPI_Scatter(imageIn.data(), N * N, MPI_INT, localImageIn.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n for (int i = localStart; i < localEnd; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += localImageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n 
MPI_Gather(localImageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localSize = N / size;\n int remainder = N % size;\n int startIndex = rank * localSize;\n int endIndex = startIndex + localSize;\n if (rank == size - 1) {\n endIndex += remainder;\n }\n\n std::vector localImageIn(localSize * N);\n std::vector localImageOut(localSize * N);\n\n MPI_Scatter(imageIn.data(), N * N, MPI_INT, localImageIn.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n for (int i = startIndex; i < endIndex; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += localImageIn[(x + y * N)];\n }\n }\n }\n localImageOut[i * N + j] = std::min(255, std::max(0, sum * edgeKernel[k + 1][l + 1]));\n }\n }\n\n MPI_Gather(localImageOut.data(), localSize * N, MPI_INT, imageOut.data(), localSize * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of pixels per process\n int pixelsPerProcess = N * N / size;\n\n // Compute the starting and ending indices for this process\n int startIndex = rank * pixelsPerProcess;\n int stopIndex = (rank + 1) * pixelsPerProcess;\n\n // Allocate memory for the local image\n std::vector localImage(pixelsPerProcess);\n\n // Scatter the image data to each process\n MPI_Scatter(&imageIn[0], pixelsPerProcess, MPI_INT, &localImage[0], pixelsPerProcess, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Convolve the local image with the edge kernel\n for (int i = 1; i < N - 1; i++) {\n for (int j = 1; j < N - 1; j++) {\n int dotProduct = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n dotProduct += localImage[i + k][j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::clip(dotProduct, -128, 127);\n }\n }\n\n // Gather the results from each process\n MPI_Gather(&imageOut[startIndex], pixelsPerProcess, MPI_INT, &imageOut[0], pixelsPerProcess, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n size_t chunkSize = (N + size - 1) / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n\n // Compute the convolution for the subset of the image\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n int localN = N / size;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local portion of the image\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n size_t localN = (N + size - 1) / size;\n size_t startRow = rank * localN;\n size_t endRow = std::min(startRow + localN, N);\n\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * localN;\n size_t endRow = std::min(startRow + localN, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n // Compute the dot product of the pixel and its neighbors with the edge kernel\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Calculate the number of rows per process\n int rowsPerProcess = N / size;\n\n // Calculate the starting row for this process\n int startRow = rank * rowsPerProcess;\n\n // Calculate the ending row for this process\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Loop over the rows assigned to this process\n for (int i = startRow; i < endRow; i++) {\n 
// Loop over the columns in the row\n for (int j = 0; j < N; j++) {\n // Calculate the dot product of the pixel and its neighbors with the edge kernel\n int dotProduct = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n // Check if the pixel is within the image boundaries\n if (i + k >= 0 && i + k < N && j + l >= 0 && j + l < N) {\n // Calculate the dot product\n dotProduct += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::max(0, std::min(255, dotProduct));\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(imageOut.data() + startRow * N, rowsPerProcess * N, MPI_INT,\n imageOut.data(), rowsPerProcess * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "cuda", + "prompt": "__constant__ int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\n__global__ void convolveKernel(const int *imageIn, int *imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x;\n int idy = threadIdx.y + blockIdx.y * blockDim.y;\n if (idx < N && idy < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = idx + i;\n int y = idy + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x + y * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[idx + idy * N] = clamp(sum, 0, 255);\n }\n}\n", + "\nint sum = 0;\n for (int i = 0; i < 3; i++) {\n for (int j = 0; j < 3; j++) {\n int x = blockIdx.x * blockDim.x + i - 1;\n int y = blockIdx.y * blockDim.y + j - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i][j];\n }\n }\n }\n imageOut[blockIdx.y * N + blockIdx.x] = max(0, min(255, sum));\n}\n", + "\nint x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x 
* blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = sum > 255 ? 255 : (sum < 0 ? 0 : sum);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) {\n return;\n }\n\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_offset = x + i;\n int y_offset = y + j;\n if (x_offset < 0 || x_offset >= N || y_offset < 0 || y_offset >= N) {\n sum += 0;\n } else {\n sum += imageIn[y_offset * N + x_offset] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x2 = x + i;\n int y2 = y + j;\n if (x2 < 0 || x2 >= N || y2 < 0 || y2 >= N) {\n sum += 0;\n } else {\n sum += edgeKernel[i + 1][j + 1] * imageIn[y2 * N + x2];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y 
= blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_offset = x + i;\n int y_offset = y + j;\n if (x_offset >= 0 && x_offset < N && y_offset >= 0 && y_offset < N) {\n sum += imageIn[y_offset * N + x_offset] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_coord = x + i;\n int y_coord = y + j;\n if (x_coord >= 0 && x_coord < N && y_coord >= 0 && y_coord < N) {\n sum += imageIn[y_coord * N + x_coord] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x + (threadIdx.y + blockIdx.y * blockDim.y) * N;\n if (idx < N * N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = threadIdx.x + i;\n int y = threadIdx.y + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[idx] = max(0, min(255, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + 
"parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of imageIn. The result is stored in imageOut on rank 0.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n int localN = N / size;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local subset of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, (rank + 1) * nRowsPerRank);\n\n std::vector localImageOut(nRowsPerRank * N);\n\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = 0; k < 3; k++) {\n for (int l = 0; l < 3; l++) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), nRowsPerRank * N, MPI_INT,\n imageOut.data(), nRowsPerRank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize imageOut to 0\n imageOut.assign(N * N, 0);\n\n // Each rank computes its own part of the output\n #pragma opm parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel and its neighbors\n for (int ki = 0; ki < 3; ki++) {\n for (int kj = 0; kj < 3; kj++) {\n int ii = i + ki - 1;\n int jj = j + kj - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki][kj];\n }\n 
}\n }\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(imageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n size_t chunkSize = (N + size - 1) / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n\n // Compute the convolution for the assigned rows\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < static_cast(N) && jj >= 0 && jj < static_cast(N)) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image\n size_t rowsPerRank = (N + size - 1) / size;\n size_t startRow = rank * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n // Each rank computes its subset of the output image in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < static_cast(N) && jj >= 0 && jj < static_cast(N)) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a portion of the output image\n int localN = (N + nprocs - 1) / nprocs;\n int startRow = rank * localN;\n int endRow = std::min(startRow + localN, N);\n\n // Compute the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from 
all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n int startRow = i * localN;\n int endRow = std::min(startRow + localN, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the assigned rows\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel's neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = i + k;\n int neighborY = j + l;\n // Check if the neighbor is within the image boundaries\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a portion of the output image\n int localN = N / nprocs;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * 
N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the image\n int rowsPerRank = (N + size - 1) / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min((rank + 1) * rowsPerRank, N);\n\n // Allocate memory for the rank's portion of the output image\n std::vector rankImageOut(rowsPerRank * N);\n\n // Compute the convolution for the rank's subset of rows\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = 0; k < 3; ++k) {\n for (int l = 0; l < 3; ++l) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n rankImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(rankImageOut.data(), rowsPerRank * N, MPI_INT,\n imageOut.data(), rowsPerRank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the dot product of the kernel and the image for each pixel\n #pragma opm parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int dotProduct = 0;\n for (int ki = 0; ki < 3; ki++) {\n for (int kj = 0; kj < 3; kj++) {\n dotProduct += edgeKernel[ki][kj] * imageIn[(i + ki - 1) * N + (j + kj - 1)];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, dotProduct));\n }\n }\n\n // Set the boundaries to 0\n for (size_t i = 0; i < N; i++) {\n imageOut[i] = 0;\n imageOut[i * N] = 0;\n imageOut[i * N + N - 1] = 0;\n }\n for (size_t j = 0; j < N; j++) {\n imageOut[j * N + N - 1] = 0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows\n int rowsPerRank = N / nprocs;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Pad the image with zeros\n std::vector paddedImage(N + 2 * rowsPerRank, 0);\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n paddedImage[i + rowsPerRank + 1][j + rowsPerRank + 1] = imageIn[i * N + j];\n }\n }\n\n // Convolve the kernel with the padded image\n #pragma omp parallel for\n for (int i = startRow + 1; i < endRow + 1; i++) {\n for (int j = 1; j < N + 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += edgeKernel[k + 1][l + 1] * paddedImage[i + k][j + l];\n }\n }\n imageOut[(i - rowsPerRank - 1) * N + j - 1] = 
std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Rank 0 gathers the results from all other ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min((rank + 1) * rowsPerRank, N);\n\n // Each rank computes its subset of the output image\n std::vector localImageOut(N * N);\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(localImageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int localN = N / nprocs;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n std::vector localImageIn(localN * N);\n std::vector localImageOut(localN * N);\n\n MPI_Scatter(imageIn.data(), localN * N, MPI_INT, localImageIn.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < localN; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= localN || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += localImageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), localN * N, MPI_INT, imageOut.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x < 0 || x >= static_cast(N) || y < 0 || y >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edge[k + 
1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image.\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank.\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Each rank computes a portion of the output image\n size_t myStartRow = myRank * N / numRanks;\n size_t myEndRow = (myRank + 1) * N / numRanks;\n\n // Compute the convolution for each pixel in the assigned rows\n #pragma omp parallel for\n for (size_t i = myStartRow; i < myEndRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n\n // Apply the kernel to the pixel and its neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = static_cast(i) + k;\n int neighborY = static_cast(j) + l;\n\n // Handle pixels outside the image boundaries\n if (neighborX < 0 || neighborX >= static_cast(N) || neighborY < 0 || neighborY >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the partial results from all ranks on rank 0\n if (myRank == 0) {\n for (int i = 1; i < numRanks; ++i) {\n size_t startRow = i * N / numRanks;\n size_t endRow = (i + 1) * N / numRanks;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[myStartRow * N], (myEndRow - myStartRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each process computes a subset of the rows\n int rowsPerProcess = N / nprocs;\n int startRow = rank * rowsPerProcess;\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Allocate memory for the local portion of the output image\n std::vector localImageOut(rowsPerProcess * N);\n\n // Perform the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 
1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the local results into the global output image\n MPI_Gather(localImageOut.data(), rowsPerProcess * N, MPI_INT,\n imageOut.data(), rowsPerProcess * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i, const int &j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (i + dx >= 0 && i + dx < N && j + dy >= 0 && j + dy < N) {\n sum += input(i + dx, j + dy);\n }\n }\n }\n output(i, j) = sum / 5;\n });\n}\n", + "Kokkosspecific", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t& i) {\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), 
KOKKOS_LAMBDA(const size_t& j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n sum += input(i + ii, j + jj);\n }\n }\n }\n output(i, j) = sum / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy>(N, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n using team_member = KokKOS::TeamMember;\n using team_view = KokKOS::TeamView;\n using team_view_const = KokKOS::TeamView;\n using view_type = KokKOS::View;\n using view_type_const = KokKOS::View;\n\n auto policy = KokKOS::TeamStriped<>(team_member(0, N), team_member(0, N));\n\n team_view output_team(\"output_team\", output, policy);\n team_view_const input_team(\"input_team\", input, policy);\n\n auto lambda = KOKKOS_LAMBDA(team_member member) {\n const size_t i = member.team_member_rank(0);\n const size_t j = member.team_member_rank(1);\n if (i == 0 || j == 0 || i == N - 1 || j == N - 1) {\n output_team(i, j) = input_team(i, j);\n } else {\n output_team(i, j) = (input_team(i, j - 1) + input_team(i, j + 1) + input_team(i - 1, j) + input_team(i + 1, j) + input_team(i, j)) / 5.0;\n }\n };\n\n auto result = KokKOS::parallel_reduce(\n output_team, lambda, KokKOS::PerTeamReduce(std::plus(), 0.0));\n output_team.wait_until_done();\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), [=](const int i) {\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), [=](const int j) {\n double left = (i == 0) ? 0 : input(i, j - 1);\n double right = (i == N - 1) ? 0 : input(i, j + 1);\n double up = (j == 0) ? 0 : input(i - 1, j);\n double down = (j == N - 1) ? 0 : input(i + 1, j);\n output(i, j) = (left + right + up + down + input(i, j)) / 5.0;\n });\n });\n}\n", + "\nfor (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy>({0, N}, {0, N}), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n sum += (i == 0 || j == 0) ? 0 : input(i - 1, j - 1);\n sum += (i == 0) ? 
0 : input(i - 1, j);\n sum += (i == 0 || j == N - 1) ? 0 : input(i - 1, j + 1);\n sum += (j == 0) ? 0 : input(i, j - 1);\n sum += input(i, j);\n sum += (j == N - 1) ? 0 : input(i, j + 1);\n sum += (i == N - 1 || j == 0) ? 0 : input(i + 1, j - 1);\n sum += (i == N - 1) ? 0 : input(i + 1, j);\n sum += (i == N - 1 || j == N - 1) ? 0 : input(i + 1, j + 1);\n output(i, j) = sum / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = 0;\n if (i > 0) output(i, j) += input(i - 1, j);\n if (i < N - 1) output(i, j) += input(i + 1, j);\n if (j > 0) output(i, j) += input(i, j - 1);\n if (j < N - 1) output(i, j) += input(i, j + 1);\n output(i, j) += input(i, j);\n output(i, j) /= 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[(i - 1) * N + 
j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n output.resize(input.size());\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i - 1][j] + input[i + 1][j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + 
input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "serial", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < N - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n output[i * N + j] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += 
input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t idx = i * N + j;\n if (i > 0) sum += input[idx - N];\n if (i < N - 1) sum += input[idx + N];\n if (j > 0) sum += input[idx - 1];\n if (j < N - 1) sum += input[idx + 1];\n sum += input[idx];\n output[idx] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t idx = i * N + j;\n if (i > 0) sum += input[idx - N];\n if (i < N - 1) sum += input[idx + N];\n if (j > 0) sum += input[idx - 1];\n if (j < N - 1) sum += input[idx + 1];\n sum += input[idx];\n output[idx] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n 
if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "hip", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\n__global__ void jacobi2D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\nint i = blockIdx.x * blockDim.x + threadIdx.x;\nint j = blockIdx.y * blockDim.y + threadIdx.y;\nif (i < N && j < N) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) /\n 5;\n}\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n output[y * N + x] =\n (input[y * N + (x - 1 < 0 ? 0 : x - 1)] +\n input[y * N + (x + 1 >= N ? N - 1 : x + 1)] +\n input[(y - 1 < 0 ? 0 : y - 1) * N + x] +\n input[(y + 1 >= N ? N - 1 : y + 1) * N + x] + input[y * N + x]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = i > 0 ? input[i - 1 + j * N] : 0;\n double right = i < N - 1 ? input[i + 1 + j * N] : 0;\n double up = j > 0 ? input[i + (j - 1) * N] : 0;\n double down = j < N - 1 ? 
input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * blockDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.y;\n\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (i < N && j < N) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n double left = (x > 0) ? input[y * N + x - 1] : 0;\n double right = (x < N - 1) ? input[y * N + x + 1] : 0;\n double up = (y > 0) ? input[(y - 1) * N + x] : 0;\n double down = (y < N - 1) ? 
input[(y + 1) * N + x] : 0;\n double center = input[y * N + x];\n output[y * N + x] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i >= N || j >= N) {\n return;\n }\n double sum = 0;\n if (i > 0) {\n sum += input[i - 1 + j * N];\n }\n if (i < N - 1) {\n sum += input[i + 1 + j * N];\n }\n if (j > 0) {\n sum += input[i + (j - 1) * N];\n }\n if (j < N - 1) {\n sum += input[i + (j + 1) * N];\n }\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i < N && j < N) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i >= N || j >= N) return;\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) /\n 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + ",", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x >= 0 && x < N && y >= 0 && y < N) {\n double left = (x > 0) ? input[y * N + x - 1] : 0;\n double right = (x < N - 1) ? input[y * N + x + 1] : 0;\n double up = (y > 0) ? 
input[(y - 1) * N + x] : 0;\n double down = (y < N - 1) ? input[(y + 1) * N + x] : 0;\n double center = input[y * N + x];\n output[y * N + x] = (left + right + up + down + center) / 5;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t local_N = N / size;\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n if (rank != 0) {\n MPI_Send(&output[start * N], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N * N], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n int nCols = N;\n std::vector localInput(nRows * nCols);\n std::vector localOutput(nRows * nCols);\n MPI_Scatter(input.data(), nRows * nCols, MPI_DOUBLE, localInput.data(), nRows * nCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (int i = 0; i < nRows; i++) {\n for (int j = 0; j < nCols; j++) {\n double left = (j == 0) ? 0 : localInput[i * nCols + j - 1];\n double right = (j == nCols - 1) ? 0 : localInput[i * nCols + j + 1];\n double up = (i == 0) ? 0 : localInput[(i - 1) * nCols + j];\n double down = (i == nRows - 1) ? 
0 : localInput[(i + 1) * nCols + j];\n localOutput[i * nCols + j] = (left + right + up + down + localInput[i * nCols + j]) / 5;\n }\n }\n MPI_Gather(localOutput.data(), nRows * nCols, MPI_DOUBLE, output.data(), nRows * nCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = (rank + 1) * local_N;\n\n // Compute the local output grid\n std::vector local_output(local_N * N, 0);\n\n // Compute the local jacobi stencil\n for (size_t i = 1; i < local_N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n local_output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local output grids into the global output grid\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the average in the local output\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t 
local_N = N / size;\n\n // Compute the offset of the local grid\n size_t offset = rank * local_N;\n\n // Compute the local grid\n std::vector local_input(local_N * N);\n std::vector local_output(local_N * N);\n\n // Copy the relevant part of the input grid to the local grid\n MPI_Scatter(input.data(), local_N * N, MPI_DOUBLE, local_input.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local jacobi iteration\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the output grid on rank 0\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min((rank + 1) * local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N * N], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += 
input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output grid\n std::vector local_output(local_N * N, 0);\n\n // Iterate over the local grid\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output grid\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local output grids into the global output grid\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Iterate over the local grid\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the average in the output grid\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(&output[start][0], local_N * N, MPI_DOUBLE, &output[0][0], local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * 
local_N;\n size_t end = (rank + 1) * local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N, 0);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min(start_row + local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N);\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min((rank + 1) * local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < N - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n\n // Store the average in the output\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n 
// Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows and columns per rank\n int rows_per_rank = N / size;\n int cols_per_rank = N;\n\n // Compute the starting row and column for this rank\n int start_row = rank * rows_per_rank;\n int start_col = 0;\n\n // Compute the ending row and column for this rank\n int end_row = start_row + rows_per_rank;\n int end_col = start_col + cols_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * cols_per_rank);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = start_col; j < end_col; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local results\n local_output[(i - start_row) * cols_per_rank + (j - start_col)] = avg;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), rows_per_rank * cols_per_rank, MPI_DOUBLE, output.data(), rows_per_rank * cols_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_size = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_size;\n size_t end = (rank + 1) * local_size;\n\n // Compute the local output\n std::vector local_output(local_size * N);\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_size * N, MPI_DOUBLE, output.data(), local_size * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the number of rows per process\n int rows_per_proc = N / num_procs;\n\n // Compute the starting and ending rows for this process\n int start_row = rank * rows_per_proc;\n int end_row = (rank + 1) * rows_per_proc;\n\n // Iterate over the rows assigned to this process\n for (int i = start_row; i < end_row; ++i) {\n // Iterate over the columns\n for (int j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the output\n output[i * N + j] = avg;\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), N * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible 
for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "cuda", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\n__global__ void jacobi2D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] +\n input[i * N + j + N] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i == 0) ? 0 : input[i - 1 + j * N];\n double right = (i == N - 1) ? 0 : input[i + 1 + j * N];\n double up = (j == 0) ? 0 : input[i + (j - 1) * N];\n double down = (j == N - 1) ? 0 : input[i + (j + 1) * N];\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i == 0) ? 0 : input[i - 1 + j * N];\n double right = (i == N - 1) ? 0 : input[i + 1 + j * N];\n double up = (j == 0) ? 
0 : input[i + (j - 1) * N];\n double down = (j == N - 1) ? 0 : input[i + (j + 1) * N];\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5.0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i >= N || j >= N) return;\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n}\n", + "\nint i = blockIdx.x + blockIdx.y * blockDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] +\n input[i * N + j - 1] + input[i * N + j + 1] +\n input[i * N + j]) /\n 5;\n }\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i >= N || j >= N) return;\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + (j - 1 + N) % N] + input[i * N + (j + 1) % N] +\n input[(i - 1 + N) % N * N + j] + input[(i + 1) % N * N + j] +\n input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * gridDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.x;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? 
input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n output[y * N + x] = (input[y * N + x - 1] + input[y * N + x + 1] + input[(y - 1) * N + x] + input[(y + 1) * N + x] + input[y * N + x]) / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n double sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n sum += input[y * N + x];\n output[y * N + x] = sum / 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockDim.x * gridDim.x;\n int j = blockIdx.y + blockDim.y * gridDim.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different portion of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the result in the output\n output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local grid\n std::vector local_input(local_N * N);\n std::vector local_output(local_N * N);\n for (size_t i = 0; i < local_N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_input[i * N + j] = input[(start + i) * N + j];\n }\n }\n\n // Compute the local jacobi stencil\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n_local = N / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n for (int j = 0; j < N; j++) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * n_local * N], n_local * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start * N], n_local * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local grid\n std::vector local_input(input.begin() + start * N, input.begin() + end * N);\n std::vector local_output(local_N * N);\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] +\n local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] +\n local_input[i * N + j]) /\n 5.0;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min(start_row + local_N, N);\n\n // Create a local copy of the input\n std::vector local_input(local_N * N);\n MPI_Scatter(input.data(), N * N, MPI_DOUBLE, local_input.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Create a local copy of the output\n std::vector local_output(local_N * N);\n\n // Compute the local part of the jacobi stencil\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < end_row - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n size_t start_row = rank * local_N;\n size_t end_row = start_row + local_N;\n\n // Compute the local part of the output\n std::vector local_output(local_N * N, 0);\n #pragma omp parallel for\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global 
output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local portion of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n // Check if we are on the boundary\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n // If we are on the boundary, set the value to 0\n local_output[i * N + j] = 0;\n } else {\n // Otherwise, compute the average of the neighbors\n sum += input[(i - 1) * N + j];\n sum += input[(i + 1) * N + j];\n sum += input[i * N + (j - 1)];\n sum += input[i * N + (j + 1)];\n sum += input[i * N + j];\n local_output[i * N + j] = sum / 5;\n }\n }\n }\n\n // Gather the local results into the output on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the jacobi stencil for the subset of rows assigned to this rank\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the local subset of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int 
rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local size of the grid\n size_t local_N = N / nprocs;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n // Check if we are at the boundary\n if (i == 0) {\n sum += 0;\n } else {\n sum += input[i - 1][j];\n }\n if (i == N - 1) {\n sum += 0;\n } else {\n sum += input[i + 1][j];\n }\n if (j == 0) {\n sum += 0;\n } else {\n sum += input[i][j - 1];\n }\n if (j == N - 1) {\n sum += 0;\n } else {\n sum += input[i][j + 1];\n }\n sum += input[i][j];\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[start][0], local_N * N, MPI_DOUBLE, &output[0][0], local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the local subset of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n local_output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output on rank 0\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE, output.data(), output.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local size of the grid\n size_t local_size = N / nprocs;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_size;\n size_t end = (rank + 1) * local_size;\n\n // Compute the local grid\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size * local_size);\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = 1; i < local_size - 1; i++) {\n for (size_t j = 1; j < local_size - 1; j++) {\n local_output[i * local_size + j] = (local_input[i * local_size + j - 1] + local_input[i * local_size + j + 1] +\n local_input[(i - 1) * local_size + j] + local_input[(i + 1) * local_size + j] +\n local_input[i * local_size + j]) /\n 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_size * local_size, MPI_DOUBLE, output.data(), local_size * local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local bounds\n size_t local_N = N / size;\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute local jacobi2D\n #pragma omp parallel for\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i][j] = 
sum / 5;\n }\n }\n\n // Gather results to rank 0\n if (rank != 0) {\n MPI_Send(&output[start][0], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N][0], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_size = (N + size - 1) / size;\n size_t start = rank * local_size;\n size_t end = std::min((rank + 1) * local_size, N);\n\n // Compute the local part of the output\n std::vector local_output(local_size * N);\n #pragma omp parallel for\n for (size_t i = 0; i < local_size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < local_size - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_size * N, MPI_DOUBLE, output.data(), local_size * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N;\n int nCols = N;\n int nRowsPerRank = nRows / size;\n int startRow = rank * nRowsPerRank;\n int endRow = (rank + 1) * nRowsPerRank;\n if (rank == size - 1) {\n endRow = nRows;\n }\n #pragma omp parallel for collapse(2)\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < nCols; j++) {\n double left = (j == 0) ? 0 : input[i * nCols + j - 1];\n double right = (j == nCols - 1) ? 0 : input[i * nCols + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * nCols + j];\n double down = (i == nRows - 1) ? 
0 : input[(i + 1) * nCols + j];\n output[i * nCols + j] = (left + right + up + down + input[i * nCols + j]) / 5;\n }\n }\n if (rank != 0) {\n MPI_Send(&output[startRow * nCols], (endRow - startRow) * nCols, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * nRowsPerRank;\n int endRow = (i + 1) * nRowsPerRank;\n if (i == size - 1) {\n endRow = nRows;\n }\n MPI_Recv(&output[startRow * nCols], (endRow - startRow) * nCols, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) { sum += input[i - 1][j]; }\n if (i < N - 1) { sum += input[i + 1][j]; }\n if (j > 0) { sum += input[i][j - 1]; }\n if (j < N - 1) { sum += input[i][j + 1]; }\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the local portion of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = N / nprocs;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = start_row + rows_per_rank;\n\n // Create a buffer to store the results for this rank\n std::vector local_output(N * rows_per_rank);\n\n // Compute the Jacobi stencil for this rank\n #pragma omp parallel for\n for (size_t i 
= start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n MPI_Gather(local_output.data(), N * rows_per_rank, MPI_DOUBLE, output.data(), N * rows_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + } +] \ No newline at end of file diff --git a/results/015cff6f/gemini-pro_temp0.2/cuda.json b/results/015cff6f/gemini-pro_temp0.2/cuda.json new file mode 100644 index 0000000..7b08c3a --- /dev/null +++ b/results/015cff6f/gemini-pro_temp0.2/cuda.json @@ -0,0 +1,32732 @@ +[ + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); j++) {\n for (int k = j+1; k < points.extent(0); k++) {\n double area = triangleArea(points(i), points(j), points(k));\n minArea = std::min(minArea, area);\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy>(\n 0, points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const Kokkos::IndexType i, double &minArea) {\n for (Kokkos::IndexType j = i + 1; j < points.extent(0); ++j) {\n for (Kokkos::IndexType k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n },\n minArea);\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "0.0)", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n double area = 
triangleArea(points(i), points(j), points(k));\n minArea = std::min(minArea, area);\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(\"smallestArea\", points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); j++) {\n for (int k = j+1; k < points.extent(0); k++) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using exec_space = Kokkos::DefaultExecutionSpace;\n using view_type = Kokkos::View;\n\n const int num_points = points.extent(0);\n view_type areas(\"areas\", num_points * num_points * num_points);\n\n parallel_for(policy_type(0, num_points * num_points * num_points), KOKKOS_LAMBDA(const int i) {\n const int i1 = i / (num_points * num_points);\n const int i2 = (i / num_points) % num_points;\n const int i3 = i % num_points;\n if (i1 != i2 && i1 != i3 && i2 != i3) {\n areas(i) = triangleArea(points(i1), points(i2), points(i3));\n } else {\n areas(i) = std::numeric_limits::max();\n }\n });\n\n return *min_element(areas.data(), areas.data() + areas.extent(0));\n}\n", + "\n\n double area = 0;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return area;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(\"smallestArea\", policy, KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n \"smallestArea\",\n policy_type(0, points.extent(0) * points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const int& i) {\n const int i1 = i / (points.extent(0) * points.extent(0));\n const int i2 = (i / points.extent(0)) % points.extent(0);\n const int i3 = i % points.extent(0);\n if (i1 != i2 && i2 != i3 && i1 != i3) {\n minArea = std::min(minArea, triangleArea(points(i1), points(i2), points(i3)));\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), 
points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n policy_type(0, points.extent(0) - 2),\n KOKKOS_LAMBDA(const int i) {\n for (int j = i + 1; j < points.extent(0) - 1; ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(\"smallestArea\", points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const int& i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n \"smallestArea\",\n policy_type(0, points.extent(0) * points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const int i) {\n const int i1 = i / (points.extent(0) * points.extent(0));\n const int i2 = (i / points.extent(0)) % points.extent(0);\n const int i3 = i % points.extent(0);\n if (i1 != i2 && i1 != i3 && i2 != i3) {\n minArea = std::min(minArea, triangleArea(points(i1), points(i2), points(i3)));\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = 
std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n#pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for 
(size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double total_area = 0;\n #pragma omp parallel for reduction(+: total_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n total_area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return total_area;\n}\n", + "\n\n double area = 0;\n#pragma omp parallel for reduction(+:area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return area;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min:minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 
5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], 
points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "<", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n if (n < 3) {\n return 0;\n }\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double area = 0;\n for (size_t i = 0; i < points.size(); ++i) {\n area += ::Area(points[i], points[(i + 1) % points.size()], points[(i + 2) % points.size()]);\n }\n return area;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * fabs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Compute the area of the smallest triangle that can be formed by any 3 points.\n Return the result in area.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\n__global__ void smallestArea(const Point *points, size_t N, double *area) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid >= N) return;\n\n double minArea = DBL_MAX;\n\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double area = triangleArea(points[gid], points[i], points[j]);\n if (area < minArea) minArea = area;\n }\n }\n\n if (tid == 0) *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double a = triangleArea(points[i], points[j], points[k]);\n minArea = min(minArea, a);\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = 1e100;\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n for (int k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid >= N) {\n return;\n }\n double minArea = triangleArea(points[tid], points[(tid + 1) % N], points[(tid + 2) % N]);\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n minArea = fmin(minArea, triangleArea(points[tid], points[i], points[j]));\n }\n }\n area[tid] = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for 
(size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = triangleArea(points[0], points[1], points[2]);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n int tid = get_global_id(0);\n if (tid < N) {\n area[tid] = ::Area(points[tid], points[(tid + 1) % N], points[(tid + 2) % N]);\n }\n}\n", + "\n\n Point A = points[threadIdx.x];\n Point B = points[threadIdx.y];\n Point C = points[threadIdx.z];\n double t = triangleArea(A, B, C);\n atomicMin(area, t);\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Each thread computes the area of the triangle formed by itself and the next two points.\n // The smallest area is stored in the shared memory.\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n int tid = threadIdx.x;\n if (tid < N - 2) {\n double area = triangleArea(points[tid], points[tid + 1], points[tid + 2]);\n if (area < minArea) {\n minArea = area;\n }\n }\n\n // Reduce the shared memory to find the smallest area.\n __syncthreads();\n if (tid == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n // Return the result in area.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n __shared__ Point minA, minB, minC;\n\n double localArea = triangleArea(points[threadIdx.x], points[threadIdx.y], points[threadIdx.z]);\n if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {\n minArea = localArea;\n minA = points[threadIdx.x];\n minB = points[threadIdx.y];\n minC = points[threadIdx.z];\n }\n\n __syncthreads();\n\n if (localArea < minArea) {\n minArea = localArea;\n minA = points[threadIdx.x];\n minB = points[threadIdx.y];\n minC = points[threadIdx.z];\n }\n\n __syncthreads();\n\n if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t 
j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Store the result in the output array.\n *area = minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the areas to find the minimum\n double minArea;\n MPI_Reduce(&areas[0], &minArea, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the minimum area from process 0\n if (rank == 0) {\n return minArea;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n minArea = std::min(minArea, area);\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int myRank, numProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numProcs);\n\n // Each process computes the area of all possible triangles\n std::vector localAreas;\n for (size_t i = 0; i < points.size(); ++i) {\n for 
(size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n localAreas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all areas to the root process\n std::vector globalAreas;\n MPI_Gather(localAreas.data(), localAreas.size(), MPI_DOUBLE,\n globalAreas.data(), localAreas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double smallestArea = std::numeric_limits::max();\n if (myRank == 0) {\n for (double area : globalAreas) {\n smallestArea = std::min(smallestArea, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&smallestArea, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return smallestArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all areas to the root process\n std::vector global_areas;\n MPI_Gather(local_areas.data(), local_areas.size(), MPI_DOUBLE,\n global_areas.data(), local_areas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double smallest_area = std::numeric_limits::max();\n if (rank == 0) {\n for (double area : global_areas) {\n smallest_area = std::min(smallest_area, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&smallest_area, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return smallest_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of points per process\n int points_per_process = points.size() / size;\n\n // Compute the starting and ending indices for this process\n int start = rank * points_per_process;\n int end = (rank + 1) * points_per_process;\n\n // Compute the smallest area for this process\n double min_area = std::numeric_limits::max();\n for (int i = start; i < end; ++i) {\n for (int j = i + 1; j < end; ++j) {\n for (int k = j + 1; k < end; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < min_area) {\n min_area = area;\n }\n }\n }\n }\n\n // Reduce the minimum areas from all processes\n double global_min_area;\n MPI_Allreduce(&min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN);\n\n return global_min_area;\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n 
// Reduce the areas to find the smallest one\n double smallest;\n MPI_Allreduce(areas.data(), &smallest, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return smallest;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the global minimum\n double global_min;\n MPI_Allreduce(local_areas.data(), &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n // Find the minimum area triangle\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Return the minimum area\n return minArea;\n}\n\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(&local_areas[0], &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area from process 0\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1; // Other processes return -1 to avoid printing multiple times\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the areas to find the smallest one\n double smallestArea;\n MPI_Allreduce(areas.data(), &smallestArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return smallestArea;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each process computes the area of the smallest triangle formed by any 3 of its points\n double min_area = std::numeric_limits::max();\n for (size_t i = my_rank; i < points.size(); i += num_procs) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n min_area = std::min(min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the minimum area across all processes\n double global_min_area;\n MPI_Allreduce(&min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < 
points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all local areas into a single vector on process 0\n std::vector global_areas;\n if (rank == 0) {\n global_areas.resize(local_areas.size() * size);\n }\n MPI_Gather(local_areas.data(), local_areas.size(), MPI_DOUBLE,\n global_areas.data(), local_areas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double min_area = std::numeric_limits::max();\n if (rank == 0) {\n for (double area : global_areas) {\n min_area = std::min(min_area, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&min_area, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n for (size_t i = rank; i < points.size(); i += size) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "&" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * fabs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Compute the area of the smallest triangle that can be formed by any 3 points.\n Return the result in area.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\n__global__ void smallestArea(const Point *points, size_t N, double *area) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Each thread computes the area of a triangle formed by itself and the next two points.\n // The minimum area is stored in the shared memory.\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N - 2) {\n double a = triangleArea(points[tid], points[tid + 1], points[tid + 2]);\n if (a < minArea) {\n minArea = a;\n }\n }\n __syncthreads();\n\n // The thread with the minimum area writes the result to the output pointer.\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid >= N) return;\n\n double minArea = DBL_MAX;\n\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double area = triangleArea(points[gid], points[i], points[j]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n\n *area = 
minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[gid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double minArea = triangleArea(points[0], points[1], points[2]);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 
null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Each thread computes the area of a triangle formed by 3 points.\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[tid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Compute the area of the smallest triangle formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Store the result in the output array.\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n __shared__ double minArea;\n __shared__ Point minA, minB, minC;\n\n // Compute the area of all triangles\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n minA = points[i];\n minB = points[j];\n minC = points[k];\n }\n }\n }\n }\n\n // Reduce the minimum area across all threads\n __syncthreads();\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n // Return the result in area.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[tid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < minArea) {\n minArea = a;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Initialize the area to a large value\n *area = DBL_MAX;\n\n // Loop over all possible combinations of 3 points\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n // Compute the area of the triangle formed by the 3 points\n double triangle_area = triangleArea(points[i], points[j], points[k]);\n\n // Update the area if the triangle area is smaller than the current minimum\n if (triangle_area < *area) {\n *area = triangle_area;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n *area = DBL_MAX;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < minArea) {\n minArea = a;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": 
"geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int num_points = points.size();\n int points_per_proc = num_points / num_procs;\n int start_index = my_rank * points_per_proc;\n int end_index = (my_rank + 1) * points_per_proc;\n if (my_rank == num_procs - 1) {\n end_index = num_points;\n }\n\n double local_area = 0.0;\n for (int i = start_index; i < end_index; i++) {\n for (int j = i + 1; j < num_points; j++) {\n for (int k = j + 1; k < num_points; k++) {\n local_area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n\n double global_area = 0.0;\n MPI_Reduce(&local_area, &global_area, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_area;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n // Use OpenMP to parallelize the loop over all possible triples of points\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], 
points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(&local_areas[0], &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1;\n }\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Allreduce(local_areas.data(), &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector localAreas;\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n localAreas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double globalMinArea;\n MPI_Allreduce(localAreas.data(), &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = points.size();\n double local_min = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (size_t i = 0; i < 
points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = points.size();\n double local_min = std::numeric_limits::max();\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n local_min = std::min(local_min, area);\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = points.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each process calculates the area of all possible triangles\n std::vector local_areas(n * (n - 1) * (n - 2) / 6);\n int count = 0;\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n local_areas[count++] = triangleArea(points[i], points[j], points[k]);\n }\n }\n }\n\n // Reduce the local areas to find the global minimum\n double global_min;\n MPI_Allreduce(local_areas.data(), &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = rank; i < points.size(); i += size) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(local_areas.data(), &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n int n = points.size();\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n minArea = std::min(minArea, area);\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN);\n return globalMinArea;\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int num_points = points.size();\n double min_area = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: min_area)\n for (int i = 0; i < num_points; i++) {\n for (int j = i + 1; j < num_points; j++) {\n for (int k = j + 1; k < num_points; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n min_area = std::min(min_area, area);\n }\n }\n }\n\n return min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &minArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "kokkos", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Create a team policy for parallel execution\n TeamPolicy policy(x.extent(0));\n\n // Define the kernel lambda\n auto kernel = KOKKOS_LAMBDA(const TeamPolicy<> &policy, int i) {\n double min_dist = std::numeric_limits::max();\n for (int j = i + 1; j < x.extent(0); ++j) {\n min_dist = std::min(min_dist, distance(x(i), x(j)));\n }\n return min_dist;\n };\n\n // Execute the kernel in parallel\n auto min_dists = parallel_reduce(policy, kernel);\n\n // Find the minimum distance among all pairs\n double min_dist = std::numeric_limits::max();\n for (int i = 0; i < min_dists.extent(0); ++i) {\n min_dist = std::min(min_dist, min_dists(i));\n }\n\n return min_dist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist[0] = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist[0]) {\n\t\t\t\tmin_dist[0] = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist[0];\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double &min_dist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist(0);\n}\n", + "\n\n\tconst int n = x.extent(0);\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(\"closestPair\", Kokkos::RangePolicy(0, n - 1), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing policy = Kok\u043a\u043e\u0441::Range\u653f\u7b56;\n\tusing view_type = Kok\u043a\u043e\u0441::View;\n\n\tview_type min_dist(\"min_dist\", std::numeric_limits::max());\n\n\tauto lambda = KOKKOS_LAMBDA(const double x1, const double x2) {\n\t\tdouble dist = ::abs(x1 - x2);\n\t\tif (dist < min_dist()) {\n\t\t\tmin_dist() = dist;\n\t\t}\n\t};\n\n\t// Iterate over all possible un-sorted pair of elements\n\t// and find the minimum distance between them.\n\t// Note that we are iterating over all the elements in the vector\n\t// multiple times. 
This is not efficient.\n\t// A more efficient approach would be to sort the vector first\n\t// and then only consider the minimum distance between each element\n\t// and its immediate neighbor.\n\t// However, sorting is not supported by Kok\u043a\u043e\u0441.\n\t// So, we are using this simple approach for now.\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < x.size(); ++j) {\n\t\t\tlambda(x[i], x[j]);\n\t\t}\n\t}\n\n\treturn min_dist();\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int i, double &min_dist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist(0);\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\tKok\u30b3\u30af::TaskGraph tg;\n\tKok\u30b3\u30af::Task task([&](Kok\u30b3\u30af::TaskContext ctx) -> double {\n\t\tdouble min = std::abs(x[0] - x[1]);\n\t\tfor (size_t i = 0; i < x.size(); i++) {\n\t\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\t\tmin = std::min(min, distance(x[i], x[j]))\n\t\t\t}\n\t\t}\n\t\treturn min;\n\t});\n\ttg.run(task);\n\treturn task.get();\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::RangePolicy;\n\tusing tag_type = Kokkos::pair;\n\n\tauto min_distance = Kokkos::parallel_reduce(\n\t\tpolicy_type(0, x.size() * x.size()),\n\t\tKOKKOS_LAMBDA(const tag_type &idx, double &min_distance) {\n\t\t\tint i = idx.first / x.size();\n\t\t\tint j = idx.first % x.size();\n\t\t\tif (i != j) {\n\t\t\t\tmin_distance = std::min(min_distance, distance(x(i), x(j)));\n\t\t\t}\n\t\t},\n\t\tmin_distance);\n\n\treturn min_distance;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in x.\n\tKokkos::View distances(\"distances\", x.size() * x.size());\n\tKokkos::parallel_for(Kokkos::RangePolicy(0, x.size() * x.size()), KOKKOS_LAMBDA(const int i) {\n\t\tdistances(i) = distance(x(i / x.size()), x(i % x.size()));\n\t});\n\n\t// Find the minimum distance.\n\tdouble minDistance = Kokkos::min_reduce(distances, Kokkos::Min());\n\n\treturn minDistance;\n}\n", + "\n\n\tKokkos::View min_dist(\"min_dist\", 1);\n\tmin_dist[0] = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) 
{\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist[0]) {\n\t\t\t\tmin_dist[0] = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist[0];\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist(0) = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tdouble d = distance(x(i), x(j));\n\t\t\tif (d < min_dist(0)) {\n\t\t\t\tmin_dist(0) = d;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist(0);\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int &i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\n\treturn minDist;\n}\n", + "\n\n\tconst int n = x.extent(0);\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tauto min_distance = Kokkos::parallel_reduce(\n\t\tx.extent(0), KOKKOS_LAMBDA(const int i, double min) {\n\t\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\t\tmin = std::min(min, distance(x(i), x(j)));\n\t\t\t}\n\t\t\treturn min;\n\t\t},\n\t\tstd::numeric_limits::max());\n\n\treturn min_distance;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist(0) = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tdouble dist = distance(x(i), x(j));\n\t\t\tif (dist < min_dist(0)) {\n\t\t\t\tmin_dist(0) = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist(0);\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "omp", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use OpenMP to compute in parallel.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor 
(int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble min_distance = std::numeric_limits::max();\n\tint n = x.size();\n\t#pragma omp parallel for reduction(min: min_distance)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_distance = std::min(min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "&", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_distance = std::abs(x[0] - x[1]);\n#pragma openmp parallel for reduction(min:min_distance)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_distance = std::min(min_distance, distance(x[i], x[j]);\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n#pragma omp parallel for reduction(min: min_dist)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = 
std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n - 1; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "serial", + "prompt": "double distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 1; i < x.size(); ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i - 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn -1;\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = distance(x[0], x[1]);\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn 
std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tminDist = std::min(minDist, distance(x[i], x[i + 1]));\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = distance(x[0], x[1]);\n\tfor (size_t i = 1; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn -1;\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble minDist = distance(x[0], x[1]);\n\tfor (int i = 1; i < x.size() - 1; ++i) {\n\t\tminDist = std::min(minDist, distance(x[i], x[i + 1]));\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 1; i < x.size(); ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i - 1]));\n\t}\n\treturn min_dist;\n}\n", + "(", + "\n\n double min_dist = std::numeric_limits::max();\n for (size_t i = 0; i < 
x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n min_dist = std::min(min_dist, distance(x[i], x[j]));\n }\n }\n return min_dist;\n}\n", + "\n\n double min_distance = std::numeric_limits::max();\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n min_distance = std::min(min_distance, distance(x[i], x[j]));\n }\n }\n return min_distance;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "hip", + "prompt": "__device__ double distanceBetweenPoints(double x1, double x2) {\n\treturn fabs(x1 - x2);\n}\n\n/* Compute the distance between the closest two elements in the vector x.\n Store the result in distance.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\n__global__ void closestPair(const double *x, size_t N, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tdouble minDist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tfor (int i = tid + 2; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\tdistance[tid] = minDist;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= N) {\n\t\treturn;\n\t}\n\n\tdouble minDist = distanceBetweenPoints(x[gid], x[gid + 1]);\n\tfor (int i = gid + 2; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[gid], x[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "losest", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tdouble minDistance = DBL_MAX;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDistance) {\n\t\t\t\tminDistance = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDistance;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint grid_size = gridDim.x;\n\tint i = bid * bid_size + tid;\n\tif (i >= N) {\n\t\treturn;\n\t}\n\tdouble min_dist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t}\n\tdistance[tid] = 
min_dist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\nfor (size_t i = 0; i < N; ++i) {\n\t\tdistance[i] = distance(x[i], x[i + 1]);\n\t}\n}\n", + "\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint bid_size = hipBlockDim_x;\n\n\tdouble min_dist = DBL_MAX;\n\n\tfor (int i = bid * bid_size + tid; i < N; i += bid_size) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tif (tid == 0 || dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble min_dist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = min_dist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "losest", + "\n\n\tint tid = threadIdx.x;\n\tint blockDim = blockDim.x;\n\tint blockIdx = blockIdx.x;\n\tint i = blockIdx * blockDim + tid;\n\n\tdouble min_dist = DBL_MAX;\n\twhile (i < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[i + 1]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t\ti += blockDim * gridDim.x;\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble min_dist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = min_dist;\n}\n" + ] + 
}, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "mpi", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process finds the closest pair in its local part of the vector.\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j += size) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between its elements\n\tfor (int i = my_rank; i < n; i += num_procs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = n / size;\n\tint start = rank * chunk_size;\n\tint end = start + chunk_size;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn globalMinDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = (n + size - 1) / size;\n\tint start = rank * chunk_size;\n\tint end = std::min(start + chunk_size, n);\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tstd::vector local_x(x.begin() + start, x.begin() + end);\n\tstd::sort(local_x.begin(), local_x.end());\n\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < local_n - 1; i++) {\n\t\tlocal_min_dist = std::min(local_min_dist, distance(local_x[i], local_x[i + 1]));\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tif (rank == size - 1) {\n\t\tend = x.size();\n\t}\n\tdouble local_min_dist = 
std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\tint n = x.size();\n\tint local_n = n / num_procs;\n\tint start = my_rank * local_n;\n\tint end = (my_rank + 1) * local_n;\n\tif (my_rank == num_procs - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between its elements\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn globalMinDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the distance between its elements and the elements of the next process.\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = (rank + 1) * local_size;\n\tif (rank == size - 1) {\n\t\tend = x.size();\n\t}\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble 
global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = n / size;\n\tint start = rank * chunk_size;\n\tint end = (rank + 1) * chunk_size;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the distance between its elements\n\tstd::vector local_distances;\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tfor (size_t j = i + 1; j < x.size(); j += size) {\n\t\t\tlocal_distances.push_back(distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Gather all distances to the root process\n\tstd::vector global_distances;\n\tMPI_Gather(local_distances.data(), local_distances.size(), MPI_DOUBLE,\n\t\tglobal_distances.data(), local_distances.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the minimum distance\n\tdouble min_distance = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double d : global_distances) {\n\t\t\tmin_distance = std::min(min_distance, d);\n\t\t}\n\t}\n\n\t// Broadcast the minimum distance to all processes\n\tMPI_Bcast(&min_distance, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn min_distance;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "cuda", + "prompt": "__device__ double distanceBetweenPoints(double x1, double x2) {\n\treturn fabs(x1 - x2);\n}\n\n/* Compute the distance between the closest two elements in the vector x.\n Store the result in distance.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\n__global__ void closestPair(const double *x, size_t N, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= N) return;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = bid * blockDim.x + tid;\n\n\tif (i >= N) {\n\t\treturn;\n\t}\n\n\tdouble min_dist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint blockId = blockIdx.x;\n\tint blockDim = blockDim.x;\n\tint gridDim = gridDim.x;\n\n\t// Each thread computes the distance between its element and the next element.\n\tif (tid < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tif (tid == 0 || dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n\n\t// Each block reduces the distances computed by its threads.\n\t__shared__ double blockDistance;\n\tif (tid == 0) {\n\t\tblockDistance = *distance;\n\t}\n\t__syncthreads();\n\tif (tid < blockDim - 1) {\n\t\tdouble dist = distanceBetweenPoints(blockDistance, x[blockId * blockDim + tid + 1]);\n\t\tif (dist < blockDistance) {\n\t\t\tblockDistance = dist;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*distance = blockDistance;\n\t}\n\n\t// The block with the smallest distance broadcasts it to all other blocks.\n\t__shared__ double gridDistance;\n\tif (blockId == 0) {\n\t\tgridDistance = *distance;\n\t}\n\t__syncthreads();\n\tif (blockId < gridDim - 1) {\n\t\tdouble dist = distanceBetweenPoints(gridDistance, x[(blockId + 1) * blockDim]);\n\t\tif (dist < gridDistance) {\n\t\t\tgridDistance = dist;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (blockId == 0) {\n\t\t*distance = gridDistance;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\t__shared__ double minDistance;\n\tminDistance = DBL_MAX;\n\n\tfor (size_t i = threadIdx.x; i < N - 1; i += blockDim.x) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDistance) {\n\t\t\t\tminDistance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (threadIdx.x == 0) {\n\t\t*distance = minDistance;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\t// Each thread computes the distance between its element and all subsequent elements.\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\t// Compute the distance between the current element and all subsequent elements.\n\tfor (int i = tid + bid * stride; i < N - 1; i += stride) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[i + 1]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\tdouble min_dist = DBL_MAX;\n\twhile (i < N) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t\ti += blockDim.x * gridDim.x;\n\t}\n\n\tif (tid == 
0) {\n\t\t*distance = min_dist;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x * gridDim.x;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ 
+ { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tdouble minDist = DBL_MAX;\n\twhile (i < N) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\ti += blockDim.x * gridDim.x;\n\t}\n\t*distance = minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (i == tid + 1 || dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0832259592, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00391841917038 + } + ] + }, + { + "generated_output": "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= N) {\n\t\treturn;\n\t}\n\tdouble minDist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\tdistance[tid] = minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i < N - 1) {\n\t\tdistance[i] = distance(x[i], x[i + 1]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Each thread computes the distance between its element and all elements to its right.\n\tint tid = threadIdx.x;\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min : local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tint size;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = my_rank * local_n;\n\tint end = (my_rank + 1) * local_n;\n\tif (my_rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min : local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between any two elements in its local subvector.\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all processes to find the global minimum distance.\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\t// Each process computes the distance between all pairs of elements in its local subvector.\n\tstd::vector local_min_distances(n / num_procs);\n\t#pragma omp parallel for\n\tfor (int i = my_rank * (n / num_procs); i < (my_rank + 1) * (n / num_procs); i++) {\n\t\tfor (int j = i + 1; j < (my_rank + 1) * (n / num_procs); j++) {\n\t\t\tlocal_min_distances[i - my_rank * (n / num_procs)] = std::min(local_min_distances[i - my_rank * (n / num_procs)], distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Each process sends its local minimum distance to the root process.\n\tstd::vector global_min_distances(num_procs);\n\tMPI_Gather(local_min_distances.data(), n / num_procs, MPI_DOUBLE, global_min_distances.data(), n / num_procs, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// The root process finds the minimum distance among all the local minimum distances.\n\tdouble global_min_distance = std::numeric_limits::max();\n\tif (my_rank == 0) {\n\t\tfor (int i = 0; i < num_procs; i++) {\n\t\t\tglobal_min_distance = std::min(global_min_distance, global_min_distances[i]);\n\t\t}\n\t}\n\n\t// Return the global minimum distance.\n\treturn global_min_distance;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tint n = x.size();\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint my_rank, nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the closest pair in its local subvector\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = my_rank; i < n; i += nprocs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min_dist)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the 
global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\t// Get the number of processes and the rank of the current process\n\tint num_procs, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\t// Divide the vector into chunks and assign each chunk to a process\n\tint chunk_size = x.size() / num_procs;\n\tstd::vector local_x(x.begin() + rank * chunk_size, x.begin() + (rank + 1) * chunk_size);\n\n\t// Find the closest pair in the local chunk\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min_dist)\n\tfor (int i = 0; i < local_x.size(); i++) {\n\t\tfor (int j = i + 1; j < local_x.size(); j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(local_x[i], local_x[j]));\n\t\t}\n\t}\n\n\t// Gather the local minimum distances from all processes\n\tstd::vector global_min_dists(num_procs);\n\tMPI_Gather(&local_min_dist, 1, MPI_DOUBLE, global_min_dists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the smallest global minimum distance\n\tdouble global_min_dist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double dist : global_min_dists) {\n\t\t\tglobal_min_dist = std::min(global_min_dist, dist);\n\t\t}\n\t}\n\n\t// Broadcast the global minimum distance to all processes\n\tMPI_Bcast(&global_min_dist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tif (n <= 1) {\n\t\treturn -1;\n\t}\n\n\t// Sort the vector in ascending order\n\tstd::sort(x.begin(), x.end());\n\n\t// Find the closest pair in each half of the vector\n\tdouble minDist = distance(x[0], x[1]);\n\t#pragma omp parallel\n\t{\n\t\tint tid = omp_get_thread_num();\n\t\tint numThreads = omp_get_num_threads();\n\t\tint start = tid * n / numThreads;\n\t\tint end = (tid + 1) * n / numThreads;\n\t\tdouble localMinDist = minDist;\n\t\tfor (int i = start; i < end; i++) {\n\t\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\t\tif (dist < localMinDist) {\n\t\t\t\t\tlocalMinDist = dist;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t#pragma omp critical\n\t\t{\n\t\t\tif (localMinDist < minDist) {\n\t\t\t\tminDist = localMinDist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint my_rank, nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\t// Each process computes the closest pair in its local subvector.\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = my_rank; i < n; i += nprocs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum.\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, 
&size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the closest pair in its local subvector\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint n = x.size();\n\tint n_local = n / omp_get_num_threads();\n\tint tid = omp_get_thread_num();\n\tdouble min_local = std::numeric_limits::max();\n\t\n\t#pragma omp parallel reduction(min:min_local)\n\tfor (int i = tid * n_local; i < (tid + 1) * n_local; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_local = std::min(min_local, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble min_global;\n\tMPI_Allreduce(&min_local, &min_global, 1, MPI_DOUBLE, MPI_MIN);\n\treturn min_global;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Use OpenMP to parallelize the computation of distances between pairs of elements\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint 
local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Reduce(&local_min_dist, 1, MPI_DOUBLE, &global_min_dist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance.\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t}\n\treturn -1;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n double min_dist = std::numeric_limits::max();\n\n // Compute the distance between each pair of points\n parallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int& i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n double dist = distance(points(i), points(j));\n if (dist < min_dist) {\n min_dist = dist;\n }\n }\n });\n\n return min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View distances(\"Distances\", points.size());\n\tKokkos::parallel_for(\"Compute distances\", points.size(), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tdistances[i] = std::min(distances[i], distance(points[i], points[j]));\n\t\t}\n\t});\n\treturn *Kokkos::min_element(distances);\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(\"Closest Pair\", points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = 
std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tparallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tparallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int j) {\n\t\t\tif (i != j) {\n\t\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\t\tif (dist < minDist) {\n\t\t\t\t\tminDist = dist;\n\t\t\t\t}\n\t\t\t}\n\t\t});\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tminDist = std::min(minDist, dist);\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing TeamPolicy = Kokkos::TeamPolicy>;\n\tusing TeamMember = Kokkos::TeamMember;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const TeamMember& team) {\n\t\tfor (int i = team.league_rank(); i < points.extent(0); i += team.league_size()) {\n\t\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\t\tminDist = std::min(minDist, distance(points(i), points(j)));\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing namespace std;\n\n\t// Initialize the minimum distance to a large value\n\tdouble minDist = numeric_limits::max();\n\n\t// Create a team policy for parallel execution\n\tTeamPolicy policy(Kokkos::DefaultExecutionSpace(), points.extent(0));\n\n\t// Define the kernel function to compute the distance between each pair of points\n\tKokkos::parallel_for(policy, KOKKOS_LAMBDA(const int& i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::RangePolicy;\n\tusing view_type = Kokkos::View;\n\n\tconst int num_points = points.extent(0);\n\tview_type distances(\"Distances\", num_points * num_points);\n\n\tKokkos::parallel_for(policy_type(0, num_points * num_points), KOKKOS_LAMBDA(const int i) {\n\t\tconst int p1_idx = i / num_points;\n\t\tconst int p2_idx = i % num_points;\n\t\tif (p1_idx != p2_idx) {\n\t\t\tdistances(i) = distance(points(p1_idx), points(p2_idx));\n\t\t}\n\t\telse {\n\t\t\tdistances(i) = std::numeric_limits::max();\n\t\t}\n\t});\n\n\tdouble min_distance = 
std::numeric_limits::max();\n\tfor (int i = 0; i < num_points * num_points; i++) {\n\t\tmin_distance = std::min(min_distance, distances(i));\n\t}\n\n\treturn min_distance;\n}\n", + "\n\n\tKok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::make_span(points.begin(), points.end()), 0.0,\n\t\t[](double d, const Point* p) {\n\t\t\treturn d + distance(*p, {0, 0});\n\t\t},\n\t\t[](double a, double b) { return a + b; });\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Initialize the minimum distance to a large value\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\t// Calculate the distance between the two points\n\t\t\tdouble dist = distance(points(i), points(j));\n\n\t\t\t// Update the minimum distance if necessary\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\t// Return the minimum distance\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all points\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\t// Iterate over all points after i\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\t// Calculate the distance between the two points\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\t// Check if the distance is less than the current minimum distance\n\t\t\tif (dist < minDist) {\n\t\t\t\t// Update the minimum distance\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Return the minimum distance\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View distances(\"Distances\", points.extent(0));\n\tKokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdistances(i) = std::min(distances(i), distance(points(i), points(j)));\n\t\t}\n\t});\n\treturn Kokkos::min_reduce(Kokkos::RangePolicy(0, points.extent(0)), distances, distances(0));\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use OpenMP to compute in parallel.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); 
++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = 
std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = points.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_distance = std::numeric_limits::max();\n#pragma omp parallel for reduction(min:min_distance)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tmin_distance = std::min(min_distance, dist);\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "\n\n\t// Check if the vector is empty or has only one point\n\tif (points.empty() || points.size() == 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\t// Sort the points by x-coordinate\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Initialize the minimum distance to the maximum possible value\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = 
std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn 
-1;\n\t}\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tdouble minDist 
= std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tauto compareX = [](Point const& p1, Point const& p2) { return p1.x < p2.x; };\n\tauto compareY = [](Point const& p1, Point const& p2) { return p1.y < p2.y; };\n\n\tstd::sort(points.begin(), points.end(), compareX);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tauto compareX = [](Point const& p1, Point const& p2) { return p1.x < p2.x; };\n\tauto compareY = [](Point const& p1, Point const& p2) { return p1.y < p2.y; };\n\tstd::sort(points.begin(), points.end(), compareX);\n\tdouble minDist = distance(points[0], points[1]);\n\tstd::sort(points.begin(), points.end(), compareY);\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tif (points[j].y - points[i].y >= minDist) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) 
{\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn 0.0;\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = distance(points[0], points[1]);\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distanceBetweenPoints(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the distance between the closest two points in the vector points.\n Store the result in distance.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\n__global__ void closestPair(const Point *points, size_t numPoints, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Initialize the distance to a large value\n\t*distance = DBL_MAX;\n\n\t// Iterate over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid < numPoints) {\n\t\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < num) {\n\t\tdistance[i] = distance(points[i], points[(i + 1) % num]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < num) {\n\t\tdistance[idx] = distance(points[idx], points[(idx + 1) % num]);\n\t}\n}\n", + "\n\n\t// Initialize the distance to a large value\n\t*distance = DBL_MAX;\n\n\t// Loop over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) 
{\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) return;\n\tdouble minDist = distanceBetweenPoints(points[tid], points[(tid+1)%numPoints]);\n\tfor (int i = tid+1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\tatomicMin(distance, minDist);\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid < numPoints) {\n\t\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = gid; i < numPoints; i += hipBlockDim_x) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint idx = hipGetGlobalIndex();\n\tif (idx < num) {\n\t distance[idx] = distance(points[idx], points[(idx + 1) % num]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\t// Initialize the distance to a large value\n\t*distance = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "(1.21,12.12.1222.12.12.", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = distanceBetweenPoints(points[gid], points[(gid + 1) % numPoints]);\n\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Compute the distance between each pair of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) 
{\n\t\treturn;\n\t}\n\tdouble minDist = distanceBetweenPoints(points[tid], points[(tid + 1) % numPoints]);\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\t__shared__ double minDist;\n\t__shared__ Point p1, p2;\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t\tp1 = points[tid];\n\t\t\t\tp2 = points[i];\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local copy of points.\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0.\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\t// Find the closest pair of points on this rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, 
MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_distance = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); ++i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_distance) {\n\t\t\t\tmin_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_distance;\n\tMPI_Allreduce(&min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_distance;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tmin_dist = std::min(min_dist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t}\n\n\treturn -1;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\tif (rank == 0) {\n\t\treturn min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn 
-1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the closest pair on this rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_distance) {\n\t\t\t\tlocal_min_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_distance;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its subset of points\n\tstd::vector> closestPairs;\n\tfor (size_t i = rank; i < points.size(); i += size) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tclosestPairs.push_back({points[i], points[j]});\n\t\t}\n\t}\n\n\t// Each rank sends its closest pairs to rank 0\n\tstd::vector> allClosestPairs;\n\tMPI_Gather(&closestPairs, closestPairs.size(), MPI_DOUBLE, &allClosestPairs, 
allClosestPairs.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 finds the closest pair of points among all the closest pairs\n\tdouble minDist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (auto const& pair : allClosestPairs) {\n\t\t\tdouble dist = distance(pair.first, pair.second);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Rank 0 broadcasts the minimum distance to all other ranks\n\tMPI_Bcast(&minDist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its subset of points.\n\tstd::vector local_points = points;\n\tstd::sort(local_points.begin(), local_points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (size_t i = 0; i < local_points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < local_points.size(); ++j) {\n\t\t\tdouble dist = distance(local_points[i], local_points[j]);\n\t\t\tif (dist < local_min_distance) {\n\t\t\t\tlocal_min_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Each rank sends its local minimum distance to rank 0.\n\tMPI_Gather(&local_min_distance, 1, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 computes the global minimum distance.\n\tdouble global_min_distance = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (int i = 0; i < size; ++i) {\n\t\t\tdouble 
recv_min_distance;\n\t\t\tMPI_Gather(&recv_min_distance, 1, MPI_DOUBLE, &recv_min_distance, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\t\t\tif (recv_min_distance < global_min_distance) {\n\t\t\t\tglobal_min_distance = recv_min_distance;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the closest pair on each rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Reduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distanceBetweenPoints(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the distance between the closest two points in the vector points.\n Store the result in distance.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\n__global__ void closestPair(const Point *points, size_t numPoints, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) return;\n\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint num_blocks = gridDim.x;\n\n\tdouble min_dist = DBL_MAX;\n\tfor (int i = tid; i < numPoints; i += bid_size * num_blocks) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint idx = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (idx >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDistance = DBL_MAX;\n\tfor (int i = idx + 1; i < numPoints; i++) {\n\t\tdouble distance = distanceBetweenPoints(points[idx], points[i]);\n\t\tif (distance < minDistance) {\n\t\t\tminDistance = distance;\n\t\t}\n\t}\n\n\tif (idx == 0) {\n\t\t*distance = minDistance;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid 
+ 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tdouble minDist = DBL_MAX;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "source_write_success": true, + "did_build": 
true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int j = i + 1; j < numPoints; j++) {\n\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tdouble minDist = DBL_MAX;\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\tdistance[tid] = minDist;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int j = i + 1; j < numPoints; j++) {\n\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif 
(tid >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all ranks\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_distance = DBL_MAX;\n\t#pragma omp parallel for reduction(min: min_distance)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t for (size_t j = i + 1; j < points.size(); ++j) {\n\t min_distance = std::min(min_distance, distance(points[i], points[j]));\n\t }\n\t}\n\n\tif (rank == 0) {\n\t return min_distance;\n\t}\n\treturn 0.0;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < 
n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Gather all the local minimum distances on rank 0\n\tstd::vector allMinDists(size);\n\tMPI_Gather(&minDist, 1, MPI_DOUBLE, allMinDists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the global minimum distance\n\tif (rank == 0) {\n\t\tfor (double dist : allMinDists) {\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocal_min_dist = std::min(local_min_dist, dist);\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all ranks\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local subvector\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (int i = 0; i < points.size(); ++i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tlocalMinDist = std::min(localMinDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t} else {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, 
MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the closest pair on this rank\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN);\n\n\t// Return the global minimum distance\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma opm parallel for reduction(min:min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i+1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\treturn min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\telse {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t} else {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocalMinDist = std::min(localMinDist, dist);\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel 
for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min_dist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocal_min_dist = std::min(local_min_dist, dist);\n\t\t}\n\t}\n\n\t// Gather all local minima on rank 0\n\tstd::vector all_min_dists(size);\n\tMPI_Gather(&local_min_dist, 1, MPI_DOUBLE, all_min_dists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 finds the global minimum\n\tdouble global_min_dist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double dist : all_min_dists) {\n\t\t\tglobal_min_dist = std::min(global_min_dist, dist);\n\t\t}\n\t}\n\n\t// Return the global minimum on rank 0\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "(1.41421, 1.41421)", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tlocalMinDist = std::min(localMinDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = 
std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t}\n\treturn -1;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(Kokkos::View &points, Kokkos::View &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Sort points by x-coordinate\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return points_sorted(i)->x < points_sorted(j)->x;\n });\n\n // Find the upper and lower hulls\n auto upper_hull = hull.slice(0, hull.extent(0) / 2);\n auto lower_hull = hull.slice(hull.extent(0) / 2, hull.extent(0));\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n while (upper_hull_size >= 2 &&\n (upper_hull(upper_hull_size - 1)->y - upper_hull(upper_hull_size - 2)->y) *\n (points_sorted(i)->x - upper_hull(upper_hull_size - 1)->x) <\n (upper_hull(upper_hull_size - 1)->x - upper_hull(upper_hull_size - 2)->x) *\n (points_sorted(i)->y - upper_hull(upper_hull_size - 1)->y)) {\n upper_hull_size--;\n }\n upper_hull(upper_hull_size++) = points_sorted(i);\n }\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = points_sorted.extent(0) - 1; i >= 0; i--) {\n while (lower_hull_size >= 2 &&\n (lower_hull(lower_hull_size - 1)->y - lower_hull(lower_hull_size - 2)->y) *\n (points_sorted(i)->x - lower_hull(lower_hull_size - 1)->x) <\n (lower_hull(lower_hull_size - 1)->x - lower_hull(lower_hull_size - 2)->x) *\n (points_sorted(i)->y - lower_hull(lower_hull_size - 1)->y)) {\n lower_hull_size--;\n }\n lower_hull(lower_hull_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_hull_size + lower_hull_size - 1;\n for (int i = 0; i < lower_hull_size - 1; i++) {\n hull(hull_size--) = lower_hull(i);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int &i) {\n points_sorted(i) = points(i);\n });\n\n // Find the upper and lower hulls\n std::vector upper_hull;\n std::vector lower_hull;\n\n for (int i = 0; i < points_sorted.extent(0); i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upper_hull.size() >= 2 &&\n (upper_hull[upper_hull.size() - 2]->y - upper_hull[upper_hull.size() - 1]->y) *\n (points_sorted(i)->x - upper_hull[upper_hull.size() - 1]->x) <=\n (upper_hull[upper_hull.size() - 1]->y - points_sorted(i)->y) *\n (upper_hull[upper_hull.size() - 2]->x - 
upper_hull[upper_hull.size() - 1]->x)) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lower_hull.size() >= 2 &&\n (lower_hull[lower_hull.size() - 2]->y - lower_hull[lower_hull.size() - 1]->y) *\n (points_sorted(i)->x - lower_hull[lower_hull.size() - 1]->x) >=\n (lower_hull[lower_hull.size() - 1]->y - points_sorted(i)->y) *\n (lower_hull[lower_hull.size() - 2]->x - lower_hull[lower_hull.size() - 1]->x)) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 2);\n int j = 0;\n for (int i = 0; i < upper_hull.size() - 1; i++) {\n hull(j++) = upper_hull[i];\n }\n for (int i = lower_hull.size() - 2; i >= 0; i--) {\n hull(j++) = lower_hull[i];\n }\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_dimensions = points.extent(1);\n\n // Find the point with the minimum y-coordinate.\n int min_y_index = 0;\n for (int i = 1; i < num_points; ++i) {\n if (points(i, 1) < points(min_y_index, 1)) {\n min_y_index = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points(0, 0);\n points(0, 0) = points(min_y_index, 0);\n points(min_y_index, 0) = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double angle_a = atan2(a.y - points(0, 1), a.x - points(0, 0));\n double angle_b = atan2(b.y - points(0, 1), b.x - points(0, 0));\n return angle_a < angle_b;\n };\n parallel_sort(Kokkos::RangePolicy(1, num_points), points, compare_polar_angle);\n\n // Find the convex hull using Graham's scan.\n int hull_size = 0;\n for (int i = 0; i < num_points; ++i) {\n while (hull_size >= 2 && (points(hull_size - 1, 0) - points(hull_size - 2, 0)) * (points(i, 1) - points(hull_size - 2, 1)) - (points(hull_size - 1, 1) - points(hull_size - 2, 1)) * (points(i, 0) - points(hull_size - 2, 0)) < 0) {\n hull_size--;\n }\n hull(hull_size, 0) = points(i, 0);\n hull_size++;\n }\n\n // Resize the hull to its final size.\n hull.resize(hull_size, num_dimensions);\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the upper and lower hulls\n Kokkos::View upperHull(\"Upper Hull\", points.extent(0));\n Kokkos::View lowerHull(\"Lower Hull\", points.extent(0));\n int upperHullSize = 0;\n int lowerHullSize = 0;\n for (int i = 0; i < points.extent(0); i++) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHullSize >= 2 &&\n (upperHull(upperHullSize - 1)->y - upperHull(upperHullSize - 2)->y) *\n (points(i)->x - upperHull(upperHullSize - 2)->x) <\n (upperHull(upperHullSize - 1)->x - upperHull(upperHullSize - 2)->x) *\n (points(i)->y - upperHull(upperHullSize - 2)->y)) {\n upperHullSize--;\n }\n upperHull(upperHullSize++) = points(i);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while (lowerHullSize >= 2 &&\n (lowerHull(lowerHullSize - 1)->y - lowerHull(lowerHullSize - 2)->y) *\n (points(i)->x - lowerHull(lowerHullSize - 2)->x) >\n (lowerHull(lowerHullSize - 1)->x - lowerHull(lowerHullSize - 2)->x) *\n (points(i)->y - lowerHull(lowerHullSize - 2)->y)) {\n lowerHullSize--;\n 
}\n lowerHull(lowerHullSize++) = points(i);\n }\n\n // Merge the upper and lower hulls into the convex hull\n int hullSize = upperHullSize + lowerHullSize - 2;\n for (int i = 0; i < upperHullSize; i++) {\n hull(i) = upperHull(i);\n }\n for (int i = upperHullSize; i < hullSize; i++) {\n hull(i) = lowerHull(i - upperHullSize + 1);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the upper and lower hulls\n Kokkos::View upperHull(\"upperHull\", points.extent(0));\n Kokkos::View lowerHull(\"lowerHull\", points.extent(0));\n\n int upperHullSize = 0;\n int lowerHullSize = 0;\n\n for (int i = 0; i < points.extent(0); i++) {\n // Add the point to the upper hull if it is to the right of the current rightmost point\n while (upperHullSize >= 2 &&\n (upperHull(upperHullSize - 1)->y - upperHull(upperHullSize - 2)->y) *\n (points(i)->x - upperHull(upperHullSize - 2)->x) <=\n (upperHull(upperHullSize - 1)->x - upperHull(upperHullSize - 2)->x) *\n (points(i)->y - upperHull(upperHullSize - 2)->y)) {\n upperHullSize--;\n }\n upperHull(upperHullSize++) = points(i);\n\n // Add the point to the lower hull if it is to the left of the current leftmost point\n while (lowerHullSize >= 2 &&\n (lowerHull(lowerHullSize - 1)->y - lowerHull(lowerHullSize - 2)->y) *\n (points(i)->x - lowerHull(lowerHullSize - 2)->x) >=\n (lowerHull(lowerHullSize - 1)->x - lowerHull(lowerHullSize - 2)->x) *\n (points(i)->y - lowerHull(lowerHullSize - 2)->y)) {\n lowerHullSize--;\n }\n lowerHull(lowerHullSize++) = points(i);\n }\n\n // Merge the upper and lower hulls\n int hullSize = upperHullSize + lowerHullSize - 2;\n for (int i = 0; i < upperHullSize; i++) {\n hull(i) = upperHull(i);\n }\n for (int i = upperHullSize; i < hullSize; i++) {\n hull(i) = lowerHull(i - upperHullSize + 1);\n }\n\n // Return the hull\n hull.resize(hullSize);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points_sorted.extent(0)), points_sorted, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n while (upper_hull_size >= 2 && cross(upper_hull[upper_hull_size - 2], upper_hull[upper_hull_size - 1], points_sorted[i]) <= 0) {\n upper_hull_size--;\n }\n upper_hull[upper_hull_size++] = points_sorted[i];\n }\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = points_sorted.extent(0) - 1; i >= 0; i--) {\n while (lower_hull_size >= 2 && cross(lower_hull[lower_hull_size - 2], lower_hull[lower_hull_size - 1], points_sorted[i]) <= 0) {\n lower_hull_size--;\n }\n lower_hull[lower_hull_size++] = points_sorted[i];\n }\n\n // Merge the upper and lower hulls\n int hull_size = 0;\n for (int i = 0; i < upper_hull_size; i++) {\n hull[hull_size++] = upper_hull[i];\n }\n for (int i = lower_hull_size - 2; i >= 0; i--) {\n hull[hull_size++] = lower_hull[i];\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_size);\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_hull_points = 4;\n\n // Find the leftmost point\n int leftmost_index = 0;\n for (int i = 1; i < num_points; ++i) {\n if (points(i).x < points(leftmost_index).x) 
{\n leftmost_index = i;\n }\n }\n\n // Sort the points by their polar angle with respect to the leftmost point\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double dx1 = a.x - points(leftmost_index).x;\n double dy1 = a.y - points(leftmost_index).y;\n double dx2 = b.x - points(leftmost_index).x;\n double dy2 = b.y - points(leftmost_index).y;\n return (atan2(dy1, dx1) < atan2(dy2, dx2));\n };\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, num_points), points_sorted, compare_polar_angle);\n\n // Find the convex hull using Graham's scan\n int hull_index = 0;\n for (int i = 0; i < num_points; ++i) {\n while (hull_index >= 2 && (points_sorted(i).x - hull(hull_index - 1).x) * (hull(hull_index - 2).y - hull(hull_index - 1).y) - (points_sorted(i).y - hull(hull_index - 1).y) * (hull(hull_index - 2).x - hull(hull_index - 1).x) < 0) {\n hull_index--;\n }\n hull(hull_index++) = points_sorted(i);\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_index);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, points_sorted.extent(0)),\n KOKKOS_LAMBDA(const int &i, const auto &upper_hull) {\n auto &hull = upper_hull.access();\n if (i == 0) {\n hull[0] = points_sorted(i);\n return hull;\n }\n while (hull.extent(0) >= 2 &&\n (hull(hull.extent(0) - 2).y - hull(hull.extent(0) - 1).y) *\n (points_sorted(i).x - hull(hull.extent(0) - 1).x) <\n (hull(hull.extent(0) - 1).y - points_sorted(i).y) *\n (hull(hull.extent(0) - 2).x - hull(hull.extent(0) - 1).x)) {\n hull.resize(hull.extent(0) - 1);\n }\n hull.resize(hull.extent(0) + 1);\n hull(hull.extent(0) - 1) = points_sorted(i);\n return hull;\n },\n Kokkos::View(nullptr, 0));\n\n auto lower_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1),\n KOKKOS_LAMBDA(const int &i, const auto &lower_hull) {\n auto &hull = lower_hull.access();\n if (i == points_sorted.extent(0) - 1) {\n hull[0] = points_sorted(i);\n return hull;\n }\n while (hull.extent(0) >= 2 &&\n (hull(hull.extent(0) - 2).y - hull(hull.extent(0) - 1).y) *\n (points_sorted(i).x - hull(hull.extent(0) - 1).x) <\n (hull(hull.extent(0) - 1).y - points_sorted(i).y) *\n (hull(hull.extent(0) - 2).x - hull(hull.extent(0) - 1).x)) {\n hull.resize(hull.extent(0) - 1);\n }\n hull.resize(hull.extent(0) + 1);\n hull(hull.extent(0) - 1) = points_sorted(i);\n return hull;\n },\n Kokkos::View(nullptr, 0));\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.extent(0) + lower_hull.extent(0) - 2);\n Kokkos::parallel_for(\n Kokkos::RangePolicy(0, upper_hull.extent(0)),\n KOKKOS_LAMBDA(const int &i) { hull(i) = upper_hull(i); });\n Kokkos::parallel_for(\n Kokkos::RangePolicy(0, lower_hull.extent(0) - 1),\n KOKKOS_LAMBDA(const int &i) { hull(upper_hull.extent(0) + i) = lower_hull(i); });\n}\n", + "\n\n using Kok\u30b3\u30af::parallel_for;\n using Kok\u30b3\u30af::sort;\n\n // Sort the points by their x-coordinates.\n sort(Kok\u30b3\u30af::begin(points), Kok\u30b3\u30af::end(points), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper hull.\n int upperHullSize = 0;\n for (int i = 0; i < points.size(); ++i) {\n while (upperHullSize >= 2 && Kok\u30b3\u30af::cross(hull[upperHullSize - 2], hull[upperHullSize - 1], points[i]) <= 0) {\n --upperHullSize;\n }\n 
hull[upperHullSize++] = points[i];\n }\n\n // Find the lower hull.\n int lowerHullSize = 0;\n for (int i = points.size() - 1; i >= 0; --i) {\n while (lowerHullSize >= 2 && Kok\u30b3\u30af::cross(hull[lowerHullSize - 2], hull[lowerHullSize - 1], points[i]) <= 0) {\n --lowerHullSize;\n }\n hull[lowerHullSize++] = points[i];\n }\n\n // Remove the duplicate points from the lower hull.\n while (lowerHullSize > 0 && hull[lowerHullSize - 1] == hull[lowerHullSize - 2]) {\n --lowerHullSize;\n }\n\n // Combine the upper and lower hulls.\n hull.resize(upperHullSize + lowerHullSize - 1);\n std::copy(hull.begin(), hull.begin() + upperHullSize, hull.begin());\n std::copy(hull.begin() + upperHullSize, hull.begin() + upperHullSize + lowerHullSize - 1, hull.begin() + upperHullSize);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points_sorted.extent(0)), KOKKOS_LAMBDA(const int &i, auto &upper_hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n upper_hull.push_back(points_sorted(i));\n } else {\n while (upper_hull.size() >= 2 && cross(upper_hull[upper_hull.size() - 2], upper_hull[upper_hull.size() - 1], points_sorted(i)) <= 0) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n }\n });\n\n auto lower_hull = Kokkos::parallel_reduce(Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1), KOKKOS_LAMBDA(const int &i, auto &lower_hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n lower_hull.push_back(points_sorted(i));\n } else {\n while (lower_hull.size() >= 2 && cross(lower_hull[lower_hull.size() - 2], lower_hull[lower_hull.size() - 1], points_sorted(i)) <= 0) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n });\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 2);\n std::copy(upper_hull.begin(), upper_hull.end() - 1, hull.begin());\n std::copy(lower_hull.begin() + 1, lower_hull.end(), hull.begin() + upper_hull.size() - 1);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points_sorted(i)->x < points_sorted(j)->x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n int upper_hull_size = 0;\n int lower_hull_size = 0;\n\n // Add the first point to the upper hull\n upper_hull(upper_hull_size++) = points_sorted(0);\n\n // Add the last point to the lower hull\n lower_hull(lower_hull_size++) = points_sorted(points.extent(0) - 1);\n\n // Iterate over the remaining points\n for (int i = 1; i < points.extent(0) - 1; i++) {\n // Add the point to the upper hull\n while (upper_hull_size >= 2 &&\n (points_sorted(i)->y - upper_hull(upper_hull_size - 2)->y) *\n (upper_hull(upper_hull_size - 1)->x - upper_hull(upper_hull_size - 2)->x) <=\n (upper_hull(upper_hull_size - 1)->y - upper_hull(upper_hull_size - 2)->y) *\n (points_sorted(i)->x - upper_hull(upper_hull_size - 2)->x)) {\n upper_hull_size--;\n }\n upper_hull(upper_hull_size++) = points_sorted(i);\n\n // Add the point to the lower hull\n while (lower_hull_size >= 2 &&\n (points_sorted(i)->y - lower_hull(lower_hull_size - 2)->y) *\n (lower_hull(lower_hull_size - 1)->x - lower_hull(lower_hull_size 
- 2)->x) >=\n (lower_hull(lower_hull_size - 1)->y - lower_hull(lower_hull_size - 2)->y) *\n (points_sorted(i)->x - lower_hull(lower_hull_size - 2)->x)) {\n lower_hull_size--;\n }\n lower_hull(lower_hull_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_hull_size + lower_hull_size - 2;\n for (int i = 0; i < upper_hull_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_hull_size; i < hull_size; i++) {\n hull(i) = lower_hull(i - upper_hull_size + 1);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinate\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int &i) {\n points_sorted(i) = points(i);\n });\n\n // Find the upper and lower hulls\n std::vector upper_hull, lower_hull;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upper_hull.size() >= 2 && cross(upper_hull[upper_hull.size() - 2], upper_hull[upper_hull.size() - 1], points_sorted(i)) <= 0) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lower_hull.size() >= 2 && cross(lower_hull[lower_hull.size() - 2], lower_hull[lower_hull.size() - 1], points_sorted(i)) >= 0) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n\n // Merge the upper and lower hulls\n for (int i = lower_hull.size() - 2; i >= 0; i--) {\n upper_hull.push_back(lower_hull[i]);\n }\n\n // Copy the convex hull to the output view\n Kokkos::parallel_for(Kokkos::RangePolicy(0, upper_hull.size()), KOKKOS_LAMBDA(const int &i) {\n hull(i) = upper_hull[i];\n });\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, points_sorted.extent(0)),\n KOKKOS_LAMBDA(const int &i, auto &hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n hull.push_back(points_sorted(i));\n } else {\n while (hull.size() >= 2 &&\n (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points_sorted(i).x - hull[hull.size() - 1].x) <\n (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points_sorted(i).y - hull[hull.size() - 1].y)) {\n hull.pop_back();\n }\n hull.push_back(points_sorted(i));\n }\n },\n hull);\n\n auto lower_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1),\n KOKKOS_LAMBDA(const int &i, auto &hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n hull.push_back(points_sorted(i));\n } else {\n while (hull.size() >= 2 &&\n (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points_sorted(i).x - hull[hull.size() - 1].x) <\n (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points_sorted(i).y - hull[hull.size() - 1].y)) {\n hull.pop_back();\n }\n hull.push_back(points_sorted(i));\n }\n },\n hull);\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 1);\n std::copy(upper_hull.begin(), upper_hull.end(), hull.begin());\n std::copy(lower_hull.begin() + 1, lower_hull.end(), hull.begin() + upper_hull.size());\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_dimensions = points.extent(1);\n const int num_hull_points = 0;\n\n // Find the point 
with the minimum y-coordinate.\n int min_y_index = 0;\n for (int i = 1; i < num_points; i++) {\n if (points(i, 1) < points(min_y_index, 1)) {\n min_y_index = i;\n }\n }\n\n // Swap the point with the minimum y-coordinate to the first position.\n Point temp = points(0, 0);\n points(0, 0) = points(min_y_index, 0);\n points(min_y_index, 0) = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double angle_a = atan2(a.y - points(0, 1), a.x - points(0, 0));\n double angle_b = atan2(b.y - points(0, 1), b.x - points(0, 0));\n return angle_a < angle_b;\n };\n Kokkos::sort(Kokkos::RangePolicy(1, num_points), points, compare_polar_angle);\n\n // Find the convex hull using Graham's scan.\n int hull_size = 0;\n for (int i = 0; i < num_points; i++) {\n while (hull_size >= 2 && (points(hull[hull_size - 2], 0) - points(hull[hull_size - 1], 0)) * (points(i, 1) - points(hull[hull_size - 1], 1)) - (points(i, 0) - points(hull[hull_size - 1], 0)) * (points(hull[hull_size - 2], 1) - points(hull[hull_size - 1], 1)) < 0) {\n hull_size--;\n }\n hull[hull_size++] = i;\n }\n\n // Resize the hull to the correct size.\n hull = Kokkos::View(hull, hull_size);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)), points_sorted, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n int upper_size = 0;\n int lower_size = 0;\n\n // Add the first point to the upper hull\n upper_hull(upper_size++) = points_sorted(0);\n\n // Add the last point to the lower hull\n lower_hull(lower_size++) = points_sorted(points_sorted.extent(0) - 1);\n\n // Iterate over the remaining points\n for (int i = 1; i < points_sorted.extent(0) - 1; i++) {\n // Add the point to the upper hull\n while (upper_size >= 2 && (upper_hull(upper_size - 2).y - upper_hull(upper_size - 1).y) * (points_sorted(i).x - upper_hull(upper_size - 1).x) < (upper_hull(upper_size - 1).y - points_sorted(i).y) * (upper_hull(upper_size - 2).x - upper_hull(upper_size - 1).x)) {\n upper_size--;\n }\n upper_hull(upper_size++) = points_sorted(i);\n\n // Add the point to the lower hull\n while (lower_size >= 2 && (lower_hull(lower_size - 2).y - lower_hull(lower_size - 1).y) * (points_sorted(i).x - lower_hull(lower_size - 1).x) < (lower_hull(lower_size - 1).y - points_sorted(i).y) * (lower_hull(lower_size - 2).x - lower_hull(lower_size - 1).x)) {\n lower_size--;\n }\n lower_hull(lower_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_size + lower_size - 2;\n for (int i = 0; i < upper_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_size; i < hull_size; i++) {\n hull(i) = lower_hull(lower_size - (i - upper_size + 1));\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_size);\n}\n", + "\n\n // Find the leftmost point\n auto leftmost = points.reduce(\n KOKKOS_LAMBDA(const Point &a, const Point &b) { return a.x < b.x ? 
a : b; },\n points.extent(0));\n\n // Sort the points by the angle they make with the leftmost point\n Kokkos::View sorted_points(\"sorted_points\", points.extent(0));\n Kokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i) {\n sorted_points[i] = &points[i];\n });\n\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n sorted_points,\n KOKKOS_LAMBDA(const Point *&a, const Point *&b) {\n return std::atan2(a->y - leftmost->y, a->x - leftmost->x) <\n std::atan2(b->y - leftmost->y, b->x - leftmost->x);\n });\n\n // Find the convex hull using Graham scan\n Kokkos::View stack(\"stack\", points.extent(0));\n int top = 0;\n for (int i = 0; i < points.extent(0); i++) {\n Point *p = sorted_points[i];\n while (top >= 2 &&\n (stack[top - 1]->y - stack[top - 2]->y) * (p->x - stack[top - 2]->x) <\n (stack[top - 1]->x - stack[top - 2]->x) * (p->y - stack[top - 2]->y))\n top--;\n stack[top++] = p;\n }\n\n // Copy the convex hull to the output view\n Kokkos::parallel_for(Kokkos::RangePolicy(0, top),\n KOKKOS_LAMBDA(const int &i) { hull[i] = stack[i]; });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int N = points.extent(0);\n const int HULL_SIZE = 2 * N;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n parallel_sort(Kokkos::Cuda(), points_sorted, [=](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the lower and upper hulls\n auto lower_hull = hull.slice(0, N);\n auto upper_hull = hull.slice(N, HULL_SIZE);\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = 0; i < N; i++) {\n while (lower_hull_size >= 2 && cross(lower_hull[lower_hull_size - 2], lower_hull[lower_hull_size - 1], points_sorted[i]) <= 0) {\n lower_hull_size--;\n }\n lower_hull[lower_hull_size++] = points_sorted[i];\n }\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = N - 1; i >= 0; i--) {\n while (upper_hull_size >= 2 && cross(upper_hull[upper_hull_size - 2], upper_hull[upper_hull_size - 1], points_sorted[i]) <= 0) {\n upper_hull_size--;\n }\n upper_hull[upper_hull_size++] = points_sorted[i];\n }\n\n // Merge the lower and upper hulls\n int hull_size = lower_hull_size + upper_hull_size - 1;\n for (int i = 0; i < hull_size; i++) {\n hull[i] = (i < lower_hull_size) ? 
lower_hull[i] : upper_hull[i - lower_hull_size];\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort points by x-coordinate\n auto points_view = Kokkos::subview(points, Kokkos::ALL(), Kokkos::ALL(), Kokkos::Rank<2>{0, 1});\n auto sorted_points = points_view;\n Kokkos::sort(Kokkos::RangePolicy(0, points_view.extent(0)),\n KOKKOS_LAMBDA(const int i, const int j) {\n return sorted_points(i, 0).x < sorted_points(j, 0).x;\n });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::View(Kokkos::ALL(), \"Upper Hull\");\n auto lower_hull = Kokkos::View(Kokkos::ALL(), \"Lower Hull\");\n int upper_size = 0, lower_size = 0;\n\n // Find the upper hull\n for (int i = 0; i < points_view.extent(0); i++) {\n while (upper_size >= 2 &&\n (upper_hull(upper_size - 1).y - upper_hull(upper_size - 2).y) *\n (sorted_points(i, 0).x - upper_hull(upper_size - 2).x) <\n (upper_hull(upper_size - 1).x - upper_hull(upper_size - 2).x) *\n (sorted_points(i, 0).y - upper_hull(upper_size - 2).y)) {\n upper_size--;\n }\n upper_hull(upper_size++) = sorted_points(i, 0);\n }\n\n // Find the lower hull\n for (int i = points_view.extent(0) - 1; i >= 0; i--) {\n while (lower_size >= 2 &&\n (lower_hull(lower_size - 1).y - lower_hull(lower_size - 2).y) *\n (sorted_points(i, 0).x - lower_hull(lower_size - 2).x) <\n (lower_hull(lower_size - 1).x - lower_hull(lower_size - 2).x) *\n (sorted_points(i, 0).y - lower_hull(lower_size - 2).y)) {\n lower_size--;\n }\n lower_hull(lower_size++) = sorted_points(i, 0);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_size + lower_size - 2;\n hull = Kokkos::View(hull_size, \"Convex Hull\");\n for (int i = 0; i < upper_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_size; i < hull_size; i++) {\n hull(i) = lower_hull(i - upper_size + lower_size - 1);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n const int N = points.extent(0);\n const int M = hull.extent(0);\n\n auto points_d = create_mirror_view(points);\n auto hull_d = create_mirror_view(hull);\n\n parallel_for(N, KOKKOS_LAMBDA(const int i) {\n points_d(i) = points(i);\n });\n\n parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &min_index) {\n if (points_d(i).y < points_d(min_index).y || (points_d(i).y == points_d(min_index).y && points_d(i).x < points_d(min_index).x)) {\n min_index = i;\n }\n }, 0);\n\n int p = 0;\n hull_d(p) = points_d(min_index);\n\n int q;\n do {\n q = (p + 1) % N;\n for (int i = 0; i < N; i++) {\n if (orientation(points_d(p), points_d(i), points_d(q)) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != min_index);\n\n int count = 0;\n do {\n hull_d(count) = points_d(p);\n count++;\n q = (p + 1) % N;\n for (int i = 0; i < N; i++) {\n if (orientation(points_d(p), points_d(i), points_d(q)) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != min_index);\n\n parallel_for(count, KOKKOS_LAMBDA(const int i) {\n hull(i) = hull_d(i);\n });\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the leftmost and rightmost points\n const Point *leftmost = points(0);\n const Point *rightmost = points(points.extent(0) - 1);\n\n // Create a stack to store the points on the hull\n std::stack stack;\n\n // Add the leftmost and rightmost points to the stack\n stack.push(leftmost);\n stack.push(rightmost);\n\n // Iterate over the remaining points\n for (int i = 1; i < points.extent(0) - 1; i++) {\n const Point *p = points(i);\n\n // 
Check if the point is to the left of the line defined by the last two points on the stack\n if (orientation(stack.top(), stack.top()->next, p) == LEFT) {\n // If it is, pop the last point from the stack\n stack.pop();\n // And add the new point to the stack\n stack.push(p);\n }\n }\n\n // Copy the points from the stack to the hull\n int hull_size = stack.size();\n for (int i = 0; i < hull_size; i++) {\n hull(i) = stack.top();\n stack.pop();\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Find the point with the smallest y-coordinate.\n int min_index = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].y < points[min_index].y) {\n min_index = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n std::swap(points[0], points[min_index]);\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points.begin() + 1, points.end(), [&](const Point& a, const Point& b) {\n return (atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x));\n });\n\n // Initialize the convex hull with the first three points.\n hull.push_back(points[0]);\n hull.push_back(points[1]);\n hull.push_back(points[2]);\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < points.size(); i++) {\n while (hull.size() >= 2 && (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y) - (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) >= 0) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Find the first point in the upper hull that is to the right of the current point\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Find the first point in the lower hull that is to the left of the current point\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), 
upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n Point leftmost = points[0];\n for (const Point& p : points) {\n if (p.x < leftmost.x) {\n leftmost = p;\n }\n }\n\n // Sort the points by their angle with the leftmost point\n std::sort(points.begin(), points.end(), [&leftmost](const Point& a, const Point& b) {\n return std::atan2(a.y - leftmost.y, a.x - leftmost.x) < std::atan2(b.y - leftmost.y, b.x - leftmost.x);\n });\n\n // Find the convex hull using Graham's scan\n std::stack convex_hull;\n convex_hull.push(leftmost);\n convex_hull.push(points[1]);\n for (size_t i = 2; i < points.size(); i++) {\n Point top = convex_hull.top();\n while (convex_hull.size() >= 2 && (top.x - convex_hull.at(convex_hull.size() - 2).x) * (points[i].y - top.y) - (top.y - convex_hull.at(convex_hull.size() - 2).y) * (points[i].x - top.x) < 0) {\n convex_hull.pop();\n top = convex_hull.top();\n }\n convex_hull.push(points[i]);\n }\n\n // Copy the convex hull to the output vector\n while (!convex_hull.empty()) {\n hull.push_back(convex_hull.top());\n convex_hull.pop();\n }\n}\n", + ".", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the leftmost and rightmost points\n Point leftmost = points[0];\n Point rightmost = points[points.size() - 1];\n\n // Find the points that are on the upper and lower hulls\n std::vector upperHull;\n std::vector lowerHull;\n\n #pragma omp parallel sections\n {\n #pragma omp section\n {\n upperHull.push_back(leftmost);\n for (int i = 1; i < points.size(); i++) {\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n }\n }\n\n #pragma omp section\n {\n lowerHull.push_back(rightmost);\n for (int i = points.size() - 2; i >= 0; i--) {\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) <= 0) {\n 
lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n }\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size());\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.begin(), lowerHull.end());\n\n // Remove the duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the lower hull\n for (const Point& point : points) {\n while (hull.size() >= 2 && (hull.back() - hull[hull.size() - 2]).cross(point - hull.back()) <= 0) {\n hull.pop_back();\n }\n hull.push_back(point);\n }\n\n // Find the upper hull\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() >= 2 && (hull.back() - hull[hull.size() - 2]).cross(points[i] - hull.back()) <= 0) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n int n = points.size();\n if (n < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n int l = 0, r = 0;\n hull.push_back(points[l]);\n for (int i = 1; i < n; i++) {\n while (l >= 1 && (points[i].y - hull[l - 1].y) * (hull[l].x - hull[l - 1].x) <= (points[i].x - hull[l - 1].x) * (hull[l].y - hull[l - 1].y))\n l--;\n while (r >= 1 && (points[i].y - hull[r - 1].y) * (hull[r].x - hull[r - 1].x) <= (points[i].x - hull[r - 1].x) * (hull[r].y - hull[r - 1].y))\n r--;\n hull.push_back(points[i]);\n l++;\n r++;\n }\n\n hull.push_back(points[n - 1]);\n for (int i = n - 2; i >= 0; i--) {\n while (l >= 1 && (points[i].y - hull[l - 1].y) * (hull[l].x - hull[l - 1].x) <= (points[i].x - hull[l - 1].x) * (hull[l].y - hull[l - 1].y))\n l--;\n while (r >= 1 && (points[i].y - hull[r - 1].y) * (hull[r].x - hull[r - 1].x) <= (points[i].x - hull[r - 1].x) * (hull[r].y - hull[r - 1].y))\n r--;\n hull.push_back(points[i]);\n l++;\n r++;\n }\n\n hull.pop_back();\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n hull.pop_back();\n}\n", + "\n\n // Find the leftmost 
point\n int l = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[l].x) {\n l = i;\n }\n }\n\n int p = l, q;\n do {\n hull.push_back(points[p]);\n q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != l);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) *\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) *\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Combine the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * 
(point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n int l = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[l].x) {\n l = i;\n }\n }\n\n int p = l;\n int q;\n do {\n hull.push_back(points[p]);\n q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == -1) {\n q = i;\n }\n }\n p = q;\n } while (p != l);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n int leftMost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftMost].x) {\n leftMost = i;\n }\n }\n\n // Start from leftmost point, keep moving counterclockwise until we reach the start point again\n int p = leftMost;\n do {\n // Add current point to result\n hull.push_back(points[p]);\n\n // Search for a point 'q' such that the vector 'p-q' has the smallest positive orientation.\n int q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == 2) {\n q = i;\n }\n }\n\n // If 'q' is not 'p', then update 'p' and continue\n p = q;\n\n } while (p != leftMost);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (point.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - point.x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n 
while (lowerHull.size() >= 2 &&\n (point.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - point.x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n Point p = points[leftmost];\n hull.push_back(p);\n points.erase(points.begin() + leftmost);\n\n while (points.size() > 0) {\n int next = -1;\n for (int i = 0; i < points.size(); i++) {\n if (next == -1 || (orientation(p, points[i], points[next]) == 2)) {\n next = i;\n }\n }\n p = points[next];\n hull.push_back(p);\n points.erase(points.begin() + next);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) *\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) *\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower 
hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "serial", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Find the leftmost point\n int leftMost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftMost].x) {\n leftMost = i;\n }\n }\n\n // Start from the leftmost point and keep moving counter-clockwise until we reach the leftmost point again\n Point p = points[leftMost];\n do {\n hull.push_back(p);\n int next = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i] != p && orientation(p, points[i], points[next]) == 2) {\n next = i;\n }\n }\n p = points[next];\n } while (p != points[leftMost]);\n}\n", + "\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n int n = points.size();\n std::vector up(n), down(n);\n int m = 0, k = 0;\n\n for (int i = 0; i < n; i++) {\n if ((i == 0) || ((i == 1) && (points[i].x == points[i - 1].x) && (points[i].y == points[i - 1].y)))\n continue;\n down[m++] = points[i];\n }\n\n for (int i = n - 1; i >= 0; i--) {\n if ((i == n - 1) || ((i == n - 2) && (points[i].x == points[i + 1].x) && (points[i].y == points[i + 1].y)))\n continue;\n up[k++] = points[i];\n }\n\n hull.clear();\n int t[m + k + 1];\n int q = 0;\n for (int i = 0; i < m; i++)\n t[q++] = i;\n for (int i = k - 1; i >= 0; i--)\n t[q++] = i + m;\n t[q] = t[0];\n t[q + 1] = t[1];\n\n for (int i = 0; i <= q; i++) {\n int a = t[i], b = t[i + 1];\n if (a == b)\n continue;\n int c = t[i + 2];\n if (a != c) {\n hull.push_back(down[a]);\n continue;\n }\n int x1 = down[a].x, y1 = down[a].y;\n int x2 = down[b].x, y2 = down[b].y;\n int x3 = down[c].x, y3 = down[c].y;\n int dx1 = x2 - x1, dy1 = y2 - y1;\n int dx2 = x3 - x2, dy2 = y3 - y2;\n int cross = dx1 * dy2 - dx2 * dy1;\n if (cross < 0)\n hull.push_back(down[a]);\n else if (cross > 0)\n hull.push_back(down[b]);\n else {\n hull.push_back(down[a]);\n hull.push_back(down[b]);\n }\n }\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n int n = points.size();\n std::vector upper(n);\n std::vector lower(n);\n\n int m = 0;\n for (int i = 0; i < n; i++) {\n while (m >= 2 && (upper[m - 1].y - upper[m - 2].y) * (points[i].x - upper[m - 2].x) <=\n (upper[m - 1].x - upper[m - 2].x) * (points[i].y - upper[m - 2].y)) {\n m--;\n }\n upper[m++] = points[i];\n }\n\n m = 0;\n for (int i = n - 1; i >= 0; i--) {\n while (m >= 2 && (lower[m - 1].y - lower[m - 2].y) * (points[i].x - lower[m - 2].x) <=\n (lower[m - 1].x - lower[m - 2].x) * (points[i].y - lower[m - 2].y)) {\n m--;\n }\n lower[m++] = points[i];\n }\n\n for (int i = 0; i < m; i++) {\n hull.push_back(upper[i]);\n }\n for (int i = m 
- 2; i >= 0; i--) {\n hull.push_back(lower[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x;\n });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Find the first point in the upper hull that is to the right of the current point\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x) *\n (point.y - upperHull[upperHull.size() - 1].y) -\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Find the first point in the lower hull that is to the left of the current point\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x) *\n (point.y - lowerHull[lowerHull.size() - 1].y) -\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n hull.pop_back();\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n hull.push_back(points[0]);\n hull.push_back(points[1]);\n\n for (size_t i = 2; i < points.size(); i++) {\n while (hull.size() >= 2 &&\n (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <\n (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n int t = hull.size();\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() > t &&\n (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <\n (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n hull.pop_back();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n 
// Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n std::vector upper;\n for (auto const& point : points) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper.back(), point) <= 0) {\n upper.pop_back();\n }\n upper.push_back(point);\n }\n upper.pop_back();\n\n std::vector lower;\n for (auto const& point : points) {\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower.back(), point) >= 0) {\n lower.pop_back();\n }\n lower.push_back(point);\n }\n lower.pop_back();\n\n hull.insert(hull.end(), upper.begin(), upper.end());\n hull.insert(hull.end(), lower.rbegin(), lower.rend());\n}\n", + "\n\n if (points.size() <= 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n std::vector upper;\n for (const Point& p : points) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], p) <= 0)\n upper.pop_back();\n upper.push_back(p);\n }\n upper.pop_back();\n\n std::vector lower;\n for (int i = points.size() - 1; i >= 0; i--) {\n const Point& p = points[i];\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], p) <= 0)\n lower.pop_back();\n lower.push_back(p);\n }\n lower.pop_back();\n\n hull.insert(hull.end(), upper.begin(), upper.end());\n hull.insert(hull.end(), lower.rbegin(), lower.rend());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) == -1) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) == 1) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) 
{ return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Combine the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n int n = points.size();\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n std::vector upper;\n for (int i = 0; i < n; i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], points[i]) <= 0)\n upper.pop_back();\n upper.push_back(points[i]);\n }\n\n std::vector lower;\n for (int i = n - 1; i >= 0; i--) {\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], points[i]) <= 0)\n lower.pop_back();\n lower.push_back(points[i]);\n }\n\n for (int i = 1; i < upper.size() - 1; i++)\n hull.push_back(upper[i]);\n for (int i = lower.size() - 2; i >= 1; i--)\n hull.push_back(lower[i]);\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the 
points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull.back(), point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull.back(), point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper hull\n for (const Point& point : points) {\n while (hull.size() >= 2 && (hull.back().y - hull[hull.size() - 2].y) * (point.x - hull[hull.size() - 2].x) <=\n (hull.back().x - hull[hull.size() - 2].x) * (point.y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(point);\n }\n\n // Find the lower hull\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() >= 2 && (hull.back().y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) <=\n (hull.back().x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinate\n std::sort(points.begin(), points.end(), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper convex hull\n for (const Point &p : points) {\n while (hull.size() >= 2 && cross(hull[hull.size() - 2], hull[hull.size() - 1], p) <= 0) {\n hull.pop_back();\n }\n hull.push_back(p);\n }\n\n // Find the lower convex hull\n std::reverse(points.begin(), points.end());\n for (const Point &p : points) {\n while (hull.size() >= 2 && cross(hull[hull.size() - 2], hull[hull.size() - 1], p) <= 0) {\n hull.pop_back();\n }\n hull.push_back(p);\n }\n\n // Remove the last point from the lower convex hull\n 
hull.pop_back();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x;\n });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point a, Point b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point p : points) {\n // Build the lower hull\n while (lowerHull.size() >= 2 &&\n orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], p) <= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(p);\n\n // Build the upper hull\n while (upperHull.size() >= 2 &&\n orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], p) >= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(p);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), lowerHull.begin(), lowerHull.end());\n for (int i = upperHull.size() - 2; i >= 0; i--) {\n hull.push_back(upperHull[i]);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "hip", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\n__global__ void convexHull(const Point *points, size_t numPoints, Point *hull, size_t hullSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int numThreads = hipGetNumThreads();\n int threadIdx = hipGetThreadIdx();\n int blockIdx = hipGetBlockIdx();\n int blockDim = hipGetBlockDim();\n int gridDim = hipGetGridDim();\n\n // Each thread computes the convex hull of a subset of points\n int start = threadIdx.x + blockIdx.x * blockDim.x;\n int end = fmin(start + numThreads, numPoints);\n\n // Compute the convex hull of the subset of points\n vector subset(end - start);\n for (int i = start; i < end; i++) {\n subset[i - start] = points[i];\n }\n vector hullSubset = convexHull(subset);\n\n // Merge the convex hull of the subset of points with the overall convex hull\n for (int i = 0; i < hullSubset.size(); i++) {\n hull[hullSize + i] = hullSubset[i];\n }\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize > 2 && crossProduct(hull[hullSize - 2], hull[hullSize - 1], points[i]) <= 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size() - 2;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[i] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[upperHull.size() + i - 1] = lowerHull[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by polar angle with respect to the 
leftmost point\n std::sort(points, points + numPoints, [leftmost](const Point &a, const Point &b) {\n return atan2(a.y - points[leftmost].y, a.x - points[leftmost].x) < atan2(b.y - points[leftmost].y, b.x - points[leftmost].x);\n });\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last two points in the hull, add it to the hull\n while (hullSize >= 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < numPoints) {\n hull[gid] = points[gid];\n }\n\n __syncthreads();\n\n if (tid == 0) {\n std::sort(hull, hull + numPoints, [](const Point &a, const Point &b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n\n hullSize = m;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < numPoints) {\n hull[gid] = points[gid];\n }\n __syncthreads();\n\n if (tid == 0) {\n sort(hull, hull + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n for (int i = numPoints - 2, t = m + 1; i >= 0; i--) {\n while (t >= m + 1 && (hull[t - 1].y - hull[t - 2].y) * (hull[i].x - hull[t - 1].x) <\n (hull[t - 1].x - hull[t - 2].x) * (hull[i].y - hull[t - 1].y)) {\n t--;\n }\n hull[t++] = hull[i];\n }\n hullSize = t - m;\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n thrust::convex_hull(thrust::device, points, points + numPoints,\n std::back_inserter(upperHull), std::back_inserter(lowerHull));\n\n // Merge the upper and lower hulls into the final hull\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n\n // Return the size of the hull\n *hullSize = upperHull.size() + lowerHull.size();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while 
(lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Combine the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size() - 2;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[i] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[i + upperHull.size() - 1] = lowerHull[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the minimum point with the first point.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the remaining points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [&](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize > 2 && crossProduct(hull[hullSize - 2], hull[hullSize - 1], points[i]) <= 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by their polar angle with respect to the leftmost point\n auto cmp = [&](const Point &a, const Point &b) {\n return (a.y - points[leftmost].y) * (b.x - points[leftmost].x) - (a.x - points[leftmost].x) * (b.y - points[leftmost].y);\n };\n std::sort(points + leftmost + 1, points + numPoints, cmp);\n\n // Build the convex hull using Graham's scan\n int hullIndex = 0;\n for (int i = leftmost + 1; i < numPoints; i++) {\n while (hullIndex >= 2 && cmp(hull[hullIndex - 1], hull[hullIndex - 2], points[i])) {\n hullIndex--;\n }\n hull[hullIndex++] = points[i];\n }\n\n // Copy the convex hull to the output buffer\n std::copy(hull, hull + hullIndex, hull);\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((hull[hullSize - 1].x - points[i].x) * (hull[hullSize - 1].y - points[leftmost].y) -\n (hull[hullSize - 1].y - points[i].y) * (hull[hullSize - 1].x - points[leftmost].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n\n // If the current point is to the right of the last point in the hull, remove the last point from the hull\n else if ((hull[hullSize - 1].x - points[i].x) * (hull[hullSize - 1].y - points[leftmost].y) -\n (hull[hullSize - 1].y - points[i].y) * (hull[hullSize - 1].x - points[leftmost].x) >\n 0) {\n hullSize--;\n }\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n 
}\n\n // Initialize the hull\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the point is to the left of the line defined by the last two points in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hull[hullSize++] = points[i];\n }\n }\n\n // If the last point is not the same as the first point, add it to the hull\n if (hull[hullSize - 1] != hull[0]) {\n hull[hullSize++] = hull[0];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n\n // Find the convex hull using Graham's scan\n int hullSize = 1;\n for (int i = 1; i < numPoints; i++) {\n // Find the next point that is not on the same line as the previous two points\n while (hullSize > 1 &&\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 2].x - hull[hullSize - 1].x) <=\n (points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 2].y - hull[hullSize - 1].y)) {\n hullSize--;\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n }\n\n // If the last point in the hull is not the leftmost point, add it to the hull\n if (hull[hullSize - 1] != points[leftmost]) {\n hull[hullSize] = points[leftmost];\n hullSize++;\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Find the convex hull using Graham's scan\n int p = leftmost;\n do {\n // Find the next point that is not on the same line as the previous two points\n int q = (p + 1) % numPoints;\n for (int i = 0; i < numPoints; i++) {\n if (i != p && i != q &&\n (points[i].y - points[p].y) * (points[q].x - points[p].x) >\n (points[i].x - points[p].x) * (points[q].y - points[p].y)) {\n q = i;\n }\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[q];\n p = q;\n\n } while (p != leftmost);\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the remaining points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [&](const Point &a, const Point &b) {\n double 
angleA = atan2(a.y - points[0].y, a.x - points[0].x);\n double angleB = atan2(b.y - points[0].y, b.x - points[0].x);\n return angleA < angleB;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // TODO: Implement the convex hull algorithm\n // The kernel is launched with at least as many threads as points.\n // Store the result in `hull`.\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hullSize = 0;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[hullSize++] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[hullSize++] = lowerHull[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the minimum point with the first point.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize > 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n 
points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of points. The final hull is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n 
convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Send the leftmost point to all other ranks\n MPI_Bcast(&leftmost, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the convex hull on each rank\n std::vector localHull;\n grahamScan(points, localHull, leftmost);\n\n // Gather all the local hulls on rank 0\n if (rank == 0) {\n hull = localHull;\n for (int i = 1; i < size; i++) {\n std::vector recvHull;\n MPI_Recv(recvHull.data(), recvHull.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n hull.insert(hull.end(), recvHull.begin(), recvHull.end());\n }\n } else {\n MPI_Send(localHull.data(), localHull.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n\n // Remove duplicate points from the hull\n std::sort(hull.begin(), hull.end());\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Find the leftmost point\n int 
leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull.push_back(points[leftmost]);\n\n // Find the convex hull using Graham's scan\n int p = leftmost;\n do {\n // Find the next point that is not on the same line as the previous two points\n int q = (p + 1) % points.size();\n while (orientation(points[p], points[q], points[(q + 1) % points.size()]) == 0) {\n q = (q + 1) % points.size();\n }\n\n // Add the next point to the hull\n hull.push_back(points[q]);\n\n // Update the current point\n p = q;\n } while (p != leftmost);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the points by x-coordinate\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n\n // Return the hull on rank 0\n if (rank == 0) {\n return hull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n if (rank == 0) {\n hull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, hull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(hull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Start from the leftmost point, keep moving counter-clockwise\n // until we reach the start point again\n int p = leftmost;\n do {\n hull.push_back(points[p]);\n int q = (p + 1) % points.size();\n\n // Find the most counter-clockwise point from p\n for (int i = 0; i < points.size(); i++) {\n // Skip if i is the same as p or q\n if (i == p || i == q) {\n continue;\n }\n\n // Calculate the cross product of (q - p) x (i - p)\n double cross = (points[q].x - points[p].x) * (points[i].y - points[p].y) -\n (points[q].y - points[p].y) * (points[i].x - points[p].x);\n\n // If the cross product is positive, then i is more counter-clockwise than q\n if (cross > 0) {\n q = i;\n }\n }\n\n // Update p to be q\n p = q;\n } while (p != leftmost);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n 
MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the leftmost point\n Point leftmost = points[0];\n for (size_t i = 1; i < points.size(); i++) {\n if (points[i].x < leftmost.x) {\n leftmost = points[i];\n }\n }\n\n // Broadcast the leftmost point to all ranks\n MPI_Bcast(&leftmost, sizeof(Point), MPI_BYTE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull on each rank\n std::vector localHull;\n convexHull(points, leftmost, localHull);\n\n // Gather the local convex hull from all ranks\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "cuda", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\n__global__ void convexHull(const Point *points, size_t numPoints, Point *hull, size_t hullSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n if (tid == 0) {\n qsort(hull, numPoints, sizeof(Point), [](const void *a, const void *b) {\n Point *pa = (Point *)a;\n Point *pb = (Point *)b;\n if (pa->x != pb->x) {\n return pa->x < pb->x ? -1 : 1;\n }\n return pa->y < pb->y ? 
-1 : 1;\n });\n int m = 0;\n for (int i = 0; i < numPoints; ++i) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m].y - hull[m - 1].y) * (hull[m - 1].x - hull[m - 2].x)) {\n --m;\n }\n hull[m++] = hull[i];\n }\n hullSize = m;\n }\n __syncthreads();\n if (tid < hullSize) {\n hull[tid] = hull[tid];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(b.y - points[0].y, b.x - points[0].x) < atan2(a.y - points[0].y, a.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHull(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.begin(), lowerHull.end(), hull + upperHull.size());\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x 
- points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n double angleA = atan2(a.y - points[0].y, a.x - points[0].x);\n double angleB = atan2(b.y - points[0].y, b.x - points[0].x);\n return angleA < angleB;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n\n if (numPoints <= 1) {\n return;\n }\n\n // Sort the points by their x-coordinates\n thrust::sort(hull, hull + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the leftmost and rightmost points\n Point leftmost = hull[0];\n Point rightmost = hull[numPoints - 1];\n\n // Find the convex hull of the points to the left of the leftmost point\n int leftHullSize = 0;\n convexHull(hull, numPoints - 1, hull, leftHullSize);\n\n // Find the convex hull of the points to the right of the rightmost point\n int rightHullSize = 0;\n convexHull(hull + 1, numPoints - 1, hull + leftHullSize, rightHullSize);\n\n // Merge the two convex hulls\n int hullIndex = 0;\n int leftIndex = 0;\n int rightIndex = 0;\n while (leftIndex < leftHullSize && rightIndex < rightHullSize) {\n if (hull[leftIndex].y < hull[rightIndex].y) {\n 
hull[hullIndex++] = hull[leftIndex++];\n } else {\n hull[hullIndex++] = hull[rightIndex++];\n }\n }\n while (leftIndex < leftHullSize) {\n hull[hullIndex++] = hull[leftIndex++];\n }\n while (rightIndex < rightHullSize) {\n hull[hullIndex++] = hull[rightIndex++];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device_ptr(points), thrust::device_ptr(points + numPoints),\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(thrust::device_ptr(points), numPoints, &upperHull, &lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), thrust::device_ptr(hull));\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), thrust::device_ptr(hull + upperHull.size()));\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current convex hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n double angle1 = atan2(a.y - points[0].y, a.x - points[0].x);\n double angle2 = atan2(b.y - points[0].y, b.x - points[0].x);\n return angle1 < angle2;\n });\n\n // Initialize the convex hull 
with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Start from the leftmost point and keep moving counter-clockwise\n int hullSize = 0;\n int current = leftmost;\n do {\n // Add the current point to the hull\n hull[hullSize] = points[current];\n hullSize++;\n\n // Find the next point that is most counter-clockwise from the current point\n int next = (current + 1) % numPoints;\n for (int i = 0; i < numPoints; i++) {\n if (orientation(points[current], points[next], points[i]) == 2) {\n next = i;\n }\n }\n\n // Set the current point to the next point\n current = next;\n } while (current != leftmost);\n\n // Return the size of the hull\n return hullSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinate\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), hull);\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + 
"generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= numPoints) {\n return;\n }\n\n // Find the point with the smallest y-coordinate\n int minIdx = idx;\n for (int i = idx + 1; i < numPoints; i++) {\n if (points[i].y < points[minIdx].y) {\n minIdx = i;\n }\n }\n\n // Swap the point with the smallest y-coordinate to the front of the array\n Point temp = points[idx];\n points[idx] = points[minIdx];\n points[minIdx] = temp;\n\n // Sort the points by their angle with respect to the point with the smallest y-coordinate\n std::sort(points + idx + 1, points + numPoints, [points, idx](const Point &a, const Point &b) {\n return (a.x - points[idx].x) * (b.y - points[idx].y) > (b.x - points[idx].x) * (a.y - points[idx].y);\n });\n\n // Initialize the hull with the first three points\n hull[0] = points[idx];\n hull[1] = points[idx + 1];\n hull[2] = points[idx + 2];\n int hullSize = 3;\n\n // Iterate over the remaining points\n for (int i = idx + 3; i < numPoints; i++) {\n // Check if the current point is inside the hull\n while (hullSize > 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) <=\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x)) {\n hullSize--;\n }\n\n // Add the current point to the hull\n hull[hullSize] = points[i];\n hullSize++;\n }\n\n // Return the hull\n hullSize = hullSize;\n}\n\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device_ptr(points), thrust::device_ptr(points + numPoints),\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), thrust::device_ptr(hull));\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), thrust::device_ptr(hull + upperHull.size()));\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull 
with the leftmost point\n hull[0] = points[leftmost];\n\n // Find the convex hull using Graham's scan\n int hullSize = 1;\n for (int i = 1; i < numPoints; i++) {\n // Find the next point that is not on the same line as the previous two points\n while (hullSize >= 2 &&\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 2].x - hull[hullSize - 1].x) <=\n (points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 2].y - hull[hullSize - 1].y)) {\n hullSize--;\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates.\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the leftmost point.\n Point leftmostPoint = points[0];\n\n // Find the rightmost point.\n Point rightmostPoint = points[numPoints - 1];\n\n // Find the points that are above the line defined by the leftmost and rightmost points.\n thrust::device_vector pointsAboveLine;\n thrust::copy_if(thrust::device, points, points + numPoints,\n thrust::make_device_vector(pointsAboveLine),\n [leftmostPoint, rightmostPoint](const Point &point) {\n return point.y > leftmostPoint.y + (rightmostPoint.y - leftmostPoint.y) *\n (point.x - leftmostPoint.x) /\n (rightmostPoint.x - leftmostPoint.x);\n });\n\n // Find the points that are below the line defined by the leftmost and rightmost points.\n thrust::device_vector pointsBelowLine;\n thrust::copy_if(thrust::device, points, points + numPoints,\n thrust::make_device_vector(pointsBelowLine),\n [leftmostPoint, rightmostPoint](const Point &point) {\n return point.y < leftmostPoint.y + (rightmostPoint.y - leftmostPoint.y) *\n (point.x - leftmostPoint.x) /\n (rightmostPoint.x - leftmostPoint.x);\n });\n\n // Find the convex hull of the points above the line.\n thrust::device_vector hullAboveLine;\n convexHull(thrust::device_ptr(pointsAboveLine.data()), pointsAboveLine.size(),\n thrust::device_ptr(hullAboveLine.data()), hullAboveLine.size());\n\n // Find the convex hull of the points below the line.\n thrust::device_vector hullBelowLine;\n convexHull(thrust::device_ptr(pointsBelowLine.data()), pointsBelowLine.size(),\n thrust::device_ptr(hullBelowLine.data()), hullBelowLine.size());\n\n // Merge the two convex hulls.\n thrust::copy(thrust::device, hullAboveLine.begin(), hullAboveLine.end(), hull);\n thrust::copy(thrust::device, hullBelowLine.begin(), hullBelowLine.end(), hull + hullAboveLine.size());\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(hull, hull + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <= (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n hullSize = m;\n }\n}\n", + "source_write_success": true, + "did_build": false, + 
"is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n }\n\n // If the hull has more than 2 points, remove the last point\n if (hullSize > 2) {\n hullSize--;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of points. 
The final hull is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n 
hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n\n // Return the hull on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return hull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = points.size();\n std::vector localHull;\n // Find the convex hull of the local points\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n std::vector upper, lower;\n for (int i = 0; i < n; i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], points[i]) <= 0)\n upper.pop_back();\n upper.push_back(points[i]);\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], points[i]) >= 0)\n lower.pop_back();\n lower.push_back(points[i]);\n }\n localHull.insert(localHull.end(), upper.begin(), upper.end());\n for (int i = lower.size() - 2; i >= 0; i--)\n localHull.push_back(lower[i]);\n }\n }\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n // Find the convex hull of the global points\n if (rank == 0) {\n std::sort(globalHull.begin(), globalHull.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n std::vector upper, lower;\n for (int i = 0; i < globalHull.size(); i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], globalHull[i]) <= 0)\n upper.pop_back();\n upper.push_back(globalHull[i]);\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], globalHull[i]) >= 0)\n lower.pop_back();\n lower.push_back(globalHull[i]);\n }\n hull.insert(hull.end(), upper.begin(), upper.end());\n for (int i = lower.size() - 2; i >= 0; i--)\n hull.push_back(lower[i]);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by polar angle with respect to the leftmost point\n std::sort(points.begin(), points.end(), [leftmost](Point a, Point b) {\n double dx1 = a.x - points[leftmost].x;\n double dy1 = a.y - points[leftmost].y;\n double dx2 = b.x - points[leftmost].x;\n double dy2 = b.y - points[leftmost].y;\n return (atan2(dy1, dx1) < atan2(dy2, dx2));\n });\n\n // Use Graham's scan to find the convex hull\n hull.push_back(points[leftmost]);\n int m = 1;\n for (int i = 1; i < points.size(); i++) {\n while (m >= 2 && (points[i].x - hull[m - 1].x) * (hull[m].y - hull[m - 1].y) - (points[i].y - hull[m - 1].y) * (hull[m].x - hull[m - 1].x) <= 0) {\n hull.pop_back();\n m--;\n }\n 
hull.push_back(points[i]);\n m++;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Reduce the local hulls to a single global hull\n std::vector globalHull;\n MPI_Reduce(&localHull[0], &globalHull[0], localHull.size() * sizeof(Point), MPI_BYTE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // If this is rank 0, copy the global hull to the output hull\n if (rank == 0) {\n hull = globalHull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(points.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Find the convex hull of the points on each rank\n std::vector local_hull;\n convexHull(points, local_hull);\n\n // Gather all the local hulls on rank 0\n std::vector global_hull;\n if (my_rank == 0) {\n global_hull.resize(local_hull.size() * num_procs);\n }\n MPI_Gather(local_hull.data(), local_hull.size(), MPI_DOUBLE, global_hull.data(), local_hull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (my_rank == 0) {\n convexHull(global_hull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n if (rank == 0) {\n hull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, hull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(hull, hull);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(*convexHull[i], *convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n\t\tKOKKOS_LAMBDA(const int& i, const int& j) {\n\t\t\treturn points(i)->x < points(j)->x;\n\t\t});\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\thull.push_back(points(0));\n\thull.push_back(points(1));\n\tfor (int i = 2; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(hull[hull.size() - 2]->x - hull[hull.size() - 1]->x) * (points(i)->y - hull[hull.size() - 1]->y) -\n\t\t\t(hull[hull.size() - 2]->y - hull[hull.size() - 1]->y) * (points(i)->x - hull[hull.size() - 1]->x) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::TeamPolicy>;\n\tusing team_member_type = typename policy_type::member_type;\n\n\tconst size_t num_points = points.extent(0);\n\tconst size_t num_teams = (num_points + policy_type::team_size - 1) / policy_type::team_size;\n\n\t// Find the leftmost point\n\tteam_member_type team_member;\n\tPoint leftmost_point = points(0);\n\tfor (size_t i = 1; i < num_points; ++i) {\n\t\tif (points(i).x < leftmost_point.x) {\n\t\t\tleftmost_point = points(i);\n\t\t}\n\t}\n\n\t// Sort the points by polar angle with respect to the leftmost point\n\tstd::vector sorted_points(num_points);\n\tfor (size_t i = 0; i < num_points; ++i) {\n\t\tsorted_points[i] = points(i);\n\t}\n\tstd::sort(sorted_points.begin(), sorted_points.end(), [leftmost_point](Point const& p1, Point const& p2) {\n\t\tdouble angle1 = std::atan2(p1.y - leftmost_point.y, p1.x - leftmost_point.x);\n\t\tdouble angle2 = std::atan2(p2.y - leftmost_point.y, p2.x - leftmost_point.x);\n\t\treturn angle1 < angle2;\n\t});\n\n\t// Compute the perimeter of the convex hull using Graham's scan\n\tstd::vector convex_hull;\n\tconvex_hull.push_back(sorted_points[0]);\n\tconvex_hull.push_back(sorted_points[1]);\n\tfor (size_t i = 2; i < num_points; ++i) {\n\t\twhile (convex_hull.size() >= 2 &&\n\t\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t (sorted_points[i].y - convex_hull[convex_hull.size() - 1].y) -\n\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) *\n\t\t\t\t 
(sorted_points[i].x - convex_hull[convex_hull.size() - 1].x) < 0) {\n\t\t\tconvex_hull.pop_back();\n\t\t}\n\t\tconvex_hull.push_back(sorted_points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convex_hull.size(); ++i) {\n\t\tperimeter += distance(convex_hull[i], convex_hull[(i + 1) % convex_hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.data(), points.data() + points.extent(0), [](const Point* p1, const Point* p2) { return p1->x < p2->x; });\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector hull;\n\tfor (int i = 0; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tconst Point* p1 = hull[i];\n\t\t\tconst Point* p2 = hull[i + 1];\n\t\t\tconst Point* p3 = points[i];\n\t\t\tdouble crossProduct = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\tsum += crossProduct;\n\t\t}, Kokkos::Sum(0.0)) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tfor (int i = points.extent(0) - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tconst Point* p1 = hull[i];\n\t\t\tconst Point* p2 = hull[i + 1];\n\t\t\tconst Point* p3 = points[i];\n\t\t\tdouble crossProduct = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\tsum += crossProduct;\n\t\t}, Kokkos::Sum(0.0)) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points\n\tstd::vector hull;\n\tconvex_hull(points.data(), points.size(), std::back_inserter(hull));\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n\t\tKOKKOS_LAMBDA(const int& i, const int& j) {\n\t\t\treturn points(i)->x < points(j)->x;\n\t\t});\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points(0));\n\tconvexHull.push_back(points(points.extent(0) - 1));\n\tfor (int i = 1; i < points.extent(0) - 1; i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2]->y - convexHull[convexHull.size() - 1]->y) *\n\t\t\t(convexHull[convexHull.size() - 1]->x - points(i)->x) <\n\t\t\t(convexHull[convexHull.size() - 1]->y - points(i)->y) *\n\t\t\t(convexHull[convexHull.size() - 2]->x - convexHull[convexHull.size() - 1]->x)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(*convexHull[i], *convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector 
hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tgrahamScan(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Find the convex hull of the points\n\tstd::vector hull_points;\n\tconvex_hull(points, hull_points);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull_points.size(); ++i) {\n\t\tsize_t j = (i + 1) % hull_points.size();\n\t\tperimeter += distance(hull_points[i], hull_points[j]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points\n\tstd::vector hull;\n\tconvex_hull(points.data(), points.extent(0), std::back_inserter(hull));\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.extent(0));\n\tif (points.extent(0) < 3) {\n\t\tfor (int i = 0; i < points.extent(0); ++i) {\n\t\t\tconvexHull.push_back(points[i]);\n\t\t}\n\t} else {\n\t\t// Find the leftmost point.\n\t\tint leftmost = 0;\n\t\tfor (int i = 1; i < points.extent(0); ++i) {\n\t\t\tif (points[i].x < points[leftmost].x) {\n\t\t\t\tleftmost = i;\n\t\t\t}\n\t\t}\n\n\t\t// Start from the leftmost point and keep moving clockwise until we reach the leftmost point again.\n\t\tint current = leftmost;\n\t\tdo {\n\t\t\tconvexHull.push_back(points[current]);\n\t\t\tint next = (current + 1) % points.extent(0);\n\t\t\tfor (int i = 0; i < points.extent(0); ++i) {\n\t\t\t\tif (orientation(points[current], points[next], points[i]) == 2) {\n\t\t\t\t\tnext = i;\n\t\t\t\t}\n\t\t\t}\n\t\t\tcurrent = next;\n\t\t} while (current != leftmost);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute 
the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int& i, const int& j) {\n\t\treturn points(i)->x < points(j)->x;\n\t});\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\tfor (int i = 0; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tPoint* p1 = hull[i];\n\t\t\tPoint* p2 = hull[i + 1];\n\t\t\tPoint* p3 = points(i);\n\t\t\tdouble cross = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\treturn sum + cross;\n\t\t}, 0.0) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower 
hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[points.size() - 1]);\n\n\tfor (int i = 1; i < points.size() - 1; i++) {\n\t\tPoint const& p = points[i];\n\t\tif (convexHull.size() < 2 || (convexHull.back().x - convexHull[convexHull.size() - 2].x) * (p.y - convexHull.back().y) - (convexHull.back().y - convexHull[convexHull.size() - 2].y) * (p.x - convexHull.back().x) > 0) {\n\t\t\tconvexHull.push_back(p);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && crossProduct(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) 
{\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && (upperHull.back().y - upperHull[upperHull.size() - 2].y) * (point.x - upperHull.back().x) < (upperHull.back().x - upperHull[upperHull.size() - 2].x) * (point.y - upperHull.back().y)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\t\twhile (lowerHull.size() >= 2 && (lowerHull.back().y - lowerHull[lowerHull.size() - 2].y) * (point.x - lowerHull.back().x) > (lowerHull.back().x - lowerHull[lowerHull.size() - 2].x) * (point.y - lowerHull.back().y)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 2; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\tfor (int i = 2; i < n; i++) {\n\t\twhile (hull.size() >= 2 && crossProduct(hull[hull.size() - 2], hull[hull.size() - 1], points[i]) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t(points[i].y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t(points[i].x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper 
hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tstd::vector hull;\n\tstd::vector visited(points.size(), false);\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\tint current = leftmost;\n\tdo {\n\t\thull.push_back(points[current]);\n\t\tvisited[current] = true;\n\n\t\t// Find the next point on the hull\n\t\tint next = -1;\n\t\tfor (int i = 0; i < points.size(); i++) {\n\t\t\tif (!visited[i] && (next == -1 || points[i].x * hull.back().y - points[i].y * hull.back().x > points[next].x * hull.back().y - points[next].y * hull.back().x)) {\n\t\t\t\tnext = i;\n\t\t\t}\n\t\t}\n\t\tcurrent = next;\n\t} while (current != leftmost);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[i + 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (point.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) >\n\t\t\t\t (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - point.x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (point.y - 
lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) <\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - point.x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n\t\t\t\t (point.x - upperHull[upperHull.size() - 1].x) <=\n\t\t\t\t (upperHull[upperHull.size() - 1].y - point.y) *\n\t\t\t\t\t (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n\t\t\t\t (point.x - lowerHull[lowerHull.size() - 1].x) >=\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - point.y) *\n\t\t\t\t\t (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n < 3) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::vector visited(n, false);\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\tint p = leftmost;\n\tdo {\n\t\thull.push_back(points[p]);\n\t\tvisited[p] = true;\n\t\tint q = (p + 1) % n;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\tif (!visited[i] && (orientation(points[p], points[i], points[q]) == 2)) {\n\t\t\t\tq = i;\n\t\t\t}\n\t\t}\n\t\tp = q;\n\t} while (p != leftmost);\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (Point const& p : points) {\n\t\twhile (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], p) <= 0)\n\t\t\tupperHull.pop_back();\n\t\tupperHull.push_back(p);\n\n\t\twhile (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], p) >= 0)\n\t\t\tlowerHull.pop_back();\n\t\tlowerHull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < 
upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::vector visited(n, false);\n\tint p = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x < points[p].x) {\n\t\t\tp = i;\n\t\t}\n\t}\n\tint q = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x > points[q].x) {\n\t\t\tq = i;\n\t\t}\n\t}\n\thull.push_back(points[p]);\n\thull.push_back(points[q]);\n\tvisited[p] = true;\n\tvisited[q] = true;\n\twhile (p != q) {\n\t\tint next = -1;\n\t\tdouble max_dist = -1;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\tif (!visited[i]) {\n\t\t\t\tdouble dist = distance(points[i], hull.back());\n\t\t\t\tif (dist > max_dist) {\n\t\t\t\t\tmax_dist = dist;\n\t\t\t\t\tnext = i;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\thull.push_back(points[next]);\n\t\tvisited[next] = true;\n\t\tp = q;\n\t\tq = next;\n\t}\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && crossProduct(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinate\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the upper 
and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tupperHull.push_back(points[0]);\n\tupperHull.push_back(points[1]);\n\tlowerHull.push_back(points[0]);\n\tlowerHull.push_back(points[n - 1]);\n\n\tfor (int i = 2; i < n; i++) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - upperHull[upperHull.size() - 1].x) <=\n\t\t\t\t (upperHull[upperHull.size() - 1].y - points[i].y) *\n\t\t\t\t\t (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - lowerHull[lowerHull.size() - 1].x) >=\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - points[i].y) *\n\t\t\t\t\t (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (int i = 0; i < lowerHull.size() - 1; i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tperimeter += distance(upperHull[upperHull.size() - 1], lowerHull[lowerHull.size() - 1]);\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(points[hull[i]], points[hull[(i + 1) % hull.size()]]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the lower and upper hulls\n\tstd::vector lowerHull, upperHull;\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) == -1) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) == 1) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < lowerHull.size() - 1; ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tfor (size_t i = upperHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (points.size() <= 2) {\n\t\treturn 
0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (auto& point : points) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t (point.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t\t (hull.back().y - hull[hull.size() - 2].y) * (point.x - hull.back().x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(point);\n\t}\n\n\tint t = hull.size() - 1;\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() > t &&\n\t\t\t (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t\t (hull.back().y - hull[hull.size() - 2].y) * (points[i].x - hull.back().x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <= (points[i].x - hull.back().x) * (hull[hull.size() - 2].y - hull.back().y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (p.x - hull[hull.size() - 2].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan algorithm\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && orientation(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += 
distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.empty()) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t(points[i].y - hull.back().y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() < 3) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n\t\treturn a.x < b.x;\n\t});\n\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n\t\treturn a.y < b.y;\n\t});\n\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (p.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (p.y - hull[hull.size() - 2].y) * (hull.back().x - hull[hull.size() - 1].x)) 
{\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (p.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (p.y - hull[hull.size() - 2].y) * (hull.back().x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (points[i].x - hull.back().x) * (hull[hull.size() - 2].y - hull.back().y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the lower convex hull\n\tfor (Point const& point : points) {\n\t\twhile (hull.size() >= 2 && (point.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) > (point.y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - point.x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(point);\n\t}\n\n\t// Find the upper convex hull\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) > (points[i].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + 
"\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\n\tfor (Point const& p : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t(p.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) >\n\t\t\t(upperHull[upperHull.size() - 1].y - upperHull[upperHull.size() - 2].y) * (p.x - upperHull[upperHull.size() - 2].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(p);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t(p.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) <\n\t\t\t(lowerHull[lowerHull.size() - 1].y - lowerHull[lowerHull.size() - 2].y) * (p.x - lowerHull[lowerHull.size() - 2].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(p);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; i--) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y) - (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the leftmost and rightmost points\n\tPoint leftmost = points[0];\n\tPoint rightmost = points[points.size() - 1];\n\n\t// Add the leftmost and rightmost points to the hull\n\thull.push_back(leftmost);\n\thull.push_back(rightmost);\n\n\t// Find the upper and lower tangents of the convex hull\n\tstd::vector upperTangent;\n\tstd::vector lowerTangent;\n\n\t// Find the upper tangent\n\tfor (int i = 1; i < points.size(); i++) {\n\t\twhile (upperTangent.size() >= 2 && crossProduct(upperTangent[upperTangent.size() - 2], upperTangent[upperTangent.size() - 1], points[i]) <= 0) {\n\t\t\tupperTangent.pop_back();\n\t\t}\n\t\tupperTangent.push_back(points[i]);\n\t}\n\n\t// Find the lower tangent\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (lowerTangent.size() >= 2 && crossProduct(lowerTangent[lowerTangent.size() - 2], lowerTangent[lowerTangent.size() - 1], points[i]) <= 0) 
{\n\t\t\tlowerTangent.pop_back();\n\t\t}\n\t\tlowerTangent.push_back(points[i]);\n\t}\n\n\t// Merge the upper and lower tangents into the hull\n\tfor (int i = 0; i < lowerTangent.size(); i++) {\n\t\thull.push_back(lowerTangent[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y) - (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(points[i].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the lower hull\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(p.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(hull[hull.size() - 1].y - p.y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Find the upper hull\n\tfor (int i = points.size() - 2; i >= 0; i--) 
{\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(p.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(hull[hull.size() - 1].y - p.y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) -\n\t\t\t(points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distance(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Store the result in perimeter.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\n__global__ void convexHullPerimeter(const Point *points, size_t numPoints, double *perimeter) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points.\n\tsize_t numVertices;\n\tPoint *vertices;\n\tconvexHull(points, numPoints, vertices, numVertices);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0.0;\n\tfor (size_t i = 0; i < numVertices; i++) {\n\t\t*perimeter += distance(vertices[i], vertices[i + 1]);\n\t}\n}\n", + "\n\n\tPoint p0 = points[0];\n\tdouble minX = p0.x;\n\tfor (size_t i = 1; i < numPoints; i++) {\n\t\tif (points[i].x < minX) {\n\t\t\tminX = points[i].x;\n\t\t\tp0 = points[i];\n\t\t}\n\t}\n\n\tstd::vector hull;\n\thull.push_back(p0);\n\n\tPoint next = p0;\n\tdo {\n\t\tPoint bestPoint = points[0];\n\t\tdouble maxAngle = -1;\n\t\tfor (size_t i = 1; i < numPoints; i++) {\n\t\t\tif (points[i] == next) continue;\n\t\t\tdouble angle = atan2(points[i].y - next.y, points[i].x - next.x);\n\t\t\tif (angle > maxAngle) {\n\t\t\t\tmaxAngle = angle;\n\t\t\t\tbestPoint = points[i];\n\t\t\t}\n\t\t}\n\t\thull.push_back(bestPoint);\n\t\tnext = bestPoint;\n\t} while (next != p0);\n\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tstd::vector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tPoint p0 = points[0];\n\tdouble minX = p0.x, maxX = p0.x;\n\tdouble minY = p0.y, maxY = p0.y;\n\tfor (size_t i = 1; i < numPoints; i++) {\n\t\tPoint p = points[i];\n\t\tminX = min(minX, p.x);\n\t\tmaxX = max(maxX, p.x);\n\t\tminY = min(minY, p.y);\n\t\tmaxY = max(maxY, p.y);\n\t}\n\n\tvector upper, lower;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tPoint p = points[i];\n\t\tif (p.y == minY) {\n\t\t\tlower.push_back(p);\n\t\t}\n\t\tif (p.y == maxY) {\n\t\t\tupper.push_back(p);\n\t\t}\n\t}\n\n\tsort(lower.begin(), lower.end(), [](Point const& p1, Point const& p2) { return p1.x < 
p2.x; });\n\tsort(upper.begin(), upper.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\tvector ch;\n\tch.push_back(lower[0]);\n\tfor (size_t i = 1; i < lower.size(); i++) {\n\t\twhile (ch.size() >= 2 && (ch.back().y - ch[ch.size() - 2].y) * (lower[i].x - ch.back().x) < (ch.back().x - ch[ch.size() - 2].x) * (lower[i].y - ch.back().y)) {\n\t\t\tch.pop_back();\n\t\t}\n\t\tch.push_back(lower[i]);\n\t}\n\n\tfor (size_t i = upper.size() - 1; i >= 0; i--) {\n\t\twhile (ch.size() >= 2 && (ch.back().y - ch[ch.size() - 2].y) * (upper[i].x - ch.back().x) < (ch.back().x - ch[ch.size() - 2].x) * (upper[i].y - ch.back().y)) {\n\t\t\tch.pop_back();\n\t\t}\n\t\tch.push_back(upper[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < ch.size(); i++) {\n\t\tperimeter += distance(ch[i], ch[(i + 1) % ch.size()]);\n\t}\n\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint numBlocks = hipGridDim_x;\n\tint numThreads = hipBlockDim_x;\n\n\tPoint p1 = points[tid];\n\tdouble minDist = DBL_MAX;\n\tPoint p2;\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tif (i == tid) continue;\n\t\tPoint p3 = points[i];\n\t\tdouble dist = distance(p1, p3);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tp2 = p3;\n\t\t}\n\t}\n\n\t__shared__ double sharedMinDist[numBlocks];\n\t__shared__ Point sharedP2[numBlocks];\n\n\tsharedMinDist[bid] = minDist;\n\tsharedP2[bid] = p2;\n\n\t__syncthreads();\n\n\tdouble globalMinDist = DBL_MAX;\n\tPoint globalP2;\n\tfor (int i = 0; i < numBlocks; i++) {\n\t\tif (sharedMinDist[i] < globalMinDist) {\n\t\t\tglobalMinDist = sharedMinDist[i];\n\t\t\tglobalP2 = sharedP2[i];\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*perimeter = 2 * globalMinDist + distance(p1, globalP2);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < numPoints; ++i) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\t// Initialize the convex hull with the leftmost point\n\tstd::vector hull;\n\thull.push_back(points[leftmost]);\n\n\t// Find the convex hull using Graham's scan\n\tint p = leftmost;\n\tdo {\n\t\tint q = (p + 1) % numPoints;\n\t\tfor (int i = 0; i < numPoints; ++i) {\n\t\t\tif (i == p || i == q) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tif (orientation(points[p], points[q], points[i]) == -1) {\n\t\t\t\tq = i;\n\t\t\t}\n\t\t}\n\t\thull.push_back(points[q]);\n\t\tp = q;\n\t} while (p != leftmost);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (int i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < numPoints; i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\t// Initialize the 
convex hull with the leftmost point\n\tvector hull = { points[leftmost] };\n\n\t// Find the next point to add to the convex hull\n\twhile (true) {\n\t\t// Find the point that is farthest from the line connecting the last two points in the convex hull\n\t\tint farthest = -1;\n\t\tdouble maxDistance = 0;\n\t\tfor (int i = 0; i < numPoints; i++) {\n\t\t\t// Skip points that are already in the convex hull\n\t\t\tif (find(hull.begin(), hull.end(), points[i]) != hull.end()) {\n\t\t\t\tcontinue;\n\t\t\t}\n\n\t\t\t// Compute the distance from the point to the line connecting the last two points in the convex hull\n\t\t\tPoint p1 = hull[hull.size() - 2];\n\t\t\tPoint p2 = hull[hull.size() - 1];\n\t\t\tdouble distance = fabs((p2.y - p1.y) * points[i].x - (p2.x - p1.x) * points[i].y + p2.x * p1.y - p2.y * p1.x) / sqrt(pow(p2.y - p1.y, 2) + pow(p2.x - p1.x, 2));\n\n\t\t\t// If the distance is greater than the maximum distance so far, update the farthest point and the maximum distance\n\t\t\tif (distance > maxDistance) {\n\t\t\t\tfarthest = i;\n\t\t\t\tmaxDistance = distance;\n\t\t\t}\n\t\t}\n\n\t\t// If no point is farther from the line connecting the last two points in the convex hull, the convex hull is complete\n\t\tif (farthest == -1) {\n\t\t\tbreak;\n\t\t}\n\n\t\t// Add the farthest point to the convex hull\n\t\thull.push_back(points[farthest]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[i + 1]);\n\t}\n\n\t// Store the perimeter in the output variable\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points, points + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && (upperHull[upperHull.size() - 1].y - upperHull[upperHull.size() - 2].y) * (points[i].x - upperHull[upperHull.size() - 2].x) < (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) * (points[i].y - upperHull[upperHull.size() - 2].y)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && (lowerHull[lowerHull.size() - 1].y - lowerHull[lowerHull.size() - 2].y) * (points[i].x - lowerHull[lowerHull.size() - 2].x) > (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) * (points[i].y - lowerHull[lowerHull.size() - 2].y)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\t*perimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\t*perimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the 
points using the Graham scan algorithm.\n\tvector hull;\n\tfor (size_t i = 0; i < numPoints; ++i) {\n\t Point p = points[i];\n\t while (hull.size() >= 2 && crossProduct(hull[hull.size() - 2], hull.back(), p) <= 0) {\n\t hull.pop_back();\n\t }\n\t hull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinate\n\tstd::sort(points, points + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\t*perimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\t*perimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points\n\tPoint *hull = new Point[numPoints];\n\tint hullSize = convexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hullSize; i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hullSize]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 
2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[points.size() - 1]);\n\tfor (int i = 1; i < points.size() - 1; i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::vector hullIndices;\n\thullIndices.reserve(points.size());\n\tgrahamScan(points, convexHull, hullIndices);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < 
convexHull.size(); i++) {\n\t\tint nextIndex = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[nextIndex]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2) {\n\t\t\tPoint p1 = convexHull[convexHull.size() - 2];\n\t\t\tPoint p2 = convexHull[convexHull.size() - 1];\n\t\t\tPoint p3 = points[i];\n\t\t\tdouble cross = (p2.x - p1.x) * (p3.y - p2.y) - (p2.y - p1.y) * (p3.x - p2.x);\n\t\t\tif (cross > 0) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint left = 0, right = points.size() - 1;\n\twhile (left < right) {\n\t\tconvexHull.push_back(points[left++]);\n\t\twhile (left < right && (convexHull.back().y - points[left].y) * (convexHull.back().x - points[right].x) <= (convexHull.back().y - points[right].y) * (convexHull.back().x - points[left].x)) {\n\t\t\tright--;\n\t\t}\n\t\tconvexHull.push_back(points[right--]);\n\t\twhile (left < right && (convexHull.back().y - points[left].y) * (convexHull.back().x - points[right].x) <= (convexHull.back().y - points[right].y) * (convexHull.back().x - points[left].x)) {\n\t\t\tleft++;\n\t\t}\n\t}\n\tconvexHull.push_back(points[left]);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y 
!= convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Send the convex hull to rank 0\n\tstd::vector globalConvexHull;\n\tMPI_Gather(&convexHull[0], convexHull.size(), MPI_DOUBLE, &globalConvexHull[0], globalConvexHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Compute the perimeter of the convex hull on rank 0\n\tif (rank == 0) {\n\t\tdouble perimeter = 0;\n\t\tfor (size_t i = 0; i < globalConvexHull.size(); i++) {\n\t\t\tperimeter += distance(globalConvexHull[i], globalConvexHull[(i + 1) % globalConvexHull.size()]);\n\t\t}\n\t\treturn perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint const rank = MPI::COMM_WORLD.Get_rank();\n\tint const size = MPI::COMM_WORLD.Get_size();\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tconvexHull.push_back(points[0]);\n\tfor (size_t i = 1; i < points.size(); ++i) {\n\t\t// Find the first point in the convex hull that is to the left of the line through the last two points\n\t\tsize_t j = convexHull.size() - 1;\n\t\twhile (j > 0 && (convexHull[j].x - convexHull[j-1].x) * (points[i].y - convexHull[j].y) -\n\t\t\t(convexHull[j].y - convexHull[j-1].y) * (points[i].x - convexHull[j].x) < 0) {\n\t\t\t--j;\n\t\t}\n\t\t// Add the point to the convex hull\n\t\tconvexHull.insert(convexHull.begin() + j + 1, points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI::COMM_WORLD.Reduce(&perimeter, &globalPerimeter, 1, MPI::DOUBLE, MPI::SUM, 0);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeters from all ranks to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the global perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the total perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, 
size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (auto const& point : points) {\n\t\tif (convexHull.empty()) {\n\t\t\tconvexHull.push_back(point);\n\t\t} else {\n\t\t\twhile (convexHull.size() >= 2 &&\n\t\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t\t(point.y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t(point.x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\t\tconvexHull.pop_back();\n\t\t\t}\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2 && (points[next] - convexHull.back()).cross(points[i] - convexHull.back()) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || convexHull.back() != point) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t\twhile (convexHull.size() >= 3 &&\n\t\t\t (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 3].x) *\n\t\t\t\t\t (point.y - convexHull[convexHull.size() - 2].y) -\n\t\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 3].y) *\n\t\t\t\t\t (point.x - convexHull[convexHull.size() - 2].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) 
{\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Find the total perimeter of the convex hulls on all ranks\n\tdouble totalPerimeter = 0;\n\tfor (int i = 0; i < size; i++) {\n\t\tif (i == rank) {\n\t\t\tfor (int j = 0; j < convexHull.size(); j++) {\n\t\t\t\ttotalPerimeter += distance(convexHull[j], convexHull[(j + 1) % convexHull.size()]);\n\t\t\t}\n\t\t}\n\t\tMPI_Reduce(&totalPerimeter, &totalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\t}\n\n\t// Return the total perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn totalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (auto const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Reduce the convex hulls from all ranks to rank 0\n\tstd::vector globalConvexHull;\n\tMPI_Reduce(&convexHull[0], &globalConvexHull[0], convexHull.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Compute the perimeter of the global convex hull on rank 0\n\tif (rank == 0) {\n\t\tdouble perimeter = 0;\n\t\tfor (size_t i = 0; i < globalConvexHull.size(); i++) {\n\t\t\tperimeter += distance(globalConvexHull[i], globalConvexHull[(i + 1) % globalConvexHull.size()]);\n\t\t}\n\t\treturn perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (auto it = points.begin(); it != points.end(); ++it) {\n\t\twhile (convexHull.size() >= 2 && (it->y - convexHull[convexHull.size() - 2].y) * (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) < (it->x - convexHull[convexHull.size() - 2].x) * (convexHull[convexHull.size() - 1].y - convexHull[convexHull.size() - 2].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(*it);\n\t}\n\tconvexHull.pop_back();\n\tfor (auto it = points.rbegin(); it != points.rend(); ++it) {\n\t\twhile (convexHull.size() >= 2 && (it->y - convexHull[convexHull.size() - 2].y) * (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) < (it->x - convexHull[convexHull.size() - 2].x) * (convexHull[convexHull.size() - 1].y - convexHull[convexHull.size() - 2].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(*it);\n\t}\n\tconvexHull.pop_back();\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, 
&globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2 && (points[next] - convexHull.back()).cross(points[i] - convexHull.back()) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& p : points) {\n\t}\n\n\t// Compute the total perimeter\n\tdouble totalPerimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t}\n\n\t// Reduce the total perimeter to rank 0\n\tdouble finalPerimeter;\n\tMPI_Reduce(&totalPerimeter, &finalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the final perimeter on rank 0\n\tif (rank == 0) {\n\t}\n}\n", + "Hull: 1. 
Synag", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || convexHull.back() != point) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t\twhile (convexHull.size() >= 3 &&\n\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 3].x) *\n\t\t\t\t(point.y - convexHull[convexHull.size() - 2].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 3].y) *\n\t\t\t\t(point.x - convexHull[convexHull.size() - 2].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distance(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Store the result in perimeter.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\n__global__ void convexHullPerimeter(const Point *points, size_t numPoints, double *perimeter) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint numBlocks = gridDim.x;\n\tint numThreads = blockDim.x;\n\tint start = tid + bid * numThreads;\n\tdouble localPerimeter = 0;\n\tfor (int i = start; i < numPoints - 1; i++) {\n\t\tlocalPerimeter += distance(points[i], points[i + 1]);\n\t}\n\tlocalPerimeter += distance(points[numPoints - 1], points[0]);\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tdouble globalPerimeter = 0;\n\t\tfor (int i = 0; i < numBlocks; i++) {\n\t\t\tdouble blockPerimeter;\n\t\t\tcudaMemcpy(&blockPerimeter, &localPerimeter, sizeof(double), cudaMemcpyDeviceToHost);\n\t\t\tglobalPerimeter += blockPerimeter;\n\t\t}\n\t\t*perimeter = globalPerimeter;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Compute the convex hull of the points using Graham's scan.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output 
variable.\n\t*perimeter = perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tPoint *convexHull;\n\tsize_t convexHullSize;\n\tconvexHull2(points, numPoints, &convexHull, &convexHullSize);\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < convexHullSize; i++) {\n\t\t*perimeter += distance(convexHull[i], convexHull[(i + 1) % convexHullSize]);\n\t}\n\tfree(convexHull);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tPoint p = points[tid];\n\tdouble minDist = DBL_MAX;\n\tPoint closestPoint;\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tif (i == tid) {\n\t\t\tcontinue;\n\t\t}\n\t\tPoint q = points[i];\n\t\tdouble dist = distance(p, q);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tclosestPoint = q;\n\t\t}\n\t}\n\t*perimeter += minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Sort the points by their x-coordinates\n\tPoint *sortedPoints = new Point[numPoints];\n\tthrust::copy(points, points + numPoints, sortedPoints);\n\tthrust::sort(sortedPoints, sortedPoints + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\twhile (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], sortedPoints[i]) <= 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(sortedPoints[i]);\n\t\twhile (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], sortedPoints[i]) >= 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(sortedPoints[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": 
false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= numPoints) return;\n\n\tPoint p = points[i];\n\tdouble minDist = DBL_MAX;\n\tPoint minPoint;\n\tfor (int j = 0; j < numPoints; j++) {\n\t\tif (i == j) continue;\n\t\tPoint q = points[j];\n\t\tdouble dist = distance(p, q);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tminPoint = q;\n\t\t}\n\t}\n\t*perimeter += minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "Hull.2)", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Sort the points by their x-coordinates\n\tPoint *sortedPoints = new Point[numPoints];\n\tmemcpy(sortedPoints, points, numPoints * sizeof(Point));\n\tstd::sort(sortedPoints, sortedPoints + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\twhile (upperHull.size() >= 2 && 
crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], sortedPoints[i]) <= 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(sortedPoints[i]);\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], sortedPoints[i]) >= 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(sortedPoints[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size() - 1; i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tperimeter += distance(upperHull[upperHull.size() - 1], lowerHull[lowerHull.size() - 1]);\n\n\t*perimeter = perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan algorithm\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint 
tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tPoint p = points[tid];\n\t\tdouble minDist = DBL_MAX;\n\t\tPoint closestPoint;\n\t\tfor (int i = 0; i < numPoints; i++) {\n\t\t\tif (i == tid) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tPoint q = points[i];\n\t\t\tdouble dist = distance(p, q);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t\tclosestPoint = q;\n\t\t\t}\n\t\t}\n\t\tatomicAdd(perimeter, minDist);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (int i = 0; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector 
convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\tint next = i + 1;\n\t\tif (next == points.size()) {\n\t\t\tnext = 0;\n\t\t}\n\t\tint prev = i - 1;\n\t\tif (prev == -1) {\n\t\t\tprev = points.size() - 1;\n\t\t}\n\t\tdouble area = (points[next].x - points[prev].x) * (points[i].y - points[prev].y) - (points[next].x - points[prev].x) * (points[i].y - points[next].y);\n\t\tif (area >= 0) {\n\t\t\tconvexHull.push_back(points[i]);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = i + 1;\n\t\tif (next == convexHull.size()) {\n\t\t\tnext = 0;\n\t\t}\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (int i = 0; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint n = points.size();\n\tif (n <= 2) {\n\t\tconvexHull = points;\n\t} else {\n\t\tint m = 0;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\twhile (m >= 2 && (points[i].x - convexHull[m-1].x) * (convexHull[m-2].y - convexHull[m-1].y) - (points[i].y - convexHull[m-1].y) * (convexHull[m-2].x - convexHull[m-1].x) < 0) {\n\t\t\t\tm--;\n\t\t\t}\n\t\t\tconvexHull[m++] = points[i];\n\t\t}\n\t\tfor (int i = n-2, t = m+1; i >= 0; i--) {\n\t\t\twhile (m >= t && (points[i].x - 
convexHull[m-1].x) * (convexHull[m-2].y - convexHull[m-1].y) - (points[i].y - convexHull[m-1].y) * (convexHull[m-2].x - convexHull[m-1].x) < 0) {\n\t\t\t\tm--;\n\t\t\t}\n\t\t\tconvexHull[m++] = points[i];\n\t\t}\n\t\tconvexHull.resize(m);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham scan algorithm\n\tstd::vector convexHull = grahamScan(points);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t\t (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) *\n\t\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t(convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) 
{\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham scan\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y); });\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size()-2].y - hull[hull.size()-1].y) * (p.x - hull[hull.size()-1].x) < (hull[hull.size()-1].x - hull[hull.size()-2].x) * (p.y - hull[hull.size()-1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\tint n = hull.size();\n\tfor (int i = n-2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size()-2].y - hull[hull.size()-1].y) * (p.x - hull[hull.size()-1].x) < (hull[hull.size()-1].x - hull[hull.size()-2].x) * (p.y - hull[hull.size()-1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < n; i++) {\n\t\tperimeter += distance(hull[i], hull[(i+1)%n]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the hull in parallel\n\tdouble perimeter = 0.0;\n\t#pragma omp parallel for reduction(+:perimeter)\n\tfor (int i = 0; i < hull.size(); ++i) {\n\t\tint next = (i + 1) % hull.size();\n\t\tperimeter += distance(hull[i], hull[next]);\n\t}\n\n\t// Return the perimeter\n\treturn perimeter;\n}\n\n", + "\n\n\tint const rank = 0;\n\tint const size = MPI::COMM_WORLD.Get_size();\n\tint const points_per_rank = points.size() / size;\n\tint const remainder = points.size() % size;\n\n\tstd::vector local_points = points;\n\tlocal_points.resize(points_per_rank + (rank < remainder));\n\n\tstd::vector convex_hull;\n\tconvexHull(local_points, convex_hull);\n\n\tdouble local_perimeter = 0;\n\tfor (size_t i = 0; i < convex_hull.size(); ++i) {\n\t\tlocal_perimeter += distance(convex_hull[i], convex_hull[(i+1)%convex_hull.size()]);\n\t}\n\n\tdouble global_perimeter;\n\tMPI::COMM_WORLD.Reduce(&local_perimeter, &global_perimeter, 1, 
MPI_DOUBLE, MPI_SUM, rank);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint n = points.size();\n\tfor (int i = 0; i < n; i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <= (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = n - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <= (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint left = 0, right = 0;\n\twhile (left < points.size()) {\n\t\twhile (right < points.size() && points[right].x == points[left].x) {\n\t\t\tright++;\n\t\t}\n\t\tint n = right - left;\n\t\tif (n > 1) {\n\t\t\tstd::sort(points.begin() + left, points.begin() + right, [](Point const& p1, Point const& p2) { return p1.y < p2.y; });\n\t\t\tfor (int i = 0; i < n; i++) {\n\t\t\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[left + i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) * (points[left + i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\t\t\tconvexHull.pop_back();\n\t\t\t\t}\n\t\t\t\tconvexHull.push_back(points[left + i]);\n\t\t\t}\n\t\t}\n\t\tleft = right;\n\t}\n\tconvexHull.push_back(convexHull[0]);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size() - 1; i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[i + 1]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, 
&size);\n\n\tint n = points.size();\n\tstd::vector hull;\n\n\t// Find the convex hull of the points using Graham's scan.\n\tfor (int i = 0; i < n; i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points[i].x - hull[hull.size() - 1].x) <\n\t\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tint m = hull.size();\n\tfor (int i = n - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= m &&\n\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points[i].x - hull[hull.size() - 1].x) <\n\t\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\thull.pop_back();\n\n\t// Compute the perimeter of the convex hull in parallel.\n\tdouble perimeter = 0;\n\t#pragma omp parallel for reduction(+ : perimeter)\n\tfor (int i = 0; i < m; i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % m]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble global_perimeter;\n\tMPI_Reduce(&perimeter, &global_perimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (Point const& p : points) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size()-2].y - convexHull[convexHull.size()-1].y) * (p.x - convexHull[convexHull.size()-1].x) < (convexHull[convexHull.size()-1].x - convexHull[convexHull.size()-2].x) * (p.y - convexHull[convexHull.size()-1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(p);\n\t}\n\tconvexHull.pop_back();\n\tfor (int i = points.size()-2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size()-2].y - convexHull[convexHull.size()-1].y) * (points[i].x - convexHull[convexHull.size()-1].x) < (convexHull[convexHull.size()-1].x - convexHull[convexHull.size()-2].x) * (points[i].y - convexHull[convexHull.size()-1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tconvexHull.pop_back();\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) 
{\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t\t (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) *\n\t\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint const rank = MPI::COMM_WORLD.Get_rank();\n\tint const size = MPI::COMM_WORLD.Get_size();\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convex_hull;\n\tconvex_hull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tconvex_hull.push_back(points[0]);\n\tconvex_hull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (convex_hull.size() >= 2 &&\n\t\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t\t (convex_hull[convex_hull.size() - 1].y - points[i].y) -\n\t\t\t\t\t (convex_hull[convex_hull.size() - 1].x - points[i].x) *\n\t\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) 
<\n\t\t\t\t 0) {\n\t\t\tconvex_hull.pop_back();\n\t\t}\n\t\tconvex_hull.push_back(points[i]);\n\t}\n\tint const convex_hull_size = convex_hull.size();\n\twhile (convex_hull.size() >= 2 &&\n\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t (convex_hull[convex_hull.size() - 1].y - convex_hull[0].y) -\n\t\t\t\t (convex_hull[convex_hull.size() - 1].x - convex_hull[0].x) *\n\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) <\n\t\t\t 0) {\n\t\tconvex_hull.pop_back();\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convex_hull_size; ++i) {\n\t\tperimeter += distance(convex_hull[i], convex_hull[(i + 1) % convex_hull_size]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble global_perimeter;\n\tMPI::COMM_WORLD.Allreduce(&perimeter, &global_perimeter, 1, MPI::DOUBLE, MPI::SUM);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &size) {\n if (visited(i) == 0) {\n size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n });\n }\n }, size);\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices.\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices.\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0.\n int largestComponentSize = 0;\n\n // Iterate over all vertices.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a depth-first search to find the size of its component.\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n\n // Update the largest component size if necessary.\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n // Return the largest component size.\n return largestComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int componentSize = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &componentSize) {\n if (visited[i] == 0) {\n componentSize++;\n visited[i] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[i][j] == 1 && visited[j] == 0) {\n visited[j] = 1;\n }\n }\n }\n });\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n return 
maxComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n depthFirstSearch(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n\n // Update the largest component size if necessary\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited(i) == 0) {\n component_size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n });\n }\n }, component_size);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, false);\n int largest = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int size = 0;\n std::stack s;\n s.push(i);\n while (!s.empty()) {\n size_t v = s.top();\n s.pop();\n if (!visited[v]) {\n visited[v] = true;\n size++;\n for (size_t j = 0; j < N; j++) {\n if (A(i, j) && !visited[j]) {\n s.push(j);\n }\n }\n }\n }\n largest = std::max(largest, size);\n }\n }\n return largest;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited[i] == 0) {\n component_size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n visited[j] = 1;\n }\n });\n }\n }, component_size);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n using namespace Kokkos;\n\n // Initialize the visited array to false for all vertices\n View visited(\"visited\", N);\n visited() = false;\n\n // Initialize the component size array to 0 for all vertices\n View componentSize(\"componentSize\", N);\n componentSize() = 
0;\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component containing it\n if (!visited(i)) {\n int componentSize = 0;\n DFS(A, i, visited, componentSize);\n\n // Update the largest component size if necessary\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n return largestComponentSize;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n int max_component = 0;\n Kokkos::parallel_reduce(\n \"largestComponent\",\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_component) {\n // Perform a depth-first search to count the number of vertices in the\n // connected component containing vertex i.\n std::vector visited(N, false);\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n int v = stack.top();\n stack.pop();\n if (!visited[v]) {\n visited[v] = true;\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A(i, j) && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n },\n max_component);\n return max_component;\n}\n", + "\n\n // Initialize the visited array to false\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n\n // Initialize the component size array to 0\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize() = 0;\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited(i)) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n\n // Update the largest component size\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n return largestComponentSize;\n}\n", + "Kokok", + "\n\n // Create a vector to store the size of each component.\n Kokkos::View component_sizes(\"component_sizes\", N);\n\n // Initialize the component sizes to 1.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n component_sizes[i] = 1;\n });\n\n // Iterate over the adjacency matrix.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) {\n // If there is an edge between i and j, then they are in the same component.\n if (A(i, j) == 1) {\n // Find the smaller component size.\n int smaller_component_size = std::min(component_sizes[i], component_sizes[j]);\n\n // Update the component sizes.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int k) {\n if (component_sizes[k] == smaller_component_size) {\n component_sizes[k] = std::max(component_sizes[i], component_sizes[j]);\n }\n });\n }\n }\n });\n\n // Find the largest component size.\n int largest_component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &largest_component_size) {\n largest_component_size = std::max(largest_component_size, component_sizes[i]);\n });\n\n return largest_component_size;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component_size = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int &j, int &local_component_size) {\n if (A(i, j) == 1) {\n local_component_size += 1;\n 
}\n }, component_size);\n max_component_size = std::max(max_component_size, component_size);\n visited(i) = 1;\n }\n });\n return max_component_size;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t j, int &component_size) {\n if (visited(j) == 0 && A(i, j) == 1) {\n component_size++;\n visited(j) = 1;\n }\n });\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n int max_component = 0;\n Kokkos::parallel_reduce(\n \"Largest Component\",\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_component) {\n int component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n int current = q.front();\n q.pop();\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A(current, j) && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n },\n max_component);\n\n return max_component;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited(i) == 0) {\n component_size += 1;\n visited(i) = 1;\n for (size_t j = 0; j < N; ++j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n }\n }\n });\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::TeamPolicy;\n using Kokkos::Work;\n\n // Initialize the visited array\n Kokkos::View visited(\"visited\", N, N);\n parallel_for(TeamPolicy(Kokkos::DefaultExecutionSpace(), N), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) { visited(i, j) = false; }\n });\n\n // Perform DFS on each unvisited node\n int max_component = 0;\n parallel_for(TeamPolicy(Kokkos::DefaultExecutionSpace(), N), KOKKOS_LAMBDA(const int i) {\n if (!visited(i, i)) {\n int component_size = 0;\n dfs(i, i, visited, A, component_size);\n max_component = std::max(max_component, component_size);\n }\n });\n\n return max_component;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::View componentSize(\"componentSize\", N);\n Kokkos::View componentId(\"componentId\", N);\n Kokkos::View queue(\"queue\", N);\n\n int maxComponentSize = 0;\n int componentCount = 0;\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (visited(i) == 0) {\n int start = 0;\n int end = 0;\n queue(end++) = i;\n visited(i) = 1;\n componentId(i) = componentCount;\n while (start != end) {\n int current = queue(start++);\n componentSize(componentId(current))++;\n for (int j = 0; j < N; j++) {\n if (A(current, j) == 1 && visited(j) == 0) {\n queue(end++) = j;\n visited(j) = 1;\n componentId(j) = componentCount;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize(componentCount));\n componentCount++;\n }\n });\n\n return maxComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n int max_component_size = 0;\n\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, 
component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(Kokkos::View &A, size_t i, Kokkos::View &visited, int &component_size) {\n visited[i] = true;\n component_size++;\n\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(i, j) && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n\n // If the vertex has not been visited, perform a depth-first search to find the size of its component\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit in the stack\n while (!stack.empty()) {\n // Pop the next vertex from the stack\n size_t vertex = stack.top();\n stack.pop();\n\n // If the vertex has not been visited, mark it as visited and add it to the component\n if (!visited[vertex]) {\n visited[vertex] = true;\n componentSize++;\n\n // Push all of the vertex's neighbors onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the largest component size if the current component is larger\n if (componentSize > largestComponentSize) {\n largestComponentSize = componentSize;\n }\n\n // Update the component size array with the size of the current component\n componentSize[i] = componentSize;\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n int max_component_size = 0;\n int component_size = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n #pragma omp parallel for reduction(max:max_component_size)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n delete[] visited;\n return max_component_size;\n}\n", + "\n\n // Create a vector to store the visited 
nodes.\n std::vector visited(N, false);\n\n // Create a vector to store the component sizes.\n std::vector componentSizes(N, 0);\n\n // Create a parallel region.\n #pragma omp parallel\n {\n // Loop over the nodes.\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, perform a depth-first search to find all the nodes in its component.\n if (! visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n while (! stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (! visited[node]) {\n componentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[node * N + j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n componentSizes[i] = componentSize;\n }\n }\n }\n\n // Find the largest component size.\n int largestComponentSize = *std::max_element(componentSizes.begin(), componentSizes.end());\n\n // Return the largest component size.\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, 0);\n int largest_component_size = 0;\n #pragma omp parallel\n {\n int local_largest_component_size = 0;\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n ++component_size;\n visited[vertex] = 1;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n local_largest_component_size = std::max(local_largest_component_size, component_size);\n }\n }\n #pragma omp critical\n largest_component_size = std::max(largest_component_size, local_largest_component_size);\n }\n return largest_component_size;\n}\n", + "\n\n int* visited = new int[N];\n int* componentSize = new int[N];\n int maxComponentSize = 0;\n int componentCount = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = 0;\n dfs(A, i, visited, componentSize, size);\n componentCount++;\n maxComponentSize = std::max(maxComponentSize, size);\n }\n }\n delete[] visited;\n delete[] componentSize;\n return maxComponentSize;\n}\n", + "\n\n int max_size = 0;\n int max_component = -1;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = dfs(A, i, visited);\n if (size > max_size) {\n max_size = size;\n max_component = i;\n }\n }\n }\n delete[] visited;\n return max_size;\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n component_size++;\n for 
(size_t j = 0; j < N; j++) {\n if (A[u][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n", + "\n\n int max_component = 0;\n int visited[N];\n memset(visited, 0, sizeof(visited));\n\n #pragma omp parallel for reduction(max:max_component)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n\n return max_component;\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n", + "\n\n int N = A.size();\n std::vector component(N, -1);\n int maxComponent = 0;\n int currentComponent = 0;\n #pragma omp parallel for reduction(max:maxComponent) reduction(+:currentComponent)\n for (int i = 0; i < N; i++) {\n if (component[i] == -1) {\n currentComponent = 0;\n dfs(A, i, component, currentComponent);\n maxComponent = std::max(maxComponent, currentComponent);\n }\n }\n return maxComponent;\n}\n\n", + "\n\n int max_component_size = 0;\n int component_size = 0;\n int visited[N];\n memset(visited, 0, sizeof(visited));\n\n #pragma omp parallel for reduction(max: max_component_size)\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0) {\n component_size = 0;\n dfs(i, A, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(int i, std::vector const& A, int visited[], int& component_size) {\n visited[i] = 1;\n component_size++;\n for (int j = 0; j < A.size(); j++) {\n if (A[i][j] == 1 && visited[j] == 0) {\n dfs(j, A, visited, component_size);\n }\n }\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n int max_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n visited[i] = true;\n component_size++;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = 
std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component containing the vertex\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // While the stack is not empty, pop a vertex from the stack and visit all its unvisited neighbors\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n componentSize++;\n\n // Iterate over all neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is unvisited and there is an edge between the vertex and the neighbor, push the neighbor onto the stack and mark it as visited\n if (!visited[j] && A[vertex][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Update the largest component size if the current component size is larger\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Create a vector to store the size of each component.\n std::vector componentSize(N, 0);\n\n // Create a vector to store the component ID for each vertex.\n std::vector componentID(N, -1);\n\n // Initialize the component ID for each vertex.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n componentID[i] = i;\n }\n\n // Iterate over the adjacency matrix.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n // If there is an edge between vertex i and vertex j, then they are in the same component.\n if (A[i][j] == 1) {\n // Find the component ID for vertex i.\n int iComponentID = componentID[i];\n\n // Find the component ID for vertex j.\n int jComponentID = componentID[j];\n\n // If the component IDs are different, then merge the two components.\n if (iComponentID != jComponentID) {\n // Find the smaller component ID.\n int smallerComponentID = std::min(iComponentID, jComponentID);\n\n // Find the larger component ID.\n int largerComponentID = std::max(iComponentID, jComponentID);\n\n // Update the component ID for each vertex in the smaller component.\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n if (componentID[k] == smallerComponentID) {\n componentID[k] = largerComponentID;\n }\n }\n\n // Update the size of the larger component.\n componentSize[largerComponentID] += componentSize[smallerComponentID];\n }\n }\n }\n }\n\n // Find the largest component.\n int largestComponentSize = 0;\n for (size_t i = 0; i < N; i++) {\n largestComponentSize = std::max(largestComponentSize, componentSize[i]);\n }\n\n // Return the size of the largest component.\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 
0; i < N; i++) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component containing the vertex\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // While there are vertices in the stack, pop a vertex and visit all its unvisited neighbors\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n componentSize++;\n\n // Iterate over all neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is unvisited and there is an edge between the vertex and the neighbor, push the neighbor onto the stack and mark it as visited\n if (!visited[j] && A[vertex][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Update the largest component size if the current component size is larger\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n int nComponents = 0;\n std::vector visited(N, 0);\n std::vector componentSize(N, 0);\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int size = dfs(A, i, visited, componentSize);\n nComponents++;\n if (size > componentSize[0]) {\n componentSize[0] = size;\n }\n }\n }\n return componentSize[0];\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "serial", + "prompt": "/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector parent(N);\n std::vector size(N);\n for (size_t i = 0; i < N; ++i) {\n parent[i] = i;\n size[i] = 1;\n }\n\n auto find = [&](int x) {\n if (parent[x] != x) {\n parent[x] = find(parent[x]);\n }\n return parent[x];\n };\n\n auto union_ = [&](int x, int y) {\n int rootX = find(x);\n int rootY = find(y);\n if (rootX != rootY) {\n if (size[rootX] < size[rootY]) {\n parent[rootX] = rootY;\n size[rootY] += size[rootX];\n } else {\n parent[rootY] = rootX;\n size[rootX] += size[rootY];\n }\n }\n };\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n if (A[i][j] == 1) {\n union_(i, j);\n }\n }\n }\n\n int maxSize = 0;\n for (size_t i = 0; i < N; ++i) {\n maxSize = std::max(maxSize, size[find(i)]);\n }\n\n return maxSize;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n std::vector parent(N);\n std::vector size(N);\n for (size_t i = 0; i < N; i++) {\n parent[i] = i;\n size[i] = 1;\n }\n\n auto find = [&](int x) {\n if (parent[x] != x) {\n parent[x] = find(parent[x]);\n }\n return parent[x];\n };\n\n auto union_ = [&](int a, int b) {\n int rootA = find(a);\n int rootB = find(b);\n if (rootA != rootB) {\n if (size[rootA] > size[rootB]) {\n parent[rootB] = rootA;\n size[rootA] += size[rootB];\n } else {\n parent[rootA] = rootB;\n size[rootB] += size[rootA];\n }\n }\n };\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i][j] == 1) {\n union_(i, j);\n }\n }\n }\n\n int maxSize = 0;\n for (size_t i = 0; i < N; i++) {\n maxSize = std::max(maxSize, size[find(i)]);\n }\n\n return maxSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int max_component_size = 0;\n\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n\n visited[i] = true;\n component_size++;\n\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; i++) {\n int size = 0;\n std::queue q;\n if (visited[i] == 0) {\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 0) {\n visited[v] = 1;\n size++;\n for (size_t j = 0; j < N; j++) {\n if (A[v][j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int 
component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int max_component_size = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component containing it\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\n// Perform a DFS to find the size of the component containing vertex i\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n // Mark the vertex as visited\n visited[i] = true;\n component_size++;\n\n // Iterate over all adjacent vertices\n for (size_t j = 0; j < A.size(); j++) {\n // If there is an edge between vertex i and vertex j and vertex j has not been visited, perform a DFS on vertex j\n if (A[i][j] == 1 && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = dfs(i, A, visited);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n\nint dfs(size_t i, std::vector const& A, std::vector& visited) {\n if (visited[i] != 0) {\n return 0;\n }\n visited[i] = 1;\n int component_size = 1;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n component_size += dfs(j, A, visited);\n }\n }\n return component_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int largest = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int size = dfs(A, i, visited);\n largest = std::max(largest, size);\n }\n }\n\n return largest;\n}\n", + "\n\n // Create a vector to store the size of each component.\n std::vector componentSize(N, 0);\n\n // Create a vector to store the component ID for each vertex.\n std::vector componentID(N, -1);\n\n // Initialize the component ID for each vertex.\n for (size_t i = 0; i < N; ++i) {\n componentID[i] = i;\n }\n\n // Iterate over each vertex.\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, perform a DFS to find all vertices in the same component.\n if (componentID[i] == i) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit in the stack, perform a DFS.\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n\n // If the vertex has not been visited, mark it as visited and add it to the stack.\n if (componentID[vertex] == i) {\n componentSize++;\n componentID[vertex] = i;\n\n // Iterate over the neighbors of the vertex.\n for (size_t j = 0; j < N; ++j) {\n // If the neighbor is connected to the vertex and has not been visited, add it to the stack.\n if (A[i][j] == 1 && componentID[j] == i) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the size of the component.\n componentSize[i] 
= componentSize;\n }\n }\n\n // Return the size of the largest component.\n return *std::max_element(componentSize.begin(), componentSize.end());\n}\n", + "\n\n std::vector visited(N, 0);\n int largest = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = 0;\n dfs(A, visited, i, size);\n largest = std::max(largest, size);\n }\n }\n return largest;\n}\n\nvoid dfs(std::vector const& A, std::vector& visited, size_t i, int& size) {\n if (visited[i] == 1) {\n return;\n }\n visited[i] = 1;\n size++;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] == 1) {\n dfs(A, visited, j, size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n component_size++;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n component_size++;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n dfs(A, i, visited, size);\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited,\n int& size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i * A.size() + j] == 1) {\n dfs(A, j, visited, size);\n }\n }\n}\n", + "\n\n // Perform DFS on each vertex to find all connected vertices\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++component_size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) {\n continue;\n }\n visited[u] = 1;\n size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (int i = 0; i < N; i++) {\n int size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 1) continue;\n visited[v] = 1;\n size++;\n for (int j = 0; j < N; j++) {\n if (A[v][j] == 1 && visited[j] == 0) q.push(j);\n }\n }\n max_size = std::max(max_size, size);\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n if (visited[current] == 0) {\n visited[current] = 1;\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1) {\n q.push(j);\n }\n }\n }\n 
}\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n ++size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++component_size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "hip", + "prompt": "/* Compute the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n Store the result in largestComponentSize.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void largestComponent(const int *A, size_t N, int *largestComponentSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n int count = 0;\n if (tid == 0) {\n count = 1;\n for (int i = 1; i < N; i++) {\n if (A[bid * N + i] == 1) {\n count++;\n }\n }\n }\n __syncthreads();\n if (tid == 0) {\n *largestComponentSize = count;\n }\n}\n", + "\n\n\tint tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint local_max = 0;\n\tint global_max = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[tid_y] == 0) {\n\t\tint count = 0;\n\t\tvisited[tid_y] = 1;\n\t\tcount++;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] != 0 && visited[i] == 0) {\n\t\t\t\tcount += dfs(A, N, i, visited);\n\t\t\t}\n\t\t}\n\t\tlocal_max = count;\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (local_max > global_max) {\n\t\t\t\tglobal_max = local_max;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = global_max;\n\t}\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tidy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tif (tidx >= N || tidy >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[tidx] == 0) {\n\t\tcount++;\n\t\tvisited[tidx] = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tidx * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcount += bfs(A, N, i, visited);\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tidx == 0 && tidy == 0) {\n\t\t*largestComponentSize = count;\n\t}\n}\n", + "\n\n\tint tidx = blockIdx.x + blockIdx.y * blockDim.x;\n\tint tidx_x = tidx % N;\n\tint tidx_y = tidx / N;\n\tint componentSize = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tidx * N + i] == 1) {\n\t\t\tcomponentSize++;\n\t\t}\n\t}\n\tatomicAdd(largestComponentSize, componentSize);\n}\n", + "\n\n int localComponentSize = 0;\n int globalComponentSize = 0;\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int start = tid;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (int i = start; i < N; i += stride) {\n if (A[i * N + i] == 0) {\n continue;\n }\n localComponentSize++;\n for (int j = i + 1; j < N; j++) {\n if (A[i * N + j] == 1) {\n localComponentSize++;\n }\n }\n }\n hipAtomicMax(&globalComponentSize, &localComponentSize);\n *largestComponentSize = globalComponentSize;\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tidy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tint size = 0;\n\tif (tidx < N && tidy < N) {\n\t\tif (A[tidx * N + tidy] == 1) {\n\t\t\tsize = 1;\n\t\t\tint stack[N];\n\t\t\tint top = 0;\n\t\t\tstack[top++] = tidx;\n\t\t\twhile (top > 0) {\n\t\t\t\tint v = stack[--top];\n\t\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\t\tif (A[v * N + i] == 1 && i != v) {\n\t\t\t\t\t\tif (i == tidy) {\n\t\t\t\t\t\t\tsize++;\n\t\t\t\t\t\t} else {\n\t\t\t\t\t\t\tstack[top++] = i;\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tint max = 0;\n\thipReduceBlockMax(&max, &size, 1);\n\tif (tidx == 0 && tidy == 0) 
{\n\t\t*largestComponentSize = max;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint blockId = hipBlockIdx_x;\n\tint numBlocks = hipGridDim_x;\n\tint numThreads = hipBlockDim_x * hipGridDim_x;\n\tint visited[N];\n\tint componentSize[N];\n\tint maxComponentSize = 0;\n\tint maxComponentId = -1;\n\tint componentId = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t\tcomponentSize[i] = 0;\n\t}\n\twhile (componentId < N) {\n\t\tint startVertex = -1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (!visited[i]) {\n\t\t\t\tstartVertex = i;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tif (startVertex == -1) {\n\t\t\tbreak;\n\t\t}\n\t\tint queue[N];\n\t\tint head = 0;\n\t\tint tail = 0;\n\t\tqueue[tail++] = startVertex;\n\t\tvisited[startVertex] = 1;\n\t\tcomponentSize[componentId] = 1;\n\t\twhile (head != tail) {\n\t\t\tint vertex = queue[head++];\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tif (A[vertex * N + i] && !visited[i]) {\n\t\t\t\t\tqueue[tail++] = i;\n\t\t\t\t\tvisited[i] = 1;\n\t\t\t\t\tcomponentSize[componentId]++;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tif (componentSize[componentId] > maxComponentSize) {\n\t\t\tmaxComponentSize = componentSize[componentId];\n\t\t\tmaxComponentId = componentId;\n\t\t}\n\t\tcomponentId++;\n\t}\n\t*largestComponentSize = maxComponentSize;\n}\n", + "\n\n int local_max = 0;\n int global_max = 0;\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (A[i * N + j] == 1) {\n local_max = 1;\n for (int k = 0; k < N; k++) {\n if (A[i * N + k] == 1) {\n local_max++;\n }\n if (A[k * N + j] == 1) {\n local_max++;\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx.x == 0 && threadIdx.y == 0) {\n for (int k = 0; k < N * N; k++) {\n if (local_max > global_max) {\n global_max = local_max;\n }\n local_max = 0;\n }\n *largestComponentSize = global_max;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint numThreads = hipGridDim_x * hipBlockDim_x;\n\tint visited[N];\n\tint componentSize = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[tid] == 0) {\n\t\tvisited[tid] = 1;\n\t\tcomponentSize = 1;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tif (A[tid * N + j] == 1 && visited[j] == 0) {\n\t\t\t\tcomponentSize += largestComponent(A, N, &visited[j]);\n\t\t\t}\n\t\t}\n\t}\n\tint maxComponentSize = 0;\n\thipMemcpy(&maxComponentSize, largestComponentSize, sizeof(int), hipMemcpyDeviceToHost);\n\tif (componentSize > maxComponentSize) {\n\t\t*largestComponentSize = componentSize;\n\t}\n\treturn;\n}\n", + "\n\n int *visited = (int *)malloc(N * N * sizeof(int));\n int *componentSize = (int *)malloc(N * N * sizeof(int));\n int maxComponentSize = 0;\n int componentId = 0;\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n visited[i * N + j] = 0;\n componentSize[i * N + j] = 0;\n }\n }\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (visited[i * N + j] == 0 && A[i * N + j] == 1) {\n componentId++;\n int currentComponentSize = 0;\n __syncthreads();\n dfs(A, i, j, N, visited, componentSize, &currentComponentSize, componentId);\n __syncthreads();\n maxComponentSize = max(maxComponentSize, currentComponentSize);\n }\n }\n }\n\n *largestComponentSize = maxComponentSize;\n free(visited);\n free(componentSize);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) 
{\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = max(*largestComponentSize, count);\n\t}\n}\n", + "\n\n int myRow = blockIdx.x * blockDim.x + threadIdx.x;\n int myCol = blockIdx.y * blockDim.y + threadIdx.y;\n int myComponentSize = 0;\n if (myRow < N && myCol < N) {\n if (A[myRow * N + myCol] == 1) {\n myComponentSize = 1;\n for (int i = 0; i < N; i++) {\n if (A[myRow * N + i] == 1) {\n myComponentSize++;\n }\n if (A[i * N + myCol] == 1) {\n myComponentSize++;\n }\n }\n }\n }\n atomicMax(largestComponentSize, myComponentSize);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == 0) {\n *largestComponentSize = 0;\n }\n __shared__ int shared_component_size;\n shared_component_size = 0;\n bool visited = false;\n int component_size = 0;\n while (!visited) {\n visited = true;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (A[bid * N + i] == 1) {\n visited = false;\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n A[i * N + j] = 0;\n A[j * N + i] = 0;\n }\n }\n }\n }\n __syncthreads();\n }\n atomicAdd(largestComponentSize, component_size);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i, j, k, count = 0;\n\tif (tid < N) {\n\t\tbool visited[N] = {false};\n\t\tvisited[tid] = true;\n\t\tfor (i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1 && !visited[i]) {\n\t\t\t\tcount++;\n\t\t\t\tfor (j = 0; j < N; j++) {\n\t\t\t\t\tif (A[i * N + j] == 1) {\n\t\t\t\t\t\tfor (k = 0; k < N; k++) {\n\t\t\t\t\t\t\tif (A[j * N + k] == 1 && !visited[k]) {\n\t\t\t\t\t\t\t\tcount++;\n\t\t\t\t\t\t\t\tvisited[k] = true;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == 0) {\n *largestComponentSize = 0;\n }\n __syncthreads();\n int localComponentSize = 0;\n for (int i = 0; i < N; i++) {\n if (A[tid + i * N] == 1) {\n localComponentSize++;\n }\n }\n __syncthreads();\n if (tid == 0) {\n if (localComponentSize > *largestComponentSize) {\n *largestComponentSize = localComponentSize;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int size = 0;\n if (tid < N && tid_y < N) {\n if (A[tid * N + tid_y] == 1) {\n size = 1;\n int *visited = new int[N];\n for (int i = 0; i < N; i++) {\n visited[i] = 0;\n }\n visited[tid] = 1;\n int *stack = new int[N];\n stack[0] = tid;\n while (stack[0] != -1) {\n int v = stack[0];\n stack[0] = -1;\n for (int i = 0; i < N; i++) {\n if (A[v * N + i] == 1 && visited[i] == 0) {\n visited[i] = 1;\n size++;\n stack[0] = i;\n }\n }\n }\n delete[] visited;\n delete[] stack;\n }\n }\n atomicAdd(largestComponentSize, size);\n}\n", + "\n\n\tint myComponentSize = 0;\n\tint myId = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (A[N * myId + myId] == 1) {\n\t\tmyComponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) 
{\n\t\t\tif (A[N * myId + i] == 1 && i != myId) {\n\t\t\t\tmyComponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(&*largestComponentSize, myComponentSize);\n}\n", + "(", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_x;\n\tint componentSize = 0;\n\tif (tid == 0) {\n\t\tcomponentSize = 1;\n\t\tlargestComponentSize[bid] = 0;\n\t}\n\t__syncthreads();\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tif (A[bid * N + i] == 1) {\n\t\t\tatomicAdd(&componentSize, 1);\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tif (componentSize > largestComponentSize[bid]) {\n\t\t\tlargestComponentSize[bid] = componentSize;\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Perform a breadth-first search (BFS) from each vertex to find the connected components.\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the max_component_size across all ranks to find the global maximum.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "_t.", + "\n\n // Perform a breadth-first search from each vertex to find the size of each component.\n std::vector visited(N, false);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++componentSize;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Allreduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n std::queue q;\n int componentSize = 0;\n for (size_t i = 0; i < N; i++) {\n 
if (visited[i] == 0) {\n q.push(i);\n visited[i] = 1;\n componentSize = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize++;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest one\n int globalComponentSize;\n MPI_Reduce(&componentSize, &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(my_rank);\n visited[my_rank] = true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n local_component_size++;\n for (size_t v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return global_component_size;\n }\n return -1;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector component_size(num_ranks, 0);\n\n // Perform a DFS from each rank\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int local_component_size = 0;\n std::stack s;\n s.push(i);\n visited[i] = true;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n local_component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n visited[j] = true;\n }\n }\n }\n component_size[my_rank] = std::max(component_size[my_rank], local_component_size);\n }\n }\n\n // Reduce the component size array to find the largest component\n int global_component_size = 0;\n MPI_Reduce(&component_size[0], &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_max = 0;\n int global_max = 0;\n\n // Perform a DFS on the graph to find the 
size of the largest component\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n local_max = std::max(local_max, component_size);\n }\n }\n\n // Reduce the local maximum values to find the global maximum\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = true;\n my_component_size = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n ++my_component_size;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&my_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int max_component = 0;\n int my_component = 0;\n int* visited = new int[N];\n for (size_t i = 0; i < N; i++) {\n visited[i] = 0;\n }\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n my_component = 0;\n dfs(i, A, N, visited, my_component);\n max_component = std::max(max_component, my_component);\n }\n }\n delete[] visited;\n int global_max;\n MPI_Allreduce(&max_component, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n // Perform a breadth-first search from each vertex to find the connected components.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 0) {\n visited[v] = 1;\n componentSize++;\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Allreduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(rank);\n visited[rank] = true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n local_component_size++;\n for (int v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 
MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of vertices in its component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n local_component_size = std::max(local_component_size, dfs(A, i, visited));\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n\n// Depth-first search to compute the number of vertices in a component\nint dfs(std::vector const& A, size_t i, std::vector& visited) {\n if (visited[i]) {\n return 0;\n }\n\n visited[i] = true;\n int component_size = 1;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n component_size += dfs(A, j, visited);\n }\n }\n\n return component_size;\n}\n", + "_t.", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n int local_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_component_size = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n ++local_component_size;\n }\n }\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n int local_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_component_size = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n local_component_size++;\n }\n }\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its component\n std::vector visited(N, 0);\n std::queue q;\n int componentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n q.push(i);\n visited[i] = 1;\n componentSize = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize++;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest one\n int globalComponentSize;\n MPI_Reduce(&componentSize, 
&globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, false);\n int largest_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t v = stack.top();\n stack.pop();\n if (!visited[v]) {\n ++component_size;\n visited[v] = true;\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n largest_component_size = std::max(largest_component_size, component_size);\n }\n }\n\n // Reduce the largest component size across all ranks.\n int global_largest_component_size;\n MPI_Allreduce(&largest_component_size, &global_largest_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the largest component size\n int largestComponentSize = 0;\n\n // Loop over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "cuda", + "prompt": "/* Compute the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n Store the result in largestComponentSize.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void largestComponent(const int *A, size_t N, int *largestComponentSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int myComponentSize = 0;\n int myId = blockIdx.x * blockDim.x + threadIdx.x;\n if (A[myId * N + myId] == 0) {\n *largestComponentSize = 0;\n return;\n }\n int *visited = new int[N];\n for (int i = 0; i < N; i++) { visited[i] = 0; }\n visited[myId] = 1;\n myComponentSize++;\n for (int i = 0; i < N; i++) {\n if (A[myId * N + i] == 1 && visited[i] == 0) {\n myComponentSize += dfs(A, N, visited, i);\n }\n }\n *largestComponentSize = max(*largestComponentSize, myComponentSize);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int localComponentSize = 0;\n int globalComponentSize = 0;\n int threadIdx = threadIdx.x + blockIdx.x * blockDim.x;\n int threadIdy = threadIdx.y + blockIdx.y * blockDim.y;\n if (threadIdx < N && threadIdy < N) {\n if (A[threadIdx * N + threadIdy] == 1) {\n localComponentSize++;\n for (int i = 0; i < N; i++) {\n if (A[threadIdx * N + i] == 1) {\n localComponentSize++;\n }\n if (A[i * N + threadIdy] == 1) {\n localComponentSize++;\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx == 0 && threadIdy == 0) {\n globalComponentSize = localComponentSize;\n }\n for (int i = 1; i < N; i++) {\n if (localComponentSize > globalComponentSize) {\n globalComponentSize = localComponentSize;\n }\n localComponentSize = 0;\n if (threadIdx < N && threadIdy + i < N) {\n if (A[threadIdx * N + threadIdy + i] == 1) {\n localComponentSize++;\n for (int j = 0; j < N; j++) {\n if (A[threadIdx * N + j] == 1) {\n localComponentSize++;\n }\n if (A[(threadIdy + i) * N + j] == 1) {\n localComponentSize++;\n }\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx == 0 && threadIdy == 0) {\n if (localComponentSize > globalComponentSize) {\n globalComponentSize = localComponentSize;\n }\n *largestComponentSize = globalComponentSize;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int tid_row = tid / N;\n int tid_col = tid % N;\n int local_max = 0;\n int global_max = 0;\n\n // Count the number of adjacent vertices\n for (int i = 0; i < N; i++) {\n if (A[tid_row * N + i] == 1) {\n local_max++;\n }\n }\n\n // Find the maximum number of adjacent vertices across all threads\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (local_max > global_max) {\n global_max = local_max;\n }\n }\n\n *largestComponentSize = global_max;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tint count = 
0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Each thread computes the size of its component\n int myComponentSize = 0;\n for (int i = 0; i < N; i++) {\n if (A[i * N + blockIdx.x] == 1) {\n myComponentSize++;\n }\n }\n\n // Reduce the component sizes across all threads\n __shared__ int sharedComponentSize[32];\n sharedComponentSize[threadIdx.x] = myComponentSize;\n __syncthreads();\n for (int i = 1; i < 32; i *= 2) {\n if (threadIdx.x % (2 * i) == 0) {\n sharedComponentSize[threadIdx.x] += sharedComponentSize[threadIdx.x + i];\n }\n __syncthreads();\n }\n\n // Store the largest component size in the output variable\n if (threadIdx.x == 0) {\n *largestComponentSize = sharedComponentSize[0];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\t// Initialize the largest component size to 0\n\t*largestComponentSize = 0;\n\n\t// Iterate over each vertex\n\tfor (int i = 0; i < N; i++) {\n\t\t// If the vertex is not visited, perform a depth-first search to find the size of the component\n\t\tif (A[i * N + i] == 0) {\n\t\t\tint componentSize = 0;\n\t\t\tdepthFirstSearch(A, N, i, &componentSize);\n\n\t\t\t// Update the largest component size if necessary\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\n\tint count = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcount = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*largestComponentSize = max(*largestComponentSize, count);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Each thread checks if it is in the largest component.\n int myComponentSize = 0;\n for (int i = blockIdx.x; i < N; i += blockDim.x) {\n for (int j = blockIdx.y; j < N; j += blockDim.y) {\n if (A[i * N + j] == 1) {\n myComponentSize++;\n }\n }\n }\n\n // Each thread writes its component size to shared memory.\n __shared__ int sharedComponentSize[blockDim.x * blockDim.y];\n sharedComponentSize[threadIdx.x * blockDim.y + threadIdx.y] 
= myComponentSize;\n\n // Each thread block finds the maximum component size in shared memory.\n __syncthreads();\n int maxComponentSize = 0;\n for (int i = 0; i < blockDim.x * blockDim.y; i++) {\n maxComponentSize = max(maxComponentSize, sharedComponentSize[i]);\n }\n\n // The thread with the maximum component size writes it to global memory.\n if (threadIdx.x == 0 && threadIdx.y == 0) {\n *largestComponentSize = maxComponentSize;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 0) {\n\t\treturn;\n\t}\n\tint componentSize = 0;\n\tint visited[N];\n\tfor (int k = 0; k < N; k++) {\n\t\tvisited[k] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[i] == 0) {\n\t\tcomponentSize = dfs(A, N, i, visited);\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tif (componentSize > *largestComponentSize) {\n\t\t\t*largestComponentSize = componentSize;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int myComponentSize = 0;\n int myComponentId = threadIdx.x + blockIdx.x * blockDim.x;\n if (myComponentId < N) {\n bool visited[N];\n for (int i = 0; i < N; i++) { visited[i] = false; }\n visited[myComponentId] = true;\n for (int i = 0; i < N; i++) {\n if (A[myComponentId * N + i] && !visited[i]) {\n myComponentSize += 1;\n dfs(A, N, visited, i);\n }\n }\n }\n __syncthreads();\n int maxComponentSize = 0;\n for (int i = 0; i < N; i++) {\n maxComponentSize = max(maxComponentSize, myComponentSize);\n }\n *largestComponentSize = maxComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\n\tint row = tid / N;\n\tint col = tid % N;\n\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\n\tint componentSize = 1;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\n\tvisited[row] = 1;\n\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] == 1 && visited[i] == 0) {\n\t\t\tcomponentSize += dfs(A, N, i, visited);\n\t\t}\n\t}\n\n\tif (componentSize > *largestComponentSize) {\n\t\t*largestComponentSize = componentSize;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint myRow = blockIdx.x * blockDim.x + threadIdx.x;\n\tint myCol = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (myRow < N && myCol < N) {\n\t\tif (A[myRow * N + myCol] == 1) {\n\t\t\tint componentSize = 0;\n\t\t\tint visited[N];\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tvisited[i] = 0;\n\t\t\t}\n\t\t\tdfs(myRow, myCol, A, N, visited, &componentSize);\n\t\t\tif 
(componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint componentSize = 0;\n\tint maxComponentSize = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tfor (int i = 0; i < N; i++) {\n\t\tif (visited[i] == 0) {\n\t\t\tcomponentSize = 0;\n\t\t\tdfs(A, i, visited, &componentSize);\n\t\t\tmaxComponentSize = max(maxComponentSize, componentSize);\n\t\t}\n\t}\n\t*largestComponentSize = maxComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid_row == tid_col) {\n\t\tint componentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_row * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = max(*largestComponentSize, componentSize);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint myComponentSize = 0;\n\tint myId = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (A[N * myId + myId] == 1) {\n\t\tmyComponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[N * myId + i] == 1 && i != myId) {\n\t\t\t\tmyComponentSize += 1;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, myComponentSize);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint threadIdx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint threadIdy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (threadIdx < N && threadIdy < N) {\n\t\tif (A[threadIdx * N + threadIdy] == 1) {\n\t\t\t// Found a 1 in the adjacency matrix.\n\t\t\t// Perform a breadth-first search to find the size of the component.\n\t\t\tint componentSize = 0;\n\t\t\tstd::queue q;\n\t\t\tq.push(threadIdx);\n\t\t\twhile (!q.empty()) {\n\t\t\t\tint v = q.front();\n\t\t\t\tq.pop();\n\t\t\t\tcomponentSize++;\n\t\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\t\tif (A[v * N + i] == 1 && i != v) {\n\t\t\t\t\t\tq.push(i);\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t\t// Update the largest component size if necessary.\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint 
tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, componentSize);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, componentSize);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint localLargestComponentSize = 0;\n\tint globalLargestComponentSize = 0;\n\tint myRow = blockIdx.y * blockDim.y + threadIdx.y;\n\tint myCol = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (myRow < N && myCol < N) {\n\t\tif (A[myRow * N + myCol] == 1) {\n\t\t\tlocalLargestComponentSize++;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (threadIdx.x == 0 && threadIdx.y == 0) {\n\t\tfor (int i = 1; i < gridDim.x * gridDim.y; i++) {\n\t\t\tif (localLargestComponentSize < globalLargestComponentSize) {\n\t\t\t\tlocalLargestComponentSize = globalLargestComponentSize;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = localLargestComponentSize;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\t*largestComponentSize = max(*largestComponentSize, componentSize);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce to find the largest component size\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform DFS on each rank\n std::vector visited(N, false);\n int max_component_size = 0;\n #pragma omp parallel\n {\n int local_max_component_size = 0;\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n std::stack s;\n s.push(i);\n int component_size = 0;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n if (!visited[u]) {\n visited[u] = true;\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n }\n }\n }\n }\n local_max_component_size = std::max(local_max_component_size, component_size);\n }\n }\n #pragma omp critical\n {\n max_component_size = std::max(max_component_size, local_max_component_size);\n }\n }\n\n // Gather the maximum component size from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int recv_max_component_size;\n MPI_Recv(&recv_max_component_size, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n max_component_size = std::max(max_component_size, recv_max_component_size);\n }\n } else {\n MPI_Send(&max_component_size, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n\n return max_component_size;\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int largest_component_size;\n MPI_Reduce(&my_component_size, &largest_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find the connected components.\n std::vector visited(N, 0);\n std::vector component_size(N, 0);\n int max_component_size = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n int component_size = 0;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n component_size[i] = component_size;\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n MPI_Allreduce(&max_component_size, &max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int largest_component_size;\n MPI_Reduce(&my_component_size, &largest_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the size of its own component\n int local_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_size = std::max(local_size, size);\n }\n }\n\n // Reduce the local sizes 
to find the global maximum\n int global_size;\n MPI_Reduce(&local_size, &global_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n // Perform a breadth-first search from each vertex to find all the vertices in the same component.\n int max_component_size = 0;\n #pragma omp parallel for reduction(max : max_component_size)\n for (size_t i = 0; i < N; ++i) {\n if (A[i][i] == 0) {\n continue;\n }\n\n // Initialize the queue with the current vertex.\n std::queue queue;\n queue.push(i);\n\n // Mark the current vertex as visited.\n A[i][i] = 0;\n\n // Perform a breadth-first search to find all the vertices in the same component.\n int component_size = 0;\n while (!queue.empty()) {\n size_t current_vertex = queue.front();\n queue.pop();\n component_size++;\n\n // Add all the unvisited neighbors of the current vertex to the queue.\n for (size_t j = 0; j < N; ++j) {\n if (A[current_vertex][j] == 1 && A[j][j] != 0) {\n queue.push(j);\n A[j][j] = 0;\n }\n }\n }\n\n max_component_size = std::max(max_component_size, component_size);\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n componentSize++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Reduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (rank == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(size, 0);\n\n // Perform a DFS on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int localComponentSize = 0;\n std::stack s;\n s.push(i);\n visited[i] = true;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n localComponentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n visited[j] = true;\n }\n }\n }\n componentSize[rank] = std::max(componentSize[rank], localComponentSize);\n }\n }\n\n // Reduce the component size array to find the largest component\n int globalComponentSize;\n MPI_Reduce(&componentSize[0], &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component 
size on rank 0\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform a breadth-first search (BFS) from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n componentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Reduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (rank == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n // Perform a breadth-first search to find the largest component.\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the max_component_size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the result on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the number of vertices in its largest component\n int local_max = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int count = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n count++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_max = std::max(local_max, count);\n }\n }\n\n // Reduce the local maxima to find the global maximum\n int global_max;\n MPI_Reduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the global maximum on rank 0\n if (my_rank == 0) {\n return global_max;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size\n int largestComponentSize = 0;\n\n // Loop over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n\n // Update the largest component size\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Reduce the 
largest component size to rank 0\n int globalLargestComponentSize;\n MPI_Reduce(&largestComponentSize, &globalLargestComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size\n return globalLargestComponentSize;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(N, 0);\n\n // Initialize the maximum component size\n int maxComponentSize = 0;\n\n // Loop over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited\n if (!visited[i]) {\n // Perform a depth-first search to find the size of the component containing vertex i\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n componentSize++;\n // Push all adjacent vertices onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the maximum component size\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Return the maximum component size\n return maxComponentSize;\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int max_component_size = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int global_component_size;\n MPI_Reduce(&my_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n std::vector componentSize(N, 0);\n int largest = 0;\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n 
q.push(i);\n visited[i] = 1;\n componentSize[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize[i]++;\n }\n }\n }\n largest = std::max(largest, componentSize[i]);\n }\n }\n\n int globalLargest = 0;\n MPI_Reduce(&largest, &globalLargest, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalLargest;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector component_size(num_ranks, 0);\n\n // Perform a parallel depth-first search to find the largest component\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int local_component_size = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n ++local_component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex * N + j] && !visited[j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n component_size[my_rank] = std::max(component_size[my_rank], local_component_size);\n }\n }\n\n // Reduce the component size array to find the largest component\n int global_component_size = 0;\n MPI_Reduce(component_size.data(), &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n\t Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n if (visited[i] == 0) {\n visited[i] = 1;\n Kokkos::View stack(\"stack\", N);\n size_t stack_size = 0;\n stack[stack_size++] = i;\n while (stack_size > 0) {\n size_t j = stack[--stack_size];\n for (size_t k = 0; k < N; k++) {\n if (A(j, k) == 1 && visited[k] == 0) {\n visited[k] = 1;\n stack[stack_size++] = k;\n }\n }\n }\n }\n });\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n count += (visited[i] == 1);\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n componentCount(A, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n dfs(A, visited, j, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::atomic_fetch_add(&visited(j), 1);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t k) {\n if (A(j, k) == 1 && visited(k) == 0) {\n Kokkos::atomic_fetch_add(&visited(k), 1);\n }\n });\n }\n });\n }\n }\n return count;\n}\n", + "\n\n KokConfigService::getSingleton().setNumThreads(N);\n KokConfigService::getSingleton().setParallelMode(KokConfigService::ParallelMode::OMP);\n KokConfigService::getSingleton().setOmpNumThreads(N);\n\n Kok::View counts(N);\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) { counts[i] = i; });\n\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) {\n for (size_t j = 0; j < N; j++) {\n if (A(i, j) == 1 && i != j) { counts[j] = counts[i]; }\n }\n });\n\n Kok::View unique_counts(N);\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) { unique_counts[i] = counts[i]; });\n Kok::sort(unique_counts);\n int num_components = 0;\n for (size_t i = 0; i < N; i++) {\n if (i == 0 || unique_counts[i] != unique_counts[i - 1]) { num_components++; }\n }\n return num_components;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n count++;\n dfs(A, i, visited);\n }\n }\n\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n 
visit(A, i, visited);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited[k] == 0) { visited[k] = 1; }\n });\n visited[j] = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n ++count;\n dfs(A, visited, i, count);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View labels(\"labels\", N);\n int num_components = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (labels(i) == 0) {\n int new_label = ++num_components;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &j) {\n if (A(i, j) == 1 && labels(j) == 0) {\n labels(j) = new_label;\n }\n });\n }\n });\n return num_components;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n dfs(A, visited, j, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n dfs(A, visited, i, N);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited[k] == 0) { visited[k] = 1; }\n });\n visited[j] = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::atomic_fetch_add(&visited(j), 1);\n componentCount(A, j);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) 
{\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n componentCount(A, N, j);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1) visited(k) = 1;\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int nComponents = 0;\n std::vector visited(N, 0);\n\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n nComponents++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t curr = q.front();\n q.pop();\n if (visited[curr] == 0) {\n visited[curr] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[curr][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return nComponents;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) continue;\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n count++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n bool visited = false;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n if (!visited) {\n count++;\n visited = true;\n }\n break;\n }\n }\n }\n return count;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t curr = q.front();\n q.pop();\n if (visited[curr] == 0) {\n visited[curr] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[curr][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false for all nodes\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over 
all nodes\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all nodes in the component\n dfs(A, i, visited);\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int* visited = new int[N];\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n delete[] visited;\n return count;\n}\n", + "\n\n int nComponents = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n nComponents++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n delete[] visited;\n return nComponents;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n count++;\n }\n }\n return count;\n}\n", + "\n\n int num_components = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n visited[current] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[current * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n delete[] visited;\n return num_components;\n}\n", + "\n\n int num_components = 0;\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n bool is_new_component = true;\n for (size_t j = 0; j < i; j++) {\n if (A[i * N + j] == 1) {\n is_new_component = false;\n break;\n }\n }\n if (is_new_component) { num_components++; }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) { q.push(j); }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all the vertices in the 
component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t v = stack.top();\n stack.pop();\n if (!visited[v]) {\n visited[v] = true;\n // Push all the unvisited neighbors of v onto the stack\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n num_components++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n num_components++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false for all nodes\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all nodes\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over all neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the number of connected components\n return componentCount;\n}\n", + "_", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n#pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t cur = q.front();\n q.pop();\n if (visited[cur] == 0) {\n visited[cur] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[cur][j] == 1) { q.push(j); }\n }\n }\n }\n }\n }\n delete[] visited;\n return num_components;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, false);\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n dfs(A, visited, i);\n count++;\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": 
"graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "serial", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(i, A, visited);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(size_t i, std::vector const& A, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(j, A, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, visited, i, N);\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, visited, i);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, std::vector& visited, size_t i) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, visited, j);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + ":", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 
0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(i, A, visited);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(size_t i, std::vector const& A, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(j, A, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, visited, i);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n dfs(A, i, visited);\n count++;\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "hip", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void componentCount(const int *A, size_t N, int *numComponents) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tif (count == 0) {\n\t\tcount = 1;\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n\tint tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\tif (tid >= N || tid_y >= N) {\n\t\treturn;\n\t}\n\tint component_id = tid;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tcomponent_id = min(component_id, i);\n\t\t}\n\t}\n\tatomicAdd(&numComponents[component_id], 1);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tint root = tid;\n\twhile (A[root * N + bid] != root) {\n\t\troot = A[root * N + bid];\n\t}\n\tif (tid == 0) {\n\t\tA[bid * N + bid] = root;\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint numThreads = hipGridDim_x * 
hipBlockDim_x;\n\tint label = tid;\n\tint numLabels = numThreads;\n\tbool changed = true;\n\twhile (changed) {\n\t\tchanged = false;\n\t\tfor (int i = tid; i < N; i += numThreads) {\n\t\t\tfor (int j = 0; j < N; j++) {\n\t\t\t\tif (A[i * N + j] && label > j) {\n\t\t\t\t\tlabel = j;\n\t\t\t\t\tchanged = true;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t\tif (tid == 0) {\n\t\t\tif (label < numLabels) {\n\t\t\t\tnumLabels = label;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n\t*numComponents = numLabels;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] != 0) {\n count++;\n for (int j = i + 1; j < N; j++) {\n if (A[tid * N + j] != 0) {\n count++;\n }\n }\n break;\n }\n }\n atomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint component = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcomponent = 1;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tatomicAdd(numComponents, component);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\tif (A[i * N + j] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N || tid_y >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tbool visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = false;\n\t}\n\tvisited[tid] = true;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1 && !visited[i]) {\n\t\t\tcount++;\n\t\t\t__syncblocks();\n\t\t\tDFS(A, i, visited, N);\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx.x + hipBlockIdx.x * hipBlockDim.x;\n\tint bid = hipBlockIdx.y + hipBlockIdx.z * hipGridDim.z;\n\tint i = tid + bid * hipBlockDim.x * hipGridDim.x;\n\tint count = 0;\n\twhile (i < N * N) {\n\t\tif (A[i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t\ti += N * N;\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint id = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (id >= N) return;\n\tint count = 0;\n\tbool visited[N];\n\tmemset(visited, 0, sizeof(visited));\n\tfor (int i = 0; i < N; i++) {\n\t if (visited[i]) continue;\n\t visited[i] = true;\n\t count++;\n\t for (int j = 0; j < N; j++) {\n\t if (A[i * N + j] == 0) continue;\n\t visited[j] = true;\n\t }\n\t}\n\tnumComponents[id] = count;\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tint count = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + 
hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tint n = hipBlockDim_x * hipGridDim_x;\n\tint count = 0;\n\tif (tid == 0) {\n\t\tcount = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*numComponents = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The component count is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n int global_count = 0;\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n local_count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM);\n if (rank == 0) {\n return global_count;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its own subset of rows.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N];\n std::fill(visited, visited + N, false);\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && !visited[j]) {\n ++localCount;\n std::queue q;\n q.push(j);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n for (size_t v = 0; v < N; ++v) {\n if (A[u * N + v] && !visited[v]) {\n q.push(v);\n }\n }\n }\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int component_count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n 
q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n component_count++;\n }\n }\n\n int global_component_count;\n MPI_Reduce(&component_count, &global_component_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_component_count;\n }\n return 0;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit\n while (!stack.empty()) {\n // Pop the top vertex from the stack\n size_t vertex = stack.top();\n stack.pop();\n\n // Mark the vertex as visited\n visited[vertex] = true;\n\n // Iterate over all adjacent vertices\n for (size_t j = 0; j < N; j++) {\n // If the adjacent vertex is connected and has not been visited, push it onto the stack\n if (A[vertex][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n count++;\n std::queue q;\n q.push(i);\n\n // Perform a breadth-first search to visit all nodes in the component\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n\n // Iterate over the neighbors of u\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all the unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n\n // Each rank computes the number of connected components in its portion of the graph.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n localCount++;\n break;\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n count++;\n while (!q.empty()) {\n int cur = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[cur * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) return global_count;\n return -1;\n}\n", + "\n\n int n_components = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n n_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n\n int global_n_components;\n MPI_Allreduce(&n_components, &global_n_components, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_n_components;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n\n int globalCount;\n MPI_Reduce(&count, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then push the neighbor onto the stack\n if (!visited[j] && A[i][j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Count the number of connected components\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) continue;\n visited[u] = true;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its local rows\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited = false;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] && !visited) {\n localCount++;\n visited = true;\n }\n }\n }\n\n // Reduce the local counts to get the global count\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 
0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t u = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[u] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then push the neighbor onto the stack\n if (!visited[j] && A[i][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n // Iterate over the neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its local part of the graph\n int local_count = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N] = {false};\n for (size_t j = 0; j < N; j++) {\n if (!visited[j] && A[i][j] == 1) {\n // Found a new component\n local_count++;\n std::queue q;\n q.push(j);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n for (size_t v = 0; v < N; v++) {\n if (!visited[v] && A[u][v] == 1) {\n q.push(v);\n }\n }\n }\n }\n }\n }\n\n // Reduce the local counts to get the global count\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // 
Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is connected and has not been visited, then add it to the stack\n if (A[node][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of connected components in its own subset of rows.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N] = {false};\n for (size_t j = 0; j < N; j++) {\n if (!visited[j] && A[i][j] == 1) {\n localCount++;\n dfs(A, visited, j, N);\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "cuda", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void componentCount(const int *A, size_t N, int *numComponents) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tint row = tid / N;\n\tint col = tid % N;\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\tatomicAdd(numComponents, 1);\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] != 0) {\n\t\t\tA[row * N + i] = 0;\n\t\t\tA[i * N + row] = 0;\n\t\t\tcomponentCount<<<1, N>>>(A, N, numComponents);\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = -1;\n\t\t} else {\n\t\t\tnumComponents[tid] = tid;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && numComponents[tid] != -1) {\n\t\t\tnumComponents[k] = numComponents[tid];\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid < N && tid_y < N) {\n\t if (A[tid * N + tid_y] == 1) {\n\t atomicAdd(numComponents, 1);\n\t }\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tint row = tid / N;\n\tint col = tid % N;\n\tif (A[tid] == 0) {\n\t\tnumComponents[tid] = 0;\n\t\treturn;\n\t}\n\tint component = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] == 1) {\n\t\t\tcomponent = i;\n\t\t\tbreak;\n\t\t}\n\t}\n\tnumComponents[tid] = component;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tbool visited = false;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tvisited = true;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tif (visited) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": 
true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\t// Each thread checks if the current element is non-zero.\n\t// If so, it increments the component count.\n\tint row = blockIdx.x;\n\tint col = blockIdx.y;\n\tif (A[row * N + col] != 0) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint bid = blockIdx.y * blockDim.y + blockIdx.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[bid] == 0) {\n\t\tvisited[bid] = 1;\n\t\t(*numComponents)++;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcomponentCount(A, N, numComponents);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = 0;\n\t\t} else {\n\t\t\tnumComponents[tid] = 1;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tint numComponents_local = numComponents[tid];\n\tfor 
(int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && numComponents_local != -1) {\n\t\t\tnumComponents[k] = numComponents_local;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (i == 0) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (numComponents[k] != -1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t*numComponents = count;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint bid = blockIdx.y * blockDim.y + blockIdx.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[bid] == 0) {\n\t\tvisited[bid] = 1;\n\t\t*numComponents += 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcomponentCount(A, N, numComponents);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tidx = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tidy = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tidx >= N || tidy >= N) {\n\t\treturn;\n\t}\n\tif (A[tidx * N + tidy] == 0) {\n\t\treturn;\n\t}\n\tatomicAdd(numComponents, 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[tid] == 0) {\n\t\tvisited[tid] = 1;\n\t\t__syncthreads();\n\t\tint count = 1;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tif (A[tid * N + j] == 1 && visited[j] == 0) {\n\t\t\t\tvisited[j] = 1;\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t\tatomicAdd(numComponents, count);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\tint component = tid;\n\tfor (int i = 0; i < N; i++) 
{\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tcomponent = min(component, i);\n\t\t}\n\t}\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tA[tid_y * N + i] = component;\n\t\t}\n\t}\n\tif (tid == component) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = 0;\n\t\t} else {\n\t\t\tnumComponents[tid] = 1;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tint numComponents_local = numComponents[tid];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1) {\n\t\t\tnumComponents_local = min(numComponents_local, numComponents[k * N + j]);\n\t\t}\n\t}\n\tnumComponents[tid] = numComponents_local;\n\t__syncthreads();\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tcount += (numComponents[k * N + j] == numComponents_local);\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid_row == tid_col) {\n\t\tif (A[tid] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j && A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tidx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tidy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (tidx >= N || tidx >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tidx * N + i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix 
A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The component count is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n local_count++;\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+: count)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top vertex from the stack\n size_t v = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and is connected to the vertex, add it to the stack\n if (!visited[j] && A[i][j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n count++;\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a visited array to keep track of which nodes have been visited\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over all the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then add the neighbor to the stack\n if (!visited[j] && A[node * N + j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the 
component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int count = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (visited[u] == 0) {\n count++;\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) { q.push(j); }\n }\n }\n }\n local_count += count > 0;\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) { return global_count; }\n return -1;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If the neighbor has not been visited and is connected to the node, then visit the neighbor\n if (!visited[j] && A[i][j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n count++;\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n 
// Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Visit all the unvisited neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && !visited[j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n MPI_Reduce(&componentCount, &componentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a DFS to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // Perform a DFS to visit all connected nodes\n while 
(!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack and mark it as visited\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n while (!stack.empty()) {\n // Get the current node\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the current node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and is connected to the current node, then add it to the stack\n if (!visited[j] && A[i][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n MPI_Reduce(&componentCount, nullptr, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return componentCount;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n int global_count = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n int my_start = rank * (N / size);\n int my_end = (rank + 1) * (N / size);\n if (rank == size - 1) {\n my_end = N;\n }\n #pragma omp parallel for reduction(+ : local_count)\n for (int i = my_start; i < my_end; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_count++;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n delete[] visited;\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[j] = true;\n\n // Push all the unvisited neighbors of the node onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[i * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize 
the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all the neighbors of the node onto the stack\n for (size_t j = 0; j < N; ++j) {\n if (A[node * N + j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n // Push all the unvisited neighbors of the vertex onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n 
#pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n count++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; k++) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count across all ranks\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the connected components for a subset of the rows\n int start_row = rank * (N / size);\n int end_row = (rank + 1) * (N / size);\n if (rank == size - 1) {\n end_row = N;\n }\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Count the number of connected components\n int count = 0;\n for (int i = start_row; i < end_row; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n\n // Reduce the count across all ranks\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "&" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A represents a directed graph.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) { count += A(i, j); });\n });\n return count;\n}\n", + "\n\n int edge_count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { edge_count += A(i, j); }\n });\n return edge_count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(\"edgeCount\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) {\n count += A(i, j);\n }\n });\n return count;\n}\n", + "\n\n Kokkos::View edgeCount(\"edgeCount\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n edgeCount(i) = 0;\n for (size_t j = 0; j < N; ++j) { edgeCount(i) += A(i, j); }\n });\n int totalEdgeCount = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &total) { total += edgeCount(i); }, totalEdgeCount);\n return totalEdgeCount;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, int &edgeCount) {\n for (size_t j = 0; j < N; ++j) {\n edgeCount += A(i, j);\n }\n },\n edgeCount);\n return edgeCount;\n}\n", + "\n\n int edge_count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n if (A(i, j) > 0) {\n ++edge_count;\n }\n }\n });\n return edge_count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n 
Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) { count += A(i, j); });\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n count += A(i, j);\n }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA (const int i) {\n for (int j = 0; j < N; j++) {\n if (A(i, j) != 0) {\n count++;\n }\n }\n });\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) { count += A[i * N + j] > 0; }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int edge_count = 0;\n#pragma omp parallel for reduction(+ : edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n edge_count += A[i * N + j];\n }\n }\n return edge_count;\n}\n", + "\n\n int count = 0;\n#pragma opm parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n 
for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i][j] > 0) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "serial", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n 
for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "hip", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n Store the result in numEdges. A represents a directed graph.\n A is an NxN adjacency matrix stored in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void edgeCount(const int *A, size_t N, int *numEdges) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int count = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n hipAtomicAdd(numEdges, count);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[i * N + j];\n }\n }\n __syncthreads();\n atomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint numEdgesLocal = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tnumEdgesLocal += A[tid * N + i];\n\t}\n\thipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[i * N + j] > 0;\n }\n }\n atomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n int tid = threadIdx.x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += (A[tid * N + i] != 0);\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n for (int j = 0; j < N; j++) {\n if (A[tid * N + j] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[tid * N + j];\n }\n hipAtomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n }\n}\n", + "\n\n int tid = 
hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int row = tid;\n int col = tid;\n int count = 0;\n while (row < N) {\n count += A[row * N + col];\n row += hipBlockDim_x;\n col += hipBlockDim_x;\n }\n hipAtomicAdd(numEdges, count);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) 
{\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + ", \u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2 Saxe\u51a2 Saxe\u51a2\u51a2\u51a2 Saxe\u51a2\u51a2\u51a2\u51a2", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] != 0) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + 
"\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "cuda", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n Store the result in numEdges. A represents a directed graph.\n A is an NxN adjacency matrix stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void edgeCount(const int *A, size_t N, int *numEdges) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026374354, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0044987903595 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002736829, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00366309118271 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int row = blockIdx.x;\n int count = 0;\n for (int col = 0; col < N; col++) {\n count += A[row * N + col];\n }\n atomicAdd(numEdges, count);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int rowStart = blockIdx.x * blockDim.x + threadIdx.x;\n if (rowStart >= N) {\n return;\n }\n int rowEnd = rowStart + blockDim.x;\n if (rowEnd > N) {\n rowEnd = N;\n }\n int count = 0;\n for (int i = rowStart; i < rowEnd; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] != 0) {\n count++;\n }\n }\n }\n atomicAdd(numEdges, count);\n}\n", + "source_write_success": true, + 
"did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0036679224, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031460416317 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int j = 0; j < N; j++) {\n if (A[tid * N + j] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.003527808, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00350768642426 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0028077766, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00349440641403 + } + ] + }, + { + "generated_output": "\nint rowStart = blockIdx.x * blockDim.x + threadIdx.x;\nif (rowStart >= N) {\n return;\n}\nint rowEnd = rowStart + blockDim.x;\nif (rowEnd > N) {\n rowEnd = N;\n}\nint count = 0;\nfor (int i = rowStart; i < rowEnd; ++i) {\n for (int j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n}\natomicAdd(numEdges, count);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; ++col) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026247574, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00415569915771 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004042917, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00437429113388 + } + ] + }, + { + "generated_output": "\n\n int row = 
blockIdx.x;\n int col = blockIdx.y;\n if (row < N && col < N) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] == 1) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0028127744, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0044057343483 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027693102, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00448926725388 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002659292, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00387771520615 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row >= N) {\n return;\n }\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0038280332, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00450302085876 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; ++col) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0032884584, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00444895362854 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027933596, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 
0.003374796772 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002785122, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00449614715576 + } + ] + }, + { + "generated_output": "\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.003394849, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00351098246574 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0034532274, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00339795198441 + } + ] + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma parallel for reduction (+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Allgather(&local_count, 1, MPI_INT, &global_count, 1, MPI_INT, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n int nthreads;\n omp_set_num_threads(nthreads);\n int local_count = 0;\n int global_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (myrank == 0) {\n return global_count;\n } else {\n return 0;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n 
local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n int global_count;\n\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "\n\n int local_edge_count = 0;\n int global_edge_count;\n #pragma omp parallel for reduction(+: local_edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_edge_count += A[i * N + j];\n }\n }\n MPI_Reduce(&local_edge_count, &global_edge_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_edge_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_edge_count = 0;\n #pragma omp parallel for reduction(+: local_edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_edge_count += A[i * N + j];\n }\n }\n int global_edge_count;\n MPI_Reduce(&local_edge_count, &global_edge_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_edge_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n 
int global_count = 0;\n #pragma omp parallel for reduction(+: local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max = 0;\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; ++i) {\n // Iterate over the columns of the adjacency matrix\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between node i and node j, increment the degree of node i\n if (A(i, j) == 1) {\n ++max;\n }\n }\n }\n // Return the maximum degree\n return max;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n Kokkos::View maxDegree(\"maxDegree\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n maxDegree(i) = degree;\n });\n return *Kokkos::max_element(maxDegree);\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &max_degree) {\n int degree = 0;\n for (int j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n max_degree 
= Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A(i, j);\n }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &maxDegree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n maxDegree = Kokkos::max(maxDegree, degree);\n });\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::View;\n\n int max_degree = 0;\n Kokkos::parallel_reduce(\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_degree) {\n int degree = 0;\n for (int j = 0; j < N; j++) {\n degree += A(i, j);\n }\n max_degree = std::max(max_degree, degree);\n },\n max_degree);\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n if (degree > max_degree) { max_degree = degree; }\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n Kokkos::View maxDegree(\"maxDegree\", N);\n Kokkos::parallel_for(\"maxDegree\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A(i, j);\n }\n maxDegree(i) = degree;\n });\n int max = 0;\n for (size_t i = 0; i < N; i++) {\n if (maxDegree(i) > max) {\n max = maxDegree(i);\n }\n }\n return max;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "_t>", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n 
max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "serial", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_deg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n max_deg = std::max(max_deg, deg);\n }\n return max_deg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) { deg += A[i * N + j]; }\n maxDeg = std::max(maxDeg, deg);\n }\n return maxDeg;\n}\n", + "\n\n int max_deg = 0;\n for (int i = 0; i < N; i++) {\n int deg = 0;\n for (int j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n max_deg = std::max(max_deg, deg);\n }\n return max_deg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i][j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int 
max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) { deg += A[i * N + j]; }\n maxDeg = std::max(maxDeg, deg);\n }\n return maxDeg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "hip", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Store the result in maxDegree.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void maxDegree(const int *A, size_t N, int *maxDegree) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n localMax = max(localMax, *maxDegree);\n if (threadIdx.x == 0) {\n *maxDegree = localMax;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n atomicMax(&(*maxDegree), max);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n hipAtomicMax(&(*maxDegree), degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n hipAtomicMax(maxDegree, degree);\n}\n", + "\n\n int myDegree = 0;\n for (int i = blockIdx.x * blockDim.x + 
threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; ++i) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint row = bid * hipBlockDim_x + tid;\n\tint degree = 0;\n\tif (row < N) {\n\t\tfor (int col = 0; col < N; col++) {\n\t\t\tdegree += A[row * N + col];\n\t\t}\n\t}\n\thipAtomicMax(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n *maxDegree = hipAtomicMax(*maxDegree, degree);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n }\n int max_degree;\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "\n\n int my_max = 0;\n for (size_t i = 0; i < N; i++) {\n int row_sum = 0;\n for (size_t j = 0; j < N; j++) {\n row_sum += A[i * N + j];\n }\n my_max = std::max(my_max, row_sum);\n }\n\n int global_max;\n MPI_Allreduce(&my_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int rowSum = 0;\n for (size_t j = 0; j < N; j++) { rowSum += A[i * N + j]; }\n myDegree = std::max(myDegree, rowSum);\n }\n\n int globalDegree;\n MPI_Allreduce(&myDegree, &globalDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] != 0) {\n myDegree++;\n }\n }\n }\n int maxDegree = 0;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n maxDeg = std::max(maxDeg, deg);\n }\n int globalMaxDeg;\n MPI_Allreduce(&maxDeg, &globalMaxDeg, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDeg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n myDegree++;\n }\n }\n }\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int local_max = 0;\n for (size_t i = 0; i < N; i++) {\n int local_degree = 0;\n for (size_t j = 0; j < N; j++) {\n local_degree += A[i * N + j];\n }\n local_max = std::max(local_max, local_degree);\n }\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n if (degree > maxDegree) {\n maxDegree = degree;\n 
}\n }\n return maxDegree;\n}\n", + "\n\n int local_max = 0;\n for (size_t i = 0; i < N; i++) {\n int row_sum = 0;\n for (size_t j = 0; j < N; j++) {\n row_sum += A[i * N + j];\n }\n local_max = std::max(local_max, row_sum);\n }\n\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int my_max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n my_max_degree = std::max(my_max_degree, degree);\n }\n\n int max_degree;\n MPI_Allreduce(&my_max_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "\n\n int my_degree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n\n int max_degree;\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "Degree)", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n myDegree += A[i * N + j];\n }\n }\n\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n\n int globalMaxDegree;\n MPI_Allreduce(&myDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n myDegree += A[i * N + j];\n }\n }\n\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n degree++;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, °ree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n }\n return maxDegree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "cuda", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph. \n Store the result in maxDegree.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void maxDegree(const int *A, size_t N, int *maxDegree) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n if (localMax > *maxDegree) {\n *maxDegree = localMax;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n if (tid == 0) {\n *maxDegree = max;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + 
"best_sequential_runtime": 0.004138028, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00324414720535 + } + ] + }, + { + "generated_output": "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, myDegree);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, myDegree);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n maxDegree[tid] = max;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = max(*maxDegree, localMax);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint max = 0;\n\tif (tid < N) {\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tmax += A[tid * N + i];\n\t\t}\n\t}\n\tmaxDegree[tid] = max;\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tint max_degree = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tmax_degree = max(max_degree, maxDegree[i]);\n\t\t}\n\t\t*maxDegree = max_degree;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, myDegree);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = myDegree;\n for (size_t i = 1; i < gridDim.x; i++) {\n int otherDegree;\n cudaMemcpy(&otherDegree, &A[i * blockDim.x * N], sizeof(int), cudaMemcpyDeviceToHost);\n *maxDegree = max(*maxDegree, otherDegree);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = max(*maxDegree, myDegree);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "Degree,", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(&*maxDegree, myDegree);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026177582, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00381198716164 + } + ] + }, + { + "generated_output": "\n\n\tint myMaxDegree = 0;\n\tint rowStart = blockIdx.x * blockDim.x + threadIdx.x;\n\tfor (int i = rowStart; i < N; i += blockDim.x * gridDim.x) {\n\t\tint degree = 0;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tdegree += A[i * N + 
j];\n\t\t}\n\t\tmyMaxDegree = max(myMaxDegree, degree);\n\t}\n\tatomicAdd(maxDegree, myMaxDegree);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n myDegree++;\n }\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0025754822, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00386483845711 + } + ] + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n int my_degree = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk_size = N / nprocs;\n int start_row = rank * chunk_size;\n int end_row = start_row + chunk_size;\n if (rank == nprocs - 1) {\n end_row = N;\n }\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_degree;\n }\n return -1;\n}\n", + "\n\n int max_degree = 0;\n int my_max_degree = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel for reduction(max : my_max_degree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_max_degree += A[i * N + j];\n }\n }\n\n MPI_Allreduce(&my_max_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return max_degree;\n } else {\n return -1;\n }\n}\n", + "\n\n int maxDegree = 0;\n int localMaxDegree = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel for reduction(max : localMaxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n localMaxDegree += A[i * N + j];\n }\n }\n\n MPI_Allreduce(&localMaxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int 
local_max_degree = 0;\n int global_max_degree = 0;\n\n // Compute the local maximum degree\n #pragma omp parallel for reduction(max : local_max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n local_max_degree = std::max(local_max_degree, degree);\n }\n\n // Reduce the local maximum degrees to find the global maximum degree\n MPI_Allreduce(&local_max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max_degree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n\n #pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, my_degree);\n my_degree = 0;\n }\n\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int localMaxDegree = 0;\n #pragma omp parallel for reduction(max:localMaxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n localMaxDegree = std::max(localMaxDegree, degree);\n }\n MPI_Reduce(&localMaxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int myDegree = 0;\n #pragma omp parallel for reduction(max:myDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n myDegree = std::max(myDegree, degree);\n }\n\n int globalDegree;\n MPI_Reduce(&myDegree, &globalDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return globalDegree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n #pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int local_max = 0;\n for (int i = 0; i < N; i++) {\n int local_count = 0;\n for (int j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n local_max = std::max(local_max, local_count);\n }\n\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int 
maxDegree = 0;\n #pragma omp parallel for reduction(max: maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n #pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, my_degree);\n my_degree = 0;\n }\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n MPI_Reduce(&maxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n int max_degree = 0;\n #pragma opm parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int num_threads;\n omp_set_num_threads(4);\n #pragma omp parallel\n {\n #pragma omp for reduction(+:my_degree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n }\n MPI_Reduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(Kokkos::View &A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n visited[i] = 0;\n });\n\n dist[source] = 0;\n visited[source] = 1;\n\n int minDist = INT_MAX;\n\n while (visited[dest] == 0) {\n int minDistNode = -1;\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0 && (minDistNode == -1 || dist[i] < dist[minDistNode])) {\n minDistNode = i;\n }\n }\n\n if (minDistNode == -1) {\n break;\n }\n\n visited[minDistNode] = 1;\n\n for (int i = 0; i < N; i++) {\n if (A(minDistNode, i) == 1 && visited[i] == 0) {\n if (dist[i] == -1 || dist[i] > dist[minDistNode] + 1) {\n dist[i] = dist[minDistNode] + 1;\n }\n }\n }\n }\n\n if (dist[dest] == -1) {\n return -1;\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n Kokkos::View visited(\"visited\", N);\n Kokkos::parallel_for(\"bfs\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n visited[i] = false;\n });\n visited[source] = true;\n Kokkos::View queue(\"queue\", N);\n int head = 0;\n int tail = 0;\n queue[tail++] = source;\n while (head < tail) {\n int u = queue[head++];\n for (int v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n queue[tail++] = v;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { distances[i] = std::numeric_limits::max(); });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int current = q.front();\n q.pop();\n\n // Iterate over all neighbors of the current node\n for (size_t i = 0; i < N; i++) {\n if (A(current, i) == 1 && distances[i] == std::numeric_limits::max()) {\n distances[i] = distances[current] + 1;\n q.push(i);\n }\n }\n }\n\n // Return the distance to the destination\n return distances[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n dist() = -1;\n dist(source) = 0;\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n visited(source) = true;\n Kokkos::View queue(\"queue\", N);\n queue() = -1;\n int head = 0;\n int tail = 0;\n queue(tail++) = source;\n while (head != tail) {\n int u = queue(head++);\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited(v)) {\n dist(v) = dist(u) + 1;\n visited(v) = true;\n queue(tail++) = v;\n }\n }\n }\n return dist(dest);\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) { distances[i] = INT_MAX; });\n\n // Set the 
distance of the source to 0\n distances[source] = 0;\n\n // Queue of nodes to visit\n std::queue q;\n q.push(source);\n\n // While there are nodes to visit\n while (!q.empty()) {\n // Get the current node\n int current = q.front();\n q.pop();\n\n // If the current node is the destination, return the distance\n if (current == dest) {\n return distances[current];\n }\n\n // Visit all neighbors of the current node\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A(current, i) == 1) {\n // If the distance to the neighbor is greater than the distance to the current node plus 1\n if (distances[i] > distances[current] + 1) {\n // Update the distance to the neighbor\n distances[i] = distances[current] + 1;\n // Add the neighbor to the queue\n q.push(i);\n }\n }\n }\n }\n\n // If there is no path from the source to the destination, return -1\n return -1;\n}\n", + "\n\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = Kokkos::TeamMember;\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::View distance(\"distance\", N);\n Kokkos::View queue(\"queue\", N);\n\n visited[source] = 1;\n distance[source] = 0;\n queue[0] = source;\n\n int head = 0;\n int tail = 1;\n\n while (head < tail) {\n int u = queue[head++];\n for (int v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n visited[v] = 1;\n distance[v] = distance[u] + 1;\n queue[tail++] = v;\n }\n }\n }\n\n return distance[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) { distances[i] = std::numeric_limits::max(); });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Initialize the queue with the source\n std::queue q;\n q.push(source);\n\n // While the queue is not empty\n while (!q.empty()) {\n // Get the current vertex\n int current = q.front();\n q.pop();\n\n // If the current vertex is the destination, return the distance\n if (current == dest) {\n return distances[current];\n }\n\n // For each neighbor of the current vertex\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current vertex and the neighbor\n if (A(current, i) == 1) {\n // If the distance to the neighbor is greater than the distance to the current vertex plus 1\n if (distances[i] > distances[current] + 1) {\n // Update the distance to the neighbor\n distances[i] = distances[current] + 1;\n\n // Add the neighbor to the queue\n q.push(i);\n }\n }\n }\n }\n\n // If the destination is not reachable, return -1\n return -1;\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n size_t head = 0;\n size_t tail = 0;\n queue[tail++] = source;\n\n while (head != tail) {\n int u = queue[head++];\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n queue[tail++] = v;\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist(i) = -1;\n visited(i) = false;\n queue(i) = false;\n });\n\n dist(source) = 0;\n queue(source) = true;\n\n while (true) {\n int min_dist = -1;\n int min_idx = -1;\n for (int i = 0; i < N; i++) {\n if (queue(i) && 
(min_dist == -1 || dist(i) < min_dist)) {\n min_dist = dist(i);\n min_idx = i;\n }\n }\n\n if (min_idx == -1) {\n break;\n }\n\n queue(min_idx) = false;\n visited(min_idx) = true;\n\n for (int i = 0; i < N; i++) {\n if (A(min_idx, i) > 0 && !visited(i)) {\n if (dist(i) == -1 || dist(i) > dist(min_idx) + 1) {\n dist(i) = dist(min_idx) + 1;\n queue(i) = true;\n }\n }\n }\n }\n\n return dist(dest);\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t i = 0; i < N; i++) {\n if (A(u, i) && dist[i] == -1) {\n dist[i] = dist[u] + 1;\n q.push(i);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View distance = Kokkos::View(\"distance\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) { distance[i] = -1; });\n distance[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && distance[v] == -1) {\n distance[v] = distance[u] + 1;\n q.push(v);\n }\n }\n }\n return distance[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = INT_MAX;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n q.push(v);\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n Kokkos::View queue_head(\"queue_head\", 1);\n Kokkos::View queue_tail(\"queue_tail\", 1);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n queue_head[0] = 0;\n queue_tail[0] = 1;\n queue[0] = source;\n\n while (queue_head[0] < queue_tail[0]) {\n int u = queue[queue_head[0]];\n queue_head[0]++;\n for (int v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n visited[v] = true;\n dist[v] = dist[u] + 1;\n queue[queue_tail[0]] = v;\n queue_tail[0]++;\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n dist() = -1;\n dist(source) = 0;\n\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n visited(source) = true;\n\n Kokkos::View queue(\"queue\", N);\n queue() = -1;\n int head = 0;\n int tail = 0;\n queue(tail++) = 
source;\n\n while (head != tail) {\n int u = queue(head++);\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && !visited(v)) {\n dist(v) = dist(u) + 1;\n visited(v) = true;\n queue(tail++) = v;\n }\n }\n }\n\n return dist(dest);\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n distances() = std::numeric_limits::max();\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n distances(source) = 0;\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t i = 0; i < N; i++) {\n if (A(u, i) == 1 && distances(i) == std::numeric_limits::max()) {\n q.push(i);\n distances(i) = distances(u) + 1;\n }\n }\n }\n\n // Return distance to destination\n return distances(dest);\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n distances() = std::numeric_limits::max();\n\n // Initialize queue with source node\n std::queue queue;\n queue.push(source);\n distances(source) = 0;\n\n // Perform BFS\n while (!queue.empty()) {\n int current = queue.front();\n queue.pop();\n\n // Iterate over neighbors of current node\n for (size_t i = 0; i < N; i++) {\n if (A(current, i) == 1 && distances(i) == std::numeric_limits::max()) {\n distances(i) = distances(current) + 1;\n queue.push(i);\n }\n }\n }\n\n // Return distance to destination\n return distances(dest);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector dist(N, -1);\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[u] + 1 < dist[v]) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n 
// Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] > dist[u] + 1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize the distance to all nodes to infinity\n std::vector distance(N, INT_MAX);\n\n // Set the distance to the source node to 0\n distance[source] = 0;\n\n // Create a queue to store the nodes to be visited\n std::queue queue;\n\n // Enqueue the source node\n queue.push(source);\n\n // While there are still nodes to be visited\n while (!queue.empty()) {\n // Dequeue the next node\n int current = queue.front();\n queue.pop();\n\n // Visit all the neighbors of the current node\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A[current][i] > 0) {\n // If the distance to the neighbor is greater than the distance to the current node plus the weight of the edge\n if (distance[i] > distance[current] + A[current][i]) {\n // Update the distance to the neighbor\n distance[i] = distance[current] + A[current][i];\n\n // Enqueue the neighbor\n queue.push(i);\n }\n }\n }\n }\n\n // Return the distance to the destination node\n return distance[dest];\n}\n", + "\n\n int dist[N];\n std::fill(dist, 
dist + N, -1);\n dist[source] = 0;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n if (u == dest) {\n return dist[u];\n }\n\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int min_dist = INT_MAX;\n std::vector dist(N, INT_MAX);\n std::vector visited(N, false);\n\n dist[source] = 0;\n visited[source] = true;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n if (A[source][i] == 1) {\n dist[i] = 1;\n visited[i] = true;\n }\n }\n\n #pragma omp for\n for (size_t k = 0; k < N; k++) {\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i][k] == 1 && A[k][j] == 1 && dist[i] != INT_MAX && dist[j] > dist[i] + 1) {\n dist[j] = dist[i] + 1;\n visited[j] = true;\n }\n }\n }\n }\n }\n\n min_dist = dist[dest];\n return min_dist == INT_MAX ? -1 : min_dist;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "serial", + "prompt": "/* 
Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, 0);\n std::queue> q;\n q.push({source, 0});\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n if (visited[node] == 1) {\n continue;\n }\n visited[node] = 1;\n for (size_t i = 0; i < N; i++) {\n if (A[node][i] == 1) {\n q.push({i, dist + 1});\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n visited[source] = true;\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n for (int i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n visited[i] = true;\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n visited[source] = 
true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n dist[v] = dist[u] + 1;\n visited[v] = true;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) return dist;\n if (visited[node]) continue;\n visited[node] = true;\n for (size_t i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n if (v == dest) {\n return dist[v];\n }\n }\n }\n }\n return -1;\n}\n", + "=", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n q.push(v);\n dist[v] = dist[u] + 1;\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == -1) {\n q.push(v);\n dist[v] = dist[u] + 1;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push(std::make_pair(source, 0));\n while (!q.empty()) {\n std::pair p = q.front();\n q.pop();\n if (p.first == dest) {\n return p.second;\n }\n if (visited[p.first]) {\n continue;\n }\n visited[p.first] = true;\n for (size_t i = 0; i < N; ++i) {\n if (A[p.first][i] == 1) {\n q.push(std::make_pair(i, p.second + 1));\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n visited[source] = true;\n\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n for (int i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n visited[i] = true;\n }\n }\n }\n\n return -1;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "hip", + "prompt": "/* Compute the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Store the result in pathLength.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\n__global__ void shortestPathLength(const int *A, size_t N, int source, int dest, int *pathLength) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\n if (tid == source) {\n pathLength[bid] = 0;\n } else {\n pathLength[bid] = -1;\n }\n\n __syncthreads();\n\n for (int i = 0; i < N; i++) {\n if (pathLength[bid] >= 0 && A[tid * N + i] == 1) {\n if (i == dest) {\n pathLength[bid] = pathLength[bid] + 1;\n break;\n } else if (pathLength[i * N + dest] == -1 ||\n pathLength[i * N + dest] > pathLength[bid] + 1) {\n pathLength[i * N + dest] = pathLength[bid] + 1;\n }\n }\n }\n}\n", + "\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint distance = INT_MAX;\n\tif (tid == source) {\n\t\tdistance = 0;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[tid * N + k] == 1) {\n\t\t\tif (distance + 1 < pathLength[k]) {\n\t\t\t\tpathLength[k] = distance + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n\tif (tid == dest) {\n\t\t*pathLength = distance;\n\t}\n}\n", + "\n\n int n = hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x;\n int m = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x;\n if (n == source) {\n pathLength[n] = 0;\n __syncthreads();\n for (int i = 0; i < N; ++i) {\n if (A[n * N + i] == 1) {\n pathLength[i] = 1;\n }\n }\n } else {\n __syncthreads();\n for (int i = 0; i < N; ++i) {\n if (A[n * N + i] == 1) {\n if (pathLength[i] != -1) {\n pathLength[n] = pathLength[i] + 1;\n }\n }\n }\n }\n __syncthreads();\n if (n == dest) {\n *pathLength = pathLength[n];\n }\n}\n", + "\n\n int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int k = 0; k < N; k++) {\n if (A[tid * N + k] == 1 && tid != k) {\n if (pathLength[tid] == -1 || pathLength[k] + 1 < pathLength[tid]) {\n pathLength[tid] = pathLength[k] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n if (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n pathLength[i] = pathLength[tid] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n if (pathLength[i] == -1 || pathLength[i] > 
pathLength[tid] + 1) {\n pathLength[i] = pathLength[tid] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == source) {\n pathLength[bid] = 0;\n } else {\n pathLength[bid] = -1;\n }\n __syncthreads();\n int minPath = pathLength[bid];\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] && pathLength[i] != -1) {\n minPath = min(minPath, pathLength[i] + 1);\n }\n }\n pathLength[bid] = minPath;\n __syncthreads();\n if (tid == dest) {\n *pathLength = minPath;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tif (tid == source && bid == 0) {\n\t\t*pathLength = 0;\n\t} else if (tid == dest && bid == N - 1) {\n\t\t*pathLength = -1;\n\t} else {\n\t\t*pathLength = INT_MAX;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tif (bid + 1 < N) {\n\t\t\t\tint next = i + (bid + 1) * N;\n\t\t\t\tatomicMin(&pathLength[next], *pathLength + 1);\n\t\t\t}\n\t\t\tif (bid - 1 >= 0) {\n\t\t\t\tint next = i + (bid - 1) * N;\n\t\t\t\tatomicMin(&pathLength[next], *pathLength + 1);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] == 1 && tid != i) {\n if (pathLength[tid] == -1) {\n pathLength[tid] = pathLength[i] + 1;\n } else {\n pathLength[tid] = min(pathLength[tid], pathLength[i] + 1);\n }\n }\n }\n __syncthreads();\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int i = tid / N;\n int j = tid % N;\n if (i == source) {\n pathLength[j] = 1;\n } else {\n pathLength[j] = -1;\n }\n __syncthreads();\n for (int k = 0; k < N; ++k) {\n if (pathLength[k] == -1) {\n continue;\n }\n if (A[i * N + k] == 1) {\n if (pathLength[j] == -1 || pathLength[j] > pathLength[k] + 1) {\n pathLength[j] = pathLength[k] + 1;\n }\n }\n }\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tridy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tint i = tidx;\n\tint j = tridy;\n\tif (i == source) {\n\t\tpathLength[i] = 0;\n\t} else {\n\t\tpathLength[i] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (tid == 
source && bid == dest) {\n *pathLength = 0;\n } else if (tid == dest && bid == source) {\n *pathLength = 0;\n } else if (A[bid * N + tid] == 1) {\n *pathLength = 1;\n } else {\n *pathLength = -1;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int distance = INT_MAX;\n if (tid == source) {\n distance = 0;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] && distance + 1 < pathLength[i]) {\n pathLength[i] = distance + 1;\n }\n }\n __syncthreads();\n if (tid == dest) {\n *pathLength = distance;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int distance = INT_MAX;\n if (tid == source) {\n distance = 0;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] == 1 && tid != i) {\n if (distance + 1 < distance) {\n distance = distance + 1;\n }\n }\n __syncthreads();\n }\n if (tid == dest) {\n *pathLength = distance;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "= Synagoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesogues Synag Synag Synag Synagoguesoguesoguesoguesogues", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The shortest path length is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform relaxation\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector gathered_dist(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, gathered_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, gathered_dist[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return shortest path length on rank 0\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 
1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length from source to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distance to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // If we have not reached the destination, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // If u is the destination, return the distance\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // If no path exists, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int dist[N];\n for (int i = 0; i < N; i++) {\n dist[i] = INT_MAX;\n }\n dist[source] = 0;\n\n int next[N];\n for (int i = 0; i < N; i++) {\n next[i] = -1;\n }\n\n int visited[N];\n for (int i = 0; i < N; i++) {\n visited[i] = false;\n }\n\n int u;\n while (true) {\n u = -1;\n for (int i = 0; i < N; i++) {\n if (!visited[i] && (u == -1 || dist[i] < dist[u])) {\n u = i;\n }\n }\n if (u == -1 || dist[u] == INT_MAX) {\n break;\n }\n visited[u] = true;\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] != 0 && dist[u] + 1 < dist[v]) {\n dist[v] = dist[u] + 1;\n next[v] = u;\n }\n }\n }\n\n if (rank == 0) {\n return dist[dest];\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n 
while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == std::numeric_limits::max()) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * num_ranks);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (my_rank == 0) {\n for (int i = 0; i < N * num_ranks; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector dist(N, -1);\n dist[source] = 0;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int curr = q.front();\n q.pop();\n\n if (curr == dest) {\n return dist[curr];\n }\n\n for (int i = 0; i < N; i++) {\n if (A[curr][i] == 1 && dist[i] == -1) {\n dist[i] = dist[curr] + 1;\n q.push(i);\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) 
{\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return shortest path length if destination is reachable\n if (dist[dest] != INT_MAX) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Iterate over all vertices\n for (int i = 0; i < N; ++i) {\n // Each rank computes the shortest path from source to all other vertices\n for (int j = 0; j < N; ++j) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n\n // Gather the distances from all ranks\n std::vector all_dist(N * num_ranks);\n MPI_Allgather(dist.data(), N, MPI_INT, all_dist.data(), N, MPI_INT, MPI_COMM_WORLD);\n\n // Update the distances on each rank\n for (int j = 0; j < N; ++j) {\n for (int k = 0; k < num_ranks; ++k) {\n if (all_dist[k * N + j] < dist[j]) {\n dist[j] = all_dist[k * N + j];\n }\n }\n }\n }\n\n // Return the shortest path length on rank 0\n if (my_rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == std::numeric_limits::max()) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "cuda", + "prompt": "/* Compute the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Store the result in pathLength.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\n__global__ void shortestPathLength(const int *A, size_t N, int source, int dest, int *pathLength) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\n\tint count = 0;\n\twhile (pathLength[dest] == -1) {\n\t\t__syncthreads();\n\t\tif (pathLength[tid] == count) {\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tif (A[tid * N + i] && pathLength[i] == -1) {\n\t\t\t\t\tpathLength[i] = count + 1;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tcount++;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tif (pathLength[j] == -1) {\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] && pathLength[k] != -1) {\n\t\t\t\tpathLength[j] = pathLength[k] + 1;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "_t = 0, 1, 2, 3", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\n\tif (i == source && j == dest) {\n\t\t*pathLength = 0;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 1;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": 
"\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = (A[i * N + j] == 1) ? 1 : INT_MAX;\n\t}\n\telse {\n\t\tpathLength[j] = INT_MAX;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t}\n\telse if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t}\n\telse {\n\t\t*pathLength = -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] != 0) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid_row == source) {\n\t\tpathLength[tid_col] = 1;\n\t} else {\n\t\tpathLength[tid_col] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid_row] > 0 && A[tid_row * N + i] > 0) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid_row] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint 
tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_row * N + i] == 1 && tid_row != tid_col) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid_row] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1) {\n\t\t\tif (A[tid * N + i] == 1 && pathLength[i] == -1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = (A[i * N + j] == 1) ? 
1 : INT_MAX;\n\t} else {\n\t\tpathLength[j] = INT_MAX;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] != INT_MAX && A[k * N + j] == 1) {\n\t\t\tpathLength[j] = min(pathLength[j], pathLength[k] + 1);\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\n\t__syncthreads();\n\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_row * N + i] == 1 && pathLength[tid_row] != -1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid_row] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\n\t__syncthreads();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint distance = 0;\n\tif (tid == source) {\n\t\tdistance = 0;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tif (distance + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = distance + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == dest) {\n\t\t*pathLength = distance;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] == -1) {\n\t\t\tcontinue;\n\t\t}\n\t\tif (A[k * N + j] == 1) {\n\t\t\tif (pathLength[j] == -1 || pathLength[j] > pathLength[k] + 1) {\n\t\t\t\tpathLength[j] = pathLength[k] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid_row == source) {\n\t\tpathLength[tid_col] = 1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[tid_row * N + k] && pathLength[tid_row] > 0) {\n\t\t\tpathLength[tid_col] = min(pathLength[tid_col], pathLength[tid_row] + 1);\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + 
"is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tidx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tidy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (tidx == source && tidx == tidy) {\n\t\tpathLength[tidx] = 0;\n\t} else if (tidx == tidy) {\n\t\tpathLength[tidx] = -1;\n\t} else if (A[tidx * N + tidy] == 1) {\n\t\tpathLength[tidx] = 1;\n\t} else {\n\t\tpathLength[tidx] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[tidx] == -1 || pathLength[tidy] == -1) {\n\t\t\tcontinue;\n\t\t}\n\t\tif (pathLength[tidx] + pathLength[tidy] + 1 < pathLength[k]) {\n\t\t\tpathLength[k] = pathLength[tidx] + pathLength[tidy] + 1;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = A[i * N + j];\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tint minPath = pathLength[j];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] >= 0 && A[j * N + k] > 0) {\n\t\t\tint newPath = pathLength[k] + A[j * N + k];\n\t\t\tif (newPath < minPath) {\n\t\t\t\tminPath = newPath;\n\t\t\t}\n\t\t}\n\t}\n\tpathLength[j] = minPath;\n\t__syncthreads();\n\tif (j == dest) {\n\t\t*pathLength = pathLength[j];\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The shortest path length is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the distance array to all infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize the queue with the source node\n std::queue q;\n q.push(source);\n\n // Perform a breadth-first search to find the shortest path\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to the destination\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Use OpenMP for parallel BFS\n #pragma omp parallel\n {\n while (true) {\n bool updated = false;\n\n // Relax all edges in parallel\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n updated = true;\n }\n }\n }\n\n // Check if any rank has updated distances\n int global_updated;\n MPI_Allreduce(&updated, &global_updated, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n // If no rank has updated distances, break\n if (!global_updated) {\n break;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] 
= dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length on rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector recv_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &recv_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return recv_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Return the shortest path length\n return shortest_path_length;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS using MPI and 
OpenMP\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n for (int i = tid; i < N; i += num_threads) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Use OpenMP for parallelization within each rank\n #pragma omp parallel\n {\n // Perform BFS using a queue\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // For each neighbor of u\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n }\n\n // Gather the distances from all ranks to rank 0\n std::vector all_dist(N * MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Allgather(&dist[0], N, MPI::INT, &all_dist[0], N, MPI::INT);\n\n // Find the minimum distance to dest\n int min_dist = INT_MAX;\n for (int i = 0; i < N * MPI::COMM_WORLD.Get_size(); ++i) {\n if (all_dist[i] != INT_MAX && i / N == dest) {\n min_dist = std::min(min_dist, all_dist[i]);\n }\n }\n\n // Return the minimum distance\n return min_dist;\n}\n", + "\n\n // Initialize the distance to the source to 0 and all other distances to infinity\n std::vector distance(N, std::numeric_limits::max());\n distance[source] = 0;\n\n // Initialize the queue with the source\n std::queue queue;\n queue.push(source);\n\n // While the queue is not empty\n while (!queue.empty()) {\n // Dequeue the next node\n int current = queue.front();\n queue.pop();\n\n // If the current node is the destination, return the distance\n if (current == dest) {\n return distance[current];\n }\n\n // For each neighbor of the current node\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A[current * N + i] == 1) {\n // If the distance to the neighbor is greater than the distance to the current node plus 1\n if (distance[i] > distance[current] + 1) {\n // Set the distance to the neighbor to the distance to the current node plus 1\n distance[i] = distance[current] + 1;\n // Enqueue the neighbor\n queue.push(i);\n }\n }\n }\n }\n\n // If the destination is not reachable, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = 
std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i++) {\n if (i % N == dest) {\n shortest_path_length = std::min(shortest_path_length, global_dist[i]);\n }\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n for (int i = tid; i < N; i += num_threads) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances from all ranks to rank 0\n std::vector global_dist(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length on rank 0\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, global_dist[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int curr = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (curr == dest) {\n return dist[curr];\n }\n\n // Relax neighbors\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n if (A[curr * N + i] && dist[i] > dist[curr] + 1) {\n dist[i] = dist[curr] + 1;\n q.push(i);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n // Initialize the distance array.\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation.\n for (int i = 0; i < N - 1; i++) {\n#pragma omp parallel for\n for (int j = 0; j < N; j++) {\n for (int k = 0; k < N; k++) {\n if (A[j * N + k] != 0 && dist[j] + A[j * N + k] < dist[k]) {\n dist[k] = dist[j] + A[j * N + k];\n }\n }\n }\n }\n\n // Check for negative-weight cycles.\n for (int i = 0; i < N - 1; i++) {\n#pragma omp parallel for\n for (int j = 0; j 
< N; j++) {\n for (int k = 0; k < N; k++) {\n if (A[j * N + k] != 0 && dist[j] + A[j * N + k] < dist[k]) {\n return -1;\n }\n }\n }\n }\n\n // Return the shortest path length.\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n if (A[source][i] == 1) {\n dist[i] = 1;\n }\n }\n\n #pragma omp barrier\n\n for (int j = 1; j < N; j++) {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n if (dist[i] == j) {\n #pragma omp for\n for (int k = 0; k < N; k++) {\n if (A[i][k] == 1 && dist[k] > j + 1) {\n dist[k] = j + 1;\n }\n }\n }\n }\n\n #pragma omp barrier\n }\n }\n\n // Gather distances on rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length on rank 0\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n int min_odd = INT_MAX;\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n\n return min_odd;\n}\n", + "&", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n },\n min_odd);\n return min_odd;\n}\n", + "\n\n int result = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& result) {\n if (x(i) % 2 == 1 && x(i) < result) {\n result = x(i);\n }\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n Kokkos::View y(\"y\", x.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n y(i) = (x(i) % 2 == 1) ? 
x(i) : std::numeric_limits::max();\n });\n int min = std::numeric_limits::max();\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, int& min) {\n min = std::min(min, y(i));\n });\n return min;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int n = x.extent(0);\n View> y(\"y\", n);\n\n parallel_for(\"smallestOdd\", n, KOKKOS_LAMBDA(const int i) {\n if (i % 2 == 1) {\n y(i) = x(i);\n } else {\n y(i) = std::numeric_limits::max();\n }\n });\n\n int min_odd = std::numeric_limits::max();\n for (int i = 0; i < n; ++i) {\n min_odd = std::min(min_odd, y(i));\n }\n\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n\n int min_odd = INT_MAX;\n\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n\n return min_odd;\n}\n", + "\n\n int min_odd = Kokkos::parallel_reduce(Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1) {\n min_odd = std::min(min_odd, x(i));\n }\n }, min_odd);\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto min_odd = INT_MAX;\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int smallest = Kokkos::Max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& smallest) {\n if (x(i) % 2 == 1 && x(i) < smallest) {\n smallest = x(i);\n }\n });\n return smallest;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Policy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) { min_odd = x(i); }\n }, min_odd);\n return min_odd;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Policy = Kokkos::RangePolicy;\n\n int smallest = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(Policy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& smallest) {\n if (x(i) % 2 == 1 && x(i) < smallest) {\n smallest = x(i);\n }\n });\n\n return smallest;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n int min_odd = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(\n policy_t(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n },\n min_odd);\n\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + 
"do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min:min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n min_odd = std::min(min_odd, x[i]);\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n#pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int minOdd = INT_MAX;\n #pragma omp parallel for reduction(min:minOdd)\n for (int i = 0; i < static_cast(x.size()); ++i) {\n if (x[i] % 2 != 0 && x[i] < minOdd) {\n minOdd = x[i];\n }\n }\n return minOdd;\n}\n", + ": 1", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n min_odd = std::min(min_odd, x[i]);\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n#pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel for reduction(min:smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = 
INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min:min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "serial", + "prompt": "/* Return the value of the smallest odd number in the vector x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 != 0) {\n return x[i];\n }\n }\n return -1;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest_odd) {\n smallest_odd = x[i];\n }\n }\n return smallest_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest 
= INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "smallest&", + "\n\n int smallest = -1;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && (smallest == -1 || x[i] < smallest)) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "hip", + "prompt": "/* Find the value of the smallest odd number in the vector x. Store it in smallest.\n Use AMD HIP to compute in parallel. The kernel is launched with the same number of threads as elements in x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\n__global__ void smallestOdd(const int *x, size_t N, int *smallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < 
smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "(", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n __syncthreads();\n if (smallest_local % 2 == 0) {\n return;\n }\n if (smallest_local < *smallest) {\n atomicMin(smallest, smallest_local);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n\n // Find the smallest odd number in the local workgroup\n __shared__ int smallest_shared;\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n // Reduce the local smallest values to a single global smallest value\n __syncthreads();\n if (tid == 0) {\n smallest_shared = smallest_local;\n for (int i = 1; i < hipBlockDim_x; i++) {\n if (smallest_shared > smallest_local) {\n smallest_shared = smallest_local;\n }\n }\n *smallest = smallest_shared;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < 
smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = std::numeric_limits::max();\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < my_smallest) {\n my_smallest = x[i];\n }\n }\n\n int smallest;\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n for (int 
i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (auto v : x) {\n if (v % 2 == 1) {\n my_smallest = std::min(my_smallest, v);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int min_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int n = x.size();\n int smallest = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "cuda", + "prompt": "/* Find the value of the smallest odd number in the vector x. Store it in smallest.\n Use CUDA to compute in parallel. 
The kernel is launched with the same number of threads as elements in x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\n__global__ void smallestOdd(const int *x, size_t N, int *smallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (threadIdx.x == 0) {\n *smallest = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (threadIdx.x == 0) {\n *smallest = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n atomicMin(smallest, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + 
"are_all_valid": true, + "best_sequential_runtime": 0.0808154068, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00050222079754 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = start + block_size;\n int min = INT_MAX;\n for (int i = start; i < end && i < N; i++) {\n if (x[i] % 2 == 1 && x[i] < min) {\n min = x[i];\n }\n }\n smallest[tid] = min;\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < block_size; i++) {\n if (smallest[i] < smallest[0]) {\n smallest[0] = smallest[i];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0837636752, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02315910377502 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] < smallest_local && x[i] % 2) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n if (tid == 0 || x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n 
__syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = block.thread.x;\n if (tid >= N) {\n return;\n }\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n atomicMin(smallest, x[tid]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0808368656, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00050165759325 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1 && (x[tid] < *smallest || *smallest == -1)) {\n *smallest = x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + 
"did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n #pragma omp parallel for reduction(min: my_smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n\n int smallest;\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on this rank\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int result = INT_MAX;\n #pragma omp parallel for reduction(min:result)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < result) {\n result = x[i];\n }\n }\n return result;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel\n {\n int local_smallest = INT_MAX;\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_smallest) {\n local_smallest = x[i];\n }\n }\n #pragma omp critical\n if (local_smallest < smallest) {\n smallest = local_smallest;\n }\n }\n return smallest;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on this rank\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n 
}\n }\n\n // Reduce to find the global minimum\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n // Find the smallest odd number on each rank\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int smallest_odd = INT_MAX;\n #pragma omp parallel for reduction(min: smallest_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest_odd) {\n smallest_odd = x[i];\n }\n }\n return smallest_odd;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on each rank\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n #pragma omp parallel for reduction(min:my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = std::numeric_limits::max();\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 != 0) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on this rank\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min : local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, 
&global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n #pragma omp parallel for reduction(min: my_smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n int my_smallest = INT_MAX;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n #pragma omp parallel for reduction(min:my_smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return smallest;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use Kokkos to compute product in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n double product = 1.0;\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (i % 2 == 0) {\n product *= x(i);\n } else {\n product /= x(i);\n }\n });\n return product;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n\n double product = 1.0;\n\n parallel_reduce(\n RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n\n return product;\n}\n", + "\n\n // Create a Kokkos view for the output\n Kokkos::View product = Kokkos::View(1);\n\n // Initialize the product to 1\n Kokkos::parallel_for(\"initialize_product\", Kokkos::RangePolicy(0, 1), KOKKOS_LAMBDA(const int& i) {\n product[i] = 1.0;\n });\n\n // Compute the product in parallel\n Kokkos::parallel_for(\"compute_product\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n if (i % 2 == 1) {\n product[0] *= 1.0 / x[i];\n } else {\n product[0] *= x[i];\n }\n });\n\n // Return the product\n return product[0];\n}\n", + "\n\n using namespace Kokkos;\n\n double product = 1.0;\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n 
product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double product = 1.0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& result) {\n if (i % 2 == 1) {\n result *= 1.0 / x[i];\n } else {\n result *= x[i];\n }\n });\n\n return result;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n },\n product);\n return product;\n}\n", + "\n\n double result = 1.0;\n for (int i = 0; i < x.extent(0); i += 2) {\n result *= x(i);\n }\n for (int i = 1; i < x.extent(0); i += 2) {\n result /= x(i);\n }\n return result;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x(i);\n } else {\n product *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n return product;\n}\n", + "_", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n },\n product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x(i);\n } else {\n product *= x(i);\n }\n }, product);\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "omp", + "prompt": "#include \n\n/* 
Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use OpenMP to compute product in parallel.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n int n = x.size();\n #pragma omp parallel for reduction(*: product)\n for (int i = 0; i < n; i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*: product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n int n = x.size();\n #pragma omp parallel for reduction(*:product)\n for (int i = 0; i < n; i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1.0 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n 
}\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i += 2) {\n product *= x[i] / x[i + 1];\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "serial", + "prompt": "/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n 
return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i += 2) {\n product = product * x[i] / (x[i + 1] + 0.000000001);\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n for (size_t i = 1; i < x.size(); i += 2) {\n product /= x[i];\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "&", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "hip", + "prompt": "/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Store the result in product.\n Use AMD HIP to compute product in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\n__global__ void productWithInverses(const double *x, size_t N, double *product) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 1) {\n product[0] *= 1.0 / x[tid];\n } else {\n product[0] *= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (idx % 2 == 0) {\n product[0] *= x[idx];\n } else {\n product[0] /= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n atomicAdd(product, x[gid]);\n } else {\n atomicAdd(product, 1.0 / x[gid]);\n }\n }\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n product[tid] = x[tid];\n } else {\n product[tid] = 1 / x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 0) {\n product[0] *= x[tid];\n } else {\n product[0] /= x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double prod = 1.0;\n if (gid % 2 == 0) {\n prod *= x[gid];\n } else {\n prod /= x[gid];\n }\n\n hipAtomicAdd(product, prod);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n 
product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 1) {\n product[0] *= 1.0 / x[tid];\n } else {\n product[0] *= x[tid];\n }\n }\n}\n", + ".", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 0) {\n atomicAdd(product, x[tid]);\n } else {\n atomicAdd(product, 1.0 / x[tid]);\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use MPI to compute product in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the product on all ranks.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a new vector to store the product\n std::vector product(n);\n\n // Initialize the product to 1\n product[0] = 1;\n\n // Compute the product in parallel\n for (int i = 1; i < n; i++) {\n if (i % 2 == 1) {\n product[i] = product[i - 1] * (1 / x[i]);\n } else {\n product[i] = product[i - 1] * x[i];\n }\n }\n\n // Return the product\n return product[n - 1];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD,\n MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, 
&global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Reduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 0, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "cuda", + "prompt": "/* Compute the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Store the result in product.\n Use CUDA to compute product in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\n__global__ void productWithInverses(const double *x, size_t N, double *product) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n *product *= x[gid];\n } else {\n *product /= x[gid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n product[0] *= x[gid];\n } else {\n product[0] /= x[gid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= 
x[gid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n product[0] *= x[gid];\n } else {\n product[0] /= x[gid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n product[idx] = 1.0;\n for (int i = 0; i < N; i += 2) {\n product[idx] *= x[i] / x[i + 1];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0556135174, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n product[0] *= x[idx];\n } else {\n product[0] /= x[idx];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n if (i % 2 == 1) {\n product[0] *= 1.0 / x[i];\n } else {\n product[0] *= x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n *product *= x[gid];\n } else {\n *product /= x[gid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the product of the vector x with every odd indexed 
element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use MPI and OpenMP to compute product in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the product on all ranks.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*: local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n#pragma omp parallel for reduction(* : local_product)\n for (int i = start_index; i < end_index; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Reduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 0, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*: local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size();\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < local_size; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n #pragma omp parallel for reduction(* : local_product)\n for (int i = start_index; i < end_index; i++) {\n if ((i + 1) % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n double product = 1.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the product of the even indexed elements\n #pragma omp parallel for reduction(*: product)\n for (int i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n\n // Compute the product of the odd indexed elements\n double oddProduct = 1.0;\n #pragma omp parallel for reduction(*: oddProduct)\n for (int i = 1; i < x.size(); i += 2) {\n oddProduct *= 1.0 / x[i];\n }\n\n // Reduce the odd product across all ranks\n double globalOddProduct;\n MPI_Allreduce(&oddProduct, &globalOddProduct, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n // Multiply the even and odd products to get the final result\n product *= globalOddProduct;\n\n return product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for reduction(*:local_product)\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size();\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < local_size; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n double product = 1.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the product of the even indexed elements\n #pragma omp parallel for reduction(*:product)\n for (int i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n\n // Compute the product of the odd indexed elements\n double oddProduct = 1.0;\n #pragma omp parallel for reduction(*:oddProduct)\n for (int i = 1; i < x.size(); i += 2) {\n oddProduct *= 1.0 / x[i];\n }\n\n // Reduce the odd product across all ranks\n double globalOddProduct;\n MPI_Allreduce(&oddProduct, &globalOddProduct, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n // Multiply the even and odd products\n product *= globalOddProduct;\n\n return product;\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for reduction(*:local_product)\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use Kokkos to sum in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(Kokkos::View const& x, Kokkos::View const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Create a Kokkos view to store the minimum values at each index\n Kokkos::View min_values(\"min_values\", x.extent(0));\n\n // Compute the minimum values at each index in parallel\n Kokkos::parallel_for(\"min_values_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n min_values[i] = std::min(x[i], y[i]);\n });\n\n // Reduce the minimum values in parallel to get the sum\n double sum = Kokkos::parallel_reduce(\"sum_min_values_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += min_values[i];\n }, 0.0);\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Min;\n\n auto min_values = Kokkos::parallel_reduce(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, Reducer& red) {\n red.init(std::min(x(i), y(i)));\n });\n\n return min_values.get_value();\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMemberFunction = Kokkos::TeamMemberFunction;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::RangePolicy>(0, x.extent(0))),\n TeamMemberFunction([&](const int& i) -> double { return std::min(x(i), y(i)); }),\n 0.0);\n return sum;\n}\n", + "\n\n using execution_space = Kokkos::DefaultExecutionSpace;\n using l_double = Kokkos::LayoutRight;\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n\n Kokkos::View min_values(\"min_values\", x.extent(0));\n\n Kokkos::parallel_scan(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update = std::min(x(i), y(i));\n }, min_values);\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += min_values(i);\n }, sum);\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_left;\n using Reducer = Kokkos::Min;\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, Reducer& sum) {\n sum += std::min(x(i), y(i));\n });\n\n return sum.template get<0>();\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_left;\n using 
TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = typename TeamPolicy::member_type;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::DefaultExecutionSpace(), x.extent(0)), KOKKOS_LAMBDA(const TeamMember& team, double& sum) {\n const auto i = team.league_rank();\n sum += std::min(x(i), y(i));\n });\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = Kokkos::TeamMember;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::RangePolicy>(0, x.extent(0))),\n KOKKOS_LAMBDA(TeamMember, sum, x, y) {\n sum += std::min(x(TeamMember::team_rank()), y(TeamMember::team_rank()));\n });\n return sum;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n const auto min_xy = parallel_reduce(\n RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& sum) {\n sum += std::min(x(i), y(i));\n },\n 0.0);\n\n return min_xy;\n}\n", + "\n\n Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::make_span(x.data(), x.size()), 0.0,\n [](double a, double b) { return std::min(a, b); },\n [](double a, double b) { return a + b; });\n}\n", + "\n\n using namespace Kokkos;\n double sum = 0;\n auto min_lambda = KOKKOS_LAMBDA(const double& a, const double& b) { return std::min(a, b); };\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += min_lambda(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n // Create a Kokkos view of the sum\n Kokkos::View sum(\"sum\", x.extent(0));\n\n // Parallel loop to compute the sum of the minimum values\n Kokkos::parallel_for(\"sum_of_minimum_elements\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n sum(i) = std::min(x(i), y(i));\n });\n\n // Reduce the sum in parallel\n double sum_value = Kokkos::parallel_reduce(\n \"sum_of_minimum_elements_reduce\", Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& sum) { sum += sum(i); }, 0.0);\n\n return sum_value;\n}\n", + "Kokkoss", + "\n\n const int numElements = x.extent(0);\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::Range(0, numElements), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n auto min_xy = Kokkos::min(x, y);\n auto sum = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, min_xy.extent(0)),\n KOKKOS_LAMBDA(const int& i) -> double { return min_xy(i); },\n (double)0);\n return sum;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_lambda = KOKKOS_LAMBDA(const double& x, const double& y) { return std::min(x, y); };\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += min_lambda(x(i), y(i));\n }, sum);\n\n return sum;\n}\n", + "\n\n const int N = x.extent(0);\n double sum = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n 
}, sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n auto min_values = min(x, y);\n double sum = 0;\n Kokkos::parallel_reduce(policy_type(0, min_values.extent(0)), KOKKOS_LAMBDA(int i, double& sum) {\n sum += min_values(i);\n }, sum);\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use OpenMP to sum in parallel.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n double sum = 0;\n\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + ".", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma openmp parallel for reduction (+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n 
sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "serial", + "prompt": "/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": 
"reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "hip", + "prompt": "/* Compute the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Store the result in sum.\n Use AMD HIP to sum in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\n__global__ void sumOfMinimumElements(const double *x, const double *y, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipGetGlobalIndex();\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n sum[tid] = min(x[gid], y[gid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n sum[gid] = min(x[gid], y[gid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n sum[tid] = min(x[tid], y[tid]);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use MPI to sum in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Return the sum on all ranks.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; 
i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double sum = 0;\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n 
MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "cuda", + "prompt": "/* Compute the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Store the result in sum.\n Use CUDA to sum in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\n__global__ void sumOfMinimumElements(const double *x, const double *y, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": ", Synagogues_", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": 
true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n const int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[0] += fmin(x[tid], y[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "{", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + 
threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n sum[tid] = fmin(x[tid], y[tid]);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n sum[tid] = fmin(x[i], y[i]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use MPI and OpenMP to sum in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Return the sum on all ranks.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+ : local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n double sum = 0.0;\n int n = x.size();\n #pragma opm parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n double sum = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int chunk = n / nprocs;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == nprocs - 1) {\n end = n;\n }\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+ : local_sum)\n for (int i = start; i < end; i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, 
MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n int n = x.size();\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < n; i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n double sum = 0.0;\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int num_elements = x.size();\n int elements_per_rank = num_elements / num_ranks;\n int start_index = my_rank * elements_per_rank;\n int end_index = start_index + elements_per_rank;\n\n for (int i = start_index; i < end_index; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n double sum = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double sum;\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel 
for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::sum;\n\n const int N = x.extent(0);\n\n double sum_x = 0.0;\n parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) { sum += x(i); }, sum_x);\n\n return sum_x / N;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n double sum = 0;\n parallel_reduce(RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); }, sum);\n\n double count = x.extent(0);\n return sum / count;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n using namespace Kokkos;\n\n double sum = 0;\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n auto sum = Kokkos::parallel_reduce(Policy(0, x.extent(0)), Reducer(0.0),\n KOKKOS_LAMBDA(const int& i, Reducer& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n using namespace Kokkos;\n double sum = 0;\n auto policy = Kokkos::RangePolicy<>(0, x.size());\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.size();\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n using l_double = Kokkos::LDDouble;\n\n auto sum = Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, l_double& sum) {\n sum += x(i);\n },\n l_double(0.0));\n\n return static_cast(sum) / x.extent(0);\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n policy_type policy(x.extent(0));\n\n double sum = 0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n const auto sum = 
Kokkos::parallel_reduce(Policy(0, x.size()), Reducer(0.0),\n KOKKOS_LAMBDA(const int& i, const Reducer& sum) {\n return sum + x(i);\n });\n\n return sum / x.size();\n}\n", + "\n\n typedef Kokkos::RangePolicy policy_type;\n policy_type policy(x.extent(0));\n\n double sum = 0.0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n return Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::Kok\u30b3\u30af_EXEC_SPACE, x, 0.0, Kok\u30b3\u30af::plus(), Kok\u30b3\u30af::divide(x.size()));\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n auto sum = parallel_reduce(\n RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); }, 0.0);\n\n return sum / x.size();\n}\n", + "\n\n using execution_space = typename Kokkos::View::execution_space;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto sum = Kokkos::parallel_reduce(\n policy_type(x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n },\n (double)0.0, tag_type());\n\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.size();\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n double sum = 0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(int i, double& sum) { sum += x(i); }, sum);\n return sum / x.size();\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using reduction_type = Kokkos::Sum;\n\n auto sum = Kokkos::parallel_reduce(policy_type(0, x.size()), reduction_type(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); });\n\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use OpenMP to compute in parallel.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma 
omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "serial", + "prompt": "/* Return the average of the vector x.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (double n : x) {\n sum += n;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / 
x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double num : x) {\n sum += num;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "hip", + "prompt": "/* Compute the average of the vector x. Store the result in average.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Examples:\n \n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\n__global__ void average(const double *x, size_t N, double *average) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (size_t i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double shared_sum;\n shared_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = shared_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (int i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double block_sum;\n block_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = block_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n atomicAdd(average, x[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int 
tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n double sum = 0;\n size_t idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum += x[idx];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (int i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double sharedSum;\n sharedSum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = sharedSum / N;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n average[0] += x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = block.x;\n if (tid == 0) {\n *x = 0;\n for (int i = 0; i < N; i++) {\n *x += x[i];\n }\n *x = *x / N;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n double sum = 0;\n for (int i = tid; i < N; i += blockDim.x) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the average on all ranks.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg = sum / x.size();\n\n double global_avg;\n MPI_Allreduce(&avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n\n return global_avg;\n}\n", + "\n\n double sum = 0.0;\n for (double xi : x) {\n sum += xi;\n }\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0.0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / size;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (auto xi : x) {\n sum += xi;\n }\n double avg = sum / x.size();\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (auto v : x) {\n sum += v;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= 
size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg = sum / x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / x.size();\n double global_avg;\n MPI_Allreduce(&avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n return global_avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double local_sum = 0;\n for (double xi : x) {\n local_sum += xi;\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "cuda", + "prompt": "/* Compute the average of the vector x. Store the result in average.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Examples:\n \n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\n__global__ void average(const double *x, size_t N, double *average) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } 
+ ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n average[0] += x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (size_t i = block_idx.x * block_dim.x + thread_idx.x; i < N; i += block_dim.x * grid_dim.x) {\n sum += x[i];\n }\n * average = sum / N;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n __shared__ double sum;\n sum = 0;\n if (i < N) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + 
"did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n\n double sum = 0;\n for (int i = tid; i < N; i += bid_size * grid_size) {\n sum += x[i];\n }\n\n __shared__ double shared_sum;\n shared_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = shared_sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n double sum = 0;\n for (int i = tid + bid * bid_size; i < N; i += grid_size * bid_size) {\n sum += x[i];\n }\n double avg = sum / N;\n *average = avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n __shared__ double sum;\n if (i < N) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + 
{ + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n double sum = 0;\n if (tid < N) {\n sum = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n double sum_all = 0;\n for (int i = 0; i < gridDim.x; i++) {\n sum_all += sum;\n }\n *average = sum_all / N;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the average of the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the average on all ranks.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / size;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_avg = 0.0;\n #pragma omp parallel for reduction(+:local_avg)\n for (int i = 0; i < x.size(); ++i) {\n local_avg += x[i];\n }\n local_avg /= x.size();\n\n double global_avg;\n MPI_Allreduce(&local_avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n\n return global_avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 
MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a sum variable to store the sum of the elements in the vector\n double sum = 0.0;\n\n // Use OpenMP to parallelize the loop over the elements in the vector\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n // Add the current element to the sum\n sum += x[i];\n }\n\n // Get the average by dividing the sum by the number of elements\n double avg = sum / n;\n\n // Return the average\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n double local_avg = 0;\n int local_size = x.size();\n double sum = 0;\n\n#pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < local_size; i++) {\n sum += x[i];\n }\n\n local_avg = sum / local_size;\n\n double global_avg = 0;\n MPI_Allreduce(&local_avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= MPI_Comm_size(MPI_COMM_WORLD);\n\n return global_avg;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n", + "\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / x.size();\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n double global_sum = 0;\n\n #pragma omp parallel for reduction(+: local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 
MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n return sum / n;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use Kokkos to reduce in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n auto xor_reduce = reduce(x, (bool)false, [](bool a, bool b) { return a ^ b; });\n return xor_reduce;\n}\n", + "\n\n // Create a Kokkos::TeamPolicy for the reduction.\n Kokkos::TeamPolicy policy(x.extent(0));\n\n // Define the reduction functor.\n struct XORFunctor {\n bool result = false;\n\n KOKKOS_INLINE_FUNCTION\n void operator()(const bool& x) const { result ^= x; }\n };\n\n // Create a Kokkos::ParallelReduce object to perform the reduction.\n Kokkos::ParallelReduce reduce(result);\n\n // Invoke the reduction.\n Kokkos::parallel_for(policy, reduce);\n\n // Return the result.\n return reduce.result;\n}\n", + "\n\n bool result = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& result) { result ^= x(i); }, result);\n return result;\n}\n", + "\n\n using namespace Kokkos;\n\n bool result = false;\n auto policy = Kokkos::RangePolicy(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n }, result);\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy>;\n using TeamMemberType = typename Policy::member_type;\n\n bool result = false;\n\n Kokkos::parallel_reduce(Policy(1), KOKKOS_LAMBDA(const TeamMemberType& team) {\n bool local_result = false;\n const bool* x_data = x.data();\n for (int i = team.league_rank(); i < x.size(); i += team.league_size()) {\n local_result ^= x_data[i];\n }\n team.team_reduce(KOKKOS_LAMBDA(const bool& a, const bool& b) { return a ^ b; }, local_result);\n }, result);\n\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x[i];\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x[i];\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), 
KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy;\n using TeamMemberType = typename Policy::member_type;\n\n auto reduce = KOKKOS_LAMBDA(TeamMemberType team) {\n bool local_result = false;\n for (int i = team.league_rank(); i < x.extent(0); i += team.league_size()) {\n local_result ^= x(i);\n }\n return local_result;\n };\n\n bool global_result = Kokkos::parallel_reduce(Policy(x.extent(0)), reduce);\n return global_result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(x), Kokkos::Scan);\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i) {\n result ^= x(i);\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(false), x);\n}\n", + "\n\n using namespace Kokkos;\n bool result = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& result) {\n if (x(i)) {\n result = !result;\n }\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy;\n using Reducer = Kokkos::LogicalXor;\n\n return parallel_reduce(Policy(x.extent(0)), Reducer(false), x);\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(false),\n KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n}\n", + "\n\n using namespace Kokkos;\n\n auto reduce_functor = KOKKOS_LAMBDA(const bool& a, const bool& b) {\n return a ^ b;\n };\n\n auto reduce_result = reduce(x, reduce_functor, false);\n\n return reduce_result;\n}\n", + "\n\n bool result = x(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(1, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x(i);\n });\n return result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use OpenMP to reduce in parallel.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n {\n for (auto const& xi : x) {\n result ^= 
xi;\n }\n }\n return result;\n}\n", + ")", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(+:result)\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel for reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "serial", + "prompt": "/* Return the logical XOR reduction of the vector of bools x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool 
result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "(", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + ")", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "hip", + "prompt": "/* Compute the logical XOR reduction of the vector of bools x. Store the result in output.\n Use AMD HIP to reduce in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\n__global__ void reduceLogicalXOR(const bool *x, size_t N, bool *output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n bool local_result = x[gid];\n __syncthreads();\n while (blockDim.x <= N) {\n if (tid == 0) {\n local_result ^= x[gid + blockDim.x];\n }\n __syncthreads();\n blockDim.x *= 2;\n __syncthreads();\n }\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int size = blockDim.x;\n bool local_result = x[bid * size + tid];\n __syncthreads();\n for (int i = size / 2; i > 0; i /= 2) {\n if (tid < i) {\n local_result ^= x[bid * size + tid + i];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int warp_size = warpSize;\n int num_warps = bid_size / warp_size;\n int warp_id = tid / warp_size;\n int lane_id = tid % warp_size;\n bool warp_result = x[bid * bid_size + tid];\n for (int i = 1; i < warp_size; i++) {\n warp_result ^= x[bid * bid_size + tid + i];\n }\n __shared__ bool shared_result[num_warps];\n shared_result[warp_id] = warp_result;\n __syncthreads();\n if (warp_id == 0) {\n bool result = shared_result[0];\n for (int i = 1; i < num_warps; i++) {\n result ^= shared_result[i];\n }\n *output = result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n bool local_result = x[tid];\n __syncthreads();\n while (blockDim.x <= N) {\n if (tid < N) { local_result ^= 
x[tid + blockDim.x]; }\n __syncthreads();\n blockDim.x *= 2;\n __syncthreads();\n }\n if (tid == 0) { *output = local_result; }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n bool local_result = x[gid];\n\n __syncthreads();\n\n while (bid > 0) {\n if (tid == 0) {\n local_result ^= x[gid + hipBlockDim_x];\n }\n bid >>= 1;\n gid >>= 1;\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n bool local_result = x[gid];\n\n while (gid < N) {\n local_result ^= x[gid];\n gid += hipBlockDim_x;\n }\n\n output[tid] = local_result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n bool result = false;\n while (gid < N) {\n result ^= x[gid];\n gid += hipBlockDim_x * hipGridDim_x;\n }\n output[tid] = result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n output[0] ^= output[i];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use MPI to reduce in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on all ranks.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = false;\n for (bool b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool result = x[rank];\n MPI_Allreduce(&result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return 
global_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n MPI_Allxor(&result, &result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = x[0];\n for (size_t i = 1; i < x.size(); i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "cuda", + "prompt": "/* Compute the logical XOR reduction of the vector of bools x. Store the result in output.\n Use CUDA to reduce in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\n__global__ void reduceLogicalXOR(const bool *x, size_t N, bool *output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Each thread reduces a single element of the output.\n int tid = threadIdx.x;\n output[tid] = x[tid];\n __syncthreads();\n\n // Reduce the output in place.\n while (tid < N) {\n output[tid] ^= output[tid + blockDim.x];\n tid += blockDim.x * 2;\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Each thread reduces a single element of the output\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the output in place\n for (int stride = 1; stride < N; stride *= 2) {\n for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < N; tid += 2 * stride) {\n output[tid] ^= output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Each thread reduces a subset of the input.\n int tid = threadIdx.x;\n int stride = blockDim.x;\n bool local_result = false;\n for (int i = tid; i < N; i += stride) {\n local_result ^= x[i];\n }\n\n // Reduce the local results using a shared memory reduction.\n __shared__ bool shared_result[blockDim.x];\n shared_result[tid] = local_result;\n __syncthreads();\n\n // The first thread writes the final result to output.\n if (tid == 0) {\n bool final_result = false;\n for (int i = 0; i < blockDim.x; ++i) {\n final_result ^= shared_result[i];\n }\n *output = final_result;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + 
"generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Each thread reduces a single element of the output\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the output in place\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid] ^= output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncbarriers();\n while (N > 1) {\n if (tid < N / 2) {\n output[tid] ^= output[tid + N / 2];\n }\n __syncbarriers();\n N /= 2;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n bool result = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n result ^= x[i];\n }\n output[bid] = result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n bool result = x[bid * stride + tid];\n __syncthreads();\n while (stride > 1) {\n if (tid < stride) {\n result ^= x[bid * stride + tid + stride];\n }\n stride /= 2;\n __syncthreads();\n }\n if (tid == 0) {\n *output = result;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int num_elements = N;\n int num_blocks = (num_elements + bid_size - 1) / bid_size;\n\n bool local_result = x[tid];\n __syncthreads();\n while (bid_size > 1) {\n if (tid < bid_size / 2) {\n local_result ^= x[tid + bid_size / 2];\n }\n bid_size /= 2;\n __syncthreads();\n }\n if (bid == 0) {\n *output = local_result;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Each thread reduces a single element of the vector\n int tid = threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the vector in parallel\n while (N > 1) {\n if (tid < N / 2) {\n output[tid] ^= output[tid + N / 2];\n }\n N /= 2;\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = blockDim.x; i < N; i += blockDim.x) {\n output[0] ^= output[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Each thread reduces a single element of the output.\n int idx = threadIdx.x;\n output[idx] = x[idx];\n\n // Each thread reduces a pair of elements of the output.\n while (idx < N) {\n output[idx] ^= x[idx + 1];\n idx += 2;\n }\n\n // Reduce the remaining elements of the output.\n for (int i = 1; i < N; i *= 2) {\n for (int j = 0; j < N; j += 2 * i) {\n output[j] ^= output[j + i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n bool local_result = x[tid];\n\n for (int i = tid + stride; i < N; i += stride) {\n local_result ^= x[i];\n }\n\n output[bid] = local_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n 
int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int num_blocks = gridDim.x;\n\n bool local_result = x[tid];\n\n __syncthreads();\n\n while (bid_size > 1) {\n if (tid < bid_size / 2) {\n local_result = local_result ^ x[tid + bid_size / 2];\n }\n bid_size /= 2;\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n bool result = x[bid * stride + tid];\n\n __syncthreads();\n\n for (int offset = stride; offset < N; offset *= 2) {\n if (tid < offset) {\n result ^= x[bid * stride + tid + offset];\n }\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = result;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x * gridDim.x;\n\n bool local_result = x[tid + bid * stride];\n\n for (int i = 2 * stride; i <= N; i *= 2) {\n if (tid < N / i) {\n local_result ^= x[tid + bid * i];\n }\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use MPI and OpenMP to reduce in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on all ranks.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(^: local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int nranks;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n\n bool local_result = false;\n for (auto b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Reduce on each rank\n bool local_result = false;\n #pragma omp parallel for reduction(^:local_result)\n for (size_t i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n // Reduce across all ranks\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n\n int nranks;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n\n bool local_result = false;\n for (bool b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR,\n MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i 
< x.size(); ++i) {\n local_result ^= x[i];\n }\n }\n\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int nranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(+:local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n #pragma omp parallel for reduction(^:result)\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool local_result = x[my_rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + ".", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool local_result = x[my_rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n\n bool result;\n MPI_Reduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, 0, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n bool result = x[0];\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (int i = 1; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = false;\n #pragma omp parallel for reduction(^:local_result)\n for (size_t i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n // Reduce the vector of bools using MPI_Reduce.\n bool result;\n MPI_Allreduce(MPI_IN_PLACE, &x[0], x.size(), MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n // Return the result.\n return result;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the array x into output.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(Kokkos::View const& x, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA (const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int i) {\n output(i) = 0;\n for (int j = i; j >= 0; j--) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n output(i) = 0;\n for (int j = i; j < n; ++j) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::View temp = output.slice(0, output.size() - 1);\n Kok\u30b3\u30af::View temp2 = output.slice(1, output.size());\n Kok\u30b3\u30af::View x_rev = x.reversed();\n Kok\u30b3\u30af::View temp3 = x_rev.slice(1, x_rev.size());\n Kok\u30b3\u30af::View temp4 = x_rev.slice(0, x_rev.size() - 1);\n Kok\u30b3\u30af::View temp5 = temp4 + temp3;\n temp = temp5;\n temp2 = temp5 + temp;\n\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n output(i) = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, i),\n KOKKOS_LAMBDA(const int& j, const int& val) { return val + x(i - j - 1); },\n 0);\n });\n}\n", + "(", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(\"reversePrefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i, int& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = x(i) + output(i - 1);\n }\n },\n output(n - 1));\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i) { output(i) = output(n - 1) - output(i); });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i, int& sum) {\n sum = (i == 0) ? 
x(i) : sum + x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n // Compute the reverse prefix sum\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n Kokkos::ScanSum(Kokkos::Reverse));\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n output(N - 1));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n output(i) = x(x.extent(0) - 1 - i);\n if (i > 0) output(i) += output(i - 1);\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA (const int i) {\n output(i) = 0;\n for (int j = n - 1; j >= i; j--) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n output(i) = (i == 0) ? x(i) : x(i) + output(i - 1);\n });\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Tag = Kokkos::TagType;\n\n Kokkos::parallel_for(Policy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n output(i) = x(x.extent(0) - 1 - i);\n if (i > 0) output(i) += output(i - 1);\n }, Tag());\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n 0);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use OpenMP to compute in parallel.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n 
for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n output[i] = x[i];\n } else {\n output[i] = output[i + 1] + x[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n output[i] = 0;\n for (int j = n - 1; j >= i; --j) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n output[i] = 0;\n for (int j = i; j < n; j++) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "serial", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 
1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output.back() = x.back();\n for (int i = x.size() - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n for (int i = x.size() - 1; i >= 0; --i) {\n if (i + 1 < x.size()) {\n output[i] = x[i] + output[i + 1];\n } else {\n output[i] = x[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output.back() = x.back();\n for (int i = x.size() - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\n__global__ void reversePrefixSum(const int *x, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n if (tid > 0) {\n output[tid] += output[tid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n output[gid] = x[N - 1 - gid];\n if (gid > 0) { output[gid] += output[gid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[N - 1 - gid] = x[gid];\n }\n __syncthreads();\n for (int i = N - 2; i >= 0; i--) {\n if (gid < N) {\n output[i] = output[i + 1] + x[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n int i = tid + bid * blockSize;\n if (i < N) {\n output[i] = 0;\n for (int j = N - 1; j >= i; j--) { output[i] += x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = blockDim.x / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int offset = bid * hipBlockDim_x + tid;\n if (offset < N) {\n output[offset] = x[N - 1 - offset];\n if (offset > 0) {\n output[offset] += output[offset - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n 
__syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum in reverse order\n std::vector prefix_sum(x.size());\n prefix_sum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n prefix_sum[i] = prefix_sum[i + 1] + x[i];\n }\n\n // Gather the prefix sums from all ranks\n std::vector gathered_prefix_sums(x.size() * size);\n MPI_Gather(prefix_sum.data(), x.size(), MPI_INT, gathered_prefix_sums.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gathered_prefix_sums[i * size + size - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_sum = 0;\n for (int i = n - 1; i >= 0; i--) {\n local_sum += x[i];\n if (rank == 0) {\n output[i] = local_sum;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n, 0);\n std::vector global_sums(n, 0);\n\n // Compute the partial sums on each rank\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n if (i > 0) {\n partial_sums[i] += partial_sums[i - 1];\n }\n }\n\n // Gather the partial sums from all ranks\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n global_sums[i] += global_sums[i + 1];\n }\n output[i] = global_sums[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.reserve(n);\n std::iota(std::execution::par, output.begin(), output.end(), 0);\n std::transform(std::execution::par, x.rbegin(), x.rend(), output.rbegin(),\n [](int a, int b) { return a + b; });\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n 
}\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n std::vector prefix_sum(n);\n MPI_Allreduce(x.data(), prefix_sum.data(), n, MPI_INT, MPI_SUM, comm);\n\n if (rank == 0) {\n output.resize(n);\n for (int i = 0; i < n; i++) {\n output[i] = prefix_sum[n - 1 - i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n local_output[i] = x[i];\n } else {\n local_output[i] = x[i] + local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n for (int i = 0; i < n; i++) {\n if (i == 0) {\n output[i] = global_output[i];\n } else {\n output[i] = global_output[i] - global_output[i - 1];\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n for (int i = n - 1; i >= 0; i--) {\n if (rank == 0) {\n output[i] = local_x[i];\n }\n MPI_Allreduce(MPI_IN_PLACE, &local_x[i], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n local_output[i] = output[i];\n }\n }\n if (rank == 0) {\n output = local_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_sum = 0;\n for (int i = x.size() - 1; i >= 0; i--) {\n local_sum += x[i];\n if (rank == 0) {\n output[i] = local_sum;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector global_sums(n);\n\n // Compute the partial sums on each rank\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n if (i > 0) {\n partial_sums[i] += partial_sums[i - 1];\n }\n }\n\n // Gather the partial sums from all ranks\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n global_sums[i] += global_sums[i + 1];\n }\n output[i] = global_sums[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector partial_sums(n);\n std::partial_sum(x.rbegin(), x.rend(), partial_sums.rbegin());\n\n std::vector global_sums(n);\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n local_output[i] = x[i];\n } else {\n local_output[i] = x[i] + local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n 
MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Reverse the global prefix sum\n for (int i = 0; i < n / 2; i++) {\n std::swap(global_output[i], global_output[n - i - 1]);\n }\n\n // Store the result in output on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the reverse prefix sum\n std::vector reverse_prefix_sum(x.size());\n reverse_prefix_sum[x.size() - 1] = prefix_sum[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] - x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n output = reverse_prefix_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefix_sum(n);\n std::vector reverse_prefix_sum(n);\n\n // Compute the prefix sum\n prefix_sum[0] = x[0];\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the reverse prefix sum\n reverse_prefix_sum[n - 1] = prefix_sum[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] + prefix_sum[i];\n }\n\n // Gather the reverse prefix sum on rank 0\n MPI_Gather(&reverse_prefix_sum[0], n, MPI_INT, &output[0], n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n local_output[i] = x[i];\n if (i + 1 < n) {\n local_output[i] += local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i + 1 < n) {\n global_output[i] -= global_output[i + 1];\n }\n }\n\n // Store the result on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector global_sums(n);\n\n // Compute the partial sums on each rank\n partial_sums[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n partial_sums[i] = partial_sums[i + 1] + x[i];\n }\n\n // Gather the partial sums into the global_sums vector on rank 0\n MPI_Gather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n output[0] = global_sums[0];\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + global_sums[n - i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x in reverse order\n std::vector reversePrefixSum(x.size());\n reversePrefixSum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reversePrefixSum[i] = reversePrefixSum[i + 1] + x[i];\n }\n\n // Gather the reverse prefix sums from all ranks\n std::vector 
gatheredReversePrefixSum(x.size() * size);\n MPI_Gather(reversePrefixSum.data(), x.size(), MPI_INT, gatheredReversePrefixSum.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the final reverse prefix sum\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gatheredReversePrefixSum[i * size];\n }\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Compute the prefix sum of the vector x in reverse order.\n std::vector reverse_prefix_sum(x.size());\n reverse_prefix_sum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] + x[i];\n }\n\n // Gather the reverse prefix sums from all ranks.\n std::vector gathered_reverse_prefix_sums(x.size() * num_ranks);\n MPI_Gather(reverse_prefix_sum.data(), x.size(), MPI_INT, gathered_reverse_prefix_sums.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the output vector.\n if (my_rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gathered_reverse_prefix_sums[i * num_ranks + my_rank];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n for (int i = 0; i < n; i++) {\n local_output[i] = x[n - i - 1];\n }\n\n std::vector global_output(n);\n MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector recvbuf(n);\n std::vector sendbuf(n);\n for (int i = 0; i < n; i++) {\n sendbuf[i] = x[n - i - 1];\n }\n MPI_Allgather(sendbuf.data(), n, MPI_INT, recvbuf.data(), n, MPI_INT, MPI_COMM_WORLD);\n int sum = 0;\n for (int i = 0; i < n; i++) {\n sum += recvbuf[i];\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\n__global__ void reversePrefixSum(const int *x, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (int j = block_size / 2; j > 0; j /= 2) {\n if (tid < j) {\n output[tid] += output[tid + j];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n for (int i = tid + 1; i < N; i++) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n int offset = bid * stride;\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i + offset];\n output[i + offset] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N - 1; i >= 0; i--) {\n if (tid == i) {\n output[i] += output[i + 1];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid == 0) {\n output[0] = x[0];\n } else {\n output[tid] = output[tid - 1] + x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } 
+ ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int i = blockIdx.x * blockDim.x + tid;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (int j = i + blockDim.x; j < N; j += blockDim.x) {\n output[j] = output[j - blockDim.x] + x[j];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int sum = 0;\n for (int i = tid; i < N; i += stride) {\n sum += x[i];\n output[N - 1 - i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": 
false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n output[i] = 0;\n for (int j = i + 1; j < N; j++) {\n output[i] += x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n int offset = bid * stride;\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i + offset];\n output[i + offset] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n if (i > 0) { output[i] += output[i - 1]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n int offset = block * stride + tid;\n\n if (offset < N) {\n output[offset] = x[offset];\n }\n\n __syncthreads();\n\n for (int i = offset + stride; i < N; i += stride) {\n output[i] = output[i - stride] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n const int tid = threadIdx.x;\n if (tid == 0) {\n output[0] = x[0];\n } else {\n output[tid] = output[tid - 1] + x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_size * block_id + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int stride = blockDim * gridDim;\n int start = tid + blockId * blockDim;\n int end = 
min(start + stride, N);\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector reverse_partial_sums(n);\n\n // Compute the partial sums in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n partial_sums[i] = x[i];\n for (int j = i + 1; j < n; ++j) {\n partial_sums[j] += partial_sums[j - 1];\n }\n }\n\n // Compute the reverse partial sums in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; --i) {\n reverse_partial_sums[i] = partial_sums[i];\n if (i > 0) {\n reverse_partial_sums[i] += reverse_partial_sums[i - 1];\n }\n }\n\n // Gather the reverse partial sums on rank 0\n MPI_Gather(&reverse_partial_sums[0], n, MPI_INT, &output[0], n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local prefix sum\n std::vector local_prefix_sum(n);\n local_prefix_sum[0] = x[0];\n for (int i = 1; i < n; i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n\n // Compute the global prefix sum\n std::vector global_prefix_sum(n);\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n if (rank == 0) {\n output.resize(n);\n output[n - 1] = global_prefix_sum[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = global_prefix_sum[i] - global_prefix_sum[i + 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_output[i] = local_x[n - 1 - i];\n }\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n std::vector local_prefix_sum(x.size());\n local_prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n\n // Compute the global prefix sum\n std::vector global_prefix_sum(x.size());\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), x.size(), MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // 
Compute the reverse prefix sum\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = global_prefix_sum[x.size() - 1] - global_prefix_sum[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int n = x.size();\n std::vector partial_sums(n);\n\n // Compute the partial sums in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n partial_sums[j] += partial_sums[j - 1];\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sums(n);\n MPI_Reduce(partial_sums.data(), global_sums.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum on rank 0\n if (my_rank == 0) {\n output.resize(n);\n output[n - 1] = global_sums[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = global_sums[i] - global_sums[i + 1];\n }\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local reverse prefix sum\n std::vector local_output(n);\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (my_rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n std::vector global_output(n);\n MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the prefix sum in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_output[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n local_output[i] += x[j];\n }\n }\n\n // Reduce the local prefix sums to the global prefix sum on rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Reverse the prefix sum on rank 0\n if (rank == 0) {\n for (int i = 0; i < n / 2; i++) {\n std::swap(output[i], output[n - i - 1]);\n }\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel 
for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n prefix_sum[i] += prefix_sum[i + 1];\n }\n }\n\n // Gather the results on rank 0\n if (my_rank == 0) {\n output = prefix_sum;\n }\n}\n", + "\n\n int n = x.size();\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &n_procs);\n int local_n = n / n_procs;\n int remainder = n % n_procs;\n if (my_rank == 0) {\n output.resize(n);\n }\n std::vector local_x(local_n + (my_rank < remainder ? 1 : 0));\n MPI_Scatter(x.data(), local_n, MPI_INT, local_x.data(), local_n, MPI_INT, 0,\n MPI_COMM_WORLD);\n std::vector local_output(local_n + (my_rank < remainder ? 1 : 0));\n local_output[local_n - 1] = local_x[local_n - 1];\n #pragma omp parallel for\n for (int i = local_n - 2; i >= 0; --i) {\n local_output[i] = local_x[i] + local_output[i + 1];\n }\n MPI_Gather(local_output.data(), local_n, MPI_INT, output.data(), local_n, MPI_INT,\n 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Reverse the prefix sum in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n output[i] = prefix_sum[n - 1 - i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nranks; i++) {\n std::vector recv_buf(n);\n MPI_Recv(recv_buf.data(), n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n output[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n prefix_sum[i] -= prefix_sum[i + 1];\n }\n output[i] = prefix_sum[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = x.size() - 1; i >= 0; i--) {\n if (i < x.size() - 1) {\n prefix_sum[i] -= prefix_sum[i + 1];\n }\n output[i] = prefix_sum[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n local_output[i] = local_x[i];\n if (i + 1 < n) {\n local_output[i] += local_output[i + 1];\n }\n }\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of the vector x\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum of the vector x\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n output[i] = prefix_sum[x.size() - 1] - prefix_sum[i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(&output[0], x.size(), MPI_INT, &output[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n using namespace Kokkos;\n\n const size_t n = x.extent(0);\n View y(\"y\", n);\n\n parallel_for(n, KOKKOS_LAMBDA(const size_t i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return sum(y);\n}\n", + "\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n using l_d_type = Kokkos::LayoutLeft;\n using l_d_view_type = Kokkos::View;\n\n const size_t n = x.extent(0);\n l_d_view_type x_copy(\"x_copy\", n);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::parallel_scan(policy_type(0, n), KOKKOS_LAMBDA(const size_t& i, l_d_view_type& x_copy) {\n if (i > 0) { x_copy(i) += x_copy(i - 1); }\n }, tag_type());\n double sum = 0;\n for (size_t i = 0; i < n; ++i) { sum += x_copy(i); }\n return sum;\n}\n", + "\n using namespace Kokkos;\n\n const int N = x.extent(0);\n View y(\"y\", N);\n\n parallel_for(N, KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return reduce(N, KOKKOS_LAMBDA(int i, double sum) { return sum + y(i); }, 0.0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_type;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = typename TeamPolicy::member_type;\n\n double sum = 0.0;\n Kokkos::parallel_reduce(TeamPolicy(Kokkos::Experimental::partition_by_coordinates(x)),\n KOKKOS_LAMBDA(TeamMember& team) {\n double local_sum = 0.0;\n for (auto i = team.league_rank(); i < x.extent(0);\n i += team.league_size()) {\n local_sum += x(i);\n }\n team.team_reduce(KOKKOS_LAMBDA(double& tsum,\n double& lsum) { tsum += lsum; });\n if (team.team_rank() == 0) { sum = team.team_reduce(); }\n });\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using execution_space = Kokkos::DefaultExecutionSpace;\n using policy_type = Kokkos::RangePolicy;\n\n // Compute the prefix sum array\n View prefix_sum(\"prefix_sum\", x.size());\n parallel_for(policy_type(0, x.size()), KOKKOS_LAMBDA(const int i) {\n prefix_sum(i) = (i == 0) ? 
x(i) : x(i) + prefix_sum(i - 1);\n });\n\n // Compute the sum of the prefix sum array\n double sum = 0;\n parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int i, double& sum) {\n sum += prefix_sum(i);\n }, sum);\n\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n using execution_space = Kokkos::DefaultExecutionSpace;\n\n const int n = x.extent(0);\n auto y = Kokkos::View(\"y\", n);\n\n Kokkos::parallel_for(\n \"prefix_sum\", policy_t(0, n), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return Kokkos::parallel_reduce(\n \"sum_prefix_sum\", policy_t(0, n), KOKKOS_LAMBDA(const double& a,\n const double& b) {\n return a + b;\n },\n y);\n}\n", + "\n using namespace Kokkos;\n\n using policy_type = Kokkos::TeamPolicy;\n policy_type policy(x.extent(0));\n\n double sum = 0.0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n double prefix_sum = 0.0;\n for (int j = 0; j <= i; ++j) { prefix_sum += x(j); }\n sum += prefix_sum;\n });\n\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n auto prefix_sum = Kokkos::scan(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n },\n Reducer(0));\n\n return Kokkos::parallel_reduce(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += prefix_sum(i);\n },\n Reducer(0));\n}\n", + "\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::TeamPolicy;\n\n double sum = 0;\n Kokkos::parallel_reduce(\n TeamPolicy<>(x.extent(0), Kokkos::AUTO),\n KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n sum += update;\n },\n sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a prefix sum view\n View prefix_sum = create_mirror_view(x);\n\n // Compute the prefix sum\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = prefix_sum(i - 1) + x(i);\n }\n prefix_sum(i) = sum;\n });\n\n // Compute the sum of the prefix sum array\n double sum_of_prefix_sum = 0.0;\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += prefix_sum(i);\n }, sum_of_prefix_sum);\n\n return sum_of_prefix_sum;\n}\n", + "\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using ViewType = Kokkos::View;\n\n const size_t n = x.extent(0);\n ViewType prefix_sum(\"prefix_sum\", n);\n\n parallel_for(\"prefix_sum\", Policy(0, n), KOKKOS_LAMBDA(const size_t& i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n\n return prefix_sum(n - 1);\n}\n", + "\n Kokkos::View prefixSum = Kokkos::View(x.extent(0));\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = prefixSum(i - 1) + x(i);\n }\n }, prefixSum);\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n update += prefixSum(i);\n }, sum);\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.extent(0));\n Kokkos::parallel_for(\"prefix_sum_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n return Kokkos::parallel_reduce(\"sum_prefix_sum\", 
Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double& sum) {\n sum += prefix_sum(i);\n }, 0.0);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto prefix_sum = Kokkos::scan(policy_type(0, x.size()), tag_type(), x,\n [=](const double& a, const double& b) { return a + b; });\n\n return Kokkos::parallel_reduce(policy_type(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += prefix_sum(i); },\n 0.0);\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n prefix_sum(i) = (i == 0) ? x(i) : x(i) + prefix_sum(i - 1);\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n double sum = 0;\n Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n sum += update;\n },\n tag_type());\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = prefix_sum(i - 1) + x(i);\n }\n prefix_sum(i) = sum;\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reduction = Kokkos::Sum;\n\n // Compute the prefix sum array\n View y = Kokkos::scan(Policy(0, x.size()), Reduction(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n // Compute the sum of the prefix sum array\n double sum = Kokkos::reduce(Policy(0, y.size()), Reduction(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += y(i);\n });\n\n return sum;\n}\n", + "\n using Kokkos::parallel_reduce;\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n }, sum);\n\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n std::vector prefixSum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefixSum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for 
reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = (i > 0 ? prefix_sum[i - 1] : 0) + x[i];\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n, 0);\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n sum += x[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) prefix_sum[i] += prefix_sum[i - 1];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n }\n for (double v : x) {\n sum += v;\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n }\n for (double v : x) {\n sum += v;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n #pragma omp parallel for reduction(+: sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n std::vector prefixSum(x.size(), 0.0);\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) { prefixSum[i] += prefixSum[i - 1]; }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 
1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n // Compute the prefix sum array\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sum array\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n return sum;\n}\n", + "\n\n // Create a vector to store the prefix sum\n std::vector prefix_sum(x.size());\n\n // Compute the prefix sum in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sum\n double sum = 0;\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n return sum;\n}\n", + "\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = (i > 0 ? prefix_sum[i - 1] : 0) + x[i];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "serial", + "prompt": "/* Compute the prefix sum array of the vector x and return its sum.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (auto xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "(", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double 
sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "hip", + "prompt": "/* Compute the prefix sum array of the vector x and compute its sum. Store the result in sum.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\n__global__ void sumOfPrefixSum(const double *x, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n sum[tid + stride] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n\n double sum_local = 0;\n for (int i = gid; i < N; i += bid_size) {\n sum_local += x[i];\n }\n\n double sum_global;\n hipReduce(&sum_global, &sum_local, 1, hipReduceSum, 0);\n *sum = sum_global;\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid < N) {\n sum[gid] = x[gid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < hipBlockDim_x; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n double prefix_sum = 0;\n for (int i = 0; i <= gid; i++) {\n prefix_sum += x[i];\n }\n sum[gid] = prefix_sum;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n sum[tid] += sum[tid + i];\n }\n __syncthreads();\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum_local = x[tid];\n __syncthreads();\n for (int i = 1; i < blockDim.x; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n sum_local += x[tid + i];\n }\n __syncthreads();\n }\n sum[tid] = sum_local;\n }\n __syncthreads();\n if (tid == 0) {\n double sum_global = 0;\n for (int i = 0; i < N; i++) {\n sum_global += sum[i];\n }\n *sum = sum_global;\n }\n}\n", + "\n\n int tid = blockIdx.x 
* blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n sum[bid] += x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n size_t stride = blockDim.x * gridDim.x;\n double localSum = 0;\n for (size_t i = tid; i < N; i += stride) {\n localSum += x[i];\n }\n __syncthreads();\n double globalSum = localSum;\n for (size_t i = 1; i < blockDim.x; i++) {\n globalSum += localSum;\n }\n __syncthreads();\n *sum = globalSum;\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n sum[tid + stride] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n if (tid > 0) { sum[tid] += sum[tid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n __syncthreads();\n if (tid > 0) { sum[tid] += sum[tid - 1]; }\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 0; i < N; i++) { sum[0] += sum[i]; }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "1\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sum\n double sum = 0;\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), 
prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = std::accumulate(prefix_sum.begin(), prefix_sum.end(), 0.0);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n\n double prefix_sum = 0;\n MPI_Allreduce(&sum, &prefix_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return prefix_sum;\n } else {\n return 0;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefixSum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum array\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sum array\n double sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n MPI_Allreduce(&sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n }\n return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "cuda", + "prompt": "/* Compute the prefix sum array of the vector x and compute its sum. Store the result in sum.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\n__global__ void sumOfPrefixSum(const double *x, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n double sum_local = 0;\n for (int i = tid; i < N; i += stride) {\n sum_local += x[i];\n }\n __shared__ double sum_shared[blockDim.x];\n sum_shared[tid] = sum_local;\n __syncthreads();\n if (tid == 0) {\n double sum_block = 0;\n for (int i = 0; i < blockDim.x; i++) {\n sum_block += sum_shared[i];\n }\n atomicAdd(sum, sum_block);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n const int tid = threadIdx.x;\n const int bid = blockIdx.x;\n const int bid_size = blockDim.x;\n const int gid = bid * bid_size + tid;\n\n if (gid < N) {\n double partial_sum = x[gid];\n if (gid > 0) {\n partial_sum += x[gid - 1];\n }\n sum[gid] = partial_sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": 
"\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = tid + bid * bid_size;\n if (i < N) {\n sum[i] = x[i];\n }\n __syncthreads();\n for (int s = bid_size / 2; s > 0; s >>= 1) {\n if (i < N && tid < s) {\n sum[i] += sum[i + s];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum_prefix = 0;\n for (int i = 0; i <= tid; i++) {\n sum_prefix += x[i];\n }\n double sum_suffix = 0;\n for (int i = tid + 1; i < N; i++) {\n sum_suffix += x[i];\n }\n sum[tid] = sum_prefix + sum_suffix;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double partial_sum = 0;\n for (int i = 0; i <= tid; i++) { partial_sum += x[i]; }\n sum[tid] = partial_sum;\n }\n __syncthreads();\n if (tid == 0) {\n double total_sum = 0;\n for (int i = 0; i < N; i++) { total_sum += sum[i]; }\n *sum = total_sum;\n }\n}\n", + "source_write_success": true, + "did_build": 
true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = 1; i < N; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double prefixSum = 0;\n for (int i = 0; i <= tid; i++) {\n prefixSum += x[i];\n }\n sum[tid] = prefixSum;\n }\n __syncthreads();\n if (tid == 0) {\n double totalSum = 0;\n for (int i = 0; i < N; i++) {\n totalSum += sum[i];\n }\n *sum = totalSum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = bid * bid_size + tid;\n\n double prefix_sum = 0;\n while (i < N) {\n prefix_sum += x[i];\n x[i] = prefix_sum;\n i += grid_size * bid_size;\n }\n\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) { *sum += x[i]; }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += 
sum[tid];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n double s = 0;\n for (int i = 0; i <= gid; i++) {\n s += x[i];\n }\n sum[gid] = s;\n if (gid > 0) {\n sum[gid] += sum[gid - 1];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(sum, x[tid]);\n }\n __syncthreads();\n if (tid == 0) {\n double sum_temp = *sum;\n atomicAdd(sum, sum_temp);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(prefixSum.data(), &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) return sum;\n else return 0.0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[0], &sum, x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefix_sum[x.size() - 1], 1, MPI_DOUBLE, &sum, 1, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n double global_sum = 0.0;\n\n // Compute the prefix sum of the local vector\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 1; i < x.size(); i++) {\n x[i] += x[i - 1];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n 
MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum of the global prefix sum on rank 0\n if (rank == 0) {\n return global_sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n double sum = 0.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); i++) {\n double local_sum = 0.0;\n for (int j = 0; j <= i; j++) {\n local_sum += x[j];\n }\n prefix_sum[i] = local_sum;\n }\n\n MPI_Reduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < prefixSum.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) prefix_sum[i] += prefix_sum[i - 1];\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) return sum;\n else return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefixSum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < n; ++i) {\n sum += prefixSum[i];\n }\n\n if (rank == 0) return sum;\n return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sumOfPrefixSum = 0.0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sumOfPrefixSum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sumOfPrefixSum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(prefix_sum.data(), &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < n; ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of each rank's portion of x\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Reduce the prefix sums across all ranks\n std::vector global_prefix_sum(x.size());\n MPI_Allreduce(prefix_sum.data(), global_prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the sum of the prefix sum array\n double sum = 0.0;\n for (int i = 0; i < x.size(); i++) {\n sum += global_prefix_sum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefixSum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n double sum = 0;\n MPI_Reduce(&prefixSum[n - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n 
std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < prefixSum.size(); i++) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector prefix_sum(n);\n std::vector global_prefix_sum(n);\n\n // Compute the prefix sum on each rank\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Reduce the prefix sums from all ranks\n MPI_Allreduce(prefix_sum.data(), global_prefix_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum of the prefix sum on rank 0\n if (rank == 0) {\n return global_prefix_sum[n - 1];\n } else {\n return 0.0;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace the i-th element of the array x with the minimum value from indices 0 through i.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 0; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n\n Kokkos::parallel_for(\n Kokkos::RangePolicy>(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n x(i) = x(i);\n } else {\n x(i) = Kokkos::min(x(i), x(i - 1));\n }\n },\n Layout());\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n 
}\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = Kokkos::min(x(i), Kokkos::min(x, i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n float min = x[0];\n for (int j = 1; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = 
min;\n }\n}\n", + "", + "\n\n int n = x.size();\n float min = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for shared(min)\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n std::vector mins(n);\n mins[0] = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n mins[i] = std::min(mins[i - 1], x[i]);\n }\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = mins[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n std::vector min(n);\n min[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min[i] = std::min(min[i - 1], x[i]);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = min[i];\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "serial", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], *std::min_element(x.begin(), x.begin() + i));\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "<, <, 
\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:>\u51a2\u51a2\u51a28\u51a2\u51a2:>\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:>:>", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "s\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2cernecernecerne,\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "hip", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\n__global__ void partialMinimums(float *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n float min = x[0];\n for (int j = 1; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n if (x[i] < min) { min = x[i]; }\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int 
bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n x[i] = min;\n }\n if (rank == 0) {\n MPI_Reduce(&x[0], &x[0], x.size(), MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(&x[0], nullptr, x.size(), MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n } else {\n MPI_Send(&x[rank], 1, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = x[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < size; i++) {\n MPI_Send(&x[i - 1], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n float min;\n MPI_Recv(&min, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x[i - 1] = std::min(x[i - 1], min);\n }\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Bcast(&x[i - 1], 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks onto rank 0\n std::vector min_values(x.size());\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &min_values[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Update the values on rank 0\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], min_values[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = 
x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = x[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < size; i++) {\n MPI_Send(&x[i - 1], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n float min;\n MPI_Recv(&min, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x[i - 1] = std::min(x[i - 1], min);\n }\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Bcast(&x[i - 1], 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "cuda", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\n__global__ void partialMinimums(float *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026018544, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.11253491210937 + } + ] + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n float min = x[0];\n for (size_t j = 1; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026054006, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.54851782226562 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026015018, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.11094482421875 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002613236, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.24193276367187 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026601226, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.11842856445312 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002606509, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.23557421875 + } + ] + }, + { + 
"generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002659092, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.23355493164062 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0025999426, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.11372250976562 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026787312, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.23474067382813 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0025999244, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.11239516601563 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026088372, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.23330717773437 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n __shared__ float min;\n\n if (tid == 0) {\n min = x[bid * bid_size];\n }\n\n __syncthreads();\n\n for (int i = bid * bid_size + tid; i < N; i += bid_size) {\n min = fminf(min, x[i]);\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + 
"did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002600059, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.11204897460938 + } + ] + }, + { + "generated_output": "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < size) {\n float min = x[i];\n for (int j = i + 1; j < size; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026051906, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.24075395507812 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0025994716, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.11140004882812 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n if (tid == 0) {\n x[bid * bid_size] = x[bid * bid_size];\n } else {\n x[bid * bid_size + tid] = fminf(x[bid * bid_size + tid], x[bid * bid_size + tid - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026080622, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.1418349609375 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026003994, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.24182690429687 + } + ] + }, + { + "generated_output": "\n\n int 
tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026011106, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.2347419921875 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank], 1, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks to rank 0\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &x[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct partial minimums\n if (rank != 0) {\n x.clear();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks to rank 0\n MPI_Gather(&x[0], n, MPI_FLOAT, &x[0], n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete list of partial minimums\n if (rank == 0) {\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float *x_arr = &x[0];\n float *min_arr = new float[n];\n\n // Compute the minimums for each rank\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n float min = x_arr[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x_arr[j]);\n }\n min_arr[i] = min;\n }\n\n // Gather the minimums from all ranks\n MPI_Gather(min_arr, n, MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n x_arr[i] = min_arr[i];\n }\n }\n\n delete[] min_arr;\n}\n", + "\n\n int rank;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector min_values(n);\n#pragma omp parallel for\n for (int i = 0; i < n; i++) {\n min_values[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n min_values[i] = std::min(min_values[i], x[j]);\n }\n }\n MPI_Allgather(&min_values[0], n, MPI_FLOAT, &x[0], n, MPI_FLOAT, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min = x[0];\n\n #pragma omp parallel for schedule(static)\n for (int i = 1; i < n; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_FLOAT, x.data(), n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(x.data(), n, MPI_FLOAT, NULL, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min_val = x[0];\n\n #pragma omp parallel for reduction(min : min_val)\n for (int i = 1; i < n; i++) {\n min_val = std::min(min_val, x[i]);\n }\n\n MPI_Bcast(&min_val, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::min(x[i], min_val);\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_FLOAT, x.data(), n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(x.data(), n, MPI_FLOAT, NULL, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n std::vector globalMinimums(x.size());\n MPI_Allgather(x.data(), x.size(), MPI_FLOAT, globalMinimums.data(), x.size(), MPI_FLOAT, MPI_COMM_WORLD);\n\n // Update the minimums on rank 0\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], globalMinimums[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n std::vector global_min(n);\n\n // Compute local minimums\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n // Reduce local minimums to global minimums\n MPI_Allreduce(local_min.data(), global_min.data(), n, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n // Store global minimums on rank 0\n if (rank == 0) {\n x = global_min;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n MPI_Reduce(local_min.data(), x.data(), n, MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector localMin(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n localMin[i] = x[i];\n for (int j = 0; j < i; j++) {\n localMin[i] = std::min(localMin[i], x[j]);\n }\n }\n\n MPI_Allgather(localMin.data(), n, MPI_FLOAT, x.data(), n, 
MPI_FLOAT, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n MPI_Reduce(local_min.data(), x.data(), n, MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &x[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank != 0) {\n x.clear();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float localMin = x[0];\n float globalMin;\n\n #pragma omp parallel for reduction(min : localMin)\n for (int i = 1; i < n; i++) {\n localMin = std::min(localMin, x[i]);\n }\n\n MPI_Allreduce(&localMin, &globalMin, 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n for (int i = 1; i < n; i++) {\n x[i] = globalMin;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of ranks\n int rank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the partial minimums in parallel\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n x[i] = min;\n }\n\n // Gather the partial minimums from all ranks\n std::vector partialMins(x.size() * numRanks);\n MPI_Gather(x.data(), x.size(), MPI_FLOAT, partialMins.data(), x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // If rank 0, store the result in x\n if (rank == 0) {\n x.clear();\n for (int i = 0; i < partialMins.size(); i += numRanks) {\n x.push_back(partialMins[i]);\n }\n }\n}\n", + "<, Synag", + "\n\n int n = x.size();\n int nthreads = omp_get_num_threads();\n int tid = omp_get_thread_num();\n int chunk = n / nthreads;\n int start = tid * chunk;\n int end = start + chunk;\n if (tid == nthreads - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n x[i] = std::min(x[i], *std::min_element(x.begin(), x.begin() + i));\n }\n omp_barrier();\n if (tid == 0) {\n for (int i = chunk; i < n; i += chunk) {\n for (int j = 0; j < chunk; j++) {\n x[i + j] = std::min(x[i + j], x[i + j - chunk]);\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector localMin(n);\n std::vector globalMin(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n localMin[i] = x[i];\n for (int j = 0; j <= i; j++) {\n localMin[i] = std::min(localMin[i], x[j]);\n }\n }\n\n 
MPI_Allreduce(localMin.data(), globalMin.data(), n, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n x = globalMin;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n std::vector min_values(x.size(), 0.0);\n MPI_Allgather(&x[0], x.size(), MPI_FLOAT, &min_values[0], x.size(), MPI_FLOAT, MPI_COMM_WORLD);\n\n // Update the values on rank 0\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::min(x[i], min_values[i]);\n }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n const int n = x.extent(0);\n View max_so_far = \"max_so_far\";\n View max_ending_here = \"max_ending_here\";\n Kokkos::parallel_for(\"max_subarray\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i) {\n if (i == 0) {\n max_ending_here[i] = x[i];\n max_so_far[i] = x[i];\n } else {\n max_ending_here[i] = std::max(x[i], max_ending_here[i - 1] + x[i]);\n max_so_far[i] = std::max(max_so_far[i - 1], max_ending_here[i]);\n }\n });\n return max_so_far[n - 1];\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::View dp = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n dp(i) = x(i);\n } else {\n dp(i) = std::max(dp(i - 1) + x(i), x(i));\n }\n });\n return *Kokkos::max_element(dp);\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n for (int i = 1; i < x.extent(0); ++i) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int max_sum = 0;\n int current_sum = 0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, int& max_sum) {\n current_sum = std::max(x[i], current_sum + x[i]);\n max_sum = std::max(max_sum, current_sum);\n }, max_sum, tag_type());\n return max_sum;\n}\n", + "\n\n using namespace Kokkos;\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n Kokkos::parallel_scan(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far, int& max_ending_here) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n });\n\n return max_so_far;\n}\n", + "\n\n // Initialize the maximum sum to the first element of the array.\n int max_sum = x(0);\n\n // Iterate over the remaining elements of the array.\n for (int i = 1; i < x.extent(0); i++) {\n // If the current element is greater than the sum of the previous element and\n // the maximum sum, then set the maximum sum to the current element.\n if (x(i) > max_sum + x(i - 1)) {\n max_sum = x(i);\n } 
else {\n // Otherwise, add the current element to the maximum sum.\n max_sum += x(i);\n }\n }\n\n // Return the maximum sum.\n return max_sum;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_for(\n \"maximumSubarray\", Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_so_far);\n\n return max_so_far;\n}\n", + "\n\n Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::Kok\u30b3\u30af_VectorRange(x.extent(0)),\n Kok\u30b3\u30af::plus(),\n Kok\u30b3\u30af::make_iota_view(x.extent(0)),\n [&](int i) {\n return Kok\u30b3\u30af::parallel_reduce(\n Kok\u30b3\u30af::Kok\u30b3\u30af_VectorRange(i, x.extent(0)),\n Kok\u30b3\u30af::plus(),\n x[i],\n [&](int j) { return x[j]; });\n });\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Max;\n using reduction_functor_type = Kokkos::Max;\n using reduction_policy_type = Kokkos::Scan;\n\n int max_so_far = 0;\n int max_ending_here = 0;\n\n Kokkos::parallel_reduce(\n reduction_policy_type(x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_so_far) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_so_far);\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_ending_here, int& max_so_far) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_ending_here, max_so_far);\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n for (int i = 1; i < 
x.extent(0); i++) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(int i, int& max_so_far, int& curr_max) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n },\n max_so_far, curr_max);\n\n return max_so_far;\n}\n", + "\n\n // Initialize the maximum sum to the first element of the array\n int max_sum = x(0);\n\n // Initialize the current sum to the first element of the array\n int current_sum = x(0);\n\n // Loop through the remaining elements of the array\n for (int i = 1; i < x.extent(0); i++) {\n // Update the current sum by adding the current element\n current_sum = std::max(x(i), current_sum + x(i));\n\n // Update the maximum sum if the current sum is greater\n max_sum = std::max(max_sum, current_sum);\n }\n\n // Return the maximum sum\n return max_sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_sum = INT_MIN;\n int sum = 0;\n #pragma omp parallel for reduction(max:max_sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n max_sum = std::max(max_sum, sum);\n if (sum < 0) {\n sum = 0;\n }\n }\n return max_sum;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n max_ending_here = std::max(x[i], max_ending_here + x[i]);\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return 
max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n int n = x.size();\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n max_ending_here = std::max(x[i], max_ending_here + x[i]);\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (size_t i = 0; i < x.size(); ++i) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0, max_ending_here = 0;\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (size_t i = 0; i < x.size(); ++i) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_sum = INT_MIN;\n int current_sum = 0;\n #pragma omp parallel for 
reduction(max:max_sum)\n for (int i = 0; i < x.size(); ++i) {\n current_sum = std::max(x[i], current_sum + x[i]);\n max_sum = std::max(max_sum, current_sum);\n }\n return max_sum;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n int n = x.size();\n #pragma parallel for reduction(+:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "serial", + "prompt": "/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. 
if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN;\n int max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 
0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "ontiguous, Synag", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Store the result in sum.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\n__global__ void maximumSubarray(const int *x, size_t N, int *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = hipMax(x[i], max_ending_here + x[i]);\n max_so_far = hipMax(max_so_far, max_ending_here);\n }\n hipAtomicMax(sum, &max_so_far);\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid >= N) return;\n\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = block.thread.x;\n int stride = block.thread.x + block.thread.y * block.dim.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int stride = hipBlockDim_x * hipGridDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int s = 0;\n int max = 0;\n for (size_t i = tid; i < N; i += blockDim.x * gridDim.x) {\n s = max(x[i], s + x[i]);\n max = max(max, s);\n }\n atomicAdd(sum, max);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int stride = bid_size * hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = start; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int tid_offset = tid + bid * bid_size;\n int sum_local = 0;\n int max_sum = 0;\n if (tid_offset < N) {\n sum_local = x[tid_offset];\n }\n __syncthreads();\n for (int i = tid_offset + bid_size; i < N; i += bid_size) {\n sum_local = max(sum_local + x[i], x[i]);\n max_sum = max(max_sum, 
sum_local);\n }\n if (tid == 0) {\n *sum = max_sum;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n __syncthreads();\n if (tid == 0) *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (i < N) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n i += hipBlockDim_x * hipGridDim_x;\n }\n *sum = max_so_far;\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int stride = blockDim.x * gridDim.x;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = tid; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bsize = hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int stride = bsize * hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N && i < start + stride; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n if (tid == 0) *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipBlockDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n hipAtomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int 
max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n hipAtomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int stride = bid_size * hipGridDim_x;\n\n int local_max = 0;\n int global_max = 0;\n\n for (int i = tid; i < N; i += stride) {\n local_max = max(x[i], local_max + x[i]);\n global_max = max(global_max, local_max);\n }\n\n if (tid == 0) {\n *sum = global_max;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n int max_so_far = 0, max_ending_here = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_max;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_sum = x[0];\n int sum = x[0];\n for (int i = 1; i < n; i++) {\n sum = std::max(x[i], sum + x[i]);\n max_sum = std::max(max_sum, sum);\n }\n return max_sum;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n int global_max;\n MPI_Allreduce(&max_so_far, &global_max, 
1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "(4, -1, 2, 1) = 6.", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n int maxSum = INT_MIN;\n int sum = 0;\n for (int i = myRank; i < x.size(); i += numRanks) {\n sum += x[i];\n maxSum = std::max(maxSum, sum);\n if (sum < 0) {\n sum = 0;\n }\n }\n\n MPI_Allreduce(&maxSum, &maxSum, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxSum;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", 
+ "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n\n return max_so_far;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Store the result in sum.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\n__global__ void maximumSubarray(const int *x, size_t N, int *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Each thread computes the sum of its subarray\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int partialSum = 0;\n for (int i = tid; i < N; i += blockDim.x * gridDim.x) {\n partialSum += x[i];\n }\n\n // Each thread writes its partial sum to global memory\n __shared__ int partialSums[blockDim.x];\n partialSums[threadIdx.x] = partialSum;\n __syncthreads();\n\n // Each thread computes the maximum of its partial sum and the partial sums of the previous threads\n for (int i = blockDim.x / 2; i > 0; i >>= 1) {\n if (threadIdx.x < i) {\n partialSums[threadIdx.x] = max(partialSums[threadIdx.x], partialSums[threadIdx.x + i]);\n }\n __syncthreads();\n }\n\n // The first thread writes the maximum sum to global memory\n if (threadIdx.x == 0) {\n *sum = partialSums[0];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n\n int maxSoFar = x[tid];\n int maxEndingHere = x[tid];\n\n for (int i = tid + blockDim; i < N; i += blockDim * gridDim) {\n maxEndingHere = maxEndingHere + x[i];\n if (maxSoFar < maxEndingHere) {\n maxSoFar = maxEndingHere;\n }\n if (maxEndingHere < 0) {\n maxEndingHere = 0;\n }\n }\n\n int maxOverall = maxSoFar;\n\n if (blockId == 0) {\n for (int i = 1; i < gridDim; i++) {\n int temp = maxOverall;\n maxOverall = max(maxOverall, maxSoFar);\n maxSoFar = max(temp, maxSoFar);\n }\n }\n\n if (blockId == 0) {\n *sum = maxOverall;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n int max_sum = 
INT_MIN;\n int current_sum = 0;\n for (int i = start; i < end; i++) {\n current_sum = max(x[i], current_sum + x[i]);\n max_sum = max(max_sum, current_sum);\n }\n sum[tid] = max_sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int block_start = block_id * block_size;\n int block_end = min(block_start + block_size, N);\n int max_so_far = x[block_start];\n int max_ending_here = x[block_start];\n for (int i = block_start + 1; i < block_end; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n __shared__ int shared_max[block_size];\n shared_max[tid] = max_so_far;\n __syncthreads();\n if (tid == 0) {\n int max_overall = shared_max[0];\n for (int i = 1; i < block_size; i++) {\n max_overall = max(max_overall, shared_max[i]);\n }\n *sum = max_overall;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_ending_here = x[0];\n int max_so_far = x[0];\n for (size_t i = 1; i < N; ++i) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n int start = tid + block * stride;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (start < N) {\n max_ending_here = max_ending_here + x[start];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n start += stride;\n }\n *sum = max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int sum = 0;\n int maxSum = -2147483648;\n for (int i = tid; i < N; i += blockIdx.x * blockDim.x) {\n sum += x[i];\n maxSum = max(maxSum, sum);\n if (sum < 0) sum = 0;\n }\n sum = maxSum;\n *sum = maxSum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_sum = 0;\n int max_sum = 0;\n int max_sum_so_far = 0;\n int start = 0;\n int end = 0;\n int s = threadIdx.x;\n int e = threadIdx.x + blockDim.x - 1;\n if (e >= N) e = N - 1;\n for (int i = s; i <= e; i++) {\n my_sum += x[i];\n if (my_sum > max_sum_so_far) {\n max_sum_so_far = my_sum;\n start = s;\n end = i;\n }\n if (my_sum < 0) {\n 
my_sum = 0;\n }\n }\n if (max_sum_so_far > max_sum) {\n max_sum = max_sum_so_far;\n *sum = max_sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = tid; i < N; i += blockDim.x) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n atomicAdd(sum, max_so_far);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; ++i) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + 
"runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int stride = blockDim * gridDim;\n int start = tid + blockId * blockDim;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (start < N) {\n max_ending_here = max_ending_here + x[start];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n start += stride;\n }\n *sum = max_so_far;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n }\n *sum = max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n for (size_t i = 1; i < N; ++i) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int sum_so_far = 0;\n int max_sum = 0;\n if (tid < N) {\n sum_so_far = x[tid];\n max_sum = x[tid];\n }\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n sum_so_far = fmax(x[i], sum_so_far + x[i]);\n max_sum = fmax(max_sum, sum_so_far);\n __syncthreads();\n }\n if (tid == 0) { *sum = max_sum; }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int max_so_far = 0;\n int max_ending_here = 0;\n\n for (int i = tid; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n sum[bid] = max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = 
x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n int chunk_size = n / num_ranks;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == num_ranks - 1) {\n end = n;\n }\n#pragma omp parallel for reduction(max : max_ending_here)\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n MPI_Allreduce(&max_so_far, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_so_far;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk = n / nprocs;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == nprocs - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int max_global;\n MPI_Reduce(&max_so_far, &max_global, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_global;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk_size = n / nprocs;\n int start 
= rank * chunk_size;\n int end = start + chunk_size;\n if (rank == nprocs - 1) {\n end = n;\n }\n#pragma omp parallel\n {\n int max_so_far_local = 0, max_ending_here_local = 0;\n#pragma omp for\n for (int i = start; i < end; i++) {\n max_ending_here_local = max_ending_here_local + x[i];\n if (max_so_far_local < max_ending_here_local) {\n max_so_far_local = max_ending_here_local;\n }\n if (max_ending_here_local < 0) {\n max_ending_here_local = 0;\n }\n }\n#pragma omp critical\n {\n if (max_so_far < max_so_far_local) {\n max_so_far = max_so_far_local;\n }\n }\n }\n MPI_Reduce(&max_so_far, &max_ending_here, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_ending_here;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallel for\n#pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // MPI reduction\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel\n {\n int local_max_so_far = INT_MIN, local_max_ending_here = 0;\n #pragma omp for\n for (int i = rank; i < n; i += nprocs) {\n local_max_ending_here = local_max_ending_here + x[i];\n if (local_max_so_far < local_max_ending_here)\n local_max_so_far = local_max_ending_here;\n if (local_max_ending_here < 0)\n local_max_ending_here = 0;\n }\n #pragma omp critical\n {\n if (max_so_far < local_max_so_far)\n max_so_far = local_max_so_far;\n }\n }\n return max_so_far;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int max_local = 0;\n int max_global = 0;\n\n #pragma omp parallel for reduction(max : max_local)\n for (int i = 0; i < n; i++) {\n max_local = std::max(x[i], max_local + x[i]);\n }\n\n MPI_Allreduce(&max_local, &max_global, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return max_global;\n } else {\n return 0;\n }\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel num_threads(4)\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n\n int global_max = max_so_far;\n MPI_Allreduce(&global_max, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallelization\n #pragma omp parallel\n {\n // Each thread computes the maximum subarray sum for its 
portion of the array\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n\n // MPI reduction to find the maximum subarray sum across all ranks\n int global_max;\n MPI_Allreduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n int chunk_size = n / num_threads;\n int start = tid * chunk_size;\n int end = start + chunk_size;\n if (tid == num_threads - 1) {\n end = n;\n }\n\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n int chunk_size = n / num_ranks;\n int start = my_rank * chunk_size;\n int end = start + chunk_size;\n if (my_rank == num_ranks - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // Reduce the maximum subarray sum across all ranks\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallel for loop\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // MPI reduce to get the maximum subarray sum\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_max;\n } else {\n return 0;\n }\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; 
i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = std::min(start + chunk_size, n);\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int nthreads = omp_get_num_threads();\n int chunk = n / nthreads;\n int start = tid * chunk;\n int end = (tid + 1) * chunk;\n if (tid == nthreads - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_max;\n }\n return 0;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(Kokkos::View const& x, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n const int N = x.extent(0);\n parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = sum + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output);\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = output(i - 1) + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i, double& update) {\n update += x(i);\n output(i) = update;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = sum + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output(0));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n Kokkos::parallel_for(\"prefix_sum\", Kokkos::RangePolicy(0, output.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = x(i);\n } else {\n output(i) = output(i - 1) + x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output(0));\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n const size_t n = x.extent(0);\n output(0) = x(0);\n parallel_for(policy_type(1, n), KOKKOS_LAMBDA(const size_t i) {\n output(i) = output(i - 1) + x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"prefixSum\", output.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "Kok", + "\n\n const int N = x.extent(0);\n 
Kokkos::parallel_for(\"prefix_sum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n const size_t size = x.extent(0);\n output(0) = x(0);\n parallel_for(policy_type(1, size), KOKKOS_LAMBDA(const size_t i) {\n output(i) = output(i - 1) + x(i);\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::Range(0, N), KOKKOS_LAMBDA (const int i) {\n if (i == 0) {\n output(i) = x(i);\n } else {\n output(i) = output(i - 1) + x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use OpenMP to compute in parallel.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "<", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n double sum = 0;\n for (int j = 0; j <= i; j++) {\n sum += x[j];\n }\n output[i] = sum;\n }\n}\n", + "\n\n #pragma omp 
parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "serial", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (std::size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "<", + "<", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\n__global__ void prefixSum(const double* x, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n if (tid > 0) { output[tid] += output[tid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n \n if (gid < N) {\n output[gid] = x[gid];\n }\n \n __syncthreads();\n \n for (int stride = 1; stride < N; stride *= 2) {\n if (gid < N && gid + stride < N) {\n output[gid + stride] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bsize = blockDim.x;\n int stride = bsize * bid;\n int i = stride + tid;\n if (i < N) {\n output[i] = x[i];\n if (i > 0) { output[i] += output[i - 1]; }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (gid % (2 * i) == 0 && gid + i < N) {\n output[gid + i] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n output[idx] = x[idx];\n if (idx > 0) {\n 
output[idx] += output[idx - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Store the result in output on rank 0.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n double global_sum = 0;\n\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n output.push_back(global_sum);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n MPI_Allreduce(&local_sum, &output[i], 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n 
MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partialSum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n partialSum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partialSums(size);\n MPI_Gather(&partialSum, 1, MPI_DOUBLE, partialSums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partialSums[0];\n for (size_t i = 1; i < size; ++i) {\n output[i] = output[i - 1] + partialSums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // 
Compute the prefix sum on each rank\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local sums to rank 0\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n for (int i = 1; i < size; ++i) {\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allgather(&local_sum, 1, MPI_DOUBLE, &global_sum, 1, MPI_DOUBLE, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector partial_sum(n);\n MPI_Allreduce(x.data(), partial_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n output = partial_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] - x[i - 1];\n }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\n__global__ void prefixSum(const double *x, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (gid % (2 * i) == 0 && gid + i < N) {\n output[gid] += output[gid + i];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n output[idx] = 0;\n for (int i = 0; i <= idx; i++) {\n output[idx] += x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 2; stride <= N; stride *= 2) {\n if (tid % stride == 0 && tid + stride / 2 < N) {\n output[tid] += output[tid + stride / 2];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": 
true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0082232372, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00089080960751 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n // Compute the prefix sum of the first bid_size elements in x\n double sum = 0;\n for (int i = tid; i < bid_size; i++) {\n sum += x[i];\n }\n\n // Store the prefix sum in the output array\n output[tid] = sum;\n\n // Synchronize the threads in the block\n __syncthreads();\n\n // Compute the prefix sum of the remaining elements in x\n for (int i = bid_size; i < N; i += bid_size) {\n // Load the prefix sum of the previous block\n sum = output[i - bid_size];\n\n // Compute the prefix sum of the current block\n for (int j = tid; j < bid_size; j++) {\n sum += x[i + j];\n }\n\n // Store the prefix sum in the output array\n output[i + tid] = sum;\n\n // Synchronize the threads in the block\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n // Compute the prefix sum\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n if (gid < N) {\n output[gid] = x[gid];\n }\n\n __syncthreads();\n\n for (int offset = 1; offset < N; offset *= 2) {\n if (gid + offset < N) {\n output[gid + offset] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n output[tid + i] += output[tid];\n }\n 
__syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Each thread computes the prefix sum of a subset of the elements in x.\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0082404998, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00087371518612 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n // Compute the prefix sum of the current block\n double block_sum = 0;\n for (int i = tid; i < N; i += bid_size) {\n block_sum += x[i];\n output[i] = block_sum;\n }\n\n // Synchronize the threads in the block\n __syncthreads();\n\n // Compute the prefix sum of the previous blocks\n if (tid == 0) {\n for (int i = bid + 1; i < N; i += bid_size) {\n output[i] += output[i - 1];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int size = blockDim.x;\n int stride = size * gridDim.x;\n int start = tid + bid * stride;\n double sum = 0;\n for (int i = start; i < N; i += stride) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, 
+ "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0082385702, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088155521154 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "_t", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0082147114, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00085205118656 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the prefix sum of the vector x into output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Store the result in output on rank 0.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+: local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Gather the local prefix sums\n std::vector global_sum(x.size(), 0);\n MPI_Allreduce(output.data(), global_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM);\n\n // Store the global prefix sum on rank 0\n if (rank == 0) {\n output = global_sum;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n // Compute the local prefix sum\n std::vector local_prefix_sum(n);\n local_prefix_sum[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n // Compute the global prefix sum\n std::vector global_prefix_sum(n);\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n // Store the result on rank 0\n if (rank == 0) {\n output = global_prefix_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Reduce(partial_sum.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Bcast(output.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n #pragma omp parallel for\n for (int j = 1; j < n; j++) {\n output[j] += output[j - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes the sum of its local data\n double local_sum = 0;\n #pragma omp parallel for reduction(+: local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n // Gather the local sums into the output vector on rank 0\n MPI_Gather(&local_sum, 1, MPI_DOUBLE, output.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n \n // Rank 0 computes the prefix sum\n if (rank == 0) {\n for (size_t i = 1; i < size; ++i) {\n output[i] += output[i - 1];\n }\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the partial sum on each rank\n double partialSum = 0;\n #pragma omp parallel for reduction(+:partialSum)\n for (int i = 0; i < x.size(); i++) {\n partialSum += x[i];\n }\n\n // Gather the partial sums from all ranks\n std::vector partialSums(numRanks);\n MPI_Gather(&partialSum, 1, MPI_DOUBLE, partialSums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (myRank == 0) {\n output[0] = 0;\n for (int i = 1; i < numRanks; i++) {\n output[i] = output[i - 1] + partialSums[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce local prefix sums to get global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce local prefix sums to global prefix sum\n MPI_Reduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n, 0);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n if (i > 0) {\n partial_sum[i] += partial_sum[i - 1];\n }\n }\n\n std::vector global_sum(n, 0);\n MPI_Allreduce(partial_sum.data(), 
global_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0.0;\n #pragma omp parallel for reduction(+:partial_sum)\n for (int i = 0; i < x.size(); ++i) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; ++i) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n if (i > 0) {\n partial_sum[i] += partial_sum[i - 1];\n }\n }\n\n MPI_Reduce(partial_sum.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": 
"transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) *= -1;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n int &xi = x(i);\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n int &xi = x(i);\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n int val = x(i);\n if (val % 2 == 0) {\n x(i) = val / 2;\n } else {\n x(i) = -val;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) = x(i) / 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + 
"\n\n Kok koste::parallel_for(0, x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "omp", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + ":", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma\u00a0omp\u00a0parallel\u00a0for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] *= -1;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "serial", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "negate, Synag", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (auto &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "negate, Synag", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "hip", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\n__global__ void negateOddsAndHalveEvens(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipGetGlobalThreadIdx();\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0)\n x[tid] /= 2;\n else\n x[tid] = -x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n" + ] + }, + { + "problem_type": 
"transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "E", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0,\n MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0,\n MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[0], n / size, MPI_INT, &x[0], n / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Negate odd values\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n }\n }\n\n // Halve even values\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n }\n }\n\n // Gather results on rank 0\n if (rank == 0) {\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_INT, result.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n x = result;\n } else {\n MPI_Gather(nullptr, 0, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, 
MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), 
MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its portion of x.\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(x.data(), x.size(), MPI_INT, x.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data(), x.size(), MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its portion of the vector.\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "cuda", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\n__global__ void negateOddsAndHalveEvens(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000977151, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 6.16704002e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009815614, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 6.343040019e-05 + } + ] + }, + { + "generated_output": "\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009768144, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 5.738239959e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009782172, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 6.650239974e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009762832, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 5.964800045e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009767706, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 5.839999989e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 
0.000977584, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 3.072640039e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009772472, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 5.722880065e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009773814, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 6.787200123e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009767504, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 5.783680081e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009777944, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 6.967680007e-05 + } + ] + }, + { + "generated_output": "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009773494, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.559359968e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009777142, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 6.936320066e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009774174, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 6.780800074e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n 
x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000976754, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 5.797120035e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009763818, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 6.761599928e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000977097, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 5.907840058e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009784354, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 3.308160044e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009776578, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 3.12128e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0)\n x[tid] /= 2;\n else\n x[tid] = -x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009774598, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.501119971e-05 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int n_local = n / nprocs;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], n_local, MPI_INT, &x[0], n_local, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its part of the vector\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], end - start, MPI_INT, &x[0], end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int local_size = x.size() / size;\n int remain = x.size() % size;\n int local_start = rank * local_size;\n int local_end = local_start + local_size;\n if (rank < remain) {\n local_end++;\n }\n\n#pragma omp parallel for\n for (int i = local_start; i < local_end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(MPI_IN_PLACE, local_end - local_start, MPI_INT, x.data(), local_end - local_start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int myRank, nProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &nProcs);\n\n int n = x.size();\n int chunkSize = (n + nProcs - 1) / nProcs;\n int begin = myRank * chunkSize;\n int end = std::min(begin + chunkSize, n);\n\n #pragma omp parallel for\n for (int i = begin; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[begin], end - begin, MPI_INT, &x[0], end - begin, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] = x[i] / 2;\n }\n }\n MPI_Gather(&x[start], chunk, MPI_INT, &x[0], chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its part of the vector\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int local_n = n / nprocs;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if 
(x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, n_local, MPI_INT, x.data(), n_local, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], chunk, MPI_INT, &x[0], chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "kokkos", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(Kokkos::View const& x, Kokkos::View &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKok\u96f9::ParallelFor(Kok\u96f9::Range(x.size()), [&](int i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t});\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const int& i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t});\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", mask.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + ",", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "omp", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t#pragma omp parallel 
for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "serial", + "prompt": "bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t mask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tmask.reserve(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + 
"\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.resize(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tmask.reserve(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t mask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "hip", + "prompt": "__device__ bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\n__global__ void mapPowersOfTwo(const int *x, bool *mask, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = get_global_id(0);\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + ", 0, 9, 7, 15, 64, 3]", + ", 0, 9, 7, 15, 64, 3]", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "mpi", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored in mask on rank 0.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint chunk_size = (n + size - 1) / size;\n\tint start = rank * chunk_size;\n\tint end = std::min(start + chunk_size, n);\n\n\tstd::vector local_mask(end - start);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), end - start, MPI_BOOL, mask.data(), end - start, MPI_BOOL, 0);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tmask.resize(n);\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), n / size, MPI_BOOL, mask.data(), n / size, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_C_BOOL, mask.data(), local_n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tmask.resize(x.size());\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), mask.size(), MPI_C_BOOL, nullptr, 0, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_C_BOOL, mask.data(), local_n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tstd::vector localMask(x.size());\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tif (rank == 0) {\n\t\tmask = localMask;\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tstd::vector recvMask(x.size());\n\t\t\tMPI_Recv(&recvMask[0], x.size(), MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t\tfor (size_t j = 0; j < x.size(); j++) {\n\t\t\t\tmask[j] = mask[j] || recvMask[j];\n\t\t\t}\n\t\t}\n\t} else {\n\t\tMPI_Send(&localMask[0], x.size(), MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the result\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), mask.size(), MPI_BOOL, mask.data(), mask.size(), MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the mask\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t}\n\telse {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the mask\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint 
rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_BOOL, mask.data(), local_n, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "cuda", + "prompt": "__device__ bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\n__global__ void mapPowersOfTwo(const int *x, bool *mask, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0033256236, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.634879984e-05 + } + ] + }, + { + "generated_output": "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.003328499, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.98272001e-05 + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\twhile (i < N) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t\ti += blockDim.x * gridDim.x;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0033295808, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.551360004e-05 + } + ] + }, + { + "generated_output": "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0033251982, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.612159982e-05 + } + ] + }, + { + "generated_output": "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0033293964, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.661440022e-05 + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0033310438, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 3.310719915e-05 + } + ] + }, + { + "generated_output": "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0033275372, + "runs": [ + { + 
"did_run": true, + "is_valid": true, + "runtime": 1.476479974e-05 + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0033385578, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.741120011e-05 + } + ] + }, + { + "generated_output": "\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0033234232, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 3.327999972e-05 + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0033231402, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.904959992e-05 + } + ] + }, + { + "generated_output": "\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0033242208, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.486719996e-05 + } + ] + }, + { + "generated_output": "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0033266174, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.517440006e-05 + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0033247158, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.496319994e-05 + } + ] + }, + { + "generated_output": "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0033250802, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.960319951e-05 + } + ] + }, + { + "generated_output": "\n\tint i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": 
true, + "are_all_valid": true, + "best_sequential_runtime": 0.0033256092, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.121599987e-05 + } + ] + }, + { + "generated_output": "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.003342265, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.398400012e-05 + } + ] + }, + { + "generated_output": "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0033243208, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.607680023e-05 + } + ] + }, + { + "generated_output": "\n\n\tint idx = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0033240522, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.490560006e-05 + } + ] + }, + { + "generated_output": "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0033272084, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.455999985e-05 + } + ] + }, + { + "generated_output": "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0033245314, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.153599933e-05 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored in mask on rank 0.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own mask\n\tstd::vector local_mask(x.size());\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the local masks into the final mask on rank 0\n\tif (rank == 0) {\n\t\tmask.resize(x.size());\n\t}\n\tMPI_Gather(local_mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < static_cast(x.size()); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) 
{\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own mask\n\tstd::vector localMask(x.size());\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Rank 0 gathers all the local masks\n\tif (rank == 0) {\n\t\tmask.resize(x.size());\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tstd::vector recvMask(x.size());\n\t\t\tMPI_Recv(recvMask.data(), x.size(), MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t\tfor (int j = 0; j < x.size(); j++) {\n\t\t\t\tmask[j] |= recvMask[j];\n\t\t\t}\n\t\t}\n\t} else {\n\t\tMPI_Send(localMask.data(), x.size(), MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint n = x.size();\n\tmask.resize(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), n, MPI_BOOL, mask.data(), n, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its local part of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < 
n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the result\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the result\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * mask.size() / size], mask.size() / size, MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * mask.size() / size], mask.size() / size, MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(Kokkos::View<double*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector<double> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", +
"\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "serial", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &element : x) {\n element = 1 - 1 / element;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &i : x) {\n i = 1 - 1 / i;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &val : x) {\n val = 1 - 1 / val;\n }\n}\n", + "\n\n for (double &i : x) {\n i = 1 - 1 / i;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": 
"57_transform_inverse_offset", + "parallelism_model": "hip", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\n__global__ void oneMinusInverse(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n x[gid] = 1 - 1 / x[gid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector<double> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector<double> recv;\n MPI_Recv(recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector<double> result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector<double> global_result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector<double> result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector<double> gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, gathered_x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], x.size() / size, MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i 
< n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = 1 - 1 / x[i];\n }\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE, NULL, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 has the final result\n if (rank == 0) {\n x.resize(x.size() * size);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], 
x.size() / size,\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the inverse of its elements\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], x.size() / size,\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its part of the result\n for (size_t i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "cuda", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\n__global__ void oneMinusInverse(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014641022, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.333439961e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014616278, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 6.939519942e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014617616, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.576639988e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014635614, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.38912005e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014628522, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 6.753279865e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014630446, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.591359988e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001461742, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.08767999e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / 
x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001465896, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.909759976e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014627238, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.697600015e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014629304, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.282880023e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014622168, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.248960026e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014624014, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.161280029e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014623972, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.696959972e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014626336, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.568960004e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001462748, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.59647999e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * 
blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001462263, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.25536e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014631826, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.701440044e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) { x[tid] = 1.0 - 1.0 / x[tid]; }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001463327, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.936640032e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001463375, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.978240013e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014636, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.439680025e-05 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int local_size = x.size() / num_procs;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n if (rank == num_procs - 1) {\n end = x.size();\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n MPI_Reduce(x.data(), x.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank == size - 1) ? 
n : start + chunk_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * chunk_size;\n int end = (i == size - 1) ? n : start + chunk_size;\n MPI_Recv(&x[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = global_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n const int n = x.size();\n const int chunk = (n + size - 1) / size;\n const int start = rank * chunk;\n const int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; ++i) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. 
Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "serial", + "prompt": "/* Compute the ReLU function on every element of x. 
Elements less than zero become zero,\n while elements greater than zero stay the same.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "hip", + "prompt": "/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\n__global__ void relu(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = fmax(0.0, x[idx]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? 
x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + ": [-1.8, 24.0, 1.2, 0.0, 5.1, 0.2, 4.5]", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n std::vector recv_buf(x.size());\n for (int i = 1; i < MPI_COMM_WORLD.size(); i++) {\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector temp(x.size());\n MPI_Recv(temp.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += temp[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + 
"\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks to rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / 
size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk_size 
= n / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "cuda", + "prompt": "/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\n__global__ void relu(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = fmax(x[tid], 0.0);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009820388, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.91903998e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009886268, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.764160067e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010425666, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.887040034e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = fmax(0, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001021156, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.845440015e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? 
x[tid] : 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000988224, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.85375993e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = fmax(0, x[idx]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = fmax(0.0, x[idx]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010098088, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.897279963e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010476098, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.653439969e-05 + } + ] + }, + { + "generated_output": "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] < 0 ? 0 : x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010767792, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.94719997e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010275602, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.706560001e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010056046, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.844160005e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? 
x[idx] : 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010562182, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.861439988e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = fmax(0, x[idx]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009886546, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.890240058e-05 + } + ] + }, + { + "generated_output": "\n\n int index = blockIdx.x * blockDim.x + threadIdx.x;\n if (index < N) {\n x[index] = fmax(0, x[index]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009925182, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.894720018e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadId.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010007334, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.898559973e-05 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = fmax(0.0, x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000979157, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.725120068e-05 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the ReLU function on every element of x. 
Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma opm parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Reduce(&x[0], &x[0], x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int chunk_size = n / nprocs;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == nprocs - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, local_size, MPI_DOUBLE, x.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n #pragma omp parallel for\n for (int i = start_index; i < end_index; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start_index], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n if (rank == size - 1) {\n end = x.size();\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n MPI_Gather(&x[start], chunk_size, MPI_DOUBLE, &x[0], chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the ReLU function on its portion of x\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n 
MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the ReLU function on its own copy of x\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks onto rank 0\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 has the final result\n if (rank == 0) {\n x.resize(x.size() * size);\n MPI_Gatherv(nullptr, 0, MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf;\n MPI_Recv(&recv_buf, x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv_buf.begin(), recv_buf.end());\n }\n } else {\n MPI_Send(&x, x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the ReLU function on every element of x\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < MPI_COMM_WORLD; i++) {\n std::vector recv_buf(n);\n MPI_Recv(recv_buf.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use Kokkos to 
compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, x.extent(0)), KOK_LAMBDA(int i) {\n x[i] = x[i] * x[i];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= 
x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n #pragma simd\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "serial", + "prompt": "/* Replace every element of x with the square of its value.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + ":", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &xi : x) {\n xi = xi * xi;\n }\n}\n", + "\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "hip", + "prompt": "/* Replace every element of x with the square of its value.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\n__global__ void squareEach(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data(), n / size, MPI_INT, x.data(), n / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_Comm_world, &rank);\n\n // Get the number of processes\n int size;\n MPI_Comm_size(MPI_Comm_world, &size);\n\n // Each process squares its own elements\n for (int i = rank; i < x.size(); i += size) {\n x[i] = x[i] * x[i];\n }\n\n // Gather all the squared elements on rank 0\n MPI_Allgather(MPI_IN_PLACE, 0, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_Comm_world);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank squares its elements\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the squared elements\n std::vector squared;\n if (rank == 0) {\n squared.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, squared.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Rank 0 stores the final result\n if (rank == 0) {\n x = squared;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Status status;\n MPI_Request request;\n MPI_Irecv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (int i = rank; i < x.size(); i += size) {\n x[i] *= x[i];\n }\n MPI_Gather(x.data(), x.size() / size, MPI_INT, x.data(), x.size() / size,\n MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Square each element of x\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Gather all the squared elements on rank 0\n std::vector squared_x;\n if (rank == 0) {\n squared_x.resize(x.size() * 
MPI::COMM_WORLD.Get_size());\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, squared_x.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Copy the squared elements back to x on rank 0\n if (rank == 0) {\n x = squared_x;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector temp(x.size());\n MPI_Recv(&temp[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += temp[j];\n }\n }\n }\n}\n", + "(", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n MPI_Gather(x.data(), n / size, MPI_INT, x.data(), n / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector tmp(n);\n MPI_Recv(tmp.data(), n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n 
}\n\n MPI_Gather(&x[0], n / size, MPI_INT, &x[0], n / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "cuda", + "prompt": "/* Replace every element of x with the square of its value.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\n__global__ void squareEach(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006645732, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.716799974e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00066012, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.080000006e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006592306, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.891519964e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006780264, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.137599997e-05 + } + ] + }, + { + "generated_output": "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "source_write_success": true, + 
"did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006591844, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.577279955e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006590786, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.155519985e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006607196, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.189440019e-05 + } + ] + }, + { + "generated_output": "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0007140724, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.248959988e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006612004, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 5.869439915e-05 + } + ] + }, + { + "generated_output": "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006601106, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.992959976e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000660435, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.35775996e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006605972, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.588800006e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "source_write_success": 
true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006600066, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.157439999e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = thread_idx_1;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006676406, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 5.25568001e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006645186, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.465920001e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006607554, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.264959998e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006599144, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.158719972e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000659301, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.595839985e-05 + } + ] + }, + { + "generated_output": "\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006601286, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.816000015e-05 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace every element of x with the square of its value.\n Use MPI and OpenMP to compute in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank squares its own elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(n);\n MPI_Recv(&recv[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, chunk, MPI_INT, x.data(), chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n int recv_buf[n];\n MPI_Recv(&recv_buf[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv_buf[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank squares its own elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector temp(n);\n MPI_Recv(&temp[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += temp[j];\n }\n }\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Each rank squares its elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < MPI_COMM_WORLD; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(n);\n MPI_Recv(&recv[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Square each element of x in parallel using OpenMP\n #pragma omp parallel for\n for (int 
i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Gather the squared elements from all ranks to rank 0\n std::vector squared_x(x.size());\n MPI_Gather(x.data(), x.size(), MPI_INT, squared_x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = squared_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_size, MPI_INT, &x[0], local_size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_size = x.size() / size;\n int start = my_rank * local_size;\n int end = start + local_size;\n if (my_rank == size - 1) {\n end = x.size();\n }\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n MPI_Gather(&x[start], local_size, MPI_INT, &x[0], local_size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use Kokkos to parallelize. 
Assume Kokkos has already been initialized.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, Kokkos::View const& A, Kokkos::View const& x, double beta, Kokkos::View &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(i); j < A.extent(i+1); ++j) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(i); j > 0; --j) {\n y(i) += alpha * A(i, j - 1).value * x(A(i, j - 1).column);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i, double &y_i) {\n y_i = beta * y_i;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n y_i += alpha * A(j).value * x(A(j).column);\n }\n }\n }, y);\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n auto spvmv_lambda = KOKKOS_LAMBDA (const COOElement *a, double *y) {\n y[a->row] += a->value * alpha * x[a->column];\n };\n\n KOKKOS_PARALLEL_EXEC(KOKKOS_RANGE_POLICY(0, A.size), spvmv_lambda, A.data, y);\n\n KOKKOS_PARALLEL_FOR(KOKKOS_RANGE_POLICY(0, M), KOKKOS_LAMBDA(size_t i) {\n y[i] = y[i] * alpha + 
x[i] * alpha;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", M, KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i, double &y_i) {\n y_i = beta * y_i;\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n y_i += alpha * A(j).value * x(A(j).column);\n }\n }\n }, y);\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.row_map(i); j < A.row_map(i + 1); ++j) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, M), [&](size_t i) {\n y[i] = alpha * Kok\u30b3\u30af::reduce(Kok\u30b3\u30af::Range(0, A.size()), 0.0, [&](size_t j, double sum) {\n if (A[j].row == i) {\n return sum + A[j].value * x[A[j].column];\n }\n return sum;\n }) + uyi;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use OpenMP to parallelize.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * 
x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for schedule(static)\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n 
}\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n if (e.row < M && e.column < N) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] = y[i] * alpha + y[i] *beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n if (e.row < M && e.column < N) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : 
A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format with sizeA elements.\n A has dimensions MxN, x has N values, and y has M values.\n Use AMD HIP to parallelize. The kernel will be launched with at least sizeA threads.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\n__global__ void spmv(double alpha, const COOElement *A, size_t sizeA, const double *x, double beta, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n size_t tid = hipThreadIdx_x;\n if (tid < M) {\n atomicMul(&y[tid], beta);\n }\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < M) {\n double dot = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == tid) {\n dot += A[i].value * x[A[i].column];\n }\n }\n y[tid] = alpha * dot + \u3079\u305f * y[tid];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[blockIdx.x], beta);\n}\n", + "\n\n size_t i = hipThreadIdx_x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n atomicAdd(&y[row], alpha * val * x[col]);\n }\n atomicMul(&y[hipThreadIdx_x], beta);\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double 
sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = beta * y[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += alpha * A[i].value * x[A[i].column];\n }\n }\n y[row] = sum;\n }\n}\n", + "\nfor (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {\n double dot = 0.0;\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == i) {\n dot += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * dot + \u3079\u305f * y[i];\n }\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[i], beta);\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double dot = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n dot += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * dot + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t block_size = hipBlockDim_x;\n size_t block_id = hipBlockIdx_x;\n size_t start = block_id * block_size + tid;\n size_t end = min(start + block_size, sizeA);\n\n for (size_t i = start; i < end; i++) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n\n y[tid] *= beta;\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": 
"47_sparse_la_spmv", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use MPI to parallelize. Assume MPI has already been initialized.\n Every rank has a complete copy of A, x, and y. Store the result in y on rank 0.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the result\n for (size_t i = rank; i < M; i += size) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[i * M], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[rank * M], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per rank\n size_t elements_per_rank = A.size() / size;\n\n // Compute the starting index for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = beta * y[i];\n }\n\n // Compute the local portion of A*x\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local contribution to y\n for (size_t i = 0; i < A.size(); ++i) {\n if (A[i].row == rank) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(MPI_IN_PLACE, y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n for (size_t i = rank; i < M; i += size) {\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to the result on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; 
i++) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (M + size - 1) / size;\n\n // Compute the start and end rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, M);\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (auto const& element : A) {\n if (element.row >= start_row && element.row < end_row) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + ":", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the size of the local part of y\n size_t local_M = M / MPI::COMM_WORLD.Get_size();\n size_t start_row = local_M * MPI::COMM_WORLD.Get_rank();\n size_t end_row = start_row + local_M;\n\n // Compute the local part of y\n for (size_t i = start_row; i < end_row; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Gather the local parts of y to rank 0\n MPI::COMM_WORLD.Gather(&y[start_row], local_M, MPI::DOUBLE, &y[0], local_M, MPI::DOUBLE, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own partial sum\n std::vector partial_y(M, 0.0);\n for (auto const& element : A) {\n if (element.row < M / size * rank || element.row >= M / size * (rank + 1)) {\n continue;\n }\n partial_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce partial sums to rank 0\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 scales the result by beta\n if (rank == 0) {\n for (auto& value : y) {\n value *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (COOElement const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n}\n", + "\n\n // Compute the local portion of the result\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the results from all ranks to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Scale the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (auto& value : y) {\n value *= beta;\n }\n }\n}\n", + "\n\n // COO format:\n // A[i] = {row, column, value}\n // where A[i] is the i-th non-zero element in the 
matrix\n\n // y = alpha*A*x + y\n std::iota(y.begin(), y.end(), 0);\n std::transform(y.begin(), y.end(), x.begin(), y.begin(),\n [alpha, &A](double y, double x) {\n double sum = 0;\n for (auto const& a : A) {\n sum += a.value * (alpha * x);\n }\n return y + sum;\n });\n\n // y = y + (beta*y)\n std::transform(y.begin(), y.end(), y.begin(),\n [beta](double y) {\n return y + (beta * y);\n });\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local contribution to y\n std::vector local_y(M, 0.0);\n for (auto const& element : A) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Add beta*y to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] += ampi * y[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements each rank is responsible for\n size_t elements_per_rank = A.size() / size;\n\n // Compute the starting index of the elements for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index of the elements for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n for (size_t i = rank; i < M; i += size) {\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to the result on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Compute the number of elements per rank\n size_t num_elements_per_rank = A.size() / MPI::COMM_WORLD.Get_size();\n\n // Compute the starting index for this rank\n size_t start_index = num_elements_per_rank * MPI::COMM_WORLD.Get_rank();\n\n // Compute the ending index for this rank\n size_t end_index = start_index + num_elements_per_rank;\n\n // Initialize the local portion of y to beta*y\n std::vector local_y(M, beta * y[MPI::COMM_WORLD.Get_rank()]);\n\n // Perform the sparse matrix-vector multiplication for this rank\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI::COMM_WORLD.Reduce(local_y.data(), y.data(), M, MPI::DOUBLE, MPI::SUM, 0);\n}\n", + "\n\n // Broadcast alpha, beta, M, and N to all ranks\n MPI_Bcast(&alpha, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&beta, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&M, 1, MPI_INT, 0, MPI_COMM_WORLD);\n MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the number of elements per rank\n int num_elements_per_rank = A.size() / 
MPI_COMM_SIZE;\n\n // Scatter A to all ranks\n std::vector local_A(num_elements_per_rank);\n MPI_Scatter(A.data(), num_elements_per_rank, MPI_BYTE, local_A.data(), num_elements_per_rank, MPI_BYTE, 0, MPI_COMM_WORLD);\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (const auto& element : local_A) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather the local contributions to y\n std::vector global_y(M);\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, global_y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Apply beta to y\n if (beta != 0.0) {\n for (auto& element : global_y) {\n element += beta * element;\n }\n }\n\n // Replace y with the global result\n y = std::move(global_y);\n}\n", + "\n\n // Initialize y on all ranks to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute the local contribution of each rank to the result\n for (auto const& element : A) {\n if (element.row < M) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n }\n\n // Reduce the local contributions to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format with sizeA elements.\n A has dimensions MxN, x has N values, and y has M values.\n Use CUDA to parallelize. The kernel will be launched with at least sizeA threads.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\n__global__ void spmv(double alpha, const COOElement *A, size_t sizeA, const double *x, double beta, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n y[blockIdx.x] *= beta;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n y[tid] *= beta;\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (gid < sizeA) {\n size_t row = A[gid].row;\n size_t column = A[gid].column;\n double value = A[gid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n\n if (gid < M) {\n atomicAdd(&y[gid], beta * y[gid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n const COOElement &e = A[tid];\n y[e.row] += alpha * e.value * x[e.column];\n }\n y[threadIdx.x] *= beta;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n atomicMul(&y[i], beta);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n 
}\n atomicMul(&y[threadIdx.x], beta);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001515826, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.10446421661377 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015162186, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.11091124420166 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[blockIdx.x], beta);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (gid < sizeA) {\n const COOElement &a = A[gid];\n y[a.row] += alpha * a.value * x[a.column];\n }\n\n if (gid < M) {\n y[gid] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n 
if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015158782, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.11228229827881 + } + ] + }, + { + "generated_output": "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015134754, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.10499219818115 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n y[row] += alpha * value * x[column];\n }\n y[threadIdx.x] *= beta;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n y[row] += alpha * value * x[column];\n }\n y[blockIdx.x] *= beta;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "{", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use MPI and OpenMP to parallelize. Assume MPI has already been initialized.\n Every rank has a complete copy of A, x, and y. 
Store the result in y on rank 0.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its own partial sum\n std::vector partial_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n partial_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the partial sums to rank 0\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 scales the result by beta\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the result\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (auto const& element : A) {\n if (element.row == i) {\n sum += element.value * x[element.column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&y[0], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[0], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the sparse matrix that each rank is responsible for\n size_t num_elements_per_rank = A.size() / size;\n\n // Compute the starting index of the elements in the sparse matrix that each rank is responsible for\n size_t start_index = rank * num_elements_per_rank;\n\n // Compute the ending index of the elements in the sparse matrix that each rank is responsible for\n size_t end_index = (rank + 1) * num_elements_per_rank;\n\n // Compute the local portion of y that each rank is responsible for\n std::vector local_y(M);\n\n // Compute the local portion of y that each rank is responsible for\n #pragma omp parallel for\n for (size_t i = start_index; i < end_index; i++) {\n const COOElement& element = A[i];\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the local portions of y into the global y vector on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n}\n", + "\n\n // Initialize y on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::fill(y.begin(), y.end(), 0.0);\n }\n\n 
// Parallelize over rows of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n COOElement const& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n\n // Parallelize over elements of y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n\n // Gather y from all ranks to rank 0\n MPI::COMM_WORLD.Gather(y.data(), M, MPI::DOUBLE, y.data(), M, MPI::DOUBLE, 0);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of elements each rank is responsible for\n size_t elements_per_rank = A.size() / nprocs;\n\n // Compute the starting index for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local portion of y\n std::vector local_y(M, 0.0);\n\n // Perform the sparse matrix-vector multiplication in parallel\n #pragma omp parallel for\n for (size_t i = start_index; i < end_index; ++i) {\n COOElement element = A[i];\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Compute the local portion of the result\n std::vector local_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Calculate the number of rows per rank\n size_t num_rows_per_rank = (M + size - 1) / size;\n\n // Calculate the starting and ending rows for the current rank\n size_t start_row = rank * num_rows_per_rank;\n size_t end_row = std::min((rank + 1) * num_rows_per_rank, M);\n\n // Initialize the local portion of y to beta*y\n std::vector local_y(num_rows_per_rank, beta * y[rank]);\n\n // OpenMP parallelization\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n local_y[i - start_row] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n\n // Gather the local portions of y into the global y on rank 0\n MPI_Gather(local_y.data(), num_rows_per_rank, MPI_DOUBLE, y.data(), num_rows_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of rows per rank\n size_t num_rows_per_rank = (M + MPI_Comm_size(MPI_COMM_WORLD) - 1) / MPI_Comm_size(MPI_COMM_WORLD);\n\n // Compute the starting row for this rank\n size_t start_row = num_rows_per_rank * MPI_Comm_rank(MPI_COMM_WORLD);\n\n // Compute the ending row for this rank\n size_t end_row = std::min(num_rows_per_rank * (MPI_Comm_rank(MPI_COMM_WORLD) + 1), M);\n\n // Initialize the local portion of y to beta*y\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n y[i] = beta * y[i];\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n\n if (row >= start_row && row < end_row) {\n y[row] += alpha * value * x[column];\n }\n }\n\n // Reduce the results from all ranks to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(MPI_IN_PLACE, y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / nprocs; i < (rank + 1) * M / nprocs; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local contribution to y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double local_y = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n local_y += alpha * element.value * x[element.column];\n }\n }\n y[i] = local_y;\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&y[0], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[0], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& element = A[i];\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather y from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, 
y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_M = M / size;\n size_t start_row = rank * local_M;\n size_t end_row = start_row + local_M;\n\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[i * local_M], local_M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[start_row], local_M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(Kokkos::View &A, Kokkos::View &X, Kokkos::View &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; k++) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::Range(0, M), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.extent(0); ++l) {\n if (A(l).row == i && A(l).column == k) {\n for (size_t m = 0; m < X.extent(0); ++m) {\n if (X(m).row == k && X(m).column == j) {\n sum += A(l).value * X(m).value;\n }\n }\n }\n }\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n // Create a parallel reduction to sum up the sparse matrix multiplication.\n auto policy = KOKKOS_RANGE_POLICY(0, M);\n auto sum_lambda = KOKKOS_LAMBDA(const size_t i, double** sum) {\n for (size_t j = 0; j < N; ++j) {\n sum[i][j] = 0.0;\n }\n };\n auto reduce_lambda = KOKKOS_LAMBDA(const size_t i, const size_t j, const COOElement a, const COOElement b, double** sum) {\n sum[a.row][j] += a.value * b.value;\n };\n auto view = KokKOS::parallel_reduce(\n \"Sparse matrix multiplication\", policy, sum_lambda, reduce_lambda, A, X);\n view.wait();\n\n // Copy the result to the output matrix.\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = view(i, j);\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int &i) {\n for (size_t j = 0; j < N; j++) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; k++) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + ".", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, M), [&](size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i][j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i][j] += A[i][k].value * X[k][j].value;\n }\n }\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n 
#pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; k++) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + ",", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n sum += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n sum += A[l].value * X[m].value;\n break;\n }\n }\n break;\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (auto const& a : A) {\n if (a.row == i && a.column == k) {\n for (auto const& x : X) {\n if (x.row == k && x.column == j) {\n sum += a.value * x.value;\n }\n }\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; k++) {\n for (size_t l = 0; l < A.size(); l++) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); m++) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Y.resize(M * N, 0);\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n for (auto const& e : X) {\n if (A_map.count(e.row) > 0) {\n for (auto const& [column, value] : A_map[e.row]) {\n Y[e.column * M + column] += e.value * value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> row_map;\n for (auto const& element : A) {\n row_map[element.row].push_back(element.value);\n }\n for (auto const& element : X) {\n if (row_map.count(element.column)) {\n for (size_t i = 0; i < row_map[element.column].size(); ++i) {\n Y[element.row * N + i] += row_map[element.column][i] * element.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto& e : A) {\n for (auto& f : X) {\n if (e.column == f.row) {\n Y[e.row * N + f.column] += e.value * f.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M*N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n Y.resize(M * N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot = 0;\n for (size_t k = 0; k < K; ++k) {\n dot += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = dot;\n }\n }\n}\n", + "\n\n Y.resize(M*N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.assign(M * N, 0);\n for (COOElement e : A) {\n for (COOElement f : X) {\n if (e.column == f.row) {\n Y[e.row * N + f.column] += e.value * f.value;\n 
}\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n std::unordered_map> A_row_major;\n for (auto const& e : A) {\n A_row_major[e.row][e.column] = e.value;\n }\n\n std::unordered_map> X_col_major;\n for (auto const& e : X) {\n X_col_major[e.column][e.row] = e.value;\n }\n\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot_product = 0;\n for (size_t k = 0; k < K; ++k) {\n dot_product += A_row_major[i][k] * X_col_major[j][k];\n }\n Y[i * N + j] = dot_product;\n }\n }\n}\n", + "\n\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format with sizeA elements.\n X is a sparse KxN matrix in COO format with sizeX elements. Y is a dense MxN matrix in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\n__global__ void spmm(const COOElement *A, size_t sizeA, const COOElement *X, size_t sizeX, double *Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t k = 0; k < K; k++) {\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == k) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n int i = tid + bid * bdim;\n if (i < sizeA) {\n size_t rowA = A[i].row;\n size_t colA = A[i].column;\n double valA = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == colA) {\n Y[rowA * N + X[j].row] += valA * X[j].value;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n int i = tid + bid * bdim;\n if (i < sizeA) {\n size_t rowA = A[i].row;\n size_t colA = A[i].column;\n double valA = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == colA) {\n Y[rowA * N + X[j].row] += valA * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipGridDim_x;\n int tid_per_block = hipBlockDim_x;\n\n for (int i = tid; i < sizeA; i += tid_per_block * bid_size) {\n int row = A[i].row;\n int col = A[i].column;\n double val = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column < K) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x * bid_size;\n\n for (size_t i = tid; i < sizeA; i += num_threads) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = 
blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n Y[row * N + col] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t a = 0; a < sizeA; ++a) {\n if (A[a].row == row && A[a].column == k) {\n for (size_t x = 0; x < sizeX; ++x) {\n if (X[x].row == k && X[x].column == col) {\n Y[row * N + col] += A[a].value * X[x].value;\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].column) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n for (int i = tid + bid * bdim; i < sizeA; i += gdim * bdim) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n int rowA = A[i].row;\n int colA = A[i].column;\n double valA = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n int rowX = X[j].row;\n int colX = X[j].column;\n double valX = X[j].value;\n if (colA == rowX) {\n atomicAdd(&Y[rowA * N + colX], valA * valX);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x;\n\n for (size_t i = tid; i < sizeA; i += num_threads * bid_size) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == column) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; 
j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and X. Store the result in Y on rank 0.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recv;\n MPI::COMM_WORLD.Recv(recv, M * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < M * N; ++j) {\n Y[j] += recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(Y.data(), M * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0.\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its local contribution to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the local contributions to Y on rank 0.\n if (rank == 0) {\n Y.resize(M * N);\n MPI_Gather(localY.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(localY.data(), M * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions 
to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ",", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the local portion of Y\n std::vector localY(M*N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n \n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M*N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double dot_product = 0;\n for (size_t k = 0; k < K; k++) {\n dot_product += A[i * K + k].value * X[k * N + j].value;\n }\n Y[i * N + j] = dot_product;\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buffer(M * N);\n MPI_Recv(recv_buffer.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n Y[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(Y.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in each row of A and X\n std::vector A_row_counts(M, 0);\n std::vector X_row_counts(K, 0);\n for 
(auto const& element : A) {\n A_row_counts[element.row]++;\n }\n for (auto const& element : X) {\n X_row_counts[element.row]++;\n }\n\n // Compute the offsets for each row of A and X\n std::vector A_offsets(M, 0);\n std::vector X_offsets(K, 0);\n for (size_t i = 1; i < M; i++) {\n A_offsets[i] = A_offsets[i - 1] + A_row_counts[i - 1];\n }\n for (size_t i = 1; i < K; i++) {\n X_offsets[i] = X_offsets[i - 1] + X_row_counts[i - 1];\n }\n\n // Create a buffer to store the local results\n std::vector local_Y(M * N, 0.0);\n\n // Compute the local results\n for (auto const& element : A) {\n for (size_t j = X_offsets[element.column]; j < X_offsets[element.column] + X_row_counts[element.column]; j++) {\n local_Y[element.row * N + X[j].column] += element.value * X[j].value;\n }\n }\n\n // Reduce the local results to the global result\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of elements in the resulting matrix\n size_t num_elements = M * N;\n\n // Initialize the result matrix to zero\n Y.resize(num_elements, 0.0);\n\n // Scatter the result matrix to all ranks\n MPI_Scatter(Y.data(), num_elements, MPI_DOUBLE, Y.data(), num_elements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local part of the result matrix\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local results to the result matrix on rank 0\n MPI_Reduce(Y.data(), num_elements, MPI_DOUBLE, Y.data(), num_elements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double dot_product = 0;\n for (size_t k = 0; k < K; k++) {\n dot_product += A[i * K + k].value * X[k * N + j].value;\n }\n Y[i * N + j] = dot_product;\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buffer(M * N);\n MPI_Recv(recv_buffer.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n Y[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(Y.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.assign(M * N, 0);\n\n // Compute the partial sum on each rank\n std::vector partial_sum(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n partial_sum[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sum(M * N, 0);\n MPI_Allreduce(partial_sum.data(), global_sum.data(), M * N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n Y = global_sum;\n }\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n 
// Compute the matrix multiplication in parallel\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement& a = A[i];\n for (size_t j = 0; j < X.size(); j++) {\n const COOElement& x = X[j];\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0.\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format with sizeA elements.\n X is a sparse KxN matrix in COO format with sizeX elements. Y is a dense MxN matrix in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\n__global__ void spmm(const COOElement *A, size_t sizeA, const COOElement *X, size_t sizeX, double *Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n atomicAdd(&Y[row * N + X[j].column], A[i].value * X[j].value);\n }\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000875327, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01082643222809 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000875928, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01895848312378 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= M) return;\n for (int i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (int j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0008769116, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02213273620605 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0008753248, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01794167060852 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < K; k++) {\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == row && A[j].column == k) {\n for (size_t l = 0; l < sizeX; l++) {\n if (X[l].row == k && X[l].column == i) {\n Y[row * N + i] += A[j].value * X[l].value;\n }\n }\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + 
"is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000877062, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01799593582153 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tx = threadIdx.x;\n int bx = blockIdx.x;\n int tid = tx + bx * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t i = 0; i < sizeX; i++) {\n if (X[i].column == column) {\n Y[row * N + X[i].row] += value * X[i].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == A[k].column) {\n Y[row * N + X[j].row] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n 
double value = A[tid].value;\n for (size_t k = 0; k < K; ++k) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (X[j].column == k) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0008759618, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01893199996948 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t k = 0; k < K; ++k) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row && A[i].column == k) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (X[j].row == k && X[j].column == j) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + k] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000876088, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0179400894165 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t rowA = A[blockIdx.x].row;\n size_t columnA = A[blockIdx.x].column;\n double valueA = A[blockIdx.x].value;\n\n for (size_t i = 0; i < N; i++) {\n if (columnA == X[threadIdx.x].row) {\n Y[rowA * N + i] += valueA * X[threadIdx.x].value;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n atomicAdd(&Y[row * N + X[j].column], A[i].value * X[j].value);\n }\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0008758134, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01082663707733 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000876553, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02287251853943 + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and X. 
Store the result in Y on rank 0.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n for (size_t j = 0; j < X.size(); ++j) {\n if (X[j].row == column) {\n localY[row * N + X[j].column] += value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n if (rank == 0) {\n Y.resize(M * N);\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(localY.data(), nullptr, M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n}\n", + ",", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localY[i * N + j] += A[i * K + k].value * X[k * N + j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "_t,", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for 
(size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local row range for this rank\n size_t local_m = M / nprocs;\n size_t start_row = rank * local_m;\n size_t end_row = (rank == nprocs - 1) ? M : start_row + local_m;\n\n // Initialize the local part of Y to zero\n std::vector local_Y(local_m * N, 0.0);\n\n // Perform the matrix multiplication in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_Y[i * N + j] += A[i * K + k].value * X[k * N + j].value;\n }\n }\n }\n\n // Gather the local results into Y on rank 0\n MPI_Gather(local_Y.data(), local_m * N, MPI_DOUBLE, Y.data(), local_m * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector local_Y(M * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n local_Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of rows of A and X owned by each rank\n size_t nrows_A = A.size() / nprocs;\n size_t nrows_X = X.size() / nprocs;\n\n // Compute the starting row index of A and X owned by each rank\n size_t start_row_A = rank * nrows_A;\n size_t start_row_X = rank * nrows_X;\n\n // Compute the local portion of Y owned by each rank\n std::vector local_Y(M * N, 0.0);\n\n // Compute the local portion of Y owned by each rank\n #pragma omp parallel for\n for (size_t i = 0; i < nrows_A; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n if (A[start_row_A + i].column == X[start_row_X + k].row) {\n local_Y[A[start_row_A + i].row * N + j] += A[start_row_A + i].value * X[start_row_X + k].value;\n }\n }\n }\n }\n\n // Reduce the local 
portions of Y to rank 0\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix-matrix product in parallel\n #pragma mpi_for_schedule(static)\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& a = A[i];\n for (size_t j = 0; j < X.size(); ++j) {\n const COOElement& x = X[j];\n if (a.column == x.row) {\n #pragma openmp atomic\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(MPI_IN_PLACE, Y.size(), MPI_DOUBLE, Y.data(), Y.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the partial sum for each row of Y on each rank\n std::vector partial_Y(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n partial_Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_Y(M * N, 0.0);\n MPI_Reduce(partial_Y.data(), global_Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Store the result in Y on rank 0\n if (rank == 0) {\n Y = global_Y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t k = 0; k < K; ++k) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n\n // Reduce the result to rank 0\n MPI_Reduce(MPI_IN_PLACE, Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int 
rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n for (size_t j = 0; j < X.size(); j++) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(Kokkos::View &A, Kokkos::View &b, Kokkos::View &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View r(\"r\", N);\n Kokkos::View z(\"z\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n r[i] = b[i];\n z[i] = 0.0;\n });\n\n for (size_t iter = 0; iter < N; ++iter) {\n for (size_t i = 0; i < A.extent(0); ++i) {\n const auto &e = A[i];\n r[e.row] -= e.value * z[e.column];\n }\n for (size_t i = 0; i < A.extent(0); ++i) {\n const auto &e = A[i];\n z[e.column] += e.value * r[e.row];\n }\n }\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) { x[i] = r[i]; });\n}\n", + "\n\n Kokkos::parallel_for(\"solve_linear_system\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::View r(\"r\", N);\n Kokkos::View z(\"z\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n r[i] = b[i];\n z[i] = 0.0;\n });\n for (size_t k = 0; k < A.extent(0); ++k) {\n const COOElement &e = A[k];\n r[e.column] -= e.value * z[e.row];\n }\n for (size_t k = 0; k < A.extent(0); ++k) {\n const COOElement &e = A[k];\n z[e.row] += e.value * r[e.column];\n }\n Kokkos::parallel_for(\"copy\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = z[i];\n });\n}\n", + "\n\n Kok\uf09e::View A_h(A.data(), A.extent(0));\n Kok\uf09e::View b_h(b.data(), b.extent(0));\n Kok\uf09e::View x_h(x.data(), x.extent(0));\n\n Kok\uf09e::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n x_h[i] = b_h[i];\n for (size_t j = A_h(i, 0); j < A_h(i, 1); ++j) {\n x_h[i] -= A_h(i, 2 + j) * x_h[A_h(i, 2 + j)];\n }\n x_h[i] /= A_h(i, 1);\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_coo\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::View x_host(\"x_host\", N);\n Kokkos::parallel_for(\"solve\", N, KOKKOS_LAMBDA(size_t i) {\n x_host(i) = 0.0;\n });\n 
Kokkos::deep_copy(x, x_host);\n Kokkos::sparse_gmres(A_sparse, x, b, 1e-6, 100);\n\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View A_sparse(\"A_sparse\", N, N);\n Kokkos::parallel_for(\"init_A\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n A_sparse(i, j) = 0.0;\n }\n });\n Kokkos::parallel_for(\"fill_A\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n A_sparse(A(i).row, A(i).column) = A(i).value;\n });\n\n // Solve the linear system\n Kokkos::View x_temp(\"x_temp\", N);\n Kokkos::sparse::cg(A_sparse, b, x_temp, 1e-6, 100);\n\n // Copy the solution to x\n Kokkos::parallel_for(\"copy_x\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x(i) = x_temp(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * b(A(j).column);\n }\n }\n x(i) = sum;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_COO\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::View A_sparse(\"A_sparse\", N, N);\n Kokkos::parallel_for(\"initialize_A\", N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; j++) {\n A_sparse(i, j) = 0.0;\n }\n });\n Kokkos::parallel_for(\"populate_A\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n A_sparse(row_indices(i), column_indices(i)) = values(i);\n });\n\n // Solve the linear system\n Kokkos::View x_temp(\"x_temp\", N);\n Kokkos::sparse::CrsMatrix A_crs(N, N, A_sparse);\n Kokkos::sparse::solve(\"CrsMatrix\", A_crs, b, x_temp);\n\n // Copy the result to x\n Kokkos::parallel_for(\"copy_result\", N, KOKKOS_LAMBDA(size_t i) {\n x(i) = x_temp(i);\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values = Kokkos::View(A.data(), A.extent(0));\n Kokkos::View row_indices = Kokkos::View(A.data() + 1, A.extent(0));\n Kokkos::View column_indices = Kokkos::View(A.data() + 2, A.extent(0));\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::SparseSolveCG(A_sparse, x, b, 1e-6, 1000);\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x(i) = b(i);\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x(A(j).column) -= A(j).value * x(i);\n }\n }\n x(i) /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View col_indices(\"col_indices\", A.extent(0));\n Kokkos::parallel_for(A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values[i] = A(i).value;\n row_indices[i] = A(i).row;\n col_indices[i] = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, col_indices);\n\n // Solve the linear system\n 
Kokkos::View x_copy(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::SparseSolveCG>(A_sparse, x_copy, b, 1e-6, 1000);\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(j).row == i) {\n x[A(j).column] -= A(j).value * x[i];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n using team_member = KokKOS::TeamMember;\n using team_view = KokKOS::TeamView;\n using team_scheduler = KokKOS::TeamScheduler;\n\n team_scheduler team(N);\n\n team.parallel_for(\n \"solve_linear_system\",\n KOKKOS_LAMBDA(const team_member member) {\n const size_t row = member.league_rank();\n double sum = 0;\n for (size_t i = 0; i < A.size(); ++i) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_coo\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::View x_copy(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::sparse_solve(A_sparse, x_copy, b);\n\n // Copy the solution back to x\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. 
x and b are dense vectors with N elements.\n Use OpenMP to compute in parallel.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + ".", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n COOElement const& e = A[i];\n x[e.column] -= e.value * x[e.row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t k = 0; k < A.size(); ++k) {\n const COOElement& e = A[k];\n x[e.column] -= e.value * x[e.row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + 
"\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] -= element.value * x[element.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] -= element.value * x[element.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n // Initialize x to 0\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = 0;\n }\n\n // Iterate over the elements of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n // Add A[i].value * b[A[i].column] to x[A[i].row]\n x[A[i].row] += A[i].value * b[A[i].column];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. 
x and b are dense vectors with N elements.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector y(N, 0);\n for (auto const& e : A) {\n y[e.column] += e.value * b[e.row] / row_sums[e.row];\n }\n x = y;\n}\n", + "\n\n std::vector r(N, 0);\n std::vector row_start(N + 1, 0);\n for (auto const& e : A) {\n row_start[e.row + 1]++;\n }\n for (size_t i = 1; i <= N; i++) {\n row_start[i] += row_start[i - 1];\n }\n for (auto const& e : A) {\n r[row_start[e.row]++] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = row_start[i]; j < row_start[i + 1]; j++) {\n sum += r[j] * x[j];\n }\n x[i] = (b[i] - sum) / r[row_start[i]];\n }\n}\n", + "\n\n std::vector r(N, 0);\n for (auto const& e : A) {\n r[e.row] += e.value * x[e.column];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = (b[i] - r[i]) / A[i].value;\n }\n}\n", + "\n\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / rowSum[i];\n }\n}\n", + "\n\n std::vector r(N, 0);\n for (auto const& e : A) {\n r[e.row] += e.value * b[e.column];\n }\n x.resize(N);\n for (size_t i = 0; i < N; ++i) {\n x[i] = r[i] / A[i].value;\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (const COOElement& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector y(N, 0);\n for (const COOElement& e : A) {\n y[e.row] += e.value * b[e.column] / row_sum[e.row];\n }\n x = y;\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector diag(N, 0);\n std::vector offdiag(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] += e.value;\n } else {\n offdiag[e.row] += e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / (diag[i] - offdiag[i]);\n }\n for (auto const& e : A) {\n if (e.row != e.column) {\n x[e.column] -= e.value * x[e.row] / (diag[e.column] - offdiag[e.column]);\n }\n }\n}\n", + "\n\n std::vector> A_dense(N, std::vector(N, 0));\n for (auto const& e : A) {\n A_dense[e.row][e.column] = e.value;\n }\n std::vector L(N, 0), U(N, 0);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n L[i] += A_dense[i][j] * U[j];\n }\n U[i] = b[i] - L[i];\n }\n x[N - 1] = U[N - 1] / A_dense[N - 1][N - 1];\n for (size_t i = N - 2; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_dense[i][j] * x[j];\n }\n x[i] = (U[i] - sum) / A_dense[i][i];\n }\n}\n", + "\n\n std::vector diag(N, 0);\n std::vector lower(N, 0);\n std::vector upper(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n } else if (e.row < e.column) {\n lower[e.row] += e.value;\n } else {\n upper[e.row] += e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n double d = diag[i];\n if (d == 0) {\n throw std::runtime_error(\"Matrix is singular\");\n }\n x[i] = (b[i] - lower[i] - upper[i]) / d;\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sum[i];\n }\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector diag(N, 0);\n for 
(auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / diag[i];\n for (auto const& e : A) {\n if (e.row == i && e.column != i) {\n x[e.column] -= e.value * x[i] / row_sums[e.column];\n }\n }\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& element : A) {\n row_sum[element.row] += element.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sum[i];\n }\n}\n", + "\n\n std::vector diag(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n }\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n for (size_t j = e.column + 1; j < N; j++) {\n x[j] -= e.value * x[e.column];\n }\n }\n }\n x[i] /= diag[i];\n }\n for (size_t i = N - 1; i > 0; i--) {\n for (auto const& e : A) {\n if (e.column == i) {\n for (size_t j = 0; j < e.row; j++) {\n x[j] -= e.value * x[e.row];\n }\n }\n }\n }\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sums[i];\n }\n}\n", + "\n\n std::vector rowSum(N, 0.0);\n for (COOElement const& e : A) {\n rowSum[e.row] += e.value;\n }\n std::vector y(N, 0.0);\n for (COOElement const& e : A) {\n y[e.row] += e.value * b[e.column] / rowSum[e.row];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] - y[i];\n }\n}\n", + "\n\n std::vector> A_dense(N, std::vector(N, 0));\n for (auto const& e : A) {\n A_dense[e.row][e.column] = e.value;\n }\n std::vector x_dense(N, 0);\n solveLinearSystemDense(A_dense, b, x_dense);\n x = x_dense;\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n\n std::vector y(N, 0);\n for (auto const& e : A) {\n y[e.column] += e.value * b[e.row] / row_sums[e.row];\n }\n\n x = y;\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector diag_inv(N, 0);\n for (size_t i = 0; i < N; ++i) {\n diag_inv[i] = 1.0 / row_sums[i];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] * diag_inv[i];\n for (auto const& e : A) {\n if (e.row == i) continue;\n x[e.row] -= e.value * x[i] * diag_inv[e.row];\n }\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector x_tmp(N, 0);\n for (auto const& e : A) {\n x_tmp[e.column] += e.value * b[e.row] / row_sum[e.row];\n }\n x = x_tmp;\n}\n", + "\n\n std::vector diag(N, 0);\n std::vector offdiag(N, 0);\n std::vector rowptr(N + 1, 0);\n for (auto const& e : A) {\n rowptr[e.row + 1]++;\n if (e.row == e.column)\n diag[e.row] += e.value;\n else\n offdiag[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n rowptr[i + 1] += rowptr[i];\n }\n std::vector L(rowptr[N]);\n std::vector U(rowptr[N]);\n for (auto const& e : A) {\n if (e.row == e.column)\n continue;\n if (e.row < e.column)\n L[rowptr[e.row]++] = {e.row, e.column, e.value};\n else\n U[rowptr[e.column]++] = {e.row, e.column, e.value};\n }\n std::vector y(N, 0);\n for (size_t i = 0; i < N; i++) {\n y[i] = b[i];\n for (size_t j = rowptr[i]; j < rowptr[i + 1]; j++) {\n y[i] -= L[j].value * x[L[j].column];\n }\n y[i] /= diag[i];\n }\n for (size_t i = N - 1; i < N; i--) {\n x[i] = y[i];\n for (size_t j = rowptr[i + 1]; j < rowptr[i + 2]; j++) {\n x[i] -= U[j].value * x[U[j].column];\n }\n x[i] /= diag[i];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", 
+ "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format with sizeA elements. x and b are dense vectors with N elements.\n Use AMD HIP to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\n__global__ void solveLinearSystem(const COOElement *A, size_t sizeA, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t col = A[tid].column;\n double val = A[tid].value;\n atomicAdd(&x[col], val * b[row]);\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n x[i] = b[i];\n }\n __syncthreads();\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].column == i) {\n x[A[j].row] -= A[j].value * x[i];\n }\n }\n __syncthreads();\n if (i < N) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = b[i];\n }\n __syncthreads();\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == i) {\n x[A[k].column] -= A[k].value * x[i];\n }\n }\n __syncthreads();\n if (i < N) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int tid = 
threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&x[column], value * b[row]);\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + ".", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = 0;\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == i) {\n x[i] += A[j].value * b[A[j].column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n \n // Create a CSR matrix from the COO format\n std::vector row_offsets(N + 1, 0);\n std::vector column_indices;\n std::vector values;\n for (auto const& element : A) {\n row_offsets[element.row + 1]++;\n column_indices.push_back(element.column);\n values.push_back(element.value);\n }\n for (size_t i = 1; i < N + 1; i++) {\n row_offsets[i] += row_offsets[i - 1];\n }\n \n // Create a dense vector for the solution\n x.resize(N);\n \n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(values[row_offsets[j] + i]) > std::abs(values[row_offsets[pivot_row] + i])) {\n pivot_row = j;\n }\n }\n \n // Swap the pivot row with the current row\n if (pivot_row != i) {\n std::swap(row_offsets[pivot_row], row_offsets[i]);\n std::swap(column_indices[row_offsets[pivot_row] - 1], column_indices[row_offsets[i] - 1]);\n std::swap(values[row_offsets[pivot_row] - 1], values[row_offsets[i] - 1]);\n }\n \n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = values[row_offsets[j] + i] / values[row_offsets[i] + i];\n for (size_t k = i; k < N; k++) {\n values[row_offsets[j] + k] -= factor * values[row_offsets[i] + k];\n }\n b[j] -= factor * b[i];\n }\n }\n \n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= values[row_offsets[i] + j] * x[j];\n }\n x[i] /= values[row_offsets[i] + i];\n }\n \n // Gather the solution from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, comm);\n }\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a vector object from the dense vector\n Eigen::VectorXd vector_b(N);\n for (size_t i = 0; i < N; ++i) {\n vector_b(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd vector_x = matrix.ldlt().solve(vector_b);\n\n // Copy the result to the output vector\n for (size_t i = 0; i < N; ++i) {\n x[i] = vector_x(i);\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a dense vector from b\n Eigen::VectorXd b_vector(N);\n for (size_t i = 0; i < N; ++i) {\n b_vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_vector = matrix.ldlt().solve(b_vector);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_vector(i);\n }\n }\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n\n // Create a vector object 
from the dense vector\n Eigen::VectorXd vector_b(N);\n for (size_t i = 0; i < N; ++i) {\n vector_b(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd vector_x = matrix.ldlt().solve(vector_b);\n\n // Copy the result to the output vector\n for (size_t i = 0; i < N; ++i) {\n x[i] = vector_x(i);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a sparse matrix object\n MPI_Sparse_matrix A_mpi;\n MPI_Sparse_matrix_create_coo(MPI_COMM_WORLD, N, N, A.size(), &A[0], &A_mpi);\n\n // Create a dense vector object for b\n MPI_Vector b_mpi;\n MPI_Vector_create(MPI_COMM_WORLD, N, &b_mpi);\n MPI_Vector_set_values(b_mpi, N, &b[0]);\n\n // Create a dense vector object for x\n MPI_Vector x_mpi;\n MPI_Vector_create(MPI_COMM_WORLD, N, &x_mpi);\n\n // Solve the linear system\n MPI_Sparse_solve(A_mpi, b_mpi, x_mpi);\n\n // Get the result from rank 0\n if (rank == 0) {\n MPI_Vector_get_values(x_mpi, N, &x[0]);\n }\n\n // Free the MPI objects\n MPI_Sparse_matrix_free(&A_mpi);\n MPI_Vector_free(&b_mpi);\n MPI_Vector_free(&x_mpi);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Scatter b to all ranks\n std::vector local_b(b.begin() + rank * b.size() / size, b.begin() + (rank + 1) * b.size() / size);\n\n // Solve the local system Ax=b for x\n std::vector local_x(N, 0.0);\n for (auto const& e : A) {\n local_x[e.row] += e.value * local_b[e.column];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_x(N, 0.0);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n \n int *rows = new int[A.size()];\n int *cols = new int[A.size()];\n double *vals = new double[A.size()];\n for (size_t i = 0; i < A.size(); i++) {\n rows[i] = A[i].row;\n cols[i] = A[i].column;\n vals[i] = A[i].value;\n }\n \n // Create a sparse matrix object\n int nrows = N;\n int ncols = N;\n int nnz = A.size();\n int *ia = new int[nrows + 1];\n int *ja = new int[nnz];\n double *a = new double[nnz];\n \n // Convert the COO format to CSR format\n ia[0] = 0;\n for (int i = 0; i < nnz; i++) {\n ja[i] = cols[i];\n a[i] = vals[i];\n }\n for (int i = 1; i < nrows + 1; i++) {\n ia[i] = ia[i - 1];\n for (int j = ia[i - 1]; j < nnz; j++) {\n if (ja[j] >= i) {\n ia[i] = j;\n break;\n }\n }\n }\n \n // Create a parallel sparse matrix object\n Mat A_csr(nrows, ncols, nnz, ia, ja, a);\n \n // Create a parallel dense vector for b\n Vec b_vec(nrows);\n for (int i = 0; i < nrows; i++) {\n b_vec[i] = b[i];\n }\n \n // Create a parallel dense vector for x\n Vec x_vec(nrows);\n \n // Solve the linear system\n A_csr.solve(b_vec, x_vec);\n \n // Gather the solution from all ranks to rank 0\n if (rank == 0) {\n for (int i = 0; i < nrows; i++) {\n x[i] = x_vec[i];\n }\n }\n \n // Clean up\n delete[] rows;\n delete[] cols;\n delete[] vals;\n delete[] ia;\n delete[] ja;\n 
delete[] a;\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the number of elements in each row of A\n std::vector rowCounts(N, 0);\n for (auto const& e : A) {\n rowCounts[e.row]++;\n }\n\n // Compute the offsets of each row in the CSR format\n std::vector rowOffsets(N, 0);\n for (size_t i = 1; i < N; i++) {\n rowOffsets[i] = rowOffsets[i - 1] + rowCounts[i - 1];\n }\n\n // Convert A to CSR format\n std::vector colIndices(A.size());\n std::vector values(A.size());\n for (auto const& e : A) {\n colIndices[rowOffsets[e.row] + rowCounts[e.row] - 1] = e.column;\n values[rowOffsets[e.row] + rowCounts[e.row] - 1] = e.value;\n rowCounts[e.row]--;\n }\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivotRow = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(values[rowOffsets[j] + rowCounts[j] - 1]) > std::abs(values[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1])) {\n pivotRow = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivotRow != i) {\n std::swap(rowOffsets[i], rowOffsets[pivotRow]);\n std::swap(rowCounts[i], rowCounts[pivotRow]);\n std::swap(values[rowOffsets[i] + rowCounts[i] - 1], values[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1]);\n std::swap(colIndices[rowOffsets[i] + rowCounts[i] - 1], colIndices[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = values[rowOffsets[j] + rowCounts[j] - 1] / values[rowOffsets[i] + rowCounts[i] - 1];\n for (size_t k = rowOffsets[j] + rowCounts[j] - 1; k >= rowOffsets[j]; k--) {\n values[k] -= factor * values[rowOffsets[i] + rowCounts[i] - 1 + k - rowOffsets[j]];\n colIndices[k] = colIndices[rowOffsets[i] + rowCounts[i] - 1 + k - rowOffsets[j]];\n }\n rowCounts[j] = rowCounts[i] - 1;\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = rowOffsets[i] + rowCounts[i] - 1; j < rowOffsets[i] + rowCounts[i]; j++) {\n x[i] -= values[j] * x[colIndices[j]];\n }\n x[i] /= values[rowOffsets[i] + rowCounts[i] - 1];\n }\n\n // Gather the solution on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N / size], N / size, MPI_DOUBLE, i, 0, comm, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N / size], N / size, MPI_DOUBLE, 0, 0, comm);\n }\n}\n", + "\n\n // Create a new MPI datatype for COOElement\n MPI_Datatype coo_element_type;\n MPI_Type_contiguous(3, MPI_DOUBLE, &coo_element_type);\n MPI_Type_commit(&coo_element_type);\n\n // Scatter the COO matrix and b vector to all ranks\n std::vector local_A(A.size() / MPI::COMM_WORLD.Get_size());\n std::vector local_b(b.size() / MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Scatter(A.data(), A.size(), coo_element_type, local_A.data(), local_A.size(), coo_element_type, 0);\n MPI::COMM_WORLD.Scatter(b.data(), b.size(), MPI_DOUBLE, local_b.data(), local_b.size(), MPI_DOUBLE, 0);\n\n // Solve the local linear system\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (auto& element : local_A) {\n if (element.row == i) {\n sum += element.value * local_b[element.column];\n }\n }\n local_x[i] = sum;\n }\n\n // Gather the local solutions to rank 0\n MPI::COMM_WORLD.Gather(local_x.data(), local_x.size(), MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, 
size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the number of elements per rank\n size_t num_elements_per_rank = A.size() / size;\n // Compute the starting index of the elements for this rank\n size_t start_index = rank * num_elements_per_rank;\n // Compute the ending index of the elements for this rank\n size_t end_index = (rank + 1) * num_elements_per_rank;\n\n // Create a local copy of the elements for this rank\n std::vector local_A(A.begin() + start_index, A.begin() + end_index);\n\n // Create a local copy of the vector b for this rank\n std::vector local_b(b.begin() + start_index, b.begin() + end_index);\n\n // Create a local copy of the vector x for this rank\n std::vector local_x(N);\n\n // Solve the local linear system\n for (size_t i = 0; i < local_A.size(); i++) {\n const COOElement& element = local_A[i];\n local_x[element.row] += element.value * local_b[element.column];\n }\n\n // Reduce the local solutions to the global solution on rank 0\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local row indices and values for this rank\n std::vector local_rows;\n std::vector local_values;\n for (auto const& element : A) {\n if (element.row / N == rank) {\n local_rows.push_back(element.row % N);\n local_values.push_back(element.value);\n }\n }\n\n // Create a sparse matrix in CSR format\n std::vector csr_row_offsets(size + 1);\n std::vector csr_column_indices(A.size());\n std::vector csr_values(A.size());\n MPI_Allgather(&local_rows[0], local_rows.size(), MPI_UNSIGNED_LONG, &csr_column_indices[0], local_rows.size(), MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n MPI_Allgather(&local_values[0], local_values.size(), MPI_DOUBLE, &csr_values[0], local_values.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n for (size_t i = 0; i < size; ++i) {\n csr_row_offsets[i + 1] = csr_row_offsets[i] + local_rows.size();\n }\n\n // Solve the linear system using the CSR matrix\n std::vector local_x(N);\n if (rank == 0) {\n x.resize(N);\n }\n sparse_solve(csr_row_offsets, csr_column_indices, csr_values, b, local_x);\n\n // Gather the solution from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the local number of rows and the local row indices\n size_t local_N = N / size;\n size_t start_row = rank * local_N;\n size_t end_row = (rank + 1) * local_N;\n\n // Compute the local contribution to the solution\n std::vector local_x(local_N);\n for (size_t i = start_row; i < end_row; ++i) {\n local_x[i - start_row] = b[i];\n for (COOElement const& e : A) {\n if (e.row == i) {\n local_x[i - start_row] -= e.value * x[e.column];\n }\n }\n local_x[i - start_row] /= A[i].value;\n }\n\n // Gather the local solutions into the global solution\n MPI_Gather(local_x.data(), local_N, MPI_DOUBLE, x.data(), local_N, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a vector object from the dense vector\n Eigen::VectorXd vector(N);\n for (size_t i = 0; i < N; ++i) {\n vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd result = 
matrix.ldlt().solve(vector);\n\n // Copy the result to the output vector\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = result(i);\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n int nrows = N / size + (rank < N % size);\n std::vector localA(A.begin() + rank * (N / size), A.begin() + rank * (N / size) + nrows);\n std::vector localb(b.begin() + rank * (N / size), b.begin() + rank * (N / size) + nrows);\n std::vector localx(nrows);\n \n // Solve the local system\n for (size_t i = 0; i < nrows; i++) {\n double sum = 0;\n for (size_t j = 0; j < localA.size(); j++) {\n if (localA[j].row == i) {\n sum += localA[j].value * localx[localA[j].column];\n }\n }\n localx[i] = (localb[i] - sum) / localA[i].value;\n }\n \n // Gather the local solutions\n MPI_Gather(localx.data(), nrows, MPI_DOUBLE, x.data(), nrows, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the local part of the matrix\n size_t local_size = A.size() / size;\n\n // Compute the local part of the matrix\n std::vector local_A(A.begin() + rank * local_size, A.begin() + (rank + 1) * local_size);\n\n // Compute the local part of the vector b\n std::vector local_b(b.begin() + rank * local_size, b.begin() + (rank + 1) * local_size);\n\n // Compute the local part of the solution vector\n std::vector local_x(N);\n\n // Solve the local linear system\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (auto& element : local_A) {\n if (element.column == i) {\n sum += element.value * local_x[element.row];\n }\n }\n local_x[i] = (local_b[i] - sum) / local_A[i].value;\n }\n\n // Gather the local solutions into the global solution vector\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the number of elements per rank\n size_t elementsPerRank = A.size() / size;\n \n // Compute the starting index of the elements for this rank\n size_t startIndex = rank * elementsPerRank;\n \n // Compute the ending index of the elements for this rank\n size_t endIndex = (rank + 1) * elementsPerRank;\n \n // Create a local copy of the elements for this rank\n std::vector localA(A.begin() + startIndex, A.begin() + endIndex);\n \n // Create a local copy of the vector b for this rank\n std::vector localb(b.begin() + startIndex, b.begin() + endIndex);\n \n // Create a local copy of the vector x for this rank\n std::vector localx(N);\n \n // Solve the local linear system\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < localA.size(); j++) {\n if (localA[j].column == i) {\n sum += localA[j].value * localb[j];\n }\n }\n localx[i] = sum;\n }\n \n // Gather the local solutions into the global solution\n MPI_Gather(localx.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_x(N, 0.0);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_x.data(), x.data(), N, 
MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the local part of the matrix\n size_t local_size = A.size() / size;\n size_t start_index = rank * local_size;\n size_t end_index = (rank + 1) * local_size;\n\n // Compute the local part of the matrix-vector product\n std::vector local_Ax(N, 0.0);\n for (size_t i = start_index; i < end_index; ++i) {\n local_Ax[A[i].column] += A[i].value * b[A[i].row];\n }\n\n // Reduce the local results to the global result\n MPI_Allreduce(local_Ax.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Rank 0 stores the final result\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Each rank computes its part of the result\n std::vector local_x(N);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n \n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format with sizeA elements. x and b are dense vectors with N elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\n__global__ void solveLinearSystem(const COOElement *A, size_t sizeA, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int row = tid / N;\n int col = tid % N;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column == col) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[tid].value;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Each thread computes one element of x.\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "source_write_success": true, + 
"did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = A[row].column[col]; k < A[row].column[col + 1]; ++k) {\n sum += A[row].value[k] * x[A[row].column[k]];\n }\n b[row] = (b[row] - sum) / A[row].value[A[row].column[col + 1] - 1];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + 
"did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n double sum = 0;\n for (int k = 0; k < size; ++k) {\n if (A[k].row == row && A[k].column == col) {\n sum += A[k].value * x[A[k].column];\n }\n }\n x[row] = (b[row] - sum) / A[size - 1].value;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int tid_offset = tid + bid * blockDim.x;\n if (tid_offset < sizeA) {\n size_t row = A[tid_offset].row;\n size_t column = A[tid_offset].column;\n double value = A[tid_offset].value;\n atomicAdd(&x[column], value * b[row]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = tid + bid * blockDim.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. Store the result in x on rank 0.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "&", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the products of A and b for each row\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value * b[A[i].column];\n }\n\n // Gather the row sums to rank 0\n std::vector globalRowSums(N);\n MPI_Allgather(rowSums.data(), N, MPI_DOUBLE, globalRowSums.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = globalRowSums[i];\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a dense vector for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; ++i) {\n b_eigen(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_eigen = matrix.ldlt().solve(b_eigen);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_eigen(i);\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix A_sparse(N, N);\n for (auto const& e : A) {\n A_sparse.insert(e.row, e.column) = e.value;\n }\n A_sparse.makeCompressed();\n\n // Create dense vectors for x and b\n Eigen::VectorXd x_dense(N), 
b_dense(N);\n for (size_t i = 0; i < N; ++i) {\n x_dense[i] = x[i];\n b_dense[i] = b[i];\n }\n\n // Solve the linear system using Eigen\n x_dense = A_sparse.colPivHouseholderQr().solve(b_dense);\n\n // Copy the result back to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_dense[i];\n }\n }\n}\n", + "\n\n // Get the number of processes and the rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per process\n int elementsPerProcess = N / size;\n\n // Create a buffer to store the local part of A and b\n std::vector localA(elementsPerProcess);\n std::vector localb(elementsPerProcess);\n\n // Scatter A and b to all processes\n MPI_Scatter(A.data(), elementsPerProcess, MPI_DOUBLE, localA.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(b.data(), elementsPerProcess, MPI_DOUBLE, localb.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Create a local copy of x\n std::vector localx(N, 0.0);\n\n // Perform the Gaussian elimination in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n // Find the pivot element\n int pivotRow = i;\n double maxPivot = std::abs(localA[i].value);\n for (int j = i + 1; j < elementsPerProcess; j++) {\n if (std::abs(localA[j].value) > maxPivot) {\n pivotRow = j;\n maxPivot = std::abs(localA[j].value);\n }\n }\n\n // Swap the pivot row with the first row\n std::swap(localA[i], localA[pivotRow]);\n std::swap(localb[i], localb[pivotRow]);\n\n // Eliminate the non-zero elements below the pivot\n for (int j = i + 1; j < elementsPerProcess; j++) {\n double factor = localA[j].value / localA[i].value;\n localA[j].value -= factor * localA[i].value;\n localb[j] -= factor * localb[i];\n }\n }\n\n // Gather the local solutions into a global solution\n MPI_Gather(localx.data(), elementsPerProcess, MPI_DOUBLE, x.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform the back substitution to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (int j = i + 1; j < N; j++) {\n sum += localA[j].value * x[j];\n }\n x[i] = (localb[i] - sum) / localA[i].value;\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the local contribution of each rank to the solution\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column];\n }\n\n // Reduce the local contributions to the global solution on rank 0\n MPI_Reduce(x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0);\n\n // Compute the sum of the products of A and x for each row\n std::vector rowSums(N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value * x[A[i].column];\n }\n\n // Gather the row sums to rank 0\n std::vector globalRowSums(N);\n MPI_Allgather(rowSums.data(), N, MPI_DOUBLE, globalRowSums.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Solve the system of equations on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = (b[i] - globalRowSums[i]) / A[i].value;\n }\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Iterate over the elements of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement& element = A[i];\n x[element.row] += element.value * b[element.column];\n }\n\n // 
Gather the results from all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(&rowSums[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n\n // Gather x from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "<}", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Step 1: Distribute the rows of A and b to each rank\n std::vector localA;\n std::vector localb;\n distributeRows(A, b, localA, localb, rank, size, N);\n\n // Step 2: Solve the local system on each rank\n std::vector localx(N, 0.0);\n for (auto& element : localA) {\n localx[element.row] += element.value * localb[element.column];\n }\n\n // Step 3: Gather the local solutions to rank 0\n MPI_Gather(localx.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(rowSums.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n}\n", + "\n\n // Initialize x to 0\n x.resize(N);\n std::fill(x.begin(), x.end(), 0.0);\n\n // Perform Gauss-Seidel iteration\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n x[i] = (b[i] - sum) / A[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.resize(N);\n std::fill(x.begin(), x.end(), 0.0);\n\n // Perform Gauss-Jordan elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i].value) > std::abs(A[pivot_row * N + i].value)) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the ith row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the ith column\n double pivot = A[i * N + i].value;\n for (size_t j = 0; j < N; ++j) {\n if (j != i) {\n double factor = A[j * N + i].value / pivot;\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k].value -= factor * A[i * N + k].value;\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j].value * x[j];\n }\n x[i] /= A[i * N + i].value;\n }\n\n // Gather the solution on rank 0\n if 
(MPI::COMM_WORLD.Get_rank() == 0) {\n MPI::COMM_WORLD.Gather(&x[0], N, MPI::DOUBLE, &x[0], N, MPI::DOUBLE, 0);\n } else {\n MPI::COMM_WORLD.Gather(&x[0], N, MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n\n // Create a dense vector for b\n Eigen::VectorXd b_vector(N);\n for (size_t i = 0; i < N; ++i) {\n b_vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_vector = matrix.ldlt().solve(b_vector);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_vector(i);\n }\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0);\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] += element.value * b[element.column];\n }\n }\n }\n\n // Gather the results to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the local contribution of each rank to the solution\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n x[e.row] += e.value * b[e.column];\n }\n\n // Reduce the local contributions to the global solution on rank 0\n MPI_Reduce(x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix A_sparse(N, N);\n for (auto const& element : A) {\n A_sparse.insert(element.row, element.column) = element.value;\n }\n\n // Create dense vectors for x and b\n Eigen::VectorXd x_dense(N), b_dense(N);\n for (size_t i = 0; i < N; ++i) {\n x_dense[i] = x[i];\n b_dense[i] = b[i];\n }\n\n // Solve the linear system\n Eigen::SimplicialLDLT> solver;\n solver.compute(A_sparse);\n x_dense = solver.solve(b_dense);\n\n // Copy the result back to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_dense[i];\n }\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(rowSums.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n\n // Gather x from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Perform Gauss-Jordan elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i].value) > std::abs(A[pivot_row * N + i].value)) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j].value, A[pivot_row * N + j].value);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n double pivot = A[i * N + 
i].value;\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j].value /= pivot;\n }\n b[i] /= pivot;\n for (size_t j = 0; j < N; ++j) {\n if (j != i) {\n double factor = A[j * N + i].value;\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k].value -= factor * A[i * N + k].value;\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(Kokkos::View &A, Kokkos::View &L, Kokkos::View &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Iterate over the rows of A\n Kokkos::parallel_for(A.extent(0), KOKKOS_LAMBDA(size_t i) {\n // Get the row of A\n const COOElement* row = &A(i);\n\n // Iterate over the columns of the row\n for (size_t j = 0; j < row->column; ++j) {\n // Get the value of the element\n double value = row->value;\n\n // Update L and U\n for (size_t k = 0; k < i; ++k) {\n L(i, k) -= value * U(k, j);\n }\n U(i, j) = value;\n }\n });\n}\n", + "\n\n // Create a parallel for loop to iterate over the rows of A\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n // Initialize the ith row of L and U to the identity matrix\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n\n // Iterate over the non-zero elements in the ith row of A\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n // Update the ith row of L and U\n L(i, A(j).column) = A(j).value;\n U(i, A(j).column) = A(j).value;\n\n // Iterate over the rows below the ith row\n for (size_t k = i + 1; k < N; ++k) {\n // Update the kth row of L and U\n L(k, A(j).column) -= A(j).value * L(k, i);\n U(k, A(j).column) -= A(j).value * U(k, i);\n }\n }\n }\n });\n}\n", + "\n\n // Initialize L and U\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = A(i, i).value;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n L(i, k) = A(i, k).value / U(k, k);\n for (size_t j = k + 1; j < N; ++j) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n", + "\n\n // Initialize L and U to identity matrices\n 
Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Iterate over the rows of A\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n // Iterate over the columns of A\n for (size_t j = 0; j < N; j++) {\n // If A(i, j) is non-zero, then update L and U\n if (A(i, j).value != 0.0) {\n // Update L\n L(i, j) = A(i, j).value / U(j, j);\n\n // Update U\n for (size_t k = j + 1; k < N; k++) {\n U(j, k) -= L(i, j) * A(i, k).value;\n }\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy>(0, N), KOKKOS_LAMBDA(const int &i, const int &j) {\n if (i == j) {\n L(i, j) = 1.0;\n U(i, j) = A(i, j);\n } else if (i > j) {\n L(i, j) = A(i, j) / U(j, j);\n U(i, j) = 0.0;\n } else {\n U(i, j) = A(i, j) / L(i, i);\n L(i, j) = 0.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n L(j, i) -= L(j, k) * U(k, i);\n }\n L(j, i) /= U(i, i);\n }\n });\n}\n", + "\n\n // Create a copy of A\n auto Acopy = KokKOS::make_view(A);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n Acopy(j, i) -= Acopy(j, i) * Acopy(i, i);\n }\n }\n\n // Store L and U\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n L(i, j) = 1.0;\n } else if (i < j) {\n L(i, j) = Acopy(i, j);\n } else {\n U(i, j) = Acopy(i, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0.0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n L(i, j) -= L(i, k) * U(k, j);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n U(i, j) /= L(i, i);\n }\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < 
N; ++j) {\n if (i == j) {\n U(i, j) += A(i, j).value;\n } else {\n L(i, j) += A(i, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0;\n U(i, j) = 0;\n }\n L(i, i) = 1;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), KOKKOS_LAMBDA(const Kokkos::TeamPolicy::member_type &team) {\n const size_t i = team.league_rank();\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < i; k++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n if (i == j) {\n L(i, i) = 1 / U(i, i);\n } else {\n L(i, j) = -U(i, j) * L(j, j);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L(i, k) = A(i, k) / U(k, k);\n for (size_t j = k + 1; j < N; j++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA(const size_t k) {\n const auto &e = A(k);\n const double a_kk = U(e.row, e.row);\n const double l_ik = L(e.column, e.row);\n U(e.row, e.column) = e.value - l_ik * a_kk;\n for (size_t i = e.row + 1; i < N; i++) {\n const double l_ik = L(e.column, e.row);\n const double a_ki = U(e.row, i);\n L(i, e.column) = (e.value - a_ki * a_kk) / l_ik;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t row) {\n double sumL = 0.0;\n double sumU = 0.0;\n for (size_t i = 0; i < row; i++) {\n sumL += L(row, i) * U(i, row);\n sumU += L(i, row) * U(row, i);\n }\n L(row, row) = 1.0;\n U(row, row) = A(row, row) - sumU;\n for (size_t col = row + 1; col < N; col++) {\n L(row, col) = (A(row, col) - sumL) / U(row, row);\n U(row, col) = A(row, col) - sumU;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA(const int& k) {\n const 
COOElement& element = A(k);\n const size_t i = element.row;\n const size_t j = element.column;\n const double value = element.value;\n U(i, j) = value;\n for (size_t p = 0; p < i; p++) {\n L(i, p) += value * L(j, p);\n U(j, p) -= value * U(i, p);\n }\n L(i, i) -= value * L(j, i);\n U(j, j) += value * U(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; ++j) {\n double sum = 0;\n for (size_t k = 0; k < j; ++k) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U using the k-th row of A\n for (size_t i = k + 1; i < N; i++) {\n L(i, k) = A(i, k) / U(k, k);\n for (size_t j = k + 1; j < N; j++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sumL = 0.0, sumU = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization in parallel\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k].value / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= 
U[k * N + j] * U[i * N + k];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n U[i * N + j] = A[i * N + j] - L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n #pragma opm parallel for\n for (size_t k = 0; k < A.size(); ++k) {\n size_t row = A[k].row;\n size_t column = A[k].column;\n double value = A[k].value;\n #pragma opm atomic\n L[row * N + column] += value;\n #pragma opm atomic\n U[column * N + row] += value;\n }\n #pragma opm parallel for\n for (size_t i = 0; i < N; ++i) {\n #pragma opm atomic\n L[i * N + i] = 1;\n }\n}\n", + "<=====,", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n if (row == column)\n L[row * N + column] = 1;\n if (row < column)\n U[row * N + column] = value;\n if (row > column)\n L[row * N + column] = value;\n }\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = 0; i < k; ++i) {\n U[k * N + j] -= L[k * N + i] * U[i * N + j];\n }\n }\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = 0; j < k; ++j) {\n L[i * N + k] -= U[i * N + j] * L[j * N + k];\n }\n L[i * N + k] /= U[k * N + k];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement &e = A[i];\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1.0;\n U[e.row * N + e.column] = e.value;\n } else if (e.row < e.column) {\n L[e.row * N + e.column] = e.value / U[e.row * N + e.row];\n U[e.column * N + e.row] = 0.0;\n } else {\n U[e.row * N + e.column] = e.value;\n L[e.column * N + e.row] = 0.0;\n }\n }\n\n #pragma omp parallel for\n for (size_t k = 1; k < N; k++) {\n for (size_t i = k; i < N; i++) {\n for (size_t j = 0; j < k; j++) {\n U[i * N + k] -= L[i * N + j] * U[j * N + k];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n L[k * N + j] -= L[k * N + k] * U[k * N + j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sumL = 0.0, sumU = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N 
+ j] * L[i * N + k];\n U[i * N + j] = A[i * N + j] - L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L[i * N + k] * U[k * N + j];\n }\n if (i == j) {\n L[i * N + i] = 1;\n U[i * N + i] = A[i].value - sum;\n } else {\n L[i * N + j] = (A[i].value - sum) / U[j * N + j];\n U[i * N + j] = 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform the factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n }\n\n // Update U\n for (size_t j = k + 1; j < N; j++) {\n for (size_t i = k + 1; i < N; i++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n A[i * N + j] -= U[k * N + j] * A[i * N + k];\n L[i * N + j] = A[i * N + j] / L[k * N + k];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n double sum = 0;\n for (size_t k = 0; k < j; ++k) {\n sum += L[i][k] * U[k][j];\n }\n L[i][j] = (A[i][j] - sum) / U[j][j];\n }\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[i][k] * U[k][i];\n }\n U[i][i] = A[i][i] - sum;\n for (size_t j = i + 1; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[j][k] * U[k][j];\n }\n U[i][j] = (A[i][j] - sum) / L[i][i];\n }\n }\n\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n 
for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n A[i * N + j] -= U[k * N + j] * A[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.emplace_back(e.row, e.column, 1);\n U.emplace_back(e.row, e.column, e.value);\n } else {\n L.emplace_back(e.row, e.column, e.value / rowSum[e.row]);\n U.emplace_back(e.row, e.column, 0);\n }\n }\n for (size_t i = 1; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n for (size_t k = 0; k < N; k++) {\n U[i * N + k] -= L[i * N + j] * U[j * N + k];\n }\n }\n }\n}\n", + "\n\n std::vector row(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1;\n U[e.row * N + e.column] = e.value;\n } else {\n row[e.column] = e.value / U[e.row * N + e.row];\n for (size_t i = e.column; i < N; ++i) {\n L[e.row * N + i] = row[i];\n U[e.row * N + i] -= row[i] * U[e.column * N + i];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& e : A) {\n L[e.row * N + e.column] = (e.row == e.column) ? 
1 : 0;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = 0; i < k; ++i) {\n U[k * N + j] -= L[k * N + i] * U[i * N + j];\n }\n L[j * N + k] = U[j * N + k] / U[k * N + k];\n for (size_t i = k + 1; i < N; ++i) {\n U[j * N + i] -= L[j * N + k] * U[k * N + i];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& element : A) {\n L[element.row * N + element.column] = element.value;\n U[element.row * N + element.column] = element.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (L[i * N + j] != 0) {\n double factor = L[i * N + j] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= factor * L[i * N + k];\n U[i * N + k] -= factor * U[j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n L[i * N + j] = 1;\n } else if (i > j) {\n L[i * N + j] = L_temp[i][j] / L_temp[j][j];\n for (size_t k = 0; k < N; k++) {\n U_temp[i][k] -= L[i * N + j] * U_temp[j][k];\n }\n } else {\n U[i * N + j] = U_temp[i][j];\n }\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.push_back(1);\n U.push_back(e.value / rowSum[e.row]);\n } else if (e.row < e.column) {\n L.push_back(0);\n U.push_back(e.value / rowSum[e.row]);\n } else {\n L.push_back(e.value / rowSum[e.column]);\n U.push_back(0);\n }\n }\n}\n", + "\n\n // Initialize L and U to be identity matrices\n L.resize(N * N);\n U.resize(N * N);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n U[i * N + i] = 1;\n }\n\n // Iterate over the elements of A\n for (auto const& element : A) {\n // Update L and U\n for (size_t i = 0; i < element.row; i++) {\n L[element.row * N + i] -= L[element.column * N + i] * element.value;\n }\n for (size_t j = element.column + 1; j < N; j++) {\n U[element.row * N + j] -= U[element.column * N + j] * element.value;\n }\n }\n}\n", + "\n\n std::vector> row_ptr(N + 1);\n std::vector col_idx(A.size());\n std::vector values(A.size());\n for (size_t i = 0; i < A.size(); i++) {\n row_ptr[A[i].row + 1]++;\n col_idx[i] = A[i].column;\n values[i] = A[i].value;\n }\n for (size_t i = 1; i <= N; i++) {\n row_ptr[i] += row_ptr[i - 1];\n }\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < A.size(); i++) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double value = A[i].value;\n U[row_ptr[row] + col - 1] = value;\n for (size_t j = row_ptr[row]; j < row_ptr[row + 1]; j++) {\n if (col_idx[j] < col) {\n L[row_ptr[col] + col_idx[j] - 1] += value * U[row_ptr[row] + col_idx[j] - 1];\n } else if (col_idx[j] > col) {\n U[row_ptr[row] + col_idx[j] - 1] -= value * L[row_ptr[col] + col_idx[j] - 1];\n }\n }\n }\n for (size_t i = 0; i < N; i++) {\n L[row_ptr[i] + i - 1] = 1;\n }\n}\n", + "\n\n std::vector L_row, L_col, U_row, U_col;\n std::vector L_val, U_val;\n L_row.reserve(A.size());\n L_col.reserve(A.size());\n L_val.reserve(A.size());\n U_row.reserve(A.size());\n U_col.reserve(A.size());\n U_val.reserve(A.size());\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 
0;\n for (size_t k = 0; k < A.size(); k++) {\n if (A[k].row == i && A[k].column == j) {\n sumU = A[k].value;\n U_row.push_back(i);\n U_col.push_back(j);\n U_val.push_back(sumU);\n }\n if (A[k].row == i && A[k].column < j) {\n sumL += A[k].value;\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(sumL);\n }\n if (A[k].row < i && A[k].column == j) {\n sumL += A[k].value;\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(sumL);\n }\n }\n if (sumU == 0) {\n U_row.push_back(i);\n U_col.push_back(j);\n U_val.push_back(0);\n }\n if (sumL == 0) {\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(0);\n }\n }\n }\n L.resize(L_row.size());\n U.resize(U_row.size());\n for (size_t i = 0; i < L_row.size(); i++) {\n L[i].row = L_row[i];\n L[i].column = L_col[i];\n L[i].value = L_val[i];\n }\n for (size_t i = 0; i < U_row.size(); i++) {\n U[i].row = U_row[i];\n U[i].column = U_col[i];\n U[i].value = U_val[i];\n }\n}\n", + "\n\n std::vector> L_matrix(N, std::vector(N, 0.0));\n std::vector> U_matrix(N, std::vector(N, 0.0));\n\n for (auto const& element : A) {\n L_matrix[element.row][element.column] = element.value;\n U_matrix[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n L_matrix[i][j] = 1.0;\n } else if (i > j) {\n L_matrix[i][j] = 0.0;\n } else {\n U_matrix[i][j] = 0.0;\n }\n }\n }\n\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n double factor = L_matrix[i][k] / L_matrix[k][k];\n for (size_t j = k; j < N; ++j) {\n L_matrix[i][j] -= factor * U_matrix[k][j];\n U_matrix[i][j] -= factor * L_matrix[k][j];\n }\n }\n }\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n L[i * N + j] = L_matrix[i][j];\n U[i * N + j] = U_matrix[i][j];\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n\n for (size_t i = 0; i < A.size(); i++) {\n L[A[i].row * N + A[i].column] = A[i].value;\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = L[i * N + k] / L[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] = U[i * N + j] - L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector row(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.emplace_back(COOElement{e.row, e.column, 1});\n row[e.column] = e.value;\n } else {\n U.emplace_back(COOElement{e.row, e.column, e.value});\n }\n }\n for (auto const& e : A) {\n if (e.row != e.column) {\n L.emplace_back(COOElement{e.row, e.column, e.value / row[e.column]});\n for (auto& u : U) {\n if (u.row == e.row && u.column == e.column) {\n u.value -= e.value * L.back().value;\n }\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n U[row * N + column] = value;\n for (size_t j = 0; j < row; ++j) {\n L[row * N + j] -= L[i * N + j] * value / U[j * N + j];\n }\n for (size_t j = row + 1; j < N; ++j) {\n U[row * N + j] -= L[i * N + j] * value / U[j * N + j];\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = 
element.value;\n }\n for (size_t i = 0; i < N; ++i) {\n L_temp[i][i] = 1;\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n L_temp[i][k] = L_temp[i][k] / L_temp[k][k];\n for (size_t j = k + 1; j < N; ++j) {\n U_temp[k][j] = U_temp[k][j] - L_temp[i][k] * U_temp[i][j];\n }\n }\n }\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n L.push_back(L_temp[i][j]);\n U.push_back(U_temp[i][j]);\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n rowSum[e.row] += e.value;\n }\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.push_back({e.row, e.column, 1});\n U.push_back({e.row, e.column, e.value / rowSum[e.row]});\n } else if (e.row < e.column) {\n L.push_back({e.row, e.column, e.value / rowSum[e.row]});\n U.push_back({e.row, e.column, 0});\n } else {\n L.push_back({e.row, e.column, 0});\n U.push_back({e.row, e.column, e.value});\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n L_temp[i][j] = 1;\n } else if (i > j) {\n L_temp[i][j] = 0;\n }\n }\n }\n\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n double factor = U_temp[i][k] / U_temp[k][k];\n L_temp[i][k] = factor;\n for (size_t j = k; j < N; j++) {\n U_temp[i][j] -= factor * U_temp[k][j];\n }\n }\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n L.push_back(L_temp[i][j]);\n U.push_back(U_temp[i][j]);\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto &e : A) {\n L[e.row * N + e.column] = e.value;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (L[j * N + i] != 0) {\n double factor = L[j * N + i] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= factor * L[i * N + k];\n U[i * N + k] -= factor * U[j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n std::vector rowSum(N, 0);\n for (auto const& element : A) {\n rowSum[element.row] += element.value;\n }\n\n for (auto const& element : A) {\n if (element.row == element.column) {\n L.push_back(1);\n U.push_back(element.value / rowSum[element.row]);\n } else if (element.row < element.column) {\n L.push_back(0);\n U.push_back(element.value / rowSum[element.row]);\n } else {\n L.push_back(element.value / rowSum[element.column]);\n U.push_back(0);\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& element : A) {\n L[element.row * N + element.column] = element.value;\n U[element.row * N + element.column] = element.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = L[j * N + i] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= multiplier * L[i * N + k];\n U[i * N + k] -= multiplier * U[j * N + k];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (auto const& e : A) {\n L[e.row * N + e.column] = e.value;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n L[j * N + i] = U[j * N + i] / U[i * N + i];\n for (size_t k = i; k < N; k++) {\n U[j * N + k] -= L[j * N + i] * U[i * 
N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format with sizeA elements.\n Use AMD HIP to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\n__global__ void luFactorize(const COOElement *A, size_t sizeA, double *L, double *U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[row * N + col];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n COOElement element = A[tid];\n int row = element.row;\n int col = element.column;\n double val = element.value;\n\n if (row == col) {\n L[row * N + row] = 1.0;\n U[row * N + col] = val;\n } else if (row < col) {\n L[row * N + col] = val / U[col * N + col];\n U[row * N + col] = 0.0;\n } else {\n U[row * N + col] -= L[row * N + row] * val;\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x;\n int start = tid + bid * num_threads;\n for (int i = start; i < sizeA; i += bid_size * num_threads) {\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = A[i].value / U[A[i].column];\n U[i] = 0.0;\n }\n }\n __syncthreads();\n for (int i = start; i < sizeA; i += bid_size * num_threads) {\n if (A[i].row > A[i].column) {\n U[i] -= L[i] * U[A[i].column];\n } else {\n L[i] -= L[i] * U[A[i].column];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n COOElement e = A[i];\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1.0;\n U[e.row * N + e.column] = e.value;\n for (int k = e.row + 1; k < N; k++) {\n L[k * N + e.column] = A[i].value / e.value;\n U[e.row * N + k] -= L[k * N + e.column] * e.value;\n }\n }\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1;\n } else {\n U[row * N + column] = value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n for (size_t i = tid; i < sizeA; i += stride) {\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = 0.0;\n U[i] = 0.0;\n }\n }\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n for (size_t j = tid; j < sizeA; j += stride) {\n if (A[j].row == i && A[j].column == k) {\n L[j] = A[j].value / U[k];\n U[j] = 0.0;\n }\n }\n for (size_t j = tid; j < sizeA; j += stride) {\n if (A[j].row == i && A[j].column > k) {\n U[j] -= L[j] * U[A[j].column];\n }\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n 
U[tid] = value;\n } else {\n L[tid] = 0.0;\n U[tid] = 0.0;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t i = A[tid].row;\n size_t j = A[tid].column;\n double v = A[tid].value;\n\n if (i == j) {\n L[i * N + i] = 1.0;\n U[i * N + j] = v;\n } else if (i < j) {\n L[i * N + j] = v;\n } else {\n U[i * N + j] = v;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[col * N + row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n\n if (row == column) {\n L[row * N + row] = 1.0;\n U[row * N + column] = value;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n U[row * N + column] = 0.0;\n } else {\n U[row * N + column] -= L[row * N + row] * value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n if (A[tid].row == A[tid].column) {\n L[tid] = 1;\n U[tid] = A[tid].value;\n } else if (A[tid].row < A[tid].column) {\n L[tid] = 0;\n U[tid] = A[tid].value;\n } else {\n L[tid] = A[tid].value / U[A[tid].column];\n U[tid] = 0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n const COOElement &a = A[tid];\n if (a.row == a.column) {\n L[a.row * N + a.column] = 1.0;\n U[a.row * N + a.column] = a.value;\n } else if (a.row < a.column) {\n L[a.row * N + a.column] = 0.0;\n U[a.row * N + a.column] = a.value;\n } else {\n L[a.row * N + a.column] = a.value / U[a.column * N + a.column];\n U[a.row * N + a.column] = 0.0;\n for (int k = a.column + 1; k < a.row; k++) {\n U[a.row * N + k] -= L[a.row * N + a.column] * U[a.column * N + k];\n }\n }\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n size_t col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row].value;\n } else if (row < col) {\n L[row * N + col] = 0;\n U[row * N + col] = A[row].value;\n } else {\n L[row * N + col] = A[row].value / U[col * N + col];\n U[row * N + col] = 0;\n }\n }\n}\n", + "\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= sizeA) return;\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = 0.0;\n U[i] = 0.0;\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n U[row * N + col] = 0.0;\n }\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column == col) {\n U[row * N + col] = A[i].value;\n } else if (A[i].row == row) {\n L[row * N + col] -= A[i].value * U[A[i].column * N + col];\n } else if (A[i].column == col) {\n U[row * N + col] -= A[i].value * L[row * N + A[i].row];\n }\n }\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n }\n U[row * N + col] = A[row * N + col].value;\n }\n __syncthreads();\n for (int k = 0; k < row; k++) {\n if (U[k * N + col] != 0.0) {\n L[row * N + k] = A[row * N + k].value / U[k * N + col];\n for (int j = col; j < N; j++) {\n U[row * N + j] -= L[row * N + 
k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n if (A[tid].row == A[tid].column) {\n L[tid * N + A[tid].column] = 1;\n U[tid * N + A[tid].column] = A[tid].value;\n } else {\n L[tid * N + A[tid].column] = 0;\n U[tid * N + A[tid].column] = A[tid].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int i = A[tid].row;\n int j = A[tid].column;\n double v = A[tid].value;\n if (i == j) {\n L[i * N + i] = 1;\n U[i * N + i] = v;\n } else if (i < j) {\n L[i * N + j] = v / U[j * N + j];\n U[i * N + j] = 0;\n } else {\n U[i * N + j] -= L[i * N + i] * v;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) {\n return;\n }\n const COOElement &a = A[tid];\n if (a.row == a.column) {\n L[a.row * N + a.column] = 1.0;\n U[a.row * N + a.column] = a.value;\n } else if (a.row > a.column) {\n L[a.row * N + a.column] = a.value / U[a.column * N + a.column];\n U[a.row * N + a.column] = 0.0;\n } else {\n U[a.row * N + a.column] = a.value;\n L[a.row * N + a.column] = 0.0;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else if (row < column) {\n L[tid] = 0.0;\n U[tid] = value;\n } else {\n L[tid] = value / U[column * N + column];\n U[tid] = 0.0;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
Store the result in L and U on rank 0.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Find the pivot element in column k\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k * N + k].value);\n for (size_t i = k + 1; i < N; ++i) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows k and pivot_row in L and U\n for (size_t j = 0; j < N; ++j) {\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n std::swap(U[k * N + j], U[pivot_row * N + j]);\n }\n\n // Update the remaining rows in column k\n for (size_t i = k + 1; i < N; ++i) {\n double multiplier = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j].value -= multiplier * A[k * N + j].value;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector L_recv(N * N, 0.0);\n std::vector U_recv(N * N, 0.0);\n MPI::COMM_WORLD.Recv(L_recv.data(), N * N, MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), N * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; ++j) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), N * N, MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector L_recv, U_recv;\n MPI::COMM_WORLD.Recv(L_recv.data(), L_recv.size(), MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), U_recv.size(), MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; j++) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), L.size(), MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), U.size(), MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Compute L and U in parallel\n for (size_t i = 0; i < N; ++i) {\n // Compute L(i, :)\n for (size_t j = 0; j < i; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < j; ++k) {\n sum += L[i * N + k] * U[k * N + j];\n }\n L[i * N + j] = (A[i].value - sum) / U[j * N + j];\n }\n\n // Compute U(i, :)\n for (size_t j = i; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[i * N + k] * U[k * N + j];\n }\n U[i * N + j] = A[i].value - sum;\n }\n }\n\n // Gather L and U on rank 0\n if (MPI::COMM_WORLD.Get_rank() 
== 0) {\n MPI::COMM_WORLD.Gather(&L[0], N * N, MPI::DOUBLE, &L[0], N * N, MPI::DOUBLE, 0);\n MPI::COMM_WORLD.Gather(&U[0], N * N, MPI::DOUBLE, &U[0], N * N, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j].value, A[pivot_row * N + j].value);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Eliminate the elements below the pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = factor;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= factor * A[k * N + j].value;\n }\n }\n }\n\n // Copy the upper triangular matrix to U\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i; j < N; j++) {\n U[i * N + j] = A[i * N + j].value;\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A and store the result in L\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k; j < N; ++j) {\n L[i * N + k] -= U[i * N + j] * U[k * N + j];\n }\n }\n\n // For each column j > k, subtract the k-th column of L from the j-th column of A and store the result in U\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = k; i < N; ++i) {\n U[k * N + j] -= L[i * N + k] * U[i * N + j];\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A\n for (size_t i = k + 1; i < N; i++) {\n double factor = U[i * N + k] / U[k * N + k];\n for (size_t j = k; j < N; j++) {\n U[i * N + j] -= factor * U[k * N + j];\n }\n L[i * N + k] = factor;\n }\n }\n\n // Gather the L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n std::vector L(A);\n std::vector U(A);\n\n // Perform Gaussian elimination\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the pivot row\n MPI_Bcast(&A[k], sizeof(COOElement) * A[k].size(), MPI_BYTE, k, 
MPI_COMM_WORLD);\n\n // Eliminate the pivot column in the remaining rows\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n double factor = A[i][k].value / A[k][k].value;\n for (size_t j = k; j < N; ++j) {\n A[i][j].value -= factor * A[k][j].value;\n }\n }\n\n // Update L and U\n L[k].value = 1.0;\n for (size_t j = k + 1; j < N; ++j) {\n L[j].value = A[j][k].value;\n U[k].value = A[k][j].value;\n }\n }\n\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A\n for (size_t i = k + 1; i < N; ++i) {\n double factor = U[i * N + k] / U[k * N + k];\n for (size_t j = k; j < N; ++j) {\n U[i * N + j] -= factor * U[k * N + j];\n }\n L[i * N + k] = factor;\n }\n }\n\n // Gather the L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot row\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (A[i * N + k] > A[pivot_row * N + k]) {\n pivot_row = i;\n }\n }\n\n // Swap the pivot row with the k-th row in L and U\n for (size_t j = 0; j < N; j++) {\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n std::swap(U[k * N + j], U[pivot_row * N + j]);\n }\n\n // Eliminate the k-th column in the remaining rows\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n L[i * N + k] = factor;\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(A.size());\n U.resize(A.size());\n for (size_t i = 0; i < A.size(); ++i) {\n L[i].row = A[i].row;\n L[i].column = A[i].column;\n L[i].value = 0;\n U[i].row = A[i].row;\n U[i].column = A[i].column;\n U[i].value = 
0;\n }\n\n // Perform LU factorization\n for (size_t i = 0; i < N; ++i) {\n // Update L\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i && A[j].column < i) {\n L[j].value = A[j].value / U[j].value;\n }\n }\n\n // Update U\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row < i && A[j].column == i) {\n U[j].value = A[j].value / L[j].value;\n }\n }\n\n // Update A\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row >= i && A[j].column >= i) {\n A[j].value -= L[j].value * U[j].value;\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recvL(A.size());\n std::vector recvU(A.size());\n MPI::COMM_WORLD.Recv(recvL.data(), A.size(), MPI_DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(recvU.data(), A.size(), MPI_DOUBLE, i, 0);\n for (size_t j = 0; j < A.size(); ++j) {\n L[j].value += recvL[j].value;\n U[j].value += recvU[j].value;\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), A.size(), MPI_DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), A.size(), MPI_DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < A.size(); i++) {\n if (std::abs(A[i].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i].value);\n }\n }\n\n // Swap the pivot row with the k-th row in A\n std::swap(A[k], A[pivot_row]);\n\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n double multiplier = A[i].value / A[k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; j++) {\n U[k * N + j] -= multiplier * A[i].value;\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n\n // Initialize the diagonal of L to 1\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Iterate over the rows of A\n for (size_t k = 0; k < A.size(); k++) {\n // Get the row, column, and value of the current element\n size_t row = A[k].row;\n size_t column = A[k].column;\n double value = A[k].value;\n\n // Update the elements of L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = value / U[k * N + k];\n U[i * N + column] -= L[i * N + k] * U[k * N + column];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i > k\n for (size_t i = k + 1; i < N; i++) {\n // Calculate the multiplier\n double multiplier = A[i].value / U[k * N + k];\n\n // Update the i-th row of L\n for (size_t j = 0; j < N; j++) {\n L[i * N + j] += multiplier * U[k * N + j];\n }\n\n // Update the i-th row of U\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= multiplier * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform 
LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = A[k].value;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > std::abs(pivot_value)) {\n pivot_row = i;\n pivot_value = A[i * N + k].value;\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j].value, A[pivot_row * N + j].value);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Eliminate the elements below the pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k].value / pivot_value;\n L[i * N + k] = factor;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= factor * A[k * N + j].value;\n U[i * N + j] = A[i * N + j].value;\n }\n }\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector L_recv(N * N, 0.0);\n std::vector U_recv(N * N, 0.0);\n MPI::COMM_WORLD.Recv(L_recv.data(), N * N, MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), N * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; j++) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), N * N, MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each rank, update the k-th row of L and the remaining rows of U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n\n // Gather the updated L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each rank, update the k-th row of L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "<,", + "_t* 0." + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format with sizeA elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\n__global__ void luFactorize(const COOElement *A, size_t sizeA, double *L, double *U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n int row = blockIdx.x;\n int col = threadIdx.x;\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[row * N + col].value;\n for (int k = 0; k < row; k++) {\n U[row * N + col] -= L[row * N + k] * U[k * N + col];\n }\n L[row * N + col] = U[row * N + col] / U[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < i; j++) {\n sum += L[row * N + j] * U[j * N + A[i].column];\n }\n U[row * N + A[i].column] = A[i].value - sum;\n }\n if (A[i].column == row) {\n L[row * N + A[i].row] = A[i].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else if (row < column) {\n L[tid] = 0.0;\n U[tid] = value;\n } else {\n L[tid] = value / U[column * N + column];\n U[tid] = 0.0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + column] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) {\n return;\n }\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = block.y * block.x + thread.y * N + thread.x;\n if (row < sizeA) {\n if (A[row].row == A[row].column) {\n L[A[row].row * N + A[row].column] = 1.0;\n U[A[row].row * N + A[row].column] = A[row].value;\n } else {\n L[A[row].row * N + A[row].column] = A[row].value / U[A[row].column * N + A[row].column];\n U[A[row].row * N + A[row].column] = 0.0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n double sum = 0.0;\n for (int k = col; k < N; k++) {\n sum += L[row * N + k] * U[k * N + col];\n }\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n }\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row < col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n for (size_t k = col + 1; k < N; k++) {\n U[row * N + k] -= L[row * N + col] * U[col * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n double sum = 0.0;\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row && A[k].column == col) {\n sum += A[k].value * U[col * N + A[k].column];\n }\n }\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row >= N || col >= N) return;\n\n double sum = 0;\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row && A[k].column < col) {\n sum += L[row * N + A[k].column] * U[A[k].column * N + col];\n }\n }\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row * N + col] - sum;\n } else {\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n U[row * N + col] = 0;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) 
{\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n }\n U[row * N + col] = A[row * N + col].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n } else if (row < column) {\n L[tid] = 0.0;\n } else {\n U[tid] = value / L[column * N + column];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n U[row * N + column] = value;\n } else if (row < column) {\n L[row * N + column] = value;\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < sizeA; ++i) {\n const auto row = A[i].row;\n const auto column = A[i].column;\n const auto value = A[i].value;\n if (row == column) {\n L[row * N + column] = 1;\n } else if (row < column) {\n U[row * N + column] = value;\n } else {\n L[row * N + column] = value;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. Store the result in L and U on rank 0.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Initialize the diagonal elements of L to 1\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update the k-th row of L and U\n #pragma omp parallel for\n for (size_t j = k + 1; j < N; j++) {\n L[k * N + j] = A[k].value / U[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n U[i * N + j] -= L[k * N + j] * A[i].value;\n }\n }\n\n // Update the remaining rows of L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n for (size_t j = k + 1; 
j < N; j++) {\n L[i * N + j] -= L[i * N + k] * L[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Factorize A\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n A[i].value -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Update U\n #pragma omp parallel for\n for (size_t j = k + 1; j < N; ++j) {\n U[k * N + j] = A[k].value / L[k * N + k];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L 
and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; ++j) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform Gauss elimination\n for (size_t k = 0; k < N; ++k) {\n // Rank 0 broadcasts the k-th row of U to all other ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 updates the k-th row of L\n for (size_t j = k + 1; j < N; ++j) {\n L[k * N + j] = A[j].value / U[k * N + k];\n }\n\n // All ranks update the remaining rows of U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[k * N + j] * U[k * N + i];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k * N + k].value);\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // 
Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n double multiplier = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= multiplier * A[k * N + j].value;\n U[k * N + j].value += multiplier * A[i * N + j].value;\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < N; ++i) {\n if (std::abs(A[i].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i].value);\n }\n }\n\n // Swap the pivot row with the k-th row in A, L, and U\n if (pivot_row != k) {\n std::swap(A[k], A[pivot_row]);\n for (size_t i = 0; i < N; ++i) {\n std::swap(L[k * N + i], L[pivot_row * N + i]);\n std::swap(U[k * N + i], U[pivot_row * N + i]);\n }\n }\n\n // Eliminate the elements below the pivot element\n for (size_t i = k + 1; i < N; ++i) {\n double multiplier = A[i].value / A[k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; ++j) {\n A[i].value -= multiplier * A[k].value;\n U[i * N + j] -= multiplier * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j 
= k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, Kokkos::View &x, Kokkos::View &y, Kokkos::View &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x[j].index == i) sum += alpha * x[j].value;\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y[j].index == i) sum += y[j].value;\n }\n z[i] = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA (const int i) {\n z[i] = alpha * x[i].value + y[i].value;\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(\"sparse_py\", 0, z.extent(0), KOKKOK_LAMBDA(size_t i) {\n z[i] = alpha * x[i].value + y[i].value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n sum += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n sum += y[j].value;\n }\n }\n z[i] = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) sum += alpha * x(j).value;\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) sum += y(j).value;\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n 
for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n // Create a Kokkos kernel to perform the sparse axpy operation\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x(j).index == i) sum += alpha * x(j).value;\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y(j).index == i) sum += y(j).value;\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy<>(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i, double &sum) {\n sum += alpha * x(i).value + y(i).value;\n }, z(0));\n}\n", + "\n\n const size_t num_elements_x = x.extent(0);\n const size_t num_elements_y = y.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, num_elements_x + num_elements_y),\n KOKKOS_LAMBDA(const size_t i) {\n if (i < num_elements_x) {\n const size_t index = x(i).index;\n z(index) += alpha * x(i).value;\n } else {\n const size_t index = y(i - num_elements_x).index;\n z(index) += y(i - num_elements_x).value;\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i, double &sum) {\n sum += alpha * x(i).value + y(i).value;\n }, z(0));\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), 
KOKKOS_LAMBDA (const int i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t n = x.size();\n size_t m = y.size();\n z.resize(std::max(n, m));\n #pragma omp parallel for\n for (size_t i = 0; i < std::max(n, m); i++) {\n z[i] = 0;\n }\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n z[x[i].index] += alpha * x[i].value;\n }\n #pragma omp parallel for\n for (size_t i = 0; i < m; i++) {\n z[y[i].index] += y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += 
alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "serial", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. 
Store the result in z.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0, j = 0; i < x.size() || j < y.size();) {\n if (j == y.size() || (i < x.size() && x[i].index < y[j].index)) {\n z[x[i].index] += alpha * x[i].value;\n i++;\n } else if (i == x.size() || (j < y.size() && y[j].index < x[i].index)) {\n z[y[j].index] += y[j].value;\n j++;\n } else {\n z[x[i].index] += alpha * x[i].value + y[j].value;\n i++;\n j++;\n }\n }\n}\n", + "\n\n z.clear();\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z.push_back(alpha * x[i].value);\n i++;\n } else {\n z.push_back(y[j].value);\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(alpha * x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n z.clear();\n z.resize(x.size() + y.size());\n auto itx = x.begin();\n auto ity = y.begin();\n auto itz = z.begin();\n while (itx != x.end() && ity != y.end()) {\n if (itx->index < ity->index) {\n *itz = itx->value;\n ++itx;\n } else if (itx->index > ity->index) {\n *itz = ity->value;\n ++ity;\n } else {\n *itz = alpha * itx->value + ity->value;\n ++itx;\n ++ity;\n }\n ++itz;\n }\n while (itx != x.end()) {\n *itz = itx->value;\n ++itx;\n ++itz;\n }\n while (ity != y.end()) {\n *itz = ity->value;\n ++ity;\n ++itz;\n }\n}\n", + ".", + "\n\n // Initialize z with zeros\n z.assign(z.size(), 0);\n\n // Iterate over x and y\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n // If the indices match, add the values and increment both i and j\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n }\n // If the index of x is smaller, add the value to z and increment i\n else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n // If the index of y is smaller, add the value to z and increment j\n else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n\n // Add the remaining elements of x\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n\n // Add the remaining elements of y\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n z.clear();\n z.resize(x.size() + y.size());\n size_t i = 0, j = 0, k = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z[k].index = x[i].index;\n z[k].value = alpha * x[i].value;\n ++i;\n } else if (x[i].index > y[j].index) {\n z[k].index = y[j].index;\n z[k].value = y[j].value;\n ++j;\n } else {\n z[k].index = x[i].index;\n z[k].value = alpha * x[i].value + y[j].value;\n ++i;\n ++j;\n }\n ++k;\n }\n while (i < x.size()) {\n z[k].index = x[i].index;\n 
z[k].value = alpha * x[i].value;\n ++i;\n ++k;\n }\n while (j < y.size()) {\n z[k].index = y[j].index;\n z[k].value = y[j].value;\n ++j;\n ++k;\n }\n z.resize(k);\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n size_t m = z.size();\n for (size_t i = 0, j = 0, k = 0; i < m; ++i) {\n if (j < x.size() && x[j].index == i) {\n z[i] += alpha * x[j].value;\n ++j;\n }\n if (k < y.size() && y[k].index == i) {\n z[i] += y[k].value;\n ++k;\n }\n }\n}\n", + "\n\n for (size_t i = 0, j = 0; i < x.size() || j < y.size();) {\n if (j == y.size() || (i < x.size() && x[i].index < y[j].index)) {\n z[x[i].index] += alpha * x[i].value;\n ++i;\n } else if (i == x.size() || (j < y.size() && y[j].index < x[i].index)) {\n z[y[j].index] += y[j].value;\n ++j;\n } else {\n z[x[i].index] += alpha * x[i].value + y[j].value;\n ++i;\n ++j;\n }\n }\n}\n", + "\n\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n } else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n i++;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n j++;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n z.clear();\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z.push_back({x[i].index, alpha * x[i].value + y[j].value});\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z.push_back({x[i].index, alpha * x[i].value});\n i++;\n } else {\n z.push_back({y[j].index, y[j].value});\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back({x[i].index, alpha * x[i].value});\n i++;\n }\n while (j < y.size()) {\n z.push_back({y[j].index, y[j].value});\n j++;\n }\n}\n", + "\n\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n } else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n i++;\n } else if (x[i].index > y[j].index) {\n 
z.push_back(y[j].value);\n j++;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n size_t x_size = x.size();\n size_t y_size = y.size();\n size_t z_size = 0;\n size_t x_index = 0;\n size_t y_index = 0;\n while (x_index < x_size && y_index < y_size) {\n if (x[x_index].index == y[y_index].index) {\n z.push_back(alpha * x[x_index].value + y[y_index].value);\n z_size++;\n x_index++;\n y_index++;\n } else if (x[x_index].index < y[y_index].index) {\n z.push_back(alpha * x[x_index].value);\n z_size++;\n x_index++;\n } else {\n z.push_back(y[y_index].value);\n z_size++;\n y_index++;\n }\n }\n while (x_index < x_size) {\n z.push_back(alpha * x[x_index].value);\n z_size++;\n x_index++;\n }\n while (y_index < y_size) {\n z.push_back(y[y_index].value);\n z_size++;\n y_index++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n auto itx = x.begin();\n auto ity = y.begin();\n while (itx != x.end() && ity != y.end()) {\n if (itx->index == ity->index) {\n z.push_back(alpha * itx->value + ity->value);\n ++itx;\n ++ity;\n } else if (itx->index < ity->index) {\n z.push_back(alpha * itx->value);\n ++itx;\n } else {\n z.push_back(ity->value);\n ++ity;\n }\n }\n while (itx != x.end()) {\n z.push_back(alpha * itx->value);\n ++itx;\n }\n while (ity != y.end()) {\n z.push_back(ity->value);\n ++ity;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "hip", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors of size Nx and Ny. Store the result in z.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x or y.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\n__global__ void sparseAxpy(double alpha, const Element *x, const Element *y, double *z, size_t Nx, size_t Ny, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < Nx) {\n size_t index = x[tid].index;\n z[index] = alpha * x[tid].value + y[index];\n } else if (tid < Nx + Ny) {\n size_t index = y[tid - Nx].index;\n z[index] = alpha * 0 + y[tid - Nx].value;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = 0;\n\t\tfor (size_t j = 0; j < Nx; j++) {\n\t\t\tif (x[j].index == i) {\n\t\t\t\tz[i] += alpha * x[j].value;\n\t\t\t}\n\t\t}\n\t\tfor (size_t j = 0; j < Ny; j++) {\n\t\t\tif (y[j].index == i) {\n\t\t\t\tz[i] += y[j].value;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n int i;\n for (i = tid; i < N; i += blockDim.x * gridDim.x) {\n if (i < Nx) {\n z[x[i].index] += alpha * x[i].value;\n }\n if (i < Ny) {\n z[y[i].index] += y[i].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double val = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) val += alpha * x[i].value;\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) val += y[i].value;\n }\n z[tid] = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double val = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n val += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n val += y[i].value;\n }\n }\n z[tid] = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = alpha * x[i].value + y[i].value;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < Nx) {\n\t\tz[x[tid].index] += alpha * x[tid].value;\n\t}\n\telse if (tid < Nx + Ny) {\n\t\tz[y[tid - Nx].index] += y[tid - Nx].value;\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = 
alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) z[i] += alpha * x[j].value;\n }\n for (size_t j = 0; j < N; j++) {\n if (y[j].index == i) z[i] += y[j].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n\tsize_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tdouble sum = 0;\n\t\tfor (size_t i = 0; i < Nx; i++) {\n\t\t\tif (x[i].index == tid) {\n\t\t\t\tsum += alpha * x[i].value;\n\t\t\t}\n\t\t}\n\t\tfor (size_t i = 0; i < Ny; i++) {\n\t\t\tif (y[i].index == tid) {\n\t\t\t\tsum += y[i].value;\n\t\t\t}\n\t\t}\n\t\tz[tid] = sum;\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n \n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& e : x) {\n z_size = std::max(z_size, e.index + 1);\n }\n for (auto const& e : y) {\n z_size = std::max(z_size, e.index + 1);\n }\n \n // Allocate memory for the output vector\n z.resize(z_size);\n \n // Compute the output vector on rank 0\n if (rank == 0) {\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n }\n \n // Broadcast the output vector to all other ranks\n MPI_Bcast(&z[0], z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the partial sums\n std::vector partial_sums(z_size);\n for (auto const& element : x) {\n partial_sums[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_sums[element.index] += element.value;\n }\n\n // Reduce the partial sums\n std::vector global_sums(z_size);\n MPI_Allreduce(partial_sums.data(), global_sums.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (my_rank == 0) {\n z = global_sums;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the size of the result vector\n size_t zSize = 0;\n for (auto const& element : x) {\n zSize = std::max(zSize, element.index + 1);\n }\n for (auto const& element : y) {\n zSize = std::max(zSize, element.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(zSize);\n\n // Compute the result vector\n for (auto const& element : x) {\n z[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n z[element.index] += element.value;\n }\n\n // Gather the result vector on rank 0\n if (myRank == 0) {\n for (int i = 1; i < numRanks; ++i) {\n std::vector recvBuffer;\n MPI_Recv(&recvBuffer[0], recvBuffer.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recvBuffer.size(); ++j) {\n z[j] += recvBuffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n std::vector local_x, local_y;\n \n // Scatter x and y to all ranks\n MPI_Scatter(&x[0], x.size(), MPI_DOUBLE, &local_x[0], local_x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(&y[0], y.size(), MPI_DOUBLE, &local_y[0], local_y.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n \n // Compute local z\n std::vector local_z(z.size(), 0);\n for (size_t i = 0; i < local_x.size(); ++i) {\n local_z[local_x[i].index] += alpha * 
local_x[i].value;\n }\n for (size_t i = 0; i < local_y.size(); ++i) {\n local_z[local_y[i].index] += local_y[i].value;\n }\n \n // Gather local z to rank 0\n MPI_Gather(&local_z[0], local_z.size(), MPI_DOUBLE, &z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of elements in the sparse vector\n size_t num_elements = x.size();\n\n // Create a vector of zeros to store the result\n z.resize(num_elements);\n\n // Loop over the elements in the sparse vector\n for (size_t i = 0; i < num_elements; i++) {\n // Get the index and value of the element\n size_t index = x[i].index;\n double value = x[i].value;\n\n // Add the value of the element to the result\n z[index] += alpha * value;\n }\n\n // Loop over the elements in the sparse vector\n for (size_t i = 0; i < num_elements; i++) {\n // Get the index and value of the element\n size_t index = y[i].index;\n double value = y[i].value;\n\n // Add the value of the element to the result\n z[index] += value;\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements = x.size();\n\n // Create a buffer to store the result on each rank\n std::vector local_z(num_elements, 0.0);\n\n // Compute the local result\n for (size_t i = 0; i < num_elements; ++i) {\n local_z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_z.data(), z.data(), num_elements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n std::vector result;\n\n // Iterate over the elements in x and y\n for (size_t i = 0; i < num_elements_x; i++) {\n // Get the index and value of the current element in x\n size_t index_x = x[i].index;\n double value_x = x[i].value;\n\n // Iterate over the elements in y\n for (size_t j = 0; j < num_elements_y; j++) {\n // Get the index and value of the current element in y\n size_t index_y = y[j].index;\n double value_y = y[j].value;\n\n // If the indices of the current elements in x and y are equal, then add the values of the elements to the result\n if (index_x == index_y) {\n result.push_back({index_x, alpha * value_x + value_y});\n }\n }\n }\n\n // Sort the result vector by index\n std::sort(result.begin(), result.end(), [](Element const& a, Element const& b) { return a.index < b.index; });\n\n // Store the result in z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(result.size());\n for (size_t i = 0; i < result.size(); i++) {\n z[i] = result[i].value;\n }\n }\n}\n", + "\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the output vector\n z.resize(z_size, 0.0);\n\n // Compute the partial sums\n std::vector partial_sums(z_size, 0.0);\n for (auto const& element : x) {\n partial_sums[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_sums[element.index] += element.value;\n }\n\n // Reduce the partial sums\n std::vector global_sums(z_size, 0.0);\n MPI_Allreduce(partial_sums.data(), global_sums.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result in z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t resultSize = 0;\n for (auto const& element : x) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n for (auto const& element : y) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n\n // Create a vector to store the result on rank 0\n std::vector result(resultSize, 0);\n\n // Compute the result on each rank\n for (auto const& elementX : x) {\n result[elementX.index] += alpha * elementX.value;\n }\n for (auto const& elementY : y) {\n result[elementY.index] += elementY.value;\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recvBuffer(resultSize);\n MPI_Recv(recvBuffer.data(), resultSize, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < resultSize; j++) {\n result[j] += recvBuffer[j];\n }\n }\n z = result;\n } else {\n MPI_Send(result.data(), resultSize, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t result_size = 0;\n for (auto const& element : x) {\n result_size = std::max(result_size, element.index + 1);\n }\n for (auto const& element : y) {\n result_size = std::max(result_size, element.index + 1);\n }\n\n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(result_size, 0);\n }\n\n // Compute the partial result on each rank\n std::vector partial_result(result_size, 0);\n for (auto const& element : x) {\n partial_result[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_result[element.index] += element.value;\n }\n\n // Reduce the partial results to rank 0\n MPI_Reduce(partial_result.data(), z.data(), result_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the partial sum on each rank\n for (auto const& elem : x) {\n z[elem.index] += alpha * elem.value;\n }\n for (auto const& elem : y) {\n z[elem.index] += elem.value;\n }\n\n // Reduce the partial sums to rank 0\n MPI_Reduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the number of elements in the result vector\n size_t num_elements = x.size() + y.size();\n \n // Allocate memory for the result vector\n z.resize(num_elements);\n \n // Compute the local indices of the elements in the result vector\n std::vector local_indices(num_elements);\n for (size_t i = 0; i < num_elements; ++i) {\n local_indices[i] = i;\n }\n \n // Compute the local values of the elements in the result vector\n std::vector local_values(num_elements);\n for (size_t i = 0; i < num_elements; ++i) {\n local_values[i] = 0.0;\n }\n \n // Compute the global indices of the elements in the result vector\n std::vector global_indices(num_elements);\n MPI_Allgatherv(&local_indices[0], local_indices.size(), MPI_UNSIGNED_LONG, &global_indices[0], nullptr, nullptr, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n 
\n // Compute the global values of the elements in the result vector\n std::vector global_values(num_elements);\n MPI_Allgatherv(&local_values[0], local_values.size(), MPI_DOUBLE, &global_values[0], nullptr, nullptr, MPI_DOUBLE, MPI_COMM_WORLD);\n \n // Compute the final values of the elements in the result vector\n for (size_t i = 0; i < num_elements; ++i) {\n z[global_indices[i]] = alpha * x[global_indices[i]].value + y[global_indices[i]].value;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n \n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(z_size, 0);\n }\n \n // Compute the partial sums on each rank\n for (auto const& element : x) {\n if (rank == 0) {\n z[element.index] += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (rank == 0) {\n z[element.index] += element.value;\n }\n }\n \n // Reduce the partial sums to rank 0\n MPI_Reduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t result_size = 0;\n for (auto const& element : x) {\n result_size = std::max(result_size, element.index + 1);\n }\n for (auto const& element : y) {\n result_size = std::max(result_size, element.index + 1);\n }\n\n // Create a vector to store the result on rank 0\n std::vector result(result_size, 0.0);\n\n // Compute the result on each rank\n for (auto const& element : x) {\n result[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n result[element.index] += element.value;\n }\n\n // Gather the results from all ranks on rank 0\n MPI_Gather(result.data(), result_size, MPI_DOUBLE, z.data(), result_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n int num_elements = x.size();\n\n // Create a buffer to store the results\n std::vector z(num_elements);\n\n // Compute the results in parallel\n #pragma omp parallel for\n for (int i = 0; i < num_elements; i++) {\n z[i] = alpha * x[i].value * y[i].value;\n }\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = x.size() + y.size();\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the local size of the output vector\n size_t local_z_size = z_size / size;\n\n // Compute the starting index of the local output vector\n size_t local_z_start = rank * local_z_size;\n\n // Compute the local output vector\n for (size_t i = local_z_start; i < local_z_start + local_z_size; ++i) {\n z[i] = 0;\n }\n\n // Compute the local contribution to the output vector\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value;\n }\n\n for (size_t i = 0; i < y.size(); ++i) {\n z[y[i].index] += y[i].value;\n }\n\n // Reduce the local output vectors to the global output vector\n MPI_Allreduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 stores the result\n if (rank == 0) {\n // Sort the output vector by index\n std::sort(z.begin(), z.end(), [](const double& a, const 
double& b) { return a.index < b.index; });\n\n // Remove duplicate elements from the output vector\n z.erase(std::unique(z.begin(), z.end()), z.end());\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the size of the result vector\n size_t resultSize = 0;\n for (auto const& element : x) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n for (auto const& element : y) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n\n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(resultSize, 0);\n }\n\n // Compute the partial sums on each rank\n for (auto const& element : x) {\n if (rank == 0) {\n z[element.index] += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (rank == 0) {\n z[element.index] += element.value;\n }\n }\n\n // Reduce the partial sums to rank 0\n MPI_Allreduce(MPI_IN_PLACE, z.data(), resultSize, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a new vector z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(&z[0], z.size(), MPI::DOUBLE, &z[0], z.size(), MPI::DOUBLE, 0);\n}\n", + "\n\n // Create a new vector z with the same size as x\n z.resize(x.size());\n\n // Loop over the elements of x and y and perform the operation z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[i].index = x[i].index;\n z[i].value = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks into rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a new vector z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(&z[0], z.size(), MPI::DOUBLE, &z[0], z.size(), MPI::DOUBLE, 0);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "cuda", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors of size Nx and Ny. Store the result in z.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x or y.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\n__global__ void sparseAxpy(double alpha, const Element *x, const Element *y, double *z, size_t Nx, size_t Ny, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) sum += alpha * x[j].value;\n }\n for (size_t j = 0; j < Ny; j++) {\n if (y[j].index == i) sum += y[j].value;\n }\n z[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "sparseSparseArray(1,", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) { sum += alpha * x[i].value; }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) { sum += y[i].value; }\n }\n z[tid] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + 
} + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = alpha * x[i].value + y[i].value;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tsize_t i = threadIdx.x;\n\tif (i < Nx) {\n\t\tz[x[i].index] += alpha * x[i].value;\n\t}\n\tif (i < Ny) {\n\t\tz[y[i].index] += y[i].value;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * 
x[i].value + y[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tz[tid].value = alpha * x[tid].value + y[tid].value;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n size_t i = tid;\n double sum = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) {\n sum += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < Ny; j++) {\n if (y[j].index == i) {\n sum += y[j].value;\n }\n }\n z[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) sum += alpha * x[i].value;\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) sum += y[i].value;\n }\n z[tid] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == idx) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == idx) {\n sum += y[i].value;\n }\n }\n z[idx] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < Nx) {\n atomicAdd(&z[x[tid].index], alpha * x[tid].value);\n } else if (tid < Nx + Ny) {\n atomicAdd(&z[y[tid - Nx].index], y[tid - Nx].value);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Store the result in z on rank 0.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the output vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the corresponding element in x\n for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n break;\n }\n }\n\n // Find the corresponding element in y\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n break;\n }\n }\n\n // Compute the output value\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the output vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buffer;\n MPI_Recv(&recv_buffer[0], recv_buffer.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_buffer.size(); ++j) {\n z[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n }\n }\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n }\n }\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the result vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buffer;\n MPI_Recv(&recv_buffer[0], z_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < z_size; ++j) {\n z[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements_x = x.size();\n size_t 
num_elements_y = y.size();\n\n // Create a vector to store the result\n z.resize(num_elements_x + num_elements_y);\n\n // Loop over the elements in x and y and add them to the result\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x + num_elements_y; i++) {\n if (i < num_elements_x) {\n z[i] += alpha * x[i].value;\n }\n if (i < num_elements_y) {\n z[i] += y[i].value;\n }\n }\n\n // Reduce the result to rank 0\n MPI_Reduce(&z[0], z.size(), MPI_DOUBLE, &z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize z on rank 0\n if (rank == 0) {\n z.resize(x.size());\n }\n\n // Compute the result in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (rank == 0) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n }\n\n // Gather the results to rank 0\n MPI_Gather(z.data(), x.size(), MPI_DOUBLE, z.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n z.resize(num_elements_x + num_elements_y);\n\n // Perform the sparse axpy operation in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x + num_elements_y; ++i) {\n // Initialize the result to 0\n z[i] = 0.0;\n\n // Add the corresponding elements from x and y\n for (size_t j = 0; j < num_elements_x; ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < num_elements_y; ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Initialize the output vector\n z.resize(z_size, 0.0);\n\n // Compute the partial sums\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value;\n }\n\n #pragma omp parallel for\n for (size_t i = 0; i < y.size(); ++i) {\n z[y[i].index] += y[i].value;\n }\n\n // Reduce the partial sums\n MPI_Allreduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a temporary vector to store the partial results on each rank\n std::vector partial_z(z.size(), 0.0);\n\n // Compute the partial results in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n for (auto& element : x) {\n if (element.index == i) {\n partial_z[i] += alpha * element.value;\n }\n }\n for (auto& element : y) {\n if (element.index == i) {\n partial_z[i] += element.value;\n }\n }\n }\n\n // Reduce the partial results to rank 0 using MPI\n MPI_Reduce(partial_z.data(), z.data(), z.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = x.size() + y.size();\n\n // Allocate the output vector on rank 0\n if (rank == 0) {\n z.resize(z_size);\n }\n\n // Compute the local size of the output vector\n size_t local_z_size = (z_size + size - 1) / size;\n\n // Allocate the local output vector\n 
std::vector local_z(local_z_size);\n\n // Compute the local indices of the output vector\n std::vector local_z_indices(local_z_size);\n for (size_t i = 0; i < local_z_size; i++) {\n local_z_indices[i] = i + rank * local_z_size;\n }\n\n // Compute the local contributions to the output vector\n #pragma omp parallel for\n for (size_t i = 0; i < local_z_size; i++) {\n size_t index = local_z_indices[i];\n double value = 0.0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == index) {\n value += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == index) {\n value += y[j].value;\n }\n }\n local_z[i] = value;\n }\n\n // Gather the local contributions to the output vector on rank 0\n MPI_Gather(local_z.data(), local_z_size, MPI_DOUBLE, z.data(), local_z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements = x.size() + y.size();\n\n // Create a vector to store the result\n std::vector result(num_elements);\n\n // Merge the two sparse vectors into the result vector\n size_t i = 0, j = 0, k = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n result[k++] = x[i++];\n } else if (x[i].index > y[j].index) {\n result[k++] = y[j++];\n } else {\n result[k++] = {x[i].index, x[i++].value + y[j++].value};\n }\n }\n while (i < x.size()) {\n result[k++] = x[i++];\n }\n while (j < y.size()) {\n result[k++] = y[j++];\n }\n\n // Reduce the result vector to rank 0\n MPI_Reduce(result.data(), z.data(), num_elements, MPI_DOUBLE_INT, MPI_SUM, 0);\n}\n", + "\n\n // Initialize z to 0\n z.assign(x.size(), 0);\n\n // Compute z = alpha*x+y in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t z_size = x.size() + y.size();\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma openmp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n // Find the index of the element in x that matches the index of the current element in z\n auto it_x = std::find_if(x.begin(), x.end(), [&](Element const& element) { return element.index == i; });\n\n // Find the index of the element in y that matches the index of the current element in z\n auto it_y = std::find_if(y.begin(), y.end(), [&](Element const& element) { return element.index == i; });\n\n // Compute the value of the current element in z\n z[i] = (it_x != x.end() ? alpha * it_x->value : 0.0) + (it_y != y.end() ? 
it_y->value : 0.0);\n }\n\n // Gather the result vector on rank 0\n MPI_Gather(&z[0], z_size, MPI_DOUBLE, &z[0], z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vectors\n size_t n = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(n);\n\n // Compute the results in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results on rank 0\n MPI_Gather(temp.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z on all ranks\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0);\n}\n", + "\n\n // Check if the rank is 0\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n if (rank == 0) {\n // Initialize z with zeros\n z.resize(x.size());\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n }\n }\n\n // Perform sparse axpy operation in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (rank == 0) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t numElements = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(numElements);\n\n // Compute the result in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < numElements; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(temp.data(), numElements, MPI_DOUBLE, z.data(), numElements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t numElements = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(numElements);\n\n // Compute the result in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < numElements; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results on rank 0\n MPI_Gather(&temp[0], numElements, MPI_DOUBLE, &z[0], numElements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast alpha to all ranks\n MPI_Bcast(&alpha, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the output vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(z_size);\n }\n\n // Compute the output vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double value = 0.0;\n for (auto const& element : x) {\n if (element.index == i) {\n value += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (element.index == i) {\n value += element.value;\n }\n }\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z[i] = value;\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n for (auto const& elem : y) {\n z_size = 
std::max(z_size, elem.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the value of x at index i\n for (auto const& elem : x) {\n if (elem.index == i) {\n x_value = elem.value;\n break;\n }\n }\n\n // Find the value of y at index i\n for (auto const& elem : y) {\n if (elem.index == i) {\n y_value = elem.value;\n break;\n }\n }\n\n // Compute the value of z at index i\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the result vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::vector recv_buf;\n MPI_Recv(&recv_buf[0], recv_buf.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_buf.size(); ++j) {\n z[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the result vector\n z.resize(z_size, 0.0);\n\n // Compute the partial result on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the value of x at index i\n for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n break;\n }\n }\n\n // Find the value of y at index i\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n break;\n }\n }\n\n // Compute the partial result\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the partial results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::vector partial_result(z_size);\n MPI_Recv(partial_result.data(), z_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < z_size; ++j) {\n z[j] += partial_result[j];\n }\n }\n } else {\n MPI_Send(z.data(), z_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n std::vector result(num_elements_x);\n\n // Compute z = alpha*x+y in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x; i++) {\n result[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(result.data(), num_elements_x, MPI_DOUBLE, z.data(), num_elements_x, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(Kokkos::View const& s, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < s.extent(0); ++j) {\n if (s(j)[0] == 'a' + i) {\n ++bins[i];\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < s.extent(0); ++j) {\n if (s(j)[0] == 'a' + i) {\n ++bins[i];\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(0, s.extent(0), [&](size_t i) {\n char first = s(i)[0];\n size_t index = first - 'a';\n ++output(index);\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[s[i][0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins[s[i][0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "Counts, 0 0, 0,0,0, 0,0, ~:~:", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n int idx = s(i)[0] - 'a';\n if (idx >= 0 && idx < 26) {\n ++bins[idx];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s[i][0] - 'a']++;\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": 
"23_histogram_first_letter_counts", + "parallelism_model": "omp", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < 26; ++i) {\n char c = 'a' + i;\n for (auto const& str : s) {\n if (str[0] == c) {\n ++counts[i];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n size_t index = static_cast(s[i][0] - 'a');\n #pragma omp atomic\n bins[index]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n size_t index = s[i][0] - 'a';\n ++ didik[index];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "serial", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. 
Store the output in `bins` array.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n ++bins[str[0] - 'a'];\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n bins[str[0] - 'a']++;\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n ++bins[str[0] - 'a'];\n }\n}\n", + "\n\n for (std::string const& str : s) {\n if (str.empty()) {\n continue;\n }\n char first = str[0];\n if (first >= 'a' && first <= 'z') {\n bins[first - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n bins[str[0] - 'a']++;\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "hip", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\n__global__ void firstLetterCounts(const char **s, size_t N, size_t bins[26]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n char first = s[tid][0];\n atomicAdd(&bins[first - 'a'], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid < N) {\n char c = s[gid][0];\n atomicAdd(&bins[c - 'a'], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int index = s[gid][0] - 'a';\n atomicAdd(&bins[index], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n char first = s[idx][0];\n atomicAdd(&bins[first - 'a'], 1);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int bin = s[gid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n if (idx >= 0 && idx < 26) {\n atomicAdd(&bins[idx], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (size_t i = tid; i < N; i += stride) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n int bin = s[tid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n bins[s[gid][0] - 'a']++;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int index = s[gid][0] - 'a';\n atomicAdd(&bins[index], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) return;\n char first = s[idx][0];\n int offset = first - 'a';\n atomicAdd(&firstCounts[offset], 1);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int bin = s[gid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = 
bid * hipBlockDim_x + tid;\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of s. The result is stored in bins on rank 0.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ": \u51a2, Synag\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2 Saxe\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2zernezerne.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (const auto& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, 
MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet.\n std::array local_bins;\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n local_bins.fill(0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t n = s.size();\n std::array local_bins;\n local_bins.fill(0);\n for (size_t i = 0; i < n; i++) {\n if (rank == i % size) {\n local_bins[s[i][0] - 'a']++;\n }\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 
0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n std::fill(local_bins.begin(), local_bins.end(), 0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings starting with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) continue;\n ++local_bins[str[0] - 'a'];\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = s.size() / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = s.size();\n }\n std::array local_bins = {};\n for (int i = start; i < end; i++) {\n local_bins[s[i][0] - 
'a']++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "cuda", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\n__global__ void firstLetterCounts(const char **s, size_t N, size_t bins[26]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002909812, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 9.198080003e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int firstLetter = s[idx][0] - 'a';\n atomicAdd(&bins[firstLetter], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000341737, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 9.118719995e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002865466, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 8.968959898e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + 
"is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0003112678, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 9.000319988e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int start = tid * 26 / N;\n int end = (tid + 1) * 26 / N;\n for (int i = start; i < end; i++) {\n bins[i] = 0;\n }\n for (int i = 0; i < N; i++) {\n int idx = s[i][0] - 'a';\n if (idx >= 0 && idx < 26) {\n atomicAdd(&bins[idx], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int firstLetter = s[idx][0] - 'a';\n atomicAdd(&bins[firstLetter], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002823728, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 8.979199827e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n char firstLetter = s[idx][0];\n atomicAdd(&bins[firstLetter - 'a'], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + 
"did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002604574, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 9.192319959e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int bin = s[tid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000297215, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 9.22495991e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002897952, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 9.075199962e-05 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx >= N) {\n return;\n }\n bins[s[idx][0] - 'a']++;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int idx = s[tid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002827218, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 9.358719885e-05 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* For each letter in the alphabet, count 
the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of s. The result is stored in bins on rank 0.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n local_bins.fill(0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the first letter of its own strings\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n local_bins[s[i][0] - 'a']++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk = s.size() / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = s.size();\n }\n std::array local_bins;\n local_bins.fill(0);\n 
for (int i = start; i < end; i++) {\n local_bins[s[i][0] - 'a']++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins;\n local_bins.fill(0);\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n local_bins[s[i][0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank counts the first letter of its own strings\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = my_rank; i < s.size(); i += num_ranks) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the counts for a subset of the alphabet\n int start = rank * 26 / size;\n int end = (rank + 1) * 26 / size;\n\n std::array local_bins;\n local_bins.fill(0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n int index = s[i][0] - 'a';\n if (index >= start && index < end) {\n local_bins[index]++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the first letters in its portion of the vector\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = rank; i < s.size(); i += size) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n 
local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(std::begin( bins ), std::end( bins ), 0);\n\n // Count the number of words starting with each letter in parallel\n #pragma openmp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n ++ bins[s[i][0] - 'a'];\n }\n\n // Gather the results on rank 0\n MPI_Gather(MPI_IN_PLACE, 26, MPI_UNSIGNED_LONG, bins.data(), 26, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the first letter of the strings in its portion of the vector\n std::array local_bins = {};\n #pragma omp parallel for\n for (size_t i = rank * s.size() / nprocs; i < (rank + 1) * s.size() / nprocs; ++i) {\n local_bins[s[i][0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings in s that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins = {};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(Kokkos::View const& x, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(i / 10) += (x(i) < 100) ? 1 : 0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[x(i) / 10]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA(const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[x(i) / 10]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i / 10] += (x(i) >= 10 * i / 10 && x(i) < 10 * (i / 10 + 1));\n });\n}\n", + "\n\n Kokkos::parallel_for(bins.size(), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j] >= 10 * i && x[j] < 10 * (i + 1)) { bins[i] += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n const double val = x(i);\n update += (val >= 0 && val < 10) + (val >= 10 && val < 20) + (val >= 20 && val < 30) +\n (val >= 30 && val < 40) + (val >= 40 && val < 50) + (val >= 50 && val < 60) +\n (val >= 60 && val < 70) + (val >= 70 && val < 80) + (val >= 80 && val < 90) +\n (val >= 90 && val < 100);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n const double val = x(i);\n bins(val / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n const double val = x(i);\n bins(val / 10) += 1;\n });\n}\n", + "\n\n 
Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i / 10] += 1;\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n size_t idx = static_cast(x[i] / 10);\n ++counts[idx];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n size_t bin = static_cast(x[i] / 10);\n bins[bin]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)] += 1;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "serial", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. 
Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n std::fill(\u5803.begin(), \u5803.end(), 0);\n for (double const val : x) {\n \u5803[static_cast(val / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double val : x) {\n bins[static_cast(val / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n int i = std::min((int)(v - bins.front()), (int)bins.size() - 1);\n bins[i]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "hip", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use AMD HIP to compute in parallel. 
The kernel is initialized with at least as many threads as values in x.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\n__global__ void binsBy10Count(const double *x, size_t N, size_t bins[10]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipGridDim_x;\n for (size_t i = idx; i < N; i += stride) {\n bins[x[i] / 10] += 1;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipGridDim_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n int bin = (int)(x[i] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n int bin = (int)(x[i] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n int bin = x[i] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n int bin = (int)x[tid] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n bins[(int)(x[i] / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use MPI to compute in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. The result is stored in bins on rank 0.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n bins[bin]++;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv;\n MPI_Recv(recv.data(), recv.size(), MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < bins.size(); j++) {\n bins[j] += recv[j];\n }\n }\n } else {\n MPI_Send(bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of values in each bin\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n\n // Reduce the bins across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local range.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n bins[bin]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of values in each bin on each rank\n std::array local_count = {};\n for (auto v : x) {\n local_count[static_cast(v / 10)]++;\n }\n\n // Gather the local counts into bins on rank 0\n MPI_Gather(local_count.data(), 10, MPI_UNSIGNED_LONG, bins.data(), 10, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its subset of x\n std::array local_bins = {};\n for (double xi : x) {\n local_bins[static_cast(xi / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double xi : x) {\n int bin = static_cast(xi / 10);\n bins[bin]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), 
MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n for (double val : x) {\n int bin = static_cast(val / 10);\n if (bin < 10) {\n bins[bin]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array localBins{};\n for (double v : x) {\n localBins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(localBins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (auto v : x) {\n bins[v / 10]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] >= 0 && x[i] < 10) {\n bins[0]++;\n } else if (x[i] >= 10 && x[i] < 20) {\n bins[1]++;\n } else if (x[i] >= 20 && x[i] < 30) {\n bins[2]++;\n } else if (x[i] >= 30 && x[i] < 40) {\n bins[3]++;\n } else if (x[i] >= 40 && x[i] < 50) {\n bins[4]++;\n } else if (x[i] >= 50 && x[i] < 60) {\n bins[5]++;\n } else if (x[i] >= 60 && x[i] < 70) {\n bins[6]++;\n } else if (x[i] >= 70 && x[i] < 80) {\n bins[7]++;\n } else if (x[i] >= 80 && x[i] < 90) {\n bins[8]++;\n } else if (x[i] >= 90 && x[i] < 100) {\n bins[9]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local part of x.\n std::array localBins = {};\n for (double v : x) {\n localBins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts.\n MPI_Reduce(localBins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its subset of x\n std::array local_bins = {};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), recv_bins.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < bins.size(); j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), local_bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local range.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int 
rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n if (bin < 10) {\n bins[bin]++;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array tmp;\n MPI_Recv(tmp.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; j++) {\n bins[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its own bins.\n std::array local_bins = {};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), recv_bins.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_bins.size(); j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), local_bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "cuda", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use CUDA to compute in parallel. 
The kernel is initialized with at least as many threads as values in x.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\n__global__ void binsBy10Count(const double *x, size_t N, size_t bins[10]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0061125282, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00167557759285 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0061165178, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00155772161484 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0061108292, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00191150717735 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0061087792, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00183470079899 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1192343994, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00170546560287 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0061149908, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016327423811 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0061170788, + "runs": [ + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.00165331199169 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.006113518, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00161279997826 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0061150268, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018522495985 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0061212726, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00163328642845 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0061113938, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00191181440353 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0061159328, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00191450240612 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0061184532, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00178737280369 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0061234006, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00168938879967 + } + ] + }, + { + "generated_output": "\nint idx = threadIdx.x + blockIdx.x * blockDim.x;\nif (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, 
+ "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.006111546, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015942719698 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n int bin = (int)floor(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.006117327, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00175483520031 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0061122936, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00191284480095 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.006110406, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001762246418 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.006128885, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00152278397083 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of values in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of values in its local portion of x\n std::array local_bins;\n std::fill(local_bins.begin(), local_bins.end(), 0);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce local bins to global bins\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins = {};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the bin size\n const double bin_size = (x.back() - x.front()) / bins.size();\n\n // Compute the local histogram\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n const double value = x[i];\n const int bin_index = std::min(static_cast(bins.size() - 1),\n static_cast(std::floor((value - x.front()) / bin_size)));\n ++local_bins[bin_index];\n }\n\n // Reduce the local histograms to the global histogram\n MPI_Reduce(MPI_IN_PLACE, local_bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ", 10]", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double value : x) {\n local_bins[static_cast(value / 10)]++;\n }\n\n // Reduce the local counts into the global bins array on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n int bin = static_cast(x[i] / 10);\n if (bin < 10) {\n #pragma omp atomic\n ++bins[bin];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::array local_bins = {};\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n int bin = static_cast(x[i] / 10);\n local_bins[bin]++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the counts for a subset of the bins.\n size_t start_bin = rank * bins.size() / nprocs;\n size_t end_bin = (rank + 1) * bins.size() / nprocs;\n\n std::fill(bins.begin() + start_bin, bins.begin() + end_bin, 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n size_t bin = static_cast(x[i] / 10);\n if (bin >= start_bin && bin < end_bin) {\n ++bins[bin];\n }\n }\n\n // Gather the counts from all ranks into bins on rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n int bin = static_cast(x[i]) / 10;\n #pragma omp atomic\n ++bins[bin];\n }\n\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::array recv;\n MPI_Recv(recv.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; ++j) {\n bins[j] += recv[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its portion of x\n std::array local_bins = {0};\n #pragma omp parallel for\n for (size_t i = rank * x.size() / size; i < (rank + 1) * x.size() / size; ++i) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the local counts into the global bins on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; ++i) {\n std::array recv_bins;\n 
MPI_Recv(recv_bins.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of values in its local portion of x\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins = {};\n for (size_t i = rank; i < x.size(); i += nprocs) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x\n std::array local_bins = {0};\n #pragma omp parallel for\n for (size_t i = rank; i < x.size(); i += nprocs) {\n int bin = static_cast(x[i] / 10);\n local_bins[bin]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use Kokkos to count in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(Kokkos::View const& points, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_for;\n using Kokkos::Functor;\n\n struct CountQuadrants {\n Kokkos::View bins;\n Kokkos::View points;\n\n KOKKOS_FUNCTION\n void operator()(const int i) const {\n const auto& point = points[i];\n if (point.x > 0) {\n if (point.y > 0) {\n ++bins[0];\n } else {\n ++bins[2];\n }\n } else {\n if (point.y > 0) {\n ++bins[1];\n } else {\n ++bins[3];\n }\n }\n }\n };\n\n parallel_for(Kokkos::RangePolicy(0, points.size()), CountQuadrants{bins, points});\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA (const size_t& i, size_t& update) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++update[0];\n } else {\n ++update[3];\n }\n } else {\n if (points(i)->y > 0) {\n ++update[1];\n } else {\n ++update[2];\n }\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.size(), KOKKOS_LAMBDA (const size_t i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x >= 0 && y >= 0) {\n ++bins(0);\n } else if (x < 0 && y >= 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "{x=1.5,y=1.1}", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x >= 0) {\n if (y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n const Point& p = points(i);\n if (p.x > 0) {\n if (p.y > 0) {\n ++update[0];\n } else {\n ++update[3];\n }\n } else {\n if (p.y > 0) {\n ++update[1];\n } else {\n ++update[2];\n }\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n double x = points[i]->x;\n double y = points[i]->y;\n if (x >= 0 && y >= 0) {\n ++update;\n } 
else if (x < 0 && y >= 0) {\n ++update;\n } else if (x < 0 && y < 0) {\n ++update;\n } else {\n ++update;\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = points(i);\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", Kokkos::RangePolicy<>(0, points.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x > 0 && y > 0) {\n ++bins(0);\n } else if (x < 0 && y > 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else if (x > 0 && y < 0) {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy<>(0, points.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x > 0 && y > 0) {\n ++bins(0);\n } else if (x < 0 && y > 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else if (x > 0 && y < 0) {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n size_t idx = 0;\n if (x >= 0) {\n idx += 1;\n }\n if (y >= 0) {\n idx += 2;\n }\n ++bins(idx);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n size_t bin = 0;\n if (x > 0) {\n bin += 1;\n }\n if (y > 0) {\n bin += 2;\n }\n Kokkos::atomic_fetch_add(&bins(bin), 1);\n });\n}\n", + "\n\n using exec_space = KokKos::DefaultExecutionSpace;\n using league_space = KokKos::TeamVectorRange;\n\n league_space range(exec_space{}, points.size());\n\n auto policy = KokKos::TeamVectorRange\u653f\u7b56(range);\n\n auto lambda = KOKKOS_LAMBDA (const Point point, size_t* count) {\n if (point.x > 0) {\n if (point.y > 0) {\n ++count[0];\n } else {\n ++count[3];\n }\n } else {\n if (point.y > 0) {\n ++count[1];\n } else {\n ++count[2];\n }\n }\n };\n\n exec_space exec;\n exec.parallel_reduce(\n \"count_quadrants\", policy, lambda, Rcpp::as(count));\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy>(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = *points(i);\n if (p.x > 0) {\n if (p.y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. 
The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use OpenMP to count in parallel.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const Point& p = points[i];\n if (p.x >= 0 && p.y >= 0) {\n ++bins[0];\n } else if (p.x < 0 && p.y >= 0) {\n ++bins[1];\n } else if (p.x < 0 && p.y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); i++) {\n const auto& p = points[i];\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n 
if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++oient[0];\n } else {\n ++oient[3];\n }\n } else {\n if (points[i].y > 0) {\n ++oient[1];\n } else {\n ++oient[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n double x = points[i].x;\n double y = points[i].y;\n if (x > 0) {\n if (y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "serial", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. 
The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (const Point& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (const auto& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (point.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (Point const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (Point const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else if (p.x < 0) {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else if 
(p.y < 0) {\n bins[3]++;\n }\n } else if (p.x < 0) {\n if (p.y > 0) {\n bins[1]++;\n } else if (p.y < 0) {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& pt : points) {\n if (pt.x > 0) {\n if (pt.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (pt.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "(", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "hip", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use AMD HIP to count in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\n__global__ void countQuadrants(const Point *points, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n while (idx < N) {\n if (points[idx].x >= 0) {\n if (points[idx].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[idx].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n idx += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int quadrant = 0;\n if (points[gid].x > 0) {\n quadrant += 1;\n }\n if (points[gid].y > 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int quadrant = 0;\n if (points[tid].x > 0) {\n quadrant += 1;\n }\n if (points[tid].y > 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid >= N) return;\n const Point &p = points[tid];\n int quadrant = 0;\n if (p.x > 0) quadrant |= 1;\n if (p.y > 0) quadrant |= 2;\n atomicAdd(&bins[quadrant], 1);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n const Point &p = points[gid];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n Point p = points[tid];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if 
(points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "(4)", + "(4)", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n if (idx >= N) {\n return;\n }\n\n const Point &p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int blockId = hipBlockIdx_x;\n int blockDim = hipBlockDim_x;\n\n for (int i = tid; i < N; i += blockDim) {\n if (points[i].x > 0 && points[i].y > 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y > 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[i].x > 0 && points[i].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n const Point p = points[gid];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 
1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[i].x >= 0 && points[i].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n Point p = points[tid];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = tid + bid * hipBlockDim_x;\n if (idx >= N) {\n return;\n }\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use MPI to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The result is stored in bins on rank 0.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its own quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (const auto& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : 
points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0 && point.y >= 0) {\n local_bins[0]++;\n } else if (point.x < 0 && point.y >= 0) {\n local_bins[1]++;\n } else if (point.x >= 0 && point.y < 0) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (auto const& p : points) {\n int quadrant = 0;\n if (p.x > 0) quadrant += 1;\n if (p.y > 0) quadrant += 2;\n bins[quadrant]++;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array tmp;\n MPI_Recv(tmp.data(), tmp.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(bins.data(), bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_bins[4] = {0};\n for (Point const& p : points) {\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for 
(size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y > 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y > 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 
0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant.\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array localBins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n localBins[0]++;\n } else {\n localBins[3]++;\n }\n } else {\n if (point.y > 0) {\n localBins[1]++;\n } else {\n localBins[2]++;\n }\n }\n }\n\n MPI_Reduce(localBins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "cuda", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use CUDA to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\n__global__ void countQuadrants(const Point *points, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int bin = 0;\n if (points[tid].x > 0) bin += 1;\n if (points[tid].y > 0) bin += 2;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.003108164, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00014321280122 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int bin = 0;\n if (points[tid].x >= 0) {\n bin += 1;\n }\n if (points[tid].y >= 0) {\n bin += 2;\n }\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0030972554, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015119999945 + } + ] + }, + { + "generated_output": "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += gridDim.x * blockDim.x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.003286601, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 
0.00013148800135 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n while (idx < N) {\n if (points[idx].x >= 0 && points[idx].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[idx].x < 0 && points[idx].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[idx].x < 0 && points[idx].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[idx].x >= 0 && points[idx].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n idx += blockDim.x * gridDim.x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0032636056, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015335040092 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n int quadrant = 0;\n if (p.x > 0) {\n quadrant |= 1;\n }\n if (p.y > 0) {\n quadrant |= 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n Point p = points[idx];\n int quadrant = 0;\n if (p.x >= 0) {\n quadrant += 2;\n }\n if (p.y >= 0) {\n quadrant += 1;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0030934782, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00011460479945 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0030839886, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00014343680143 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n if (idx < N) {\n const Point &p = 
points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.003095208, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001536959976 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx >= N) {\n return;\n }\n\n const Point &p = points[idx];\n int quadrant = 0;\n if (p.x >= 0) {\n quadrant += 1;\n }\n if (p.y >= 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant - 1], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[1], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0031144336, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00013161599934 + } + ] + }, + { + "generated_output": "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0031442642, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00014181759655 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n const Point p = points[tid];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0032908108, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015372799933 + } + ] + }, + { + "generated_output": "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n if (idx >= N) {\n return;\n }\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n 
atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0031516162, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00012830719948 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n while (i < N) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[i].y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += gridDim.x * blockDim.x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0031061242, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00013322880268 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n int count = 0;\n for (int i = idx; i < N; i += blockDim.x) {\n if (points[i].x > 0 && points[i].y > 0) {\n count++;\n } else if (points[i].x < 0 && points[i].y > 0) {\n count++;\n } else if (points[i].x < 0 && points[i].y < 0) {\n count++;\n } else if (points[i].x > 0 && points[i].y < 0) {\n count++;\n }\n }\n bins[idx] = count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0031148128, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015433599949 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. 
The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use MPI and OpenMP to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. The result is stored in bins on rank 0.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); i++) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to zero\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n ++bins[0];\n } else if (points[i].x < 0 && points[i].y >= 0) {\n ++bins[1];\n } else if (points[i].x < 0 && points[i].y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the vector\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& point = points[i];\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+:local_bins)\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n 
++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n ++local_bins[0];\n } else if (points[i].x < 0 && points[i].y >= 0) {\n ++local_bins[1];\n } else if (points[i].x < 0 && points[i].y < 0) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n Point p = points[i];\n if (p.x >= 0 && p.y >= 0) {\n local_bins[0]++;\n } else if (p.x < 0 && p.y >= 0) {\n local_bins[1]++;\n } else if (p.x < 0 && p.y < 0) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to zero\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const Point& point = points[i];\n if (point.x >= 0 && point.y >= 0) {\n ++bins[0];\n } else if (point.x < 0 && point.y >= 0) {\n ++bins[1];\n } else if (point.x < 0 && point.y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local subset of points\n std::array local_bins = {0, 0, 0, 0};\n #pragma opm parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n if (points[i].x > 0 && points[i].y > 0) {\n ++local_bins[0];\n } else if (points[i].x < 0 && points[i].y > 0) {\n ++local_bins[1];\n } else if (points[i].x < 0 && points[i].y < 0) {\n ++local_bins[2];\n } else if (points[i].x > 0 && points[i].y < 0) {\n ++local_bins[3];\n }\n }\n\n // Gather the local counts into bins on rank 0\n MPI_Gather(local_bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for reduction(+: bins)\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the points in its portion of the image\n #pragma omp parallel for reduction(+: bins)\n for (size_t i = rank; i < points.size(); i += nprocs) {\n const Point& point = points[i];\n if (point.x >= 0) {\n if (point.y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the array\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use Kokkos to compute in parallel. 
Assume that Kokkos has already been initialized.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(Kokkos::View const& x, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n const size_t num_elements = x.extent(0);\n Kokkos::parallel_for(\"countQuartiles\", Kokkos::RangePolicy(0, num_elements), KOKKOS_LAMBDA(const size_t i) {\n const double frac = x(i) - floor(x(i));\n if (frac < 0.25) {\n ++quart[0];\n } else if (frac < 0.5) {\n ++quart[1];\n } else if (frac < 0.75) {\n ++quart[2];\n } else {\n ++quart[3];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n\n parallel_for(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i) {\n double frac = x(i) - floor(x(i));\n if (frac < 0.25) {\n ++quart[0];\n } else if (frac < 0.5) {\n ++quart[1];\n } else if (frac < 0.75) {\n ++quart[2];\n } else {\n ++quart[3];\n }\n });\n}\n", + "\n\n const size_t n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n auto exec_space = Kokkos::DefaultExecutionSpace::instance();\n auto policy = Kokkos::RangePolicy<>(exec_space, 0, x.extent(0));\n\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const size_t& i) {\n double frac = 
std::modf(x(i), &x(i));\n if (frac < 0.25)\n ++bins(0);\n else if (frac < 0.5)\n ++bins(1);\n else if (frac < 0.75)\n ++bins(2);\n else\n ++bins(3);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double fract = std::fmod(x[i], 1.0);\n if (fract < 0.25) {\n ++bins[0];\n } else if (fract < 0.5) {\n ++bins[1];\n } else if (fract < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "<,\u51a2~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma simd\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = std::fmod(x[i], 1.0);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++(std::get<0>(std::ref(x)));\n } else if (frac < 0.5) {\n ++(std::get<1>(std::ref(x)));\n } else if (frac < 0.75) {\n ++(std::get<2>(std::ref(x)));\n } else {\n ++(std::get<3>(std::ref(x)));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n #pragma omp parallel for\n for (size_t 
i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = std::fmod(x[i], 1.0);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "serial", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double const& xi : x) {\n if (xi < 0.25) {\n ++\u0648\u064a\u0627\u062a[0];\n } else if (xi < 0.5) {\n ++\u0648\u064a\u0627\u062a[1];\n } else if (xi < 0.75) {\n ++\u0648\u064a\u0627\u062a[2];\n } else {\n ++\u0648\u064a\u0627\u062a[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (auto v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n bins[0]++;\n else if (frac < 0.5)\n bins[1]++;\n else if (frac < 0.75)\n bins[2]++;\n else\n bins[3]++;\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n bins[0]++;\n else if (frac < 0.5)\n bins[1]++;\n else if (frac < 0.75)\n 
bins[2]++;\n else\n bins[3]++;\n }\n}\n", + "\n\n bins.fill(0);\n for (const auto &xi : x) {\n const auto frac = std::modf(xi, nullptr);\n if (frac >= 0.25 && frac < 0.5) {\n ++bins[0];\n } else if (frac >= 0.5 && frac < 0.75) {\n ++bins[1];\n } else if (frac >= 0.75 && frac < 1.0) {\n ++bins[2];\n } else if (frac == 0.0) {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (auto const& v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "hip", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\n__global__ void countQuartiles(const double *x, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = hipGetGlobalIndex();\n if (idx >= N) return;\n\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&quartiles[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&quartiles[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&quartiles[2], 1);\n } else {\n atomicAdd(&quartiles[3], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n double frac = fmod(x[i], 1);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n 
} else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n 
atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n double f = fmod(x[idx], 1.0);\n if (f < 0.25)\n atomicAdd(&bins[0], 1);\n else if (f < 0.5)\n atomicAdd(&bins[1], 1);\n else if (f < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid >= N) {\n return;\n }\n\n double frac = x[gid] - floor(x[gid]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use MPI to compute in parallel. Assume that MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double d : x) {\n double frac = fmod(d, 1.0);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Count the number of doubles in each quartile.\n for (double v : x) {\n if (v < 0.25) {\n ++bins[0];\n } else if (v < 0.5) {\n ++bins[1];\n } else if (v < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Gather the counts from all ranks on rank 0.\n MPI_Gather(bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each quartile.\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n for (auto v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of the vector\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n 
bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(&recv_bins, 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(&local_bins, 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each quartile\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the local bins\n std::array local_bins = {0, 0, 0, 0};\n\n // Count the number of doubles in each quartile\n for (double d : x) {\n double frac = fmod(d, 1.0);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local counts\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_bins[4] = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 
0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n MPI_Reduce(local_bins, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n size_t n = x.size();\n std::array local_bins = {0, 0, 0, 0};\n\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n\n for (double val : x) {\n double frac = val - std::floor(val);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, 
MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (auto xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "cuda", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\n__global__ void countQuartiles(const double *x, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0637404736, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004435199976 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0637385904, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00045820800662 + } + ] + }, + { + "generated_output": "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n double frac = x[idx] - floor(x[idx]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0636768798, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00051838719845 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0639383874, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00048131201267 + } + ] + }, + { + "generated_output": 
"\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0637254114, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000491852808 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0637384218, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00046035840511 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0636985744, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00047043200135 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double frac = x[tid] - floor(x[tid]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0637950866, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00046497920156 + } + ] + }, + { + "generated_output": "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0637311902, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00045379199982 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": 
true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0637341238, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00050360959768 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0636891724, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00051050878763 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0636548522, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00047666559815 + } + ] + }, + { + "generated_output": ", 4]", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n for (size_t i = tid + bid * stride; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.063666502, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.80350129394531 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0637118954, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00050759038925 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + 
"did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0637119696, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00041617280245 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double f = fmod(x[idx], 1.0);\n if (f < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (f < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (f < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0637905002, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00043255039454 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.063811572, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00043577599525 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0638127318, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00044902400374 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use MPI and OpenMP to compute in parallel. Assume that MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each quartile\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n size_t const rank = ::rank();\n size_t const size = x.size();\n std::fill(counts.begin(), counts.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = rank; i < size; i += ::size()) {\n double const value = x[i];\n if (value < 0.25) {\n ++counts[0];\n } else if (value < 0.5) {\n ++counts[1];\n } else if (value < 0.75) {\n ++counts[2];\n } else {\n ++counts[3];\n }\n }\n\n #pragma omp parallel sections\n {\n #pragma omp section\n {\n if (rank == 0) {\n std::fill(counts.begin(), \n counts.end(), \n 0);\n }\n }\n #pragma omp section\n {\n if (rank == 0) {\n std::transform(counts.begin(), \n counts.end(), \n counts.begin(), \n [](size_t const a, size_t const b) {\n return a + b;\n });\n }\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of doubles in its portion of x that have a fractional part in each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < x.size(); i += size) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Rank 0 gathers the local counts from all ranks and stores the result in bins\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n 
++bins[3];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each bin\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each quartile\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the counts to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of elements in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of 
elements in its local vector that fall into each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its local copy of x that fall into each quartile.\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+:local_bins)\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Rank 0 gathers the local counts from all ranks and stores the result in bins.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; ++i) {\n std::array recv_bins;\n MPI_Recv(&recv_bins, 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(&local_bins, 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n 
MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of elements in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the counts from all ranks to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local counts\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Gather the local counts into bins on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of ranks\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < nranks; ++i) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use Kokkos to count in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(Kokkos::View const& image, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.size(), KOKKOS_LAMBDA (const size_t i) {\n ++bins(image(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const size_t& i, size_t& count) {\n count += Kokkos::count(image, i);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n bins[image[i]]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256),\n KOKKOS_LAMBDA(const int& i, size_t& update) {\n update += Kokkos::count(Kokkos::View(image.data(), image.size()), i);\n },\n bins);\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA (const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA(const int& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA(const int& i) {\n ++bins[image(i)];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy<>(0, 256), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy<>(0, image.extent(0)), KOKKOS_LAMBDA (const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = Kokkos::atomic_fetch_add(&bins[image[i]], 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixel_count\", Kokkos::RangePolicy<>(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixel_count\", Kokkos::RangePolicy<>(0, image.size()), KOKKOS_LAMBDA(const int& 
i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i, size_t& sum) {\n sum += Kokkos::count(image, i);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA (const int& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA(const int& i) {\n ++bins(image(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n bins[image[i]]++;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const size_t& i, size_t& count) {\n count += Kokkos::count(Kokkos::ViewConstRange(image, image.extent(0)), i);\n }, bins);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use OpenMP to count in parallel.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); 
i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "serial", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int pixel : image) {\n ++Counts[pixel];\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (auto const& pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "hip", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use AMD HIP to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n__global__ void pixelCounts(const int *image, size_t N, size_t bins[256]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx.x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = image[idx];\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx >= N) return;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n ++bins[image[idx]];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = image[idx];\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) { bins[image[idx]]++; }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (size_t i = idx; i < N; i += stride) {\n atomicAdd(&bins[image[i]], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use MPI to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. 
The result is stored in bins on rank 0.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array localBins{};\n for (int pixel : image) {\n localBins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = localBins;\n for (int i = 1; i < size; i++) {\n std::array recvBins;\n MPI_Recv(recvBins.data(), 256, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recvBins[j];\n }\n }\n } else {\n MPI_Send(localBins.data(), 256, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n for (int pixel : image) {\n bins[pixel]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array localBins;\n for (size_t i = 0; i < image.size(); ++i) {\n localBins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(localBins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = rank; i < image.size(); i += size) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t chunk_size = image.size() / size;\n size_t start = rank * chunk_size;\n size_t end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = image.size();\n }\n std::array local_bins;\n local_bins.fill(0);\n for (size_t i = start; i < end; i++) {\n local_bins[image[i]]++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n size_t n = image.size();\n size_t local_bins[256] = {0};\n for (size_t i = rank; i < n; i += size) {\n local_bins[image[i]]++;\n }\n MPI_Reduce(local_bins, bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = rank; i < image.size(); i += size) {\n bins[image[i]]++;\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "cuda", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use CUDA to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n__global__ void pixelCounts(const int *image, size_t N, size_t bins[256]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018569596, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00092287999392 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018551744, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008928448081 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n ++bins[image[idx]];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018473636, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00081602560282 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018551364, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00079783039093 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018535892, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00092279680967 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018497302, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00077835521698 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * 
blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018506982, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00080765440464 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018616864, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00092412160635 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018495378, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00077865599394 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001849205, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00080812799931 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018482792, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00090256000757 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018518804, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00083926398754 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018552122, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00077882241011 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018526696, + "runs": [ + { + "did_run": true, 
+ "is_valid": true, + "runtime": 0.00079774720669 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018470146, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00087849599123 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018576914, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008479616046 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018520524, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00079781119823 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018596206, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00092280958891 + } + ] + }, + { + "generated_output": "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00184965, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00079781121016 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use MPI and OpenMP to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. 
The result is stored in bins on rank 0.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the results from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (size_t i = rank; i < image.size(); i += nprocs) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t const n = image.size();\n size_t const chunk = (n + size - 1) / size;\n size_t const start = rank * chunk;\n size_t const end = std::min(start + chunk, n);\n std::array localBins = {};\n for (size_t i = start; i < end; ++i) {\n ++localBins[image[i]];\n }\n MPI_Reduce(localBins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels with each intensity in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int i = rank; i < image.size(); i += size) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n size_t local_bins[256] = {0};\n #pragma omp parallel for\n for (size_t i = rank * image.size() 
/ nprocs; i < (rank + 1) * image.size() / nprocs; ++i) {\n ++local_bins[image[i]];\n }\n\n // Gather the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins, bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n\n // Reduce the bins from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to all zeros\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of pixels in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the bins across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in each bin on each rank.\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++local_bins[image[i]];\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ".", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n // Use OpenMP to count in parallel.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int i = rank; i < image.size(); i += nprocs) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the results from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image\n std::array local_bins;\n 
local_bins.fill(0);\n #pragma omp parallel for\n for (size_t i = rank * image.size() / nprocs; i < (rank + 1) * image.size() / nprocs; ++i) {\n ++local_bins[image[i]];\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use Kokkos to parallelize the search. Assume Kokkos has already been initialized.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n const size_t n = x.extent(0);\n size_t result = n;\n Kokkos::parallel_reduce(\n RangePolicy<>(0, n),\n KOKKOS_LAMBDA(const size_t i) {\n if (x(i) % 2 == 0) {\n result = std::min(result, i);\n }\n },\n result);\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag, int>;\n auto result = Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, int& result) {\n if (x(i) % 2 == 0) {\n result = i;\n }\n }, result, tag_type());\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even(i) = x(i) % 2 == 0;\n });\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even(i)) {\n firstEven = i;\n }\n }, firstEven);\n return firstEven;\n}\n", + "_", + "\n\n size_t result = x.size();\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(size_t i, Kokkos::Experimental::Automic& atom) {\n if (x(i) % 2 == 0) {\n atom.fetch_min(result, i);\n }\n });\n return result;\n}\n", + "\n\n size_t result = -1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n Kokkos::fence();\n }\n },\n result);\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even(i) = (x(i) % 2 == 0);\n });\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even(i)) {\n firstEven = i;\n }\n }, firstEven);\n return firstEven;\n}\n", + "\n\n size_t result = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n Kokkos::parallel_scan(Kokkos::RangePolicy(i + 1, x.size()), KOKKOS_LAMBDA(const size_t& j, size_t& result) {\n result = 0;\n });\n }\n });\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t result = n;\n Kokkos::parallel_for(\"findFirstEven\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) { result = i; }\n });\n return result;\n}\n", + "\n\n Kokkos::parallel_for(\"findFirstEven\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n Kokkos::atomic_fetch_add(&findFirstEven_result, 1);\n Kokkos::fence();\n if (findFirstEven_result == 1) {\n findFirstEven_index = i;\n Kokkos::fence();\n 
Kokkos::parallel_scan(\"findFirstEven_break\", x.extent(0),\n KOKKOS_LAMBDA(const size_t& i, const size_t& my_result) {\n if (findFirstEven_index == i) {\n return my_result + 1;\n }\n return my_result;\n },\n 0);\n Kokkos::fence();\n Kokkos::parallel_for(\"findFirstEven_break\", x.extent(0),\n KOKKOS_LAMBDA(const size_t& i) {\n if (findFirstEven_index == i) {\n Kokkos::fence();\n Kokkos::break_();\n }\n });\n }\n }\n });\n return findFirstEven_index;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t result = n;\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n\n return result;\n}\n", + "&", + "\n\n size_t N = x.extent(0);\n size_t result = N;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n size_t result = Kokkos::parallel_reduce(\n policy_type(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n },\n x.size());\n return result;\n}\n", + "\n\n size_t N = x.size();\n size_t result = N;\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even[i] = (x[i] % 2 == 0);\n });\n\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even[i] && i < firstEven) {\n firstEven = i;\n }\n }, firstEven);\n\n return firstEven;\n}\n", + "\n\n size_t result = Kokkos::parallel_reduce(\n x.extent(0), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n },\n result);\n return result;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n result = Kokkos::atomic_fetch_add(&result, 1);\n }\n },\n result);\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t firstEven = n;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n firstEven = i;\n return;\n }\n });\n return firstEven;\n}\n", + "\n\n size_t N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n Kokkos::atomic_fetch_max(&findFirstEven_result, i);\n }\n });\n return findFirstEven_result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use OpenMP to parallelize the search.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 
0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n#pragma omp cancel for\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t result = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0 && i < result) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t first_even = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n return first_even;\n}\n", + "\n\n size_t firstEven = -1;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (firstEven == -1) {\n firstEven = i;\n }\n }\n }\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t firstEven = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0 && i < firstEven) {\n#pragma omp critical\n firstEven = i;\n }\n }\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t firstEven = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < firstEven) {\n firstEven = i;\n }\n }\n }\n return firstEven;\n}\n", + "\n\n size_t n = x.size();\n size_t first_even = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n return first_even;\n}\n", + "\n\n size_t index = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < index) {\n index = i;\n }\n }\n }\n return index;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t result = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < result) {\n result = i;\n }\n 
}\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "serial", + "prompt": "/* Return the index of the first even number in the vector x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "hip", + "prompt": "/* Find the index of the first even number in the vector x. Store it in firstEvenIndex.\n Use AMD HIP to parallelize the search. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\n__global__ void findFirstEven(const int *x, size_t N, size_t *firstEvenIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int myIndex = hipGetGlobalIndex();\n if (myIndex < N && x[myIndex] % 2 == 0) {\n *firstEvenIndex = myIndex;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N && x[i] % 2 == 0) {\n *firstEvenIndex = i;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int blockId = hipBlockIdx_x;\n int blockDim = hipBlockDim_x;\n int gridDim = hipGridDim_x;\n\n *firstEvenIndex = N;\n\n if (x[tid] % 2 == 0) {\n atomicMin(firstEvenIndex, tid);\n }\n\n __syncthreads();\n\n if (tid == 0) {\n if (*firstEvenIndex == N) {\n *firstEvenIndex = -1;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = block.thread.x;\n if (tid == 0) {\n *firstEvenIndex = N;\n }\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = min(*firstEvenIndex, tid);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use MPI to parallelize the search. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t n = x.size();\n size_t result = n;\n for (size_t i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, &result, 1, MPI::INT, MPI::MINLOC);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = -1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result = -1;\n MPI_Allgather(&result, 1, MPI_INT, &global_result, 1, MPI_INT, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n for (size_t i = 0; i < x.size(); ++i) {\n if (global_result != -1 && global_result <= i) {\n return global_result;\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return n;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t n = x.size();\n size_t firstEven = n;\n for (size_t i = rank; i < n; i += size) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &firstEven, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t result = -1;\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n size_t firstEven = x.size();\n for (size_t i = myRank; i < x.size(); i += numRanks) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = -1;\n for (size_t i = my_rank; i < x.size(); i += my_size) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t firstEven = -1;\n for (size_t i = rank; i < 
x.size(); i += size) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_result;\n } else {\n return x.size();\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_first_even = -1;\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n local_first_even = i;\n break;\n }\n }\n int global_first_even = -1;\n MPI_Allreduce(&local_first_even, &global_first_even, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_first_even;\n}\n", + "\n\n size_t result = x.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n size_t n = x.size();\n size_t firstEven = n;\n for (size_t i = myRank; i < n; i += MPI::procs()) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_MINLOC, MPI_INT);\n return firstEven;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank searches for the first even number in its portion of the vector.\n size_t my_first_even = x.size();\n for (size_t i = my_rank; i < x.size(); i += num_ranks) {\n if (x[i] % 2 == 0) {\n my_first_even = i;\n break;\n }\n }\n\n // Gather the results from all ranks.\n std::vector first_evens(num_ranks);\n MPI_Gather(&my_first_even, 1, MPI_UNSIGNED_LONG, first_evens.data(), 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Find the smallest index among the gathered results.\n size_t first_even = x.size();\n if (my_rank == 0) {\n for (size_t i = 0; i < num_ranks; ++i) {\n if (first_evens[i] < first_even) {\n first_even = first_evens[i];\n }\n }\n }\n\n return first_even;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t n = x.size();\n size_t local_first_even = n;\n for (size_t i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n local_first_even = i;\n break;\n }\n }\n size_t first_even;\n MPI_Allreduce(&local_first_even, &first_even, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return first_even;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_result;\n } else {\n return x.size();\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = x.size();\n for (size_t i = my_rank; i < x.size(); i += my_size) {\n if (x[i] % 
2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t local_min = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n local_min = i;\n break;\n }\n }\n size_t global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_UNSIGNED_LONG, MPI_MIN,\n MPI_COMM_WORLD);\n return global_min;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "cuda", + "prompt": "/* Find the index of the first even number in the vector x. Store it in firstEvenIndex.\n Use CUDA to parallelize the search. The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\n__global__ void findFirstEven(const int *x, size_t N, size_t *firstEvenIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": 
true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the index of the first even number in the vector x.\n Use MPI and OpenMP to parallelize the search. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n #pragma openmp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma openmp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n int result = -1;\n #pragma omp parallel for reduction(min:result)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (i < result) {\n result = i;\n }\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_INT, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_result = -1;\n int global_result = -1;\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (my_result == -1) {\n my_result = i;\n }\n }\n }\n }\n\n MPI_Allreduce(&my_result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int my_rank, n_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);\n\n size_t first_even = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n }\n\n if (my_rank == 0) {\n MPI_Reduce(&first_even, &first_even, 1, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(&first_even, nullptr, 1, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n\n return first_even;\n}\n", + "\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int my_result = -1;\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n my_result = i;\n break;\n }\n }\n if (my_result != -1) {\n #pragma omp critical\n {\n if (result == -1 || my_result < result) {\n result = my_result;\n }\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t result = -1;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (result == -1) {\n result = i;\n }\n }\n break;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "_", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n #pragma omp cancel for\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n size_t first_even = x.size();\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n first_even = i;\n break;\n }\n }\n }\n\n MPI_Allreduce(&first_even, &first_even, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return first_even;\n } else {\n return x.size();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t firstEven = -1;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (firstEven == -1) {\n firstEven = i;\n }\n }\n }\n\n if (rank == 0) {\n MPI_Bcast(&firstEven, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel\n {\n size_t local_result = x.size();\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n local_result = i;\n break;\n }\n }\n #pragma omp critical\n result = std::min(result, local_result);\n }\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = x.size();\n #pragma omp parallel for reduction(min : result)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n\n if (my_rank == 0) {\n return result;\n } else {\n return x.size();\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. 
Return false otherwise.\n Use Kokkos to search in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(Kokkos::View const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto contains_lambda = KOKKOS_LAMBDA(const int& v) { return v == target; };\n return any(policy_t(x.extent(0)), contains_lambda, x);\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n auto contains = Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) {\n contains |= (x(i) == target);\n },\n false);\n return contains;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto contains_target = Kokkos::scan(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, const bool& contains_target) {\n return contains_target || (x(i) == target);\n },\n false, tag_type());\n\n return contains_target;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, false);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n Kokkos::Scan(policy_type(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& result) {\n if (x(i) == target) { result = 1; }\n }, tag_type());\n\n return Kokkos::Max(Kokkos::View(1)) != 0;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n auto contains_lambda = KOKKOS_LAMBDA(const int& value) { return value == target; };\n return Kokkos::any(policy_type(0, x.extent(0)), contains_lambda, x);\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto result = 
Kokkos::parallel_reduce(\n policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n },\n false, tag_type());\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto contains = Kokkos::scan(policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, const bool& contains_so_far) {\n return contains_so_far || (x(i) == target);\n }, false, tag_type());\n\n return contains;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(\n x.extent(0), KOKKOS_LAMBDA(const int& i, bool& found) {\n if (x(i) == target) { found = true; }\n },\n found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use OpenMP to search in parallel.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for shared(found)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n #pragma omp critical\n found = true;\n }\n 
}\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "serial", + "prompt": "/* Return true if the vector x contains the value `target`. 
Return false otherwise.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "<", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "&", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "hip", + "prompt": "/* Set `found` to true if the vector x contains the value `target`. Set it to false otherwise.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\n__global__ void contains(const int *x, size_t N, int target, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + ", 8, 2, 6, 4, 6", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int offset = bid * hipBlockDim_x + tid;\n if (offset < N && x[offset] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n bool found = false;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (found) {\n return true;\n }\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n found = true;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (found) {\n return true;\n }\n }\n }\n return false;\n}\n", + "\n\n int result = 0;\n for (int i = 0; i < x.size(); i++) {\n result |= (x[i] == target);\n }\n int global_result;\n MPI_Allreduce(&result, 1, MPI_INT, &global_result, 1, MPI_INT, MPI_BOR, MPI_COMM_WORLD);\n if (global_result) {\n return true;\n }\n return false;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int has_target = (std::find(x.begin(), x.end(), target) != x.end());\n int global_has_target;\n MPI_Allreduce(&has_target, &global_has_target, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n return global_has_target;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return global_found;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_contains = 0;\n for (int i = rank; i < n; i += size) {\n if (x[i] == target) {\n local_contains = 
1;\n break;\n }\n }\n\n int global_contains;\n MPI_Allreduce(&local_contains, &global_contains, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_contains;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n bool found = false;\n for (int i = rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n MPI::COMM_WORLD.Allreduce(MPI::IN_PLACE, &found, 1, MPI::BOOL, MPI::LOR);\n return found;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n bool local_contains = false;\n for (int i = start; i < end; i++) {\n if (x[i] == target) {\n local_contains = true;\n break;\n }\n }\n\n bool global_contains;\n MPI_Allreduce(&local_contains, &global_contains, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_contains;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n bool found = false;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool found = std::find(x.begin(), x.end(), target) != x.end();\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "cuda", + "prompt": "/* Set `found` to true if the vector x contains the value `target`. Set it to false otherwise.\n Use CUDA to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\n__global__ void contains(const int *x, size_t N, int target, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 9.84959994e-06 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.123839989e-05 + } + ] + }, + { + "generated_output": "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.270400006e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.6e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.406720001e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.6e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.372160017e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.38e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.290880013e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.391359996e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = 
blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.223040018e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.016320009e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.42e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.121919975e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.6e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.301119998e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.6e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.091840025e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.326080002e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n found[0] = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.42e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 9.58720017e-06 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.42e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.418240014e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < 
N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.84e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 9.53599997e-06 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.312640011e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.38e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.07520001e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.6e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.248639971e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.8e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.119359992e-05 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n int local_result = 0;\n #pragma omp parallel for reduction(+:local_result)\n for (int i = rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n local_result = 1;\n }\n }\n int global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_result > 0;\n }\n return false;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "(", + "\n\n int rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel shared(result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n result = true;\n break;\n }\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n MPI_Reduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, 
MPI_COMM_WORLD);\n if (rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int found = 0;\n #pragma omp parallel for reduction(+:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = 1;\n break;\n }\n }\n int result;\n MPI_Allreduce(&found, &result, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int found = 0;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = 1;\n }\n }\n int result;\n MPI_Allreduce(&found, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n bool found = false;\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n }\n }\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_result = 0;\n #pragma omp parallel for reduction(||:local_result)\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n local_result = 1;\n }\n }\n int result;\n MPI_Reduce(&local_result, &result, 1, MPI_INT, MPI_LOR, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_size = n / 
MPI_COMM_SIZE;\n int start = my_rank * local_size;\n int end = start + local_size;\n bool found = false;\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_found;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use Kokkos to search in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(Kokkos::View const& x, Kokkos::View const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View found_x(\"found_x\", x.extent(0));\n Kokkos::View found_y(\"found_y\", y.extent(0));\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n found_x(i) = (x(i) == val);\n });\n\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(const int& i) {\n found_y(i) = (y(i) == val);\n });\n\n Kokkos::View found_both(\"found_both\", 1);\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& found_both) {\n found_both |= (found_x(i) && found_y(i));\n });\n\n return !found_both(0);\n}\n", + "\n\n return Kok \u86cb\u7cd5::parallel_reduce(Kok\u86cb\u7cd5::make_range(x.size()),\n Kok\u86cb\u7cd5::plus(),\n [x, y, val](int i) { return (x[i] == val) ^ (y[i] == val); },\n 0);\n}\n", + "\n\n // Create a Kokkos::View of the union of the two input vectors\n Kokkos::View union_xy(\"union_xy\", x.extent(0) + y.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(int i) {\n union_xy(i) = x(i);\n });\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(int i) {\n union_xy(x.extent(0) + i) = y(i);\n });\n\n // Create a Kokkos::View of the intersection of the two input vectors\n Kokkos::View intersection_xy(\"intersection_xy\", x.extent(0) + y.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(int i) {\n for (int j = 0; j < y.extent(0); j++) {\n if (x(i) == y(j)) {\n intersection_xy(i) = x(i);\n break;\n }\n }\n });\n\n // Check if the value is in the union but not in the intersection\n bool found_in_union = false;\n bool found_in_intersection = false;\n Kokkos::parallel_for(union_xy.extent(0), KOKKOS_LAMBDA(int i) {\n if (union_xy(i) == val) {\n found_in_union = true;\n }\n if (intersection_xy(i) == val) {\n found_in_intersection = true;\n }\n });\n\n return found_in_union && !found_in_intersection;\n}\n", + "\n\n using namespace Kokkos;\n const int N = x.extent(0);\n const int M = y.extent(0);\n\n auto lambda = KOKKOS_LAMBDA(const int i) {\n bool found_x = false;\n bool found_y = false;\n for (int j = 0; j < N; ++j) {\n if (x(j) == val) { found_x = true; }\n }\n for (int j = 0; j < M; ++j) {\n if (y(j) == val) { found_y = true; }\n }\n return found_x != found_y;\n };\n\n bool result = parallel_reduce(N, lambda, (bool)false);\n return result;\n}\n", + "\n\n using namespace Kokkos;\n\n auto lambda = KOKKOS_LAMBDA(int i) {\n bool found_x = false;\n bool found_y = false;\n for (int j = 0; j < x.extent(0); ++j) {\n if (x(j) == val) {\n found_x = true;\n }\n }\n for (int j = 0; j < y.extent(0); ++j) {\n if (y(j) == val) 
{\n found_y = true;\n }\n }\n return found_x != found_y;\n };\n\n return parallel_reduce(x.extent(0), lambda, (bool)false);\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = find(x, val) != x.end();\n auto y_contains = find(y, val) != y.end();\n return x_contains ^ y_contains;\n}\n", + "\n\n // Create a Kokkos::View of the result\n Kokkos::View result(\"result\", x.extent(0));\n\n // Use Kokkos to search for `val` in `x` and `y` in parallel\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n result(i) = (x(i) == val) ^ (y(i) == val);\n });\n\n // Reduce the result to a single boolean value\n bool xorContains = Kokkos::reduce(result, Kokkos::And(), true);\n\n return xorContains;\n}\n", + "\n\n Kokkos::View result(\"result\", x.size());\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int i) {\n result[i] = (x[i] == val) ^ (y[i] == val);\n });\n return Kokkos::any(result);\n}\n", + "\n bool found_x = false;\n bool found_y = false;\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (x(i) == val) {\n found_x = true;\n }\n });\n\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (y(i) == val) {\n found_y = true;\n }\n });\n\n return found_x != found_y;\n}\n", + "\n\n Kokkos::View result(\"result\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(int i) {\n result[i] = (x[i] == val) ^ (y[i] == val);\n });\n return Kokkos::any(result);\n}\n", + "\n\n Kokkos::View found_x(\"found_x\", x.extent(0));\n Kokkos::View found_y(\"found_y\", y.extent(0));\n\n Kokkos::parallel_for(\"xorContains_x\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n found_x(i) = (x(i) == val);\n });\n\n Kokkos::parallel_for(\"xorContains_y\", y.extent(0), KOKKOS_LAMBDA(const int i) {\n found_y(i) = (y(i) == val);\n });\n\n bool found_in_x = false;\n bool found_in_y = false;\n\n Kokkos::parallel_reduce(\"xorContains_reduce\", x.extent(0), KOKKOS_LAMBDA(const int i, bool& update) {\n update = update || found_x(i);\n }, found_in_x);\n\n Kokkos::parallel_reduce(\"xorContains_reduce\", y.extent(0), KOKKOS_LAMBDA(const int i, bool& update) {\n update = update || found_y(i);\n }, found_in_y);\n\n return (found_in_x != found_in_y);\n}\n", + "\n\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = Kokkos::parallel_reduce(\n policy_t(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) { contains |= (x(i) == val); },\n false);\n auto y_contains = Kokkos::parallel_reduce(\n policy_t(0, y.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) { contains |= (y(i) == val); },\n false);\n return x_contains ^ y_contains;\n}\n", + "\n\n using namespace Kokkos;\n\n auto lambda = KOKKOS_LAMBDA(const int& x_i, const int& y_i) {\n return (x_i == val) ^ (y_i == val);\n };\n\n auto result = reduce(x.extent(0), lambda, (bool)false);\n\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n auto x_contains = find(x, val, policy_type(0, x.extent(0)));\n auto y_contains = find(y, val, policy_type(0, y.extent(0)));\n return (x_contains != x.extent(0)) ^ (y_contains != y.extent(0));\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& contains) {\n contains |= (x(i) == val);\n }, (int)0);\n\n auto y_contains = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, y.extent(0)), KOKKOS_LAMBDA(int i, int& contains) {\n contains |= (y(i) == val);\n }, (int)0);\n\n return 
(x_contains && !y_contains) || (!x_contains && y_contains);\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int sum) {\n return sum + (x(i) == val);\n }, Kokkos::Sum(0));\n\n auto y_contains = Kokkos::parallel_reduce(y.extent(0), KOKKOS_LAMBDA(int i, int sum) {\n return sum + (y(i) == val);\n }, Kokkos::Sum(0));\n\n return x_contains != y_contains;\n}\n", + "\n\n // Create a Kokkos::View to store the results of the search\n Kokkos::View result(\"result\", x.extent(0));\n\n // Perform the search in parallel\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i) {\n result(i) = (x(i) == val) ^ (y(i) == val);\n });\n\n // Reduce the results to a single boolean value\n bool found = false;\n Kokkos::parallel_reduce(\"xorContains_reduce\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i, bool& found) {\n found |= result(i);\n }, found);\n\n return found;\n}\n", + "\n\n // Create a Kokkos view for the result\n Kokkos::View result(\"result\", x.extent(0));\n\n // Parallel for loop to check if val is in only one of x or y\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n bool found_in_x = false;\n bool found_in_y = false;\n for (int j = 0; j < x.extent(0); ++j) {\n if (x(j) == val) {\n found_in_x = true;\n }\n if (y(j) == val) {\n found_in_y = true;\n }\n }\n result(i) = (found_in_x != found_in_y);\n });\n\n // Reduce the result to a single boolean value\n bool final_result = Kokkos::reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(bool a, bool b) { return a || b; }, result);\n\n return final_result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = find(x, val, policy_t(0, x.extent(0)));\n auto y_contains = find(y, val, policy_t(0, y.extent(0)));\n return (x_contains != x.extent(0)) ^ (y_contains != y.extent(0));\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& contains) {\n if (x(i) == val) { contains = 1; }\n }, Kokkos::Scan::plus());\n\n auto y_contains = Kokkos::parallel_reduce(y.extent(0), KOKKOS_LAMBDA(int i, int& contains) {\n if (y(i) == val) { contains = 1; }\n }, Kokkos::Scan::plus());\n\n return (x_contains == 1) ^ (y_contains == 1);\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use OpenMP to search in parallel.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n #pragma omp critical\n {\n found_x = true;\n }\n }\n }\n #pragma omp for\n for (int i = 0; i < 
y.size(); ++i) {\n if (y[i] == val) {\n #pragma omp critical\n {\n found_y = true;\n }\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n #pragma omp critical\n found_x = true;\n }\n }\n\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n #pragma omp critical\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n #pragma omp critical\n {\n found_x = true;\n }\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n #pragma omp critical\n {\n found_y = true;\n }\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n return x_count != y_count;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return (x_count + y_count) == 1;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == 
val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool in_x = false;\n bool in_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n in_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n in_y = true;\n }\n }\n }\n\n return in_x ^ in_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return x_count != y_count;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n } else {\n found = false;\n #pragma omp cancel for\n }\n }\n }\n }\n #pragma omp parallel for\n for (size_t i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n } else {\n found = false;\n #pragma omp cancel for\n }\n }\n }\n }\n return found;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (size_t i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return x_count != y_count;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n return x_count != y_count;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "serial", + "prompt": "/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n return std::find(x.begin(), x.end(), val) 
!= x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) !=\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) !=\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n bool in_x = false;\n bool in_y = false;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n in_x = true;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n in_y = true;\n }\n }\n return in_x != in_y;\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "hip", + "prompt": "/* Set `found` to true if `val` is only in one of vectors x or y.\n Set it to false if it is in both or neither.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\n__global__ void xorContains(const int *x, const int *y, size_t N, int val, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = tid + block * blockDim;\n int end = min(start + blockDim, N);\n bool found_x = false;\n bool found_y = false;\n for (int i = start; i < end; i++) {\n found_x |= (x[i] == val);\n found_y |= (y[i] == val);\n }\n found[tid] = found_x ^ found_y;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n *found = found_x ^ found_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipGridSize_x) {\n if (x[i] == val) found_x = true;\n if (y[i] == val) found_y = true;\n }\n found[tid] = found_x ^ found_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool x_contains = x[tid] == val;\n bool y_contains = y[tid] == val;\n found[tid] = x_contains ^ y_contains;\n }\n}\n", + ".", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool x_has = (x[tid] == val);\n bool y_has = (y[tid] == val);\n found[tid] = (x_has ^ y_has);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int block_size = hipBlockDim_x;\n int block_id = hipBlockIdx_x;\n int i = block_id * block_size + tid;\n\n if (i < N) {\n bool x_contains = x[i] == val;\n bool y_contains = y[i] == val;\n found[i] = x_contains ^ y_contains;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n *found = inX ^ inY;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n bool found_x = (x[i] == val);\n bool found_y = (y[i] == val);\n found[i] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool found_x = (x[tid] == val);\n bool found_y = (y[tid] == val);\n found[tid] = (found_x ^ found_y);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int idx = tid + N;\n if (idx < 2 * N) {\n *found = (x[tid] == val) || (y[idx - N] == val);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n found[tid] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n *found = (x[gid] == val) ^ (y[gid] == val);\n }\n}\n", 
+ "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = (x_has_val != y_has_val);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool found_x = (x[tid] == val);\n bool found_y = (y[tid] == val);\n found[tid] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = (in_x != in_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n *found = (x[gid] == val) ^ (y[gid] == val);\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Return the result on rank 0.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int myRank, nRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &nRanks);\n\n bool xContains = std::find(x.begin(), x.end(), val) != x.end();\n bool yContains = std::find(y.begin(), y.end(), val) != y.end();\n\n int xContainsInt = xContains ? 1 : 0;\n int yContainsInt = yContains ? 
1 : 0;\n\n int xorContainsInt = xContainsInt ^ yContainsInt;\n\n int globalXorContainsInt;\n MPI_Reduce(&xorContainsInt, &globalXorContainsInt, 1, MPI_INT, MPI_LOR, 0, MPI_COMM_WORLD);\n\n bool globalXorContains = globalXorContainsInt != 0;\n\n if (myRank == 0) {\n return globalXorContains;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int sum;\n MPI_Allreduce(&x_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return sum == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int total_count;\n MPI_Allreduce(&x_count, &total_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &total_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return total_count == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_contains = std::find(x.begin(), x.end(), val) != x.end();\n int y_contains = std::find(y.begin(), y.end(), val) != y.end();\n\n int result = x_contains ^ y_contains;\n 
MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int sum;\n MPI_Allreduce(&x_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return sum == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n\n int global_x_count;\n int global_y_count;\n\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) x_count++;\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) y_count++;\n }\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (global_x_count == 0 && global_y_count > 0) return true;\n if (global_x_count > 0 && global_y_count == 0) return true;\n return false;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) x_count++;\n if (y[i] == val) y_count++;\n }\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n\n int x_count_sum;\n int y_count_sum;\n\n MPI_Allreduce(&x_count, &x_count_sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &y_count_sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (x_count_sum + y_count_sum) % 2 == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int global_x_count;\n int global_y_count;\n\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (global_x_count == 1 || global_y_count == 1);\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, 
&global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "cuda", + "prompt": "/* Set `found` to true if `val` is only in one of vectors x or y.\n Set it to false if it is in both or neither.\n Use CUDA to search in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\n__global__ void xorContains(const int *x, const int *y, size_t N, int val, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 2.664e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.397440001e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool x_has = false;\n bool y_has = false;\n for (int i = 0; i < N; i++) {\n if (x[i] == val) x_has = true;\n if (y[i] == val) y_has = true;\n }\n found[tid] = x_has != y_has;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.05e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.01762023925781 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *found = (*found) ^ (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 2.888e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 7.886720151e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 2.904e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.451200001e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 2.524e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.952960022e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n found[tid] = (inX ^ inY);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + 
"did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 2.63e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.361599989e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n *found = *found || (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 2.56e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.619519934e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *found = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 2.866e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 3.037440032e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = in_x ^ in_y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 2.846e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.364799976e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = x_has_val ^ y_has_val;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 2.528e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.984639987e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = in_x ^ in_y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 2.54e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.594560012e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 2.824e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.951040007e-05 + } + ] + }, + { + "generated_output": 
"=cernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecerne,cernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecerne)cerne)cerne)cerne)", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = x_has_val ^ y_has_val;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 2.704e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.967680022e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = (in_x ^ in_y);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 2.686e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.385919988e-05 + } + ] + }, + { + "generated_output": "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 2.904e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.824000031e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = (x_has_val != y_has_val);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 2.886e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.400639988e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + 
"best_sequential_runtime": 2.582e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.603520043e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 2.68e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.343680002e-05 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = 0; i < N; i++) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n found[tid] = (found_x != found_y);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 2.966e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.02278740234375 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Return the result on rank 0.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int xor_count = x_count + y_count;\n\n int result;\n MPI_Allreduce(&xor_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (global_x_count == 1 || global_y_count == 1) {\n return true;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count;\n int global_y_count;\n MPI_Reduce(&x_count, &global_x_count, 1, MPI_INT, 
MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_x_count != global_y_count;\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int result;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < n; i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int j = 0; j < m; j++) {\n if (y[j] == val) {\n y_count++;\n }\n }\n }\n\n int result = x_count + y_count;\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return result == 1;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel for reduction(+:x_count)\n for (int i = 0; i < n; i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = 0; i < m; i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result = x_count + y_count;\n bool ans = (result == 1);\n\n MPI_Reduce(&ans, &ans, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n return ans;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int local_x_count = 0;\n int local_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n local_x_count++;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n local_y_count++;\n }\n }\n #pragma omp critical\n {\n x_count += local_x_count;\n y_count += local_y_count;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(MPI_IN_PLACE, &y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return (x_count == 1 || y_count == 1);\n }\n return false;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n#pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int nt = omp_get_num_threads();\n for (int i = tid; i < n; i += nt) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = tid; i < m; i += nt) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int x_count_global;\n int y_count_global;\n MPI_Allreduce(&x_count, &x_count_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &y_count_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (x_count_global == 1 || y_count_global == 1) {\n return true;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int n_threads = omp_get_num_threads();\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel for\n for (int i = rank; i < n; i += n_threads) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found_x, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n MPI_Allreduce(&found_y, &result, 1, MPI_C_BOOL, 
MPI_XOR, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank searches for `val` in its portion of x and y.\n int my_start = my_rank * n / nprocs;\n int my_end = (my_rank + 1) * n / nprocs;\n bool found_in_x = false;\n bool found_in_y = false;\n #pragma omp parallel for\n for (int i = my_start; i < my_end; i++) {\n if (x[i] == val) {\n found_in_x = true;\n }\n if (y[i] == val) {\n found_in_y = true;\n }\n }\n\n // Reduce the results from all ranks to rank 0.\n bool result;\n MPI_Reduce(&found_in_x, &result, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n MPI_Reduce(&found_in_y, &result, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n\n // Rank 0 returns the final result.\n if (my_rank == 0) {\n return result != (found_in_x && found_in_y);\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n ++x_count;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n ++y_count;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int count_x = 0;\n int count_y = 0;\n\n #pragma omp parallel for reduction(+:count_x)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n count_x++;\n }\n }\n\n #pragma omp parallel for reduction(+:count_y)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n count_y++;\n }\n }\n\n int result = count_x + count_y;\n bool ans = (result == 1);\n\n MPI_Allreduce(&ans, &ans, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n return ans;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n #pragma omp parallel\n {\n int my_x_count = 0, my_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) my_x_count++;\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) my_y_count++;\n }\n #pragma omp critical\n {\n x_count += my_x_count;\n y_count += my_y_count;\n }\n }\n int result;\n MPI_Reduce(&x_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (result == 1) return true;\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result = x_count + y_count;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result == 
1;\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel reduction(+:x_count, y_count)\n {\n int tid = omp_get_thread_num();\n int nthreads = omp_get_num_threads();\n for (int i = tid; i < x.size(); i += nthreads) {\n if (x[i] == val) {\n x_count++;\n }\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count, global_y_count;\n MPI_Reduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return (global_x_count == 1 || global_y_count == 1);\n }\n return false;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n bool result = false;\n MPI_Allreduce(&found_x, &result, 1, MPI_C_BOOL, MPI_BXOR, MPI_COMM_WORLD);\n MPI_Allreduce(&found_y, &result, 1, MPI_C_BOOL, MPI_BXOR, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return result;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel for reduction(+:x_count)\n for (int i = rank; i < x.size(); i += nprocs) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = rank; i < y.size(); i += nprocs) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result;\n MPI_Reduce(&x_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result == 1;\n }\n return false;\n}\n", + "\n\n int x_contains = 0;\n int y_contains = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_contains)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_contains = 1;\n }\n }\n\n #pragma omp for reduction(+:y_contains)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_contains = 1;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_contains, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n MPI_Allreduce(&y_contains, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int local_x_count = 0;\n int local_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n local_x_count++;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n local_y_count++;\n }\n }\n #pragma omp critical\n {\n x_count += local_x_count;\n y_count += local_y_count;\n }\n }\n if (my_rank == 0) {\n return x_count != y_count;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Book {\n const char *title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use Kokkos to search in parallel. 
Assume Kokkos is already initialized.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(Kokkos::View const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using policy_t = Kokkos::RangePolicy;\n using range_t = Kokkos::Range;\n\n size_t result = books.size();\n Kokkos::parallel_reduce(range_t(0, books.size()), KOKKOS_LAMBDA(size_t i, size_t& result) {\n if (books(i)->pages < 100) {\n result = i;\n }\n });\n return result;\n}\n", + "\n\n size_t last_short_book_index = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_short_book_index = books.size() - 1;\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& last_short_book_index) {\n if (books[i]->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_for(books.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t result = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& result) {\n if (books(i)->pages < 100) {\n result = i;\n }\n });\n return result;\n}\n", + "\n\n size_t last_short_book = books.size() - 1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book) {\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n });\n return last_short_book;\n}\n", + "\n\n Kokkos::View lastShortBook(\"lastShortBook\", books.extent(0));\n Kokkos::parallel_for(\"findLastShortBook\", books.extent(0), KOKKOS_LAMBDA(const size_t i) {\n if (books(i).pages < 100) {\n lastShortBook(i) = i;\n }\n });\n size_t result = books.extent(0);\n for (size_t i = books.extent(0) - 1; i >= 0; i--) {\n if (lastShortBook(i) != 0) {\n result = lastShortBook(i);\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t last_short_book = books.size();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& last) {\n if (books[i].pages < 100) {\n last = i;\n }\n }, last_short_book);\n return last_short_book;\n}\n", + "\n\n size_t last_idx = books.size();\n Kokkos::parallel_for(books.size(), KOKKOS_LAMBDA(const size_t& i) {\n if (books[i]->pages < 100) {\n last_idx = i;\n }\n });\n return last_idx;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const int i, const size_t& last) {\n if (books(i).pages < 100) {\n return i;\n }\n return last;\n }, 0);\n}\n", + "\n\n size_t last_index = books.size() - 1;\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books(i).pages >= 100) {\n last_index = i - 1;\n }\n }\n return last_index;\n}\n", + "\n\n size_t last_short_book = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last) {\n if (books(i)->pages < 100) {\n last = i;\n }\n }, last_short_book);\n return last_short_book;\n}\n", + "\n\n size_t last_short_book = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& last_short_book) 
{\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n });\n return last_short_book;\n}\n", + "\n\n size_t last_short_book = books.size();\n\n Kokkos::parallel_reduce(books.extent(0), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book) {\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n }, last_short_book);\n\n return last_short_book;\n}\n", + "\n\n size_t lastShortBook = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& lastShortBook) {\n if (books[i]->pages < 100) {\n lastShortBook = i;\n }\n });\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBook = books.size() - 1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& lastShortBook) {\n if (books(i)->pages < 100) {\n lastShortBook = i;\n }\n });\n return lastShortBook;\n}\n", + "\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_index) {\n if (books(i)->pages < 100) {\n last_index = i;\n }\n });\n return last_index;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n using l_type = long;\n\n auto last_short_book = Kokkos::parallel_scan(\n policy_type(0, books.size()),\n KOKKOS_LAMBDA(const size_t& i, l_type& last) {\n if (books[i]->pages < 100) {\n last = i;\n }\n },\n last_short_book);\n\n return last_short_book;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_scan(policy_type(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n }, tag_type());\n\n return last_short_book_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use OpenMP to search in parallel.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for 
(size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t last_short_book_index = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n last_short_book_index = i;\n }\n }\n return last_short_book_index;\n}\n", + "\n\n size_t last_short = books.size();\n #pragma omp parallel for reduction(max:last_short)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n last_short = i;\n }\n }\n return last_short;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n#pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "_", + "\n\n size_t last_idx = books.size() - 1;\n#pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n last_idx = i - 1;\n break;\n }\n }\n return last_idx;\n}\n", + "\n\n size_t last_short_book_index = books.size() - 1;\n#pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages >= 100) {\n#pragma omp critical\n last_short_book_index = i - 1;\n }\n }\n return last_short_book_index;\n}\n", + "\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n#pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n" + ] + }, + { + "problem_type": "search", + 
"language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "serial", + "prompt": "struct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "hip", + "prompt": "struct Book {\n const char* title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books 
where Book.pages is less than 100.\n Store the result in lastShortBookIndex.\n Use AMD HIP to search in parallel. The kernel is launched with one thread for every book element.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\n__global__ void findLastShortBook(const Book *books, size_t N, size_t *lastShortBookIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use MPI to search in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of books. 
Return the result on rank 0.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return lastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "_", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = -1;\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return lastShortBook;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n lastShortBook = i - 1;\n break;\n }\n }\n\n size_t globalLastShortBook;\n MPI_Allreduce(&lastShortBook, &globalLastShortBook, 1, MPI_UNSIGNED_LONG, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalLastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n for (size_t i = result + 1; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n result = i - 1;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = 0;\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n int globalResult;\n MPI_Allreduce(&result, &globalResult, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalResult;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = 
books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n if (result == books.size()) {\n result = -1;\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "cuda", + "prompt": "struct Book {\n const char* title;\n int pages;\n};\n\n/* Find the index of the last Book item in the vector books where Book.pages is less than 100.\n Store the result in lastShortBookIndex.\n Use CUDA to search in parallel. The kernel is launched with one thread for every book element.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\n__global__ void findLastShortBook(const Book *books, size_t N, size_t *lastShortBookIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n if (books[idx].pages < 100) {\n *lastShortBookIndex = idx;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && 
books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint 
tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid >= N) return;\n\tif (books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx >= N) {\n return;\n }\n if (books[idx].pages < 100) {\n *lastShortBookIndex = idx;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use MPI and OpenMP to search in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of books. Return the result on rank 0.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = 0;\n #pragma omp parallel for reduction(max:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = 0;\n #pragma omp parallel for reduction(max:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n if (rank == 0) {\n return lastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n result = i - 1;\n }\n }\n return result;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_UNSIGNED_LONG, MPI_MAX, MPI_COMM_WORLD);\n return lastShortBook;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n size_t 
local_result = books.size();\n\n #pragma omp parallel reduction(min: local_result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n local_result = i;\n }\n }\n\n MPI_Allreduce(&local_result, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min: result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n int myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n size_t result = books.size();\n #pragma omp parallel for reduction(min: result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n if (myRank == 0) {\n return result;\n }\n return books.size();\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n size_t rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n\n if (rank == 0) {\n return result;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "_", + "\n\n size_t result = books.size() - 1;\n size_t myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n #pragma openmp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n MPI_Allgather(&result, 1, MPI_UNSIGNED_LONG, &result, 1, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n if (myRank == 0) {\n result = books.size() - 1;\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use Kokkos to search in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n auto min_distance = Kokkos::Max();\n size_t min_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n },\n min_distance, min_index);\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n size_t N = x.extent(0);\n size_t closest_idx = 0;\n double closest_diff = std::abs(x(0) - M_PI);\n Kokkos::parallel_for(\n policy_type(0, N), KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n });\n return closest_idx;\n}\n", + "\n\n const double pi = M_PI;\n Kokkos::View closest_distance = Kokkos::View(1);\n Kokkos::View closest_index = Kokkos::View(1);\n closest_distance[0] = std::numeric_limits::max();\n closest_index[0] = 0;\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double distance = std::abs(x[i] - pi);\n if (distance < closest_distance[0]) {\n closest_distance[0] = distance;\n closest_index[0] = i;\n }\n });\n\n return closest_index[0];\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n size_t closest_index = 0;\n double closest_distance = std::abs(x(0) - M_PI);\n Kokkos::parallel_reduce(policy_t(0, x.size()), KOKKOS_LAMBDA(size_t i, double& closest_distance, size_t& closest_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }, closest_distance, closest_index);\n return closest_index;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n const double pi = M_PI;\n size_t N = x.extent(0);\n size_t closest_idx = 0;\n double min_dist = std::abs(x(0) - pi);\n\n Kokkos::parallel_for(\n policy_t(0, N), KOKKOS_LAMBDA(const size_t& i) {\n double dist = std::abs(x(i) - pi);\n if (dist < min_dist) {\n closest_idx = i;\n min_dist = dist;\n }\n });\n\n return closest_idx;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::Functor;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n struct findToPiFunctor {\n Kokkos::View const x;\n double min_diff;\n int min_index;\n\n findToPiFunctor(Kokkos::View const x) : x(x), min_diff(std::numeric_limits::max()), min_index(-1) {}\n\n KOKKOS_INLINE_FUNCTION\n void operator()(const Kokkos::Work& work) const {\n const int i = work.league_rank();\n const double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n };\n\n findToPiFunctor f(x);\n\n Kokkos::parallel_for(RangePolicy<>(0, x.size()), f);\n\n return f.min_index;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View min_diff = Kokkos::View(1);\n Kokkos::View min_index = Kokkos::View(1);\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff(0) || (diff == min_diff(0) && x(i) > x(min_index(0)))) {\n min_diff(0) = diff;\n min_index(0) = i;\n }\n });\n\n return min_index(0);\n}\n", + "\n\n using namespace Kokkos;\n\n auto closest_idx = 0;\n auto min_diff 
= std::numeric_limits::max();\n auto pi = M_PI;\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& closest_idx) {\n auto diff = std::abs(x(i) - pi);\n if (diff < min_diff) {\n min_diff = diff;\n closest_idx = i;\n }\n }, min_diff, closest_idx);\n\n return closest_idx;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Policy = Kokkos::RangePolicy;\n\n auto min_distance = Kokkos::Max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(Policy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, double& min_distance_local) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < min_distance_local) {\n min_distance_local = distance;\n min_index = i;\n }\n }, min_distance);\n\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n using namespace std;\n\n const double pi = M_PI;\n const size_t n = x.extent(0);\n double min_diff = numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(const size_t& i, double& min_diff_local, size_t& min_idx_local) {\n double diff = abs(x(i) - pi);\n if (diff < min_diff_local) {\n min_diff_local = diff;\n min_idx_local = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n\n size_t N = x.extent(0);\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& min_idx) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n\n size_t N = x.extent(0);\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& min_idx) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x(0) - M_PI);\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < closest_diff) {\n closest_idx = i;\n closest_diff = diff;\n }\n });\n\n return closest_idx;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n auto min_distance = std::numeric_limits::max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(policy_t(0, x.size()), KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n }, min_distance, min_index);\n\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_distance = std::numeric_limits::max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n },\n min_distance, min_index);\n\n return min_index;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::value_type;\n\n size_t N = x.size();\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const size_t& i, value_type& min_diff_local, value_type& min_idx_local) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff_local) 
{\n min_diff_local = diff;\n min_idx_local = i;\n }\n },\n min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n size_t n = x.extent(0);\n size_t closest_index = 0;\n double closest_distance = std::abs(x(0) - M_PI);\n\n Kokkos::parallel_for(\n policy_type(0, n), KOKKOS_LAMBDA(const size_t& i) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n });\n\n return closest_index;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Min;\n\n auto min_dist = Reducer(x.data(), x.size());\n double min_val = min_dist.template reduce();\n\n auto dist_from_pi = Reducer(x.data(), x.size(),\n [min_val](double& a, const double& b) {\n a = std::min(a, std::abs(b - M_PI));\n });\n double closest_dist = dist_from_pi.template reduce();\n\n auto closest_idx = Reducer(x.data(), x.size(),\n [closest_dist, min_val](size_t& a, const double& b) {\n if (std::abs(b - M_PI) == closest_dist &&\n std::abs(b - M_PI) < std::abs(min_val - M_PI)) {\n a = a + 1;\n }\n });\n size_t idx = closest_idx.template reduce();\n\n return idx - 1;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_dist = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(size_t i, size_t& min_idx, double& min_dist) {\n double dist = std::abs(x(i) - M_PI);\n if (dist < min_dist) {\n min_dist = dist;\n min_idx = i;\n }\n }, min_idx, min_dist);\n\n return min_idx;\n}\n", + "\n\n auto closest = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(size_t i, size_t closest) {\n auto diff = std::abs(x(i) - M_PI);\n if (closest == -1 || diff < std::abs(x(closest) - M_PI)) {\n closest = i;\n }\n return closest;\n },\n -1);\n return closest;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use OpenMP to search in parallel.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n#pragma omp critical\n {\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n #pragma omp parallel for reduction(min:min_diff) reduction(min:min_idx)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n\n return min_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::numeric_limits::max();\n\n#pragma omp parallel for\n 
for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n#pragma omp critical\n {\n if (diff < closest_diff) {\n closest_idx = i;\n closest_diff = diff;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_dist = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double dist = std::abs(x[i] - M_PI);\n if (dist < closest_dist) {\n #pragma omp critical\n {\n if (dist < closest_dist) {\n closest_dist = dist;\n closest_idx = i;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma for reduction(min:closest_diff)\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n // Find the index of the value in the vector x that is closest to the math constant PI\n // Use M_PI for the value of PI\n // Use OpenMP to search in parallel\n\n // Get the number of threads to use\n int num_threads = omp_get_num_threads();\n\n // Create a reduction variable to store the index of the closest value\n int closest_index = 0;\n\n // Create a reduction variable to store the smallest difference between a value in x and PI\n double smallest_difference = std::numeric_limits::max();\n\n // Parallel for loop to search for the closest value\n #pragma omp parallel for reduction(min : smallest_difference) reduction(min : closest_index)\n for (int i = 0; i < static_cast(x.size()); i++) {\n // Calculate the difference between the current value and PI\n double difference = std::abs(x[i] - M_PI);\n\n // If the difference is smaller than the smallest difference found so far, update the closest index and smallest difference\n if (difference < smallest_difference) {\n closest_index = i;\n smallest_difference = difference;\n }\n }\n\n // Return the index of the closest value\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n\n#pragma omp parallel for reduction(min : min_diff) reduction(min : min_index)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n#pragma omp parallel for reduction(min : min_diff) reduction(min : min_idx)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp 
parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n #pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double min_diff = std::abs(x[0] - M_PI);\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n min_diff = diff;\n closest_index = i;\n }\n }\n }\n }\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n }\n }\n\n return min_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n #pragma omp critical\n {\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n }\n }\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double min_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n#pragma omp critical\n {\n if (diff < min_diff) {\n closest_index = i;\n min_diff = diff;\n }\n }\n }\n }\n\n return closest_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "serial", + "prompt": "/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t 
findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double minDiff = std::numeric_limits::max();\n size_t minDiffIndex = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n return minDiffIndex;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + ": 2", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_dist = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double dist = std::abs(x[i] - M_PI);\n if (dist < min_dist) {\n min_dist = dist;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", 
+ "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double closest = std::abs(x[0] - M_PI);\n size_t closest_index = 0;\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n return closest_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "hip", + "prompt": "/* Find the index of the value in the vector x that is closest to the math constant PI. Store the index in closestToPiIndex.\n Use M_PI for the value of PI.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\n__global__ void findClosestToPi(const double *x, size_t N, size_t *closestToPiIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n double minDiff = abs(x[tid] - M_PI);\n int minDiffIndex = tid;\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n double diff = abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n\n if (tid == 0) {\n *closestToPiIndex = minDiffIndex;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n double diff = fabs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < minDiff) {\n minDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "losest", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double diff = abs(x[gid] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = gid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n 
if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n double diff = abs(x[gid] - M_PI);\n if (gid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = gid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double diff = fabs(x[gid] - M_PI);\n if (gid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = gid;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n double closest = std::numeric_limits::max();\n size_t closest_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n\n // Reduce the closest values to rank 0\n MPI_Reduce(&closest_index, &closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return closest_index;\n } else {\n return 0;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n size_t closest_index = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_index = i;\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(size);\n MPI_Gather(&closest_index, 1, MPI_UNSIGNED_LONG, closest_indices.data(), 1, MPI_UNSIGNED_LONG, 0);\n\n // Find the overall closest index\n size_t overall_closest_index = 0;\n double overall_closest_diff = std::abs(x[closest_indices[0]] - M_PI);\n for (size_t i = 1; i < size; ++i) {\n double diff = std::abs(x[closest_indices[i]] - M_PI);\n if (diff < overall_closest_diff) {\n overall_closest_diff = diff;\n overall_closest_index = closest_indices[i];\n }\n }\n\n // Return the overall closest index\n return overall_closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n\n int global_min_diff_index;\n MPI_Allreduce(&min_diff_index, &global_min_diff_index, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n\n return global_min_diff_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - 
M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n size_t n = x.size();\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n for (size_t i = 1; i < n; ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t n = x.size();\n size_t closest = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < n; i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest = i;\n }\n }\n return closest;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n int result;\n MPI_Reduce(&min_index, &result, 1, MPI_INT, MPI_MINLOC, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Find the index of the value in the vector that is closest to PI on each rank\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n // Reduce the closest indices to find the global closest index\n size_t globalClosestIndex;\n MPI_Reduce(&closestIndex, &globalClosestIndex, 1, MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the global closest index on rank 0\n if (rank == 0) {\n return globalClosestIndex;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n return result;\n}\n", + "\n\n double closest = std::numeric_limits::max();\n size_t closest_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_idx = i;\n }\n }\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::numeric_limits::max();\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t closest = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest = i;\n }\n }\n return closest;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < 
x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n int global_min_idx;\n MPI_Allreduce(&min_idx, &global_min_idx, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return global_min_idx;\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n double closest = std::numeric_limits::max();\n size_t closest_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n\n // Reduce the closest values to rank 0\n MPI_Reduce(&closest_index, &closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return closest_index;\n } else {\n return 0;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "cuda", + "prompt": "/* Find the index of the value in the vector x that is closest to the math constant PI. Store the index in closestToPiIndex.\n Use M_PI for the value of PI.\n Use CUDA to search in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\n__global__ void findClosestToPi(const double *x, size_t N, size_t *closestToPiIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || 
diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n double minDiff = abs(x[tid] - M_PI);\n int minDiffIndex = tid;\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n double diff = abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n if (tid == 0) {\n *closestToPiIndex = minDiffIndex;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = fabs(x[i] - M_PI);\n if (i == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < minDiff) {\n minDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double diff = abs(x[idx] - M_PI);\n if (idx == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = idx;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) 
{\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestDistance = distance;\n closestIndex = i;\n }\n }\n\n int globalClosestIndex;\n MPI_Allreduce(&closestIndex, &globalClosestIndex, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return globalClosestIndex;\n}\n", + "\n\n // Find the closest value to PI in parallel\n size_t closest_index = 0;\n double min_diff = std::numeric_limits::max();\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest_index = i;\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Allgather(&closest_index, 1, MPI::UNSIGNED_LONG, &closest_indices[0], 1, MPI::UNSIGNED_LONG);\n\n // Find the global closest index\n size_t global_closest_index = 0;\n min_diff = std::numeric_limits::max();\n for (size_t i = 0; i < closest_indices.size(); ++i) {\n double diff = std::abs(x[closest_indices[i]] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n global_closest_index = closest_indices[i];\n }\n }\n\n return global_closest_index;\n}\n", + "\n\n // Find the closest value to PI on each rank\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_index = i;\n closest_distance = distance;\n }\n }\n\n // Reduce the closest index to rank 0\n size_t global_closest_index;\n MPI_Reduce(&closest_index, &global_closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the global closest index on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double min_diff = std::numeric_limits::max();\n int min_diff_index = -1;\n\n #pragma omp parallel for reduction(min : min_diff) reduction(min : min_diff_index)\n for (int i = 0; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n\n return min_diff_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_distance)\n for (size_t i = 0; i < x.size(); ++i) {\n double distance 
= std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n\n return result;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the index of the value in the vector x that is closest to the math constant PI.\n size_t index = 0;\n double min_diff = std::numeric_limits::max();\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n index = i;\n }\n }\n\n // Return the result on rank 0.\n if (rank == 0) {\n return index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n // Get the number of ranks\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Find the closest value to PI on this rank\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_index = i;\n closest_distance = distance;\n }\n }\n\n // Reduce the closest indices and distances to find the overall closest value\n size_t global_closest_index;\n double global_closest_distance;\n MPI_Reduce(&closest_index, &global_closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n MPI_Reduce(&closest_distance, &global_closest_distance, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_distance)\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double minDiff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < minDiff) {\n 
#pragma omp critical\n if (diff < minDiff) {\n closestIndex = i;\n minDiff = diff;\n }\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t closest_idx = 0;\n double min_diff = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Initialize the index of the closest value to -1\n size_t closest_index = -1;\n\n // Initialize the minimum distance to a large value\n double min_distance = std::numeric_limits::max();\n\n // Loop over the elements of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n // Calculate the distance between the current element and PI\n double distance = std::abs(x[i] - M_PI);\n\n // Update the closest index and minimum distance if the current distance is smaller\n #pragma omp critical\n {\n if (distance < min_distance) {\n closest_index = i;\n min_distance = distance;\n }\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(num_ranks);\n MPI_Gather(&closest_index, 1, MPI_UNSIGNED_LONG, closest_indices.data(), 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Find the index with the smallest distance\n if (rank == 0) {\n size_t global_closest_index = -1;\n double global_min_distance = std::numeric_limits::max();\n for (size_t i = 0; i < num_ranks; ++i) {\n if (closest_indices[i] != -1 && x[closest_indices[i]] < global_min_distance) {\n global_closest_index = closest_indices[i];\n global_min_distance = x[closest_indices[i]];\n }\n }\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t closest_idx = 0;\n double min_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n closest_idx = i;\n min_diff = diff;\n }\n }\n }\n }\n\n return closest_idx;\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(Kokkos::View*> const& x, Kokkos::View &r, Kokkos::View &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n // Compute the Fourier transform of x\n auto y = fft_transform(x);\n\n // Extract the real and imaginary parts of the result\n r = real(y);\n i = imag(y);\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& n) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double angle = 2 * M_PI * k * n / x.extent(0);\n real += x(k).real() * cos(angle) + x(k).imag() * sin(angle);\n imag += x(k).imag() * cos(angle) - x(k).real() * sin(angle);\n }\n r(n) = real;\n i(n) = imag;\n });\n}\n", + "\n\n // Compute the FFT of x\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2.0 * M_PI * j * i / x.extent(0);\n sum += x(j) * std::exp(-std::complex(0.0, angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n real += x(j).real() * cos(angle) - x(j).imag() * sin(angle);\n imag += x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA (const int& n) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double angle = 2 * M_PI * k * n / x.extent(0);\n real += x(k).real() * cos(angle) - x(k).imag() * sin(angle);\n imag += x(k).real() * sin(angle) + x(k).imag() * cos(angle);\n }\n r(n) = real;\n i(n) = imag;\n });\n}\n", + "\n\n // Compute the DFT of x\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& n) {\n double re = 0.0;\n double im = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double theta = 2 * M_PI * k * n / x.extent(0);\n re += x(k).real() * cos(theta) + x(k).imag() * sin(theta);\n im += x(k).real() * sin(theta) - x(k).imag() * cos(theta);\n }\n r(n) = re;\n i(n) = im;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double re = 0.0;\n double im = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n re += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n im += x(j).real() * 
sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = re;\n i(i) = im;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += -x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n\n const int n = x.extent(0);\n const int n_half = n / 2;\n\n // Create a plan for the FFT\n FFTPlan plan(n);\n\n // Create a view of the input and output arrays\n View x_real = create_view(x, 0);\n View x_imag = create_view(x, 1);\n View r_view = create_view(r);\n View i_view = create_view(i);\n\n // Execute the FFT\n plan.execute(x_real, x_imag, r_view, i_view);\n\n // Normalize the output\n r_view /= n;\n i_view /= n;\n}\n", + "\nKokkos::parallel_for(Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA (const int i) {\n r(i) = 0;\n i(i) = 0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n r(i) += x(j) * cos(angle);\n i(i) -= x(j) * sin(angle);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += -x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n\n // Create a plan for the FFT\n auto plan = FFTPlan(x.extent(0), true);\n\n // Create a workspace for the FFT\n auto workspace = plan.get_workspace();\n\n // Execute the FFT\n plan.execute(x, workspace);\n\n // Extract the real and imaginary parts of the results\n r = plan.get_output_real();\n i = plan.get_output_imag();\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n // Create a team policy for parallel execution\n TeamPolicy policy(x.extent(0));\n\n // Define the kernel lambda to compute the FFT\n KernelFunctor kernel;\n kernel.template run(policy, [=](const int& i) {\n 
std::complex sum = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sum += x(j) * std::exp(-std::complex(0, angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) - x[t].imag() * sin(angle);\n sumi += x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int N = x.size();\n r.resize(N);\n i.resize(N);\n\n #pragma omp parallel for\n for (int k = 0; k < N; ++k) {\n std::complex sum = 0;\n for (int n = 0; n < N; ++n) {\n double angle = 2 * M_PI * n * k / N;\n sum += x[n] * std::polar(1.0, -angle);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const size_t n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (size_t j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const size_t n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute the fourier transform of x.\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (size_t j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = 
sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::exp(-std::complex(0, 2 * M_PI * k * t / n));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n int num_threads = omp_get_max_threads();\n int chunk_size = n / num_threads;\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int start = tid * chunk_size;\n int end = start + chunk_size;\n if (tid == num_threads - 1) {\n end = n;\n }\n\n for (int j = start; j < end; j++) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < n; k++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[k].real() * cos(angle) + x[k].imag() * sin(angle);\n imag += -x[k].real() * sin(angle) + x[k].imag() * cos(angle);\n }\n r[j] = real;\n i[j] = imag;\n }\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n std::complex omega = std::polar(1.0, -2 * M_PI * t * k / n);\n sum += x[t] * omega;\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, -angle);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumi += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute the DFT of x\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::exp(-2 * M_PI * std::complex(0, 1) * k * t / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::exp(std::complex(0, -angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n 
}\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n 
return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumi += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sumr += x[j].real() * cos(angle) + x[j].imag() * sin(angle);\n sumi += -x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int k = 0; k < n / 2; k++) {\n even[k] = x[2 * k];\n odd[k] = x[2 * k + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int k = 0; k < n / 2; k++) {\n std::complex t = w * odd[k];\n r[k] = r[k] + t.real();\n i[k] = i[k] + t.imag();\n r[k + n / 2] = r[k] - t.real();\n i[k + n / 2] = i[k] - t.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n 
if (n == 0) {\n return;\n }\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2);\n std::vector> odd(n / 2);\n for (int k = 0; k < n / 2; k++) {\n even[k] = x[2 * k];\n odd[k] = x[2 * k + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double pi = std::acos(-1);\n for (int k = 0; k < n / 2; k++) {\n double t = r[k];\n r[k] = t + std::cos(2 * pi * k / n) * r[k + n / 2] - std::sin(2 * pi * k / n) * i[k + n / 2];\n i[k] = t - std::cos(2 * pi * k / n) * r[k + n / 2] + std::sin(2 * pi * k / n) * i[k + n / 2];\n }\n}\n", + "\n\n int n = x.size();\n if (n == 0) {\n return;\n }\n r.resize(n);\n i.resize(n);\n for (int k = 0; k < n; k++) {\n double real = 0;\n double imag = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[j].real() * cos(angle) - x[j].imag() * sin(angle);\n imag += x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = real;\n i[k] = imag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "fft, 4,", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double real = 0;\n double imag = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[j].real() * cos(angle) - x[j].imag() * sin(angle);\n imag += x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r.push_back(real);\n i.push_back(imag);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sumr += x[j].real() * cos(angle) + x[j].imag() * sin(angle);\n sumi += -x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n std::vector> X(n);\n for (int k = 0; k < n; ++k) {\n for (int j = 0; j < n; ++j) {\n X[k] += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n }\n for (int i = 0; i < n; ++i) {\n r[i] = X[i].real();\n i[i] = X[i].imag();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\n__global__ void fft(const hipDoubleComplex *x, double *r, double *i, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n hipDoubleComplex sum = {0, 0};\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n if (i < N) {\n hipDoubleComplex sum = hipCreateDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex rot = hipCreateDoubleComplex(cos(angle), sin(angle));\n sum = hipCadd(sum, hipCmul(x[j], rot));\n }\n r[i] = hipCreal(sum);\n i[i] = hipCimag(sum);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; k++) {\n double arg = -2 * M_PI * k * gid / N;\n hipDoubleComplex w = {cos(arg), sin(arg)};\n sum += x[k] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n}\n", + "fft:", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[i] = sum.x;\n i[i] = sum.y;\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n size_t N2 = N / 2;\n if (tid < N2) {\n r[tid] = x[tid].x + x[tid + N2].x;\n i[tid] = x[tid].y + x[tid + N2].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * gid / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[i] = sum.x;\n i[i] = sum.y;\n}\n", + "\n\n int id = blockIdx.x * blockDim.x + threadIdx.x;\n if (id < N) {\n r[id] = x[id].real();\n i[id] = x[id].imag();\n }\n __syncbarriers();\n for (size_t n = 2; n <= N; n *= 2) {\n double a = 2 * M_PI / n;\n for (size_t k = 0; k < N; k += n) {\n double c = cos(a * k);\n double s = sin(a * k);\n for (size_t j = 0; j < n / 2; j++) {\n size_t p = k + j;\n size_t q = p + n / 2;\n double t = r[q] * c - i[q] * s;\n i[q] = r[q] * s + i[q] * c;\n r[q] = r[p] - t;\n i[p] = i[p] - t;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * i / N;\n hipDoubleComplex w = hipMakeDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = 
hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n if (i < N) {\n r[i] = x[i].x;\n i[i] = x[i].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n r[gid] = x[gid].x;\n i[gid] = x[gid].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * gid * j / N});\n sum += x[j] * w;\n }\n r[gid] = hipCreal(sum);\n i[gid] = hipCimag(sum);\n }\n}\n", + "\n\n size_t idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx >= N) return;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * idx * k / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[k] * w;\n }\n r[idx] = sum.x;\n i[idx] = sum.y;\n}\n", + "\n\n size_t k = blockIdx.x * blockDim.x + threadIdx.x;\n if (k < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0, -2 * M_PI * j * k / N));\n sum += x[j] * w;\n }\n r[k] = hipCreal(sum);\n i[k] = hipCimag(sum);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx >= N) return;\n hipDoubleComplex sum = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * idx * k / N;\n hipDoubleComplex w = hipMakeDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[idx] = creal(sum);\n i[idx] = cimag(sum);\n}\n", + "\n\n // Compute the fourier transform of x.\n hipDoubleComplex *X = hipMalloc(N);\n hipMemcpy(X, x, N * sizeof(hipDoubleComplex), hipMemcpyHostToDevice);\n hipfftPlan plan;\n hipfftPlanCreate(&plan, N, HIPFFT_C2C, HIPFFT_FORWARD);\n hipfftExecC2C(plan, X, X, HIPFFT_FORWARD);\n hipfftDestroyPlan(plan);\n hipMemcpy(r, X, N * sizeof(double), hipMemcpyDeviceToHost);\n hipMemcpy(i, X + 1, N * sizeof(double), hipMemcpyDeviceToHost);\n hipFree(X);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n int stride = hipBlockDim_x * hipGridDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * k * gid / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[k] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n}\n", + "fft," + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y);\n\n // Gather the results on rank 0\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT of z on rank 0\n if (rank == 0) {\n fft_local(z, y);\n }\n\n // Scatter the results to all ranks\n MPI_Scatter(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the results\n for (int j = 0; j < n; j++) {\n r[j] = z[j].real();\n i[j] = z[j].imag();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n y[k] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Send the DFT of x to rank 0\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse DFT of z\n if (rank == 0) {\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n x[k] += z[j] * std::exp(2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n x[k] /= n;\n r[k] = x[k].real();\n i[k] = x[k].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the global result\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n 
i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector r_local(n), i_local(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y, r_local, i_local);\n\n // Gather the results on rank 0\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < n; i++) {\n r[i] = r_local[i];\n i[i] = i_local[i];\n }\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n), i_recv(n);\n MPI_Recv(r_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n r[j] += r_recv[j];\n i[j] += i_recv[j];\n }\n }\n } else {\n MPI_Send(r_local.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i_local.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_fft(n);\n fft_local(x, local_fft);\n\n // Gather local FFTs\n std::vector> global_fft(n * size);\n MPI_Gather(local_fft.data(), n, MPI_DOUBLE, global_fft.data(), n, MPI_DOUBLE, 0);\n\n // Compute global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter global FFT\n MPI_Scatter(global_fft.data(), n, MPI_DOUBLE, local_fft.data(), n, MPI_DOUBLE, 0);\n\n // Extract real and imaginary parts\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = local_fft[j].real();\n i[j] = local_fft[j].imag();\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n);\n std::vector> local_r(local_n), local_i(local_n);\n fft_recursive(local_x, local_r, local_i);\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].real();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n std::vector> Z(n);\n\n for (int j = 0; j < n; j++) {\n X[j] = x[j];\n }\n\n MPI_Alltoall(X.data(), 1, MPI_DOUBLE_INT, Y.data(), 1, MPI_DOUBLE_INT, MPI_COMM_WORLD);\n\n for (int j = 0; j < n; 
j++) {\n Z[j] = Y[j] * std::exp(-2.0 * M_PI * std::complex(0.0, 1.0) * j / n);\n }\n\n MPI_Alltoall(Z.data(), 1, MPI_DOUBLE_INT, Y.data(), 1, MPI_DOUBLE_INT, MPI_COMM_WORLD);\n\n for (int j = 0; j < n; j++) {\n std::complex sum = std::complex(0.0, 0.0);\n for (int k = 0; k < n; k++) {\n sum += Y[k] * std::exp(2.0 * M_PI * std::complex(0.0, 1.0) * j * k / n);\n }\n X[j] = sum;\n }\n\n for (int j = 0; j < n; j++) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n if (rank == 0) {\n for (int j = 1; j < size; j++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Status status;\n MPI_Request request;\n MPI_Irecv(r_recv.data(), n, MPI_DOUBLE, j, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n MPI_Irecv(i_recv.data(), n, MPI_DOUBLE, j, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_impl(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> r_all(x.size());\n std::vector> i_all(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_all.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_all.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r = std::vector(r_all.size());\n i = std::vector(i_all.size());\n for (size_t j = 0; j < r_all.size(); ++j) {\n r[j] = r_all[j].real();\n i[j] = r_all[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y);\n\n // Gather the results on rank 0\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < size; i++) {\n std::vector> y_i(n);\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &y_i[0], n, MPI_DOUBLE_COMPLEX, i, MPI_COMM_WORLD);\n for (int j = 0; j < n; j++) {\n r[j] += y_i[j].real();\n i[j] += y_i[j].imag();\n }\n }\n } else {\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); j++) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n if (n != (1 << (int)log2(n))) {\n throw std::invalid_argument(\"x must have a power of 2 length\");\n }\n\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = x[i];\n }\n\n for (int s = 1; s <= (int)log2(n); s++) {\n int m = 1 << s;\n int m2 = m / 2;\n double theta = 2 * M_PI / m;\n std::complex w(cos(theta), sin(theta));\n std::complex wn(1, 0);\n for (int j = 0; j < m2; j++) {\n for (int k = j; k < n; k += m) {\n std::complex t = wn * y[k + m2];\n y[k + m2] = y[k] - t;\n y[k] = y[k] + t;\n }\n wn = wn * w;\n }\n }\n\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n r[i] = y[i].real();\n i[i] = y[i].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_impl(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_r(n_local);\n std::vector> local_i(n_local);\n\n // Compute local FFT\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(n);\n std::vector> global_i(n);\n MPI_Gather(local_r.data(), n_local, MPI_DOUBLE, global_r.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n_local, MPI_DOUBLE, global_i.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n 
// Compute the local part of the transform\n std::vector> local_r(n), local_i(n);\n for (int j = 0; j < n; ++j) {\n std::complex sum_r(0, 0), sum_i(0, 0);\n for (int k = 0; k < n; ++k) {\n double angle = -2 * M_PI * j * k / n;\n sum_r += std::exp(std::complex(0, angle)) * x[k];\n sum_i += std::exp(std::complex(0, angle)) * x[k] * std::complex(0, 1);\n }\n local_r[j] = sum_r;\n local_i[j] = sum_i;\n }\n\n // Gather the local transforms into a global array\n std::vector> global_r(n), global_i(n);\n MPI_Allgather(local_r.data(), n, MPI_DOUBLE_COMPLEX, global_r.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n MPI_Allgather(local_i.data(), n, MPI_DOUBLE_COMPLEX, global_i.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the global transform\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * n / size, x.begin() + (rank + 1) * n / size);\n std::vector> local_r(n / size), local_i(n / size);\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(n), i_gathered(n);\n MPI_Gather(local_r.data(), n / size, MPI_DOUBLE, r_gathered.data(), n / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n / size, MPI_DOUBLE, i_gathered.data(), n / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(n);\n i = std::vector(n);\n for (int j = 0; j < n; j++) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n\n // Compute the local Fourier transform\n std::vector> local_x(n);\n std::vector> local_r(n);\n std::vector> local_i(n);\n for (int j = 0; j < n; ++j) {\n local_x[j] = x[j];\n }\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(n * size);\n std::vector> i_gathered(n * size);\n MPI_Gather(local_r.data(), n, MPI_DOUBLE_COMPLEX, r_gathered.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n, MPI_DOUBLE_COMPLEX, i_gathered.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n for (int j = 0; j < n; ++j) {\n r[j] = r_gathered[j].real();\n i[j] = i_gathered[j].real();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Allgather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), local_r.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n MPI_Allgather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), local_i.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].real();\n }\n }\n}\n" + ] + }, + { + 
"problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\n__global__ void fft(const cuDoubleComplex *x, double *r, double *i, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n size_t idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) {\n return;\n }\n double sum_r = 0.0;\n double sum_i = 0.0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * idx / N;\n sum_r += x[k].x * cos(angle) + x[k].y * sin(angle);\n sum_i += -x[k].x * sin(angle) + x[k].y * cos(angle);\n }\n r[idx] = sum_r;\n i[idx] = sum_i;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t k = blockIdx.x * blockDim.x + threadIdx.x;\n if (k < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * k * j / N;\n cuDoubleComplex rot = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * rot;\n }\n r[k] = creal(sum);\n i[k] = cimag(sum);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n // Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n // Example:\n\n // input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n // output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[gid] = creal(sum);\n i[gid] = cimag(sum);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = block.thread.x + block.block.x * block.grid.x;\n if (tid >= N) {\n return;\n }\n double re = 0, im = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * tid * k / N;\n re += x[k] * cos(angle);\n im += x[k] * sin(angle);\n }\n r[tid] = re;\n i[tid] = im;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i0 = tid + bid * blockDim.x;\n if (i0 < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i0 * j / 
N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i0] = creal(sum);\n i[i0] = cimag(sum);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n int stride = blockDim.x * gridDim.x;\n for (int k = tid; k < N; k += stride) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * k / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(x[j], term));\n }\n r[k] = sum.x;\n i[k] = sum.y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * k * tid / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx >= N) return;\n\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * idx * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[idx] = creal(sum);\n i[idx] = cimag(sum);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = sum.x;\n i[tid] = sum.y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n size_t offset = tid % N;\n double sumr = 0, sumi = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * offset / N;\n sumr += real(x[k]) * cos(angle) + imag(x[k]) * sin(angle);\n sumi += -real(x[k]) * sin(angle) + imag(x[k]) * cos(angle);\n }\n r[tid] = sumr;\n i[tid] = sumi;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n 
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid >= N) return;\n\n double re = 0, im = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n re += x[k].x * cos(angle) - x[k].y * sin(angle);\n im += x[k].x * sin(angle) + x[k].y * cos(angle);\n }\n r[tid] = re;\n i[tid] = im;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i0 = tid + bid * blockDim.x;\n if (i0 < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * i0 / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[i0] = creal(sum);\n i[i0] = cimag(sum);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> x_local(x.begin() + start, x.begin() + end);\n std::vector> y_local(n_local);\n std::vector> y(n);\n\n // Compute the DFT of the local data\n fft_local(x_local, y_local);\n\n // Gather the results from all ranks\n MPI_Allgather(y_local.data(), n_local, MPI_DOUBLE_COMPLEX, y.data(), n_local, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Compute the global DFT\n fft_global(y, n);\n\n // Extract the real and imaginary parts of the result\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = y[j].real();\n i[j] = y[j].imag();\n }\n\n // Send the result to rank 0\n if (rank != 0) {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n // Receive the results from all ranks\n for (int i = 1; i < size; i++) {\n MPI_Recv(r.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_fft(n);\n fftw_plan plan = fftw_plan_dft_1d(n, reinterpret_cast(&x[0]), reinterpret_cast(&local_fft[0]), FFTW_FORWARD, FFTW_ESTIMATE);\n fftw_execute(plan);\n fftw_destroy_plan(plan);\n\n // Gather local FFT results\n std::vector> global_fft(n * size);\n MPI_Gather(local_fft.data(), n, MPI_DOUBLE, global_fft.data(), n, MPI_DOUBLE, 0);\n\n // Compute global FFT\n fftw_plan plan2 = fftw_plan_dft_1d(n * size, reinterpret_cast(&global_fft[0]), reinterpret_cast(&global_fft[0]), FFTW_BACKWARD, FFTW_ESTIMATE);\n fftw_execute(plan2);\n fftw_destroy_plan(plan2);\n\n // Extract real and imaginary parts\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = real(global_fft[j] / n);\n i[j] = imag(global_fft[j] / n);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(local_n);\n std::vector> local_r(local_n);\n std::vector> local_i(local_n);\n std::copy(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n, local_x.begin());\n fft_serial(local_x, local_r, local_i);\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int n_local_padded = n_local * 2;\n\n std::vector> x_local(n_local);\n std::vector> x_local_padded(n_local_padded);\n std::vector> y_local(n_local_padded);\n std::vector> y(n);\n\n MPI_Scatter(x.data(), n_local, MPI_DOUBLE_COMPLEX, x_local.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Pad with zeros\n for (int i = 0; i < n_local; i++) {\n x_local_padded[i] = x_local[i];\n x_local_padded[i + n_local] = 0;\n }\n\n // Compute FFT in parallel\n fft_recursive(x_local_padded, y_local);\n\n // Gather results\n MPI_Gather(y_local.data(), n_local_padded, MPI_DOUBLE_COMPLEX, y.data(), n_local_padded, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < n; i++) {\n r[i] = y[i].real();\n i[i] = y[i].imag();\n }\n }\n}\n", + "\n\n // Compute the FFT of x.\n std::vector> X(x);\n fft_mpi(X);\n\n // Store the real and imaginary parts of the FFT in r and i.\n r.resize(X.size());\n i.resize(X.size());\n for (size_t j = 0; j < X.size(); ++j) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n // Gather the results on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector r_all(X.size() * MPI::COMM_WORLD.Get_size());\n std::vector i_all(X.size() * MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Gather(&r[0], X.size(), MPI::DOUBLE, &r_all[0], X.size(), MPI::DOUBLE, 0);\n MPI::COMM_WORLD.Gather(&i[0], X.size(), MPI::DOUBLE, &i_all[0], X.size(), MPI::DOUBLE, 0);\n r = r_all;\n i = i_all;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_all(x.size());\n std::vector> 
i_all(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_all.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_all.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); j++) {\n r[j] = r_all[j].real();\n i[j] = r_all[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n std::complex sum = std::complex(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n X[k] = sum;\n }\n\n // Compute the inverse DFT of X\n for (int k = 0; k < n; k++) {\n std::complex sum = std::complex(0, 0);\n for (int j = 0; j < n; j++) {\n sum += X[j] * std::exp(2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n Y[k] = sum / n;\n }\n\n // Store the real and imaginary parts of Y in r and i\n for (int k = 0; k < n; k++) {\n r[k] = Y[k].real();\n i[k] = Y[k].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Recv(&r_recv[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(&i_recv[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(&r[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(&i[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n y[k] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j] += y[k] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Store the real and imaginary parts of the DFT in r and i\n for (int j = 0; j < n; j++) {\n r[j] = z[j].real();\n i[j] = z[j].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_i(n);\n std::vector i_i(n);\n MPI_Recv(r_i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n r[j] += r_i[j];\n i[j] += i_i[j];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute local FFTs\n std::vector> local_r(n);\n std::vector> local_i(n);\n fft_local(x, local_r, local_i);\n\n // Gather local FFTs\n std::vector> global_r(n * size);\n std::vector> global_i(n * size);\n MPI_Gather(local_r.data(), n, MPI_DOUBLE_COMPLEX, global_r.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n, MPI_DOUBLE_COMPLEX, global_i.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute global FFT\n if (rank == 0) {\n 
fft_global(global_r, global_i);\n }\n\n // Scatter global FFT\n MPI_Scatter(global_r.data(), n, MPI_DOUBLE_COMPLEX, r.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Scatter(global_i.data(), n, MPI_DOUBLE_COMPLEX, i.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local partial sums\n std::vector> local_r(n, 0.0);\n std::vector> local_i(n, 0.0);\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n double angle = -2 * M_PI * j * k / n;\n local_r[k] += x[j] * std::cos(angle);\n local_i[k] += x[j] * std::sin(angle);\n }\n }\n\n // Reduce partial sums to rank 0\n std::vector> global_r(n, 0.0);\n std::vector> global_i(n, 0.0);\n MPI_Reduce(local_r.data(), n, MPI_DOUBLE, global_r.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Reduce(local_i.data(), n, MPI_DOUBLE, global_i.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts on rank 0\n if (rank == 0) {\n for (int k = 0; k < n; k++) {\n r[k] = global_r[k].real();\n i[k] = global_i[k].imag();\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n\n // Compute the FFT of x\n #pragma opm parallel for\n for (int j = 0; j < n; j++) {\n X[j] = x[j];\n }\n fft(X);\n\n // Compute the real and imaginary parts of the FFT\n #pragma opm parallel for\n for (int j = 0; j < n; j++) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int j = 1; j < size; j++) {\n std::vector r_j(n);\n std::vector i_j(n);\n MPI_Gather(&r[0], n, MPI_DOUBLE, &r_j[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&i[0], n, MPI_DOUBLE, &i_j[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (int k = 0; k < n; k++) {\n r[k] += r_j[k];\n i[k] += i_j[k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Allgather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n 
MPI_Allgather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_r(n_local);\n std::vector> local_i(n_local);\n\n // Compute the local FFT\n fft_local(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> r_gathered(n);\n std::vector> i_gathered(n);\n MPI_Gather(local_r.data(), n_local, MPI_DOUBLE, r_gathered.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n_local, MPI_DOUBLE, i_gathered.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r = std::vector(n);\n i = std::vector(n);\n for (int j = 0; j < n; j++) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(local_n);\n std::vector> local_r(local_n);\n std::vector> local_i(local_n);\n\n std::copy(x.begin() + local_start, x.begin() + local_end, local_x.begin());\n\n #pragma omp parallel for\n for (int j = 0; j < local_n; j++) {\n double sum_r = 0.0;\n double sum_i = 0.0;\n for (int k = 0; k < n; k++) {\n double theta = 2 * M_PI * k / n;\n sum_r += x[k] * cos(theta * j);\n sum_i += x[k] * sin(theta * j);\n }\n local_r[j] = sum_r;\n local_i[j] = sum_i;\n }\n\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> 
i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n y[k] = sum;\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j * n + k] = y[k * n + j];\n }\n }\n\n // Compute the DFT of z\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += z[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n y[k] = sum;\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j * n + k] = y[k * n + j];\n }\n }\n\n // Store the real and imaginary parts of the DFT in r and i\n for (int k = 0; k < n; k++) {\n r[k] = z[k].real();\n i[k] = z[k].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Recv(r_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const size_t n = x.extent(0);\n const size_t n_half = n / 2;\n\n // Compute the FFT of the first half of x.\n auto x_half = x.slice(0, n_half);\n fft(x_half);\n\n // Compute the FFT of the second half of x.\n auto x_half_plus = x.slice(n_half, n_half);\n fft(x_half_plus);\n\n // Compute the imaginary conjugate of the second half of x.\n for (size_t i = 0; i < n_half; ++i) {\n x_half_plus(i) = conj(x_half_plus(i));\n }\n\n // Swap the first and second halves of x.\n auto temp = x_half;\n x_half = x_half_plus;\n x_half_plus = temp;\n\n // Compute the inverse FFT of x.\n fft(x, Inverse);\n\n // Normalize the FFT by dividing by n.\n for (size_t i = 0; i < n; ++i) {\n x(i) /= n;\n }\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n 
Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const size_t n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n std::vector> w(n);\n std::vector> u(n);\n std::vector> t(n);\n\n#pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n y[i] = x[i];\n }\n\n for (size_t i = 0; i < n; i++) {\n for (size_t j = 0; j < n; j++) {\n w[j] = std::exp(std::complex(0, -2 * M_PI * i * j / n));\n }\n for (size_t j = 0; j < n; j++) {\n z[j] = y[j] * w[j];\n }\n for (size_t j = 0; j < n; j++) {\n u[j] = z[j];\n }\n for (size_t j = 0; j < n / 2; j++) {\n t[j] = u[j] + std::conj(u[n - j - 1]);\n t[n - j - 1] = std::complex(0, 1) * (u[j] - std::conj(u[n - j - 1]));\n }\n for (size_t j = 0; j < n; j++) {\n y[j] = t[j];\n }\n }\n\n#pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n x[i] = y[i];\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // Compute the fourier transform of x in-place.\n fft(x);\n\n // Compute the imaginary conjugate of each value.\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n int size = 1 << log_n;\n std::vector> y(size);\n for (int i = 0; i < n; i++) {\n y[i] = x[i];\n }\n for (int i = n; i < size; i++) {\n y[i] = 0;\n }\n std::vector> roots(size / 2);\n for (int i = 0; i < size / 2; i++) {\n double angle = 2 * M_PI * i / size;\n roots[i] = std::complex(cos(angle), sin(angle));\n }\n for (int i = 0; i < log_n; i++) {\n int step = 1 << i;\n int half_step = step / 2;\n for (int j = 0; j < size; j += step) {\n for (int k = 0; k < half_step; k++) {\n std::complex u = y[j + k];\n std::complex v = y[j + k + half_step] * roots[size / (2 * step) * k];\n y[j + k] = u + v;\n y[j + k + half_step] = u - v;\n }\n }\n }\n for (int i = 0; i < size; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int numThreads = omp_get_max_threads();\n const int chunkSize = n / numThreads;\n\n #pragma omp parallel\n {\n #pragma omp for schedule(static, chunkSize)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n }\n}\n", + "\n\n const int n = x.size();\n const int lg = __lg(n);\n int i0 = 0;\n for (int l = 2; l <= n; l <<= 1) {\n for (int i = 0; i < n; i += l) {\n for (int j = 0; j < l / 2; j++) {\n std::complex u = x[i0 + j];\n std::complex v = x[i0 + j + l / 2];\n x[i0 + j] = u + v;\n x[i0 + j + l / 2] = std::complex(u.real() - v.real(), -v.imag());\n }\n i0 += l;\n }\n i0 = 0;\n }\n for (int i = 0; i < n; i++) {\n int j = __builtin_reverse(i, lg);\n if (i < j) {\n std::swap(x[i], x[j]);\n }\n }\n}\n", + "\n\n int n = x.size();\n std::vector> y(n);\n for (int i 
= 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n fft(y);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n // Compute the Fourier transform of x in-place.\n fft(x);\n\n // Compute the imaginary conjugate of each value.\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x\n fft(x, y);\n\n // Compute the conjugate of each value in y\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n\n // Copy the conjugate values back into x\n x = z;\n}\n", + "\n\n const int n = x.size();\n if (n == 1) return;\n std::vector> even(n / 2), odd(n / 2);\n#pragma omp parallel for\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int n_threads = omp_get_max_threads();\n const int n_threads_per_group = 4;\n const int n_groups = n_threads / n_threads_per_group;\n\n std::vector> y(n);\n std::vector> z(n);\n\n #pragma omp parallel num_threads(n_threads)\n {\n const int tid = omp_get_thread_num();\n const int group_id = tid / n_threads_per_group;\n const int local_id = tid % n_threads_per_group;\n\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n y[rev] = x[i];\n }\n\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const double omega = 2 * M_PI / m;\n const std::complex w(cos(omega), sin(omega));\n\n #pragma omp for schedule(static)\n for (int k = 0; k < n; k += m) {\n std::complex wn(1, 0);\n for (int j = 0; j < m / 2; j++) {\n const std::complex t1 = y[k + j];\n const std::complex t2 = wn * y[k + j + m / 2];\n y[k + j] = t1 + t2;\n y[k + j + m / 2] = t1 - t2;\n wn *= w;\n }\n }\n }\n\n #pragma omp for schedule(static)\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n }\n\n x = z;\n}\n", + "\n\n const int n = x.size();\n const int n_half = n / 2;\n const double pi = 3.14159265358979323846;\n\n // Reverse-bit permutation\n for (int i = 0; i < n; i++) {\n int j = 0;\n for (int k = 0; k < n_half; k++) {\n if (i & (1 << k)) {\n j |= (1 << (n_half - k - 1));\n }\n }\n if (i < j) {\n std::swap(x[i], x[j]);\n }\n }\n\n // Compute the FFT in-place\n for (int s = 1; s <= n_half; s *= 2) {\n const double theta = pi / s;\n const std::complex wn(std::cos(theta), std::sin(theta));\n for (int i = 0; i < n; i += 2 * s) {\n std::complex w(1, 0);\n for (int j = 0; j < s; j++) {\n const std::complex u = x[i + j];\n const std::complex v = x[i + j + s] * w;\n x[i + j] = u + v;\n x[i + j + s] = u - v;\n w *= wn;\n }\n }\n }\n\n // Conjugate the imaginary part of each value\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x\n fft(x, y);\n\n // Compute the imaginary conjugate of each value in y\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n\n // Copy the values of z back into x\n x = z;\n}\n", + "\n\n const int n = x.size();\n const int num_threads = omp_get_max_threads();\n const int 
chunk_size = n / num_threads;\n\n #pragma omp parallel for schedule(static, chunk_size)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int nthreads = omp_get_max_threads();\n const int chunksize = n / nthreads;\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n if (i < rev) {\n std::swap(x[i], x[rev]);\n }\n }\n\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const double wn = std::polar(1.0, -2.0 * M_PI / m);\n const double w = 1.0;\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int j = 0; j < n; j += m) {\n double wr = w;\n double wi = 0.0;\n for (int k = 0; k < m / 2; k++) {\n const std::complex t = x[j + k + m / 2] * std::complex(wr, wi);\n x[j + k + m / 2] = x[j + k] - t;\n x[j + k] += t;\n const double newwr = wr * w.real() - wi * w.imag();\n const double newwi = wr * w.imag() + wi * w.real();\n wr = newwr;\n wi = newwi;\n }\n }\n }\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n std::vector> y(n);\n std::vector> w(n);\n for (int i = 0; i < n; i++) {\n w[i] = std::polar(1.0, -2 * M_PI * i / n);\n }\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * w[j * k % n];\n }\n y[k] = sum;\n }\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Take the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int n_threads = omp_get_max_threads();\n\n // Reverse bits\n #pragma omp parallel for schedule(static)\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n if (i < rev) {\n std::swap(x[i], x[rev]);\n }\n }\n\n // Compute the FFT\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const int m2 = m >> 1;\n const double theta = 2 * M_PI / m;\n std::complex w(1, 0);\n std::complex wn(std::cos(theta), std::sin(theta));\n\n #pragma omp parallel for schedule(static)\n for (int k = 0; k < n; k += m) {\n for (int j = 0; j < m2; j++) {\n std::complex t = w * x[k + j + m2];\n x[k + j + m2] = x[k + j] - t;\n x[k + j] += t;\n }\n w *= wn;\n }\n }\n\n // Conjugate the imaginary part\n #pragma omp parallel for schedule(static)\n for (int i = 0; i < n; i++) {\n x[i].imag(-x[i].imag());\n }\n}\n", + "\n\n // Compute the FFT in-place\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int num_threads = omp_get_max_threads();\n const int chunk_size = n / num_threads;\n\n #pragma omp parallel for schedule(static, chunk_size)\n for (int i = 0; i < n; ++i) {\n x[i] = std::conj(x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x in-place. 
Return the imaginary conjugate of each value.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) return;\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::polar(1.0, ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n x[i][1] = -x[i][1];\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex tmp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(tmp);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n return;\n }\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n fft(y);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) return;\n std::vector> even = x;\n std::vector> odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const size_t n = x.size();\n if (n <= 1) return;\n\n std::vector> even = x[std::slice(0, n / 2, 2)];\n std::vector> odd = x[std::slice(1, n / 2, 2)];\n\n fftConjugate(even);\n fftConjugate(odd);\n\n for (size_t i = 0; i < n / 2; 
++i) {\n std::complex t = std::exp(std::complex(0, -2 * M_PI * i / n)) * odd[i];\n x[i] = even[i] + t;\n x[i + n / 2] = even[i] - t;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n for (auto &v : x) {\n v = std::conj(v);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fftConjugate(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[N - 1 - i].x;\n x[i].y = -x[N - 1 - i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = hipThreadIdx.x;\n if (i < size) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[N - 1 - i].x;\n x[i].y = -x[N - 1 - i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * 
hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); ++i) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector> y_recv(x.size());\n MPI_Recv(y_recv.data(), y_recv.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < y_recv.size(); ++j) {\n y[j] += y_recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT of x in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int log2n = 0;\n while ((1 << log2n) < n) log2n++;\n\n std::vector> y(n);\n for (int i = 0; i < n; i++) y[i] = x[i];\n\n for (int s = 1; s <= log2n; s++) {\n int m = 1 << s;\n int m2 = m / 2;\n double theta = 2 * M_PI / m;\n std::complex w(cos(theta), sin(theta));\n std::complex wn(1, 0);\n for (int j = 0; j < m2; j++) {\n for (int k = j; k < n; k += m) {\n std::complex t = wn * y[k + m2];\n y[k + m2] = y[k] - t;\n y[k] = y[k] + t;\n }\n wn = wn * w;\n }\n }\n\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n }\n}\n", + ".", + "\n\n size_t n = x.size();\n std::transform(x.begin(), x.end(), x.begin(), [](std::compelex z) { return std::compelex(z.re, -z.im); });\n 
std::sort(x.begin(), x.end(), [](std::compelex a, std::compelex b) { return std::im(a) < std::im(b); });\n std::transform(x.begin(), x.end(), x.begin(), [](std::compelex z) { return std::compelex(z.re, -z.im); });\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the global FFT\n if (rank == 0) {\n fft(global_fft, x);\n }\n\n // Scatter the global FFT\n MPI_Scatter(x.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the conjugate of the local FFT\n for (size_t i = 0; i < local_fft.size(); i++) {\n local_fft[i] = std::conj(local_fft[i]);\n }\n\n // Gather the conjugate FFTs\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the inverse global FFT\n if (rank == 0) {\n ifft(global_fft, x);\n }\n\n // Scatter the inverse global FFT\n MPI_Scatter(x.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_fft = fft(x);\n\n // Gather the local Fourier transforms on rank 0\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (auto &value : global_fft) {\n value = std::conj(value);\n }\n }\n\n // Scatter the imaginary conjugate Fourier transforms to all ranks\n MPI_Scatter(global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse Fourier transform\n x = ifft(local_fft);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv(x.size());\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n y[j] += recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of the result\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results from all ranks\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, 
MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place on each rank\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x, y);\n\n // Conjugate the local FFT\n for (int i = 0; i < n_local; i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the local FFTs\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the result to the input vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place on each rank\n std::fft(x.begin(), x.end());\n\n // Conjugate the imaginary part of each value\n for (auto &v : x) {\n v.imag() = -v.imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, y.data(), x.size(), MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n x = std::move(y);\n } else {\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, nullptr, 0, MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft(x);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (auto &value : global_fft) {\n value = std::conj(value);\n }\n }\n\n // Scatter the global FFT back to each rank\n MPI_Scatter(global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the local FFT on rank 0\n if (rank == 0) {\n x = local_fft;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> y_recv(x.size());\n MPI_Recv(y_recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < y.size(); j++) {\n y[j] += y_recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, 
size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n MPI_FFT(x.data(), 1, x.size(), MPI_COMPLEX, MPI_FORWARD, MPI_COMM_WORLD, y.data(), &y.size());\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n x = y;\n for (int i = 1; i < size; i++) {\n std::vector> tmp(x.size());\n MPI_Recv(tmp.data(), tmp.size(), MPI_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(y.data(), y.size(), MPI_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv(x.size());\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n y[j] += recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT of x in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of the local data\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the local data to rank 0\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the gathered data to the output vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft(x);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (int i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the global FFT back to each rank\n MPI_Scatter(global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), 
MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = local_fft;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fftConjugate(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[N - i - 1].x;\n x[i].y = -x[N - i - 1].y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = 
x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Compute the FFT\n fft(x, N);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < N; i++) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "fft{4,0}", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid].x = x[tid].x;\n x[tid].y = -x[tid].y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = cuConjf(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[N - i - 1].x;\n x[i].y = -x[N - i - 1].y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = make_cuDoubleComplex(creal(x[i]), -cimag(x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data in parallel\n fft(x, start, end);\n\n // Conjugate the imaginary part of the local data\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i].imag() = -x[i].imag();\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[start], n_local, MPI_DOUBLE_COMPLEX, &x[0], n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + ".4.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x.begin() + rank * n_local, x.begin() + rank * n_local + n_local, y.begin());\n\n // Conjugate the local FFT\n for (int i = 0; i < n_local; i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the local FFTs\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the result to rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel using MPI and OpenMP\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag() = -x[i].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform local FFT on each rank\n std::vector> localX = x;\n fft(localX);\n\n // Gather all the local FFT results on rank 0\n std::vector> globalX(x.size());\n MPI_Gather(localX.data(), localX.size(), MPI_DOUBLE_COMPLEX, globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value on rank 0\n if (rank == 0) {\n for (auto &value : globalX) {\n value = std::conj(value);\n }\n }\n\n // Scatter the imaginary conjugate results back to each rank\n MPI_Scatter(globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, x.data(), 
x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform local FFT on each rank\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather all the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (int i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the imaginary conjugate FFTs back to each rank\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Perform inverse FFT on each rank\n ifft(local_fft, x);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of the global FFT\n if (rank == 0) {\n for (size_t i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the imaginary conjugate of the global FFT to all ranks\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT\n ifft(local_fft, x);\n\n // Gather the local FFTs on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel 
for\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results on rank 0\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the results to the input vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n MPI_Gather(y.data(), n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> localX = x;\n fft(localX);\n\n // Conjugate the local FFT\n for (auto &v : localX) {\n v = std::conj(v);\n }\n\n // Gather the local FFTs on rank 0\n std::vector> globalX(x.size());\n MPI_Gather(localX.data(), localX.size(), MPI_DOUBLE_COMPLEX, globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, store the global FFT in x\n if (rank == 0) {\n x = globalX;\n }\n}\n", + "\n\n // Compute the FFT in parallel using MPI and OpenMP\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(&recv, x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x, x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(recv.data(), 
x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT on each rank\n std::vector> y(x.size());\n fft(x, y);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < y.size(); i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n x.resize(y.size());\n MPI_Gather(y.data(), y.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the conjugate of the global FFT\n if (rank == 0) {\n for (size_t i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the conjugate global FFT\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT\n ifft(local_fft, x);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary parts\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(Kokkos::View*> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use Kokkos to compute in parallel. 
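The Kokkos inverse-FFT prompt quoted here forward-declares an in-place fft over a view whose template arguments appear to have been stripped during extraction; the sketch below assumes Kokkos::View<Kokkos::complex<double>*>. It is an illustrative reference only, not one of the recorded outputs: it expresses the inverse transform through the forward one via ifft(x) = conj(fft(conj(x))) / N, whereas most of the recorded samples only rescale by N before or after calling fft.

#include <Kokkos_Core.hpp>

/* Forward declaration as in the prompt (element type assumed). */
void fft(Kokkos::View<Kokkos::complex<double>*> &x);

/* Sketch only: inverse FFT via the conjugate trick. */
void ifftSketch(Kokkos::View<Kokkos::complex<double>*> &x) {
    auto xv = x;                                        // value copy of the view for the device lambdas
    const double n = static_cast<double>(x.extent(0));

    Kokkos::parallel_for("ifft_conj_in", x.extent(0), KOKKOS_LAMBDA(const int i) {
        xv(i) = Kokkos::conj(xv(i));
    });
    Kokkos::fence();

    fft(x);                                             // forward transform from the harness

    Kokkos::parallel_for("ifft_conj_out", x.extent(0), KOKKOS_LAMBDA(const int i) {
        xv(i) = Kokkos::conj(xv(i)) / n;                // undo the conjugation and normalize
    });
    Kokkos::fence();
}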
Assume Kokkos has already been initialized.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x[i] = x[i] / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x[i] = std::conj(x[i]);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / static_cast(x.extent(0));\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n // compute fourier transform\n fft(x);\n\n // compute inverse fourier transform\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", 
x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use OpenMP to compute in parallel.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n fft(x);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugated input\n fft(x);\n\n // compute the conjugate of the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "-", + "\n\n // Compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated input\n fft(x);\n\n // Conjugate the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Divide the output by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n // conjugate the complex numbers\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute forward FFT\n fft(x);\n\n // conjugate the complex numbers again\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide by the size of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n fft(x);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n #pragma openmp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n fft(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n fft(x);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // divide by n to normalize\n const double n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] /= n;\n }\n}\n", + "\n\n // compute the forward FFT\n fft(x);\n\n // compute the inverse FFT\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute the forward FFT again to get the inverse FFT\n fft(x);\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n #pragma omp 
parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n // compute the conjugate of each element\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugated vector\n fft(x);\n\n // divide each element by the size of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // compute the conjugate of each element\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n const size_t n = x.size();\n std::vector> y(n);\n std::copy(x.begin(), x.end(), y.begin());\n fft(y);\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n y[i] = std::conj(y[i]) / n;\n }\n std::reverse(y.begin(), y.end());\n fft(y);\n std::copy(y.begin(), y.end(), x.begin());\n}\n", + "\n\n fft(x);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute forward fft\n fft(x);\n\n // compute inverse fft\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute forward fft again to get inverse fft\n fft(x);\n}\n", + "\n\n // compute the conjugate of x\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugate of x\n fft(x);\n\n // compute the conjugate of x again\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide each element of x by the size of x\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n fft(x);\n const double n = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // Compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the forward FFT\n fft(x);\n\n // Divide each element by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Compute the conjugate of the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "serial", + "prompt": "/* forward declare fft. 
computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n for (auto &xi : x) {\n xi /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v) / x.size();\n }\n std::reverse(x.begin(), x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi) / x.size();\n }\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v = std::conj(v) / x.size();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "hip", + "prompt": "/* Compute the inverse fourier transform of x in-place.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\n__global__ void ifft(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i >= N) return;\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipDoubleComplex(cos(2 * M_PI * i * j / N),\n -sin(2 * M_PI * i * j / N));\n sum += w * x[j];\n }\n x[i] = sum / N;\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n hipDoubleComplex c = {cos(angle), sin(angle)};\n sum = hipCadd(sum, hipCmul(x[j], c));\n }\n x[i] = hipCdiv(sum, N);\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (int j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0.0, 0.0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex c = {cos(angle), sin(angle)};\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i >= N) return;\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex t = {cos(angle), sin(angle)};\n sum += x[j] * t;\n }\n x[i] = sum / N;\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n hipDoubleComplex w = hipDoubleComplex(cos(2 * M_PI * i * j / N),\n sin(2 * M_PI * i * j / N));\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0.0, 0.0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0.0, -2.0 * M_PI * i * j / N));\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 
* M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex t = {cos(angle), sin(angle)};\n sum = hipCadd(sum, hipCmul(x[j], t));\n }\n x[i] = hipCdiv(sum, N);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex term = hipExp(hipCreateDoubleComplex(0, -2 * M_PI * i * j / N));\n sum = hipAdd(sum, hipMul(x[j], term));\n }\n x[i] = hipDiv(sum, hipCreateDoubleComplex(N, 0));\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0, -2 * M_PI * i * j / N));\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex c = hipDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n size_t j = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (i < N && j < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double theta = 2 * M_PI * (i * k / N);\n sum.x += x[k].x * cos(theta) + x[k].y * sin(theta);\n sum.y += -x[k].x * sin(theta) + x[k].y * cos(theta);\n }\n sum.x /= N;\n sum.y /= N;\n x[i * N + j] = sum;\n }\n}\n", + "\n\n size_t i = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
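For the MPI inverse-FFT prompt quoted here, the sketch below shows one straightforward reading; it is not the benchmark's reference solution, and the vector's element type is assumed to be std::complex<double>, its template arguments having apparently been stripped in extraction. Since every rank already holds a complete copy of x, each rank can apply the conjugate trick locally and rank 0's copy is then the required result; note that the normalization divides by x.size(), not by the communicator size as several of the recorded samples do.

#include <complex>
#include <vector>

/* Forward declaration of the in-place forward FFT assumed to be supplied by the harness. */
void fft(std::vector<std::complex<double>> &x);

/* Sketch only: ifft(x) = conj(fft(conj(x))) / N, computed redundantly on every
   rank so that no MPI communication is required. */
void ifftSketch(std::vector<std::complex<double>> &x) {
    const double n = static_cast<double>(x.size());
    for (auto &v : x) v = std::conj(v);        // conjugate the input
    fft(x);                                    // forward FFT from the harness
    for (auto &v : x) v = std::conj(v) / n;    // conjugate back and normalize by N
}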
The final result is stored on rank 0.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(recv_buffer.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&recv_buf[0], x.size(), 
MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, y.data(), x.size(),\n MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, nullptr, 0,\n MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the FFT\n fft(x);\n\n // compute the inverse FFT\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n\n // normalize the result\n int n = x.size();\n for (auto &xi : x) {\n xi /= n;\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(n);\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, y.data(), n, MPI::DOUBLE, 0);\n x = y;\n } else {\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, y.data(), x.size(),\n MPI::DOUBLE, 0);\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n 
std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= num_ranks;\n }\n\n // gather the results on rank 0\n if (my_rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n\n // normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> result(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, result.data(),\n x.size(), MPI_DOUBLE_COMPLEX, 0);\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&recv_buffer[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&x[0], 
x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute the inverse fft in parallel\n fft(x);\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fourier transform\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fourier transform\n fft(x);\n\n // normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(n);\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, y.data(), n, MPI::DOUBLE, 0);\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "cuda", + "prompt": "/* Compute the inverse fourier transform of x in-place.\n Use CUDA to compute in parallel. 
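Most of the CUDA samples recorded below implement the same direct O(N^2) inverse DFT yet are marked did_build: false, apparently because they use overloaded +, * and / on cuDoubleComplex, which cuComplex.h does not provide; the one sample that does build manipulates the .x and .y fields directly. As a hedged reference (not one of the recorded outputs), the sketch below writes the same computation with the cuComplex helper functions. It deliberately uses an out-of-place signature, unlike the prompt's in-place one, because a single kernel that reads all of x while writing x[i] would race; an in-place variant would first copy x into a temporary buffer.

#include <math.h>
#include <cuComplex.h>

/* Sketch only: direct O(N^2) inverse DFT, one output element per thread.
   Uses cuCadd/cuCmul/cuCdiv because cuComplex.h defines no arithmetic operators
   for cuDoubleComplex. Assumes the launch covers at least N threads. */
__global__ void inverseDftSketch(const cuDoubleComplex *in, cuDoubleComplex *out, size_t N) {
    size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
    if (i >= N) return;

    cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);
    for (size_t j = 0; j < N; ++j) {
        double angle = 2.0 * M_PI * (double)(i * j) / (double)N;   // +2*pi sign: inverse transform
        cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));
        sum = cuCadd(sum, cuCmul(in[j], w));
    }
    out[i] = cuCdiv(sum, make_cuDoubleComplex((double)N, 0.0));    // normalize by N
}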
The kernel is launched with at least N threads.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\n__global__ void ifft(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n sum += x[j] * cuCexp(make_cuDoubleComplex(0, -2 * M_PI * i * j / N));\n }\n x[i] = sum / make_cuDoubleComplex(N, 0);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = cuCdiv(x[i], N);\n x[i] = cuCmulf(x[i], cuCexp(cuCmulf(cuCmul(cuCimagf(x[i]), -2 * M_PI), i / N)));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= size) {\n return;\n }\n cuDoubleComplex result = cuCexp(cuCmul(cuDoubleComplex(0, -2 * M_PI * idx / size), x[idx]));\n x[idx] = result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + 
"are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n cuDoubleComplex term = x[j] * cexp(-2 * M_PI * i * j / N);\n sum = cuCadd(sum, term);\n }\n x[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= N) return;\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double re = x[i].x;\n double im = x[i].y;\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n x[i].x += re * cos(angle) - im * sin(angle);\n x[i].y += re * sin(angle) + im * cos(angle);\n }\n x[i].x /= N;\n x[i].y /= N;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = x[i] / N;\n x[i] = cuCmul(x[i], make_cuDoubleComplex(1.0, 0.0));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = 
make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * term;\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "_t_ = 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= N) return;\n cuDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = {cos(angle), sin(angle)};\n sum += c * x[j];\n }\n x[i] = sum / N;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": 
false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform FFT on each rank\n fft(x);\n\n // Gather results from all ranks to rank 0\n std::vector> gathered_x;\n if (rank == 0) {\n gathered_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n // Compute the conjugate of x\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated x\n fft(x);\n\n // Normalize the result\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double factor = 1.0 / size;\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= factor;\n }\n\n // If rank 0, gather the results from all ranks\n if (rank == 0) {\n std::vector> recv_buffer(x.size() * size);\n MPI_Allgather(x.data(), x.size(), MPI_COMPLEX, recv_buffer.data(), x.size(), MPI_COMPLEX, MPI_COMM_WORLD);\n x = recv_buffer;\n }\n\n // Compute the inverse FFT of the normalized result\n fft(x);\n\n // Conjugate the result again to get the original values\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // compute the fourier transform\n fft(x);\n\n // compute the inverse fourier transform\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, y.data(), x.size(), MPI::DOUBLE, 0);\n x = y;\n } else {\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n 
// divide the result among the ranks\n std::vector> recv(x.size() / size);\n MPI_Allgather(&x[rank * x.size() / size], x.size() / size, MPI_COMPLEX, &recv[0], x.size() / size, MPI_COMPLEX, MPI_COMM_WORLD);\n\n // only rank 0 has the complete result\n if (rank == 0) {\n x = recv;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide by the number of ranks to normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buffer[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the inverse fourier transform of x in-place.\n // Use MPI and OpenMP to compute in parallel.\n // Assume MPI has already been initialized.\n // Every rank has a complete copy of x.\n // The final result is stored on rank 0.\n\n // Get the number of ranks and the rank of the current process.\n int num_procs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local size of the vector.\n int local_size = x.size() / num_procs;\n\n // Compute the local inverse fourier transform.\n fft(x);\n\n // Reduce the local inverse fourier transforms to the root process.\n std::vector> reduced_x(x.size());\n MPI_Reduce(x.data(), reduced_x.data(), x.size(), MPI_COMPLEX, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // If the current process is the root process, store the reduced inverse fourier transform in the output vector.\n if (rank == 0) {\n x = reduced_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform\n fft(x);\n\n // divide by the number of ranks to normalize\n for (auto &xi : x) {\n xi /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> 
y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // normalize the result\n std::transform(x.begin(), x.end(), x.begin(), [size](std::complex c) { return c / size; });\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft in-place\n fft(x);\n\n // compute the inverse fft in-place\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide the result by the number of ranks\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results from all ranks to rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += y[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n // Compute the conjugate of x\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated x\n fft(x);\n\n // Normalize the result\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double factor = 1.0 / size;\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= factor;\n }\n\n // If rank 0, gather the results from all ranks\n if (rank == 0) {\n 
std::vector> recv_buffer(x.size() * size);\n MPI_Allgather(x.data(), x.size(), MPI_COMPLEX, recv_buffer.data(), x.size(), MPI_COMPLEX, MPI_COMM_WORLD);\n x = recv_buffer;\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv(x.size() * size);\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, recv.data(), x.size(), MPI_DOUBLE_COMPLEX, 0);\n x = recv;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft of x\n fft(x);\n\n // compute the inverse fft of x\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT of the conjugated result\n fft(x);\n\n // Normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv(n);\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n MPI::COMM_WORLD.Recv(&recv[0], n, MPI::DOUBLE_COMPLEX, i, 0);\n for (int j = 0; j < n; ++j) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], n, MPI::DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the inverse Fourier transform in-place\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fourier transform\n fft(x);\n\n // compute the inverse fourier transform\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n #pragma omp parallel for\n for (int i = 0; i 
< x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size() * size);\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = recv_buf;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(Kokkos::View &x, Kokkos::View*> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * std::exp(Kokkos::complex(0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * Kokkos::Math::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(std::cos(angle), std::sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const double PI = 3.14159265358979323846;\n\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n sum += x(j) * std::exp(-2 * PI * std::complex(0, 1) * i * j / N);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = -2 * Kokkos::Math::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(std::cos(angle), std::sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const double PI = 3.14159265358979323846;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * PI * j * i / N;\n sum += x[j] * Kokkos::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n 
output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::Range(0, N), [=](const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * j * i / N;\n sum += x(j) * std::exp(std::complex(0.0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n const int n = x.size();\n Kokkos::parallel_for(\"DFT\", Kokkos::RangePolicy<>(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < N; j++) {\n sum += x(j) * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / N);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n const int N = x.extent(0);\n Kokkos::parallel_for(\"DFT\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const auto n = x.size();\n const auto pi = 3.14159265358979323846;\n Kok\u30b3\u30af::Range range(0, n);\n Kok\u30b3\u30af::ParallelFor(\"dft\", range, KOKKOK_LAMBDA(const int i) {\n Kok\u30b3\u30af::\u8907 sum(0.0, 0.0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2.0 * pi * i * j / n);\n }\n output[i] = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n const double pi = 3.14159265358979323846;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x(j) * Kokkos::complex(cos(-2 * pi * i * j / n), sin(-2 * pi * i * j / n));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * j * i / N;\n sum += x(j) * std::polar(1.0, angle);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "(121:1:1:1:1:1:1:", + "\n\n const int n = x.extent(0);\n const double two_pi = 2.0 * Kokkos::Pi;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < n; ++j) {\n sum += x(j) * std::exp(-two_pi * Kokkos::complex(0.0, 1.0) * i * j / n);\n }\n output(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the 
discrete fourier transform of x. Store the result in output.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n output.resize(n);\n #pragma opm parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (size_t j = 0; j < n; ++j) {\n const double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n 
output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (size_t j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + ")", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "serial", + "prompt": "/* Compute the discrete fourier transform of x. 
Store the result in output.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "<", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "<", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::polar(1.0, -2 * M_PI * k * t / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, -angle);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::polar(1.0, -2 * M_PI * k * t / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int i = 0; i < n; ++i) {\n for (int j = 0; j < n; ++j) {\n output[i] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, angle);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n 
output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "hip", + "prompt": "/* Compute the discrete fourier transform of x. Store the result in output.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\n__global__ void dft(const double *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "(x,y)", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum_real = 0;\n double sum_imag = 0;\n\n for (size_t i = 0; i < N; i++) {\n double angle = 2 * M_PI * gid * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n\n output[gid] = make_cuDoubleComplex(sum_real, sum_imag);\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCadd(0, 0);\n for (int k = 0; k < N; k++) 
{\n double angle = -2 * M_PI * i * k / N;\n output[i] = cuCadd(output[i], cuCmulf(x[k], cuCexp(cuCmulf(0, angle))));\n }\n }\n}\n", + "\n\n int id = blockIdx.x * blockDim.x + threadIdx.x;\n if (id < N) {\n double sum_real = 0, sum_imag = 0;\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * id * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n output[id] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double real = 0;\n double imag = 0;\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n real += x[i] * cos(angle);\n imag += x[i] * sin(angle);\n }\n output[tid] = make_cuDoubleComplex(real, imag);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleReal sum_real = 0;\n cuDoubleReal sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplexf(sum_real, sum_imag);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid >= N) {\n return;\n }\n\n double real = 0;\n double imag = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * gid / N;\n real += x[k] * cos(angle);\n imag += x[k] * sin(angle);\n }\n\n output[gid] = make_cuDoubleComplex(real, imag);\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum_real = 0;\n double sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double real = 0;\n double imag = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * gid / 
N;\n real += x[k] * cos(angle);\n imag += x[k] * sin(angle);\n }\n output[gid] = make_cuDoubleComplex(real, imag);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n double angle = 2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[i] * term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n local_output[i] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / x.size());\n }\n }\n\n // Gather the local DFTs\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes a part of the output\n int start = rank * n / size;\n int end = (rank + 1) * n / size;\n\n for (int i = start; i < end; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * i * j / n;\n sum += std::complex(x[j], 0) * std::exp(std::complex(0, angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT for each rank\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n\n // Gather the results from all ranks\n std::vector> all_output(n * size);\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, all_output.data(), 
n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the gathered results to output\n if (rank == 0) {\n output = all_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / 
n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local contribution of each process\n for (int i = rank; i < n; i += size) {\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n output[j] += x[i] * std::complex(std::cos(angle), std::sin(angle));\n }\n }\n\n // Reduce the local contributions to the global result\n MPI_Allreduce(MPI_IN_PLACE, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local DFT\n for (int i = rank; i < n; i += size) {\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n output[i] += x[j] * std::complex(cos(angle), sin(angle));\n }\n }\n\n // Reduce the local DFTs to the global DFT\n MPI_Allreduce(MPI_IN_PLACE, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "cuda", + "prompt": "/* Compute the discrete fourier transform of x. Store the result in output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\n__global__ void dft(const double *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[i];\n sum += term;\n }\n output[tid] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[i];\n sum += term;\n }\n 
output[tid] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int k = 0; k < N; ++k) {\n double angle = -2 * M_PI * i * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const size_t kBlockSize = 256;\n const size_t kGridSize = (N + kBlockSize - 1) / kBlockSize;\n const size_t kSharedMemorySize = 2 * kBlockSize;\n\n dim3 blockDim(kBlockSize, 1, 1);\n dim3 gridDim(kGridSize, 1, 1);\n shared double sdata[2 * kBlockSize];\n\n for (size_t i = 0; i < N; ++i) {\n output[i] = make_cuDoubleComplex(0, 0);\n }\n\n for (size_t i = 0; i < N; ++i) {\n sdata[2 * threadIdx.x + 0] = x[i];\n sdata[2 * threadIdx.x + 1] = 0;\n }\n\n __syncthreads();\n\n for (size_t k = 0; k < N; ++k) {\n double e = -2 * M_PI * k / N;\n double c = cos(e);\n double s = sin(e);\n double t0 = 1;\n double t1 = 0;\n for (size_t i = 0; i < N / 2; ++i) {\n double a = sdata[2 * i + 0];\n double b = sdata[2 * i + 1];\n double ta = t0 * a - t1 * b;\n double tb = t0 * b + t1 * a;\n sdata[2 * i + 0] = ta;\n sdata[2 * i + 1] = tb;\n t0 = c;\n t1 = s;\n c = t0 * c - t1 * s;\n s = t0 * s + t1 * c;\n }\n __syncthreads();\n output[k] = 
make_cuDoubleComplex(sdata[0], sdata[1]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = threadIdx.x;\n double sum_real = 0;\n double sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplex(sum_real, sum_imag);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0848829976, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00344442877769 + } + ] + }, + { + "generated_output": "\n const int tid = threadIdx.x + blockIdx.x * blockDim.x;\n const int stride = blockDim.x * gridDim.x;\n for (size_t i = tid; i < N; i += stride) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n const double angle = 2 * M_PI * j * i / N;\n const cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(x[j], term));\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n if (gid < N) {\n double sum_real = 0;\n double sum_imag = 0;\n for (size_t i = 0; i < N; i++) {\n double angle = -2 * M_PI * gid * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n output[gid] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0849057264, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00298872318268 + } + ] + }, + { + "generated_output": "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += w * x[j];\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if 
(i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(c, make_cuDoubleComplex(x[j], 0)));\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.084945489, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00300631041527 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI and OpenMP to compute in parallel. 
Assume MPI is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(local_n);\n std::vector> local_output(local_n);\n std::copy(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n, local_x.begin());\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n sum += local_x[j] * std::exp(-2 * M_PI * i * j / local_n);\n }\n local_output[i] = sum;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_REAL, output.data(), local_n, MPI_DOUBLE_REAL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT for each frequency\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = rank; i < n; i += nprocs) {\n for (int j = 0; j < n; j++) {\n std::complex sum(0, 0);\n for (int k = 0; k < n; k++) {\n sum += x[k] * std::exp(-2 * M_PI * std::complex(0, 1) * i * k / n);\n }\n local_output[i] = sum;\n }\n }\n\n // Gather the local results into the output vector\n MPI_Allgather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the DFT of each element of x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < 
n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial DFT for this rank\n std::vector> partial_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n partial_output[i] = sum;\n }\n\n // Gather the partial results into the output vector\n MPI_Gather(partial_output.data(), n, MPI_DOUBLE_COMPLEX,\n output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Allgather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the DFT of each element in x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n MPI_Allgather(MPI_IN_PLACE, n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double 
angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element of x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[0], n, MPI_DOUBLE_COMPLEX, &output[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n std::vector> global_output(n * size);\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is the root process, copy the results to the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX,\n output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store the result in output.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(Kokkos::View*> &x, Kokkos::View*> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n using namespace Kokkos::Impl;\n\n const int n = x.extent(0);\n const int n_half = n / 2;\n const int n_quarter = n / 4;\n const int n_eighth = n / 8;\n\n // Create a plan for the FFT\n auto plan = FFTPlan::create(n);\n\n // Execute the FFT\n plan->execute(x, output);\n\n // Reorder the output to be in the correct order\n parallel_for(n, KOKKOS_LAMBDA(int i) {\n if (i < n_half) {\n output(i) = output(i);\n } else {\n output(i) = output(n - i);\n }\n });\n\n // Compute the magnitude of the output\n parallel_for(n, KOKKOS_LAMBDA(int i) { output(i) = abs(output(i)); });\n\n // Normalize the output\n parallel_for(n, KOKKOS_LAMBDA(int i) { output(i) /= n; });\n}\n", + "\n\n int n = x.extent(0);\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n int m = 1 << log_n;\n Kokkos::View*> w(m);\n for (int i = 0; i < m; i++) {\n w(i) = Kokkos::complex(cos(2 * M_PI * i / m), sin(2 * M_PI * i / m));\n }\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int &i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < m; j++) {\n sum += x((i + j) % m) * w(j);\n }\n output(i) = sum;\n });\n}\n", + "\n using namespace Kokkos;\n using namespace std;\n const int N = x.extent(0);\n const int M = output.extent(0);\n const double pi = 3.14159265358979323846;\n parallel_for(\"Fourier Transform\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const int i) {\n output(i) = {0.0, 0.0};\n for (int j = 0; j < N; j++) {\n output(i) += x(j) * exp(complex(0, -2 * pi * i * j / N));\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n\n Kokkos::View*> temp(\"temp\", n);\n\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log_n; j++) {\n rev |= ((i >> j) & 1) << (log_n - j - 1);\n }\n temp(rev) = x(i);\n }\n\n for (int s = 1; s <= log_n; s++) {\n int m = 1 << s;\n int m2 = m >> 1;\n Kokkos::complex w(cos(2 * Kokkos::Pi / m), sin(2 * Kokkos::Pi / m));\n for (int k = 0; k < n; k += m) {\n Kokkos::complex wm(1, 0);\n for (int j = 0; j < m2; j++) {\n Kokkos::complex t = temp(k + j);\n Kokkos::complex u = temp(k + j + m2) * wm;\n temp(k + j) = t + u;\n temp(k + j + m2) = t - u;\n wm *= w;\n }\n }\n }\n\n output = temp;\n}\n", + "\n\n const int N = x.extent(0);\n const int logN = std::log2(N);\n const int halfN = N / 2;\n\n // Reverse bits\n Kokkos::View reversed = Kokkos::View(\"reversed\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) {\n int rev = 0;\n int bit = 1;\n for (int j = 0; j < logN; j++) {\n if (i & bit) {\n rev |= 1 << (logN - j - 1);\n }\n bit <<= 1;\n }\n reversed(i) = rev;\n });\n\n // Compute the FFT\n Kokkos::View*> y = Kokkos::View*>(\"y\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { y(i) = x(reversed(i)); });\n\n for (int s = 1; s <= logN; s++) {\n const int m = 1 << s;\n const int m2 = m / 2;\n const double theta = 2 * M_PI / m;\n Kokkos::complex w(std::cos(theta), std::sin(theta));\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) {\n if (i % m < m2) {\n const Kokkos::complex t = w * y(i + m2);\n y(i + m2) = y(i) - t;\n y(i) = y(i) + t;\n }\n });\n 
}\n\n // Copy the result to the output\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { output(i) = y(i); });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const int logN = std::log2(N);\n\n // Reverse bits\n Kokkos::View reversed(N);\n Kokkos::parallel_for(\"fft_reverse_bits\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int reversed_i = 0;\n int j = i;\n for (int k = 0; k < logN; ++k) {\n reversed_i |= ((j & 1) << (logN - k - 1));\n j >>= 1;\n }\n reversed(i) = reversed_i;\n });\n\n // Compute the FFT\n Kokkos::parallel_for(\"fft_compute\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int j = reversed(i);\n output(i) = x(j);\n });\n\n // Compute the twiddle factors\n Kokkos::View*> twiddle(N);\n Kokkos::parallel_for(\"fft_twiddle\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n double angle = -2 * M_PI * i / N;\n twiddle(i) = Kokkos::complex(std::cos(angle), std::sin(angle));\n });\n\n // Compute the FFT\n for (int s = 1; s <= logN; ++s) {\n const int m = 1 << s;\n const int m2 = m / 2;\n for (int k = 0; k < m2; ++k) {\n Kokkos::parallel_for(\"fft_compute_s\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int j = i & (m - 1);\n if (j < m2) {\n const Kokkos::complex t = twiddle(k * N / m) * output(i + m2);\n output(i + m2) = output(i) - t;\n output(i) += t;\n }\n });\n }\n }\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int N = x.extent(0);\n const int logN = Kokkos::ilog2(N);\n\n auto bit_reverse = [&](int i) {\n int r = 0;\n for (int j = 0; j < logN; ++j) {\n r |= ((i >> j) & 1) << (logN - 1 - j);\n }\n return r;\n };\n\n auto twiddle = [&](int k) {\n return Kokkos::complex(cos(2 * M_PI * k / N), sin(2 * M_PI * k / N));\n };\n\n auto fft_rec = [&](View*> x, int n) {\n if (n == 1) {\n return;\n }\n auto x_even = View*>(x, Range(0, n / 2));\n auto x_odd = View*>(x, Range(n / 2, n));\n fft_rec(x_even, n / 2);\n fft_rec(x_odd, n / 2);\n for (int k = 0; k < n / 2; ++k) {\n auto t = x_odd(k) * twiddle(k);\n x_odd(k) = x_even(k) - t;\n x_even(k) += t;\n }\n };\n\n auto x_bit_reversed = View*>(x.extent(0));\n for (int i = 0; i < N; ++i) {\n x_bit_reversed(bit_reverse(i)) = x(i);\n }\n fft_rec(x_bit_reversed, N);\n output = x_bit_reversed;\n}\n", + "\n\n int n = x.extent(0);\n output.resize(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n if (n == 1) {\n output(0) = x(0);\n return;\n }\n Kokkos::View*> even = Kokkos::subview(output, Kokkos::ALL(), Kokkos::Slice(0, n / 2));\n Kokkos::View*> odd = Kokkos::subview(output, Kokkos::ALL(), Kokkos::Slice(n / 2, n));\n Kokkos::parallel_for(n / 2, KOKKOS_LAMBDA(int i) {\n even(i) = x(2 * i);\n odd(i) = x(2 * i + 1);\n });\n fft(even, even);\n fft(odd, odd);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex w = Kokkos::complex(cos(2 * M_PI * i / n), sin(2 * M_PI * i / n));\n output(i) = even(i % (n / 2)) + w * odd(i % (n / 2));\n });\n}\n", + "\n\n // Compute the size of the input\n const int n = x.extent(0);\n\n // Create a scratchpad for the 
intermediate results\n Kokkos::View*> scratchpad(\"scratchpad\", n);\n\n // Compute the FFT in parallel\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n // Compute the FFT of the i-th element of x\n Kokkos::complex result = std::complex(0.0, 0.0);\n for (int j = 0; j < n; ++j) {\n result += x(j) * std::exp(-2.0 * M_PI * std::complex(0.0, 1.0) * i * j / n);\n }\n\n // Store the result in the scratchpad\n scratchpad(i) = result;\n });\n\n // Copy the results from the scratchpad to the output\n Kokkos::deep_copy(output, scratchpad);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n output(i) = Kokkos::complex(real, imag);\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n output[i] = Kokkos::complex(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n output[i] += x[j] * Kokkos::exp(Kokkos::complex(0.0, -2.0 * Kokkos::Pi * i * j / n));\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n output(0) = x(0);\n for (int k = 1; k < n; ++k) {\n Kokkos::complex w = std::polar(1.0, -2 * Kokkos::Math::pi * k / n);\n Kokkos::complex t = x(0);\n for (int j = 1; j < n; ++j) {\n t += w * x(j);\n w *= w;\n output(j) = t;\n }\n }\n}\n", + "\n\n int n = x.extent(0);\n output.resize(n);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * Kokkos::Impl::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * std::exp(Kokkos::complex(0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n // Get the size of the input\n int n = x.extent(0);\n\n // Create a Kokkos view for the output\n Kokkos::View*> output_view(\"output\", n);\n\n // Compute the FFT in parallel\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) {\n output_view(i) = fft_helper(x, i, n);\n });\n\n // Copy the output view to the output array\n Kokkos::deep_copy(output, output_view);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x(j) * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy<>(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n output.resize(n);\n\n Kokkos::parallel_for(\"fft\", n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x. 
Store the result in output.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n output.resize(n);\n #pragma openmp parallel for\n for (int i = 0; i < n; ++i) {\n double real = 0, imag = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n real += x[j].first * cos(angle) + x[j].second * sin(angle);\n imag += x[j].first * sin(angle) - x[j].second * cos(angle);\n }\n output[i] = {real, imag};\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex arg(2 * M_PI * i * j / n, 0);\n sum += x[j] * std::exp(-arg);\n }\n output[i] = sum;\n }\n}\n", + "<", + "\n\n const int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::polar(1.0, angle);\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n const double pi = std::acos(-1);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = {0, 0};\n for (size_t t = 0; t < n; ++t) {\n double angle = 2 * pi * t * k / n;\n sum += x[t] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex omega = std::polar(1.0, -2 * M_PI * i * j / n);\n sum += x[j] * omega;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n std::complex arg(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * arg;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of x\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 
0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex omega = std::polar(1.0, -2 * M_PI * i * j / n);\n sum += x[j] * omega;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x. 
Store the result in output.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double pi = std::acos(-1);\n std::complex w(1), wn(std::cos(2 * pi / n), std::sin(2 * pi / n));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(cos(angle), sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n 
int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2);\n std::vector> odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n 
double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int i = 0; i < n; ++i) {\n for (int j = 0; j < n; ++j) {\n output[i] += x[j] * std::exp(-2 * std::complex(0, 1) * M_PI * i * j / n);\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x. Store the result in output.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fft(const hipDoubleComplex *x, hipDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid >= N) {\n return;\n }\n double arg = -2 * M_PI * gid / N;\n hipDoubleComplex w = hipDoubleComplex(cos(arg), sin(arg));\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n sum += x[i] * pow(w, i * gid);\n }\n output[gid] = sum;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(omega), sin(omega)};\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(theta), sin(theta));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex wp = hipDoubleComplex(1, 0);\n for (size_t k = j; k < j + m; k++) {\n hipDoubleComplex t = wp * output[k + m];\n output[k + m] = output[k] - t;\n output[k] += t;\n wp *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = make_hipDoubleComplex(cos(theta), sin(theta));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex t = make_hipDoubleComplex(1.0, 0.0);\n for 
(size_t k = 0; k < m; k++) {\n hipDoubleComplex u = output[j + k];\n hipDoubleComplex v = output[j + k + m] * t;\n output[j + k] = u + v;\n output[j + k + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / s), sin(M_PI / s));\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex(1.0, 0.0);\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / m), sin(M_PI / m));\n hipDoubleComplex wm = hipDoubleComplex(1.0, 0.0);\n for (size_t j = 0; j < m; j++) {\n for (size_t k = j; k < N; k += s) {\n hipDoubleComplex t = wm * output[k + m];\n output[k + m] = output[k] - t;\n output[k] += t;\n }\n wm *= w;\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n hipDoubleComplex wm = hipDoubleComplex(1, 0);\n for (size_t k = 0; k < m; k++) {\n for (size_t j = 0; j < N / s; j++) {\n size_t index = j * s + k;\n hipDoubleComplex t = wm * output[index + m];\n output[index + m] = output[index] - t;\n output[index] += t;\n }\n wm *= w;\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t half_s = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / half_s), sin(M_PI / half_s));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t k = 0; k < half_s; k++) {\n hipDoubleComplex u = output[j + k];\n hipDoubleComplex v = output[j + k + half_s] * t;\n output[j + k] = u + v;\n output[j + k + half_s] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t tid = hipGetGlobalThreadIdx();\n const size_t N_2 = N / 2;\n const size_t N_4 = N / 4;\n const double arg = -2 * M_PI / N;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n hipDoubleComplex w = {cos(arg * k), sin(arg * k)};\n hipDoubleComplex term = hipCMul(x[k], w);\n sum = hipCAdd(sum, term);\n }\n\n const size_t index = tid;\n if (tid >= N_2) {\n index -= N_2;\n index = N_4 - index - 1;\n index += N_2;\n }\n\n output[index] = sum;\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n double omega = M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(omega), sin(omega)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n 
hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + s] * t;\n output[k + j] = u + v;\n output[k + j + s] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(theta), sin(theta)};\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 1; s <= N; s *= 2) {\n for (size_t k = 0; k < N; k += 2 * s) {\n for (size_t j = k; j < k + s; j++) {\n hipDoubleComplex twiddle = hipDoubleComplex{cos(M_PI * j / s),\n -sin(M_PI * j / s)};\n hipDoubleComplex temp = output[j + s] * twiddle;\n output[j + s] = output[j] - temp;\n output[j] += temp;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / m), sin(M_PI / m));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the 
fourier transform of x. Store the result in output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFT results\n std::vector> global_output(n);\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, global_output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the result in output\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_output(n_local);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), n_local, MPI_DOUBLE_COMPLEX, output.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(n / size);\n fft_local(x, local_fft);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(n);\n MPI_Gather(local_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, global_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter the global FFT to the local FFTs\n MPI_Scatter(global_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, local_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Convert the local FFTs to the output format\n output.resize(n);\n for (int i = 0; i < n; i++) {\n output[i] = local_fft[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = (rank + 1) * local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> 
local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i / x); });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the output vector\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFT results\n std::vector> global_output(n);\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, global_output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the global FFT result to the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(),\n [](std::complex const& x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) {\n return std::exp(std::complex(0, -2 * M_PI * x.real()));\n });\n\n // Gather the local FFTs into 
the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft_local(x, rank, size);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(x.size());\n MPI_Allgather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the global FFT in the output vector\n output = global_fft;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) {\n return std::exp(std::complex(0, -2 * M_PI * x.real()));\n });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform of x.\n std::vector> local_fft = fft(x, rank, size);\n\n // Gather the local Fourier transforms into output.\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX,\n output.data(), output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int n_padded = n_local + 2;\n std::vector> padded_x(n_padded, 0.0);\n std::copy(x.begin() + rank * n_local, x.begin() + (rank + 1) * n_local, padded_x.begin() + 1);\n\n std::vector> padded_output(n_padded, 0.0);\n fft(padded_x, padded_output);\n\n std::vector> output_local(n_local, 0.0);\n std::copy(padded_output.begin() + 1, padded_output.begin() + n_local + 1, output_local.begin());\n\n MPI_Allgather(output_local.data(), n_local, MPI_DOUBLE_PRECISION, output.data(), n_local, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft(local_x, local_output);\n\n // Gather the results\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, 
MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If we are not rank 0, we can return now\n if (rank != 0) {\n return;\n }\n\n // If we are rank 0, we need to combine the results\n for (int i = 1; i < size; i++) {\n int start = i * local_n;\n int end = start + local_n;\n for (int j = start; j < end; j++) {\n output[j] += output[j - start];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local FFT\n std::vector> local_fft(n / size);\n MPI_Scatter(x.data(), n / size, MPI_DOUBLE_COMPLEX, local_fft.data(), n / size, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n fft_local(local_fft);\n\n // Gather the local FFTs\n MPI_Gather(local_fft.data(), n / size, MPI_DOUBLE_COMPLEX, output.data(), n / size, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(output);\n }\n\n // Broadcast the global FFT\n MPI_Bcast(output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft_local(x, rank, size);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(x.size());\n MPI_Allgather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the global FFT in the output vector\n output = global_fft;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x. Store the result in output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fft(const cuDoubleComplex *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmul(2 * M_PI * i / N, cuCmulf(I, x[i])), -1));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(-2 * M_PI * i / N, x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmulf(2 * M_PI * i / N, cuCmulf(0, 1)), x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuDoubleComplex(0, -2 * M_PI * i / N), x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuDoubleComplex{0, -2 * M_PI * i / N}, x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n 
output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmulf(2 * M_PI * i / N, cuDoubleComplex(0, 1)), x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(-2 * M_PI * i / N, x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex y = cuCexp(cuCmul(cuCbrt(-1.0), 2.0 * M_PI * i / N));\n output[i] = cuCmulf(x[i], y);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCmul(x[i], cuCexp(cuCmulf(-2 * M_PI * i / N, cuDoubleComplex{0, 1})));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": ":", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI and OpenMP to compute in parallel. 
Assume MPI is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local FFT results\n std::vector> global_output(x.size());\n MPI_Allgather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n global_output.data(), global_output.size(), MPI_DOUBLE_COMPLEX,\n MPI_COMM_WORLD);\n\n // Store the result in output\n output = global_output;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_C_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the global DFT into the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output.\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT of the gathered local DFTs.\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local results\n std::vector> global_output(x.size());\n MPI_Allgather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, global_output.data(), global_output.size(), MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the result in output\n output = global_output;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(n * size);\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is the root process, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n // Get the size of the input vector.\n int n = x.size();\n\n // Create a MPI communicator for the FFT.\n MPI_Comm fft_comm;\n MPI_Comm_split(MPI_COMM_WORLD, 0, MPI_THREAD_SINGLE, &fft_comm);\n\n // Create an OpenMP parallel region for the FFT.\n#pragma openmp parallel\n {\n // Get the rank of the current thread.\n int tid = openmp_get_thread_num();\n\n // Compute the local FFT of the input vector.\n std::vector> local_fft(n);\n fft(x, local_fft);\n\n // Gather the local FFTs into the output vector.\n#pragma openmp critical\n {\n for (int i = 0; i < n; ++i) {\n output[i] += local_fft[i];\n }\n }\n }\n\n // Free the MPI communicator.\n MPI_Comm_free(&fft_comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_output(n_local);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < n_local; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n_local; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), n_local, MPI_DOUBLE_COMPLEX, output.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = (rank + 1) * local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n std::complex w(cos(2 * M_PI * i * j / local_n), sin(2 * M_PI * i * j / local_n));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local FFT results\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into 
output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output.\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output\n MPI_Allgather(&local_output[0], n, MPI_DOUBLE_COMPLEX, &output[0], n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n double angle = 2 * M_PI * i * j / local_n;\n std::complex w(cos(angle), sin(angle));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX,\n output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft;\n fft_local(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX,\n global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, 0,\n MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter the global FFT\n MPI_ Scatter(global_fft.data(), global_fft.size() / size, MPI_DOUBLE_COMPLEX,\n local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0,\n MPI_COMM_WORLD);\n\n // Convert the local FFT to the output format\n output.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n output[i] = {local_fft[i].real(), local_fft[i].imag()};\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n local_output[i] = std::complex(0, 0);\n for (int j = 0; j < local_n; j++) {\n double angle = -2 * M_PI * i * j / local_n;\n local_output[i] += local_x[j] * std::exp(std::complex(0, angle));\n }\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(n * size);\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is rank 0, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, store the global DFT in output\n if (rank == 0) {\n output = global_output;\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, Kokkos::View &x, Kokkos::View &y, Kokkos::View &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", Kokkos::RangePolicy<>(0, z.extent(0)), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.size(), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int &i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z[i] = alpha * x[i] + y[i];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int &i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"axpy\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. 
Store the result in z.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma simd\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "serial", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. 
Store the result in z.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (std::size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (std::size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "hip", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\n__global__ void axpy(double alpha, const double *x, const double *y, double *z, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "(x,y,z,t)", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n z[i] = alpha * x[i] + y[i];\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of z\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n z.resize(x.size());\n for (size_t i = 0; i < x.size(); i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Barrier(MPI_COMM_WORLD);\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n\n // Compute the local portion of z\n std::vector local_z(n);\n for (int i = 0; i < n; ++i) {\n local_z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the local portions of z into the global vector z on 
rank 0\n if (rank == 0) {\n z.resize(n);\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its part of the result\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of the vectors\n int local_size = x.size() / size;\n\n // Allocate memory for the local portion of the vectors\n std::vector local_x(local_size);\n std::vector local_y(local_size);\n std::vector local_z(local_size);\n\n // Scatter the vectors to the ranks\n MPI_Scatter(&x[0], local_size, MPI_DOUBLE, &local_x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(&y[0], local_size, MPI_DOUBLE, &local_y[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local portion of the result\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * local_x[i] + local_y[i];\n }\n\n // Gather the results from the ranks\n MPI_Gather(&local_z[0], local_size, MPI_DOUBLE, &z[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n int n = x.size();\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n \n if (rank != 0) {\n MPI_Send(&z[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector tmp(n);\n MPI_Recv(&tmp[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n z[j] += tmp[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of z\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n 
MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "cuda", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\n__global__ void axpy(double alpha, const double *x, const double *y, double *z, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0155321342, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00030240640044 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0154020804, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00030005759597 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0152703138, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00058996479511 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0152185298, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00030199680328 + } + ] + }, + { + "generated_output": "{,", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0152889948, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00030113280416 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + 
"best_sequential_runtime": 0.0155963834, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00029965440035 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0153280986, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00050481920242 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.015475467, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00029911040068 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0153442774, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00059031039476 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.015439297, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00056626560688 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.015458928, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00029845759869 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0152797134, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00030235519409 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0153951152, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00048088319898 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": 
true, + "best_sequential_runtime": 0.0154632964, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00052269439697 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0154348824, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00029901440144 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0153018012, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003005120039 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0153943578, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00048197759986 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.014972942, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00030357120037 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0154571244, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00030199680328 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, n_threads;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n #pragma omp parallel\n {\n n_threads = omp_get_num_threads();\n }\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "(", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector local_z(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n local_z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n z.resize(n);\n\n#pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the vector\n int local_size = x.size() / size;\n\n // Compute the local start index\n int local_start = rank * local_size;\n\n // Compute the local end index\n int local_end = (rank + 1) * local_size;\n\n // Compute the local part of the result\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_z[i] = alpha * x[local_start + i] + y[local_start + i];\n }\n\n // Gather the local results into the global result\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank == 0) {\n z.resize(x.size());\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the local portion of x and y\n int local_size = x.size() / nprocs;\n\n // Compute the starting index of the local portion of x and y\n int start_index = rank * local_size;\n\n // Compute the local portion of z\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_z[i] = alpha * x[start_index + i] + y[start_index + i];\n }\n\n // Gather the local portions of z into the global vector z on rank 0\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, 
size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of the vector\n int local_size = x.size() / size;\n\n // Compute the starting index of the local portion of the vector\n int start_index = rank * local_size;\n\n // Compute the local portion of the vector\n std::vector local_x(x.begin() + start_index, x.begin() + start_index + local_size);\n std::vector local_y(y.begin() + start_index, y.begin() + start_index + local_size);\n\n // Compute the local portion of the result\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * local_x[i] + local_y[i];\n }\n\n // Gather the local portions of the result into the global result\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of x and y\n int local_size = x.size() / size;\n\n // Compute the starting index of the local portion of x and y\n int start_index = rank * local_size;\n\n // Compute the local portion of z\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * x[start_index + i] + y[start_index + i];\n }\n\n // Gather the local portions of z into the global vector z on rank 0\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of each block\n int block_size = x.size() / size;\n\n // Compute the starting index of each block\n int 
start_index = rank * block_size;\n\n // Compute the ending index of each block\n int end_index = start_index + block_size;\n\n // If the rank is the last rank, then the ending index is the size of the vector\n if (rank == size - 1) {\n end_index = x.size();\n }\n\n // Compute the local result\n std::vector local_z(block_size);\n for (int i = start_index; i < end_index; i++) {\n local_z[i - start_index] = alpha * x[i] + y[i];\n }\n\n // Reduce the local results to the global result\n MPI_Allreduce(local_z.data(), z.data(), block_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n z.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of each chunk of data that each rank will process\n int chunk_size = x.size() / size;\n\n // Compute the starting index of the data that each rank will process\n int start_index = rank * chunk_size;\n\n // Compute the ending index of the data that each rank will process\n int end_index = (rank + 1) * chunk_size;\n\n // If the rank is the last rank, then the ending index is the size of the data\n if (rank == size - 1) {\n end_index = x.size();\n }\n\n // Compute the local result for this rank\n std::vector local_result(chunk_size);\n #pragma omp parallel for\n for (int i = start_index; i < end_index; i++) {\n local_result[i - start_index] = alpha * x[i] + y[i];\n }\n\n // Gather the local results from all ranks into the global result on rank 0\n MPI_Gather(local_result.data(), chunk_size, MPI_DOUBLE, z.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix. x and b have N elements.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(Kokkos::View &A, Kokkos::View &b, Kokkos::View &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::Range(0, N), [=](const int i) {\n double sum = 0;\n for (int j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n KokConfigService config;\n config.set_num_threads(KokConfigService::num_hardware_threads());\n KokConfigService::set_configuration(config);\n KokConfigService::print_configuration();\n\n KokConfigService::set_configuration(config);\n KokConfigService::print_configuration();\n\n KokKok::parallel_for(N, KOK_LAMBDA(size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A[i][j] * x[j]; }\n x[i] = (b[i] - sum) / A[i][i];\n });\n}\n", + "\n\n // Create a Kokkos view for the matrix A.\n Kokkos::View A_view = Kokkos::View(\"A\", N, N);\n Kokkos::deep_copy(A_view, A);\n\n // Create a Kokkos view for the vector b.\n Kokkos::View b_view = Kokkos::View(\"b\", N);\n Kokkos::deep_copy(b_view, b);\n\n // Create a Kokkos view for the vector x.\n Kokkos::View x_view = Kokkos::View(\"x\", N);\n\n // Solve the linear system using the Kokkos parallel backend.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A_view(i, j) * x_view(j);\n }\n x_view(i) = (b_view(i) - sum) / A_view(i, i);\n });\n\n // Copy the solution vector x back to the output view.\n Kokkos::deep_copy(x, x_view);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution.\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define a Kokkos kernel functor to perform the computation.\n Kokkos::KernelFunctor kernelFunctor([&](const int i) {\n // Initialize the solution vector x.\n x(i) = 0.0;\n\n // Perform forward substitution to solve the linear system.\n for (int j = 0; j < i; ++j) {\n x(i) -= A(i, j) * x(j);\n }\n\n // Perform backward substitution to solve the linear system.\n for (int j = i + 1; j < N; ++j) {\n x(i) -= A(i, j) * x(j);\n }\n\n // Divide by the diagonal element to get the final solution.\n x(i) /= A(i, i);\n });\n\n // Execute the kernel functor using the Kokkos team policy.\n Kokkos::parallel_for(policy, kernelFunctor);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution.\n Kokkos::TeamPolicy policy(N);\n\n // Create a Kokkos parallel loop to solve the linear system.\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Solve the ith equation of the linear system.\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy 
policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int& i) {\n // Initialize the solution vector x\n x[i] = 0.0;\n\n // Compute the dot product of the ith row of A and x\n double dot_product = 0.0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A(i, j) * x[j];\n }\n\n // Update the solution vector x\n x[i] = (b[i] - dot_product) / A(i, i);\n });\n}\n", + "\n\n Kokkos::View A_copy = Kokkos::View(\"A_copy\", N, N);\n Kokkos::deep_copy(A_copy, A);\n Kokkos::View b_copy = Kokkos::View(\"b_copy\", N);\n Kokkos::deep_copy(b_copy, b);\n Kokkos::View x_copy = Kokkos::View(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n\n for (size_t i = 0; i < N; i++) {\n double max_val = std::abs(A_copy(i, i));\n size_t max_idx = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A_copy(j, i)) > max_val) {\n max_val = std::abs(A_copy(j, i));\n max_idx = j;\n }\n }\n if (max_val == 0) {\n std::cout << \"The matrix is singular and cannot be inverted.\" << std::endl;\n return;\n }\n for (size_t j = 0; j < N; j++) {\n std::swap(A_copy(i, j), A_copy(max_idx, j));\n }\n std::swap(b_copy(i), b_copy(max_idx));\n for (size_t j = 0; j < N; j++) {\n if (i != j) {\n double factor = A_copy(j, i) / A_copy(i, i);\n for (size_t k = 0; k < N; k++) {\n A_copy(j, k) -= factor * A_copy(i, k);\n }\n b_copy(j) -= factor * b_copy(i);\n }\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A_copy(i, j) * x_copy(j);\n }\n x_copy(i) = (b_copy(i) - sum) / A_copy(i, i);\n }\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int& i) {\n // Initialize the solution vector x\n x[i] = 0.0;\n\n // Iterate over the rows of the matrix A\n for (size_t j = 0; j < N; ++j) {\n // Compute the dot product of the ith row of A and the solution vector x\n double dot_product = 0.0;\n for (size_t k = 0; k < N; ++k) {\n dot_product += A(i, k) * x[k];\n }\n\n // Update the ith element of the solution vector x\n x[i] = (b[i] - dot_product) / A(i, i);\n }\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to be executed in parallel\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Initialize the solution vector x\n x(i) = 0.0;\n\n // Compute the dot product of the ith row of A and x\n for (int j = 0; j < N; ++j) {\n x(i) += A(i, j) * x(j);\n }\n\n // Subtract the dot product from b(i) to get the solution\n x(i) = b(i) - x(i);\n });\n}\n", + "\n\n // Create a copy of A for in-place LU decomposition\n auto A_copy = Kokkos::subview(A, Kokkos::ALL(), Kokkos::ALL(), Kokkos::NO_OFFSET, Kokkos::NO_OFFSET);\n\n // Perform LU decomposition\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n for (size_t j = 0; j < N; ++j) {\n A_copy(i, j) /= A_copy(i, i);\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = 0; j < N; ++j) {\n if (k > i) {\n A_copy(k, j) -= A_copy(k, i) * A_copy(i, j);\n } else if (k < i) {\n A_copy(k, j) -= A_copy(k, i) * A_copy(i, j);\n }\n }\n }\n });\n\n // Solve the system\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n x(i) = b(i) / A_copy(i, i);\n for (size_t j = 0; 
j < N; ++j) {\n if (j > i) {\n x(j) -= x(i) * A_copy(j, i);\n }\n }\n });\n\n // Backward substitution\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n for (size_t j = 0; j < N; ++j) {\n if (j < i) {\n x(j) -= x(i) * A_copy(i, j);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Initialize the solution vector x\n x(i) = 0.0;\n\n // Compute the dot product of the ith row of A and x\n double dot_product = 0.0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A(i, j) * x(j);\n }\n\n // Update the solution vector x\n x(i) = (b(i) - dot_product) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "_t", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. 
x and b have N elements.\n Use OpenMP to compute in parallel.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n #pragma omp parallel for\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Forward elimination\n for (size_t j = i + 1; j < N; j++) {\n double factor = A_copy[j][i] / A_copy[i][i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j][k] -= factor * A_copy[i][k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Back substitution\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A_copy[i][j] * x[j];\n }\n x[i] /= A_copy[i][i];\n }\n}\n", + "\n\n // Solve the system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n x[N - 1] = b[N - 1] / A[(N - 1) * N + (N - 1)];\n for (int i = N - 2; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Solve the system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double inv_aii = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] *= inv_aii;\n }\n b[i] *= inv_aii;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n x[N - 1] = b[N - 1];\n for (int i = N - 2; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = b[i] - sum;\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Perform Gaussian elimination in parallel\n 
#pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A_local[i * N + i];\n for (size_t j = i; j < N; ++j) {\n A_local[i * N + j] *= factor;\n }\n b_local[i] *= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A_local[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A_local[j * N + k] -= multiplier * A_local[i * N + k];\n }\n b_local[j] -= multiplier * b_local[i];\n }\n }\n\n // Perform back substitution in parallel\n #pragma omp parallel for\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_local[i * N + j] * x[j];\n }\n x[i] = (b_local[i] - sum) / A_local[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = i; j < N; j++) {\n A_copy[i * N + j] *= factor;\n }\n b_copy[i] *= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A_copy[j * N + i];\n for (size_t k = i; k < N; k++) {\n A_copy[j * N + k] -= multiplier * A_copy[i * N + k];\n }\n b_copy[j] -= multiplier * b_copy[i];\n }\n }\n\n // Perform back substitution to solve for x\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (int j = i + 1; j < N; j++) {\n x[i] -= A_copy[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an NxN matrix in row-major. x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a copy of A and b for each thread.\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Solve the linear system for each thread.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Forward elimination.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_local[j][i] / A_local[i][i];\n for (size_t k = 0; k < N; ++k) {\n A_local[j][k] -= factor * A_local[i][k];\n }\n b_local[j] -= factor * b_local[i];\n }\n\n // Back substitution.\n x[i] = b_local[i] / A_local[i][i];\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A_local[i * N + j] * x[j];\n }\n x[i] = (b_local[i] - sum) / A_local[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b 
for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A_copy[i][i];\n for (size_t j = i; j < N; ++j) {\n A_copy[i][j] /= factor;\n }\n b_copy[i] /= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A_copy[j][i];\n for (size_t k = i; k < N; ++k) {\n A_copy[j][k] -= multiplier * A_copy[i][k];\n }\n b_copy[j] -= multiplier * b_copy[i];\n }\n }\n\n // Perform back substitution in parallel\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_copy[i][j] * x[j];\n }\n x[i] = (b_copy[i] - sum) / A_copy[i][i];\n }\n}\n", + "\n\n // Create a copy of A and b to avoid modifying the input\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A_copy[i * N + j] *= factor;\n }\n b_copy[i] *= factor;\n\n // Subtract the ith row from all other rows\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_copy[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A_copy[j * N + k] -= factor * A_copy[i * N + k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Perform back substitution\n x.resize(N);\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_copy[i * N + j] * x[j];\n }\n x[i] = (b_copy[i] - sum) / A_copy[i * N + i];\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an N*N matrix in row-major. 
x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a vector to store the solution.\n x.resize(N);\n\n // Create a vector to store the residual.\n std::vector r(N);\n\n // Initialize the residual.\n for (size_t i = 0; i < N; i++) {\n r[i] = b[i];\n }\n\n // Solve the linear system using the conjugate gradient method.\n for (size_t k = 0; k < N; k++) {\n // Compute the dot product of the residual and the residual.\n double r_dot_r = 0.0;\n #pragma omp parallel for reduction(+:r_dot_r)\n for (size_t i = 0; i < N; i++) {\n r_dot_r += r[i] * r[i];\n }\n\n // Check if the residual is small enough.\n if (r_dot_r < 1e-10) {\n break;\n }\n\n // Compute the dot product of the residual and the matrix-vector product.\n double r_dot_Ax = 0.0;\n #pragma omp parallel for reduction(+:r_dot_Ax)\n for (size_t i = 0; i < N; i++) {\n double tmp = 0.0;\n for (size_t j = 0; j < N; j++) {\n tmp += A[i * N + j] * x[j];\n }\n r_dot_Ax += r[i] * tmp;\n }\n\n // Compute the step size.\n double alpha = r_dot_r / r_dot_Ax;\n\n // Update the solution.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] += alpha * r[i];\n }\n\n // Update the residual.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n double tmp = 0.0;\n for (size_t j = 0; j < N; j++) {\n tmp += A[i * N + j] * x[j];\n }\n r[i] -= alpha * tmp;\n }\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Forward elimination\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_copy[j][i] / A_copy[i][i];\n for (size_t k = 0; k < N; ++k) {\n A_copy[j][k] -= factor * A_copy[i][k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Back substitution\n x.resize(N);\n for (int i = N - 1; i >= 0; --i) {\n x[i] = b_copy[i];\n for (int j = i + 1; j < N; ++j) {\n x[i] -= A_copy[i][j] * x[j];\n }\n x[i] /= A_copy[i][i];\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n #pragma omp parallel for\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel\n {\n // Each thread works on a different row of A\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A_local[i][i];\n for (size_t j = i; j < N; ++j) {\n A_local[i][j] /= factor;\n }\n b_local[i] /= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_local[j][i];\n for (size_t k = i; k < N; ++k) {\n A_local[j][k] -= factor * A_local[i][k];\n }\n b_local[j] -= factor * b_local[i];\n }\n }\n }\n\n // Back substitution to solve for x\n for (int 
i = N - 1; i >= 0; --i) {\n x[i] = b_local[i];\n for (int j = i + 1; j < N; ++j) {\n x[i] -= A_local[i][j] * x[j];\n }\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an NxN matrix in row-major. x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a vector to store the solution.\n x.resize(N);\n\n // Create a temporary vector to store the intermediate results.\n std::vector tmp(N);\n\n // Solve the linear system in parallel.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Compute the ith row of the solution.\n tmp[i] = b[i];\n for (size_t j = 0; j < i; ++j) {\n tmp[i] -= A[i * N + j] * x[j];\n }\n tmp[i] /= A[i * N + i];\n }\n\n // Copy the solution to the output vector.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = tmp[i];\n }\n}\n", + "\n\n // Create OpenMP parallel region\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Initialize x to 0.\n x.assign(N, 0);\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Forward substitution\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < i; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Backward substitution\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] -= sum;\n }\n}\n", + "\n\n // Create a copy of A and b for each thread\n std::vector> A_local(omp_get_num_threads(), std::vector(N * N));\n std::vector> b_local(omp_get_num_threads(), std::vector(N));\n\n // Copy A and b to the local arrays\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n A_local[tid][i * N + j] = A[i * N + j];\n }\n b_local[tid][i] = b[i];\n }\n }\n\n // Solve the linear system for each thread\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n std::vector x_local(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < i; j++) {\n sum += A_local[tid][i * N + j] * x_local[j];\n }\n x_local[i] = (b_local[tid][i] - sum) / A_local[tid][i * N + i];\n }\n\n // Copy the local solution to the global solution\n #pragma omp critical\n {\n for (size_t i = 0; i < N; i++) {\n x[i] += x_local[i];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "serial", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. 
x and b have N elements.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n x.resize(N);\n std::vector y(N);\n for (size_t i = 0; i < N; i++) {\n y[i] = b[i];\n }\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] -= sum;\n x[i] = y[i] / A[i * N + i];\n }\n for (size_t i = N - 1; i > 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] -= sum;\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(Ab[j * (N + 1) + i]) > std::abs(Ab[pivot_row * (N + 1) + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N + 1; ++j) {\n std::swap(Ab[i * (N + 1) + j], Ab[pivot_row * (N + 1) + j]);\n }\n }\n\n // Eliminate the current column below the pivot row\n for (size_t j = i + 1; j < N; ++j) {\n double factor = Ab[j * (N + 1) + i] / Ab[i * (N + 1) + i];\n for (size_t k = 0; k < N + 1; ++k) {\n Ab[j * (N + 1) + k] -= factor * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Check if the system is consistent\n for (size_t i = 0; i < N; ++i) {\n if (std::abs(Ab[i * (N + 1) + N]) < 1e-9) {\n 
throw std::runtime_error(\"The system is inconsistent.\");\n }\n }\n\n // Solve the system by back substitution\n x.resize(N);\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = Ab[i * (N + 1) + N];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= Ab[i * (N + 1) + j] * x[j];\n }\n x[i] /= Ab[i * (N + 1) + i];\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n std::vector a = A;\n std::vector bb = b;\n for (size_t i = 0; i < N; i++) {\n size_t pivot = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(a[j * N + i]) > std::abs(a[pivot * N + i])) {\n pivot = j;\n }\n }\n if (pivot != i) {\n std::swap(a[i * N + i], a[pivot * N + i]);\n std::swap(bb[i], bb[pivot]);\n }\n for (size_t j = i + 1; j < N; j++) {\n double factor = a[j * N + i] / a[i * N + i];\n for (size_t k = i; k < N; k++) {\n a[j * N + k] -= factor * a[i * N + k];\n }\n bb[j] -= factor * bb[i];\n }\n }\n for (size_t i = N - 1; i < N; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += a[i * N + j] * x[j];\n }\n x[i] = (bb[i] - sum) / a[i * N + i];\n }\n}\n", + "\n\n std::vector c(N);\n for (size_t i = 0; i < N; i++) {\n c[i] = b[i];\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double t = A[j][i] / A[i][i];\n for (size_t k = 0; k < N; k++) {\n A[j][k] -= t * A[i][k];\n }\n c[j] -= t * c[i];\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n double t = c[i];\n for (size_t j = i + 1; j < N; j++) {\n t -= A[i][j] * x[j];\n }\n x[i] = t / A[i][i];\n }\n}\n", + "\n\n // Create a copy of A\n std::vector A_copy(A);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A_copy[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A_copy[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j * N + k] -= factor * A_copy[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Perform back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A_copy[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A_copy[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find pivot\n size_t pivot = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot * N + i])) {\n pivot = j;\n }\n }\n // Swap rows\n if (pivot != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot * N + j]);\n }\n std::swap(b[i], b[pivot]);\n }\n // Eliminate\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] 
* x[j];\n }\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector> aug(N, std::vector(N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n aug[i][j] = A[i * N + j];\n }\n aug[i][N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double div = aug[i][i];\n for (size_t j = 0; j <= N; ++j) {\n aug[i][j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows\n for (size_t j = i + 1; j < N; ++j) {\n double mult = aug[j][i];\n for (size_t k = 0; k <= N; ++k) {\n aug[j][k] -= mult * aug[i][k];\n }\n }\n }\n\n // Extract the solution from the augmented matrix\n for (size_t i = 0; i < N; ++i) {\n x[i] = aug[i][N];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gauss-Jordan elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows to zero out the ith column\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; i++) {\n x[i] = Ab[i * (N + 1) + N];\n }\n}\n", + "\n\n std::vector A_copy = A;\n std::vector b_copy = b;\n for (size_t i = 0; i < N; i++) {\n double factor = A_copy[i][i];\n for (size_t j = 0; j < N; j++) {\n A_copy[i][j] /= factor;\n }\n b_copy[i] /= factor;\n for (size_t j = i + 1; j < N; j++) {\n double factor2 = A_copy[j][i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j][k] -= factor2 * A_copy[i][k];\n }\n b_copy[j] -= factor2 * b_copy[i];\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A_copy[i][j] * x[j];\n }\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = b[i] - sum;\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; i++) {\n x[i] = 
Ab[i * (N + 1) + N];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap pivot row with current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector augmentedMatrix(N * (N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n augmentedMatrix[i * (N + 1) + j] = A[i * N + j];\n }\n augmentedMatrix[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = augmentedMatrix[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; ++j) {\n augmentedMatrix[i * (N + 1) + j] /= factor;\n }\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = 0; j < N; ++j) {\n if (i == j) continue;\n double factor = augmentedMatrix[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; ++k) {\n augmentedMatrix[j * (N + 1) + k] -= factor * augmentedMatrix[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; ++i) {\n x[i] = augmentedMatrix[i * (N + 1) + N];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i 
* N + i];\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from all other rows\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution from the augmented matrix\n for (size_t i = 0; i < N; i++) {\n x[i] = Ab[i * (N + 1) + N];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "hip", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\n__global__ void solveLinearSystem(const double *A, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] -= sum * A[row * N + col] / A[col * N + col];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x + hipGridDim_x * hipBlockIdx_y;\n if (tid < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[bid * N + i] * x[i];\n }\n x[tid] = (b[bid] - sum) / A[bid * N + tid];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n x[i] += A[i * N + j] * b[j];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n x[row] = b[row] / A[row * N + col];\n } else {\n x[row] -= A[row * N + col] * x[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n int col = hipBlockIdx_x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n 
sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n }\n __syncthreads();\n for (int k = row + 1; k < N; k++) {\n if (A[k * N + col] != 0) {\n double factor = A[k * N + col] / A[row * N + col];\n for (int j = 0; j < N; j++) {\n A[k * N + j] -= factor * A[row * N + j];\n }\n b[k] -= factor * b[row];\n }\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n }\n __syncthreads();\n for (int i = row + 1; i < N; i++) {\n double factor = A[i * N + col] / A[row * N + col];\n for (int k = 0; k < N; k++) {\n A[i * N + k] -= factor * A[row * N + k];\n }\n b[i] -= factor * b[row];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; ++k) {\n sum += A[row * N + k] * x[k];\n }\n A[row * N + col] -= A[col * N + col] * sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[col] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n x[i] += A[i * N + j] * b[j];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row 
= blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. Store the result in x on rank 0.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the linear system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solutions from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n // Create a matrix object\n Eigen::MatrixXd A_eigen(N, N);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n A_eigen(i, j) = A[i * N + j];\n }\n }\n\n // Create a vector object for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; i++) {\n b_eigen(i) = b[i];\n }\n\n // Solve the system\n Eigen::VectorXd x_eigen = A_eigen.colPivHouseholderQr().solve(b_eigen);\n\n // Copy the solution to x\n for (size_t i = 0; i < N; i++) {\n x[i] = x_eigen(i);\n }\n }\n\n // Broadcast x to all ranks\n MPI_Bcast(x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "_t*", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector local_x(N, 0);\n std::vector local_b(N, 0);\n for (size_t i = 0; i < N; i++) {\n local_b[i] = b[i];\n }\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_x[i] += A[i * N + j] * local_b[j];\n }\n }\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct solution\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves for a portion of x\n size_t chunk_size = (N + size - 
1) / size;\n size_t start_idx = rank * chunk_size;\n size_t end_idx = std::min(start_idx + chunk_size, N);\n\n std::vector local_x(chunk_size);\n for (size_t i = start_idx; i < end_idx; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n local_x[i - start_idx] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solutions from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Rank 0 gathers the solution\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n x.resize(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n }\n\n // Gather the solution from rank 0 to all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Solve the linear system on rank 0\n if (rank == 0) {\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i > 0; --i) {\n for (size_t j = i - 1; j >= 0; --j) {\n b[j] -= A[j * N + i] * b[i];\n }\n }\n\n // Store the solution in x\n x = b;\n }\n\n // Broadcast the solution to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system locally on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the solution\n std::vector local_x(N);\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on rank 0\n if (rank == 0) {\n // Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (int j = i + 1; j < N; j++) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n }\n\n // Broadcast the solution to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Each rank solves a part of the system\n for (size_t i = rank; i < N; i += size) {\n double factor = A[i * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Broadcast the updated row\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, rank, comm);\n MPI_Bcast(&b[i], 1, MPI_DOUBLE, rank, comm);\n\n // Update the remaining rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Gather the solution from all ranks\n MPI_Gather(&b[rank], 1, MPI_DOUBLE, &x[0], 1, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system locally\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions to rank 0\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system on rank 0\n if (rank == 0) {\n // Create a matrix object\n Eigen::MatrixXd A_eigen(N, N);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n 
A_eigen(i, j) = A[i * N + j];\n }\n }\n\n // Create a vector for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; i++) {\n b_eigen(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_eigen = A_eigen.colPivHouseholderQr().solve(b_eigen);\n\n // Copy the result to x\n for (size_t i = 0; i < N; i++) {\n x[i] = x_eigen(i);\n }\n }\n\n // Broadcast the result to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Check if the input is valid\n if (A.size() != N * N || b.size() != N || x.size() != N) {\n throw std::invalid_argument(\"Invalid input\");\n }\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (int j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "cuda", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\n__global__ void solveLinearSystem(const double *A, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / 
A[row * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": ":", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + 
"did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n for (int k = 0; k < N; k++) {\n x[row] -= A[row * N + k] * b[k];\n }\n x[row] /= A[row * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int i = 0; i < col; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n } else if (row > col) {\n double sum = 0;\n for (int i = 0; i < col; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n for (int i = col + 1; i < row; i++) {\n A[row * N + i] -= A[row * N + col] * A[i * N + col] / A[col * N + col];\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n } 
else {\n x[row] -= A[row * N + col] * x[col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n x[row] = b[row] / A[row * N + col];\n }\n for (int i = col + 1; i < N; i++) {\n x[row] -= A[row * N + i] * x[i] / A[row * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Each rank solves for a portion of x\n #pragma omp parallel for\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data() + rank * N / size, N / size, MPI_DOUBLE, x.data(), N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Compute the inverse of A on rank 0.\n std::vector invA;\n if (my_rank == 0) {\n invA.resize(N * N);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n invA[i * N + j] = (i == j) ? 
1.0 : 0.0;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n double inv = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n invA[i * N + j] *= inv;\n A[i * N + j] *= inv;\n }\n b[i] *= inv;\n }\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n invA[j * N + k] -= factor * invA[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n for (size_t i = N - 1; i > 0; --i) {\n for (size_t j = i - 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n invA[j * N + k] -= factor * invA[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Broadcast the inverse of A to all ranks.\n MPI_Bcast(invA.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on all ranks.\n x.resize(N);\n for (size_t i = 0; i < N; ++i) {\n x[i] = 0.0;\n for (size_t j = 0; j < N; ++j) {\n x[i] += invA[i * N + j] * b[j];\n }\n }\n\n // Gather the results to rank 0.\n if (my_rank == 0) {\n for (size_t i = 1; i < num_ranks; ++i) {\n std::vector recv_x(N);\n MPI_Recv(recv_x.data(), N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N; ++j) {\n x[j] += recv_x[j];\n }\n }\n } else {\n MPI_Send(x.data(), N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Rank 0 gathers the solution from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the solution\n std::vector local_x(N);\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_x[i] += A[i * N + j] * b[j];\n }\n }\n\n // Gather the local solutions into the global solution\n MPI_Allgather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete solution\n if (rank == 0) {\n for (size_t i = 1; i < size; i++) {\n for (size_t j = 0; j < N; j++) {\n x[j] += local_x[i * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves the system for a subset of the rows\n size_t rowsPerRank = (N + size - 1) / size;\n size_t startRow = rank * 
rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n // Solve the system for the subset of rows\n std::vector localX(endRow - startRow);\n for (size_t i = startRow; i < endRow; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n localX[i - startRow] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(localX.data(), endRow - startRow, MPI_DOUBLE, x.data(), endRow - startRow, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Each rank computes its part of the solution\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n \n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system on each rank\n std::vector local_x(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row.\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row.\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n\n // Eliminate the current column.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n std::vector local_x(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Check if the input is valid\n if (A.size() != N * N || b.size() != N || x.size() != N) {\n throw std::invalid_argument(\"Invalid input\");\n }\n\n // Initialize x to 0\n std::fill(x.begin(), x.end(), 0.0);\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for 
(size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(x.data(), N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the local portion of the solution\n std::vector local_x(N, 0.0);\n for (size_t i = my_rank; i < N; i += num_procs) {\n for (size_t j = 0; j < N; ++j) {\n local_x[i] += A[i * N + j] * b[j];\n }\n }\n\n // Reduce the local solutions to get the global solution\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves for a subset of the rows of x.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Solve for the subset of rows of x on this rank.\n for (size_t i = start_row; i < end_row; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks.\n MPI_Gather(x.data() + start_row, end_row - start_row, MPI_DOUBLE,\n x.data(), end_row - start_row, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the matrix\n size_t local_N = (N + size - 1) / size;\n\n // Allocate memory for the local matrix and vector\n std::vector local_A(local_N * N);\n std::vector local_b(local_N);\n\n // Scatter the matrix and vector to the local arrays\n MPI_Scatter(A.data(), N * N, MPI_DOUBLE, local_A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(b.data(), N, MPI_DOUBLE, local_b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the local linear system\n std::vector 
local_x(local_N);\n for (size_t i = 0; i < local_N; i++) {\n double sum = 0.0;\n for (size_t j = 0; j < N; j++) {\n sum += local_A[i * N + j] * local_x[j];\n }\n local_x[i] = (local_b[i] - sum) / local_A[i * N + i];\n }\n\n // Gather the local solutions to the global vector\n MPI_Gather(local_x.data(), local_N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Normalize the global solution\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n x[i] /= size;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n local_x[i] = b[i];\n for (size_t j = 0; j < i; ++j) {\n local_x[i] -= A[i * N + j] * x[j];\n }\n }\n for (size_t i = N - 1; i >= 0; --i) {\n for (size_t j = i + 1; j < N; ++j) {\n local_x[i] -= A[i * N + j] * local_x[j];\n }\n local_x[i] /= A[i * N + i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the linear system in parallel\n #pragma omp parallel\n {\n // Each thread solves a part of the system\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row.\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row.\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n\n // Eliminate the current column.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(Kokkos::View &A, Kokkos::View &B, Kokkos::View &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; ++j) {\n double sum = 0.0;\n for (int k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k) * B(k, j);\n }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n 
});\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. 
The matrices are stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma opm parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (int i = 0; i < M; ++i) {\n for (int j = 0; j < N; ++j) {\n C[i * N + j] = 0;\n for (int k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n 
}\n}\n", + "\n\n C.resize(M * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n C.resize(M * N);\n omp_set_num_threads(omp_get_max_threads());\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "serial", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. 
The matrices are stored in row-major.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n C[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A[i * K + k] * B[k * N + j]; }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 
0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "hip", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an MxN grid of threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\n__global__ void gemm(const double *A, const double *B, double *C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) { sum += A[row * K + k] * B[k * N + col]; }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n size_t row = blockIdx.x;\n size_t col = blockIdx.y;\n C[row * N + col] = 0;\n for (size_t k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = 
blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n C[row * N + col] = 0;\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0.0;\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n size_t col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and B. 
Store the result in C on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the local portion of C.\n std::vector local_C(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to the global C on rank 0.\n MPI_Reduce(local_C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of processes and the rank of the current process\n int num_procs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the number of rows each process will compute\n int rows_per_proc = M / num_procs;\n\n // Compute the starting row for each process\n int start_row = rank * rows_per_proc;\n\n // Compute the ending row for each process\n int end_row = start_row + rows_per_proc;\n\n // Initialize the result matrix on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n\n // Compute the partial result on each process\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n double dot_product = 0;\n for (int k = 0; k < K; ++k) {\n dot_product += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = dot_product;\n }\n }\n\n // Gather the partial results on rank 0\n if (rank != 0) {\n MPI_Gather(&C[start_row * N], rows_per_proc * N, MPI_DOUBLE, &C[0], rows_per_proc * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast M, K, and N to all ranks.\n MPI_Bcast(&M, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n MPI_Bcast(&K, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n MPI_Bcast(&N, 1, MPI_UNSIGNED_LONG, 0, 
MPI_COMM_WORLD);\n\n // Compute the local portion of C.\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Reduce the local portions of C to rank 0.\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast A and B to all ranks\n MPI_Bcast(&A[0], M * K, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&B[0], K * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute C on each rank\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Gather C from all ranks to rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local part of C\n std::vector local_C(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n MPI_Gather(local_C.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_C.data(), M * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and the number of processes\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the local size of the matrix C\n size_t local_M = M / num_procs;\n if (rank == num_procs - 1) {\n local_M += M % num_procs;\n }\n\n // Compute the local part of the matrix C\n std::vector local_C(local_M * N);\n for (size_t i = 0; i < local_M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N + j] = sum;\n }\n }\n\n // Gather the local parts of the matrix C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 0; i < num_procs; ++i) {\n size_t local_M = M / num_procs;\n if (i == num_procs - 1) {\n local_M += M % num_procs;\n }\n MPI_Recv(&C[i * local_M * N], local_M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_C[0], local_M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the local portion of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to the root rank.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N, 0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * 
B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(M * N);\n MPI_Recv(recv_buf.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n C[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(C.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C\n std::vector localC(M * N, 0);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Each rank has a complete copy of A and B.\n // Store the result in C on rank 0.\n if (rank == 0) {\n C.clear();\n C.reserve(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C.push_back(sum);\n }\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result matrix\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local results into the global result matrix\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the matrix C\n size_t local_M = M / size;\n size_t local_N = N;\n\n // Compute the local part of the matrix C\n std::vector local_C(local_M * local_N);\n for (size_t i = 0; i < local_M; ++i) {\n for (size_t j = 0; j < local_N; ++j) 
{\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * local_N + j] = sum;\n }\n }\n\n // Gather the local parts of the matrix C into the global matrix C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(local_M * local_N);\n MPI_Recv(recv_buf.data(), local_M * local_N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < local_M * local_N; ++j) {\n C[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(local_C.data(), local_M * local_N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and the number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N / size);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N / size + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "cuda", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an MxN grid of threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\n__global__ void gemm(const double *A, const double *B, double *C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n size_t row = blockIdx.x;\n size_t col = blockIdx.y;\n size_t i = threadIdx.x;\n size_t j = threadIdx.y;\n C[row * N + col] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035418592, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 3.610239923e-05 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + 
"best_sequential_runtime": 0.0035548876, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.558080062e-05 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035933984, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.229759946e-05 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035619628, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.247039929e-05 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035925288, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00082329601049 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n 
int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n double sum = 0;\n for (k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035386292, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.019839987e-05 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035488364, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.53696005e-05 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035527274, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.749440029e-05 + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035435222, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 3.819520026e-05 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.y * blockDim.y + threadIdx.y;\n int j = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M && j < N) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035760374, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.600959942e-05 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035320104, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 3.701119944e-05 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and B. 
Store the result in C on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C to rank 0\n MPI_Gather(localC.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to zero\n C.assign(M * N, 0.0);\n\n // Compute the product of A and B in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(C.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the product of A and B\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n // Initialize C to zero\n C.assign(M * N, 0.0);\n\n // Compute C using MPI and OpenMP\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce C to rank 0\n MPI_Reduce(C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the local part of C\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the partial sum for each element of C\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the partial sums from all ranks to rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.resize(M * N);\n std::fill(C.begin(), C.end(), 0);\n\n // Compute the local part of C\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the 
local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; i++) {\n std::vector recvC(M * N / size);\n MPI_Recv(recvC.data(), M * N / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N / size; j++) {\n C[j] += recvC[j];\n }\n }\n } else {\n MPI_Send(localC.data(), M * N / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; ++i) {\n std::vector recvC(M * N / size);\n MPI_Recv(recvC.data(), M * N / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n std::copy(recvC.begin(), recvC.end(), C.begin() + i * M * N / size);\n }\n } else {\n MPI_Send(localC.data(), M * N / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n // Initialize C to 0\n C.assign(M * N, 0.0);\n\n // Perform the matrix multiplication in parallel\n #pragma opm parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result matrix\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local results into the global result matrix\n MPI_Gather(&local_C[0], M * N / size, MPI_DOUBLE, &C[0], M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot_product = 0.0;\n for (size_t k = 0; k < K; ++k) {\n dot_product += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = dot_product;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local portion of C\n std::vector local_C(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to rank 0\n MPI_Reduce(local_C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n 
double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix, x has N elements, and y has M elements.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(Kokkos::View &A, Kokkos::View &x, Kokkos::View &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) { y[i] += A(i, j) * x[j]; }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M),\n KOKKOS_LAMBDA(const size_t i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "mv", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n double dot = 0.0;\n for (size_t j = 0; j < N; ++j) { dot += A(i, j) * x(j); }\n y(i) = dot;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", 
+ "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA (const int i) {\n y(i) = 0;\n for (int j = 0; j < N; ++j) {\n y(i) += A(i, j) * x(j);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma openmp parallel for\n for (size_t i = 
0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n // Initialize y to 0\n y.assign(M, 0);\n\n // Compute the matrix-vector product in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double dot_product = 0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A[i * N + j] * x[j];\n }\n y[i] = dot_product;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0.0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "serial", + "prompt": "/* Multiply the matrix A by the vector x. 
Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) { dot += A[i * N + j] * x[j]; }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) {\n dot += A[i * N + j] * x[j];\n }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) {\n dot += A[i * N + j] * x[j];\n }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "hip", + "prompt": "/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least M threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\n__global__ void gemv(const double *A, const double *x, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int block_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = bid * block_size + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int i = tid / N;\n int j = tid % N;\n if (i < M && j < N) {\n y[i] += A[i * N + j] * x[j];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = bid * blockDim.x + tid;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; ++col) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) {\n y[tid] += A[tid * N + j] * x[j];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; ++col) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0.0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n int i = tid + bid * blockSize;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n size_t i = blockIdx.x * 
blockDim.x + threadIdx.x;\n if (i < M) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) { sum += A[i * N + j] * x[j]; }\n y[i] = sum;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n\n double sum = 0;\n for (int i = tid; i < M; i += blockSize) {\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n }\n y[tid] = sum;\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n\n y[tid] = 0;\n for (int j = 0; j < N; j++) {\n y[tid] += A[tid + bid * blockSize + j * M] * x[j];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and x. Store the result in y on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector multiplication\n std::vector local_y(M / size);\n for (size_t i = rank * (M / size); i < (rank + 1) * (M / size); ++i) {\n local_y[i - rank * (M / size)] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i - rank * (M / size)] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the local results into the global vector y on rank 0\n MPI_Gather(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial dot product on each rank\n std::vector partial_y(M, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n partial_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the partial dot products to get the final result\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank != 0) {\n y.clear();\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product.\n std::vector local_y(M);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the local results into the global vector y on rank 0.\n MPI_Gather(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, 
size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M / size);\n for (size_t i = 0; i < M / size; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Each rank computes a partial sum of the dot product\n std::vector partial_sums(M, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n partial_sums[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sums(M, 0.0);\n MPI_Reduce(partial_sums.data(), global_sums.data(), M, MPI_DOUBLE, MPI_SUM, 0, comm);\n\n // Store the result in y on rank 0\n if (rank == 0) {\n y = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n y = local_y;\n for 
(int i = 1; i < size; ++i) {\n std::vector recv_y(M);\n MPI_Recv(recv_y.data(), M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M; ++j) {\n y[j] += recv_y[j];\n }\n }\n } else {\n MPI_Send(local_y.data(), M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) { local_y[i] += A[i * N + j] * x[j]; }\n }\n\n MPI_Reduce(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the dot product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local dot products into the global vector y on rank 0\n MPI_Reduce(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "cuda", + "prompt": "/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least M threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\n__global__ void gemv(const double *A, const double *x, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0640924526, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00879431037903 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0640987646, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00756073617935 + } + ] + }, + { + "generated_output": "\n\n int tid = block.thread.x;\n int row = block.block.x;\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0640831432, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00992059497833 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.06403273, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00860442876816 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0640351866, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00825594234467 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) {\n y[row] += A[row * N + col] * x[col];\n }\n }\n}\n", + "source_write_success": 
true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0640392604, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00895784950256 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int i = 0; i < N; i++) {\n y[tid] += A[tid * N + i] * x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0640844996, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00915352306366 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0640557816, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00945916805267 + } + ] + }, + { + "generated_output": "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n for (int i = bid * bid_size + tid; i < M; i += bid_size) {\n double sum = 0;\n for (int j = 0; j < N; j++) { sum += A[i * N + j] * x[j]; }\n y[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0641853718, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.06221066131592 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n y[row] = 0;\n for (int k = 0; k < N; k++) {\n y[row] += A[row * N + k] * x[k];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.064064083, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00991683216095 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0640749494, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00896225910187 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * 
x[col];\n }\n y[row] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0640718818, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00825892486572 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.064021531, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00902239360809 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n y[row] = 0;\n for (int i = 0; i < N; i++) {\n y[row] += A[row * N + i] * x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n\n double sum = 0;\n\n for (int i = bid * bid_size + tid; i < M; i += grid_size * bid_size) {\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n }\n\n y[tid] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.064055102, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00829664001465 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0641118172, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00746590719223 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.064104878, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0094600063324 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = bid * blockDim.x + tid;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0640943922, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00990727043152 + } + ] + } + ] + 
}, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and x. Store the result in y on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n\n // Gather the results on rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M, 0.0);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&nprocs);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root rank\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute local dot products\n std::vector local_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce local dot products to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the 
local portion of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n\n // Gather the results on rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize y to 0 on all ranks\n y.assign(M, 0.0);\n\n // Compute the local portion of y on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A(k, j) -= A(k, i) * A(i, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) -= A(j, i) * A(i, i);\n }\n });\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n // Iterate over the columns of the matrix\n for (size_t j = 0; j < N; ++j) {\n // Divide the elements of the current column by the diagonal element\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n A(i, j) /= A(j, j);\n });\n\n // Subtract the current column from the remaining columns\n for (size_t k = j + 1; k < N; ++k) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n A(i, k) -= A(i, j) * A(j, k);\n });\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A(i, j) /= A(i, i);\n } else if (i > j) {\n A(i, j) -= A(i, j) * A(j, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = 0; k < i; k++) {\n A(j, k) -= multiplier * A(i, k);\n }\n A(j, i) = multiplier;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = 
i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; ++k) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j][i] /= A[i][i];\n }\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::RangePolicy(i + 1, N),\n KOK_LAMBDA(const Kok\u30b3\u30af::IndexType &j) {\n for (size_t k = 0; k < i; k++) {\n A[j][i] -= A[j][k] * A[i][k];\n }\n });\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) = A(j, i) / A(i, i);\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A(k, j) = A(k, j) - A(k, i) * A(i, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n double m = A(j, i) / A(i, i);\n for (size_t k = i; k < N; k++) {\n A(j, k) -= m * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; ++k) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += A(i, k) * A(k, j);\n }\n A(i, j) -= sum;\n } else if (i > j) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A(i, k) * A(k, j);\n }\n A(i, j) = (A(i, j) - sum) / A(j, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. 
\n A is an NxN matrix stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; k++) {\n // Normalize the kth row\n double norm = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k; j < N; j++) {\n A[k * N + j] /= norm;\n }\n\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k];\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; ++k) {\n // Normalize the k-th row\n double norm = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k; j < N; ++j) {\n A[k * N + j] /= norm;\n }\n\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n double factor = A[i * N + k];\n for (size_t j = k; j < N; ++j) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n double factor = A[i * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= A[i * N + k] * factor;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n // Divide the current row by the diagonal element\n double factor = A[i * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n\n // Subtract multiples of the current row from all the rows below it\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + ".", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < 
N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n#pragma omp parallel for\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = i; j < N; j++) {\n#pragma omp parallel for\n for (size_t k = 0; k < i; k++) {\n A[j * N + i] -= A[j * N + k] * A[k * N + i];\n }\n A[j * N + i] /= A[i * N + i];\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; ++k) {\n // Compute the multiplier for row k\n double multiplier = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k + 1; j < N; ++j) {\n A[j * N + k] /= multiplier;\n }\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "serial", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. 
\n A is an NxN matrix stored in row-major.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + i] * A[j * N + j];\n } else {\n A[i * N + j] /= A[j * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; ++k) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] = A[j * N + i] / A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] = A[j * N + k] - A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + j];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + j] * A[j * N + j];\n } else {\n A[i * N + j] /= A[j * N + j];\n A[i * N + j] *= -1;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) 
{\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i * N + k] * A[k * N + j];\n }\n A[i * N + j] = (A[i * N + j] - sum) / A[j * N + j];\n }\n for (size_t j = i; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += A[i * N + k] * A[k * N + j];\n }\n A[i * N + j] = A[i * N + j] - sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A[j][i] / A[i][i];\n for (size_t k = i; k < N; k++) {\n A[j][k] -= multiplier * A[i][k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1;\n } else if (i > j) {\n A[i * N + j] = 0;\n }\n }\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "hip", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\n__global__ void luFactorize(double *A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x + blockIdx.y * gridDim.x;\n int col = threadIdx.x + threadIdx.y * blockDim.x;\n if (row < N && col < N) {\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double temp = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= temp * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = A[row * N + col] - sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x + blockIdx.y * gridDim.x;\n int col = threadIdx.x + threadIdx.y * blockDim.x;\n if (row < col) {\n A[row * N + col] = 0.0;\n } else if (row == col) {\n A[row * N + col] = 1.0;\n } else {\n A[row * N + col] = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= A[row * N + col] * A[col * N + k];\n }\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * gridDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.x;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double temp = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= temp * A[j * N + k];\n }\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double temp = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= temp * 
A[j * N + k];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[col * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[col * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int tid = blockIdx.x + blockIdx.y * gridDim.x;\n int i = tid / N;\n int j = tid % N;\n if (i < j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; ++k) { A[k * N + j] -= A[k * N + i] * A[i * N + j]; }\n } else if (i == j) {\n A[i * N + j] = 1;\n for (int k = i + 1; k < N; ++k) { A[k * N + j] /= A[i * N + i]; }\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n\n if (row == col) {\n double pivot = A[row * N + col];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] /= pivot;\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col + 1; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int k = row + 1; k < N; k++) {\n A[k * N + col] -= A[k * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; 
k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double factor = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= factor * A[j * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. Store the result in A on rank 0.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Forward elimination\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row to all other ranks\n MPI_Bcast(&A[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row below the k-th row\n for (size_t i = k + 1; i < N; i++) {\n // Compute the multiplier\n double multiplier = A[i * N + k] / A[k * N + k];\n\n // Subtract the k-th row from the i-th row\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= multiplier * A[k * N + j];\n }\n }\n }\n\n // Gather the result to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // For each column j in row i\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in the lower triangular matrix L\n if (i > j) {\n // Subtract the product of the ith row of L and the jth column of U from the ith row of A\n for (size_t k = 0; k < j; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n // If i <= j, then A[i][j] is in the upper triangular matrix U\n else {\n // Divide the ith row of A by the jth column of L\n for (size_t k = 0; k < j; k++) {\n A[i * N + j] /= A[j * N + k];\n }\n\n // Subtract the product of the ith row of L and the jth column of U from the ith row of A\n for (size_t k = j + 1; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations to eliminate elements below the diagonal\n for (size_t j = i + 1; j < N; j++) {\n double factor 
= A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // For each column j\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in the lower triangular matrix L\n if (i > j) {\n // Compute A[i][j] as A[i][j] - A[i][0] * A[0][j]\n A[i * N + j] -= A[i * N] * A[j];\n }\n // Otherwise, A[i][j] is in the upper triangular matrix U\n else {\n // Compute A[i][j] as A[i][j] - A[i][0] * A[0][j] / A[0][0]\n A[i * N + j] -= A[i * N] * A[j] / A[0];\n }\n }\n\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N + i], 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // For each column j\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in L\n if (i > j) {\n // Compute A[i][j]\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i][k] * A[k][j];\n }\n A[i][j] = (A[i][j] - sum) / A[j][j];\n }\n // If i <= j, then A[i][j] is in U\n else {\n // Compute A[i][j]\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i][k] * A[k][j];\n }\n A[i][j] = A[i][j] - sum;\n }\n }\n\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * 
N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization on rank 0\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n\n // Gather the result from rank 0 to all other ranks\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Normalize the k-th row of A\n double pivot = A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[k * N + j] /= pivot;\n }\n\n // Update the remaining rows of A\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all ranks\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix A from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Forward elimination\n for (size_t i = 0; i < N; i++) {\n // Broadcast the pivot row to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all other ranks\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 
0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // Rank 0 broadcasts the ith row of L to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each column j\n for (size_t j = i; j < N; j++) {\n // Compute A[i][j] -= A[i][k] * A[k][j] for k < i\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n\n // Divide A[i][j] by A[i][i]\n A[i * N + j] /= A[i * N + i];\n }\n }\n\n // Gather the result from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + j];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + j] * A[j * N + j];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i in A\n for (size_t i = 1; i < N; i++) {\n // If rank 0, compute the multiplier for row i\n double multiplier = 0;\n if (rank == 0) {\n multiplier = A[i * N] / A[0];\n }\n\n // Broadcast the multiplier to all ranks\n MPI_Bcast(&multiplier, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each element in row i, subtract the corresponding element in row 0 multiplied by the multiplier\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] -= multiplier * A[j];\n }\n }\n\n // If rank 0, divide each element in row 0 by A[0][0]\n if (rank == 0) {\n for (size_t j = 0; j < N; j++) {\n A[j] /= A[0];\n }\n }\n\n // For each row i in A\n for (size_t i = 1; i < N; i++) {\n // For each element in row i, divide by A[i][i]\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n\n // For each row k after row i, subtract the corresponding element in row i multiplied by A[k][i]\n for (size_t k = i + 1; k < N; k++) {\n double multiplier = A[k * N + i];\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= multiplier * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(A.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Each rank computes its own row of L and U\n for (size_t i = 1; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A[i * N + j] /= A[j * N + j];\n }\n for (size_t j = i; j < N; j++) {\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks into rank 0\n MPI_Gather(A.data(), N * N, MPI_DOUBLE, A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "cuda", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. 
\n A is an NxN matrix stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\n__global__ void luFactorize(double *A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double temp = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= temp * A[col * N + i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row < col) {\n A[row * N + col] = A[row * N + col] / A[row * N + row];\n } else if (row == col) {\n for (int k = 0; k < row; k++) {\n A[row * N + col] -= A[row * N + k] * A[k * N + col];\n }\n } else {\n for (int k = 0; k < col; k++) {\n A[row * N + col] -= A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] /= A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * 
A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i == j) {\n A[i * N + i] /= A[i * N + j];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n for (int k = j; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + 
"generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) return;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else if (row > col) {\n double factor = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= factor * A[col * N + k];\n }\n A[row * N + col] = factor;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else {\n double factor = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= factor * A[col * N + k];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k 
< row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col + 1; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n }\n}\n", + "source_write_success": true, + 
"did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. Store the result in A on rank 0.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "(5.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n // Divide the ith row by A[i][i]\n double div = A[i * N + i];\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= div;\n }\n\n // Subtract the ith row from all rows below it\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n }\n\n // Broadcast the result to all ranks\n MPI_Bcast(A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization using Gaussian elimination\n for (size_t k = 0; k < N; k++) {\n // Pivot if necessary\n size_t max_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > std::abs(A[max_row * N + k])) {\n max_row = i;\n }\n }\n if (max_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[max_row * N + j]);\n }\n }\n\n // Eliminate the k-th column below the k-th row\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&A[i * N * N], N * N, 
MPI::DOUBLE, i, 0);\n }\n } else {\n MPI::COMM_WORLD.Send(&A[0], N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // MPI rank\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // MPI size\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Number of rows per rank\n int rows_per_rank = N / size;\n\n // Local matrix\n std::vector local_A(rows_per_rank * N);\n\n // Scatter the matrix to all ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Local lu factorization\n for (size_t i = 0; i < rows_per_rank; i++) {\n for (size_t j = i + 1; j < N; j++) {\n local_A[i * N + j] /= local_A[i * N + i];\n }\n for (size_t k = i + 1; k < rows_per_rank; k++) {\n for (size_t j = i + 1; j < N; j++) {\n local_A[k * N + j] -= local_A[k * N + i] * local_A[i * N + j];\n }\n }\n }\n\n // Gather the results back to rank 0\n MPI_Gather(&local_A[0], rows_per_rank * N, MPI_DOUBLE, &A[0], rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= multiplier * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the rank and number of threads\n int rank = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n // Compute the block size for each thread\n size_t block_size = N / num_threads;\n\n // Each thread factorizes its own block of rows\n for (size_t i = rank * block_size; i < (rank + 1) * block_size; i++) {\n // Pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap pivot row with current row\n if (pivot_row != i) {\n for (size_t k = 0; k < N; k++) {\n std::swap(A[i * N + k], A[pivot_row * N + k]);\n }\n }\n\n // Eliminate lower triangular elements\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; 
k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n }\n\n // Gather the factorized matrix from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix A from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Divide the current row by the pivot element\n double pivot = A[i * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= pivot;\n }\n\n // Subtract multiples of the current row from the remaining rows\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (MPI_Comm_rank(MPI_COMM_WORLD) == 0) {\n for (int i = 1; i < MPI_Comm_size(MPI_COMM_WORLD); i++) {\n MPI_Recv(&A[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the rank and number of threads\n int rank = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n // Compute the block size for each thread\n size_t block_size = N / num_threads;\n\n // Each thread factorizes its own block of the matrix\n for (size_t i = rank * block_size; i < (rank + 1) * block_size; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n }\n\n // Gather the results from all ranks and store them in A on rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform Gaussian elimination\n for (size_t k = 0; k < N; k++) {\n // Pivot row\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > std::abs(A[pivot_row * N + k])) {\n pivot_row = i;\n }\n }\n\n // Swap pivot row with row k\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n }\n }\n\n // Eliminate non-zero elements below pivot\n for (size_t i = 
k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Get the number of processes\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes a part of the matrix\n size_t chunk_size = N / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = start_row + chunk_size;\n\n // Perform LU factorization on the assigned part of the matrix\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < i; j++) {\n double multiplier = A[i][j] / A[j][j];\n for (size_t k = j; k < N; k++) {\n A[i][k] -= multiplier * A[j][k];\n }\n }\n for (size_t j = i; j < N; j++) {\n A[i][j] /= A[i][i];\n }\n }\n\n // Gather the results from all processes\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * chunk_size * N], chunk_size * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[start_row * N], chunk_size * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows per process\n int rowsPerProcess = N / size;\n\n // Compute the starting row index for this process\n int startRow = rank * rowsPerProcess;\n\n // Compute the ending row index for this process\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Factorize the submatrix of A owned by this process\n for (int i = startRow; i < endRow; ++i) {\n // Compute the pivot element\n double pivot = A[i][i];\n\n // Broadcast the pivot element to all other processes\n MPI_Bcast(&pivot, 1, MPI_DOUBLE, rank, MPI_COMM_WORLD);\n\n // Eliminate the elements below the pivot element\n for (int j = i + 1; j < N; ++j) {\n A[j][i] -= pivot * A[j][i] / A[i][i];\n }\n }\n\n // Reduce the matrix A to upper triangular form\n for (int k = 0; k < N - 1; ++k) {\n // Compute the pivot element\n double pivot = A[k][k];\n\n // Broadcast the pivot element to all other processes\n MPI_Bcast(&pivot, 1, MPI_DOUBLE, rank, MPI_COMM_WORLD);\n\n // Eliminate the elements above the pivot element\n for (int i = 0; i < k; ++i) {\n A[i][k] -= pivot * A[i][k] / A[k][k];\n }\n }\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Broadcast the pivot row to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all ranks\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n\n // Perform LU factorization\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 performs the row reduction on the ith row\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int myRank, numProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numProcs);\n\n // Each rank computes its own part of the matrix\n #pragma omp parallel for\n for (size_t i = myRank; i < N; i += numProcs) {\n for (size_t j = 0; j < i; j++) {\n A[i * N + j] /= A[j * N + j];\n for (size_t k = j + 1; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n }\n\n // Gather the results to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize L and U\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n double max = std::abs(A[k * N + k]);\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > max) {\n max = std::abs(A[i * N + k]);\n pivot_row = i;\n }\n }\n\n // Swap the pivot row with the k-th row\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n }\n }\n\n // Divide the pivot row by the pivot element\n double pivot = A[k * N + k];\n for (size_t j = 0; j < N; j++) {\n A[k * N + j] /= pivot;\n }\n\n // Subtract multiples of the pivot row from the other rows\n for (size_t i = 0; i < N; i++) {\n if (i != k) {\n double factor = A[i * N + k];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n }\n\n // Gather the result to rank 0\n if (rank != 0) {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(N * N);\n MPI_Recv(&recv_buf[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n A[j] += recv_buf[j];\n }\n }\n }\n}\n", + "\n\n // Copy the matrix A to rank 0\n MPI_Allgather(MPI_IN_PLACE, 0, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_WORLD);\n\n // Perform LU factorization on rank 0\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] -= A[i * N + i] * A[j * N + i];\n }\n\n for (size_t k = i + 1; k < N; 
k++) {\n for (size_t l = i + 1; l < N; l++) {\n A[k * N + l] -= A[k * N + i] * A[l * N + i];\n }\n }\n }\n\n // Scatter the factorized matrix A from rank 0 to all other processes\n MPI_Scatter(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_WORLD);\n}\n", + "\n for (size_t k = 0; k < N; ++k) {\n // MPI_Bcast the k-th row of A to all ranks.\n MPI_Bcast(&A[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Update the k-th row of A on all ranks.\n for (size_t j = k + 1; j < N; ++j) {\n A[k * N + j] /= A[k * N + k];\n }\n\n // Update the remaining rows of A on all ranks.\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n\n // MPI_Reduce the updated A from all ranks to rank 0.\n MPI_Reduce(&A[0], &A[0], N * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* For each value in the array x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(Kokkos::View const& x, Kokkos::View &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n auto rank = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n auto sorted_x = x;\n Kokkos::sort(sorted_x.data(), sorted_x.extent(0));\n ranks(i) = Kokkos::find(sorted_x, x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::find(sorted_x, x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) 
{\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n using policy_t = Kokkos::RangePolicy;\n Kokkos::parallel_for(policy_t(x.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n auto rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using rank_functor = Kokkos::Functor>;\n\n Kokkos::parallel_for(\n \"ranks\", Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = Kokkos::rank(x, x(i), Kokkos::greater());\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) > x(j)) { ranks(i) += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) { ranks(i) += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) {\n ranks(i) += 1;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x);\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n std::vector sorted_x(x.data(), x.data() + x.extent(0));\n std::sort(sorted_x.begin(), sorted_x.end());\n ranks(i) = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x(i)));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = x;\n std::sort(sorted_x.data(), sorted_x.data() + sorted_x.extent(0));\n ranks(i) = std::distance(sorted_x.data(),\n std::lower_bound(sorted_x.data(), sorted_x.data() + sorted_x.extent(0), x(i)));\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "omp", + "prompt": "#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs;\n pairs.reserve(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs.push_back(std::make_pair(x[i], i));\n }\n 
std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j] < x[i]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n for (size_t j = 0; j < n; ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> y(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n y[i] = std::make_pair(x[i], i);\n }\n std::sort(y.begin(), y.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[y[i].second] = i;\n }\n}\n", + "\n\n // Create a copy of the input vector\n std::vector sorted_x = x;\n\n // Sort the copy\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "(", + "\n\n size_t n = x.size();\n std::vector> pairs(n);\n for (size_t i = 0; i < n; ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n ranks.resize(n);\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "ranks", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n std::vector idx(n);\n std::iota(idx.begin(), idx.end(), 0);\n std::sort(idx.begin(), idx.end(), [&x](size_t i1, size_t i2) { return x[i1] < x[i2]; });\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n ranks[i] = idx[i];\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] == x[j]) {\n ranks[i] = j;\n break;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector indices(x.size());\n std::iota(indices.begin(), indices.end(), 0);\n std::sort(indices.begin(), indices.end(), [&x](size_t i1, size_t i2) { return x[i1] < x[i2]; });\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = indices[i];\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] 
= i;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "serial", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n // Sort the vector x\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Create a map from the sorted values to their original indices\n std::map sorted_index_map;\n for (size_t i = 0; i < sorted_x.size(); ++i) {\n sorted_index_map[sorted_x[i]] = i;\n }\n\n // Iterate over the original vector and store the rank of each value\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = sorted_index_map[x[i]];\n }\n}\n", + "\n\n std::map value_to_rank;\n for (size_t i = 0; i < x.size(); ++i) {\n auto it = value_to_rank.find(x[i]);\n if (it == value_to_rank.end()) {\n value_to_rank[x[i]] = i;\n }\n }\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = value_to_rank[x[i]] + 1;\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks.push_back(std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i])));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(), std::find(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(),\n 
std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(), std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector x_sorted(x);\n std::sort(x_sorted.begin(), x_sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]) - x_sorted.begin();\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n std::iota(ranks.begin(), ranks.end(), 0);\n std::sort(ranks.begin(), ranks.end(), [&](size_t i, size_t j) { return x[i] < x[j]; });\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "hip", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use AMD HIP to compute in parallel. 
The kernel will be launched with at least as many threads as elements in x.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\n__global__ void ranks(const float *x, size_t *ranks, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t tid = hipThreadIdx_x;\n if (tid < N) {\n float v = x[tid];\n size_t r = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < v) r++;\n }\n ranks[tid] = r;\n }\n}\n", + "\n\n // Get the index of the current thread\n const size_t tid = hipThreadIdx_x;\n\n // Compute the rank of the current element\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n float *ptr = x;\n size_t rank = 0;\n while (ptr < x + N && *ptr <= val) {\n if (*ptr == val) {\n rank++;\n }\n ptr++;\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float v = x[tid];\n int r = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < v) {\n ++r;\n }\n }\n ranks[tid] = r;\n }\n}\n", + "\n\n int idx = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (idx >= N) {\n return;\n }\n\n float my_value = x[idx];\n int my_rank = 1;\n for (int i = 0; i < N; i++) {\n if (x[i] < my_value) {\n my_rank++;\n }\n }\n ranks[idx] = my_rank;\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] < v) r++;\n }\n ranks[i] = r;\n }\n}\n", + "\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] < v) r++;\n }\n ranks[i] = r;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n 
if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n float *sorted_x = (float *)hipMalloc(N * sizeof(float));\n hipMemcpy(sorted_x, x, N * sizeof(float), hipMemcpyHostToDevice);\n hipSort(sorted_x, sorted_x + N, hipSortAscending);\n for (int i = 0; i < N; i++) {\n if (sorted_x[i] == val) {\n ranks[gid] = i;\n break;\n }\n }\n hipFree(sorted_x);\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = hipThreadIdx_x;\n\n // Compute the rank of the current element\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n size_t idx = hipThreadIdx_x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n ranks[tid] = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < x[tid]) {\n ranks[tid]++;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n float val = x[idx];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every process has a complete copy of x. 
Store the result in ranks on process 0.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), ranks.size(), MPI_SIZE_T, MPI_MAX, 0,\n MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, ranks.size(), MPI_SIZE_T, MPI_MAX, 0,\n MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; i++) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Create a vector of ranks\n ranks.resize(n);\n\n // Get the rank of each value in x\n for (size_t i = 0; i < n; i++) {\n ranks[i] = std::find(indices.begin(), indices.end(), i) - indices.begin();\n }\n\n // Gather the ranks on process 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector ranks_i;\n MPI::COMM_WORLD.Recv(ranks_i.data(), ranks_i.size(), MPI::UNSIGNED_LONG, i, 0);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI::COMM_WORLD.Send(ranks.data(), ranks.size(), MPI::UNSIGNED_LONG, 0, 0);\n }\n}\n", + "\n\n // sort a copy of x on each process\n auto x_sorted = std::vector(x);\n std::sort(x_sorted.begin(), x_sorted.end());\n\n // find the rank of each value in x on each process\n std::vector> local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n auto it = std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]);\n local_ranks[i].push_back(std:: distance(x_sorted.begin(), it));\n }\n\n // gather the local_ranks on process 0\n std::gather(local_ranks, std::back_inserter(ranks), 0);\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; ++i) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Get the rank of each value in x\n ranks.resize(n);\n for (size_t i = 0; i < n; ++i) {\n ranks[i] = std::find(indices.begin(), indices.end(), i) - indices.begin();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, ranks.size(), MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n local_ranks[i]++;\n }\n }\n }\n\n std::vector global_ranks(x.size());\n MPI_Allgather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG,\n global_ranks.data(), x.size(), MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n ranks = global_ranks;\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; ++i) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Get the rank of each value in x\n ranks.resize(n);\n for (size_t i = 0; i < n; ++i) {\n ranks[indices[i]] = i;\n }\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n ranks = local_ranks;\n }\n\n MPI_Gather(&local_ranks[0], local_ranks.size(), MPI_UNSIGNED_LONG, &ranks[0],\n ranks.size(), MPI_UNSIGNED_LONG, 0, comm);\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the rank of each element in the sorted vector\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; ++i) {\n std::vector ranks_i;\n MPI_Recv(&ranks_i[0], ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Recv(recv_ranks.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n ranks[j] += recv_ranks[j];\n }\n }\n } else {\n MPI_Send(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = i;\n }\n\n std::sort(local_ranks.begin(), local_ranks.end(),\n [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n if (rank == 0) {\n ranks.resize(x.size());\n }\n\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(),\n MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector ranks_i;\n MPI_Recv(ranks_i.data(), ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // compute the ranks on each process\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // gather the ranks from all processes on process 0\n if (my_rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(),\n MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n 
MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG,\n 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the rank of each element in the sorted vector\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; ++i) {\n std::vector ranks_i;\n MPI_Recv(&ranks_i[0], ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector local_x = x;\n std::sort(local_x.begin(), local_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(local_x.begin(),\n std::lower_bound(local_x.begin(), local_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Recv(recv_ranks.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n ranks[j] += recv_ranks[j];\n }\n }\n } else {\n MPI_Send(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n size_t n = x.size();\n std::vector local_ranks(n);\n std::sort(x.begin(), x.end());\n for (size_t i = 0; i < n; ++i) {\n local_ranks[i] = std::lower_bound(x.begin(), x.end(), x[i]) - x.begin();\n }\n MPI_Gather(local_ranks.data(), n, MPI_UNSIGNED_LONG, ranks.data(), n, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "cuda", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use CUDA to compute in parallel. 
The kernel will be launched with at least as many threads as elements in x.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\n__global__ void ranks(const float *x, size_t *ranks, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n float v = x[i];\n int r = 0;\n for (size_t j = 0; j < N; ++j) {\n if (x[j] < v) {\n ++r;\n }\n }\n ranks[i] = r;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0131951664, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01451033611298 + } + ] + }, + { + "generated_output": "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0131908302, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0162784828186 + } + ] + }, + { + "generated_output": "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0132047552, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0152872959137 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n ranks[i] = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n ranks[tid] = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ranks[tid]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0132069448, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08103979492188 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float value = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": 
true, + "are_all_valid": true, + "best_sequential_runtime": 0.0132031976, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01509706878662 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n float *d_x = (float *)x;\n for (size_t i = 0; i < N; i++) {\n if (d_x[i] < val) {\n ranks[idx]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.013203907, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01534448661804 + } + ] + }, + { + "generated_output": "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) rank++;\n }\n ranks[tid] = rank;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0132157934, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0158844543457 + } + ] + }, + { + "generated_output": "\n\n size_t i = block.thread_id.x + block.block_id.x * block.block_dim.x;\n if (i < N) {\n float val = x[i];\n std::vector y(x, x + N);\n std::sort(y.begin(), y.end());\n ranks[i] = std::lower_bound(y.begin(), y.end(), val) - y.begin();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0131967872, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01569454727173 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] <= v) {\n r++;\n }\n }\n ranks[i] = r - 1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0132113692, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01473971843719 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n 
float val = x[idx];\n float *sorted_x = new float[N];\n thrust::copy(x, x + N, sorted_x);\n thrust::sort(sorted_x, sorted_x + N);\n ranks[idx] = thrust::lower_bound(sorted_x, sorted_x + N, val) - sorted_x;\n delete[] sorted_x;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n float *d_x = (float *)x;\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (d_x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0132071632, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01607483482361 + } + ] + }, + { + "generated_output": "\n\n // Get the index of the current thread\n int tid = threadIdx.x;\n\n // Compute the rank of the current element\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0131901228, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01554163894653 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0131757882, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01491079692841 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n ranks[idx] = 0;\n for (int i = 0; i < N; ++i) {\n if (x[i] < x[idx]) {\n ranks[idx]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01319698, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08645040435791 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.013190688, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01469794559479 + } + ] + }, + { + "generated_output": "\n\n // Get the index of the current thread\n int tid = threadIdx.x;\n\n // Compute the rank of the current element\n float current_element = x[tid];\n size_t rank = 0;\n for 
(size_t i = 0; i < N; i++) {\n if (x[i] < current_element) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0131944234, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01412264308929 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0132144948, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0129708990097 + } + ] + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every process has a complete copy of x. Store the result in ranks on process 0.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the number of processes\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Create a vector of ranks on each process\n std::vector local_ranks(x.size());\n\n // Compute the ranks on each process\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n // Sort the local copy of x\n std::vector local_x = x;\n std::sort(local_x.begin(), local_x.end());\n\n // Find the rank of x[i] in the sorted vector\n auto it = std::lower_bound(local_x.begin(), local_x.end(), x[i]);\n local_ranks[i] = std::distance(local_x.begin(), it);\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG,\n ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]) - sorted_x.begin();\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector to store the ranks\n ranks.resize(n);\n\n // Sort the vector\n std::sort(x.begin(), 
x.end());\n\n // Find the rank of each element in the sorted vector\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n ranks[i] = std::distance(x.begin(), std::find(x.begin(), x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (ranks.size() > 0) {\n MPI_Gather(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG,\n &ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int world_size;\n MPI_Comm_size(MPI_COMM_WORLD, &world_size);\n\n std::vector local_x(x.begin() + my_rank * x.size() / world_size,\n x.begin() + (my_rank + 1) * x.size() / world_size);\n std::vector local_ranks(local_x.size());\n\n #pragma omp parallel for\n for (size_t i = 0; i < local_x.size(); ++i) {\n for (size_t j = 0; j < local_x.size(); ++j) {\n if (local_x[i] < local_x[j]) {\n ++local_ranks[i];\n }\n }\n }\n\n MPI_Reduce(local_ranks.data(), ranks.data(), local_ranks.size(), MPI_SIZE_T, MPI_SUM, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector x_sorted = x;\n std::sort(x_sorted.begin(), x_sorted.end());\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n ranks[i] = std::distance(x_sorted.begin(), std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]));\n }\n if (my_rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), n, MPI_SIZE_T, MPI_MAX, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, n, MPI_SIZE_T, MPI_MAX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the ranks of each element in the sorted vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_ranks;\n MPI_Recv(&recv_ranks[0], x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), recv_ranks.begin(), recv_ranks.end());\n }\n } else {\n MPI_Send(&ranks[0], x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::sort(local_x.begin(), local_x.end());\n\n std::vector local_ranks(local_x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < local_x.size(); ++i) {\n local_ranks[i] = std::distance(x.begin(), std::lower_bound(x.begin(), x.end(), local_x[i]));\n }\n\n 
MPI_Gather(local_ranks.data(), local_ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), local_ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a copy of x on each process\n std::vector local_x = x;\n\n // Sort the local copy of x\n std::sort(local_x.begin(), local_x.end());\n\n // Create a vector to store the ranks\n std::vector local_ranks(x.size());\n\n // Compute the ranks of the elements in the local copy of x\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(local_x.begin(), std::find(local_x.begin(), local_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Only process 0 has the complete list of ranks\n if (rank == 0) {\n ranks = std::move(ranks);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]) - sorted_x.begin();\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n ranks.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n if (my_rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), n, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, n, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, 
MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n local_ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector local_ranks(x.size());\n #pragma opm parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(x.begin(), std::min_element(x.begin(), x.end()));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Status status;\n MPI_Probe(i, 0, MPI_COMM_WORLD, &status);\n MPI_Get_count(&status, MPI_BYTE, &ranks.size());\n MPI_Send(&local_ranks, ranks.size(), MPI_BYTE, i, 0, MPI_COMM_WORLD);\n }\n } else {\n MPI_Send(&local_ranks, local_ranks.size(), MPI_BYTE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t n = x.size();\n std::vector local_ranks(n);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n local_ranks[i] = std::distance(x.begin(), std::min_element(x.begin(), x.end()));\n }\n\n MPI_Gather(local_ranks.data(), n, MPI_UNSIGNED_LONG, ranks.data(), n, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int world_size;\n MPI_Comm_size(MPI_COMM_WORLD, &world_size);\n int local_size = x.size() / world_size;\n int start = my_rank * local_size;\n int end = start + local_size;\n if (my_rank == world_size - 1) {\n end = x.size();\n }\n std::vector local_ranks(local_size);\n std::vector local_x(local_size);\n std::copy(x.begin() + start, x.begin() + end, local_x.begin());\n std::sort(local_x.begin(), local_x.end());\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_ranks[i] = std::distance(local_x.begin(), std::lower_bound(local_x.begin(), local_x.end(), x[start + i]));\n }\n if (my_rank == 0) {\n ranks.resize(x.size());\n }\n MPI_Gather(local_ranks.data(), local_size, MPI_UNSIGNED_LONG, ranks.data(), local_size, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), x.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "(", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n // Each process sorts its own copy of x\n std::vector x_sorted = x;\n 
std::sort(x_sorted.begin(), x_sorted.end());\n\n // Each process computes the ranks of its own elements\n std::vector ranks_local(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks_local[i] = std::distance(x_sorted.begin(), std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]));\n }\n\n // Process 0 gathers the ranks from all processes\n if (my_rank == 0) {\n ranks = ranks_local;\n for (int i = 1; i < MPI_COMM_WORLD->size; ++i) {\n std::vector ranks_recv(x.size());\n MPI_Recv(ranks_recv.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_recv.begin(), ranks_recv.end());\n }\n } else {\n MPI_Send(ranks_local.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use Kokkos to sort in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Create a view of the input array that ignores zero valued elements\n auto x_nz = view(\"x_nz\", x, LayoutRight(), {x.extent(0), 0});\n // Sort the non-zero elements in ascending order\n parallel_sort(x_nz.extent(0), KOKKOS_LAMBDA(const int i) {\n x_nz(i) = x(i);\n });\n // Copy the sorted non-zero elements back to the input array\n parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x_nz(i);\n });\n}\n", + "\n\n auto exec_space = x.execution_space();\n auto view_size = x.extent(0);\n auto lambda = KOKKOS_LAMBDA(const int& i) {\n return x(i) != 0;\n };\n auto mask = Kokkos::create_mask(exec_space, view_size, lambda);\n auto sorted = Kokkos::sort(exec_space, mask, x);\n Kokkos::deep_copy(x, sorted);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = std::numeric_limits::max();\n }\n });\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i == std::numeric_limits::max()) {\n x_i = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sortIgnoreZero\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n // Create a mask to identify non-zero elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"init_mask\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the non-zero elements\n Kokkos::sort(mask, x);\n\n // Shift the non-zero elements to the beginning of the array\n int non_zero_count = 0;\n Kokkos::parallel_for(\"shift_non_zero\", 
Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (mask(i)) {\n x(non_zero_count++) = x(i);\n }\n });\n\n // Fill the remaining elements with zeros\n Kokkos::parallel_for(\"fill_zeros\", Kokkos::RangePolicy(non_zero_count, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = 0;\n });\n}\n", + "\n\n // Create a mask to identify non-zero elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"mask_init\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the non-zero elements\n Kokkos::sort(mask, x);\n\n // Shift the non-zero elements to the beginning of the array\n Kokkos::parallel_for(\"shift_non_zero\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (mask(i)) {\n Kokkos::atomic_fetch_add(&x(0), x(i));\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i != 0) {\n x_i = std::numeric_limits::max();\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i == std::numeric_limits::max()) {\n x_i = 0;\n }\n });\n}\n", + "\n\n using namespace Kokkos;\n\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n auto lambda = KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n x(i) = x(i) < 0 ? -x(i) : x(i);\n }\n };\n parallel_for(\"sortIgnoreZero\", policy, lambda);\n parallel_sort(policy, x.data(), x.data() + x.extent(0));\n lambda = KOKKOS_LAMBDA(const int &i) {\n if (x(i) < 0) {\n x(i) = -x(i);\n }\n };\n parallel_for(\"sortIgnoreZero\", policy, lambda);\n}\n", + "\n\n Kokkos::parallel_for(\"sortIgnoreZero\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::View y = x.filter([](int x) { return x != 0; });\n Kok\u30b3\u30af::sort(y);\n Kok\u30b3\u30af::View z = x.map([](int x) { return x == 0 ? 
0 : y[0]; });\n y.copy_to(z);\n}\n", + "\n\n // Create a mask to ignore zero valued elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"mask\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the array using the mask\n Kokkos::sort(mask, x);\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i, int &count) {\n if (x(i) != 0) { count++; }\n });\n\n // Create a temporary array to store the non-zero elements\n Kokkos::View y(\"y\", count);\n\n // Copy the non-zero elements to the temporary array\n int j = 0;\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) { y(j++) = x(i); }\n });\n\n // Sort the temporary array\n Kokkos::sort(y);\n\n // Copy the sorted non-zero elements back to the original array\n j = 0;\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) { x(i) = y(j++); }\n });\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n using exec_space = Kokkos::DefaultExecutionSpace;\n using view_t = Kokkos::View;\n\n auto x_nonzero = view_t(\"x_nonzero\", x.extent(0));\n auto x_nonzero_count = 0;\n\n Kokkos::parallel_for(\n \"sortIgnoreZero\", policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) {\n x_nonzero(x_nonzero_count++) = x(i);\n }\n });\n\n Kokkos::sort(x_nonzero, x_nonzero_count);\n\n x_nonzero_count = 0;\n Kokkos::parallel_for(\n \"sortIgnoreZero\", policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) {\n x(i) = x_nonzero(x_nonzero_count++);\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i == 0) {\n x_i = x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i == 0) {\n x_i = x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use OpenMP to sort x in parallel. 
\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n ++count;\n }\n }\n\n // Create a vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j];\n j++;\n }\n }\n}\n", + "\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(non_zero.begin(), non_zero.end());\n }\n }\n\n // Insert the non-zero elements back into x\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + ".", + "\n\n std::vector temp;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n temp.push_back(x[i]);\n }\n }\n std::sort(temp.begin(), temp.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = temp[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n 
std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int n = 0;\n #pragma omp parallel for reduction(+:n)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n n++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(n);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[index++] = x[i];\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[index++];\n }\n }\n}\n", + "\n\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n std::sort(non_zero.begin(), non_zero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + 
"\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count non-zero elements\n int n = 0;\n #pragma omp parallel for reduction(+:n)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n n++;\n }\n }\n\n // Create a vector to store non-zero elements\n std::vector y(n);\n\n // Copy non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the non-zero elements\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n ++count;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Create a vector to store the non-zero 
elements\n std::vector non_zero;\n\n // Iterate over the input vector and copy non-zero elements to the new vector\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Copy the sorted non-zero elements back to the input vector\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j++];\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "serial", + "prompt": "/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place.\n Example:\n\n\t input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a == 0 && b != 0) || a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return a != 0 && (b == 0 || a < b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a != 0 && b != 0) ? a < b : a == 0;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::stable_partition(x.begin(), x.end(),\n [](int a) { return a != 0; });\n std::sort(x.begin(), x.end());\n\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a == 0 && b != 0) || (a != 0 && b == 0) ? 
a > b : a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(),\n [](int a, int b) { return a != 0 && (a < b || b == 0); });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) {\n return false;\n } else if (b == 0) {\n return true;\n } else {\n return a < b;\n }\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::vector y;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) { y.push_back(x[i]); }\n }\n std::sort(y.begin(), y.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector sorted;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted.push_back(x[i]);\n }\n }\n std::sort(sorted.begin(), sorted.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sorted[j];\n j++;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "hip", + "prompt": "/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use AMD HIP to compute in parallel. The kernel will be launched with 1 thread per element.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\n__global__ void sortIgnoreZero(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int temp = x[tid];\n int i = tid - 1;\n while (i >= 0 && x[i] > temp) {\n x[i + 1] = x[i];\n i--;\n }\n x[i + 1] = temp;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n // sort non-zero elements\n __shared__ int sdata[256];\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n int stride = blockDim.x * gridDim.x;\n int offset = bid * stride;\n int count = 0;\n int val = x[gid];\n if (val != 0) {\n sdata[tid] = val;\n count++;\n }\n __syncthreads();\n int n = count;\n int i = tid;\n while (i < n) {\n sdata[i] = sdata[tid];\n i += stride;\n }\n __syncthreads();\n i = tid;\n while (i < n) {\n int j = i + 1;\n while (j < n) {\n if (sdata[i] > sdata[j]) {\n int tmp = sdata[i];\n sdata[i] = sdata[j];\n sdata[j] = tmp;\n }\n j += stride;\n }\n i += stride;\n }\n __syncthreads();\n i = tid;\n while (i < n) {\n x[offset + i] = sdata[i];\n i += stride;\n }\n}\n", + "\n\n // Find the number of non-zero elements\n int num_non_zero = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n num_non_zero++;\n }\n }\n\n // Create a temporary array to store the non-zero elements\n int *temp = new int[num_non_zero];\n\n // Copy the non-zero elements to the temporary array\n int j = 0;\n for (size_t i = 
0; i < N; i++) {\n if (x[i] != 0) {\n temp[j] = x[i];\n j++;\n }\n }\n\n // Sort the temporary array\n std::sort(temp, temp + num_non_zero);\n\n // Copy the sorted non-zero elements back to the original array\n j = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n x[i] = temp[j];\n j++;\n }\n }\n\n // Delete the temporary array\n delete[] temp;\n}\n", + "\n\n\t// Find the number of non-zero elements\n\tint count = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tcount++;\n\t\t}\n\t}\n\n\t// Create an array to store the non-zero elements\n\tint *y = new int[count];\n\n\t// Copy the non-zero elements to the new array\n\tint j = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\ty[j] = x[i];\n\t\t\tj++;\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tstd::sort(y, y + count);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tj = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = y[j];\n\t\t\tj++;\n\t\t}\n\t}\n\n\t// Delete the temporary array\n\tdelete[] y;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i - 1] > x[i]) {\n int temp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = temp;\n i--;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\tint j = tid + 1;\n\t\twhile (j < N && x[j] == 0) {\n\t\t\tj++;\n\t\t}\n\t\tif (j < N) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[j];\n\t\t\tx[j] = tmp;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N && x[i] != 0) {\n int j = i + 1;\n while (j < N && x[j] == 0) {\n j++;\n }\n if (j < N) {\n int temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i - 1] > x[i]) {\n int tmp = x[i - 1];\n x[i - 1] = x[i];\n x[i] = tmp;\n i--;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = tmp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint temp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n for (int i = tid + 1; i < N; i++) {\n if (x[i] != 0 && x[i] < x[tid]) {\n int tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n 
i--;\n }\n }\n }\n}\n", + "\n\n // Sort the non-zero elements\n thrust::sort(thrust::device_ptr(x), thrust::device_ptr(x + N), thrust::greater());\n\n // Count the number of non-zero elements\n int num_non_zero = thrust::count(thrust::device_ptr(x), thrust::device_ptr(x + N), thrust::not_equal_to(0));\n\n // Move the non-zero elements to the beginning of the array\n thrust::copy(thrust::device_ptr(x), thrust::device_ptr(x + num_non_zero), thrust::device_ptr(x));\n\n // Fill the remaining elements with zeros\n thrust::fill(thrust::device_ptr(x + num_non_zero), thrust::device_ptr(x + N), 0);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tif (x[tid] != 0) {\n\t\t\tint i = tid;\n\t\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\t\tint temp = x[i];\n\t\t\t\tx[i] = x[i - 1];\n\t\t\t\tx[i - 1] = temp;\n\t\t\t\ti--;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Sort the array in ascending order ignoring elements with value 0.\n\t// Leave zero valued elements in-place.\n\tint my_rank = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (my_rank < N && x[my_rank] != 0) {\n\t\tfor (int i = my_rank + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[my_rank]) {\n\t\t\t\tint temp = x[my_rank];\n\t\t\t\tx[my_rank] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use MPI to sort x in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Store the result in x on rank 0.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n for (int i = count; i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n // Gather all non-zero elements to rank 0\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(nonZero.data(), nonZero.size(), MPI_INT, x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Get the size of the communicator\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the sorted elements\n std::vector sorted_x;\n\n // Gather all the non-zero elements from all the ranks\n std::vector non_zero_elements;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_elements.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n\n // Scatter the sorted non-zero elements to all the ranks\n MPI_Scatter(non_zero_elements.data(), non_zero_elements.size() / size, MPI_INT,\n sorted_x.data(), non_zero_elements.size() / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into the original vector\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sorted_x[index++];\n }\n }\n}\n", + "\n\n // Get the size and rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size() * size);\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n if (rank != 0) {\n non_zero.resize(sorted_non_zero.size() / size);\n }\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into the original vector\n int index = 0;\n for (int i = 0; i < 
x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[index++];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size());\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, x.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with zeros\n for (int i = non_zero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int n = x.size();\n std::vector y(n);\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n y[count] = x[i];\n count++;\n }\n }\n std::sort(y.begin(), y.begin() + count);\n int index = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = y[index];\n index++;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_elements[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(non_zero_elements.data(), count / size, MPI_INT, x.data(), count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n for (int i = count / size; i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector x_non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x_non_zero.push_back(x[i]);\n }\n }\n std::sort(x_non_zero.begin(), x_non_zero.end());\n int count = x_non_zero.size();\n MPI_Gather(&count, 1, MPI_INT, NULL, 0, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n int total_count = 0;\n for (int i = 0; i < size; i++) {\n int count_i;\n 
MPI_Gather(&count_i, 1, MPI_INT, &count_i, 1, MPI_INT, i, MPI_COMM_WORLD);\n total_count += count_i;\n }\n x.resize(total_count);\n }\n MPI_Gatherv(x_non_zero.data(), x_non_zero.size(), MPI_INT, x.data(), NULL, NULL, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sortedNonZero;\n if (rank == 0) {\n sortedNonZero.resize(nonZero.size() * size);\n }\n MPI_Gather(nonZero.data(), nonZero.size(), MPI_INT, sortedNonZero.data(), nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements to all ranks\n if (rank != 0) {\n sortedNonZero.resize(nonZero.size());\n }\n MPI_Scatter(sortedNonZero.data(), nonZero.size(), MPI_INT, sortedNonZero.data(), nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into x\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sortedNonZero[index++];\n }\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Insert the non-zero elements back into the original vector\n int j = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Gather the non-zero elements from all ranks\n MPI_Gather(&x[0], x.size(), MPI_INT, &y[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements\n if (rank == 0) {\n std::sort(y.begin(), y.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&y[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); 
i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size());\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, x.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with zeros\n for (int i = non_zero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sortedNonZero;\n MPI_Gather(&nonZero[0], nonZero.size(), MPI_INT, &sortedNonZero[0], nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements to all ranks\n MPI_Scatter(&sortedNonZero[0], nonZero.size(), MPI_INT, &x[0], nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n for (int i = nonZero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int myrank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (myrank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // 
Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero_x.begin(), non_zero_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(non_zero_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n MPI_Gather(&non_zero[0], non_zero.size(), MPI_INT, &sorted_non_zero[0], non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&sorted_non_zero[0], non_zero.size(), MPI_INT, &x[0], non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the zero elements back into the vector\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = sorted_non_zero[index++];\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "cuda", + "prompt": "/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use CUDA to compute in parallel. 
The kernel will be launched with 1 thread per element.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\n__global__ void sortIgnoreZero(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\t// Find the number of non-zero elements\n\tint num_non_zero = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnum_non_zero++;\n\t\t}\n\t}\n\n\t// Allocate memory for the non-zero elements\n\tint *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n\t// Copy the non-zero elements to the new array\n\tint non_zero_index = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnon_zero_elements[non_zero_index] = x[i];\n\t\t\tnon_zero_index++;\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tthrust::sort(non_zero_elements, non_zero_elements + num_non_zero);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tnon_zero_index = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = non_zero_elements[non_zero_index];\n\t\t\tnon_zero_index++;\n\t\t}\n\t}\n\n\t// Free the memory allocated for the non-zero elements\n\tfree(non_zero_elements);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid >= N) return;\n\tif (x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[tid] > x[i]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 
1] = tmp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": ".", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (x[tid] != 0) {\n\t\tx[tid] = x[tid] > 0 ? x[tid] : -x[tid];\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\t// Each thread sorts its own element\n\tif (x[threadIdx.x] != 0) {\n\t\tfor (int i = threadIdx.x + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[threadIdx.x] > x[i]) {\n\t\t\t\tint temp = x[threadIdx.x];\n\t\t\t\tx[threadIdx.x] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\t// Find the number of non-zero elements\n\tint num_non_zero = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnum_non_zero++;\n\t\t}\n\t}\n\n\t// Allocate memory for the non-zero elements\n\tint *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n\t// Copy the non-zero elements to the new array\n\tint idx = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnon_zero_elements[idx++] = x[i];\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tthrust::sort(non_zero_elements, non_zero_elements + num_non_zero);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tidx = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = non_zero_elements[idx++];\n\t\t}\n\t}\n\n\t// Free the memory allocated for the non-zero elements\n\tfree(non_zero_elements);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = bid * blockDim.x + tid;\n\n\tif (i < N && x[i] != 0) {\n\t\tint j = i + 1;\n\t\twhile (j < N && x[j] == 0) {\n\t\t\tj++;\n\t\t}\n\t\tif (j < N) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[j];\n\t\t\tx[j] = tmp;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i - 
1] > x[i]) {\n\t\t\tint temp = x[i - 1];\n\t\t\tx[i - 1] = x[i];\n\t\t\tx[i] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "Ignore the zero value in the list.", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Get the thread index\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\n\t// If the element is not zero, sort it\n\tif (x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint start = bid * bid_size + tid;\n\tint end = start + bid_size;\n\tif (start >= N) return;\n\tif (end > N) end = N;\n\tint count = 0;\n\tfor (int i = start; i < end; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[count++] = x[i];\n\t\t}\n\t}\n\tfor (int i = count; i < end; i++) {\n\t\tx[i] = 0;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tswap(x[i], x[i - 1]);\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint temp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\t// Sort the non-zero elements\n\tint *d_sorted = (int *)malloc(N * sizeof(int));\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\td_sorted[count] = x[i];\n\t\t\tcount++;\n\t\t}\n\t}\n\tthrust::sort(d_sorted, d_sorted + count);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tcount = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = 
d_sorted[count];\n\t\t\tcount++;\n\t\t}\n\t}\n\tfree(d_sorted);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint val = x[tid];\n\tif (val == 0) {\n\t\treturn;\n\t}\n\tint i = tid;\n\twhile (i > 0 && x[i - 1] > val) {\n\t\tx[i] = x[i - 1];\n\t\ti--;\n\t}\n\tx[i] = val;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tif (x[tid] != 0) {\n\t\t\tint i = tid;\n\t\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\t\tswap(x[i], x[i - 1]);\n\t\t\t\ti--;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the number of non-zero elements\n int num_non_zero = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n num_non_zero++;\n }\n }\n\n // Create an array to store the non-zero elements\n int *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n // Copy the non-zero elements to the new array\n int index = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n non_zero_elements[index] = x[i];\n index++;\n }\n }\n\n // Sort the non-zero elements\n qsort(non_zero_elements, num_non_zero, sizeof(int), compare);\n\n // Copy the sorted non-zero elements back to the original array\n index = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n x[i] = non_zero_elements[index];\n index++;\n }\n }\n\n // Free the memory allocated for the non-zero elements array\n free(non_zero_elements);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use MPI and OpenMP to sort x in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Store the result in x on rank 0.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the number of processes\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Create a vector to store the sorted values\n std::vector sorted(x.size());\n\n // Gather the sorted values from each rank\n MPI_Gather(x.data(), x.size(), MPI_INT, sorted.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // If the current rank is 0, store the sorted values in x\n if (rank == 0) {\n x = sorted;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort x on each rank\n std::sort(x.begin(), x.end());\n\n // Gather the sorted x from all ranks to rank 0\n std::vector gathered_x;\n if (rank == 0) {\n gathered_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, gathered_x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the gathered x on rank 0\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end());\n x = gathered_x;\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Scatter the sorted non-zero elements to all ranks\n MPI_Scatter(non_zero.data(), non_zero.size() / nprocs, MPI_INT,\n x.data(), non_zero.size() / nprocs, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Gather the sorted non-zero elements from all ranks to rank 0\n MPI_Gather(x.data(), non_zero.size() / nprocs, MPI_INT,\n non_zero.data(), non_zero.size() / nprocs, MPI_INT, 0, MPI_COMM_WORLD);\n\n // If rank is 0, insert the sorted non-zero elements into x\n if (rank == 0) {\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n continue;\n }\n x[i] = non_zero[index++];\n }\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n 
std::vector x_nonzero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x_nonzero.push_back(x[i]);\n }\n }\n std::sort(x_nonzero.begin(), x_nonzero.end());\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = x_nonzero[count];\n count++;\n }\n }\n if (my_rank == 0) {\n x.resize(n);\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n std::vector x_local(n);\n std::vector x_sorted(n);\n std::copy(x.begin(), x.end(), x_local.begin());\n std::sort(x_local.begin(), x_local.end(), [](int a, int b) { return a != 0 && (a < b); });\n MPI_Gather(x_local.data(), n, MPI_INT, x_sorted.data(), n, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n x = x_sorted;\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector non_zero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n std::sort(non_zero.begin(), non_zero.end());\n int non_zero_size = non_zero.size();\n int recv_count[MPI_COMM_SIZE];\n int recv_displs[MPI_COMM_SIZE];\n MPI_Gather(&non_zero_size, 1, MPI_INT, recv_count, 1, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n recv_displs[0] = 0;\n for (int i = 1; i < MPI_COMM_SIZE; i++) {\n recv_displs[i] = recv_displs[i - 1] + recv_count[i - 1];\n }\n }\n MPI_Gatherv(non_zero.data(), non_zero_size, MPI_INT, x.data(), recv_count, recv_displs, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n int j = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] == 0) {\n continue;\n }\n x[j] = x[i];\n j++;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Allocate memory for the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to y\n int index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[index++] = x[i];\n }\n }\n\n // Sort y\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to x on rank 0\n if (rank == 0) {\n index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[index++];\n }\n }\n }\n}\n", + "\n\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n int n = x.size();\n std::vector x_nonzero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x_nonzero.push_back(x[i]);\n }\n }\n std::sort(x_nonzero.begin(), x_nonzero.end());\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = x_nonzero[count];\n count++;\n }\n }\n if (myrank == 0) {\n x.resize(n);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the gathered non-zero elements\n std::sort(sorted_x.begin(), 
sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector nonZero;\n if (rank == 0) {\n nonZero.resize(count);\n }\n MPI_Gather(&x[0], x.size(), MPI_INT, &nonZero[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(nonZero.begin(), nonZero.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&nonZero[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted.begin(), sorted.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, 
&global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Allocate memory for the sorted array\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n MPI_Gather(&x[0], x.size(), MPI_INT, &sorted_x[0], global_count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the gathered array on rank 0\n if (my_rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end());\n }\n\n // Scatter the sorted array back to all ranks\n MPI_Scatter(&sorted_x[0], global_count / nprocs, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 fills in the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Create a vector of size x.size() to store the sorted values\n std::vector sorted_x(x.size());\n\n // Create a vector of size x.size() to store the indices of the sorted values\n std::vector sorted_indices(x.size());\n\n // Sort the indices of x in ascending order\n std::iota(sorted_indices.begin(), sorted_indices.end(), 0);\n std::sort(sorted_indices.begin(), sorted_indices.end(),\n [&x](int i, int j) { return x[i] < x[j]; });\n\n // Create a vector of size x.size() to store the sorted values\n std::vector sorted_x(x.size());\n\n // Copy the sorted values from x into sorted_x\n for (int i = 0; i < x.size(); i++) {\n sorted_x[i] = x[sorted_indices[i]];\n }\n\n // Broadcast the sorted values to all ranks\n MPI_Bcast(&sorted_x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Copy the sorted values into x on rank 0\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel 
for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to their original positions\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = i;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted.begin(), sorted.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) count++;\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) std::sort(non_zero.begin(), non_zero.end());\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) x[i] = -1;\n }\n\n // Sort the elements in-place on each rank\n std::sort(x.begin(), x.end());\n\n // Replace -1 with 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == -1) x[i] = 0;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(Kokkos::View const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View sorted_x(\"sorted_x\", x.size());\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i) {\n sorted_x(i) = x(i);\n });\n Kokkos::sort(sorted_x.data(), sorted_x.data() + sorted_x.size());\n return sorted_x(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a Kokkos view of the input vector\n auto x_view = Kokkos::create_mirror_view(x);\n\n // Create a Kokkos view of the output vector\n auto y_view = Kokkos::create_mirror_view(1);\n\n // Create a Kokkos kernel to find the k-th smallest element\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x_view.extent(0)),\n KOKKOS_LAMBDA(const int i, int& y) {\n if (i < k) {\n y = std::min(y, x_view(i));\n }\n },\n y_view);\n\n // Copy the output vector back to the host\n int y;\n Kokkos::deep_copy(y, y_view);\n\n return y;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n\n // Find the k-th smallest element using a parallel reduction.\n int kth_smallest = parallel_reduce(\n Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i < k) {\n kth_smallest = std::min(kth_smallest, x(i));\n }\n },\n k);\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n auto x_copy = Kokkos::View(n);\n Kokkos::deep_copy(x_copy, x);\n\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& result) {\n if (i == k - 1) {\n result = x_copy[i];\n }\n }, tag_type());\n\n return x_copy[k - 1];\n}\n", + "\n\n Kokkos::View y(\"y\", x.size());\n Kokkos::parallel_for(\"copy\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i) {\n y(i) = x(i);\n });\n Kokkos::sort(y);\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n using l_d_type = Kokkos::LayoutLeft;\n using l_d_view_type = Kokkos::View;\n\n const int n = x.extent(0);\n l_d_view_type y(\"y\", n);\n\n parallel_for(policy_type(0, n), KOKKOS_LAMBDA(const int& i) { y(i) = x(i); }, tag_type());\n\n Kokkos::sort(y);\n\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n auto x_copy = Kokkos::View(n);\n Kokkos::deep_copy(x_copy, x);\n\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& result) {\n if (i == k - 1) {\n result = x_copy(i);\n }\n }, tag_type());\n\n return x_copy(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n int n = x.extent(0);\n if (k > n) {\n throw std::invalid_argument(\"k cannot be greater than the size of the vector\");\n }\n\n auto kth_smallest = parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(const int& i, int& kth) {\n if (i < k) {\n kth = std::min(kth, x(i));\n }\n },\n std::numeric_limits::max());\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Tag = Kokkos::Tag;\n\n int n = x.extent(0);\n if (k > n) {\n throw std::invalid_argument(\"k must be less than or equal to the size of the vector\");\n }\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(Policy(0, n), x, 
Tag());\n\n // Return the k-th smallest element\n return x(k - 1);\n}\n", + "\n\n Kok\u30b3\u30af::View y = Kok\u30b3\u30af::sort(x);\n return y[k - 1];\n}\n", + "\n\n // Create a Kokkos view for the input vector x\n Kokkos::View x_host = Kokkos::create_mirror_view(x);\n\n // Create a Kokkos view for the output vector y\n Kokkos::View y_host = Kokkos::create_mirror_view(Kokkos::View(1));\n\n // Create a Kokkos parallel for loop to find the k-th smallest element\n Kokkos::parallel_for(\"findKthSmallest\", Kokkos::RangePolicy(0, x_host.size()), KOKKOS_LAMBDA(int i) {\n // Sort the input vector x\n std::sort(x_host.begin(), x_host.end());\n\n // Get the k-th smallest element\n y_host(0) = x_host(k - 1);\n });\n\n // Copy the output vector y back to the device\n Kokkos::deep_copy(x, y_host);\n\n // Return the k-th smallest element\n return y_host(0);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n int kth_smallest = 0;\n Kokkos::parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i == k - 1) {\n kth_smallest = x(i);\n }\n },\n kth_smallest, tag_type());\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a Kokkos view of the input vector\n View y = Kokkos::create_mirror_view(x);\n\n // Sort the vector in parallel\n parallel_sort(y.extent(0), KOKKOS_LAMBDA(int i, int j) { return y(i) < y(j); });\n\n // Return the k-th smallest element\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n\n int n = x.extent(0);\n int kth_smallest = 0;\n Kokkos::parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(const int& i, int& kth_smallest) {\n if (i < k) {\n kth_smallest = std::max(kth_smallest, x(i));\n }\n },\n kth_smallest);\n return kth_smallest;\n}\n", + "\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(int i, int j) { return x(i) < x(j); });\n\n // Return the k-th smallest element\n return x(k - 1);\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::Work;\n\n const int n = static_cast(x.extent(0));\n Kokkos::View y(\"y\", n);\n\n parallel_for(Work(n), KOKKOS_LAMBDA(int i) {\n y[i] = x[i];\n });\n\n std::sort(y.data(), y.data() + n);\n\n return y[k - 1];\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n int n = x.extent(0);\n int kth = 0;\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(policy_t(0, n), x);\n\n // Get the k-th smallest element\n kth = x(k - 1);\n\n return kth;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n int kth_smallest = -1;\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i == k - 1) {\n kth_smallest = x(i);\n }\n }, tag_type());\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a parallel reduction to find the k-th smallest element\n auto kth_smallest = reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& min) {\n if (i < k) {\n min = std::min(min, x(i));\n }\n }, min);\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using view_type = Kokkos::View;\n\n // Create a copy of the input vector\n view_type y = Kokkos::create_mirror_view(x);\n\n // Sort the copy in parallel\n Kokkos::parallel_sort(y.extent(0), KOKKOS_LAMBDA(int i, int j) { return 
y(i) < y(j); });\n\n // Return the k-th smallest element\n return y(k - 1);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n int kth = 0;\n #pragma omp parallel for reduction(min:kth)\n for (int i = 0; i < n; i++) {\n if (x[i] < kth || kth == 0) {\n kth = x[i];\n }\n }\n return kth;\n}\n", + "\n\n // Sort the vector in parallel\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n for (int j = i + 1; j < x.size(); ++j) {\n if (x[i] > x[j]) {\n std::swap(x[i], x[j]);\n }\n }\n }\n }\n\n // Return the k-th smallest element\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n // Create a shared variable to store the k-th smallest element\n int kth_smallest;\n\n // Create a parallel region\n #pragma omp parallel\n {\n // Each thread computes the k-th smallest element of its local subarray\n int local_kth_smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n\n // Update the shared variable with the minimum of the local k-th smallest elements\n #pragma omp critical\n {\n if (local_kth_smallest < kth_smallest) {\n kth_smallest = local_kth_smallest;\n }\n }\n }\n\n // Return the k-th smallest element\n return kth_smallest;\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), 
sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(sorted_x.begin(), sorted_x.end());\n }\n }\n return sorted_x[k - 1];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "serial", + "prompt": "/* Find the k-th smallest element of the vector x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th element of the sorted vector.\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th smallest element.\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n if (k < 1 || k > n) {\n throw std::invalid_argument(\"k must be between 1 and n\");\n }\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n if (k < 1 || k > n) {\n throw std::invalid_argument(\"k must be between 1 and the size of the vector.\");\n }\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "=4", + "\n\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n", + "\n\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n", + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th smallest element.\n return x[k - 1];\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": 
"41_sort_k-th_smallest_element", + "parallelism_model": "hip", + "prompt": "/* Find the k-th smallest element of the vector x.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\n__global__ void findKthSmallest(const int *x, size_t N, int k, int *kthSmallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int local_min = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n local_min = min(local_min, x[i]);\n }\n atomicMin(kthSmallest, local_min);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int stride = blockDim.x;\n while (stride < N) {\n if (tid < N / stride) {\n if (kthSmallest[tid] > kthSmallest[tid + stride]) {\n int tmp = kthSmallest[tid];\n kthSmallest[tid] = kthSmallest[tid + stride];\n kthSmallest[tid + stride] = tmp;\n }\n }\n __syncthreads();\n stride *= 2;\n }\n if (tid == 0) {\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n hipSort(kthSmallest, kthSmallest + N, hipSortAscending);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int local_min = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n local_min = min(local_min, x[i]);\n }\n atomicMin(kthSmallest, local_min);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid == 0) {\n *kthSmallest = x[tid];\n } else {\n if (x[tid] < *kthSmallest) {\n *kthSmallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int numBlocks = hipGridDim_x;\n int numThreads = hipBlockDim_x;\n int numElements = numBlocks * numThreads;\n int numLevels = 0;\n while (numElements > 1) {\n numLevels++;\n numElements = (numElements + numThreads - 1) / numThreads;\n }\n for (int level = 0; level < numLevels; level++) {\n int stride = 1 << 
level;\n int offset = tid & (stride - 1);\n int other = tid ^ offset;\n if (offset < other && other < numElements) {\n if (kthSmallest[tid] > kthSmallest[other]) {\n int tmp = kthSmallest[tid];\n kthSmallest[tid] = kthSmallest[other];\n kthSmallest[other] = tmp;\n }\n }\n __syncthreads();\n }\n if (tid == 0) {\n *kthSmallest = kthSmallest[0];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = blockId * blockDim + tid;\n int end = min(start + blockDim, N);\n int local_kthSmallest = x[start];\n for (int i = start + 1; i < end; i++) {\n if (x[i] < local_kthSmallest) {\n local_kthSmallest = x[i];\n }\n }\n atomicMin(kthSmallest, local_kthSmallest);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n __shared__ int s[hipBlockDim_x];\n s[tid] = x[tid];\n __syncthreads();\n if (tid == 0) {\n hipSort(s, hipBlockDim_x);\n *kthSmallest = s[k - 1];\n }\n }\n}\n", + "K", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int *d_temp = new int[N];\n for (int i = 0; i < N; i += 2) {\n if (i + 1 < N) {\n if (kthSmallest[i] > kthSmallest[i + 1]) {\n d_temp[i / 2] = kthSmallest[i + 1];\n d_temp[i / 2 + N / 2] = kthSmallest[i];\n } else {\n d_temp[i / 2] = kthSmallest[i];\n d_temp[i / 2 + N / 2] = kthSmallest[i + 1];\n }\n } else {\n d_temp[i / 2] = kthSmallest[i];\n }\n }\n kthSmallest = d_temp;\n __syncthreads();\n if (N % 2 == 1) {\n N = N / 2 + 1;\n } else {\n N = N / 2;\n }\n while (N > 1) {\n for (int i = 0; i < N; i += 2) {\n if (i + 1 < N) {\n if (kthSmallest[i] > kthSmallest[i + 1]) {\n d_temp[i / 2] = kthSmallest[i + 1];\n d_temp[i / 2 + N / 2] = kthSmallest[i];\n } else {\n d_temp[i / 2] = kthSmallest[i];\n d_temp[i / 2 + N / 2] = kthSmallest[i + 1];\n }\n } else {\n d_temp[i / 2] = kthSmallest[i];\n }\n }\n kthSmallest = d_temp;\n __syncthreads();\n if (N % 2 == 1) {\n N = N / 2 + 1;\n } else {\n N = N / 2;\n }\n }\n *kthSmallest = kthSmallest[0];\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n int local_kthSmallest = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < local_kthSmallest) {\n local_kthSmallest = x[i];\n }\n }\n __shared__ int shared_kthSmallest;\n shared_kthSmallest = local_kthSmallest;\n for (int i = hipBlockDim_x / 2; i > 0; i /= 2) {\n __syncthreads();\n if (hipThreadIdx_x < i) {\n if (shared_kthSmallest > x[tid + i]) {\n shared_kthSmallest = x[tid + i];\n }\n }\n }\n __syncthreads();\n if (hipThreadIdx_x == 0) {\n if (k == 0) {\n *kthSmallest = shared_kthSmallest;\n } else {\n k--;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n 
kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_ranks);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_ranks);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int local_kth = x[k - 1];\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n MPI_Allreduce(&local_kth, &kth, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return kth;\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth_smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n\n int global_kth_smallest;\n 
MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements to rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among all the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the k-th smallest elements from all ranks\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int n = x.size();\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kthSmallest = x[k - 1];\n MPI_Gather(&kthSmallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (myRank == 0) {\n std::sort(kthSmallest, kthSmallest + numRanks);\n return kthSmallest[k - 1];\n }\n 
return -1;\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + ".", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(nprocs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1; // Return -1 on all other ranks\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int n = x.size();\n int local_k = k / num_procs;\n int local_start = my_rank * local_k;\n int local_end = (my_rank + 1) * local_k;\n if (my_rank == num_procs - 1) {\n local_end = n;\n }\n\n std::nth_element(x.begin() + local_start, x.begin() + local_start + local_k - 1,\n x.begin() + local_end);\n int local_kth_smallest = x[local_start + local_k - 1];\n\n int global_kth_smallest;\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN,\n MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(comm_size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (comm_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the k-th smallest element on each rank\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n int local_kth = x[k - 1];\n\n // Gather the k-th smallest elements from all ranks\n std::vector gathered_kth(size);\n MPI_Gather(&local_kth, 1, MPI_INT, gathered_kth.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element\n if (rank == 0) {\n std::nth_element(gathered_kth.begin(), gathered_kth.begin() + k - 1, gathered_kth.end());\n return gathered_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int n = x.size();\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes its local k-th smallest element\n std::sort(x.begin(), x.end());\n int local_kth_smallest = x[k - 1];\n\n // Gather all local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "cuda", + "prompt": "/* Find the k-th smallest element of the vector x.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\n__global__ void findKthSmallest(const int *x, size_t N, int k, int *kthSmallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n int val = x[gid];\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < val) {\n count++;\n }\n }\n if (count == k - 1) {\n *kthSmallest = val;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0064833768, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00879649276733 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[0] = min(kthSmallest[0], x[tid]);\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n for (int i = 1; i < blockDim.x; i++) {\n kthSmallest[0] = min(kthSmallest[0], kthSmallest[i]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = tid + blockId * blockDim;\n int end = min(start + blockDim, N);\n int localKthSmallest = x[start];\n for (int i = start + 1; i < end; i++) {\n if (x[i] < localKthSmallest) {\n localKthSmallest = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n int globalKthSmallest = localKthSmallest;\n for (int i = 1; i < gridDim; i++) {\n int otherKthSmallest;\n cudaMemcpy(&otherKthSmallest, &globalKthSmallest, sizeof(int), cudaMemcpyDeviceToDevice);\n if (otherKthSmallest < globalKthSmallest) {\n globalKthSmallest = otherKthSmallest;\n }\n }\n *kthSmallest = globalKthSmallest;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int blockDim = blockDim.x;\n int blockIdx = blockIdx.x;\n int i = blockIdx * blockDim + tid;\n if (i < N) {\n kthSmallest[tid] = x[i];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + blockDim);\n if (blockIdx == 0) {\n *kthSmallest = kthSmallest[k - 1];\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + 
"did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n __shared__ int shared[blockDim.x];\n shared[threadIdx.x] = x[tid];\n __syncthreads();\n for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {\n if (threadIdx.x < stride) {\n shared[threadIdx.x] = min(shared[threadIdx.x], shared[threadIdx.x + stride]);\n }\n __syncthreads();\n }\n if (threadIdx.x == 0) {\n if (k == blockIdx.x) {\n *kthSmallest = shared[0];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n int temp;\n for (int i = 1; i < blockDim.x; i++) {\n temp = *kthSmallest;\n *kthSmallest = min(*kthSmallest, x[i]);\n x[i] = min(temp, x[i]);\n }\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n temp = *kthSmallest;\n *kthSmallest = min(*kthSmallest, x[i]);\n x[i] = min(temp, x[i]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int idx = bid * bid_size + tid;\n \n // Find the k-th smallest element in the current block\n int block_kth_smallest = x[idx];\n for (int i = idx + bid_size; i < N; i += grid_size * bid_size) {\n if (x[i] < block_kth_smallest) {\n block_kth_smallest = x[i];\n }\n }\n \n // Find the k-th smallest element among all blocks\n if (tid == 0) {\n for (int i = 1; i < grid_size; i++) {\n int other_block_kth_smallest;\n cudaMemcpy(&other_block_kth_smallest, &x[i * bid_size], sizeof(int), cudaMemcpyDeviceToDevice);\n if (other_block_kth_smallest < block_kth_smallest) {\n block_kth_smallest = other_block_kth_smallest;\n }\n }\n *kthSmallest = block_kth_smallest;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[0] = min(kthSmallest[0], x[tid]);\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < blockDim.x; i++) {\n kthSmallest[0] = min(kthSmallest[0], kthSmallest[i]);\n }\n }\n}\n", + "source_write_success": true, + 
"did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int blockDim = blockDim.x;\n int blockIdx = blockIdx.x;\n int idx = blockIdx * blockDim + tid;\n if (idx < N) {\n if (tid == 0) {\n *kthSmallest = x[idx];\n } else {\n if (x[idx] < *kthSmallest) {\n *kthSmallest = x[idx];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[0] = x[tid];\n }\n __syncthreads();\n int numThreads = blockDim.x * gridDim.x;\n while (numThreads > 1) {\n if (tid < numThreads / 2) {\n kthSmallest[tid] = min(kthSmallest[tid], kthSmallest[tid + numThreads / 2]);\n }\n __syncthreads();\n numThreads /= 2;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int val = x[tid];\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < val) {\n count++;\n }\n }\n if (count == k - 1) {\n *kthSmallest = val;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0064970182, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00957434883118 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n __shared__ int shared[blockDim.x];\n shared[threadIdx.x] = x[tid];\n __syncthreads();\n for (int i = 0; i < blockDim.x; i++) {\n if (shared[threadIdx.x] > shared[i]) {\n shared[threadIdx.x] = shared[i];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n if (k == blockIdx.x + 1) {\n *kthSmallest = shared[threadIdx.x];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int my_kth_smallest = x[tid];\n for (int i = tid + 1; i < N; i++) {\n if (x[i] < my_kth_smallest) {\n my_kth_smallest = x[i];\n }\n }\n atomicMin(kthSmallest, my_kth_smallest);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n int tid_warp = tid % 32;\n int warp_id = tid / 32;\n int warp_kthSmallest = 
*kthSmallest;\n for (int i = 1; i < 32; i *= 2) {\n __syncthreads();\n if (tid_warp < i) {\n if (warp_kthSmallest > x[tid + i * blockDim.x]) {\n warp_kthSmallest = x[tid + i * blockDim.x];\n }\n }\n }\n __syncthreads();\n if (warp_id == 0) {\n *kthSmallest = warp_kthSmallest;\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n int other_kthSmallest;\n cudaMemcpy(&other_kthSmallest, &x[i * blockDim.x], sizeof(int), cudaMemcpyDeviceToDevice);\n if (*kthSmallest > other_kthSmallest) {\n *kthSmallest = other_kthSmallest;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n if (tid == k - 1) {\n *kthSmallest = x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n int temp;\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (x[i] > x[j]) {\n temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n *kthSmallest = x[k - 1];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int start_idx = bid * bid_size + tid;\n int end_idx = min(start_idx + bid_size, N);\n int local_kth_smallest = x[start_idx];\n for (int i = start_idx + 1; i < end_idx; i++) {\n if (x[i] < local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n atomicMin(kthSmallest, local_kth_smallest);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Find the k-th smallest element of the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth = x[k - 1];\n int global_kth;\n\n // Find the k-th smallest element on each rank\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < local_kth) {\n local_kth = x[i];\n }\n }\n }\n\n // Reduce the local k-th smallest elements to find the global k-th smallest element\n MPI_Allreduce(&local_kth, &global_kth, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n // Return the global k-th smallest element on rank 0\n if (rank == 0) {\n return global_kth;\n } else {\n return -1;\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector to store the sorted elements\n std::vector sorted_x(n);\n\n // Sort the vector in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n sorted_x[i] = x[i];\n }\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Get the k-th smallest element\n int kth_smallest = sorted_x[k - 1];\n\n // Return the k-th smallest element on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n return all_kth_smallest[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the global k-th smallest element on rank 0\n return global_kth_smallest;\n}\n", + "\n\n int n = x.size();\n int kth = -1;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_k = k / size;\n int local_start = rank * local_k;\n int local_end = (rank + 1) * local_k;\n if (local_end > n) {\n local_end = n;\n }\n std::nth_element(x.begin() + local_start, x.begin() + local_start + local_k - 1, x.begin() + local_end);\n int local_kth = x[local_start + local_k - 1];\n MPI_Gather(&local_kth, 1, MPI_INT, &kth, 1, MPI_INT, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n std::vector kth_vec(size);\n for (int i = 0; i < size; i++) {\n 
kth_vec[i] = kth[i];\n }\n std::nth_element(kth_vec.begin(), kth_vec.begin() + k - 1, kth_vec.end());\n kth = kth_vec[k - 1];\n }\n return kth;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kth_smallest;\n MPI_Gather(&x[k - 1], 1, MPI_INT, &kth_smallest, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the local kth smallest element\n std::nth_element(x.begin() + rank * x.size() / size,\n x.begin() + rank * x.size() / size + x.size() / size - 1,\n x.begin() + rank * x.size() / size + k - 1);\n int local_kth = *(x.begin() + rank * x.size() / size + k - 1);\n\n // Gather all local kth smallest elements\n std::vector all_local_kth(size);\n MPI_Gather(&local_kth, 1, MPI_INT, all_local_kth.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global kth smallest element\n if (rank == 0) {\n std::nth_element(all_local_kth.begin(), all_local_kth.begin() + k - 1, all_local_kth.end());\n return all_local_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_k = k / size;\n int local_remainder = k % size;\n int local_start = rank * local_k;\n int local_end = local_start + local_k;\n if (rank < local_remainder) {\n local_start += rank;\n local_end += rank + 1;\n } else {\n local_start += local_remainder;\n local_end += local_remainder;\n }\n\n std::vector local_x(x.begin() + local_start, x.begin() + local_end);\n std::sort(local_x.begin(), local_x.end());\n\n int local_kth_smallest = local_x[local_k - 1];\n if (rank == 0) {\n std::vector kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest.begin(), kth_smallest.end());\n return kth_smallest[k - 1];\n } else {\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th 
smallest element to rank 0\n int kth_smallest;\n MPI_Gather(&x[k - 1], 1, MPI_INT, &kth_smallest, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (rank == 0) {\n std::vector received_elements(size);\n MPI_Gather(&x[k - 1], 1, MPI_INT, received_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(received_elements.begin(), received_elements.end());\n return received_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the global k-th smallest element on rank 0\n return global_kth_smallest;\n}\n", + "\n\n // Find the local k-th smallest element on each rank.\n int local_kth_smallest = *std::max_element(x.begin(), x.end());\n int global_kth_smallest;\n\n // Find the global k-th smallest element.\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN);\n\n // Find the k-th smallest element on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n } else {\n return global_kth_smallest;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the result on rank 0\n return global_kth_smallest;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int local_k = k / nprocs;\n int local_remainder = k % nprocs;\n\n // Find the local k-th smallest element on each rank\n std::nth_element(x.begin(), x.begin() + local_k + (rank == nprocs - 1 ? local_remainder : 0), x.end());\n int local_kth_smallest = x[local_k + (rank == nprocs - 1 ? 
local_remainder : 0)];\n\n // Gather all local k-th smallest elements on rank 0\n std::vector kth_smallest_elements(nprocs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (rank == 0) {\n std::nth_element(kth_smallest_elements.begin(), kth_smallest_elements.begin() + k - 1, kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth = x[0];\n int global_kth;\n\n // Find the k-th smallest element on each rank\n #pragma omp parallel reduction(min : local_kth)\n {\n #pragma omp for\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < local_kth) {\n local_kth = x[i];\n }\n }\n }\n\n // Gather the local k-th smallest elements from all ranks\n MPI_Gather(&local_kth, 1, MPI_INT, &global_kth, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element\n if (rank == 0) {\n std::vector gathered_kth(size);\n for (int i = 0; i < size; i++) {\n gathered_kth[i] = global_kth[i];\n }\n std::sort(gathered_kth.begin(), gathered_kth.end());\n return gathered_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector gathered_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, gathered_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(gathered_elements.begin(), gathered_elements.end());\n global_kth_smallest = gathered_elements[k - 1];\n }\n\n return global_kth_smallest;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x.\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0.\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element.\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n return all_kth_smallest[k - 1];\n }\n\n return -1; // Dummy return value for non-rank-0 processes.\n}\n", + "\n\n int 
my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int local_k = k / num_ranks;\n int local_start = my_rank * local_k;\n int local_end = (my_rank + 1) * local_k;\n\n std::sort(x.begin() + local_start, x.begin() + local_end);\n\n int local_kth_smallest = x[local_start + local_k - 1];\n\n int global_kth_smallest;\n MPI_Reduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the k-th smallest element of its local data\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n int local_kth_smallest = x[k - 1];\n\n // Gather all local k-th smallest elements to rank 0\n std::vector all_local_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_local_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among all local k-th smallest elements\n if (rank == 0) {\n std::nth_element(all_local_kth_smallest.begin(), all_local_kth_smallest.begin() + k - 1, all_local_kth_smallest.end());\n return all_local_kth_smallest[k - 1];\n } else {\n return -1; // Dummy return value for non-rank-0 processes\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use Kokkos to sort in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(Kokkos::View &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA (const int& i, const int& j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n using exec_space = KokKOS::EXECSPACE;\n using 
device_type = KokKOS::DEVICE_TYPE;\n\n auto policy = KokKOS::parallel_sort_ascending(results,\n [](const Result &a, const Result &b) { return a.duration < b.duration; },\n exec_space(),\n device_type());\n auto result = KokKOS::sort_view(results, policy);\n results.copy(result);\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) -> bool { return results(i)->startTime < results(j)->startTime; });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int i, const int j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int& i, const int& j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::sort(results, Kokkos::Experimental::compare_fun(\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; }));\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, results.extent(0)),\n [results](const int &i, const int &j) { return results(i)->startTime < results(j)->startTime; });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use OpenMP to sort in parallel.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + ".", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = 0; j < results.size() - 1; j++) {\n if (results[j].startTime > results[j + 1].startTime) {\n std::swap(results[j], results[j + 
1]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < static_cast(results.size()); i++) {\n for (int j = i + 1; j < static_cast(results.size()); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size() - 1; i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n#pragma opm parallel\n#pragma opm reduction(merge:results:opm_sort(results, [](const Result& a, const Result& b) { return a.startTime < b.startTime; }))\n{\n#pragma opm sections\n{\n#pragma opm section\n{\n std::sort(results.begin(), results.end(), [](const Result& a, const Result& b) { return a.startTime < b.startTime; });\n}\n}\n}\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size() - 1; i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma opm parallel\n {\n #pragma opm for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "serial", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "<*>", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.start < b.start;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n 
std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "hip", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use AMD HIP to sort in parallel. The kernel is launched with at least as many threads as there are elements.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\n__global__ void sortByStartTime(Result *results, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) {\n return;\n }\n\n Result temp = results[idx];\n int minIdx = idx;\n for (int i = idx + 1; i < N; i++) {\n if (results[i].startTime < results[minIdx].startTime) {\n minIdx = i;\n }\n }\n\n results[idx] = results[minIdx];\n results[minIdx] = temp;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid >= N) return;\n for (int i = gid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n}\n", + "\n\n int tid = 
hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n __shared__ Result sharedResults[256];\n sharedResults[tid] = results[gid];\n __syncthreads();\n int stride = 256;\n while (stride > 0) {\n if (tid < stride && tid + stride < N && sharedResults[tid].startTime > sharedResults[tid + stride].startTime) {\n Result temp = sharedResults[tid];\n sharedResults[tid] = sharedResults[tid + stride];\n sharedResults[tid + stride] = temp;\n }\n __syncthreads();\n stride /= 2;\n }\n results[gid] = sharedResults[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid 
>= N) {\n return;\n }\n for (int i = gid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use MPI to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of results. Store the output in results on rank 0.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n results = gatheredResults;\n } else {\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, 
const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.start < b.start; });\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n std::vector gatheredResults;\n for (int i = 1; i < size; i++) {\n int numResults;\n MPI_Recv(&numResults, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n std::vector recvResults(numResults);\n MPI_Recv(recvResults.data(), numResults * sizeof(Result), MPI_BYTE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n gatheredResults.insert(gatheredResults.end(), recvResults.begin(), recvResults.end());\n }\n results = gatheredResults;\n } else {\n // Send the number of results and the results to rank 0\n int numResults = results.size();\n MPI_Send(&numResults, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);\n MPI_Send(results.data(), numResults * sizeof(Result), MPI_BYTE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) 
{\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(size * results.size());\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), 
results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the sorted results on rank 0\n if (rank == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", 
+ "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort locally\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather all results to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_DOUBLE, gatheredResults.data(), results.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Sort gathered results\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "cuda", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use CUDA to sort in parallel. 
The kernel is launched with at least as many threads as there are elements.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\n__global__ void sortByStartTime(Result *results, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = 
results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[i].startTime < results[tid].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": ".", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result tmp = results[tid];\n results[tid] = results[i];\n results[i] = tmp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 
1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use MPI and OpenMP to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of results. Store the outputs in results on rank 0.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks on rank 0\n std::vector allResults(results.size() * numRanks);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, allResults.data(), results.size(), MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n\n // Sort all results on rank 0\n if (myRank == 0) {\n std::sort(allResults.begin(), allResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = allResults;\n }\n}\n", + "\n\n int numRanks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather all results to rank 0\n std::vector allResults;\n if (rank == 0) {\n allResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, allResults.data(), results.size(),\n MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort all results on rank 0\n if (rank == 0) {\n std::sort(allResults.begin(), allResults.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = allResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(size * results.size());\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, gatheredResults.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), 
gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the gathered results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, gatheredResults.data(), results.size(), MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, nullptr, 0, MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, gatheredResults.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results = gatheredResults;\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the outputs in results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all 
ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Sort results by start time using MPI_Allgather and OpenMP\n int numRanks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort results on each rank\n #pragma openmp parallel\n {\n #pragma openmp for\n for (int i = 0; i < results.size(); i++) {\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n }\n }\n\n // Gather sorted results on rank 0\n std::vector sortedResults;\n if (rank == 0) {\n int totalSize = results.size() * numRanks;\n MPI_Gather(&results[0], results.size(), MPI_DATATYPE_RESULT, &results[0], results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results.resize(totalSize);\n } else {\n MPI_Gather(&results[0], results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), 
gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Get the number of processes\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Get the rank of the process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the results on each process\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather the sorted results from all processes\n std::vector sorted_results(results.size() * num_procs);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT,\n sorted_results.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n\n // Copy the sorted results back to the input vector on rank 0\n if (rank == 0) {\n results = std::move(sorted_results);\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the gathered results in results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recvResults;\n MPI_Recv(&recvResults, results.size(), MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n results.insert(results.end(), recvResults.begin(), recvResults.end());\n }\n } else {\n MPI_Send(&results, results.size(), MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Sort the array x of complex numbers by their magnitude in ascending order.\n Use Kokkos to sort in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), [&](int i, int j) -> bool {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use OpenMP to sort in parallel.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 
0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Sort the vector in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the number of threads\n int num_threads = omp_get_num_threads();\n\n // Get the thread ID\n int thread_id = omp_get_thread_num();\n\n // Sort the portion of the vector assigned to this thread\n std::sort(x.begin() + thread_id, x.begin() + thread_id + num_threads);\n }\n\n // Merge the sorted portions of the vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "Magnitude:1. Moderato Moderato Moderato: 0. Moderato: Moderato: Moderato:forte forte forte piano piano piano piano piano piano piano: piano: piano: Moderato: piano: piano: forte forte forte: forte forte forte: forte: forte: piano piano piano piano piano piano: Piano: Piano: Moderato: Moderato: Moderato: piano: piano piano: forte forte forte: forte: forte: forte: Moderato:", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::polar(std::abs(x[i]), std::arg(x[i]));\n }\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n for (size_t j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::complex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "(x)", + "\n\n // Sort the vector x by the magnitude of its elements in ascending order.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < 
x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::complex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n // Sort the vector x by the magnitude of its elements in ascending order.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "serial", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + ":", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return abs(a) < abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), 
x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "hip", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use AMD HIP to sort in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\n__global__ void sortComplexByMagnitude(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double mag = hipCrealf(x[tid]) * hipCrealf(x[tid]) + hipCrealf(x[tid]) * hipCrealf(x[tid]);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n double temp = x[tid].x;\n x[tid].x = x[i].x;\n x[i].x = temp;\n hipDoubleComplex temp_c = x[tid];\n x[tid] = x[i];\n x[i] = temp_c;\n }\n }\n }\n}\n", + "\n\n // Compute the magnitude of each complex number\n float *mag = new float[N];\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes\n std::sort(mag, mag + N);\n\n // Sort the complex numbers by their magnitudes\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (mag[i] == sqrt(x[j].x * x[j].x + x[j].y * x[j].y)) {\n x[i] = x[j];\n break;\n }\n }\n }\n\n delete[] mag;\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Get the block index\n int bid = hipBlockIdx_x;\n\n // Calculate the index of the element to be sorted\n int i = bid * hipBlockDim_x + tid;\n\n // Check if the index is within the bounds of the array\n if (i < N) {\n // Get the magnitude of the complex number\n double magnitude = hip_abs(x[i]);\n\n // Sort the complex number by its magnitude\n for (int j = i + 1; j < N; j++) {\n double magnitude2 = hip_abs(x[j]);\n if (magnitude > magnitude2) {\n // Swap the complex numbers\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n\n // Update the magnitude of the complex number\n magnitude = magnitude2;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double mag = hip_abs(x[gid]);\n x[gid] = make_hipDoubleComplex(mag, 0.0);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double magnitude = hip_abs(x[tid]);\n x[tid] = make_hipDoubleComplex(magnitude, 0.0);\n }\n __syncthreads();\n hipSort(x, x + N, hipDoubleComplexGreater);\n __syncthreads();\n if (tid < N) {\n double real = hip_real(x[tid]);\n double imag = hip_imag(x[tid]);\n x[tid] = make_hipDoubleComplex(real, imag);\n }\n}\n", + 
"\n\n // Get thread and block indices\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n\n // Compute the index of the element to be processed by this thread\n int index = bid * bid_size + tid;\n\n // If the index is out of range, exit\n if (index >= N) {\n return;\n }\n\n // Get the current element\n hipDoubleComplex current_element = x[index];\n\n // Compute the magnitude of the current element\n double magnitude = hip_abs(current_element);\n\n // Find the index of the element with the smallest magnitude so far\n int min_index = index;\n for (int i = index + 1; i < N; i++) {\n double current_magnitude = hip_abs(x[i]);\n if (current_magnitude < magnitude) {\n min_index = i;\n magnitude = current_magnitude;\n }\n }\n\n // Swap the current element with the element with the smallest magnitude so far\n hipDoubleComplex temp = x[index];\n x[index] = x[min_index];\n x[min_index] = temp;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = hipMag(x[tid]);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n hipSort(x, x + N, hipDoubleComplexCompare);\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n hipDoubleComplex c = x[tid];\n x[tid] = hipDoubleComplex{sqrt(c.x * c.x + c.y * c.y), 0.0};\n }\n}\n", + "\n\n // Sort the array using the radix sort algorithm\n for (int i = 0; i < N; i++) {\n int key = abs(x[i]);\n int j = i - 1;\n\n while (j >= 0 && abs(x[j]) > key) {\n x[j + 1] = x[j];\n j--;\n }\n x[j + 1] = hipDoubleComplex{key, 0};\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Sort the local part of the array\n std::sort(x + tid, x + tid + hipBlockDim_x);\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted parts of the array\n for (int stride = hipBlockDim_x; stride < N; stride *= 2) {\n for (int i = 0; i < N; i += 2 * stride) {\n if (hipAbs(x[i]) > hipAbs(x[i + stride])) {\n std::swap(x[i], x[i + stride]);\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Get the block index\n int bid = hipBlockIdx_x;\n\n // Calculate the index of the element to be sorted\n int index = bid * hipBlockDim_x + tid;\n\n // Check if the index is within the bounds of the array\n if (index < N) {\n // Get the magnitude of the complex number\n double magnitude = hipMag(x[index]);\n\n // Sort the complex number by its magnitude\n for (int i = index + 1; i < N; i++) {\n double magnitude_i = hipMag(x[i]);\n if (magnitude > magnitude_i) {\n // Swap the complex numbers\n hipDoubleComplex temp = x[index];\n x[index] = x[i];\n x[i] = temp;\n\n // Update the magnitude of the complex number\n magnitude = magnitude_i;\n }\n }\n }\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Sort the local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + hipBlockDim_x; j < N; 
j += hipBlockDim_x) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n // Compute the magnitude of each complex number\n float *mag = new float[N];\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes in ascending order\n std::sort(mag, mag + N);\n\n // Sort the complex numbers by their magnitude\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (mag[i] == sqrt(x[j].x * x[j].x + x[j].y * x[j].y)) {\n x[i] = x[j];\n break;\n }\n }\n }\n\n delete[] mag;\n}\n", + "\n\n // Calculate the magnitude of each complex number\n float *mag = (float *)malloc(N * sizeof(float));\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrtf(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes using the hipSort function\n hipSort(mag, N, hipFloat);\n\n // Sort the complex numbers based on the sorted magnitudes\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (mag[i] > mag[j]) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n free(mag);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double magnitude = hipMagnitude(x[tid]);\n x[tid].x = magnitude;\n }\n __syncthreads();\n if (tid < N) {\n hipSort(x, N, sizeof(hipDoubleComplex), hipDoubleComplexMagnitudeCompare);\n }\n}\n", + "\n\n // Sort the vector x by magnitude in ascending order.\n // Use AMD HIP to sort in parallel. The kernel is launched with at least as many threads as elements in x.\n // Example:\n //\n // input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n // output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n\n // Get the thread index.\n int tid = hipThreadIdx_x;\n\n // Get the block index.\n int bid = hipBlockIdx_x;\n\n // Get the block size.\n int bsize = hipBlockDim_x;\n\n // Get the grid size.\n int gsize = hipGridDim_x;\n\n // Calculate the starting index for this thread.\n int start = tid + bid * bsize;\n\n // Calculate the ending index for this thread.\n int end = min(start + bsize, N);\n\n // Sort the elements in this thread's range.\n sort(x + start, x + end, [](hipDoubleComplex a, hipDoubleComplex b) { return abs(a) < abs(b); });\n\n // Synchronize all threads.\n hipDeviceSynchronize();\n\n // If this is the first thread in the grid, merge the sorted ranges.\n if (tid == 0) {\n for (int i = 1; i < gsize; i++) {\n // Get the starting index for this thread.\n int start = i * bsize;\n\n // Get the ending index for this thread.\n int end = min(start + bsize, N);\n\n // Merge the sorted ranges.\n merge(x, x + start, x + start, x + end, x);\n }\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Get the block ID\n int bid = hipBlockIdx_x;\n\n // Get the block size\n int bsize = hipBlockDim_x;\n\n // Get the number of blocks\n int nblocks = hipGridDim_x;\n\n // Calculate the starting index for this thread\n int start = tid + bid * bsize;\n\n // Calculate the ending index for this thread\n int end = min(start + bsize, N);\n\n // Sort the local portion of the array\n sort(x + start, x + end, [](hipDoubleComplex a, hipDoubleComplex b) {\n return abs(a) < abs(b);\n });\n\n // Synchronize all threads\n hipDeviceSynchronize();\n\n // Merge the sorted local portions of the array\n for (int i = 1; i < nblocks; i *= 2) {\n if (bid % (2 * i) == 0) {\n int partner = bid + i;\n if (partner < nblocks) {\n int start = i * bsize;\n int end = 
min(start + 2 * bsize, N);\n merge(x + start, x + start + bsize, x + start + bsize, x + end,\n x + start);\n }\n }\n hipDeviceSynchronize();\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Sort the local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted local elements\n for (int s = 1; s < N; s *= 2) {\n for (int i = tid; i < N; i += 2 * s) {\n int j = i + s;\n if (j < N && hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ hipDoubleComplex block[1024];\n block[threadIdx.x] = x[threadIdx.x];\n __syncthreads();\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(block[i]) > hipAbs(block[j])) {\n hipDoubleComplex temp = block[i];\n block[i] = block[j];\n block[j] = temp;\n }\n }\n }\n __syncthreads();\n\n // Copy the sorted elements back to the global memory\n x[threadIdx.x] = block[threadIdx.x];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use MPI to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered;\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, store the gathered vector in x\n if (rank == 0) {\n x = gathered;\n }\n}\n", + "\n\n // Get the size and rank of the current process\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the local vector of complex numbers\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return 
std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector to rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vectors on rank 0\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vectors from rank 0 to all ranks\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n 
MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::sort(x.begin(), x.end(), [&magnitudes](std::complex a, std::complex b) {\n return magnitudes[a] < magnitudes[b];\n });\n\n // Gather the sorted vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector> recv;\n MPI::COMM_WORLD.Recv(recv, i, 0);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI::COMM_WORLD.Send(x, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Collect the sorted vectors on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(&recv, x.size(), MPI_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 
0) {\n std::vector> sorted_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = std::move(sorted_x);\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector to rank 0\n MPI_Scatter(gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Replace the original vector on rank 0 with the sorted vector\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the size and rank of the communicator\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted;\n if (rank == 0) {\n sorted.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, comm);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(sorted.begin(), sorted.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(sorted.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, comm);\n}\n", + "\n\n // Get the size and rank of the communicator\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::vector> sorted_x(n);\n for (int i = 0; i < n; i++) {\n for (int j = 0; j < n; j++) {\n if (std::abs(x[j]) == magnitudes[i]) {\n sorted_x[i] = x[j];\n break;\n }\n }\n }\n\n // Gather the sorted vector on rank 0\n MPI_Gather(&sorted_x[0], n, MPI_DOUBLE_COMPLEX, &x[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n // Get the magnitude of the complex number\n double magnitude = std::abs(x[i]);\n // Store the magnitude in the vector\n mag[i] = magnitude;\n }\n\n // Sort the vector of magnitudes\n std::sort(mag.begin(), mag.end());\n\n // Sort the vector of complex numbers by magnitude\n std::sort(x.begin(), x.end(), [&](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::vector> sorted_x(n);\n for (int i = 0; i < n; i++) {\n for (int j = 0; j < n; j++) {\n if (std::abs(x[j]) == magnitudes[i]) {\n sorted_x[i] = x[j];\n break;\n }\n }\n }\n\n // Gather the sorted vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector> recv_x(n);\n MPI::COMM_WORLD.Recv(recv_x.data(), n, MPI::DOUBLE, i, 0);\n sorted_x.insert(sorted_x.end(), recv_x.begin(), recv_x.end());\n }\n } else {\n MPI::COMM_WORLD.Send(sorted_x.data(), n, MPI::DOUBLE, 0, 0);\n }\n\n // Update the vector x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n x = sorted_x;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "cuda", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use CUDA to sort in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\n__global__ void sortComplexByMagnitude(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(mag, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n cuDoubleComplex tmp = x[tid];\n int i = tid;\n while (i > 0 && cuCabs(x[i - 1]) > cuCabs(tmp)) {\n x[i] = x[i - 1];\n i--;\n }\n x[i] = tmp;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid].x = x[tid].y;\n x[tid].y = 0.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Sort the elements in the current block\n __shared__ cuDoubleComplex block_data[256];\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int block_size = blockDim.x;\n int start = bid * block_size;\n int end = min(start + block_size, N);\n for (int i = start; i < end; i++) {\n block_data[tid] = x[i];\n __syncthreads();\n for (int j = tid + 1; j < block_size; j++) {\n if (abs(block_data[tid]) > abs(block_data[j])) {\n cuDoubleComplex temp = block_data[tid];\n block_data[tid] = block_data[j];\n block_data[j] = temp;\n }\n }\n __syncthreads();\n }\n\n // Copy the sorted elements back to the global memory\n for (int i = start; i < end; i++) {\n x[i] = block_data[tid];\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get the thread ID\n int tid = threadIdx.x;\n\n // Sort the local part of the array\n thrust::sort(x + tid, x + tid + min(N - tid, blockDim.x),\n [](cuDoubleComplex a, cuDoubleComplex b) { return cuCabs(a) < cuCabs(b); });\n\n // Synchronize threads\n __syncthreads();\n\n // Merge the sorted parts of the array\n for (int s = blockDim.x; s <= N; s *= 2) {\n if (tid < N - s) {\n cuDoubleComplex a = x[tid];\n cuDoubleComplex b = x[tid + s];\n if (cuCabs(a) > cuCabs(b)) {\n x[tid] = b;\n x[tid + s] = a;\n }\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = cuCabs(x[tid]);\n x[tid] = cuCmul(x[tid], cuCdiv(1.0, magnitude));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get the thread index\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n\n // If the thread index is within the bounds of the array, sort the element\n if (tid < N) {\n // Calculate the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n\n // Sort the element by magnitude\n for (int i = tid + 1; i < N; i++) {\n double otherMagnitude = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n if (magnitude > otherMagnitude) {\n // Swap the elements\n cuDoubleComplex temp = x[tid];\n x[tid] = x[i];\n x[i] = temp;\n\n // Update the magnitude of the current element\n magnitude = otherMagnitude;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = mag;\n x[tid].y = 0.0;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid].x = creal(x[tid]);\n x[tid].y = cimag(x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n while (i < N) {\n cuDoubleComplex temp;\n for (int j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += blockDim.x * gridDim.x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = magnitude;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n double temp = x[tid].x;\n x[tid].x = x[i].x;\n x[i].x = temp;\n cuDoubleComplex temp2 = x[tid];\n x[tid] = x[i];\n x[i] = temp2;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } 
+ ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n for (int i = tid + 1; i < N; i++) {\n double mag_i = cuCabs(x[i]);\n if (mag > mag_i) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n mag = mag_i;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= N) return;\n \n // Compute the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n \n // Store the magnitude in shared memory\n __shared__ double magnitudes[N];\n magnitudes[tid] = magnitude;\n \n // Synchronize threads\n __syncthreads();\n \n // Sort the magnitudes in shared memory\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (magnitudes[i] > magnitudes[j]) {\n double temp = magnitudes[i];\n magnitudes[i] = magnitudes[j];\n magnitudes[j] = temp;\n }\n }\n }\n \n // Synchronize threads\n __syncthreads();\n \n // Find the index of the complex number with the given magnitude\n int index = -1;\n for (int i = 0; i < N; i++) {\n if (magnitudes[i] == magnitude) {\n index = i;\n break;\n }\n }\n \n // Swap the complex numbers\n cuDoubleComplex temp = x[tid];\n x[tid] = x[index];\n x[index] = temp;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= N) return;\n \n double magnitude = cuCabs(x[tid]);\n x[tid] = cuCmul(x[tid], cuCdiv(1.0, magnitude));\n \n __shared__ double shared_magnitude[blockDim.x];\n shared_magnitude[threadIdx.x] = magnitude;\n __syncthreads();\n \n for (int i = 0; i < blockDim.x; i++) {\n if (tid + i + 1 < N && shared_magnitude[tid] > shared_magnitude[tid + i + 1]) {\n double temp = shared_magnitude[tid];\n shared_magnitude[tid] = shared_magnitude[tid + i + 1];\n shared_magnitude[tid + i + 1] = temp;\n \n cuDoubleComplex temp_complex = x[tid];\n x[tid] = x[tid + i + 1];\n x[tid + i + 1] = temp_complex;\n }\n }\n \n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Each thread sorts a single element of x.\n int tid = threadIdx.x;\n if (tid < N) {\n // Compute the magnitude of the complex number.\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n // Store the magnitude in the complex number.\n x[tid].x = magnitude;\n }\n // Sort the complex numbers by their magnitude.\n thrust::sort(x, x + N, thrust::less());\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the elements in the current block\n __shared__ cuDoubleComplex block[32];\n 
block[threadIdx.x] = x[blockIdx.x * blockDim.x + threadIdx.x];\n __syncthreads();\n for (int i = 0; i < blockDim.x; i++) {\n for (int j = i + 1; j < blockDim.x; j++) {\n if (cuCabsf(block[i]) > cuCabsf(block[j])) {\n cuDoubleComplex temp = block[i];\n block[i] = block[j];\n block[j] = temp;\n }\n }\n }\n __syncthreads();\n\n // Copy the sorted elements back to the global memory\n x[blockIdx.x * blockDim.x + threadIdx.x] = block[threadIdx.x];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n cuDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += blockDim.x * gridDim.x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Each thread sorts a single element\n int tid = threadIdx.x;\n if (tid < N) {\n // Compute the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n // Store the magnitude in shared memory\n __shared__ double magnitudes[N];\n magnitudes[tid] = magnitude;\n // Synchronize threads\n __syncthreads();\n // Sort the magnitudes in shared memory\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (magnitudes[i] > magnitudes[j]) {\n double temp = magnitudes[i];\n magnitudes[i] = magnitudes[j];\n magnitudes[j] = temp;\n }\n }\n }\n // Synchronize threads\n __syncthreads();\n // Find the index of the sorted magnitude in the original array\n int index = -1;\n for (int i = 0; i < N; i++) {\n if (magnitudes[tid] == sqrt(x[i].x * x[i].x + x[i].y * x[i].y)) {\n index = i;\n break;\n }\n }\n // Swap the complex number at the current index with the complex number at the sorted index\n if (index != -1) {\n cuDoubleComplex temp = x[tid];\n x[tid] = x[index];\n x[index] = temp;\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(magnitude, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n x[tid] = make_cuDoubleComplex(creal(x[tid]), cimag(x[tid]));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get the index of the current element\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n\n // If the index is out of bounds, return\n if (idx >= N) {\n return;\n }\n\n // Get the magnitude of the current element\n double magnitude = abs(x[idx]);\n\n // Find the index of the element with the smallest magnitude\n int minIdx = idx;\n for (int i = idx 
+ 1; i < N; i++) {\n double otherMagnitude = abs(x[i]);\n if (otherMagnitude < magnitude) {\n minIdx = i;\n magnitude = otherMagnitude;\n }\n }\n\n // Swap the current element with the element with the smallest magnitude\n if (minIdx != idx) {\n double temp = x[idx];\n x[idx] = x[minIdx];\n x[minIdx] = temp;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the complex numbers by their magnitude in ascending order.\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n cuDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(mag, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n int i = tid;\n while (i > 0 && cuCreal(x[i]) < cuCreal(x[i - 1])) {\n swap(x[i], x[i - 1]);\n i--;\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid] = make_cuDoubleComplex(cuCreal(x[tid]), cuCimag(x[tid]));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use MPI and OpenMP to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Store the result in x on rank 0.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector x on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector of complex numbers.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(size * x.size());\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int 
num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector x on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector x.\n if (rank != 0) {\n x.clear();\n }\n\n // Copy the sorted vector to x on rank 0.\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Sort the vector x on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vector x from all ranks on rank 0\n std::vector> sorted_x(x.size() * nranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_PRECISION, sorted_x.data(), x.size(), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector x on rank 0\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Scatter the sorted vector x from rank 0 to all ranks\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_PRECISION, x.data(), x.size(), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const 
std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the local vector.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all other ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int nprocs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * nprocs);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vector and store it in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Sort the vector x on each rank by magnitude in ascending order.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < 
std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> gathered_x(x.size() * MPI::world().size());\n MPI::world().all_gather(x.data(), x.size(), MPI::DOUBLE_REAL, gathered_x.data(), x.size(), MPI::DOUBLE_REAL);\n\n // Replace the vector x on rank 0 with the gathered sorted vectors.\n if (MPI::world().rank() == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, copy the sorted vector back to x.\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector x.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ii++) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); jj++) {\n if (input(ii, jj) == 1) { num_neighbors++; }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++num_ones;\n }\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ii++) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); jj++) {\n if (ii == i && jj == j) continue;\n num_ones += (input(ii, jj) == 1);\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++num_ones;\n }\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int numNeighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && input(ii, jj) == 1) {\n ++numNeighbors;\n }\n }\n }\n output(i, j) = (numNeighbors == 1) ? 
1 : 0;\n });\n}\n", + "Kokkoss_t_t", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy<>(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++count;\n }\n }\n }\n output(i, j) = (count == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const size_t i, const size_t j) {\n int sum = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input(ii, jj);\n }\n }\n }\n output(i, j) = (sum == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& j) {\n int count = 0;\n if (i > 0) count += input(i - 1, j);\n if (i < N - 1) count += input(i + 1, j);\n if (j > 0) count += input(i, j - 1);\n if (j < N - 1) count += input(i, j + 1);\n output(i, j) = (count == 1);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { sum += input(ii, jj); }\n }\n }\n output(i, j) = (sum == 1);\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (ii == i && jj == j) continue;\n num_neighbors += input(ii, jj);\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && input(ii, jj) == 1) {\n num_neighbors++;\n }\n }\n }\n output(i, j) = (num_neighbors == 1) ? 1 : 0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cells\", Kokkos::RangePolicy>(N, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++count;\n }\n }\n }\n output(i, j) = (count == 1);\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1][j];\n if (i < N - 1) num_ones += input[i + 1][j];\n if (j > 0) num_ones += input[i][j - 1];\n if (j < N - 1) num_ones += input[i][j + 1];\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) {\n count += input[i - 1][j];\n }\n if (i < N - 1) {\n count += input[i + 1][j];\n }\n if (j > 0) {\n count += 
input[i][j - 1];\n }\n if (j < N - 1) {\n count += input[i][j + 1];\n }\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma opm simd\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0 && input[i - 1][j] == 1) count++;\n if (i < N - 1 && input[i + 1][j] == 1) count++;\n if (j > 0 && input[i][j - 1] == 1) count++;\n if (j < N - 1 && input[i][j + 1] == 1) count++;\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n 
#pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1 + j * N];\n if (i < N - 1) num_neighbors += input[i + 1 + j * N];\n if (j > 0) num_neighbors += input[i + (j - 1) * N];\n if (j < N - 1) num_neighbors += input[i + (j + 1) * N];\n output[i + j * N] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j] == 1) ++num_neighbors;\n if (j > 0 && input[i][j - 1] == 1) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1] == 1) ++num_neighbors;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1][j];\n if (i < N - 1) num_ones += input[i + 1][j];\n if (j > 0) num_ones += input[i][j - 1];\n if (j < N - 1) num_ones += input[i][j + 1];\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n size_t num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "serial", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. 
Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0 && input[i - 1][j] == 1) ++count;\n if (i < N - 1 && input[i + 1][j] == 1) ++count;\n if (j > 0 && input[i][j - 1] == 1) ++count;\n if (j < N - 1 && input[i][j + 1] == 1) ++count;\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (i + di >= 0 && i + di < N && j + dj >= 0 && j + dj < N) {\n num_ones += input[(i + di) * N + j + dj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n 
}\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (size_t k = i - 1; k <= i + 1; k++) {\n for (size_t l = j - 1; l <= j + 1; l++) {\n if (k >= 0 && k < N && l >= 0 && l < N && (k != i || l != j)) {\n num_neighbors += input[k * N + l];\n }\n }\n }\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j] == 1) ++num_neighbors;\n if (j > 0 && input[i][j - 1] == 1) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1] == 1) ++num_neighbors;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; 
j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i][j] = (num_neighbors == 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "hip", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\n__global__ void cellsXOR(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) { sum += input[ny * N + nx]; }\n }\n }\n output[y * N + x] = (sum == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = count == 1 ? 
1 : 0;\n }\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n if (x > 0) num_neighbors += input[y * N + x - 1];\n if (x < N - 1) num_neighbors += input[y * N + x + 1];\n if (y > 0) num_neighbors += input[(y - 1) * N + x];\n if (y < N - 1) num_neighbors += input[(y + 1) * N + x];\n output[y * N + x] = (num_neighbors == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int count = 0;\n if (i > 0) count += input[i - 1 + j * N];\n if (i < N - 1) count += input[i + 1 + j * N];\n if (j > 0) count += input[i + (j - 1) * N];\n if (j < N - 1) count += input[i + (j + 1) * N];\n output[i + j * N] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1 + j * N];\n if (i < N - 1) num_ones += input[i + 1 + j * N];\n if (j > 0) num_ones += input[i + (j - 1) * N];\n if (j < N - 1) num_ones += input[i + (j + 1) * N];\n output[i + j * N] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int num_ones = 0;\n if (x - 1 >= 0) num_ones += input[(x - 1) * N + y];\n if (x + 1 < N) num_ones += input[(x + 1) * N + y];\n if (y - 1 >= 0) num_ones += input[x * N + y - 1];\n if (y + 1 < N) num_ones += input[x 
* N + y + 1];\n output[x * N + y] = (num_ones == 1);\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int sum = 0;\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n int idy = blockIdx.y * blockDim.y + threadIdx.y;\n if (idx < N && idy < N) {\n int left = idx > 0 ? input[idx - 1 + idy * N] : 0;\n int right = idx < N - 1 ? input[idx + 1 + idy * N] : 0;\n int up = idy > 0 ? input[idx + (idy - 1) * N] : 0;\n int down = idy < N - 1 ? input[idx + (idy + 1) * N] : 0;\n sum = left + right + up + down;\n }\n output[idx + idy * N] = input[idx + idy * N] ? (sum == 1 ? 1 : 0) : 0;\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0)\n continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n if (x > 0) {\n count += input[y * N + x - 1];\n }\n if (x < N - 1) {\n count += input[y * N + x + 1];\n }\n if (y > 0) {\n count += input[(y - 1) * N + x];\n }\n if (y < N - 1) {\n count += input[(y + 1) * N + x];\n }\n output[y * N + x] = (count == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t num_rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * num_rows_per_rank;\n size_t end_row = std::min((rank + 1) * num_rows_per_rank, N);\n\n // Each rank computes its subset of the output.\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks.\n MPI_Gather(output.data() + start_row * N, (end_row - start_row) * N, MPI_INT,\n output.data(), (end_row - start_row) * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n std::vector rowXOR(N, 0);\n for (size_t j = 0; j < N; ++j) {\n if (input[rank * N + j] == 1) {\n rowXOR[j] = 1;\n }\n }\n\n // Reduce the XORs of all rows to rank 0\n std::vector allRowXORs(N, 0);\n MPI_Reduce(rowXOR.data(), allRowXORs.data(), N, MPI_INT, MPI_BXOR, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the final output\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int numNeighbors = 0;\n if (i > 0 && allRowXORs[i - 1] == 1) {\n ++numNeighbors;\n }\n if (i < N - 1 && allRowXORs[i + 1] == 1) {\n ++numNeighbors;\n }\n if (j > 0 && input[i * N + j - 1] == 1) {\n ++numNeighbors;\n }\n if (j < N - 1 && input[i * N + j + 1] == 1) {\n ++numNeighbors;\n }\n output[i * N + j] = (numNeighbors == 1);\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv(N * N);\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] ^= recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, (rank + 1) * nRowsPerRank);\n std::vector localInput(nRowsPerRank * N);\n std::vector localOutput(nRowsPerRank * N);\n MPI_Scatter(input.data(), N * N, MPI_INT, localInput.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int numNeighbors = 0;\n if (i > 0) numNeighbors += localInput[(i - 1) * N + j];\n if (i < N - 1) numNeighbors += localInput[(i + 1) * N + j];\n if (j > 0) numNeighbors += localInput[i * N + j - 1];\n if (j < N - 1) numNeighbors += localInput[i * N + j + 1];\n localOutput[i * N + j] = (numNeighbors == 1);\n }\n }\n MPI_Gather(localOutput.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Every rank computes the XOR of its own cells\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (my_rank == 0) {\n for (int i = 1; i < num_ranks; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] |= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], 
(end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n // Check the neighbors\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] ^= recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n std::vector local_output(N, 0);\n for (size_t i = 0; i < N; ++i) {\n int count = 0;\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n continue;\n }\n count += input[i * N + j];\n }\n local_output[i] = count == 1 ? 
1 : 0;\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_output.data(), N, MPI_INT, output.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of the output\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Each rank computes its subset of output.\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int rows_per_rank = (N + size - 1) / size;\n int start_row = i * rows_per_rank;\n int end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min((rank + 1) * chunk_size, N);\n\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min((i + 1) * chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n 
int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Gather all the results to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] ^= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own rows\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] ^= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, 
MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of neighbors that are 1.\n std::vector num_neighbors(N * N, 0);\n MPI_Allreduce(MPI_IN_PLACE, num_neighbors.data(), num_neighbors.size(), MPI_INT, MPI_SUM, comm);\n\n // Set the output to 1 if there is exactly one neighbor that is 1.\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n output[i * N + j] = (num_neighbors[i * N + j] == 1);\n }\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes its local part of the output.\n for (size_t i = my_rank; i < N; i += num_procs) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; i++) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] |= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Each rank computes its subset of the output.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks onto rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "cuda", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\n__global__ void cellsXOR(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0 && input[y * N + x - 1]) num_ones++;\n if (x < N - 1 && input[y * N + x + 1]) num_ones++;\n if (y > 0 && input[(y - 1) * N + x]) num_ones++;\n if (y < N - 1 && input[(y + 1) * N + x]) num_ones++;\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0866303454, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00013914879858 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int sum = 0;\n if (x > 0) {\n sum += input[y * N + x - 1];\n }\n if (x < N - 1) {\n sum += input[y * N + x + 1];\n }\n if (y > 0) {\n sum += input[(y - 1) * N + x];\n }\n if (y < N - 1) {\n sum += input[(y + 1) * N + x];\n }\n output[y * N + x] = (sum == 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0893437828, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00013673599958 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[(y * N) + (x - 1)];\n if (x < N - 1) sum += input[(y * N) + (x + 1)];\n if (y > 0) sum += input[((y - 1) * N) + x];\n if (y < N - 1) sum += input[((y + 1) * N) + x];\n output[(y * N) + x] = (sum == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.089430809, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00013948799968 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) 
{\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (sum == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0928930838, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00013599359989 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0937849678, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00013976320028 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[(y - 1) * N + x - 1];\n if (x < N - 1) sum += input[(y - 1) * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n if (y > 0 && x > 0) sum += input[(y - 1) * N + x - 1];\n if (y > 0 && x < N - 1) sum += input[(y - 1) * N + x + 1];\n if (y < N - 1 && x > 0) sum += input[(y + 1) * N + x - 1];\n if (y < N - 1 && x < N - 1) sum += input[(y + 1) * N + x + 1];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n sum += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (count == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0870731674, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00014864639938 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n int num_ones = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n num_ones += input[r * N + c];\n }\n }\n }\n output[row * N + col] = (num_ones == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1] == 1) count++;\n if (x < N - 1 && input[y * N + x + 1] == 1) count++;\n if (y > 0 && input[(y - 1) * N + x] == 1) count++;\n if (y < N - 1 && input[(y + 1) * N + x] == 1) count++;\n output[y * N + x] = (count == 1) ? 
1 : 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0914637652, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016602239907 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1]) count++;\n if (x < N - 1 && input[y * N + x + 1]) count++;\n if (y > 0 && input[(y - 1) * N + x]) count++;\n if (y < N - 1 && input[(y + 1) * N + x]) count++;\n output[y * N + x] = (count == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0897902558, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00014766719937 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0941099554, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00014127360284 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0870458402, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00013418880105 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1]) count++;\n if (x < N - 1 && input[y * N + x + 1]) count++;\n if (y > 0 && input[(y - 
1) * N + x]) count++;\n if (y < N - 1 && input[(y + 1) * N + x]) count++;\n output[y * N + x] = (count == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0907643188, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016269440055 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int count = 0;\n if (i > 0) count += input[i - 1 + j * N];\n if (i < N - 1) count += input[i + 1 + j * N];\n if (j > 0) count += input[i + (j - 1) * N];\n if (j < N - 1) count += input[i + (j + 1) * N];\n output[i + j * N] = (count == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.086403986, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00013601920307 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0911802222, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015152640045 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t start = rank * N / size;\n size_t end = (rank + 1) * N / size;\n\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n // Check the neighbors of input_{i,j}.\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (count == 1);\n }\n }\n\n // Gather the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start = i * N / size;\n size_t end = (i + 1) * N / size;\n for (size_t j = start; j < end; ++j) {\n for (size_t k = 0; k < N; ++k) {\n output[j * N + k] += output[i * N * N + j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n int startRow = rank * nRows;\n int endRow = startRow + nRows;\n if (rank == size - 1) {\n endRow = N;\n }\n int nCols = N;\n int nNeighbors = 0;\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < nCols; j++) {\n nNeighbors = 0;\n if (i - 1 >= 0) {\n nNeighbors += input[(i - 1) * N + j];\n }\n if (i + 1 < N) {\n nNeighbors += input[(i + 1) * N + j];\n }\n if (j - 1 >= 0) {\n nNeighbors += input[i * N + j - 1];\n }\n if (j + 1 < N) {\n nNeighbors += input[i * N + j + 1];\n }\n if (nNeighbors == 1) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int nRowsRecv = N / size;\n int startRowRecv = i * nRowsRecv;\n int endRowRecv = startRowRecv + nRowsRecv;\n if (i == 
size - 1) {\n endRowRecv = N;\n }\n for (int j = startRowRecv; j < endRowRecv; j++) {\n MPI_Recv(&output[j * N], N * sizeof(int), MPI_INT, i, 0, MPI_STATUS_IGNORE);\n }\n }\n } else {\n MPI_Send(&output[startRow * N], (endRow - startRow) * N * sizeof(int), MPI_INT, 0, 0, MPI_STATUS_IGNORE);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nThreads = omp_get_max_threads();\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, startRow + nRowsPerRank);\n std::vector localOutput(N * nRowsPerRank);\n #pragma omp parallel num_threads(nThreads)\n {\n #pragma omp for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int numNeighbors = 0;\n if (i > 0) numNeighbors += input[i - 1][j];\n if (i < N - 1) numNeighbors += input[i + 1][j];\n if (j > 0) numNeighbors += input[i][j - 1];\n if (j < N - 1) numNeighbors += input[i][j + 1];\n localOutput[i * N + j] = (numNeighbors == 1);\n }\n }\n }\n MPI_Gather(&localOutput[0], N * nRowsPerRank, MPI_INT, &output[0], N * nRowsPerRank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + num_procs - 1) / num_procs;\n size_t start_row = my_rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of rows in parallel\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check neighbors\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row 
- start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its own part of the output\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += nprocs) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n std::vector recv(N * N);\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Each rank computes its subset of the output in parallel\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the XOR of its own rows\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += nprocs) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n MPI_Recv(&output[i * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different part of the output\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[(i - 1) * N + j];\n if (i < N - 1) num_neighbors += input[(i + 1) * N + j];\n if (j > 0) num_neighbors += input[i * N + j - 1];\n if (j < N - 1) num_neighbors += input[i * N + j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all other ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N], N * N, 
MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = (N + num_ranks - 1) / num_ranks;\n int start_row = my_rank * rows_per_rank;\n int end_row = std::min(N, (my_rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (my_rank == 0) {\n for (int i = 1; i < num_ranks; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = N / num_ranks;\n int start_row = my_rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j]) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j]) ++num_neighbors;\n if (j > 0 && input[i][j - 1]) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1]) ++num_neighbors;\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all other ranks.\n if (my_rank == 0) {\n for (int rank = 1; rank < num_ranks; ++rank) {\n int 
rows_per_rank = N / num_ranks;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row][0], rows_per_rank * N, MPI_INT, rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n // Other ranks send their results to rank 0.\n MPI_Send(&output[start_row][0], rows_per_rank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local part of the output.\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Each rank computes its subset of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows.\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of the rows in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all the other ranks.\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * 
rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n std::vector local_input(input.begin() + start_row * N, input.begin() + end_row * N);\n std::vector local_output(rows_per_rank * N, 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < rows_per_rank; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += local_input[(i - 1) * N + j];\n if (i < rows_per_rank - 1) count += local_input[(i + 1) * N + j];\n if (j > 0) count += local_input[i * N + j - 1];\n if (j < N - 1) count += local_input[i * N + j + 1];\n local_output[i * N + j] = (count == 1);\n }\n }\n\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows in output.\n size_t rows_per_rank = (N + num_procs - 1) / num_procs;\n size_t start_row = my_rank * rows_per_rank;\n size_t end_row = std::min(N, (my_rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of rows in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks into output on rank 0.\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // TODO: Implement cellsXOR\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows of output\n int rows_per_rank = N / num_procs;\n int start_row = my_rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes its subset of the rows of output in parallel\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. 
Otherwise set it to 0.\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) continue;\n num_neighbors += input(ii, jj);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output(i, j) = 0;\n else\n output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { count += input(ii, jj); }\n }\n }\n count -= input(i, j);\n if (input(i, j) == 1) {\n output(i, j) = (count == 2 || count == 3) ? 
1 : 0;\n } else {\n output(i, j) = (count == 3) ? 1 : 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int numNeighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n numNeighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (numNeighbors < 2 || numNeighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (numNeighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n count += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (count == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n 
}\n } else {\n if (count == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && !(ii == i && jj == j)) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (num_neighbors == 3) { output(i, j) = 1; } else { output(i, j) = 0; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) { num_neighbors += input(x, y); }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) output(i, j) = 0;\n else output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n else output(i, j) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (num_neighbors == 3) { output(i, j) 
= 1; } else { output(i, j) = 0; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n num_neighbors += input(x, y);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output(i, j) = 0;\n else\n output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x < 0 || x >= N || y < 0 || y >= N) continue;\n num_neighbors += input(x, y);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) { count += input(ii, jj); }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 
3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (count == 3) { output(i, j) = 1; } else { output(i, j) = 0; }\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n count += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n count += input[(i + ii) * N + j + jj];\n }\n }\n }\n count -= input[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (int)i + di;\n int jj = (int)j + dj;\n if (ii < 0 || ii >= (int)N || jj < 0 || jj >= (int)N) continue;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = num_neighbors == 2 || num_neighbors == 3;\n } else {\n output[i * N + j] = num_neighbors == 3;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } 
else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n neighbors += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = (i + dx + N) % N;\n int y = (j + dy + N) % N;\n num_neighbors += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "serial", + "prompt": "/* Simulate one generation of Game of Life on `input`. 
Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (size_t di = -1; di <= 1; di++) {\n for (size_t dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ni = i + di;\n int nj = j + dj;\n if (ni < 0 || ni >= N || nj < 0 || nj >= N) {\n continue;\n }\n num_neighbors += input[ni * N + nj];\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int x = -1; x <= 1; ++x) {\n for (int y = -1; y <= 1; ++y) {\n if (x == 0 && y == 0) continue;\n int nx = i + x;\n int ny = j + y;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[nx * N + ny];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if 
(ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = num_neighbors == 2 || num_neighbors == 3 ? 1 : 0;\n } else {\n output[i * N + j] = num_neighbors == 3 ? 1 : 0;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t x = i - 1; x <= i + 1; ++x) {\n for (size_t y = j - 1; y <= j + 1; ++y) {\n if (x >= 0 
&& x < N && y >= 0 && y < N && (x != i || y != j)) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n neighbors += input[(i + ii) * N + j + jj];\n }\n }\n }\n neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n size_t num_neighbors = 0;\n for (size_t x = i - 1; x <= i + 1; ++x) {\n for (size_t y = j - 1; y <= j + 1; ++y) {\n if (x >= 0 && x < N && y >= 0 && y < N && (x != i || y != j)) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int x = i + di;\n int y = j + dj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "hip", + "prompt": "/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\n__global__ void gameOfLife(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n num_neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && 
neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) return;\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n count -= input[y * N + x];\n if (input[y * N + x] == 1) {\n output[y * 
N + x] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[y * N + x] = (count == 3) ? 1 : 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0)\n continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output[y * N + x] = 0;\n else\n output[y * N + x] = 1;\n } else {\n if (num_neighbors == 3)\n output[y * N + x] = 1;\n else\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipThreadIdx().x;\n int y = hipThreadIdx().y;\n int neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = (x + i + N) % N;\n int neighbor_y = (y + j + N) % N;\n neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n if (input[y * N + x] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny 
< N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n if (x + i >= 0 && x + i < N && y + j >= 0 && y + j < N) {\n count += input[(x + i) * N + y + j];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (count < 2 || count > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (count == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. 
Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n_rows = N / size;\n int n_cols = N;\n int n_elements = n_rows * n_cols;\n std::vector local_input(n_elements);\n std::vector local_output(n_elements);\n MPI_Scatter(input.data(), n_elements, MPI_INT, local_input.data(), n_elements, MPI_INT, 0, MPI_COMM_WORLD);\n for (int i = 0; i < n_rows; i++) {\n for (int j = 0; j < n_cols; j++) {\n int n_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int ii_global = i + ii;\n int jj_global = j + jj;\n if (ii_global >= 0 && ii_global < n_rows && jj_global >= 0 && jj_global < n_cols) {\n n_neighbors += local_input[ii_global * n_cols + jj_global];\n }\n }\n }\n if (local_input[i * n_cols + j] == 1) {\n if (n_neighbors < 2 || n_neighbors > 3) {\n local_output[i * n_cols + j] = 0;\n } else {\n local_output[i * n_cols + j] = 1;\n }\n } else {\n if (n_neighbors == 3) {\n local_output[i * n_cols + j] = 1;\n } else {\n local_output[i * n_cols + j] = 0;\n }\n }\n }\n }\n MPI_Gather(local_output.data(), n_elements, MPI_INT, output.data(), n_elements, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of live neighbors for each cell.\n std::vector counts(N * N, 0);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n counts[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n }\n\n // Update the output based on the counts.\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = counts[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 
1 : 0;\n }\n }\n }\n\n // Gather the results on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recv_buf(N * N);\n MPI::COMM_WORLD.Recv(recv_buf.data(), N * N, MPI::INT, i, 0);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(output.data(), N * N, MPI::INT, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks.\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Compute the next generation for the subset of rows assigned to this rank\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = static_cast(i) + di;\n int jj = static_cast(j) + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = N / size;\n\n // Compute the start and end rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = (rank + 1) * rows_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(N * rows_per_rank);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n\n // Check the neighbors of the current cell\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n\n // Check if the neighbor is within the grid\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), N * rows_per_rank, MPI_INT,\n output.data(), N * rows_per_rank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of rows\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes the next generation for its subset of rows\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N && input[ni * N + nj]) {\n num_neighbors++;\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j]) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n for (size_t i = 
start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_input(input.begin() + rank * N * N, input.begin() + (rank + 1) * N * N);\n std::vector local_output(N * N, 0);\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += local_input[ii * N + jj];\n }\n }\n }\n if (local_input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n local_output[i * N + j] = 1;\n }\n }\n }\n }\n\n MPI_Gather(local_output.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output.\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the 8 neighbors of the cell.\n for 
(int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life.\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks to rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of the rows.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = static_cast(i) + di;\n int neighbor_j = static_cast(j) + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n if (rank != 0) {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n MPI_Recv(&output[start_row * N], rows_per_rank * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * N);\n\n // Compute the local results\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n\n // Check the neighbors of the current cell\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n\n // Check if the neighbor is within the grid\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT, output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (size_t ii = std::max(0, (int)i - 1); ii <= std::min((int)N - 1, (int)i + 1); ii++) {\n for (size_t jj = std::max(0, (int)j - 1); jj <= std::min((int)N - 1, (int)j + 1); jj++) {\n if (input[ii * N + jj] == 1) {\n num_neighbors++;\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of live neighbors for each cell.\n std::vector neighbor_counts(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t di = -1; di <= 1; di++) {\n for (size_t dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n neighbor_counts[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Update the output based on the neighbor counts.\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbor_counts[i * N + j] == 2 || neighbor_counts[i * N + j] == 3);\n } else {\n output[i * N + j] = (neighbor_counts[i * N + j] == 3);\n }\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbors == 2 || neighbors == 3);\n } else {\n output[i * N + j] = (neighbors == 3);\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n std::vector next_gen(N * N);\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j]) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of the output\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, 
MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = N / size;\n\n // Compute the starting and ending rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = (rank + 1) * rows_per_rank;\n\n // Compute the number of columns each rank is responsible for\n int cols_per_rank = N / size;\n\n // Compute the starting and ending columns for this rank\n int start_col = rank * cols_per_rank;\n int end_col = (rank + 1) * cols_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * N, 0);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = start_col; j < end_col; j++) {\n int num_neighbors = 0;\n // Check the neighbors of the current cell\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n\n // Apply the rules of the game of life\n if (input[i][j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT, output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "cuda", + "prompt": "/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\n__global__ void gameOfLife(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.112346207, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018187520206 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.113831403, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00017871359885 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.114221291, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018151040077 + } + ] + }, + { + "generated_output": "\n\n int x 
= blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1174799296, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018010879755 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1127780338, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018305280209 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1161190356, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001767487973 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y 
* N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1119741944, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018606720269 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1158913904, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018014719784 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1171265882, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018361600041 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1188852528, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020634880364 + } + ] + }, + { + "generated_output": "\n\n 
int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1184911308, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018601599932 + } + ] + }, + { + "generated_output": "\n\n int x = threadIdx.x;\n int y = threadIdx.y;\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = (x + i + N) % N;\n int neighbor_y = (y + j + N) % N;\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1168051876, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020794240236 + } + ] + }, + { + "generated_output": "_", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && 
neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n num_neighbors -= input[x * N + y];\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1122892608, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00130691201687 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1142635448, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001801791966 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) {\n return;\n }\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.113198516, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018368639946 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1151399654, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018010880053 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1166915124, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00127723517418 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.118515887, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018043520153 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the rank of the current process.\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the number of rows that each process will handle.\n int rows_per_process = N / MPI_Comm_size(MPI_COMM_WORLD);\n\n // Compute the starting and ending rows for the current process.\n int start_row = rank * rows_per_process;\n int end_row = start_row + rows_per_process;\n\n // Compute the number of columns in the input grid.\n int num_cols = input.size() / N;\n\n // Create a buffer to store the results for the current process.\n std::vector buffer(N * num_cols);\n\n // Iterate over the rows assigned to the current process.\n for (int i = start_row; i < end_row; ++i) {\n // Iterate over the columns in the input grid.\n for (int j = 0; j < num_cols; ++j) {\n // Compute the number of live neighbors for the current cell.\n int num_live_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n // Skip the current cell.\n if (ii == i && jj == j) {\n continue;\n }\n\n // Check if the neighbor cell is alive.\n if (input[ii * num_cols + jj] == 1) {\n ++num_live_neighbors;\n }\n }\n }\n\n // Apply the rules of the Game of Life to the current cell.\n if (input[i * num_cols + j] == 1) {\n if (num_live_neighbors < 2 || num_live_neighbors > 3) {\n buffer[i * num_cols + j] = 0;\n } else {\n buffer[i * num_cols + j] = 1;\n }\n } else {\n if (num_live_neighbors == 3) {\n buffer[i * num_cols + j] = 1;\n } else {\n buffer[i * num_cols + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all processes.\n MPI_Gather(buffer.data(), N * num_cols, MPI_INT, output.data(), N * num_cols, MPI_INT, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = (N + size - 1) / size;\n int start = rank * local_size;\n int end = std::min(start + local_size, N);\n\n std::vector local_input(input.begin() + start * N, input.begin() + end * N);\n std::vector local_output(local_input.size());\n\n #pragma omp parallel for\n for (int i = 0; i < local_input.size(); i++) {\n int x = i / N;\n int y = i % N;\n int count = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int nx = x + dx;\n int ny = y + dy;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += local_input[nx * N + ny];\n }\n }\n }\n if (local_input[i] == 1) {\n if (count < 2 || count > 3) {\n local_output[i] = 0;\n } else {\n local_output[i] = 1;\n }\n } else {\n if (count == 3) {\n local_output[i] = 1;\n } else {\n local_output[i] = 0;\n }\n }\n }\n\n MPI_Gather(local_output.data(), local_output.size(), MPI_INT, output.data(), local_output.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of the rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute 
the next generation for the subset of rows\n std::vector next_gen(N * N, 0);\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Compute the next generation for the subset of rows\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n // Count the number of live neighbors\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N && input[x * N + y]) count++;\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j]) {\n if (count < 2 || count > 3) output[i * N + j] = 0;\n else output[i * N + j] = 1;\n } else {\n if (count == 3) output[i * N + j] = 1;\n else output[i * N + j] = 0;\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a chunk of the output\n size_t chunk_size = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min((rank + 1) * chunk_size, N);\n\n // Compute the chunk of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t ii = std::max(0, (int)i - 1); 
ii <= std::min(N - 1, (int)i + 1); ++ii) {\n for (size_t jj = std::max(0, (int)j - 1); jj <= std::min(N - 1, (int)j + 1); ++jj) {\n num_neighbors += input[ii * N + jj];\n }\n }\n num_neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min((i + 1) * chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of live neighbors for each cell.\n std::vector neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Update the state of each cell.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (input[i * N + j] == 1) {\n if (neighbors[i * N + j] < 2 || neighbors[i * N + j] > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors[i * N + j] == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = N / nprocs;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Compute the next generation for the subset of rows\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int ii = i - 
1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize output to 0\n output.assign(N * N, 0);\n\n // Get rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Divide the grid among the processes\n int rows_per_proc = N / nprocs;\n int start_row = rank * rows_per_proc;\n int end_row = start_row + rows_per_proc;\n\n // Compute the next generation for each row in parallel\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n // Count the number of live neighbors\n int num_neighbors = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n if (k == 0 && l == 0) continue;\n int neighbor_row = i + k;\n int neighbor_col = j + l;\n if (neighbor_row >= 0 && neighbor_row < N && neighbor_col >= 0 && neighbor_col < N) {\n num_neighbors += input[neighbor_row * N + neighbor_col];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n }\n }\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[start_row * N], rows_per_proc * N, MPI_INT,\n &output[0], rows_per_proc * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n #pragma omp parallel for\n for (int i = 0; i < N; ++i) {\n for (int j = 0; j < N; ++j) {\n int count = 0;\n // Count the number of live neighbors\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input[x * N + y];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n for (int di = -1; di <= 1; 
di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its subgrid's neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbors == 2 || neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (neighbors == 3) ? 1 : 0;\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n\n std::vector localInput(nRows * N);\n std::vector localOutput(nRows * N);\n\n MPI_Scatter(input.data(), nRows * N, MPI_INT, localInput.data(), nRows * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < nRows; i++) {\n for (int j = 0; j < N; j++) {\n int count = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int ni = i + ii;\n int nj = j + jj;\n if (ni >= 0 && ni < nRows && nj >= 0 && nj < N) {\n count += localInput[ni * N + nj];\n }\n }\n }\n if (localInput[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n localOutput[i * N + j] = 0;\n } else {\n localOutput[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n localOutput[i * N + j] = 1;\n } else {\n localOutput[i * N + j] = 0;\n }\n }\n }\n }\n\n MPI_Gather(localOutput.data(), nRows * N, MPI_INT, output.data(), nRows * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes the number of live neighbors for each cell in its subset of rows\n std::vector neighbor_counts(N * N, 0);\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n // Count the number of live neighbors for cell (i, j)\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n count += input[ii * N + jj];\n }\n }\n }\n neighbor_counts[i * N + j] = count;\n }\n }\n\n // Each rank updates its subset of rows of the output based on the neighbor counts\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int count = neighbor_counts[i * N + j];\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks into the output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, 
MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows assigned to this rank\n std::vector next_gen(N * N);\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = static_cast(i) + di;\n int neighbor_j = static_cast(j) + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks into the output on rank 0\n MPI_Gather(next_gen.data(), N * rows_per_rank, MPI_INT,\n output.data(), N * rows_per_rank, MPI_INT,\n 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its portion of the grid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Count the number of live neighbors for cell (i, j).\n int neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n neighbors += input[ii * N + jj];\n }\n }\n }\n local_neighbors[i * N + j] = neighbors;\n }\n }\n\n // Each rank sends its portion of the neighbor counts to rank 0.\n std::vector global_neighbors(N * N);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell based on the number of live neighbors.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n", + "_t", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a chunk of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Compute the next generation for the assigned rows\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || 
num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank != 0) {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the next generation for the subset of rows\n std::vector next_gen(N * N, 0);\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n next_gen[i * N + j] = 0;\n } else {\n next_gen[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n next_gen[i * N + j] = 1;\n } else {\n next_gen[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its local grid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Reduce(local_neighbors.data(), global_neighbors.data(), N * N, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(Kokkos::View &input, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n KokConfigService::instance().set_num_teams(1);\n KokConfigService::instance().set_num_nodes(1);\n KokConfigService::instance().set_num_ranks_per_node(1);\n KokConfigService::instance().set_num_devices_per_rank(1);\n KokConfigService::instance().set_device_type(\"CPU\");\n\n Kok\u30b3\u30af::TeamPolicy policy(Kok\u30b3\u30af::TeamThreadLayout::RowMajor);\n Kok\u30b3\u30af::Team team(policy);\n\n team.execute_task([&](Kok\u30b3\u30af::TeamMember rank) {\n const int num_elements = input.extent(0);\n for (int i = 1; i < num_elements - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n });\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n using Kokkos::TeamPolicy;\n using Kokkos::WorkSpace;\n\n const int width = input.extent(0);\n const int height = input.extent(1);\n\n Kokkos::TeamPolicy policy(Kokkos::RangePolicy(0, width * height));\n Kokkos::WorkSpace ws;\n\n parallel_for(policy, KOKKOS_LAMBDA(const int i) {\n const int x = i % width;\n const int y = i / width;\n\n if (x == 0 || x == width - 1 || y == 0 || y == height - 1) {\n output(x, y) = input(x, y);\n } else {\n output(x, y) = (input(x - 1, y) + input(x + 1, y) + input(x, y - 1) + input(x, y + 1)) / 4.0;\n }\n }, ws);\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kok c\u00f3c::parallel_for(\"jacobi1D\", 0, output.extent(0), KOK c\u00f3c::RangePolicy>(KOK c\u00f3c::get_num_threads()),\n KOK c\u00f3c::KOK c\u00f3c_LAMBDA(int64_t i) {\n if (i == 0) {\n output[i] = input[i] / 3;\n } else if (i == output.extent(0) - 1) {\n output[i] = input[i] / 3;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n const auto N = input.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == N - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy<>(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 
2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n const auto N = input.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = 0;\n } else if (i == N - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", input.size(), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n const auto exec_space = Kokkos::DefaultExecutionSpace();\n const auto exec_policy = Kokkos::RangePolicy(0, input.extent(0));\n Kokkos::parallel_for(exec_policy, KOKKOS_LAMBDA(const int &i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(input.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n double left = i > 0 ? input(i - 1) : 0;\n double center = input(i);\n double right = i < input.extent(0) - 1 ? input(i + 1) : 0;\n output(i) = (left + center + right) / 3;\n });\n}\n", + "Kokkoss vector space.", + "\n\n Kokkos::parallel_for(input.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n const int num_elements = input.extent(0);\n Kokkos::parallel_for(num_elements, KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == num_elements - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n });\n}\n", + "\n\n const int num_elements = input.extent(0);\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::Range(0, num_elements), [=](int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == num_elements - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", 
Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < static_cast(input.size()) - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n output[0] = (input[0] + input[1]) / 2;\n output[input.size() - 1] = (input[input.size() - 2] + input[input.size() - 1]) / 2;\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp 
parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < static_cast(input.size()) - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "serial", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 0; i < input.size(); i++) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i + 1 < input.size() ? input[i + 1] : 0);\n output[i] = output[i] / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n output[0] = (input[0] + input[1]) / 2;\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n output[input.size() - 1] = (input[input.size() - 2] + input[input.size() - 1]) / 2;\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < input.size() - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? 
input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = (i > 0) ? input[i - 1] : 0;\n double right = (i < input.size() - 1) ? input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "hip", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\n__global__ void jacobi1D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n output[idx] = (idx > 0 ? input[idx - 1] : 0) + input[idx] + (idx < N - 1 ? input[idx + 1] : 0);\n output[idx] /= 3;\n }\n}\n", + "\n int i = hipThreadIdx_x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int tid = hipGetGlobalThreadIdx().x;\n if (tid < N) {\n output[tid] = (input[tid - 1] + input[tid] + input[tid + 1]) / 3;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n\n if (i < N) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < N - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] +\n (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3.0;\n }\n}\n", + "\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = (tid > 0 ? input[tid - 1] : 0) + input[tid] +\n (tid < N - 1 ? input[tid + 1] : 0);\n output[tid] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? 
input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\nfor (size_t i = 1; i < N - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_output(local_n);\n\n for (int i = start; i < end; i++) {\n if (i == 0) {\n local_output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == n - 1) {\n local_output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n local_output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n std::vector local_input(input.begin() + start, 
input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * local_size], local_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], local_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n output.resize(n);\n\n MPI_Datatype vec_type;\n 
MPI_Type_vector(n, 1, 1, MPI_DOUBLE, &vec_type);\n MPI_Type_commit(&vec_type);\n\n MPI_Allgather(&input[0], 1, vec_type, &output[0], 1, vec_type, MPI_COMM_WORLD);\n\n for (int i = 1; i < n - 1; i++) {\n output[i] = (output[i - 1] + output[i] + output[i + 1]) / 3;\n }\n\n MPI_Type_free(&vec_type);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int remainder = input.size() % size;\n\n std::vector local_input(local_size + (rank < remainder ? 1 : 0));\n std::vector local_output(local_size);\n\n MPI_Scatter(input.data(), local_size + (rank < remainder ? 1 : 0), MPI_DOUBLE, local_input.data(), local_size + (rank < remainder ? 1 : 0), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n for (int i = 0; i < local_size; i++) {\n local_output[i] = (local_input[std::max(0, i - 1)] + local_input[std::min(i + 1, local_size - 1)] + local_input[i]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * chunk], chunk, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n for (int i = start; i < end; i++) {\n double left = (i > 0) ? input[i - 1] : 0;\n double right = (i < n - 1) ? input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * local_n;\n int end = start + local_n;\n MPI_Recv(&output[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], local_n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = (n + size - 1) / size;\n int start = rank * chunk_size;\n int end = std::min(start + chunk_size, n);\n\n for (int i = start; i < end; i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double right = i < n - 1 ? 
input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * chunk_size;\n int end = std::min(start + chunk_size, n);\n MPI_Recv(&output[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_size = n / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n output.resize(n);\n\n // Compute the local part of the output\n for (int i = 1; i < n - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n\n // Send the local part of the output to rank 0\n MPI_Gather(&output[1], n - 2, MPI_DOUBLE, &output[1], n - 2, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Replace the local part of the output with 0 on all ranks except rank 0\n if (rank != 0) {\n for (int i = 1; i < n - 1; i++) {\n output[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; ++i) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n 
std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n local_output[0] = (local_input[0] + local_input[1]) / 2;\n local_output[local_n - 1] = (local_input[local_n - 2] + local_input[local_n - 1]) / 2;\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "cuda", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\n__global__ void jacobi1D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0240533564, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020434559882 + } + ] + }, + { + "generated_output": "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? 
input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.024009407, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020140160322 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n } else {\n output[i] = 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0239704234, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020195840001 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0239573452, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020181759894 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0239247674, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020202239752 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0240385926, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020431360006 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02404669, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020197120309 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? 
input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0240445494, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020616319776 + } + ] + }, + { + "generated_output": "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = (tid > 0 ? input[tid - 1] : 0) + input[tid] + (tid < N - 1 ? input[tid + 1] : 0);\n output[tid] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0240508892, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002020096004 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0240165968, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020151040256 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0240659282, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020732800364 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0239765874, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020266880095 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? 
input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0239888544, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020128640234 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.023980128, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00021136639714 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.024114177, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020144640207 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0241090934, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020120320022 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0239967954, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020522240102 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.023948825, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020165759921 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? 
input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0239637328, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020130559802 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0239445128, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020248320103 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n #pragma omp parallel for\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n std::vector local_output(local_n);\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (input[start + i - 1] + input[start + i] + input[start + i + 1]) / 3.0;\n }\n MPI_Gather(&local_output[0], local_n, MPI_DOUBLE, &output[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n\n std::vector local_output(chunk_size);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n local_output[i] = (input[i] + input[i + 1]) / 3;\n } else if (i == n - 1) {\n local_output[i] = (input[i - 1] + input[i]) / 3;\n } else {\n local_output[i] = (input[i - 1] + input[i] + input[i 
+ 1]) / 3;\n }\n }\n\n MPI_Gather(local_output.data(), chunk_size, MPI_DOUBLE, output.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n\n #pragma parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == input.size() - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), chunk_size, MPI_DOUBLE, output.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per rank\n int elements_per_rank = input.size() / size;\n\n // Compute the start and end indices for this rank\n int start_index = rank * elements_per_rank;\n int end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local output\n std::vector local_output(elements_per_rank);\n #pragma omp parallel for\n for (int i = 0; i < elements_per_rank; i++) {\n int global_index = start_index + i;\n if (global_index == 0 || global_index == input.size() - 1) {\n local_output[i] = 0;\n } else {\n local_output[i] = (input[global_index - 1] + input[global_index] + input[global_index + 1]) / 3;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(input.size());\n for (int i = 0; i < size; i++) {\n MPI_Recv(&output[i * elements_per_rank], elements_per_rank, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], elements_per_rank, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n std::vector local_output(local_n);\n\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (input[start + i - 1] + input[start + i] + input[start + i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / 
size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n // Compute the local part of the output\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * 
local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n // Compute the local part of the output\n #pragma omp parallel for\n for (int i = start; i < end; ++i) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n MPI_Gather(output.data() + start, chunk, MPI_DOUBLE, output.data(), chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = (n + size - 1) / size;\n int start = rank * local_n;\n int end = std::min(start + local_n, n);\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "kokkos", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge 
kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images.\n Store the output of the computation in imageOut.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(Kokkos::View &imageIn, Kokkos::View &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(N, N), [=](int i, int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn(ii, jj) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::min(255, std::max(0, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_clamped = std::clamp(i + ii, 0, N - 1);\n int jj_clamped = std::clamp(j + jj, 0, N - 1);\n sum += imageIn(ii_clamped, jj_clamped) * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut(i, j) = std::clamp(sum, -128, 127);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_in = i + ii;\n int jj_in = j + jj;\n if (ii_in >= 0 && ii_in < N && jj_in >= 0 && jj_in < N) {\n sum += imageIn(ii_in, jj_in) * edgeKernel[ii + 1][jj + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", 
Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n int val = 0;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n val = imageIn(ii, jj);\n }\n sum += edgeKernel[ki + 1][kj + 1] * val;\n }\n }\n imageOut(i, j) = std::max(0, std::min(sum, 255));\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::min(255, std::max(0, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_clamped = std::clamp(i + ii, 0, N - 1);\n int jj_clamped = std::clamp(j + jj, 0, N - 1);\n sum += imageIn(ii_clamped, jj_clamped) * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut(i, j) = std::clamp(sum, -128, 127);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", 
Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n });\n}\n", + "\n\n Kok\uf09e::parallel_for(Kok\uf09e::Range(0, N), [&](const Kok\uf09e::Range &r) {\n for (size_t i = r.begin(); i < r.end(); ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n size_t ii = i + k - 1;\n size_t jj = j + l - 1;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn(ii, jj) * edge[k][l];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(0, N, [&](size_t i) {\n Kok\u30b3\u30af::parallel_for(0, N, [&](size_t j) {\n int sum = 0;\n for (int k = 0; k < 3; k++) {\n for (int l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edge[k][l];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "omp", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. 
Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "_", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n if (i + k < 0 || i + k >= N || j + l < 0 || j + l >= N) {\n sum += 0;\n } else {\n sum += imageIn[(i + k) * N + (j + l)] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x >= 0 && x < static_cast(N) && y >= 0 && y < static_cast(N)) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x >= 0 && x < static_cast(N) && y >= 0 && y < static_cast(N)) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += imageIn[(i + k) * N + (j + l)] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma opm parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + (j + l)] * 
edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "serial", + "prompt": "const int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (int i = 1; i < N - 1; 
i++) {\n for (int j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n sum += imageIn[(i + ii) * N + (j + jj)] * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = (int)i + k - 1;\n int y = (int)j + l - 1;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t x = 0; x < 3; x++) {\n for (size_t y = 0; y < 3; y++) {\n int ii = i + x - 1;\n int jj = j + y - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[x][y];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t y = 0; y < N; ++y) {\n for (size_t x = 0; x < N; ++x) {\n int sum = 0;\n for (int ky = -1; ky <= 1; ++ky) {\n for (int kx = -1; kx <= 1; ++kx) {\n int pixelX = x + kx;\n int pixelY = y + ky;\n if (pixelX >= 0 && pixelX < N && pixelY >= 0 && pixelY < N) {\n sum += imageIn[pixelY * N + pixelX] * edgeKernel[ky + 1][kx + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = (int)i + k - 1;\n int y = (int)j + l - 1;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t y = 0; y < N; ++y) {\n for (size_t x 
= 0; x < N; ++x) {\n int sum = 0;\n for (int ky = -1; ky <= 1; ++ky) {\n for (int kx = -1; kx <= 1; ++kx) {\n int inX = x + kx;\n int inY = y + ky;\n if (inX >= 0 && inX < N && inY >= 0 && inY < N) {\n sum += imageIn[inY * N + inX] * edgeKernel[ky + 1][kx + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii < 0 || ii >= static_cast(N) || jj < 0 || jj >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "hip", + "prompt": "__constant__ int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. 
Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\n__global__ void convolveKernel(const int *imageIn, int *imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\nint sum = 0;\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = blockIdx.x + i;\n int y = blockIdx.y + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n sum = max(0, min(255, sum));\n imageOut[blockIdx.y * N + blockIdx.x] = sum;\n}\n", + "\nint sum = 0;\nint index = hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x;\nint x = index % N;\nint y = index / N;\nfor (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (x + i >= 0 && x + i < N && y + j >= 0 && y + j < N) {\n sum += imageIn[(y + j) * N + x + i] * edgeKernel[i + 1][j + 1];\n }\n }\n}\nimageOut[index] = __clamp(sum, 0, 255);\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 
1; j++) {\n int x2 = x + i;\n int y2 = y + j;\n if (x2 >= 0 && x2 < N && y2 >= 0 && y2 < N) {\n sum += imageIn[y2 * N + x2] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = min(255, max(0, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX + neighborY * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = std::max(0, std::min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ < 0 || x_ >= N || y_ < 0 || y_ >= N)\n sum += 0;\n else\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < 
N && y < N) {\n int dotProduct = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n dotProduct += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = dotProduct > 255 ? 255 : (dotProduct < 0 ? 0 : dotProduct);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = 0; i < 3; i++) {\n for (int j = 0; j < 3; j++) {\n int x_offset = x + i - 1;\n int y_offset = y + j - 1;\n if (x_offset >= 0 && x_offset < N && y_offset >= 0 && y_offset < N) {\n sum += imageIn[y_offset * N + x_offset] * edge[i][j];\n }\n }\n }\n imageOut[y * N + x] = __clamp(sum, 0, 255);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_index = x + i;\n int y_index = y + j;\n if (x_index >= 0 && x_index < N && y_index >= 0 && y_index < N) {\n sum += imageIn[y_index * N + x_index] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[x_idx + y_idx * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = max(0, min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX + neighborY * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = hipMin(hipMax(sum, 0), 255);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "mpi", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of imageIn. 
The result is stored in imageOut on rank 0.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image.\n size_t chunkSize = N / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = (rank + 1) * chunkSize;\n\n // Compute the convolution for the assigned rows.\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = (i + 1) * chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n size_t chunkSize = N / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = (rank + 1) * chunkSize;\n\n // Compute the convolution for the assigned rows\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = (i + 1) * chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows and columns in the image\n size_t numRows = imageIn.size() / N;\n size_t numCols = N;\n\n // Compute the number of rows and columns in the output image\n size_t outNumRows = numRows - 2;\n size_t outNumCols = numCols - 2;\n\n // Compute the number of rows and columns in the output image for each rank\n size_t outNumRowsPerRank = outNumRows / size;\n size_t outNumColsPerRank = outNumCols;\n\n // Compute the starting row and column for each rank\n size_t startRow = rank * outNumRowsPerRank;\n size_t startCol = 0;\n\n // Compute the ending row and column for each rank\n size_t endRow = startRow + outNumRowsPerRank;\n size_t endCol = startCol + outNumColsPerRank;\n\n // Allocate memory for the 
output image\n std::vector outImage(outNumRowsPerRank * outNumColsPerRank);\n\n // Compute the convolution for each pixel in the output image\n for (size_t i = startRow; i < endRow; i++) {\n for (size_t j = startCol; j < endCol; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int row = i + k - 1;\n int col = j + l - 1;\n if (row >= 0 && row < numRows && col >= 0 && col < numCols) {\n sum += imageIn[row * numCols + col] * edgeKernel[k][l];\n }\n }\n }\n outImage[i * outNumColsPerRank + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the output images from all ranks\n MPI_Gather(outImage.data(), outNumRowsPerRank * outNumColsPerRank, MPI_INT,\n imageOut.data(), outNumRowsPerRank * outNumColsPerRank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the output image.\n std::vector localImageOut(N * N / size);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the local results into the output image on rank 0.\n if (rank == 0) {\n imageOut.resize(N * N);\n MPI_Gather(localImageOut.data(), N * N / size, MPI_INT, imageOut.data(), N * N / size, MPI_INT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(localImageOut.data(), N * N / size, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localN = N / size;\n int localStart = rank * localN;\n int localEnd = localStart + localN;\n\n std::vector localImageOut(localN * N);\n\n for (int i = localStart; i < localEnd; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), localN * N, MPI_INT, imageOut.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows in imageOut\n size_t rowsPerRank = N / size;\n size_t startRow = rank * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks and store them in imageOut on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n for (size_t j = startRow; j < endRow; ++j) {\n MPI_Recv(&imageOut[j * N], N, MPI_INT, i, 0, MPI_COMM_WORLD, 
MPI_STATUS_IGNORE);\n }\n }\n } else {\n MPI_Send(&imageOut[startRow * N], N * rowsPerRank, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image\n int rowsPerRank = (N + size - 1) / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min(startRow + rowsPerRank, N);\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = std::min(startRow + rowsPerRank, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the assigned rows\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "{", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n int chunkSize = N / size;\n int startRow = rank * chunkSize;\n int endRow = startRow + chunkSize;\n\n // Pad the input image with zeros to handle edge cases\n std::vector paddedImage(N * N);\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n paddedImage[i * N + j] = imageIn[i * N + j];\n }\n }\n\n // Compute the convolution for the assigned rows\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int neighborRow = i + k;\n int neighborCol = j + l;\n if (neighborRow >= 0 && neighborRow < N && neighborCol >= 0 && neighborCol < N) {\n sum += paddedImage[neighborRow * N + neighborCol] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * chunkSize;\n int endRow = 
startRow + chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel's neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = i + k;\n int neighborY = j + l;\n // Handle pixels outside the image boundaries\n if (neighborX < 0 || neighborX >= N || neighborY < 0 || neighborY >= N) {\n sum += 0;\n } else {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of imageOut\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks into imageOut on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localN = N / size;\n int localStart = rank * localN;\n int localEnd = localStart + localN;\n\n std::vector localImageIn(localN * N);\n std::vector localImageOut(localN * N);\n\n MPI_Scatter(imageIn.data(), N * N, MPI_INT, localImageIn.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n for (int i = localStart; i < localEnd; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += localImageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n 
MPI_Gather(localImageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localSize = N / size;\n int remainder = N % size;\n int startIndex = rank * localSize;\n int endIndex = startIndex + localSize;\n if (rank == size - 1) {\n endIndex += remainder;\n }\n\n std::vector localImageIn(localSize * N);\n std::vector localImageOut(localSize * N);\n\n MPI_Scatter(imageIn.data(), N * N, MPI_INT, localImageIn.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n for (int i = startIndex; i < endIndex; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += localImageIn[(x + y * N)];\n }\n }\n }\n localImageOut[i * N + j] = std::min(255, std::max(0, sum * edgeKernel[k + 1][l + 1]));\n }\n }\n\n MPI_Gather(localImageOut.data(), localSize * N, MPI_INT, imageOut.data(), localSize * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of pixels per process\n int pixelsPerProcess = N * N / size;\n\n // Compute the starting and ending indices for this process\n int startIndex = rank * pixelsPerProcess;\n int stopIndex = (rank + 1) * pixelsPerProcess;\n\n // Allocate memory for the local image\n std::vector localImage(pixelsPerProcess);\n\n // Scatter the image data to each process\n MPI_Scatter(&imageIn[0], pixelsPerProcess, MPI_INT, &localImage[0], pixelsPerProcess, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Convolve the local image with the edge kernel\n for (int i = 1; i < N - 1; i++) {\n for (int j = 1; j < N - 1; j++) {\n int dotProduct = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n dotProduct += localImage[i + k][j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::clip(dotProduct, -128, 127);\n }\n }\n\n // Gather the results from each process\n MPI_Gather(&imageOut[startIndex], pixelsPerProcess, MPI_INT, &imageOut[0], pixelsPerProcess, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n size_t chunkSize = (N + size - 1) / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n\n // Compute the convolution for the subset of the image\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n int localN = N / size;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local portion of the image\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n size_t localN = (N + size - 1) / size;\n size_t startRow = rank * localN;\n size_t endRow = std::min(startRow + localN, N);\n\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * localN;\n size_t endRow = std::min(startRow + localN, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n // Compute the dot product of the pixel and its neighbors with the edge kernel\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Calculate the number of rows per process\n int rowsPerProcess = N / size;\n\n // Calculate the starting row for this process\n int startRow = rank * rowsPerProcess;\n\n // Calculate the ending row for this process\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Loop over the rows assigned to this process\n for (int i = startRow; i < endRow; i++) {\n 
// Loop over the columns in the row\n for (int j = 0; j < N; j++) {\n // Calculate the dot product of the pixel and its neighbors with the edge kernel\n int dotProduct = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n // Check if the pixel is within the image boundaries\n if (i + k >= 0 && i + k < N && j + l >= 0 && j + l < N) {\n // Calculate the dot product\n dotProduct += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::max(0, std::min(255, dotProduct));\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(imageOut.data() + startRow * N, rowsPerProcess * N, MPI_INT,\n imageOut.data(), rowsPerProcess * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "cuda", + "prompt": "__constant__ int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\n__global__ void convolveKernel(const int *imageIn, int *imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\nint idx = threadIdx.x + blockIdx.x * blockDim.x;\n int idy = threadIdx.y + blockIdx.y * blockDim.y;\n if (idx < N && idy < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = idx + i;\n int y = idy + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x + y * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[idx + idy * N] = clamp(sum, 0, 255);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\nint sum = 0;\n for (int i = 0; i < 3; i++) {\n for (int j = 0; j < 3; j++) {\n int x = blockIdx.x * blockDim.x + i - 1;\n int y = blockIdx.y * blockDim.y + j - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * 
edgeKernel[i][j];\n }\n }\n }\n imageOut[blockIdx.y * N + blockIdx.x] = max(0, min(255, sum));\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\nint x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = sum > 255 ? 255 : (sum < 0 ? 0 : sum);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) {\n return;\n }\n\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; 
j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_offset = x + i;\n int y_offset = y + j;\n if (x_offset < 0 || x_offset >= N || y_offset < 0 || y_offset >= N) {\n sum += 0;\n } else {\n sum += imageIn[y_offset * N + x_offset] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x2 = x + i;\n int y2 = y + j;\n if (x2 < 0 || x2 >= N || y2 < 0 || y2 >= N) {\n sum += 0;\n } else {\n sum += edgeKernel[i + 1][j + 1] * imageIn[y2 * N + x2];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= 0 && x < N && y >= 0 && y < N) 
{\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_offset = x + i;\n int y_offset = y + j;\n if (x_offset >= 0 && x_offset < N && y_offset >= 0 && y_offset < N) {\n sum += imageIn[y_offset * N + x_offset] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_coord = x + i;\n int y_coord = y + j;\n if (x_coord >= 0 && x_coord < N && y_coord >= 0 && y_coord < N) {\n sum += imageIn[y_coord * N + x_coord] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n int x = blockIdx.x * blockDim.x + 
threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\nint idx = threadIdx.x + blockIdx.x * blockDim.x + (threadIdx.y + blockIdx.y * blockDim.y) * N;\n if (idx < N * N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = threadIdx.x + i;\n int y = threadIdx.y + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[idx] = max(0, min(255, sum));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of imageIn. 
The result is stored in imageOut on rank 0.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n int localN = N / size;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local subset of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, (rank + 1) * nRowsPerRank);\n\n std::vector localImageOut(nRowsPerRank * N);\n\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = 0; k < 3; k++) {\n for (int l = 0; l < 3; l++) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), nRowsPerRank * N, MPI_INT,\n imageOut.data(), nRowsPerRank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize imageOut to 0\n imageOut.assign(N * N, 0);\n\n // Each rank computes its own part of the output\n #pragma opm parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel and its neighbors\n for (int ki = 0; ki < 3; ki++) {\n for (int kj = 0; kj < 3; kj++) {\n int ii = i + ki - 1;\n int jj = j + kj - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki][kj];\n }\n }\n }\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(imageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n size_t chunkSize = (N + size - 1) / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n\n // Compute the convolution for the assigned rows\n for (size_t i = startRow; i < endRow; ++i) 
{\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < static_cast(N) && jj >= 0 && jj < static_cast(N)) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image\n size_t rowsPerRank = (N + size - 1) / size;\n size_t startRow = rank * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n // Each rank computes its subset of the output image in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < static_cast(N) && jj >= 0 && jj < static_cast(N)) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a portion of the output image\n int localN = (N + nprocs - 1) / nprocs;\n int startRow = rank * localN;\n int endRow = std::min(startRow + localN, N);\n\n // Compute the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n int startRow = i * localN;\n int endRow = std::min(startRow + localN, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + 
rowsPerRank;\n\n // Compute the convolution for the assigned rows\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel's neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = i + k;\n int neighborY = j + l;\n // Check if the neighbor is within the image boundaries\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a portion of the output image\n int localN = N / nprocs;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n 
imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the image\n int rowsPerRank = (N + size - 1) / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min((rank + 1) * rowsPerRank, N);\n\n // Allocate memory for the rank's portion of the output image\n std::vector rankImageOut(rowsPerRank * N);\n\n // Compute the convolution for the rank's subset of rows\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = 0; k < 3; ++k) {\n for (int l = 0; l < 3; ++l) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n rankImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(rankImageOut.data(), rowsPerRank * N, MPI_INT,\n imageOut.data(), rowsPerRank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the dot product of the kernel and the image for each pixel\n #pragma opm parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int dotProduct = 0;\n for (int ki = 0; ki < 3; ki++) {\n for (int kj = 0; kj < 3; kj++) {\n dotProduct += edgeKernel[ki][kj] * imageIn[(i + ki - 1) * N + (j + kj - 1)];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, dotProduct));\n }\n }\n\n // Set the boundaries to 0\n for (size_t i = 0; i < N; i++) {\n imageOut[i] = 0;\n imageOut[i * N] = 0;\n imageOut[i * N + N - 1] = 0;\n }\n for (size_t j = 0; j < N; j++) {\n imageOut[j * N + N - 1] = 0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows\n int rowsPerRank = N / nprocs;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Pad the image with zeros\n std::vector paddedImage(N + 2 * rowsPerRank, 0);\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n paddedImage[i + rowsPerRank + 1][j + rowsPerRank + 1] = imageIn[i * N + j];\n }\n }\n\n // Convolve the kernel with the padded image\n #pragma omp parallel for\n for (int i = startRow + 1; i < endRow + 1; i++) {\n for (int j = 1; j < N + 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += edgeKernel[k + 1][l + 1] * paddedImage[i + k][j + l];\n }\n }\n imageOut[(i - rowsPerRank - 1) * N + j - 1] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int 
i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Rank 0 gathers the results from all other ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min((rank + 1) * rowsPerRank, N);\n\n // Each rank computes its subset of the output image\n std::vector localImageOut(N * N);\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(localImageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int localN = N / nprocs;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n std::vector localImageIn(localN * N);\n std::vector localImageOut(localN * N);\n\n MPI_Scatter(imageIn.data(), localN * N, MPI_INT, localImageIn.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < localN; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= localN || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += localImageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), localN * N, MPI_INT, imageOut.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x < 0 || x >= static_cast(N) || y < 0 || y >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edge[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image.\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank.\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int 
ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Each rank computes a portion of the output image\n size_t myStartRow = myRank * N / numRanks;\n size_t myEndRow = (myRank + 1) * N / numRanks;\n\n // Compute the convolution for each pixel in the assigned rows\n #pragma omp parallel for\n for (size_t i = myStartRow; i < myEndRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n\n // Apply the kernel to the pixel and its neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = static_cast(i) + k;\n int neighborY = static_cast(j) + l;\n\n // Handle pixels outside the image boundaries\n if (neighborX < 0 || neighborX >= static_cast(N) || neighborY < 0 || neighborY >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the partial results from all ranks on rank 0\n if (myRank == 0) {\n for (int i = 1; i < numRanks; ++i) {\n size_t startRow = i * N / numRanks;\n size_t endRow = (i + 1) * N / numRanks;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[myStartRow * N], (myEndRow - myStartRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each process computes a subset of the rows\n int rowsPerProcess = N / nprocs;\n int startRow = rank * rowsPerProcess;\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Allocate memory for the local portion of the output image\n std::vector localImageOut(rowsPerProcess * N);\n\n // Perform the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the local results into the global output image\n MPI_Gather(localImageOut.data(), rowsPerProcess * N, MPI_INT,\n imageOut.data(), rowsPerProcess * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. 
Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i, const int &j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (i + dx >= 0 && i + dx < N && j + dy >= 0 && j + dy < N) {\n sum += input(i + dx, j + dy);\n }\n }\n }\n output(i, j) = sum / 5;\n });\n}\n", + "Kokkosspecific", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t& i) {\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t& j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n sum += 
input(i + ii, j + jj);\n }\n }\n }\n output(i, j) = sum / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy>(N, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n using team_member = KokKOS::TeamMember;\n using team_view = KokKOS::TeamView;\n using team_view_const = KokKOS::TeamView;\n using view_type = KokKOS::View;\n using view_type_const = KokKOS::View;\n\n auto policy = KokKOS::TeamStriped<>(team_member(0, N), team_member(0, N));\n\n team_view output_team(\"output_team\", output, policy);\n team_view_const input_team(\"input_team\", input, policy);\n\n auto lambda = KOKKOS_LAMBDA(team_member member) {\n const size_t i = member.team_member_rank(0);\n const size_t j = member.team_member_rank(1);\n if (i == 0 || j == 0 || i == N - 1 || j == N - 1) {\n output_team(i, j) = input_team(i, j);\n } else {\n output_team(i, j) = (input_team(i, j - 1) + input_team(i, j + 1) + input_team(i - 1, j) + input_team(i + 1, j) + input_team(i, j)) / 5.0;\n }\n };\n\n auto result = KokKOS::parallel_reduce(\n output_team, lambda, KokKOS::PerTeamReduce(std::plus(), 0.0));\n output_team.wait_until_done();\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), [=](const int i) {\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), [=](const int j) {\n double left = (i == 0) ? 0 : input(i, j - 1);\n double right = (i == N - 1) ? 0 : input(i, j + 1);\n double up = (j == 0) ? 0 : input(i - 1, j);\n double down = (j == N - 1) ? 0 : input(i + 1, j);\n output(i, j) = (left + right + up + down + input(i, j)) / 5.0;\n });\n });\n}\n", + "\nfor (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy>({0, N}, {0, N}), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n sum += (i == 0 || j == 0) ? 0 : input(i - 1, j - 1);\n sum += (i == 0) ? 0 : input(i - 1, j);\n sum += (i == 0 || j == N - 1) ? 0 : input(i - 1, j + 1);\n sum += (j == 0) ? 0 : input(i, j - 1);\n sum += input(i, j);\n sum += (j == N - 1) ? 0 : input(i, j + 1);\n sum += (i == N - 1 || j == 0) ? 0 : input(i + 1, j - 1);\n sum += (i == N - 1) ? 0 : input(i + 1, j);\n sum += (i == N - 1 || j == N - 1) ? 
0 : input(i + 1, j + 1);\n output(i, j) = sum / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = 0;\n if (i > 0) output(i, j) += input(i - 1, j);\n if (i < N - 1) output(i, j) += input(i + 1, j);\n if (j > 0) output(i, j) += input(i, j - 1);\n if (j < N - 1) output(i, j) += input(i, j + 1);\n output(i, j) += input(i, j);\n output(i, j) /= 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i 
+ 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n output.resize(input.size());\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i - 1][j] + input[i + 1][j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 
5;\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "serial", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < N - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n output[i * N + j] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + 
(j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t idx = i * N + j;\n if (i > 0) sum += input[idx - N];\n if (i < N - 1) sum += input[idx + N];\n if (j > 0) sum += input[idx - 1];\n if (j < N - 1) sum += input[idx + 1];\n sum += input[idx];\n output[idx] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t idx = i * N + j;\n if (i > 0) sum += input[idx - N];\n if (i < N - 1) sum += input[idx + N];\n if (j > 0) sum += input[idx - 1];\n if (j < N - 1) sum += input[idx + 1];\n sum += input[idx];\n output[idx] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double 
sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "hip", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\n__global__ void jacobi2D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\nint i = blockIdx.x * blockDim.x + threadIdx.x;\nint j = blockIdx.y * blockDim.y + threadIdx.y;\nif (i < N && j < N) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) /\n 5;\n}\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n output[y * N + x] =\n (input[y * N + (x - 1 < 0 ? 0 : x - 1)] +\n input[y * N + (x + 1 >= N ? N - 1 : x + 1)] +\n input[(y - 1 < 0 ? 0 : y - 1) * N + x] +\n input[(y + 1 >= N ? N - 1 : y + 1) * N + x] + input[y * N + x]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = i > 0 ? input[i - 1 + j * N] : 0;\n double right = i < N - 1 ? input[i + 1 + j * N] : 0;\n double up = j > 0 ? input[i + (j - 1) * N] : 0;\n double down = j < N - 1 ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? 
input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * blockDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.y;\n\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (i < N && j < N) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n double left = (x > 0) ? input[y * N + x - 1] : 0;\n double right = (x < N - 1) ? input[y * N + x + 1] : 0;\n double up = (y > 0) ? input[(y - 1) * N + x] : 0;\n double down = (y < N - 1) ? input[(y + 1) * N + x] : 0;\n double center = input[y * N + x];\n output[y * N + x] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i >= N || j >= N) {\n return;\n }\n double sum = 0;\n if (i > 0) {\n sum += input[i - 1 + j * N];\n }\n if (i < N - 1) {\n sum += input[i + 1 + j * N];\n }\n if (j > 0) {\n sum += input[i + (j - 1) * N];\n }\n if (j < N - 1) {\n sum += input[i + (j + 1) * N];\n }\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? 
input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i < N && j < N) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i >= N || j >= N) return;\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) /\n 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + ",", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x >= 0 && x < N && y >= 0 && y < N) {\n double left = (x > 0) ? input[y * N + x - 1] : 0;\n double right = (x < N - 1) ? input[y * N + x + 1] : 0;\n double up = (y > 0) ? input[(y - 1) * N + x] : 0;\n double down = (y < N - 1) ? input[(y + 1) * N + x] : 0;\n double center = input[y * N + x];\n output[y * N + x] = (left + right + up + down + center) / 5;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t local_N = N / size;\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n if (rank != 0) {\n MPI_Send(&output[start * N], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N * N], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n int nCols = N;\n std::vector localInput(nRows * nCols);\n std::vector localOutput(nRows * nCols);\n MPI_Scatter(input.data(), nRows * nCols, MPI_DOUBLE, localInput.data(), nRows * nCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (int i = 0; i < nRows; i++) {\n for (int j = 0; j < nCols; j++) {\n double left = (j == 0) ? 0 : localInput[i * nCols + j - 1];\n double right = (j == nCols - 1) ? 0 : localInput[i * nCols + j + 1];\n double up = (i == 0) ? 0 : localInput[(i - 1) * nCols + j];\n double down = (i == nRows - 1) ? 
0 : localInput[(i + 1) * nCols + j];\n localOutput[i * nCols + j] = (left + right + up + down + localInput[i * nCols + j]) / 5;\n }\n }\n MPI_Gather(localOutput.data(), nRows * nCols, MPI_DOUBLE, output.data(), nRows * nCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = (rank + 1) * local_N;\n\n // Compute the local output grid\n std::vector local_output(local_N * N, 0);\n\n // Compute the local jacobi stencil\n for (size_t i = 1; i < local_N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n local_output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local output grids into the global output grid\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the average in the local output\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t 
local_N = N / size;\n\n // Compute the offset of the local grid\n size_t offset = rank * local_N;\n\n // Compute the local grid\n std::vector local_input(local_N * N);\n std::vector local_output(local_N * N);\n\n // Copy the relevant part of the input grid to the local grid\n MPI_Scatter(input.data(), local_N * N, MPI_DOUBLE, local_input.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local jacobi iteration\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the output grid on rank 0\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min((rank + 1) * local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N * N], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += 
input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output grid\n std::vector local_output(local_N * N, 0);\n\n // Iterate over the local grid\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output grid\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local output grids into the global output grid\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Iterate over the local grid\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the average in the output grid\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(&output[start][0], local_N * N, MPI_DOUBLE, &output[0][0], local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * 
local_N;\n size_t end = (rank + 1) * local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N, 0);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min(start_row + local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N);\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min((rank + 1) * local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < N - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n\n // Store the average in the output\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n 
// Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows and columns per rank\n int rows_per_rank = N / size;\n int cols_per_rank = N;\n\n // Compute the starting row and column for this rank\n int start_row = rank * rows_per_rank;\n int start_col = 0;\n\n // Compute the ending row and column for this rank\n int end_row = start_row + rows_per_rank;\n int end_col = start_col + cols_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * cols_per_rank);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = start_col; j < end_col; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local results\n local_output[(i - start_row) * cols_per_rank + (j - start_col)] = avg;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), rows_per_rank * cols_per_rank, MPI_DOUBLE, output.data(), rows_per_rank * cols_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_size = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_size;\n size_t end = (rank + 1) * local_size;\n\n // Compute the local output\n std::vector local_output(local_size * N);\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_size * N, MPI_DOUBLE, output.data(), local_size * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the number of rows per process\n int rows_per_proc = N / num_procs;\n\n // Compute the starting and ending rows for this process\n int start_row = rank * rows_per_proc;\n int end_row = (rank + 1) * rows_per_proc;\n\n // Iterate over the rows assigned to this process\n for (int i = start_row; i < end_row; ++i) {\n // Iterate over the columns\n for (int j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the output\n output[i * N + j] = avg;\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), N * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible 
for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "cuda", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\n__global__ void jacobi2D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] +\n input[i * N + j + N] + input[i * N + j]) /\n 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i == 0) ? 0 : input[i - 1 + j * N];\n double right = (i == N - 1) ? 0 : input[i + 1 + j * N];\n double up = (j == 0) ? 0 : input[i + (j - 1) * N];\n double down = (j == N - 1) ? 
0 : input[i + (j + 1) * N];\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0245059046, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002198336035 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i == 0) ? 0 : input[i - 1 + j * N];\n double right = (i == N - 1) ? 0 : input[i + 1 + j * N];\n double up = (j == 0) ? 0 : input[i + (j - 1) * N];\n double down = (j == N - 1) ? 0 : input[i + (j + 1) * N];\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0245747326, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023549440205 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i >= N || j >= N) return;\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\nint i = blockIdx.x + blockIdx.y * blockDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] +\n input[i * N + j - 1] + input[i * N + j + 1] +\n input[i * N + j]) /\n 5;\n }\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, 
+ "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i >= N || j >= N) return;\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + (j - 1 + N) % N] + input[i * N + (j + 1) % N] +\n input[(i - 1 + N) % N * N + j] + input[(i + 1) % N * N + j] +\n input[i * N + j]) /\n 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0244232566, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00090531200171 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x + blockIdx.y * gridDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.x;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? 
input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0244334398, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023731199801 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n output[y * N + x] = (input[y * N + x - 1] + input[y * N + x + 1] + input[(y - 1) * N + x] + input[(y + 1) * N + x] + input[y * N + x]) / 5;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? 
input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0244670516, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002372608006 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n double sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n sum += input[y * N + x];\n output[y * N + x] = sum / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0245540974, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023676159978 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x + blockDim.x * gridDim.x;\n int j = blockIdx.y + blockDim.y * gridDim.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different portion of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the result in the output\n output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local grid\n std::vector local_input(local_N * N);\n std::vector local_output(local_N * N);\n for (size_t i = 0; i < local_N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_input[i * N + j] = input[(start + i) * N + j];\n }\n }\n\n // Compute the local jacobi stencil\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n_local = N / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n for (int j = 0; j < N; j++) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * n_local * N], n_local * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start * N], n_local * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local grid\n std::vector local_input(input.begin() + start * N, input.begin() + end * N);\n std::vector local_output(local_N * N);\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] +\n local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] +\n local_input[i * N + j]) /\n 5.0;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min(start_row + local_N, N);\n\n // Create a local copy of the input\n std::vector local_input(local_N * N);\n MPI_Scatter(input.data(), N * N, MPI_DOUBLE, local_input.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Create a local copy of the output\n std::vector local_output(local_N * N);\n\n // Compute the local part of the jacobi stencil\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < end_row - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n size_t start_row = rank * local_N;\n size_t end_row = start_row + local_N;\n\n // Compute the local part of the output\n std::vector local_output(local_N * N, 0);\n #pragma omp parallel for\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global 
output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local portion of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n // Check if we are on the boundary\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n // If we are on the boundary, set the value to 0\n local_output[i * N + j] = 0;\n } else {\n // Otherwise, compute the average of the neighbors\n sum += input[(i - 1) * N + j];\n sum += input[(i + 1) * N + j];\n sum += input[i * N + (j - 1)];\n sum += input[i * N + (j + 1)];\n sum += input[i * N + j];\n local_output[i * N + j] = sum / 5;\n }\n }\n }\n\n // Gather the local results into the output on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the jacobi stencil for the subset of rows assigned to this rank\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the local subset of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int 
rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local size of the grid\n size_t local_N = N / nprocs;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n // Check if we are at the boundary\n if (i == 0) {\n sum += 0;\n } else {\n sum += input[i - 1][j];\n }\n if (i == N - 1) {\n sum += 0;\n } else {\n sum += input[i + 1][j];\n }\n if (j == 0) {\n sum += 0;\n } else {\n sum += input[i][j - 1];\n }\n if (j == N - 1) {\n sum += 0;\n } else {\n sum += input[i][j + 1];\n }\n sum += input[i][j];\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[start][0], local_N * N, MPI_DOUBLE, &output[0][0], local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the local subset of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n local_output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output on rank 0\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE, output.data(), output.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local size of the grid\n size_t local_size = N / nprocs;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_size;\n size_t end = (rank + 1) * local_size;\n\n // Compute the local grid\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size * local_size);\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = 1; i < local_size - 1; i++) {\n for (size_t j = 1; j < local_size - 1; j++) {\n local_output[i * local_size + j] = (local_input[i * local_size + j - 1] + local_input[i * local_size + j + 1] +\n local_input[(i - 1) * local_size + j] + local_input[(i + 1) * local_size + j] +\n local_input[i * local_size + j]) /\n 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_size * local_size, MPI_DOUBLE, output.data(), local_size * local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local bounds\n size_t local_N = N / size;\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute local jacobi2D\n #pragma omp parallel for\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i][j] = 
sum / 5;\n }\n }\n\n // Gather results to rank 0\n if (rank != 0) {\n MPI_Send(&output[start][0], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N][0], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_size = (N + size - 1) / size;\n size_t start = rank * local_size;\n size_t end = std::min((rank + 1) * local_size, N);\n\n // Compute the local part of the output\n std::vector local_output(local_size * N);\n #pragma omp parallel for\n for (size_t i = 0; i < local_size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < local_size - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_size * N, MPI_DOUBLE, output.data(), local_size * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N;\n int nCols = N;\n int nRowsPerRank = nRows / size;\n int startRow = rank * nRowsPerRank;\n int endRow = (rank + 1) * nRowsPerRank;\n if (rank == size - 1) {\n endRow = nRows;\n }\n #pragma omp parallel for collapse(2)\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < nCols; j++) {\n double left = (j == 0) ? 0 : input[i * nCols + j - 1];\n double right = (j == nCols - 1) ? 0 : input[i * nCols + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * nCols + j];\n double down = (i == nRows - 1) ? 
0 : input[(i + 1) * nCols + j];\n output[i * nCols + j] = (left + right + up + down + input[i * nCols + j]) / 5;\n }\n }\n if (rank != 0) {\n MPI_Send(&output[startRow * nCols], (endRow - startRow) * nCols, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * nRowsPerRank;\n int endRow = (i + 1) * nRowsPerRank;\n if (i == size - 1) {\n endRow = nRows;\n }\n MPI_Recv(&output[startRow * nCols], (endRow - startRow) * nCols, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) { sum += input[i - 1][j]; }\n if (i < N - 1) { sum += input[i + 1][j]; }\n if (j > 0) { sum += input[i][j - 1]; }\n if (j < N - 1) { sum += input[i][j + 1]; }\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the local portion of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = N / nprocs;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = start_row + rows_per_rank;\n\n // Create a buffer to store the results for this rank\n std::vector local_output(N * rows_per_rank);\n\n // Compute the Jacobi stencil for this rank\n #pragma omp parallel for\n for (size_t i 
= start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n MPI_Gather(local_output.data(), N * rows_per_rank, MPI_DOUBLE, output.data(), N * rows_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + } +] \ No newline at end of file diff --git a/results/015cff6f/gemini-pro_temp0.2/hip.json b/results/015cff6f/gemini-pro_temp0.2/hip.json new file mode 100644 index 0000000..50209ab --- /dev/null +++ b/results/015cff6f/gemini-pro_temp0.2/hip.json @@ -0,0 +1,32498 @@ +[ + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); j++) {\n for (int k = j+1; k < points.extent(0); k++) {\n double area = triangleArea(points(i), points(j), points(k));\n minArea = std::min(minArea, area);\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy>(\n 0, points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const Kokkos::IndexType i, double &minArea) {\n for (Kokkos::IndexType j = i + 1; j < points.extent(0); ++j) {\n for (Kokkos::IndexType k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n },\n minArea);\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "0.0)", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n double area = 
triangleArea(points(i), points(j), points(k));\n minArea = std::min(minArea, area);\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(\"smallestArea\", points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); j++) {\n for (int k = j+1; k < points.extent(0); k++) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using exec_space = Kokkos::DefaultExecutionSpace;\n using view_type = Kokkos::View;\n\n const int num_points = points.extent(0);\n view_type areas(\"areas\", num_points * num_points * num_points);\n\n parallel_for(policy_type(0, num_points * num_points * num_points), KOKKOS_LAMBDA(const int i) {\n const int i1 = i / (num_points * num_points);\n const int i2 = (i / num_points) % num_points;\n const int i3 = i % num_points;\n if (i1 != i2 && i1 != i3 && i2 != i3) {\n areas(i) = triangleArea(points(i1), points(i2), points(i3));\n } else {\n areas(i) = std::numeric_limits::max();\n }\n });\n\n return *min_element(areas.data(), areas.data() + areas.extent(0));\n}\n", + "\n\n double area = 0;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return area;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(\"smallestArea\", policy, KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n \"smallestArea\",\n policy_type(0, points.extent(0) * points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const int& i) {\n const int i1 = i / (points.extent(0) * points.extent(0));\n const int i2 = (i / points.extent(0)) % points.extent(0);\n const int i3 = i % points.extent(0);\n if (i1 != i2 && i2 != i3 && i1 != i3) {\n minArea = std::min(minArea, triangleArea(points(i1), points(i2), points(i3)));\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), 
points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n policy_type(0, points.extent(0) - 2),\n KOKKOS_LAMBDA(const int i) {\n for (int j = i + 1; j < points.extent(0) - 1; ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(\"smallestArea\", points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const int& i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n \"smallestArea\",\n policy_type(0, points.extent(0) * points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const int i) {\n const int i1 = i / (points.extent(0) * points.extent(0));\n const int i2 = (i / points.extent(0)) % points.extent(0);\n const int i3 = i % points.extent(0);\n if (i1 != i2 && i1 != i3 && i2 != i3) {\n minArea = std::min(minArea, triangleArea(points(i1), points(i2), points(i3)));\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = 
std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n#pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for 
(size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double total_area = 0;\n #pragma omp parallel for reduction(+: total_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n total_area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return total_area;\n}\n", + "\n\n double area = 0;\n#pragma omp parallel for reduction(+:area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return area;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min:minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 
5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], 
points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "<", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n if (n < 3) {\n return 0;\n }\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double area = 0;\n for (size_t i = 0; i < points.size(); ++i) {\n area += ::Area(points[i], points[(i + 1) % points.size()], points[(i + 2) % points.size()]);\n }\n return area;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * fabs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Compute the area of the smallest triangle that can be formed by any 3 points.\n Return the result in area.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\n__global__ void smallestArea(const Point *points, size_t N, double *area) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid >= N) return;\n\n double minArea = DBL_MAX;\n\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double area = triangleArea(points[gid], points[i], points[j]);\n if (area < minArea) minArea = area;\n }\n }\n\n if (tid == 0) *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.08878586730957 + } + ] + }, + { + "generated_output": "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double a = triangleArea(points[i], points[j], points[k]);\n minArea = min(minArea, a);\n }\n }\n }\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double minArea = 1e100;\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n for (int k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid >= N) {\n return;\n }\n double minArea = triangleArea(points[tid], points[(tid + 1) % N], points[(tid + 2) % N]);\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n minArea = fmin(minArea, triangleArea(points[tid], points[i], points[j]));\n }\n }\n area[tid] = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.08219290008545 + } + ] + }, + { + "generated_output": "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + 
"generated_output": "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double minArea = triangleArea(points[0], points[1], points[2]);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n int tid = get_global_id(0);\n if (tid < N) {\n area[tid] = ::Area(points[tid], points[(tid + 1) % N], points[(tid + 2) % N]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Point A = points[threadIdx.x];\n Point B = points[threadIdx.y];\n Point C = points[threadIdx.z];\n double t = triangleArea(A, B, C);\n atomicMin(area, t);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.000134655606 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Each thread computes the area of the triangle formed by itself and the next two points.\n // The smallest area is stored in the shared memory.\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n int tid = threadIdx.x;\n if (tid < N - 2) {\n double area = triangleArea(points[tid], points[tid + 1], points[tid + 2]);\n if (area < minArea) {\n minArea = area;\n }\n }\n\n // Reduce the shared memory to find the smallest area.\n __syncthreads();\n if (tid == 0) {\n *area = minArea;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": 3.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00012835120298 + } + ] + }, + { + "generated_output": "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n // Return the result in area.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n __shared__ double minArea;\n __shared__ Point minA, minB, minC;\n\n double localArea = triangleArea(points[threadIdx.x], points[threadIdx.y], points[threadIdx.z]);\n if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {\n minArea = localArea;\n minA = points[threadIdx.x];\n minB = points[threadIdx.y];\n minC = points[threadIdx.z];\n }\n\n __syncthreads();\n\n if (localArea < minArea) {\n minArea = localArea;\n minA = points[threadIdx.x];\n minB = points[threadIdx.y];\n minC = points[threadIdx.z];\n }\n\n __syncthreads();\n\n if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {\n *area = minArea;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00013407959566 + } + ] + }, + { + "generated_output": "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.07690810546875 + } + ] + }, + { + "generated_output": "\n\n // Compute the area of the smallest triangle formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Store the result in the output array.\n *area = minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * 
std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the areas to find the minimum\n double minArea;\n MPI_Reduce(&areas[0], &minArea, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the minimum area from process 0\n if (rank == 0) {\n return minArea;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n minArea = std::min(minArea, area);\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int myRank, numProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numProcs);\n\n // Each process computes the area of all possible triangles\n std::vector localAreas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n localAreas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all areas to the root process\n std::vector globalAreas;\n MPI_Gather(localAreas.data(), localAreas.size(), MPI_DOUBLE,\n globalAreas.data(), localAreas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double smallestArea = std::numeric_limits::max();\n if (myRank == 0) {\n for (double area : globalAreas) {\n smallestArea = 
std::min(smallestArea, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&smallestArea, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return smallestArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all areas to the root process\n std::vector global_areas;\n MPI_Gather(local_areas.data(), local_areas.size(), MPI_DOUBLE,\n global_areas.data(), local_areas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double smallest_area = std::numeric_limits::max();\n if (rank == 0) {\n for (double area : global_areas) {\n smallest_area = std::min(smallest_area, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&smallest_area, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return smallest_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of points per process\n int points_per_process = points.size() / size;\n\n // Compute the starting and ending indices for this process\n int start = rank * points_per_process;\n int end = (rank + 1) * points_per_process;\n\n // Compute the smallest area for this process\n double min_area = std::numeric_limits::max();\n for (int i = start; i < end; ++i) {\n for (int j = i + 1; j < end; ++j) {\n for (int k = j + 1; k < end; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < min_area) {\n min_area = area;\n }\n }\n }\n }\n\n // Reduce the minimum areas from all processes\n double global_min_area;\n MPI_Allreduce(&min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN);\n\n return global_min_area;\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the areas to find the smallest one\n double smallest;\n MPI_Allreduce(areas.data(), &smallest, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return smallest;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n 
local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the global minimum\n double global_min;\n MPI_Allreduce(local_areas.data(), &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n // Find the minimum area triangle\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Return the minimum area\n return minArea;\n}\n\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(&local_areas[0], &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area from process 0\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1; // Other processes return -1 to avoid printing multiple times\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the areas to find the smallest one\n double smallestArea;\n MPI_Allreduce(areas.data(), &smallestArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return smallestArea;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each process computes the area of the smallest triangle formed by any 3 of its points\n double min_area = std::numeric_limits::max();\n for (size_t i = my_rank; i < points.size(); i += num_procs) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n min_area = std::min(min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the minimum area across all processes\n double global_min_area;\n MPI_Allreduce(&min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all local areas into a single vector on process 0\n std::vector global_areas;\n if (rank == 0) {\n global_areas.resize(local_areas.size() * size);\n }\n MPI_Gather(local_areas.data(), local_areas.size(), MPI_DOUBLE,\n global_areas.data(), local_areas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // 
Find the smallest area\n double min_area = std::numeric_limits::max();\n if (rank == 0) {\n for (double area : global_areas) {\n min_area = std::min(min_area, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&min_area, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n for (size_t i = rank; i < points.size(); i += size) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "&" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * fabs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Compute the area of the smallest triangle that can be formed by any 3 points.\n Return the result in area.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\n__global__ void smallestArea(const Point *points, size_t N, double *area) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Each thread computes the area of a triangle formed by itself and the next two points.\n // The minimum area is stored in the shared memory.\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N - 2) {\n double a = triangleArea(points[tid], points[tid + 1], points[tid + 2]);\n if (a < minArea) {\n minArea = a;\n }\n }\n __syncthreads();\n\n // The thread with the minimum area writes the result to the output pointer.\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid >= N) return;\n\n double minArea = DBL_MAX;\n\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double area = triangleArea(points[gid], points[i], points[j]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[gid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n double minArea = triangleArea(points[0], points[1], points[2]);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) 
{\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Each thread computes the area of a triangle formed by 3 points.\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[tid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n // Compute the area of the smallest triangle formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Store the result in the output array.\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n __shared__ Point minA, minB, minC;\n\n // Compute the area of all triangles\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n minA = points[i];\n minB = points[j];\n minC = points[k];\n }\n }\n }\n }\n\n // Reduce the minimum area across all threads\n __syncthreads();\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n // Return the result in area.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[tid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < minArea) {\n minArea = a;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Initialize the area to a large value\n *area = DBL_MAX;\n\n // Loop over all possible combinations of 3 points\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n // Compute the area of the triangle formed by the 3 points\n double triangle_area = triangleArea(points[i], points[j], points[k]);\n\n // Update the area if the triangle area is smaller than the current minimum\n if (triangle_area < *area) {\n *area = triangle_area;\n }\n }\n }\n }\n}\n", + "\n\n *area = DBL_MAX;\n for 
(size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < minArea) {\n minArea = a;\n }\n }\n }\n }\n *area = minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int num_points = points.size();\n int points_per_proc = num_points / num_procs;\n int start_index = my_rank * points_per_proc;\n int end_index = (my_rank + 1) * points_per_proc;\n if (my_rank == num_procs - 1) {\n end_index = num_points;\n }\n\n double local_area = 0.0;\n for (int i = start_index; i < end_index; i++) {\n for (int j = i + 1; j < num_points; j++) {\n for (int k = j + 1; k < num_points; k++) {\n local_area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n\n double global_area = 0.0;\n MPI_Reduce(&local_area, &global_area, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_area;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n // Use OpenMP to parallelize the loop over all possible triples of points\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], 
points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(&local_areas[0], &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1;\n }\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Allreduce(local_areas.data(), &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector localAreas;\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n localAreas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double globalMinArea;\n MPI_Allreduce(localAreas.data(), &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = points.size();\n double local_min = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: 
local_min)\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = points.size();\n double local_min = std::numeric_limits::max();\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n local_min = std::min(local_min, area);\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = points.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each process calculates the area of all possible triangles\n std::vector local_areas(n * (n - 1) * (n - 2) / 6);\n int count = 0;\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n local_areas[count++] = triangleArea(points[i], points[j], points[k]);\n }\n }\n }\n\n // Reduce the local areas to find the global minimum\n double global_min;\n MPI_Allreduce(local_areas.data(), &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = rank; i < points.size(); i += size) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(local_areas.data(), &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n int n = points.size();\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n minArea = std::min(minArea, area);\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN);\n return globalMinArea;\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int num_points = points.size();\n double min_area = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: min_area)\n for (int i = 0; i < num_points; i++) {\n for (int j = i + 1; j < num_points; j++) {\n for (int k = j + 1; k < num_points; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n min_area = std::min(min_area, area);\n }\n }\n }\n\n return min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &minArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "kokkos", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Create a team policy for parallel execution\n TeamPolicy policy(x.extent(0));\n\n // Define the kernel lambda\n auto kernel = KOKKOS_LAMBDA(const TeamPolicy<> &policy, int i) {\n double min_dist = std::numeric_limits::max();\n for (int j = i + 1; j < x.extent(0); ++j) {\n min_dist = std::min(min_dist, distance(x(i), x(j)));\n }\n return min_dist;\n };\n\n // Execute the kernel in parallel\n auto min_dists = parallel_reduce(policy, kernel);\n\n // Find the minimum distance among all pairs\n double min_dist = std::numeric_limits::max();\n for (int i = 0; i < min_dists.extent(0); ++i) {\n min_dist = std::min(min_dist, min_dists(i));\n }\n\n return min_dist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist[0] = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist[0]) {\n\t\t\t\tmin_dist[0] = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist[0];\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double &min_dist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist(0);\n}\n", + "\n\n\tconst int n = x.extent(0);\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(\"closestPair\", Kokkos::RangePolicy(0, n - 1), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing policy = Kok\u043a\u043e\u0441::Range\u653f\u7b56;\n\tusing view_type = Kok\u043a\u043e\u0441::View;\n\n\tview_type min_dist(\"min_dist\", std::numeric_limits::max());\n\n\tauto lambda = KOKKOS_LAMBDA(const double x1, const double x2) {\n\t\tdouble dist = ::abs(x1 - x2);\n\t\tif (dist < min_dist()) {\n\t\t\tmin_dist() = dist;\n\t\t}\n\t};\n\n\t// Iterate over all possible un-sorted pair of elements\n\t// and find the minimum distance between them.\n\t// Note that we are iterating over all the elements in the vector\n\t// multiple times. 
This is not efficient.\n\t// A more efficient approach would be to sort the vector first\n\t// and then only consider the minimum distance between each element\n\t// and its immediate neighbor.\n\t// However, sorting is not supported by Kok\u043a\u043e\u0441.\n\t// So, we are using this simple approach for now.\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < x.size(); ++j) {\n\t\t\tlambda(x[i], x[j]);\n\t\t}\n\t}\n\n\treturn min_dist();\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int i, double &min_dist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist(0);\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\tKok\u30b3\u30af::TaskGraph tg;\n\tKok\u30b3\u30af::Task task([&](Kok\u30b3\u30af::TaskContext ctx) -> double {\n\t\tdouble min = std::abs(x[0] - x[1]);\n\t\tfor (size_t i = 0; i < x.size(); i++) {\n\t\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\t\tmin = std::min(min, distance(x[i], x[j]))\n\t\t\t}\n\t\t}\n\t\treturn min;\n\t});\n\ttg.run(task);\n\treturn task.get();\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::RangePolicy;\n\tusing tag_type = Kokkos::pair;\n\n\tauto min_distance = Kokkos::parallel_reduce(\n\t\tpolicy_type(0, x.size() * x.size()),\n\t\tKOKKOS_LAMBDA(const tag_type &idx, double &min_distance) {\n\t\t\tint i = idx.first / x.size();\n\t\t\tint j = idx.first % x.size();\n\t\t\tif (i != j) {\n\t\t\t\tmin_distance = std::min(min_distance, distance(x(i), x(j)));\n\t\t\t}\n\t\t},\n\t\tmin_distance);\n\n\treturn min_distance;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in x.\n\tKokkos::View distances(\"distances\", x.size() * x.size());\n\tKokkos::parallel_for(Kokkos::RangePolicy(0, x.size() * x.size()), KOKKOS_LAMBDA(const int i) {\n\t\tdistances(i) = distance(x(i / x.size()), x(i % x.size()));\n\t});\n\n\t// Find the minimum distance.\n\tdouble minDistance = Kokkos::min_reduce(distances, Kokkos::Min());\n\n\treturn minDistance;\n}\n", + "\n\n\tKokkos::View min_dist(\"min_dist\", 1);\n\tmin_dist[0] = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) 
{\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist[0]) {\n\t\t\t\tmin_dist[0] = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist[0];\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist(0) = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tdouble d = distance(x(i), x(j));\n\t\t\tif (d < min_dist(0)) {\n\t\t\t\tmin_dist(0) = d;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist(0);\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int &i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\n\treturn minDist;\n}\n", + "\n\n\tconst int n = x.extent(0);\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tauto min_distance = Kokkos::parallel_reduce(\n\t\tx.extent(0), KOKKOS_LAMBDA(const int i, double min) {\n\t\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\t\tmin = std::min(min, distance(x(i), x(j)));\n\t\t\t}\n\t\t\treturn min;\n\t\t},\n\t\tstd::numeric_limits::max());\n\n\treturn min_distance;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist(0) = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tdouble dist = distance(x(i), x(j));\n\t\t\tif (dist < min_dist(0)) {\n\t\t\t\tmin_dist(0) = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist(0);\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "omp", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use OpenMP to compute in parallel.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor 
(int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble min_distance = std::numeric_limits::max();\n\tint n = x.size();\n\t#pragma omp parallel for reduction(min: min_distance)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_distance = std::min(min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "&", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_distance = std::abs(x[0] - x[1]);\n#pragma openmp parallel for reduction(min:min_distance)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_distance = std::min(min_distance, distance(x[i], x[j]);\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n#pragma omp parallel for reduction(min: min_dist)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = 
std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n - 1; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "serial", + "prompt": "double distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 1; i < x.size(); ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i - 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn -1;\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = distance(x[0], x[1]);\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn 
std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tminDist = std::min(minDist, distance(x[i], x[i + 1]));\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = distance(x[0], x[1]);\n\tfor (size_t i = 1; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn -1;\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble minDist = distance(x[0], x[1]);\n\tfor (int i = 1; i < x.size() - 1; ++i) {\n\t\tminDist = std::min(minDist, distance(x[i], x[i + 1]));\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 1; i < x.size(); ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i - 1]));\n\t}\n\treturn min_dist;\n}\n", + "(", + "\n\n double min_dist = std::numeric_limits::max();\n for (size_t i = 0; i < 
x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n min_dist = std::min(min_dist, distance(x[i], x[j]));\n }\n }\n return min_dist;\n}\n", + "\n\n double min_distance = std::numeric_limits::max();\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n min_distance = std::min(min_distance, distance(x[i], x[j]));\n }\n }\n return min_distance;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "hip", + "prompt": "__device__ double distanceBetweenPoints(double x1, double x2) {\n\treturn fabs(x1 - x2);\n}\n\n/* Compute the distance between the closest two elements in the vector x.\n Store the result in distance.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\n__global__ void closestPair(const double *x, size_t N, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tdouble minDist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tfor (int i = tid + 2; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\tdistance[tid] = minDist;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.6e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00402050962448 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= N) {\n\t\treturn;\n\t}\n\n\tdouble minDist = distanceBetweenPoints(x[gid], x[gid + 1]);\n\tfor (int i = gid + 2; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[gid], x[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.6e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00399919538498 + } + ] + }, + { + "generated_output": "losest", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tdouble minDistance = DBL_MAX;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble 
dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDistance) {\n\t\t\t\tminDistance = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDistance;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 6e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 21.09374453125 + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.8e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02857972412109 + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02861863937378 + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint grid_size = gridDim.x;\n\tint i = bid * bid_size + tid;\n\tif (i >= N) {\n\t\treturn;\n\t}\n\tdouble min_dist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t}\n\tdistance[tid] = min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.8e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00399328379631 + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02860203857422 + } + ] + }, + { + "generated_output": "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], 
x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\nfor (size_t i = 0; i < N; ++i) {\n\t\tdistance[i] = distance(x[i], x[i + 1]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint bid_size = hipBlockDim_x;\n\n\tdouble min_dist = DBL_MAX;\n\n\tfor (int i = bid * bid_size + tid; i < N; i += bid_size) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.03162168159485 + } + ] + }, + { + "generated_output": "\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tif (tid == 0 || dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.8e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00014150360543 + } + ] + }, + { + "generated_output": "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02861169700623 + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble min_dist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02862257461548 + } + ] + }, + { + "generated_output": "\n\n\t// Compute the distance between each pair of elements in 
the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "losest", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tint blockDim = blockDim.x;\n\tint blockIdx = blockIdx.x;\n\tint i = blockIdx * blockDim + tid;\n\n\tdouble min_dist = DBL_MAX;\n\twhile (i < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[i + 1]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t\ti += blockDim * gridDim.x;\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.8e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02862077026367 + } + ] + }, + { + "generated_output": "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble min_dist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.6e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0285620349884 + } + ] + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "mpi", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use MPI to compute in parallel. 
Assume MPI is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process finds the closest pair in its local part of the vector.\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j += size) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between its elements\n\tfor (int i = my_rank; i < n; i += num_procs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = n / size;\n\tint start = rank * chunk_size;\n\tint end = start + chunk_size;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn globalMinDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], 
x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = (n + size - 1) / size;\n\tint start = rank * chunk_size;\n\tint end = std::min(start + chunk_size, n);\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tstd::vector local_x(x.begin() + start, x.begin() + end);\n\tstd::sort(local_x.begin(), local_x.end());\n\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < local_n - 1; i++) {\n\t\tlocal_min_dist = std::min(local_min_dist, distance(local_x[i], local_x[i + 1]));\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tif (rank == size - 1) {\n\t\tend = x.size();\n\t}\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, 
MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\tint n = x.size();\n\tint local_n = n / num_procs;\n\tint start = my_rank * local_n;\n\tint end = (my_rank + 1) * local_n;\n\tif (my_rank == num_procs - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between its elements\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn globalMinDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the distance between its elements and the elements of the next process.\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = (rank + 1) * local_size;\n\tif (rank == size - 1) {\n\t\tend = x.size();\n\t}\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 
1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = n / size;\n\tint start = rank * chunk_size;\n\tint end = (rank + 1) * chunk_size;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the distance between its elements\n\tstd::vector local_distances;\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tfor (size_t j = i + 1; j < x.size(); j += size) {\n\t\t\tlocal_distances.push_back(distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Gather all distances to the root process\n\tstd::vector global_distances;\n\tMPI_Gather(local_distances.data(), local_distances.size(), MPI_DOUBLE,\n\t\tglobal_distances.data(), local_distances.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the minimum distance\n\tdouble min_distance = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double d : global_distances) {\n\t\t\tmin_distance = std::min(min_distance, d);\n\t\t}\n\t}\n\n\t// Broadcast the minimum distance to all processes\n\tMPI_Bcast(&min_distance, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn min_distance;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "cuda", + "prompt": "__device__ double distanceBetweenPoints(double x1, double x2) {\n\treturn fabs(x1 - x2);\n}\n\n/* Compute the distance between the closest two elements in the vector x.\n Store the result in distance.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\n__global__ void closestPair(const double *x, size_t N, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= N) return;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = bid * blockDim.x + tid;\n\n\tif (i >= N) {\n\t\treturn;\n\t}\n\n\tdouble min_dist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint blockId = blockIdx.x;\n\tint blockDim = blockDim.x;\n\tint gridDim = gridDim.x;\n\n\t// Each thread computes the distance between its element and the next element.\n\tif (tid < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tif (tid == 0 || dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n\n\t// Each block reduces the distances computed by its threads.\n\t__shared__ double blockDistance;\n\tif (tid == 0) {\n\t\tblockDistance = *distance;\n\t}\n\t__syncthreads();\n\tif (tid < blockDim - 1) {\n\t\tdouble dist = distanceBetweenPoints(blockDistance, x[blockId * blockDim + tid + 1]);\n\t\tif (dist < blockDistance) {\n\t\t\tblockDistance = dist;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*distance = blockDistance;\n\t}\n\n\t// The block with the smallest distance broadcasts it to all other blocks.\n\t__shared__ double gridDistance;\n\tif (blockId == 0) {\n\t\tgridDistance = *distance;\n\t}\n\t__syncthreads();\n\tif (blockId < gridDim - 1) {\n\t\tdouble dist = distanceBetweenPoints(gridDistance, x[(blockId + 1) * blockDim]);\n\t\tif (dist < gridDistance) {\n\t\t\tgridDistance = dist;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (blockId == 0) {\n\t\t*distance = gridDistance;\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\t__shared__ double minDistance;\n\tminDistance = DBL_MAX;\n\n\tfor (size_t i = threadIdx.x; i < N - 1; i += blockDim.x) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDistance) {\n\t\t\t\tminDistance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (threadIdx.x == 0) {\n\t\t*distance = minDistance;\n\t}\n}\n", + "\n\n\t// Each thread computes the distance between its element and all subsequent elements.\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\t// Compute the distance between the current element and all subsequent elements.\n\tfor (int i = tid + bid * stride; i < N - 1; i += stride) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[i + 1]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = 
DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\tdouble min_dist = DBL_MAX;\n\twhile (i < N) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t\ti += blockDim.x * gridDim.x;\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = min_dist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x * gridDim.x;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tdouble minDist = DBL_MAX;\n\twhile (i < N) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\ti += blockDim.x * gridDim.x;\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (i == tid + 1 || dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= N) {\n\t\treturn;\n\t}\n\tdouble minDist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) 
{\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\tdistance[tid] = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i < N - 1) {\n\t\tdistance[i] = distance(x[i], x[i + 1]);\n\t}\n}\n", + "\n\n\t// Each thread computes the distance between its element and all elements to its right.\n\tint tid = threadIdx.x;\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min : local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tint size;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = my_rank * local_n;\n\tint end = (my_rank + 1) * local_n;\n\tif (my_rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min : local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between any two elements in its local subvector.\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all processes to find the global minimum distance.\n\tMPI_Allreduce(&min_dist, 
&min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\t// Each process computes the distance between all pairs of elements in its local subvector.\n\tstd::vector local_min_distances(n / num_procs);\n\t#pragma omp parallel for\n\tfor (int i = my_rank * (n / num_procs); i < (my_rank + 1) * (n / num_procs); i++) {\n\t\tfor (int j = i + 1; j < (my_rank + 1) * (n / num_procs); j++) {\n\t\t\tlocal_min_distances[i - my_rank * (n / num_procs)] = std::min(local_min_distances[i - my_rank * (n / num_procs)], distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Each process sends its local minimum distance to the root process.\n\tstd::vector global_min_distances(num_procs);\n\tMPI_Gather(local_min_distances.data(), n / num_procs, MPI_DOUBLE, global_min_distances.data(), n / num_procs, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// The root process finds the minimum distance among all the local minimum distances.\n\tdouble global_min_distance = std::numeric_limits::max();\n\tif (my_rank == 0) {\n\t\tfor (int i = 0; i < num_procs; i++) {\n\t\t\tglobal_min_distance = std::min(global_min_distance, global_min_distances[i]);\n\t\t}\n\t}\n\n\t// Return the global minimum distance.\n\treturn global_min_distance;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tint n = x.size();\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint my_rank, nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the closest pair in its local subvector\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = my_rank; i < n; i += nprocs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, 
&min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min_dist)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\t// Get the number of processes and the rank of the current process\n\tint num_procs, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\t// Divide the vector into chunks and assign each chunk to a process\n\tint chunk_size = x.size() / num_procs;\n\tstd::vector local_x(x.begin() + rank * chunk_size, x.begin() + (rank + 1) * chunk_size);\n\n\t// Find the closest pair in the local chunk\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min_dist)\n\tfor (int i = 0; i < local_x.size(); i++) {\n\t\tfor (int j = i + 1; j < local_x.size(); j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(local_x[i], local_x[j]));\n\t\t}\n\t}\n\n\t// Gather the local minimum distances from all processes\n\tstd::vector global_min_dists(num_procs);\n\tMPI_Gather(&local_min_dist, 1, MPI_DOUBLE, global_min_dists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the smallest global minimum distance\n\tdouble global_min_dist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double dist : global_min_dists) {\n\t\t\tglobal_min_dist = std::min(global_min_dist, dist);\n\t\t}\n\t}\n\n\t// Broadcast the global minimum distance to all processes\n\tMPI_Bcast(&global_min_dist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tif (n <= 1) {\n\t\treturn -1;\n\t}\n\n\t// Sort the vector in ascending order\n\tstd::sort(x.begin(), x.end());\n\n\t// Find the closest pair in each half of the vector\n\tdouble minDist = distance(x[0], x[1]);\n\t#pragma omp parallel\n\t{\n\t\tint tid = omp_get_thread_num();\n\t\tint numThreads = omp_get_num_threads();\n\t\tint start = tid * n / numThreads;\n\t\tint end = (tid + 1) * n / numThreads;\n\t\tdouble localMinDist = minDist;\n\t\tfor (int i = start; i < end; i++) {\n\t\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\t\tif (dist < localMinDist) {\n\t\t\t\t\tlocalMinDist = dist;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t#pragma omp critical\n\t\t{\n\t\t\tif (localMinDist < minDist) {\n\t\t\t\tminDist = localMinDist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint my_rank, 
nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\t// Each process computes the closest pair in its local subvector.\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = my_rank; i < n; i += nprocs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum.\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the closest pair in its local subvector\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint n = x.size();\n\tint n_local = n / omp_get_num_threads();\n\tint tid = omp_get_thread_num();\n\tdouble min_local = std::numeric_limits::max();\n\t\n\t#pragma omp parallel reduction(min:min_local)\n\tfor (int i = tid * n_local; i < (tid + 1) * n_local; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_local = std::min(min_local, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble min_global;\n\tMPI_Allreduce(&min_local, &min_global, 1, MPI_DOUBLE, MPI_MIN);\n\treturn min_global;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Use OpenMP to parallelize the computation of distances between pairs of elements\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Reduce(&local_min_dist, 1, MPI_DOUBLE, &global_min_dist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance.\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t}\n\treturn -1;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n double min_dist = std::numeric_limits::max();\n\n // Compute the distance between each pair of points\n parallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int& i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n double dist = distance(points(i), points(j));\n if (dist < min_dist) {\n min_dist = dist;\n }\n }\n });\n\n return min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View distances(\"Distances\", points.size());\n\tKokkos::parallel_for(\"Compute distances\", points.size(), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tdistances[i] = std::min(distances[i], distance(points[i], points[j]));\n\t\t}\n\t});\n\treturn *Kokkos::min_element(distances);\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(\"Closest Pair\", points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tparallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tparallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int j) {\n\t\t\tif (i != j) {\n\t\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\t\tif (dist < minDist) {\n\t\t\t\t\tminDist = dist;\n\t\t\t\t}\n\t\t\t}\n\t\t});\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn 
minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tminDist = std::min(minDist, dist);\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing TeamPolicy = Kokkos::TeamPolicy>;\n\tusing TeamMember = Kokkos::TeamMember;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const TeamMember& team) {\n\t\tfor (int i = team.league_rank(); i < points.extent(0); i += team.league_size()) {\n\t\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\t\tminDist = std::min(minDist, distance(points(i), points(j)));\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing namespace std;\n\n\t// Initialize the minimum distance to a large value\n\tdouble minDist = numeric_limits::max();\n\n\t// Create a team policy for parallel execution\n\tTeamPolicy policy(Kokkos::DefaultExecutionSpace(), points.extent(0));\n\n\t// Define the kernel function to compute the distance between each pair of points\n\tKokkos::parallel_for(policy, KOKKOS_LAMBDA(const int& i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::RangePolicy;\n\tusing view_type = Kokkos::View;\n\n\tconst int num_points = points.extent(0);\n\tview_type distances(\"Distances\", num_points * num_points);\n\n\tKokkos::parallel_for(policy_type(0, num_points * num_points), KOKKOS_LAMBDA(const int i) {\n\t\tconst int p1_idx = i / num_points;\n\t\tconst int p2_idx = i % num_points;\n\t\tif (p1_idx != p2_idx) {\n\t\t\tdistances(i) = distance(points(p1_idx), points(p2_idx));\n\t\t}\n\t\telse {\n\t\t\tdistances(i) = std::numeric_limits::max();\n\t\t}\n\t});\n\n\tdouble min_distance = std::numeric_limits::max();\n\tfor (int i = 0; i < num_points * num_points; i++) {\n\t\tmin_distance = std::min(min_distance, distances(i));\n\t}\n\n\treturn min_distance;\n}\n", + "\n\n\tKok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::make_span(points.begin(), points.end()), 0.0,\n\t\t[](double d, const Point* p) {\n\t\t\treturn d + distance(*p, {0, 0});\n\t\t},\n\t\t[](double a, double b) { return a + b; });\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Initialize the minimum distance to a large value\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\t// Calculate the distance between the two points\n\t\t\tdouble dist = distance(points(i), points(j));\n\n\t\t\t// Update the minimum distance if necessary\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\t// Return the minimum distance\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all points\n\tfor (size_t i 
= 0; i < points.size(); ++i) {\n\t\t// Iterate over all points after i\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\t// Calculate the distance between the two points\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\t// Check if the distance is less than the current minimum distance\n\t\t\tif (dist < minDist) {\n\t\t\t\t// Update the minimum distance\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Return the minimum distance\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View distances(\"Distances\", points.extent(0));\n\tKokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdistances(i) = std::min(distances(i), distance(points(i), points(j)));\n\t\t}\n\t});\n\treturn Kokkos::min_reduce(Kokkos::RangePolicy(0, points.extent(0)), distances, distances(0));\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use OpenMP to compute in parallel.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j 
< points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = points.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) 
{\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_distance = std::numeric_limits::max();\n#pragma omp parallel for reduction(min:min_distance)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tmin_distance = std::min(min_distance, dist);\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "\n\n\t// Check if the vector is empty or has only one point\n\tif (points.empty() || points.size() == 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\t// Sort the points by x-coordinate\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Initialize the minimum distance to the maximum possible value\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble 
min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn -1;\n\t}\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = 
distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], 
points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tauto compareX = [](Point const& p1, Point const& p2) { return p1.x < p2.x; };\n\tauto compareY = [](Point const& p1, Point const& p2) { return p1.y < p2.y; };\n\n\tstd::sort(points.begin(), points.end(), compareX);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tauto compareX = [](Point const& p1, Point const& p2) { return p1.x < p2.x; };\n\tauto compareY = [](Point const& p1, Point const& p2) { return p1.y < p2.y; };\n\tstd::sort(points.begin(), points.end(), compareX);\n\tdouble minDist = distance(points[0], points[1]);\n\tstd::sort(points.begin(), points.end(), compareY);\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tif (points[j].y - points[i].y >= minDist) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn 0.0;\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = distance(points[0], points[1]);\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distanceBetweenPoints(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the distance between the closest two points in the vector points.\n Store the result in distance.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\n__global__ void closestPair(const Point *points, size_t numPoints, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\t// Initialize the distance to a large value\n\t*distance = DBL_MAX;\n\n\t// Iterate over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.10085589599609 + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.6e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.10106121520996 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid < numPoints) {\n\t\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.8e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.10081992950439 + } + ] + }, + { + "generated_output": "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < num) {\n\t\tdistance[i] = distance(points[i], points[(i + 1) % num]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + 
}, + { + "generated_output": "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < num) {\n\t\tdistance[idx] = distance(points[idx], points[(idx + 1) % num]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Initialize the distance to a large value\n\t*distance = DBL_MAX;\n\n\t// Loop over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) return;\n\tdouble minDist = distanceBetweenPoints(points[tid], points[(tid+1)%numPoints]);\n\tfor (int i = tid+1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\tatomicMin(distance, minDist);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.10044940795898 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid < numPoints) {\n\t\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.10086128234863 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = gid; i < numPoints; i += hipBlockDim_x) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.83313747558594 + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], 
points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.6e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.10114161071777 + } + ] + }, + { + "generated_output": "\n\n\tint idx = hipGetGlobalIndex();\n\tif (idx < num) {\n\t distance[idx] = distance(points[idx], points[(idx + 1) % num]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.10087272796631 + } + ] + }, + { + "generated_output": "\n\n\t// Initialize the distance to a large value\n\t*distance = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "(1.21,12.12.1222.12.12.", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = distanceBetweenPoints(points[gid], points[(gid + 1) % numPoints]);\n\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 6e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.10031130218506 + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = 
distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.10112250213623 + } + ] + }, + { + "generated_output": "\n\n\t// Compute the distance between each pair of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tdouble minDist = distanceBetweenPoints(points[tid], points[(tid + 1) % numPoints]);\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\t__shared__ double minDist;\n\t__shared__ Point p1, p2;\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t\tp1 = points[tid];\n\t\t\t\tp2 = points[i];\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.10041050872803 + } + ] + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. 
The final result is returned on rank 0.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local copy of points.\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0.\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\t// Find the closest pair of points on this rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_distance = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); ++i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_distance) {\n\t\t\t\tmin_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_distance;\n\tMPI_Allreduce(&min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_distance;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tmin_dist = std::min(min_dist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t}\n\n\treturn -1;\n}\n", + "\n\n\tint rank, 
size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\tif (rank == 0) {\n\t\treturn min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the closest pair on this rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble 
globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_distance) {\n\t\t\t\tlocal_min_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_distance;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its subset of points\n\tstd::vector> closestPairs;\n\tfor (size_t i = rank; i < points.size(); i += size) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tclosestPairs.push_back({points[i], points[j]});\n\t\t}\n\t}\n\n\t// Each rank sends its closest pairs to rank 0\n\tstd::vector> allClosestPairs;\n\tMPI_Gather(&closestPairs, closestPairs.size(), MPI_DOUBLE, &allClosestPairs, allClosestPairs.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 finds the closest pair of points among all the closest pairs\n\tdouble minDist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (auto const& pair : allClosestPairs) {\n\t\t\tdouble dist = distance(pair.first, pair.second);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Rank 0 broadcasts the minimum distance to all other ranks\n\tMPI_Bcast(&minDist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + 
"\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its subset of points.\n\tstd::vector local_points = points;\n\tstd::sort(local_points.begin(), local_points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (size_t i = 0; i < local_points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < local_points.size(); ++j) {\n\t\t\tdouble dist = distance(local_points[i], local_points[j]);\n\t\t\tif (dist < local_min_distance) {\n\t\t\t\tlocal_min_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Each rank sends its local minimum distance to rank 0.\n\tMPI_Gather(&local_min_distance, 1, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 computes the global minimum distance.\n\tdouble global_min_distance = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (int i = 0; i < size; ++i) {\n\t\t\tdouble recv_min_distance;\n\t\t\tMPI_Gather(&recv_min_distance, 1, MPI_DOUBLE, &recv_min_distance, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\t\t\tif (recv_min_distance < global_min_distance) {\n\t\t\t\tglobal_min_distance = recv_min_distance;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the closest pair on each rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Reduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distanceBetweenPoints(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 
2));\n}\n\n/* Compute the distance between the closest two points in the vector points.\n Store the result in distance.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\n__global__ void closestPair(const Point *points, size_t numPoints, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) return;\n\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint num_blocks = gridDim.x;\n\n\tdouble min_dist = DBL_MAX;\n\tfor (int i = tid; i < numPoints; i += bid_size * num_blocks) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = min_dist;\n}\n", + "\n\n\tint idx = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (idx >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDistance = DBL_MAX;\n\tfor (int i = idx + 1; i < numPoints; i++) {\n\t\tdouble distance = distanceBetweenPoints(points[idx], points[i]);\n\t\tif (distance < minDistance) {\n\t\t\tminDistance = distance;\n\t\t}\n\t}\n\n\tif (idx == 0) {\n\t\t*distance = minDistance;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tdouble minDist = DBL_MAX;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif 
(dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int j = i + 1; j < numPoints; j++) {\n\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tdouble minDist = DBL_MAX;\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\tdistance[tid] = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int j = i + 1; j < numPoints; j++) {\n\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, 
y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all ranks\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_distance = DBL_MAX;\n\t#pragma omp parallel for reduction(min: min_distance)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t for (size_t j = i + 1; j < points.size(); ++j) {\n\t min_distance = std::min(min_distance, distance(points[i], points[j]));\n\t }\n\t}\n\n\tif (rank == 0) {\n\t return min_distance;\n\t}\n\treturn 0.0;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Gather all the local minimum distances on rank 0\n\tstd::vector allMinDists(size);\n\tMPI_Gather(&minDist, 1, MPI_DOUBLE, allMinDists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the global minimum distance\n\tif (rank == 0) {\n\t\tfor (double dist : allMinDists) {\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble local_min_dist = 
std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocal_min_dist = std::min(local_min_dist, dist);\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all ranks\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local subvector\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (int i = 0; i < points.size(); ++i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tlocalMinDist = std::min(localMinDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t} else {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the closest pair on this rank\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN);\n\n\t// Return the global minimum distance\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, 
size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma opm parallel for reduction(min:min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i+1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\treturn min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\telse {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t} else {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocalMinDist = std::min(localMinDist, dist);\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min_dist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor 
(int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocal_min_dist = std::min(local_min_dist, dist);\n\t\t}\n\t}\n\n\t// Gather all local minima on rank 0\n\tstd::vector all_min_dists(size);\n\tMPI_Gather(&local_min_dist, 1, MPI_DOUBLE, all_min_dists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 finds the global minimum\n\tdouble global_min_dist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double dist : all_min_dists) {\n\t\t\tglobal_min_dist = std::min(global_min_dist, dist);\n\t\t}\n\t}\n\n\t// Return the global minimum on rank 0\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "(1.41421, 1.41421)", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tlocalMinDist = std::min(localMinDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t}\n\treturn -1;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points 
in the vector points. Store the result in `hull`.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(Kokkos::View &points, Kokkos::View &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Sort points by x-coordinate\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return points_sorted(i)->x < points_sorted(j)->x;\n });\n\n // Find the upper and lower hulls\n auto upper_hull = hull.slice(0, hull.extent(0) / 2);\n auto lower_hull = hull.slice(hull.extent(0) / 2, hull.extent(0));\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n while (upper_hull_size >= 2 &&\n (upper_hull(upper_hull_size - 1)->y - upper_hull(upper_hull_size - 2)->y) *\n (points_sorted(i)->x - upper_hull(upper_hull_size - 1)->x) <\n (upper_hull(upper_hull_size - 1)->x - upper_hull(upper_hull_size - 2)->x) *\n (points_sorted(i)->y - upper_hull(upper_hull_size - 1)->y)) {\n upper_hull_size--;\n }\n upper_hull(upper_hull_size++) = points_sorted(i);\n }\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = points_sorted.extent(0) - 1; i >= 0; i--) {\n while (lower_hull_size >= 2 &&\n (lower_hull(lower_hull_size - 1)->y - lower_hull(lower_hull_size - 2)->y) *\n (points_sorted(i)->x - lower_hull(lower_hull_size - 1)->x) <\n (lower_hull(lower_hull_size - 1)->x - lower_hull(lower_hull_size - 2)->x) *\n (points_sorted(i)->y - lower_hull(lower_hull_size - 1)->y)) {\n lower_hull_size--;\n }\n lower_hull(lower_hull_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_hull_size + lower_hull_size - 1;\n for (int i = 0; i < lower_hull_size - 1; i++) {\n hull(hull_size--) = lower_hull(i);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int &i) {\n points_sorted(i) = points(i);\n });\n\n // Find the upper and lower hulls\n std::vector upper_hull;\n std::vector lower_hull;\n\n for (int i = 0; i < points_sorted.extent(0); i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upper_hull.size() >= 2 &&\n (upper_hull[upper_hull.size() - 2]->y - upper_hull[upper_hull.size() - 1]->y) *\n (points_sorted(i)->x - upper_hull[upper_hull.size() - 1]->x) <=\n (upper_hull[upper_hull.size() - 1]->y - points_sorted(i)->y) *\n (upper_hull[upper_hull.size() - 2]->x - upper_hull[upper_hull.size() - 1]->x)) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lower_hull.size() >= 2 &&\n (lower_hull[lower_hull.size() - 2]->y - lower_hull[lower_hull.size() - 1]->y) *\n (points_sorted(i)->x - lower_hull[lower_hull.size() - 1]->x) >=\n (lower_hull[lower_hull.size() - 1]->y - points_sorted(i)->y) *\n (lower_hull[lower_hull.size() - 2]->x - lower_hull[lower_hull.size() - 1]->x)) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 2);\n int j = 0;\n for (int i = 0; i < upper_hull.size() - 1; i++) 
{\n hull(j++) = upper_hull[i];\n }\n for (int i = lower_hull.size() - 2; i >= 0; i--) {\n hull(j++) = lower_hull[i];\n }\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_dimensions = points.extent(1);\n\n // Find the point with the minimum y-coordinate.\n int min_y_index = 0;\n for (int i = 1; i < num_points; ++i) {\n if (points(i, 1) < points(min_y_index, 1)) {\n min_y_index = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points(0, 0);\n points(0, 0) = points(min_y_index, 0);\n points(min_y_index, 0) = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double angle_a = atan2(a.y - points(0, 1), a.x - points(0, 0));\n double angle_b = atan2(b.y - points(0, 1), b.x - points(0, 0));\n return angle_a < angle_b;\n };\n parallel_sort(Kokkos::RangePolicy(1, num_points), points, compare_polar_angle);\n\n // Find the convex hull using Graham's scan.\n int hull_size = 0;\n for (int i = 0; i < num_points; ++i) {\n while (hull_size >= 2 && (points(hull_size - 1, 0) - points(hull_size - 2, 0)) * (points(i, 1) - points(hull_size - 2, 1)) - (points(hull_size - 1, 1) - points(hull_size - 2, 1)) * (points(i, 0) - points(hull_size - 2, 0)) < 0) {\n hull_size--;\n }\n hull(hull_size, 0) = points(i, 0);\n hull_size++;\n }\n\n // Resize the hull to its final size.\n hull.resize(hull_size, num_dimensions);\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the upper and lower hulls\n Kokkos::View upperHull(\"Upper Hull\", points.extent(0));\n Kokkos::View lowerHull(\"Lower Hull\", points.extent(0));\n int upperHullSize = 0;\n int lowerHullSize = 0;\n for (int i = 0; i < points.extent(0); i++) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHullSize >= 2 &&\n (upperHull(upperHullSize - 1)->y - upperHull(upperHullSize - 2)->y) *\n (points(i)->x - upperHull(upperHullSize - 2)->x) <\n (upperHull(upperHullSize - 1)->x - upperHull(upperHullSize - 2)->x) *\n (points(i)->y - upperHull(upperHullSize - 2)->y)) {\n upperHullSize--;\n }\n upperHull(upperHullSize++) = points(i);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while (lowerHullSize >= 2 &&\n (lowerHull(lowerHullSize - 1)->y - lowerHull(lowerHullSize - 2)->y) *\n (points(i)->x - lowerHull(lowerHullSize - 2)->x) >\n (lowerHull(lowerHullSize - 1)->x - lowerHull(lowerHullSize - 2)->x) *\n (points(i)->y - lowerHull(lowerHullSize - 2)->y)) {\n lowerHullSize--;\n }\n lowerHull(lowerHullSize++) = points(i);\n }\n\n // Merge the upper and lower hulls into the convex hull\n int hullSize = upperHullSize + lowerHullSize - 2;\n for (int i = 0; i < upperHullSize; i++) {\n hull(i) = upperHull(i);\n }\n for (int i = upperHullSize; i < hullSize; i++) {\n hull(i) = lowerHull(i - upperHullSize + 1);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the upper and lower hulls\n Kokkos::View upperHull(\"upperHull\", points.extent(0));\n Kokkos::View lowerHull(\"lowerHull\", points.extent(0));\n\n int upperHullSize = 0;\n int lowerHullSize = 
0;\n\n for (int i = 0; i < points.extent(0); i++) {\n // Add the point to the upper hull if it is to the right of the current rightmost point\n while (upperHullSize >= 2 &&\n (upperHull(upperHullSize - 1)->y - upperHull(upperHullSize - 2)->y) *\n (points(i)->x - upperHull(upperHullSize - 2)->x) <=\n (upperHull(upperHullSize - 1)->x - upperHull(upperHullSize - 2)->x) *\n (points(i)->y - upperHull(upperHullSize - 2)->y)) {\n upperHullSize--;\n }\n upperHull(upperHullSize++) = points(i);\n\n // Add the point to the lower hull if it is to the left of the current leftmost point\n while (lowerHullSize >= 2 &&\n (lowerHull(lowerHullSize - 1)->y - lowerHull(lowerHullSize - 2)->y) *\n (points(i)->x - lowerHull(lowerHullSize - 2)->x) >=\n (lowerHull(lowerHullSize - 1)->x - lowerHull(lowerHullSize - 2)->x) *\n (points(i)->y - lowerHull(lowerHullSize - 2)->y)) {\n lowerHullSize--;\n }\n lowerHull(lowerHullSize++) = points(i);\n }\n\n // Merge the upper and lower hulls\n int hullSize = upperHullSize + lowerHullSize - 2;\n for (int i = 0; i < upperHullSize; i++) {\n hull(i) = upperHull(i);\n }\n for (int i = upperHullSize; i < hullSize; i++) {\n hull(i) = lowerHull(i - upperHullSize + 1);\n }\n\n // Return the hull\n hull.resize(hullSize);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points_sorted.extent(0)), points_sorted, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n while (upper_hull_size >= 2 && cross(upper_hull[upper_hull_size - 2], upper_hull[upper_hull_size - 1], points_sorted[i]) <= 0) {\n upper_hull_size--;\n }\n upper_hull[upper_hull_size++] = points_sorted[i];\n }\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = points_sorted.extent(0) - 1; i >= 0; i--) {\n while (lower_hull_size >= 2 && cross(lower_hull[lower_hull_size - 2], lower_hull[lower_hull_size - 1], points_sorted[i]) <= 0) {\n lower_hull_size--;\n }\n lower_hull[lower_hull_size++] = points_sorted[i];\n }\n\n // Merge the upper and lower hulls\n int hull_size = 0;\n for (int i = 0; i < upper_hull_size; i++) {\n hull[hull_size++] = upper_hull[i];\n }\n for (int i = lower_hull_size - 2; i >= 0; i--) {\n hull[hull_size++] = lower_hull[i];\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_size);\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_hull_points = 4;\n\n // Find the leftmost point\n int leftmost_index = 0;\n for (int i = 1; i < num_points; ++i) {\n if (points(i).x < points(leftmost_index).x) {\n leftmost_index = i;\n }\n }\n\n // Sort the points by their polar angle with respect to the leftmost point\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double dx1 = a.x - points(leftmost_index).x;\n double dy1 = a.y - points(leftmost_index).y;\n double dx2 = b.x - points(leftmost_index).x;\n double dy2 = b.y - points(leftmost_index).y;\n return (atan2(dy1, dx1) < atan2(dy2, dx2));\n };\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, num_points), points_sorted, compare_polar_angle);\n\n // Find the convex hull using Graham's scan\n int hull_index = 0;\n for (int i = 0; i < num_points; ++i) {\n while (hull_index >= 2 && (points_sorted(i).x - hull(hull_index - 1).x) * (hull(hull_index - 2).y - 
hull(hull_index - 1).y) - (points_sorted(i).y - hull(hull_index - 1).y) * (hull(hull_index - 2).x - hull(hull_index - 1).x) < 0) {\n hull_index--;\n }\n hull(hull_index++) = points_sorted(i);\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_index);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, points_sorted.extent(0)),\n KOKKOS_LAMBDA(const int &i, const auto &upper_hull) {\n auto &hull = upper_hull.access();\n if (i == 0) {\n hull[0] = points_sorted(i);\n return hull;\n }\n while (hull.extent(0) >= 2 &&\n (hull(hull.extent(0) - 2).y - hull(hull.extent(0) - 1).y) *\n (points_sorted(i).x - hull(hull.extent(0) - 1).x) <\n (hull(hull.extent(0) - 1).y - points_sorted(i).y) *\n (hull(hull.extent(0) - 2).x - hull(hull.extent(0) - 1).x)) {\n hull.resize(hull.extent(0) - 1);\n }\n hull.resize(hull.extent(0) + 1);\n hull(hull.extent(0) - 1) = points_sorted(i);\n return hull;\n },\n Kokkos::View(nullptr, 0));\n\n auto lower_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1),\n KOKKOS_LAMBDA(const int &i, const auto &lower_hull) {\n auto &hull = lower_hull.access();\n if (i == points_sorted.extent(0) - 1) {\n hull[0] = points_sorted(i);\n return hull;\n }\n while (hull.extent(0) >= 2 &&\n (hull(hull.extent(0) - 2).y - hull(hull.extent(0) - 1).y) *\n (points_sorted(i).x - hull(hull.extent(0) - 1).x) <\n (hull(hull.extent(0) - 1).y - points_sorted(i).y) *\n (hull(hull.extent(0) - 2).x - hull(hull.extent(0) - 1).x)) {\n hull.resize(hull.extent(0) - 1);\n }\n hull.resize(hull.extent(0) + 1);\n hull(hull.extent(0) - 1) = points_sorted(i);\n return hull;\n },\n Kokkos::View(nullptr, 0));\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.extent(0) + lower_hull.extent(0) - 2);\n Kokkos::parallel_for(\n Kokkos::RangePolicy(0, upper_hull.extent(0)),\n KOKKOS_LAMBDA(const int &i) { hull(i) = upper_hull(i); });\n Kokkos::parallel_for(\n Kokkos::RangePolicy(0, lower_hull.extent(0) - 1),\n KOKKOS_LAMBDA(const int &i) { hull(upper_hull.extent(0) + i) = lower_hull(i); });\n}\n", + "\n\n using Kok\u30b3\u30af::parallel_for;\n using Kok\u30b3\u30af::sort;\n\n // Sort the points by their x-coordinates.\n sort(Kok\u30b3\u30af::begin(points), Kok\u30b3\u30af::end(points), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper hull.\n int upperHullSize = 0;\n for (int i = 0; i < points.size(); ++i) {\n while (upperHullSize >= 2 && Kok\u30b3\u30af::cross(hull[upperHullSize - 2], hull[upperHullSize - 1], points[i]) <= 0) {\n --upperHullSize;\n }\n hull[upperHullSize++] = points[i];\n }\n\n // Find the lower hull.\n int lowerHullSize = 0;\n for (int i = points.size() - 1; i >= 0; --i) {\n while (lowerHullSize >= 2 && Kok\u30b3\u30af::cross(hull[lowerHullSize - 2], hull[lowerHullSize - 1], points[i]) <= 0) {\n --lowerHullSize;\n }\n hull[lowerHullSize++] = points[i];\n }\n\n // Remove the duplicate points from the lower hull.\n while (lowerHullSize > 0 && hull[lowerHullSize - 1] == hull[lowerHullSize - 2]) {\n --lowerHullSize;\n }\n\n // Combine the upper and lower hulls.\n hull.resize(upperHullSize + lowerHullSize - 1);\n std::copy(hull.begin(), hull.begin() + upperHullSize, hull.begin());\n std::copy(hull.begin() + upperHullSize, hull.begin() + upperHullSize + lowerHullSize - 1, 
hull.begin() + upperHullSize);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points_sorted.extent(0)), KOKKOS_LAMBDA(const int &i, auto &upper_hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n upper_hull.push_back(points_sorted(i));\n } else {\n while (upper_hull.size() >= 2 && cross(upper_hull[upper_hull.size() - 2], upper_hull[upper_hull.size() - 1], points_sorted(i)) <= 0) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n }\n });\n\n auto lower_hull = Kokkos::parallel_reduce(Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1), KOKKOS_LAMBDA(const int &i, auto &lower_hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n lower_hull.push_back(points_sorted(i));\n } else {\n while (lower_hull.size() >= 2 && cross(lower_hull[lower_hull.size() - 2], lower_hull[lower_hull.size() - 1], points_sorted(i)) <= 0) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n });\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 2);\n std::copy(upper_hull.begin(), upper_hull.end() - 1, hull.begin());\n std::copy(lower_hull.begin() + 1, lower_hull.end(), hull.begin() + upper_hull.size() - 1);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points_sorted(i)->x < points_sorted(j)->x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n int upper_hull_size = 0;\n int lower_hull_size = 0;\n\n // Add the first point to the upper hull\n upper_hull(upper_hull_size++) = points_sorted(0);\n\n // Add the last point to the lower hull\n lower_hull(lower_hull_size++) = points_sorted(points.extent(0) - 1);\n\n // Iterate over the remaining points\n for (int i = 1; i < points.extent(0) - 1; i++) {\n // Add the point to the upper hull\n while (upper_hull_size >= 2 &&\n (points_sorted(i)->y - upper_hull(upper_hull_size - 2)->y) *\n (upper_hull(upper_hull_size - 1)->x - upper_hull(upper_hull_size - 2)->x) <=\n (upper_hull(upper_hull_size - 1)->y - upper_hull(upper_hull_size - 2)->y) *\n (points_sorted(i)->x - upper_hull(upper_hull_size - 2)->x)) {\n upper_hull_size--;\n }\n upper_hull(upper_hull_size++) = points_sorted(i);\n\n // Add the point to the lower hull\n while (lower_hull_size >= 2 &&\n (points_sorted(i)->y - lower_hull(lower_hull_size - 2)->y) *\n (lower_hull(lower_hull_size - 1)->x - lower_hull(lower_hull_size - 2)->x) >=\n (lower_hull(lower_hull_size - 1)->y - lower_hull(lower_hull_size - 2)->y) *\n (points_sorted(i)->x - lower_hull(lower_hull_size - 2)->x)) {\n lower_hull_size--;\n }\n lower_hull(lower_hull_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_hull_size + lower_hull_size - 2;\n for (int i = 0; i < upper_hull_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_hull_size; i < hull_size; i++) {\n hull(i) = lower_hull(i - upper_hull_size + 1);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinate\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int &i) {\n points_sorted(i) = 
points(i);\n });\n\n // Find the upper and lower hulls\n std::vector upper_hull, lower_hull;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upper_hull.size() >= 2 && cross(upper_hull[upper_hull.size() - 2], upper_hull[upper_hull.size() - 1], points_sorted(i)) <= 0) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lower_hull.size() >= 2 && cross(lower_hull[lower_hull.size() - 2], lower_hull[lower_hull.size() - 1], points_sorted(i)) >= 0) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n\n // Merge the upper and lower hulls\n for (int i = lower_hull.size() - 2; i >= 0; i--) {\n upper_hull.push_back(lower_hull[i]);\n }\n\n // Copy the convex hull to the output view\n Kokkos::parallel_for(Kokkos::RangePolicy(0, upper_hull.size()), KOKKOS_LAMBDA(const int &i) {\n hull(i) = upper_hull[i];\n });\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, points_sorted.extent(0)),\n KOKKOS_LAMBDA(const int &i, auto &hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n hull.push_back(points_sorted(i));\n } else {\n while (hull.size() >= 2 &&\n (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points_sorted(i).x - hull[hull.size() - 1].x) <\n (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points_sorted(i).y - hull[hull.size() - 1].y)) {\n hull.pop_back();\n }\n hull.push_back(points_sorted(i));\n }\n },\n hull);\n\n auto lower_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1),\n KOKKOS_LAMBDA(const int &i, auto &hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n hull.push_back(points_sorted(i));\n } else {\n while (hull.size() >= 2 &&\n (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points_sorted(i).x - hull[hull.size() - 1].x) <\n (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points_sorted(i).y - hull[hull.size() - 1].y)) {\n hull.pop_back();\n }\n hull.push_back(points_sorted(i));\n }\n },\n hull);\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 1);\n std::copy(upper_hull.begin(), upper_hull.end(), hull.begin());\n std::copy(lower_hull.begin() + 1, lower_hull.end(), hull.begin() + upper_hull.size());\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_dimensions = points.extent(1);\n const int num_hull_points = 0;\n\n // Find the point with the minimum y-coordinate.\n int min_y_index = 0;\n for (int i = 1; i < num_points; i++) {\n if (points(i, 1) < points(min_y_index, 1)) {\n min_y_index = i;\n }\n }\n\n // Swap the point with the minimum y-coordinate to the first position.\n Point temp = points(0, 0);\n points(0, 0) = points(min_y_index, 0);\n points(min_y_index, 0) = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double angle_a = atan2(a.y - points(0, 1), a.x - points(0, 0));\n double angle_b = atan2(b.y - points(0, 1), b.x - points(0, 0));\n return angle_a < angle_b;\n };\n Kokkos::sort(Kokkos::RangePolicy(1, num_points), points, 
compare_polar_angle);\n\n // Find the convex hull using Graham's scan.\n int hull_size = 0;\n for (int i = 0; i < num_points; i++) {\n while (hull_size >= 2 && (points(hull[hull_size - 2], 0) - points(hull[hull_size - 1], 0)) * (points(i, 1) - points(hull[hull_size - 1], 1)) - (points(i, 0) - points(hull[hull_size - 1], 0)) * (points(hull[hull_size - 2], 1) - points(hull[hull_size - 1], 1)) < 0) {\n hull_size--;\n }\n hull[hull_size++] = i;\n }\n\n // Resize the hull to the correct size.\n hull = Kokkos::View(hull, hull_size);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)), points_sorted, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n int upper_size = 0;\n int lower_size = 0;\n\n // Add the first point to the upper hull\n upper_hull(upper_size++) = points_sorted(0);\n\n // Add the last point to the lower hull\n lower_hull(lower_size++) = points_sorted(points_sorted.extent(0) - 1);\n\n // Iterate over the remaining points\n for (int i = 1; i < points_sorted.extent(0) - 1; i++) {\n // Add the point to the upper hull\n while (upper_size >= 2 && (upper_hull(upper_size - 2).y - upper_hull(upper_size - 1).y) * (points_sorted(i).x - upper_hull(upper_size - 1).x) < (upper_hull(upper_size - 1).y - points_sorted(i).y) * (upper_hull(upper_size - 2).x - upper_hull(upper_size - 1).x)) {\n upper_size--;\n }\n upper_hull(upper_size++) = points_sorted(i);\n\n // Add the point to the lower hull\n while (lower_size >= 2 && (lower_hull(lower_size - 2).y - lower_hull(lower_size - 1).y) * (points_sorted(i).x - lower_hull(lower_size - 1).x) < (lower_hull(lower_size - 1).y - points_sorted(i).y) * (lower_hull(lower_size - 2).x - lower_hull(lower_size - 1).x)) {\n lower_size--;\n }\n lower_hull(lower_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_size + lower_size - 2;\n for (int i = 0; i < upper_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_size; i < hull_size; i++) {\n hull(i) = lower_hull(lower_size - (i - upper_size + 1));\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_size);\n}\n", + "\n\n // Find the leftmost point\n auto leftmost = points.reduce(\n KOKKOS_LAMBDA(const Point &a, const Point &b) { return a.x < b.x ? 
a : b; },\n points.extent(0));\n\n // Sort the points by the angle they make with the leftmost point\n Kokkos::View sorted_points(\"sorted_points\", points.extent(0));\n Kokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i) {\n sorted_points[i] = &points[i];\n });\n\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n sorted_points,\n KOKKOS_LAMBDA(const Point *&a, const Point *&b) {\n return std::atan2(a->y - leftmost->y, a->x - leftmost->x) <\n std::atan2(b->y - leftmost->y, b->x - leftmost->x);\n });\n\n // Find the convex hull using Graham scan\n Kokkos::View stack(\"stack\", points.extent(0));\n int top = 0;\n for (int i = 0; i < points.extent(0); i++) {\n Point *p = sorted_points[i];\n while (top >= 2 &&\n (stack[top - 1]->y - stack[top - 2]->y) * (p->x - stack[top - 2]->x) <\n (stack[top - 1]->x - stack[top - 2]->x) * (p->y - stack[top - 2]->y))\n top--;\n stack[top++] = p;\n }\n\n // Copy the convex hull to the output view\n Kokkos::parallel_for(Kokkos::RangePolicy(0, top),\n KOKKOS_LAMBDA(const int &i) { hull[i] = stack[i]; });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int N = points.extent(0);\n const int HULL_SIZE = 2 * N;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n parallel_sort(Kokkos::Cuda(), points_sorted, [=](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the lower and upper hulls\n auto lower_hull = hull.slice(0, N);\n auto upper_hull = hull.slice(N, HULL_SIZE);\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = 0; i < N; i++) {\n while (lower_hull_size >= 2 && cross(lower_hull[lower_hull_size - 2], lower_hull[lower_hull_size - 1], points_sorted[i]) <= 0) {\n lower_hull_size--;\n }\n lower_hull[lower_hull_size++] = points_sorted[i];\n }\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = N - 1; i >= 0; i--) {\n while (upper_hull_size >= 2 && cross(upper_hull[upper_hull_size - 2], upper_hull[upper_hull_size - 1], points_sorted[i]) <= 0) {\n upper_hull_size--;\n }\n upper_hull[upper_hull_size++] = points_sorted[i];\n }\n\n // Merge the lower and upper hulls\n int hull_size = lower_hull_size + upper_hull_size - 1;\n for (int i = 0; i < hull_size; i++) {\n hull[i] = (i < lower_hull_size) ? 
lower_hull[i] : upper_hull[i - lower_hull_size];\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort points by x-coordinate\n auto points_view = Kokkos::subview(points, Kokkos::ALL(), Kokkos::ALL(), Kokkos::Rank<2>{0, 1});\n auto sorted_points = points_view;\n Kokkos::sort(Kokkos::RangePolicy(0, points_view.extent(0)),\n KOKKOS_LAMBDA(const int i, const int j) {\n return sorted_points(i, 0).x < sorted_points(j, 0).x;\n });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::View(Kokkos::ALL(), \"Upper Hull\");\n auto lower_hull = Kokkos::View(Kokkos::ALL(), \"Lower Hull\");\n int upper_size = 0, lower_size = 0;\n\n // Find the upper hull\n for (int i = 0; i < points_view.extent(0); i++) {\n while (upper_size >= 2 &&\n (upper_hull(upper_size - 1).y - upper_hull(upper_size - 2).y) *\n (sorted_points(i, 0).x - upper_hull(upper_size - 2).x) <\n (upper_hull(upper_size - 1).x - upper_hull(upper_size - 2).x) *\n (sorted_points(i, 0).y - upper_hull(upper_size - 2).y)) {\n upper_size--;\n }\n upper_hull(upper_size++) = sorted_points(i, 0);\n }\n\n // Find the lower hull\n for (int i = points_view.extent(0) - 1; i >= 0; i--) {\n while (lower_size >= 2 &&\n (lower_hull(lower_size - 1).y - lower_hull(lower_size - 2).y) *\n (sorted_points(i, 0).x - lower_hull(lower_size - 2).x) <\n (lower_hull(lower_size - 1).x - lower_hull(lower_size - 2).x) *\n (sorted_points(i, 0).y - lower_hull(lower_size - 2).y)) {\n lower_size--;\n }\n lower_hull(lower_size++) = sorted_points(i, 0);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_size + lower_size - 2;\n hull = Kokkos::View(hull_size, \"Convex Hull\");\n for (int i = 0; i < upper_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_size; i < hull_size; i++) {\n hull(i) = lower_hull(i - upper_size + lower_size - 1);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n const int N = points.extent(0);\n const int M = hull.extent(0);\n\n auto points_d = create_mirror_view(points);\n auto hull_d = create_mirror_view(hull);\n\n parallel_for(N, KOKKOS_LAMBDA(const int i) {\n points_d(i) = points(i);\n });\n\n parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &min_index) {\n if (points_d(i).y < points_d(min_index).y || (points_d(i).y == points_d(min_index).y && points_d(i).x < points_d(min_index).x)) {\n min_index = i;\n }\n }, 0);\n\n int p = 0;\n hull_d(p) = points_d(min_index);\n\n int q;\n do {\n q = (p + 1) % N;\n for (int i = 0; i < N; i++) {\n if (orientation(points_d(p), points_d(i), points_d(q)) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != min_index);\n\n int count = 0;\n do {\n hull_d(count) = points_d(p);\n count++;\n q = (p + 1) % N;\n for (int i = 0; i < N; i++) {\n if (orientation(points_d(p), points_d(i), points_d(q)) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != min_index);\n\n parallel_for(count, KOKKOS_LAMBDA(const int i) {\n hull(i) = hull_d(i);\n });\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the leftmost and rightmost points\n const Point *leftmost = points(0);\n const Point *rightmost = points(points.extent(0) - 1);\n\n // Create a stack to store the points on the hull\n std::stack stack;\n\n // Add the leftmost and rightmost points to the stack\n stack.push(leftmost);\n stack.push(rightmost);\n\n // Iterate over the remaining points\n for (int i = 1; i < points.extent(0) - 1; i++) {\n const Point *p = points(i);\n\n // 
Check if the point is to the left of the line defined by the last two points on the stack\n if (orientation(stack.top(), stack.top()->next, p) == LEFT) {\n // If it is, pop the last point from the stack\n stack.pop();\n // And add the new point to the stack\n stack.push(p);\n }\n }\n\n // Copy the points from the stack to the hull\n int hull_size = stack.size();\n for (int i = 0; i < hull_size; i++) {\n hull(i) = stack.top();\n stack.pop();\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Find the point with the smallest y-coordinate.\n int min_index = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].y < points[min_index].y) {\n min_index = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n std::swap(points[0], points[min_index]);\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points.begin() + 1, points.end(), [&](const Point& a, const Point& b) {\n return (atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x));\n });\n\n // Initialize the convex hull with the first three points.\n hull.push_back(points[0]);\n hull.push_back(points[1]);\n hull.push_back(points[2]);\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < points.size(); i++) {\n while (hull.size() >= 2 && (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y) - (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) >= 0) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Find the first point in the upper hull that is to the right of the current point\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Find the first point in the lower hull that is to the left of the current point\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), 
upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n Point leftmost = points[0];\n for (const Point& p : points) {\n if (p.x < leftmost.x) {\n leftmost = p;\n }\n }\n\n // Sort the points by their angle with the leftmost point\n std::sort(points.begin(), points.end(), [&leftmost](const Point& a, const Point& b) {\n return std::atan2(a.y - leftmost.y, a.x - leftmost.x) < std::atan2(b.y - leftmost.y, b.x - leftmost.x);\n });\n\n // Find the convex hull using Graham's scan\n std::stack convex_hull;\n convex_hull.push(leftmost);\n convex_hull.push(points[1]);\n for (size_t i = 2; i < points.size(); i++) {\n Point top = convex_hull.top();\n while (convex_hull.size() >= 2 && (top.x - convex_hull.at(convex_hull.size() - 2).x) * (points[i].y - top.y) - (top.y - convex_hull.at(convex_hull.size() - 2).y) * (points[i].x - top.x) < 0) {\n convex_hull.pop();\n top = convex_hull.top();\n }\n convex_hull.push(points[i]);\n }\n\n // Copy the convex hull to the output vector\n while (!convex_hull.empty()) {\n hull.push_back(convex_hull.top());\n convex_hull.pop();\n }\n}\n", + ".", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the leftmost and rightmost points\n Point leftmost = points[0];\n Point rightmost = points[points.size() - 1];\n\n // Find the points that are on the upper and lower hulls\n std::vector upperHull;\n std::vector lowerHull;\n\n #pragma omp parallel sections\n {\n #pragma omp section\n {\n upperHull.push_back(leftmost);\n for (int i = 1; i < points.size(); i++) {\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n }\n }\n\n #pragma omp section\n {\n lowerHull.push_back(rightmost);\n for (int i = points.size() - 2; i >= 0; i--) {\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) <= 0) {\n 
lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n }\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size());\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.begin(), lowerHull.end());\n\n // Remove the duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the lower hull\n for (const Point& point : points) {\n while (hull.size() >= 2 && (hull.back() - hull[hull.size() - 2]).cross(point - hull.back()) <= 0) {\n hull.pop_back();\n }\n hull.push_back(point);\n }\n\n // Find the upper hull\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() >= 2 && (hull.back() - hull[hull.size() - 2]).cross(points[i] - hull.back()) <= 0) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n int n = points.size();\n if (n < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n int l = 0, r = 0;\n hull.push_back(points[l]);\n for (int i = 1; i < n; i++) {\n while (l >= 1 && (points[i].y - hull[l - 1].y) * (hull[l].x - hull[l - 1].x) <= (points[i].x - hull[l - 1].x) * (hull[l].y - hull[l - 1].y))\n l--;\n while (r >= 1 && (points[i].y - hull[r - 1].y) * (hull[r].x - hull[r - 1].x) <= (points[i].x - hull[r - 1].x) * (hull[r].y - hull[r - 1].y))\n r--;\n hull.push_back(points[i]);\n l++;\n r++;\n }\n\n hull.push_back(points[n - 1]);\n for (int i = n - 2; i >= 0; i--) {\n while (l >= 1 && (points[i].y - hull[l - 1].y) * (hull[l].x - hull[l - 1].x) <= (points[i].x - hull[l - 1].x) * (hull[l].y - hull[l - 1].y))\n l--;\n while (r >= 1 && (points[i].y - hull[r - 1].y) * (hull[r].x - hull[r - 1].x) <= (points[i].x - hull[r - 1].x) * (hull[r].y - hull[r - 1].y))\n r--;\n hull.push_back(points[i]);\n l++;\n r++;\n }\n\n hull.pop_back();\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n hull.pop_back();\n}\n", + "\n\n // Find the leftmost 
point\n int l = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[l].x) {\n l = i;\n }\n }\n\n int p = l, q;\n do {\n hull.push_back(points[p]);\n q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != l);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) *\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) *\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Combine the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * 
(point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n int l = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[l].x) {\n l = i;\n }\n }\n\n int p = l;\n int q;\n do {\n hull.push_back(points[p]);\n q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == -1) {\n q = i;\n }\n }\n p = q;\n } while (p != l);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n int leftMost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftMost].x) {\n leftMost = i;\n }\n }\n\n // Start from leftmost point, keep moving counterclockwise until we reach the start point again\n int p = leftMost;\n do {\n // Add current point to result\n hull.push_back(points[p]);\n\n // Search for a point 'q' such that the vector 'p-q' has the smallest positive orientation.\n int q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == 2) {\n q = i;\n }\n }\n\n // If 'q' is not 'p', then update 'p' and continue\n p = q;\n\n } while (p != leftMost);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (point.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - point.x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n 
while (lowerHull.size() >= 2 &&\n (point.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - point.x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n Point p = points[leftmost];\n hull.push_back(p);\n points.erase(points.begin() + leftmost);\n\n while (points.size() > 0) {\n int next = -1;\n for (int i = 0; i < points.size(); i++) {\n if (next == -1 || (orientation(p, points[i], points[next]) == 2)) {\n next = i;\n }\n }\n p = points[next];\n hull.push_back(p);\n points.erase(points.begin() + next);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) *\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) *\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower 
hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "serial", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Find the leftmost point\n int leftMost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftMost].x) {\n leftMost = i;\n }\n }\n\n // Start from the leftmost point and keep moving counter-clockwise until we reach the leftmost point again\n Point p = points[leftMost];\n do {\n hull.push_back(p);\n int next = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i] != p && orientation(p, points[i], points[next]) == 2) {\n next = i;\n }\n }\n p = points[next];\n } while (p != points[leftMost]);\n}\n", + "\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n int n = points.size();\n std::vector up(n), down(n);\n int m = 0, k = 0;\n\n for (int i = 0; i < n; i++) {\n if ((i == 0) || ((i == 1) && (points[i].x == points[i - 1].x) && (points[i].y == points[i - 1].y)))\n continue;\n down[m++] = points[i];\n }\n\n for (int i = n - 1; i >= 0; i--) {\n if ((i == n - 1) || ((i == n - 2) && (points[i].x == points[i + 1].x) && (points[i].y == points[i + 1].y)))\n continue;\n up[k++] = points[i];\n }\n\n hull.clear();\n int t[m + k + 1];\n int q = 0;\n for (int i = 0; i < m; i++)\n t[q++] = i;\n for (int i = k - 1; i >= 0; i--)\n t[q++] = i + m;\n t[q] = t[0];\n t[q + 1] = t[1];\n\n for (int i = 0; i <= q; i++) {\n int a = t[i], b = t[i + 1];\n if (a == b)\n continue;\n int c = t[i + 2];\n if (a != c) {\n hull.push_back(down[a]);\n continue;\n }\n int x1 = down[a].x, y1 = down[a].y;\n int x2 = down[b].x, y2 = down[b].y;\n int x3 = down[c].x, y3 = down[c].y;\n int dx1 = x2 - x1, dy1 = y2 - y1;\n int dx2 = x3 - x2, dy2 = y3 - y2;\n int cross = dx1 * dy2 - dx2 * dy1;\n if (cross < 0)\n hull.push_back(down[a]);\n else if (cross > 0)\n hull.push_back(down[b]);\n else {\n hull.push_back(down[a]);\n hull.push_back(down[b]);\n }\n }\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n int n = points.size();\n std::vector upper(n);\n std::vector lower(n);\n\n int m = 0;\n for (int i = 0; i < n; i++) {\n while (m >= 2 && (upper[m - 1].y - upper[m - 2].y) * (points[i].x - upper[m - 2].x) <=\n (upper[m - 1].x - upper[m - 2].x) * (points[i].y - upper[m - 2].y)) {\n m--;\n }\n upper[m++] = points[i];\n }\n\n m = 0;\n for (int i = n - 1; i >= 0; i--) {\n while (m >= 2 && (lower[m - 1].y - lower[m - 2].y) * (points[i].x - lower[m - 2].x) <=\n (lower[m - 1].x - lower[m - 2].x) * (points[i].y - lower[m - 2].y)) {\n m--;\n }\n lower[m++] = points[i];\n }\n\n for (int i = 0; i < m; i++) {\n hull.push_back(upper[i]);\n }\n for (int i = m 
- 2; i >= 0; i--) {\n hull.push_back(lower[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x;\n });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Find the first point in the upper hull that is to the right of the current point\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x) *\n (point.y - upperHull[upperHull.size() - 1].y) -\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Find the first point in the lower hull that is to the left of the current point\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x) *\n (point.y - lowerHull[lowerHull.size() - 1].y) -\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n hull.pop_back();\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n hull.push_back(points[0]);\n hull.push_back(points[1]);\n\n for (size_t i = 2; i < points.size(); i++) {\n while (hull.size() >= 2 &&\n (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <\n (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n int t = hull.size();\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() > t &&\n (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <\n (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n hull.pop_back();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n 
// Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n std::vector upper;\n for (auto const& point : points) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper.back(), point) <= 0) {\n upper.pop_back();\n }\n upper.push_back(point);\n }\n upper.pop_back();\n\n std::vector lower;\n for (auto const& point : points) {\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower.back(), point) >= 0) {\n lower.pop_back();\n }\n lower.push_back(point);\n }\n lower.pop_back();\n\n hull.insert(hull.end(), upper.begin(), upper.end());\n hull.insert(hull.end(), lower.rbegin(), lower.rend());\n}\n", + "\n\n if (points.size() <= 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n std::vector upper;\n for (const Point& p : points) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], p) <= 0)\n upper.pop_back();\n upper.push_back(p);\n }\n upper.pop_back();\n\n std::vector lower;\n for (int i = points.size() - 1; i >= 0; i--) {\n const Point& p = points[i];\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], p) <= 0)\n lower.pop_back();\n lower.push_back(p);\n }\n lower.pop_back();\n\n hull.insert(hull.end(), upper.begin(), upper.end());\n hull.insert(hull.end(), lower.rbegin(), lower.rend());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) == -1) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) == 1) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) 
{ return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Combine the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n int n = points.size();\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n std::vector upper;\n for (int i = 0; i < n; i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], points[i]) <= 0)\n upper.pop_back();\n upper.push_back(points[i]);\n }\n\n std::vector lower;\n for (int i = n - 1; i >= 0; i--) {\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], points[i]) <= 0)\n lower.pop_back();\n lower.push_back(points[i]);\n }\n\n for (int i = 1; i < upper.size() - 1; i++)\n hull.push_back(upper[i]);\n for (int i = lower.size() - 2; i >= 1; i--)\n hull.push_back(lower[i]);\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the 
points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull.back(), point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull.back(), point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper hull\n for (const Point& point : points) {\n while (hull.size() >= 2 && (hull.back().y - hull[hull.size() - 2].y) * (point.x - hull[hull.size() - 2].x) <=\n (hull.back().x - hull[hull.size() - 2].x) * (point.y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(point);\n }\n\n // Find the lower hull\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() >= 2 && (hull.back().y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) <=\n (hull.back().x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinate\n std::sort(points.begin(), points.end(), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper convex hull\n for (const Point &p : points) {\n while (hull.size() >= 2 && cross(hull[hull.size() - 2], hull[hull.size() - 1], p) <= 0) {\n hull.pop_back();\n }\n hull.push_back(p);\n }\n\n // Find the lower convex hull\n std::reverse(points.begin(), points.end());\n for (const Point &p : points) {\n while (hull.size() >= 2 && cross(hull[hull.size() - 2], hull[hull.size() - 1], p) <= 0) {\n hull.pop_back();\n }\n hull.push_back(p);\n }\n\n // Remove the last point from the lower convex hull\n 
hull.pop_back();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x;\n });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point a, Point b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point p : points) {\n // Build the lower hull\n while (lowerHull.size() >= 2 &&\n orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], p) <= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(p);\n\n // Build the upper hull\n while (upperHull.size() >= 2 &&\n orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], p) >= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(p);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), lowerHull.begin(), lowerHull.end());\n for (int i = upperHull.size() - 2; i >= 0; i--) {\n hull.push_back(upperHull[i]);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "hip", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\n__global__ void convexHull(const Point *points, size_t numPoints, Point *hull, size_t hullSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int numThreads = hipGetNumThreads();\n int threadIdx = hipGetThreadIdx();\n int blockIdx = hipGetBlockIdx();\n int blockDim = hipGetBlockDim();\n int gridDim = hipGetGridDim();\n\n // Each thread computes the convex hull of a subset of points\n int start = threadIdx.x + blockIdx.x * blockDim.x;\n int end = fmin(start + numThreads, numPoints);\n\n // Compute the convex hull of the subset of points\n vector subset(end - start);\n for (int i = start; i < end; i++) {\n subset[i - start] = points[i];\n }\n vector hullSubset = convexHull(subset);\n\n // Merge the convex hull of the subset of points with the overall convex hull\n for (int i = 0; i < hullSubset.size(); i++) {\n hull[hullSize + i] = hullSubset[i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize > 2 && crossProduct(hull[hullSize - 2], hull[hullSize - 1], points[i]) <= 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) >= 0) {\n lowerHull.pop_back();\n }\n 
lowerHull.push_back(points[i]);\n }\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size() - 2;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[i] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[upperHull.size() + i - 1] = lowerHull[i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by polar angle with respect to the leftmost point\n std::sort(points, points + numPoints, [leftmost](const Point &a, const Point &b) {\n return atan2(a.y - points[leftmost].y, a.x - points[leftmost].x) < atan2(b.y - points[leftmost].y, b.x - points[leftmost].x);\n });\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last two points in the hull, add it to the hull\n while (hullSize >= 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < numPoints) {\n hull[gid] = points[gid];\n }\n\n __syncthreads();\n\n if (tid == 0) {\n std::sort(hull, hull + numPoints, [](const Point &a, const Point &b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n\n hullSize = m;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < numPoints) {\n hull[gid] = points[gid];\n }\n __syncthreads();\n\n if (tid == 0) {\n sort(hull, hull + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n for (int i = numPoints - 2, t = m + 1; i >= 0; i--) {\n while (t >= m + 1 && (hull[t - 1].y - hull[t - 2].y) * (hull[i].x - hull[t - 1].x) <\n (hull[t - 1].x - hull[t - 2].x) * (hull[i].y - hull[t - 1].y)) {\n t--;\n }\n hull[t++] = hull[i];\n }\n hullSize = t - m;\n }\n}\n", + "source_write_success": 
true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n thrust::convex_hull(thrust::device, points, points + numPoints,\n std::back_inserter(upperHull), std::back_inserter(lowerHull));\n\n // Merge the upper and lower hulls into the final hull\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n\n // Return the size of the hull\n *hullSize = upperHull.size() + lowerHull.size();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Combine the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size() - 2;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[i] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[i + upperHull.size() - 1] = lowerHull[i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the minimum point with the first point.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the remaining points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [&](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize > 2 && 
crossProduct(hull[hullSize - 2], hull[hullSize - 1], points[i]) <= 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by their polar angle with respect to the leftmost point\n auto cmp = [&](const Point &a, const Point &b) {\n return (a.y - points[leftmost].y) * (b.x - points[leftmost].x) - (a.x - points[leftmost].x) * (b.y - points[leftmost].y);\n };\n std::sort(points + leftmost + 1, points + numPoints, cmp);\n\n // Build the convex hull using Graham's scan\n int hullIndex = 0;\n for (int i = leftmost + 1; i < numPoints; i++) {\n while (hullIndex >= 2 && cmp(hull[hullIndex - 1], hull[hullIndex - 2], points[i])) {\n hullIndex--;\n }\n hull[hullIndex++] = points[i];\n }\n\n // Copy the convex hull to the output buffer\n std::copy(hull, hull + hullIndex, hull);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((hull[hullSize - 1].x - points[i].x) * (hull[hullSize - 1].y - points[leftmost].y) -\n (hull[hullSize - 1].y - points[i].y) * (hull[hullSize - 1].x - points[leftmost].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n\n // If the current point is to the right of the last point in the hull, remove the last point from the hull\n else if ((hull[hullSize - 1].x - points[i].x) * (hull[hullSize - 1].y - points[leftmost].y) -\n (hull[hullSize - 1].y - points[i].y) * (hull[hullSize - 1].x - points[leftmost].x) >\n 0) {\n hullSize--;\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the point is to the left of the line defined by the last two points in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hull[hullSize++] = points[i];\n }\n }\n\n // If the last point is not the same as the first point, add it to the hull\n if (hull[hullSize - 1] != hull[0]) {\n hull[hullSize++] = 
hull[0];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n\n // Find the convex hull using Graham's scan\n int hullSize = 1;\n for (int i = 1; i < numPoints; i++) {\n // Find the next point that is not on the same line as the previous two points\n while (hullSize > 1 &&\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 2].x - hull[hullSize - 1].x) <=\n (points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 2].y - hull[hullSize - 1].y)) {\n hullSize--;\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n }\n\n // If the last point in the hull is not the leftmost point, add it to the hull\n if (hull[hullSize - 1] != points[leftmost]) {\n hull[hullSize] = points[leftmost];\n hullSize++;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Find the convex hull using Graham's scan\n int p = leftmost;\n do {\n // Find the next point that is not on the same line as the previous two points\n int q = (p + 1) % numPoints;\n for (int i = 0; i < numPoints; i++) {\n if (i != p && i != q &&\n (points[i].y - points[p].y) * (points[q].x - points[p].x) >\n (points[i].x - points[p].x) * (points[q].y - points[p].y)) {\n q = i;\n }\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[q];\n p = q;\n\n } while (p != leftmost);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n 
if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the remaining points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [&](const Point &a, const Point &b) {\n double angleA = atan2(a.y - points[0].y, a.x - points[0].x);\n double angleB = atan2(b.y - points[0].y, b.x - points[0].x);\n return angleA < angleB;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // TODO: Implement the convex hull algorithm\n // The kernel is launched with at least as many threads as points.\n // Store the result in `hull`.\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0017188848, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00013503939696 + } + ] + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hullSize = 0;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[hullSize++] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[hullSize++] = lowerHull[i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the minimum point with the first point.\n Point temp = points[0];\n points[0] = 
points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize > 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of points. 
The final hull is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // 
Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Send the leftmost point to all other ranks\n MPI_Bcast(&leftmost, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the convex hull on each rank\n std::vector localHull;\n grahamScan(points, localHull, leftmost);\n\n // Gather all the local hulls on rank 0\n if (rank == 0) {\n hull = localHull;\n for (int i = 1; i < size; i++) {\n std::vector recvHull;\n MPI_Recv(recvHull.data(), recvHull.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n hull.insert(hull.end(), recvHull.begin(), recvHull.end());\n }\n } else {\n MPI_Send(localHull.data(), localHull.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n\n // Remove duplicate points from the hull\n std::sort(hull.begin(), hull.end());\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull.push_back(points[leftmost]);\n\n // Find the convex hull using Graham's scan\n int p = leftmost;\n do {\n // Find the next point that is not on the same line as the previous two points\n int q = (p + 1) % points.size();\n while (orientation(points[p], points[q], points[(q + 1) % points.size()]) == 0) {\n q = (q + 1) % points.size();\n }\n\n // Add the next point to the hull\n hull.push_back(points[q]);\n\n // Update the current point\n p = q;\n } while (p != leftmost);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the points by x-coordinate\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], 
point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n\n // Return the hull on rank 0\n if (rank == 0) {\n return hull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n if (rank == 0) {\n hull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, hull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(hull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) 
{\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Start from the leftmost point, keep moving counter-clockwise\n // until we reach the start point again\n int p = leftmost;\n do {\n hull.push_back(points[p]);\n int q = (p + 1) % points.size();\n\n // Find the most counter-clockwise point from p\n for (int i = 0; i < points.size(); i++) {\n // Skip if i is the same as p or q\n if (i == p || i == q) {\n continue;\n }\n\n // Calculate the cross product of (q - p) x (i - p)\n double cross = (points[q].x - points[p].x) * (points[i].y - points[p].y) -\n (points[q].y - points[p].y) * (points[i].x - points[p].x);\n\n // If the cross product is positive, then i is more counter-clockwise than q\n if (cross > 0) {\n q = i;\n }\n }\n\n // Update p to be q\n p = q;\n } while (p != leftmost);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the leftmost point\n Point leftmost = points[0];\n for (size_t i = 1; i < points.size(); i++) {\n if (points[i].x < leftmost.x) {\n leftmost = points[i];\n }\n }\n\n // Broadcast the leftmost point to all ranks\n MPI_Bcast(&leftmost, sizeof(Point), MPI_BYTE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull on each rank\n std::vector localHull;\n convexHull(points, leftmost, localHull);\n\n // Gather the local convex hull from all ranks\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "cuda", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\n__global__ void convexHull(const Point *points, size_t numPoints, Point *hull, size_t hullSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n if (tid == 0) {\n qsort(hull, numPoints, sizeof(Point), [](const void *a, const void *b) {\n Point *pa = (Point *)a;\n Point *pb = (Point *)b;\n if (pa->x != pb->x) {\n return pa->x < pb->x ? -1 : 1;\n }\n return pa->y < pb->y ? -1 : 1;\n });\n int m = 0;\n for (int i = 0; i < numPoints; ++i) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m].y - hull[m - 1].y) * (hull[m - 1].x - hull[m - 2].x)) {\n --m;\n }\n hull[m++] = hull[i];\n }\n hullSize = m;\n }\n __syncthreads();\n if (tid < hullSize) {\n hull[tid] = hull[tid];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(b.y - points[0].y, b.x - points[0].x) < atan2(a.y - points[0].y, a.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHull(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.begin(), lowerHull.end(), hull + upperHull.size());\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, 
b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n double angleA = atan2(a.y - points[0].y, a.x - points[0].x);\n double angleB = atan2(b.y - points[0].y, b.x - points[0].x);\n return angleA < angleB;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n\n if (numPoints <= 1) {\n return;\n }\n\n // Sort the points by their x-coordinates\n thrust::sort(hull, hull + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the leftmost and rightmost points\n Point leftmost = hull[0];\n Point rightmost = hull[numPoints - 1];\n\n // Find the convex hull of the points to the left of the leftmost point\n int leftHullSize = 0;\n convexHull(hull, numPoints - 1, hull, leftHullSize);\n\n // Find the convex hull of the points to the right of the rightmost point\n int rightHullSize = 0;\n convexHull(hull + 1, numPoints - 1, hull + leftHullSize, rightHullSize);\n\n // Merge the two convex hulls\n int hullIndex = 0;\n int leftIndex = 0;\n int rightIndex = 0;\n while (leftIndex < leftHullSize && rightIndex < rightHullSize) {\n if (hull[leftIndex].y < hull[rightIndex].y) {\n hull[hullIndex++] = hull[leftIndex++];\n } else {\n hull[hullIndex++] = hull[rightIndex++];\n }\n }\n while (leftIndex < leftHullSize) {\n hull[hullIndex++] = hull[leftIndex++];\n }\n while (rightIndex < rightHullSize) {\n hull[hullIndex++] = hull[rightIndex++];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device_ptr(points), thrust::device_ptr(points + numPoints),\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector 
lowerHull;\n convexHullHelper(thrust::device_ptr(points), numPoints, &upperHull, &lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), thrust::device_ptr(hull));\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), thrust::device_ptr(hull + upperHull.size()));\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current convex hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n double angle1 = atan2(a.y - points[0].y, a.x - points[0].x);\n double angle2 = atan2(b.y - points[0].y, b.x - points[0].x);\n return angle1 < angle2;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Start from the leftmost point and keep moving counter-clockwise\n int hullSize = 0;\n int current = leftmost;\n do {\n // Add the current point to the hull\n hull[hullSize] = points[current];\n hullSize++;\n\n // Find the next point that is most counter-clockwise from the current point\n int next = (current + 1) % numPoints;\n for (int i = 0; i < numPoints; i++) {\n if (orientation(points[current], points[next], points[i]) == 2) {\n next = i;\n }\n }\n\n // Set the current point to the next point\n current = next;\n } while (current != 
leftmost);\n\n // Return the size of the hull\n return hullSize;\n}\n", + "\n\n // Sort the points by their x-coordinate\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), hull);\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= numPoints) {\n return;\n }\n\n // Find the point with the smallest y-coordinate\n int minIdx = idx;\n for (int i = idx + 1; i < numPoints; i++) {\n if (points[i].y < points[minIdx].y) {\n minIdx = i;\n }\n }\n\n // Swap the point with the smallest y-coordinate to the front of the array\n Point temp = points[idx];\n points[idx] = points[minIdx];\n points[minIdx] = temp;\n\n // Sort the points by their angle with respect to the point with the smallest y-coordinate\n std::sort(points + idx + 1, points + numPoints, [points, idx](const Point &a, const Point &b) {\n return (a.x - points[idx].x) * (b.y - points[idx].y) > (b.x - points[idx].x) * (a.y - points[idx].y);\n });\n\n // Initialize the hull with the first three points\n hull[0] = points[idx];\n hull[1] = points[idx + 1];\n hull[2] = points[idx + 2];\n int hullSize = 3;\n\n // Iterate over the remaining points\n for (int i = idx + 3; i < numPoints; i++) {\n // Check if the current point is inside the hull\n while (hullSize > 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) <=\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x)) {\n hullSize--;\n }\n\n // Add the current point to the hull\n hull[hullSize] = points[i];\n hullSize++;\n }\n\n // Return the hull\n hullSize = hullSize;\n}\n\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device_ptr(points), thrust::device_ptr(points + numPoints),\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls\n 
thrust::copy(upperHull.begin(), upperHull.end(), thrust::device_ptr(hull));\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), thrust::device_ptr(hull + upperHull.size()));\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n\n // Find the convex hull using Graham's scan\n int hullSize = 1;\n for (int i = 1; i < numPoints; i++) {\n // Find the next point that is not on the same line as the previous two points\n while (hullSize >= 2 &&\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 2].x - hull[hullSize - 1].x) <=\n (points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 2].y - hull[hullSize - 1].y)) {\n hullSize--;\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates.\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the leftmost point.\n Point leftmostPoint = points[0];\n\n // Find the rightmost point.\n Point rightmostPoint = points[numPoints - 1];\n\n // Find the points that are above the line defined by the leftmost and rightmost points.\n thrust::device_vector pointsAboveLine;\n thrust::copy_if(thrust::device, points, points + numPoints,\n thrust::make_device_vector(pointsAboveLine),\n [leftmostPoint, rightmostPoint](const Point &point) {\n return point.y > leftmostPoint.y + (rightmostPoint.y - leftmostPoint.y) *\n (point.x - leftmostPoint.x) /\n (rightmostPoint.x - leftmostPoint.x);\n });\n\n // Find the points that are below the line defined by the leftmost and rightmost points.\n thrust::device_vector pointsBelowLine;\n thrust::copy_if(thrust::device, points, points + numPoints,\n thrust::make_device_vector(pointsBelowLine),\n [leftmostPoint, rightmostPoint](const Point &point) {\n return point.y < leftmostPoint.y + (rightmostPoint.y - leftmostPoint.y) *\n (point.x - leftmostPoint.x) /\n (rightmostPoint.x - leftmostPoint.x);\n });\n\n // Find the convex hull of the points above the line.\n thrust::device_vector hullAboveLine;\n convexHull(thrust::device_ptr(pointsAboveLine.data()), pointsAboveLine.size(),\n thrust::device_ptr(hullAboveLine.data()), hullAboveLine.size());\n\n // Find the convex hull of the points below the line.\n thrust::device_vector hullBelowLine;\n convexHull(thrust::device_ptr(pointsBelowLine.data()), pointsBelowLine.size(),\n thrust::device_ptr(hullBelowLine.data()), hullBelowLine.size());\n\n // Merge the two convex hulls.\n thrust::copy(thrust::device, hullAboveLine.begin(), hullAboveLine.end(), hull);\n thrust::copy(thrust::device, hullBelowLine.begin(), hullBelowLine.end(), hull + hullAboveLine.size());\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(hull, hull + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <= (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n hullSize = m;\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower 
hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n }\n\n // If the hull has more than 2 points, remove the last point\n if (hullSize > 2) {\n hullSize--;\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of points. The final hull is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n 
}\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n\n // Return the hull on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return hull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = points.size();\n std::vector localHull;\n // Find the convex hull of the local points\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n std::vector upper, lower;\n for (int i = 0; i < n; i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], points[i]) <= 0)\n upper.pop_back();\n upper.push_back(points[i]);\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], points[i]) >= 0)\n lower.pop_back();\n lower.push_back(points[i]);\n }\n localHull.insert(localHull.end(), upper.begin(), upper.end());\n for (int i = lower.size() - 2; i >= 0; i--)\n localHull.push_back(lower[i]);\n }\n }\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n // Find the convex hull of the global points\n if (rank == 0) {\n std::sort(globalHull.begin(), globalHull.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n std::vector upper, lower;\n for (int i = 0; i < globalHull.size(); i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], globalHull[i]) <= 0)\n upper.pop_back();\n upper.push_back(globalHull[i]);\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], 
globalHull[i]) >= 0)\n lower.pop_back();\n lower.push_back(globalHull[i]);\n }\n hull.insert(hull.end(), upper.begin(), upper.end());\n for (int i = lower.size() - 2; i >= 0; i--)\n hull.push_back(lower[i]);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by polar angle with respect to the leftmost point\n std::sort(points.begin(), points.end(), [leftmost](Point a, Point b) {\n double dx1 = a.x - points[leftmost].x;\n double dy1 = a.y - points[leftmost].y;\n double dx2 = b.x - points[leftmost].x;\n double dy2 = b.y - points[leftmost].y;\n return (atan2(dy1, dx1) < atan2(dy2, dx2));\n });\n\n // Use Graham's scan to find the convex hull\n hull.push_back(points[leftmost]);\n int m = 1;\n for (int i = 1; i < points.size(); i++) {\n while (m >= 2 && (points[i].x - hull[m - 1].x) * (hull[m].y - hull[m - 1].y) - (points[i].y - hull[m - 1].y) * (hull[m].x - hull[m - 1].x) <= 0) {\n hull.pop_back();\n m--;\n }\n hull.push_back(points[i]);\n m++;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // 
Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Reduce the local hulls to a single global hull\n std::vector globalHull;\n MPI_Reduce(&localHull[0], &globalHull[0], localHull.size() * sizeof(Point), MPI_BYTE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // If this is rank 0, copy the global hull to the output hull\n if (rank == 0) {\n hull = globalHull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(points.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Find the convex hull of the points on each rank\n std::vector 
local_hull;\n convexHull(points, local_hull);\n\n // Gather all the local hulls on rank 0\n std::vector global_hull;\n if (my_rank == 0) {\n global_hull.resize(local_hull.size() * num_procs);\n }\n MPI_Gather(local_hull.data(), local_hull.size(), MPI_DOUBLE, global_hull.data(), local_hull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (my_rank == 0) {\n convexHull(global_hull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n if (rank == 0) {\n hull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, hull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(hull, hull);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(*convexHull[i], *convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n\t\tKOKKOS_LAMBDA(const int& i, const int& j) {\n\t\t\treturn points(i)->x < points(j)->x;\n\t\t});\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\thull.push_back(points(0));\n\thull.push_back(points(1));\n\tfor (int i = 2; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(hull[hull.size() - 2]->x - hull[hull.size() - 1]->x) * (points(i)->y - hull[hull.size() - 1]->y) -\n\t\t\t(hull[hull.size() - 2]->y - hull[hull.size() - 1]->y) * (points(i)->x - hull[hull.size() - 1]->x) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = 
Kokkos::TeamPolicy>;\n\tusing team_member_type = typename policy_type::member_type;\n\n\tconst size_t num_points = points.extent(0);\n\tconst size_t num_teams = (num_points + policy_type::team_size - 1) / policy_type::team_size;\n\n\t// Find the leftmost point\n\tteam_member_type team_member;\n\tPoint leftmost_point = points(0);\n\tfor (size_t i = 1; i < num_points; ++i) {\n\t\tif (points(i).x < leftmost_point.x) {\n\t\t\tleftmost_point = points(i);\n\t\t}\n\t}\n\n\t// Sort the points by polar angle with respect to the leftmost point\n\tstd::vector sorted_points(num_points);\n\tfor (size_t i = 0; i < num_points; ++i) {\n\t\tsorted_points[i] = points(i);\n\t}\n\tstd::sort(sorted_points.begin(), sorted_points.end(), [leftmost_point](Point const& p1, Point const& p2) {\n\t\tdouble angle1 = std::atan2(p1.y - leftmost_point.y, p1.x - leftmost_point.x);\n\t\tdouble angle2 = std::atan2(p2.y - leftmost_point.y, p2.x - leftmost_point.x);\n\t\treturn angle1 < angle2;\n\t});\n\n\t// Compute the perimeter of the convex hull using Graham's scan\n\tstd::vector convex_hull;\n\tconvex_hull.push_back(sorted_points[0]);\n\tconvex_hull.push_back(sorted_points[1]);\n\tfor (size_t i = 2; i < num_points; ++i) {\n\t\twhile (convex_hull.size() >= 2 &&\n\t\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t (sorted_points[i].y - convex_hull[convex_hull.size() - 1].y) -\n\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) *\n\t\t\t\t (sorted_points[i].x - convex_hull[convex_hull.size() - 1].x) < 0) {\n\t\t\tconvex_hull.pop_back();\n\t\t}\n\t\tconvex_hull.push_back(sorted_points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convex_hull.size(); ++i) {\n\t\tperimeter += distance(convex_hull[i], convex_hull[(i + 1) % convex_hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.data(), points.data() + points.extent(0), [](const Point* p1, const Point* p2) { return p1->x < p2->x; });\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector hull;\n\tfor (int i = 0; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tconst Point* p1 = hull[i];\n\t\t\tconst Point* p2 = hull[i + 1];\n\t\t\tconst Point* p3 = points[i];\n\t\t\tdouble crossProduct = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\tsum += crossProduct;\n\t\t}, Kokkos::Sum(0.0)) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tfor (int i = points.extent(0) - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tconst Point* p1 = hull[i];\n\t\t\tconst Point* p2 = hull[i + 1];\n\t\t\tconst Point* p3 = points[i];\n\t\t\tdouble crossProduct = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\tsum += crossProduct;\n\t\t}, Kokkos::Sum(0.0)) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points\n\tstd::vector 
hull;\n\tconvex_hull(points.data(), points.size(), std::back_inserter(hull));\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n\t\tKOKKOS_LAMBDA(const int& i, const int& j) {\n\t\t\treturn points(i)->x < points(j)->x;\n\t\t});\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points(0));\n\tconvexHull.push_back(points(points.extent(0) - 1));\n\tfor (int i = 1; i < points.extent(0) - 1; i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2]->y - convexHull[convexHull.size() - 1]->y) *\n\t\t\t(convexHull[convexHull.size() - 1]->x - points(i)->x) <\n\t\t\t(convexHull[convexHull.size() - 1]->y - points(i)->y) *\n\t\t\t(convexHull[convexHull.size() - 2]->x - convexHull[convexHull.size() - 1]->x)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(*convexHull[i], *convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tgrahamScan(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Find the convex hull of the points\n\tstd::vector hull_points;\n\tconvex_hull(points, hull_points);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull_points.size(); ++i) {\n\t\tsize_t j = (i + 1) % hull_points.size();\n\t\tperimeter += distance(hull_points[i], hull_points[j]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points\n\tstd::vector hull;\n\tconvex_hull(points.data(), points.extent(0), std::back_inserter(hull));\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.extent(0));\n\tif (points.extent(0) < 3) {\n\t\tfor (int i = 0; i < points.extent(0); ++i) 
{\n\t\t\tconvexHull.push_back(points[i]);\n\t\t}\n\t} else {\n\t\t// Find the leftmost point.\n\t\tint leftmost = 0;\n\t\tfor (int i = 1; i < points.extent(0); ++i) {\n\t\t\tif (points[i].x < points[leftmost].x) {\n\t\t\t\tleftmost = i;\n\t\t\t}\n\t\t}\n\n\t\t// Start from the leftmost point and keep moving clockwise until we reach the leftmost point again.\n\t\tint current = leftmost;\n\t\tdo {\n\t\t\tconvexHull.push_back(points[current]);\n\t\t\tint next = (current + 1) % points.extent(0);\n\t\t\tfor (int i = 0; i < points.extent(0); ++i) {\n\t\t\t\tif (orientation(points[current], points[next], points[i]) == 2) {\n\t\t\t\t\tnext = i;\n\t\t\t\t}\n\t\t\t}\n\t\t\tcurrent = next;\n\t\t} while (current != leftmost);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int& i, const int& j) {\n\t\treturn points(i)->x < points(j)->x;\n\t});\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\tfor (int i = 0; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), 
KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tPoint* p1 = hull[i];\n\t\t\tPoint* p2 = hull[i + 1];\n\t\t\tPoint* p3 = points(i);\n\t\t\tdouble cross = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\treturn sum + cross;\n\t\t}, 0.0) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[points.size() - 1]);\n\n\tfor (int i = 1; i < points.size() - 1; i++) {\n\t\tPoint const& p = points[i];\n\t\tif (convexHull.size() < 2 || (convexHull.back().x - convexHull[convexHull.size() - 2].x) * (p.y - convexHull.back().y) - (convexHull.back().y - convexHull[convexHull.size() - 2].y) * (p.x - convexHull.back().x) > 0) {\n\t\t\tconvexHull.push_back(p);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector 
convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && crossProduct(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && (upperHull.back().y - upperHull[upperHull.size() - 2].y) * (point.x - upperHull.back().x) < (upperHull.back().x - upperHull[upperHull.size() - 2].x) * (point.y - upperHull.back().y)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\t\twhile (lowerHull.size() >= 2 && (lowerHull.back().y - lowerHull[lowerHull.size() - 2].y) * (point.x - lowerHull.back().x) > (lowerHull.back().x - lowerHull[lowerHull.size() - 2].x) * (point.y - lowerHull.back().y)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 2; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\tfor (int i = 2; i < n; i++) {\n\t\twhile (hull.size() >= 2 && crossProduct(hull[hull.size() - 2], hull[hull.size() - 1], points[i]) <= 0) 
{\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t(points[i].y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t(points[i].x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tstd::vector hull;\n\tstd::vector visited(points.size(), false);\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\tint current = leftmost;\n\tdo {\n\t\thull.push_back(points[current]);\n\t\tvisited[current] = true;\n\n\t\t// Find the next point on the hull\n\t\tint next = -1;\n\t\tfor (int i = 0; i < points.size(); i++) {\n\t\t\tif (!visited[i] && (next == -1 || points[i].x * hull.back().y - points[i].y * hull.back().x > points[next].x * hull.back().y - points[next].y * hull.back().x)) {\n\t\t\t\tnext = i;\n\t\t\t}\n\t\t}\n\t\tcurrent = next;\n\t} while (current != leftmost);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[i + 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& 
p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (point.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) >\n\t\t\t\t (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - point.x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (point.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) <\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - point.x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n\t\t\t\t (point.x - upperHull[upperHull.size() - 1].x) <=\n\t\t\t\t (upperHull[upperHull.size() - 1].y - point.y) *\n\t\t\t\t\t (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n\t\t\t\t (point.x - lowerHull[lowerHull.size() - 1].x) >=\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - point.y) *\n\t\t\t\t\t (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; --i) 
{\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n < 3) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::vector visited(n, false);\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\tint p = leftmost;\n\tdo {\n\t\thull.push_back(points[p]);\n\t\tvisited[p] = true;\n\t\tint q = (p + 1) % n;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\tif (!visited[i] && (orientation(points[p], points[i], points[q]) == 2)) {\n\t\t\t\tq = i;\n\t\t\t}\n\t\t}\n\t\tp = q;\n\t} while (p != leftmost);\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (Point const& p : points) {\n\t\twhile (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], p) <= 0)\n\t\t\tupperHull.pop_back();\n\t\tupperHull.push_back(p);\n\n\t\twhile (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], p) >= 0)\n\t\t\tlowerHull.pop_back();\n\t\tlowerHull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::vector visited(n, false);\n\tint p = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x < points[p].x) {\n\t\t\tp = i;\n\t\t}\n\t}\n\tint q = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x > points[q].x) {\n\t\t\tq = i;\n\t\t}\n\t}\n\thull.push_back(points[p]);\n\thull.push_back(points[q]);\n\tvisited[p] = true;\n\tvisited[q] = true;\n\twhile (p != q) {\n\t\tint next = -1;\n\t\tdouble max_dist = -1;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\tif (!visited[i]) {\n\t\t\t\tdouble dist = distance(points[i], hull.back());\n\t\t\t\tif (dist > max_dist) {\n\t\t\t\t\tmax_dist = dist;\n\t\t\t\t\tnext = i;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\thull.push_back(points[next]);\n\t\tvisited[next] = true;\n\t\tp = q;\n\t\tq = next;\n\t}\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && crossProduct(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < 
convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinate\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tupperHull.push_back(points[0]);\n\tupperHull.push_back(points[1]);\n\tlowerHull.push_back(points[0]);\n\tlowerHull.push_back(points[n - 1]);\n\n\tfor (int i = 2; i < n; i++) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - upperHull[upperHull.size() - 1].x) <=\n\t\t\t\t (upperHull[upperHull.size() - 1].y - points[i].y) *\n\t\t\t\t\t (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - lowerHull[lowerHull.size() - 1].x) >=\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - points[i].y) *\n\t\t\t\t\t (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (int i = 0; i < lowerHull.size() - 1; i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tperimeter += distance(upperHull[upperHull.size() - 1], lowerHull[lowerHull.size() - 1]);\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(points[hull[i]], points[hull[(i + 1) % hull.size()]]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the lower and upper 
hulls\n\tstd::vector lowerHull, upperHull;\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) == -1) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) == 1) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < lowerHull.size() - 1; ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tfor (size_t i = upperHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (auto& point : points) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t (point.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t\t (hull.back().y - hull[hull.size() - 2].y) * (point.x - hull.back().x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(point);\n\t}\n\n\tint t = hull.size() - 1;\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() > t &&\n\t\t\t (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t\t (hull.back().y - hull[hull.size() - 2].y) * (points[i].x - hull.back().x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <= (points[i].x - hull.back().x) * (hull[hull.size() - 2].y - hull.back().y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile 
(hull.size() >= 2 && (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (p.x - hull[hull.size() - 2].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan algorithm\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && orientation(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.empty()) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t(points[i].y - hull.back().y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif 
(points.size() < 3) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n\t\treturn a.x < b.x;\n\t});\n\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n\t\treturn a.y < b.y;\n\t});\n\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (p.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (p.y - hull[hull.size() - 2].y) * (hull.back().x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (p.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (p.y - hull[hull.size() - 2].y) * (hull.back().x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (points[i].x - hull.back().x) * (hull[hull.size() - 2].y - hull.back().y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the lower convex hull\n\tfor (Point const& point : points) {\n\t\twhile (hull.size() >= 2 && (point.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) > (point.y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - point.x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(point);\n\t}\n\n\t// Find the upper convex hull\n\tfor (int i = points.size() - 
2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) > (points[i].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\n\tfor (Point const& p : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t(p.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) >\n\t\t\t(upperHull[upperHull.size() - 1].y - upperHull[upperHull.size() - 2].y) * (p.x - upperHull[upperHull.size() - 2].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(p);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t(p.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) <\n\t\t\t(lowerHull[lowerHull.size() - 1].y - lowerHull[lowerHull.size() - 2].y) * (p.x - lowerHull[lowerHull.size() - 2].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(p);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; i--) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].x - 
hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y) - (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the leftmost and rightmost points\n\tPoint leftmost = points[0];\n\tPoint rightmost = points[points.size() - 1];\n\n\t// Add the leftmost and rightmost points to the hull\n\thull.push_back(leftmost);\n\thull.push_back(rightmost);\n\n\t// Find the upper and lower tangents of the convex hull\n\tstd::vector upperTangent;\n\tstd::vector lowerTangent;\n\n\t// Find the upper tangent\n\tfor (int i = 1; i < points.size(); i++) {\n\t\twhile (upperTangent.size() >= 2 && crossProduct(upperTangent[upperTangent.size() - 2], upperTangent[upperTangent.size() - 1], points[i]) <= 0) {\n\t\t\tupperTangent.pop_back();\n\t\t}\n\t\tupperTangent.push_back(points[i]);\n\t}\n\n\t// Find the lower tangent\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (lowerTangent.size() >= 2 && crossProduct(lowerTangent[lowerTangent.size() - 2], lowerTangent[lowerTangent.size() - 1], points[i]) <= 0) {\n\t\t\tlowerTangent.pop_back();\n\t\t}\n\t\tlowerTangent.push_back(points[i]);\n\t}\n\n\t// Merge the upper and lower tangents into the hull\n\tfor (int i = 0; i < lowerTangent.size(); i++) {\n\t\thull.push_back(lowerTangent[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y) - (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += 
distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(points[i].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the lower hull\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(p.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(hull[hull.size() - 1].y - p.y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Find the upper hull\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(p.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(hull[hull.size() - 1].y - p.y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() 
- 2].x) -\n\t\t\t(points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distance(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Store the result in perimeter.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\n__global__ void convexHullPerimeter(const Point *points, size_t numPoints, double *perimeter) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Compute the convex hull of the points.\n\tsize_t numVertices;\n\tPoint *vertices;\n\tconvexHull(points, numPoints, vertices, numVertices);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0.0;\n\tfor (size_t i = 0; i < numVertices; i++) {\n\t\t*perimeter += distance(vertices[i], vertices[i + 1]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tPoint p0 = points[0];\n\tdouble minX = p0.x;\n\tfor (size_t i = 1; i < numPoints; i++) {\n\t\tif (points[i].x < minX) {\n\t\t\tminX = points[i].x;\n\t\t\tp0 = points[i];\n\t\t}\n\t}\n\n\tstd::vector hull;\n\thull.push_back(p0);\n\n\tPoint next = p0;\n\tdo {\n\t\tPoint bestPoint = points[0];\n\t\tdouble maxAngle = -1;\n\t\tfor (size_t i = 1; i < numPoints; i++) {\n\t\t\tif (points[i] == next) continue;\n\t\t\tdouble angle = atan2(points[i].y - next.y, 
points[i].x - next.x);\n\t\t\tif (angle > maxAngle) {\n\t\t\t\tmaxAngle = angle;\n\t\t\t\tbestPoint = points[i];\n\t\t\t}\n\t\t}\n\t\thull.push_back(bestPoint);\n\t\tnext = bestPoint;\n\t} while (next != p0);\n\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points.\n\tstd::vector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tPoint p0 = points[0];\n\tdouble minX = p0.x, maxX = p0.x;\n\tdouble minY = p0.y, maxY = p0.y;\n\tfor (size_t i = 1; i < numPoints; i++) {\n\t\tPoint p = points[i];\n\t\tminX = min(minX, p.x);\n\t\tmaxX = max(maxX, p.x);\n\t\tminY = min(minY, p.y);\n\t\tmaxY = max(maxY, p.y);\n\t}\n\n\tvector upper, lower;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tPoint p = points[i];\n\t\tif (p.y == minY) {\n\t\t\tlower.push_back(p);\n\t\t}\n\t\tif (p.y == maxY) {\n\t\t\tupper.push_back(p);\n\t\t}\n\t}\n\n\tsort(lower.begin(), lower.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tsort(upper.begin(), upper.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\tvector ch;\n\tch.push_back(lower[0]);\n\tfor (size_t i = 1; i < lower.size(); i++) {\n\t\twhile (ch.size() >= 2 && (ch.back().y - ch[ch.size() - 2].y) * (lower[i].x - ch.back().x) < (ch.back().x - ch[ch.size() - 2].x) * (lower[i].y - ch.back().y)) {\n\t\t\tch.pop_back();\n\t\t}\n\t\tch.push_back(lower[i]);\n\t}\n\n\tfor (size_t i = upper.size() - 1; i >= 0; i--) {\n\t\twhile (ch.size() >= 2 && (ch.back().y - ch[ch.size() - 2].y) * (upper[i].x - ch.back().x) < (ch.back().x - ch[ch.size() - 2].x) * (upper[i].y - ch.back().y)) {\n\t\t\tch.pop_back();\n\t\t}\n\t\tch.push_back(upper[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < ch.size(); i++) {\n\t\tperimeter += distance(ch[i], ch[(i + 1) % ch.size()]);\n\t}\n\n\t*perimeter = perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + 
{ + "generated_output": "\n\n\t// Find the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint numBlocks = hipGridDim_x;\n\tint numThreads = hipBlockDim_x;\n\n\tPoint p1 = points[tid];\n\tdouble minDist = DBL_MAX;\n\tPoint p2;\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tif (i == tid) continue;\n\t\tPoint p3 = points[i];\n\t\tdouble dist = distance(p1, p3);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tp2 = p3;\n\t\t}\n\t}\n\n\t__shared__ double sharedMinDist[numBlocks];\n\t__shared__ Point sharedP2[numBlocks];\n\n\tsharedMinDist[bid] = minDist;\n\tsharedP2[bid] = p2;\n\n\t__syncthreads();\n\n\tdouble globalMinDist = DBL_MAX;\n\tPoint globalP2;\n\tfor (int i = 0; i < numBlocks; i++) {\n\t\tif (sharedMinDist[i] < globalMinDist) {\n\t\t\tglobalMinDist = sharedMinDist[i];\n\t\t\tglobalP2 = sharedP2[i];\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*perimeter = 2 * globalMinDist + distance(p1, globalP2);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < numPoints; ++i) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\t// Initialize the convex hull with the leftmost point\n\tstd::vector hull;\n\thull.push_back(points[leftmost]);\n\n\t// Find the convex hull using Graham's scan\n\tint p = leftmost;\n\tdo {\n\t\tint q = (p + 1) % numPoints;\n\t\tfor (int i = 0; i < numPoints; ++i) {\n\t\t\tif (i == p || i == q) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tif (orientation(points[p], points[q], points[i]) == -1) {\n\t\t\t\tq = i;\n\t\t\t}\n\t\t}\n\t\thull.push_back(points[q]);\n\t\tp = q;\n\t} while (p != leftmost);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (int i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the 
leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < numPoints; i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\t// Initialize the convex hull with the leftmost point\n\tvector hull = { points[leftmost] };\n\n\t// Find the next point to add to the convex hull\n\twhile (true) {\n\t\t// Find the point that is farthest from the line connecting the last two points in the convex hull\n\t\tint farthest = -1;\n\t\tdouble maxDistance = 0;\n\t\tfor (int i = 0; i < numPoints; i++) {\n\t\t\t// Skip points that are already in the convex hull\n\t\t\tif (find(hull.begin(), hull.end(), points[i]) != hull.end()) {\n\t\t\t\tcontinue;\n\t\t\t}\n\n\t\t\t// Compute the distance from the point to the line connecting the last two points in the convex hull\n\t\t\tPoint p1 = hull[hull.size() - 2];\n\t\t\tPoint p2 = hull[hull.size() - 1];\n\t\t\tdouble distance = fabs((p2.y - p1.y) * points[i].x - (p2.x - p1.x) * points[i].y + p2.x * p1.y - p2.y * p1.x) / sqrt(pow(p2.y - p1.y, 2) + pow(p2.x - p1.x, 2));\n\n\t\t\t// If the distance is greater than the maximum distance so far, update the farthest point and the maximum distance\n\t\t\tif (distance > maxDistance) {\n\t\t\t\tfarthest = i;\n\t\t\t\tmaxDistance = distance;\n\t\t\t}\n\t\t}\n\n\t\t// If no point is farther from the line connecting the last two points in the convex hull, the convex hull is complete\n\t\tif (farthest == -1) {\n\t\t\tbreak;\n\t\t}\n\n\t\t// Add the farthest point to the convex hull\n\t\thull.push_back(points[farthest]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[i + 1]);\n\t}\n\n\t// Store the perimeter in the output variable\n\t*perimeter = perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points, points + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && (upperHull[upperHull.size() - 1].y - upperHull[upperHull.size() - 2].y) * (points[i].x - upperHull[upperHull.size() - 2].x) < (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) * (points[i].y - upperHull[upperHull.size() - 2].y)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && (lowerHull[lowerHull.size() - 1].y - lowerHull[lowerHull.size() - 2].y) * (points[i].x - lowerHull[lowerHull.size() - 2].x) > (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) * (points[i].y - lowerHull[lowerHull.size() - 2].y)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\t*perimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\t*perimeter += distance(lowerHull[i], lowerHull[(i + 1) % 
lowerHull.size()]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Compute the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tvector hull;\n\tfor (size_t i = 0; i < numPoints; ++i) {\n\t Point p = points[i];\n\t while (hull.size() >= 2 && crossProduct(hull[hull.size() - 2], hull.back(), p) <= 0) {\n\t hull.pop_back();\n\t }\n\t hull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, 
+ "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Sort the points by their x-coordinate\n\tstd::sort(points, points + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\t*perimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\t*perimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Compute the convex hull of the points\n\tPoint *hull = new Point[numPoints];\n\tint hullSize = convexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hullSize; i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hullSize]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. 
The final result is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[points.size() - 1]);\n\tfor (int i = 1; i < points.size() - 1; i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::vector hullIndices;\n\thullIndices.reserve(points.size());\n\tgrahamScan(points, convexHull, hullIndices);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint nextIndex = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[nextIndex]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, 
&globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2) {\n\t\t\tPoint p1 = convexHull[convexHull.size() - 2];\n\t\t\tPoint p2 = convexHull[convexHull.size() - 1];\n\t\t\tPoint p3 = points[i];\n\t\t\tdouble cross = (p2.x - p1.x) * (p3.y - p2.y) - (p2.y - p1.y) * (p3.x - p2.x);\n\t\t\tif (cross > 0) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint left = 0, right = points.size() - 1;\n\twhile (left < right) {\n\t\tconvexHull.push_back(points[left++]);\n\t\twhile (left < right && (convexHull.back().y - points[left].y) * (convexHull.back().x - points[right].x) <= (convexHull.back().y - points[right].y) * (convexHull.back().x - points[left].x)) {\n\t\t\tright--;\n\t\t}\n\t\tconvexHull.push_back(points[right--]);\n\t\twhile (left < right && (convexHull.back().y - points[left].y) * (convexHull.back().x - points[right].x) <= (convexHull.back().y - points[right].y) * (convexHull.back().x - points[left].x)) {\n\t\t\tleft++;\n\t\t}\n\t}\n\tconvexHull.push_back(points[left]);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Send the convex hull to rank 0\n\tstd::vector globalConvexHull;\n\tMPI_Gather(&convexHull[0], convexHull.size(), MPI_DOUBLE, &globalConvexHull[0], 
globalConvexHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Compute the perimeter of the convex hull on rank 0\n\tif (rank == 0) {\n\t\tdouble perimeter = 0;\n\t\tfor (size_t i = 0; i < globalConvexHull.size(); i++) {\n\t\t\tperimeter += distance(globalConvexHull[i], globalConvexHull[(i + 1) % globalConvexHull.size()]);\n\t\t}\n\t\treturn perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint const rank = MPI::COMM_WORLD.Get_rank();\n\tint const size = MPI::COMM_WORLD.Get_size();\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tconvexHull.push_back(points[0]);\n\tfor (size_t i = 1; i < points.size(); ++i) {\n\t\t// Find the first point in the convex hull that is to the left of the line through the last two points\n\t\tsize_t j = convexHull.size() - 1;\n\t\twhile (j > 0 && (convexHull[j].x - convexHull[j-1].x) * (points[i].y - convexHull[j].y) -\n\t\t\t(convexHull[j].y - convexHull[j-1].y) * (points[i].x - convexHull[j].x) < 0) {\n\t\t\t--j;\n\t\t}\n\t\t// Add the point to the convex hull\n\t\tconvexHull.insert(convexHull.begin() + j + 1, points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI::COMM_WORLD.Reduce(&perimeter, &globalPerimeter, 1, MPI::DOUBLE, MPI::SUM, 0);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeters from all ranks to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the global perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the total perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (auto const& point : points) 
{\n\t\tif (convexHull.empty()) {\n\t\t\tconvexHull.push_back(point);\n\t\t} else {\n\t\t\twhile (convexHull.size() >= 2 &&\n\t\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t\t(point.y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t(point.x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\t\tconvexHull.pop_back();\n\t\t\t}\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2 && (points[next] - convexHull.back()).cross(points[i] - convexHull.back()) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || convexHull.back() != point) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t\twhile (convexHull.size() >= 3 &&\n\t\t\t (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 3].x) *\n\t\t\t\t\t (point.y - convexHull[convexHull.size() - 2].y) -\n\t\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 3].y) *\n\t\t\t\t\t (point.x - convexHull[convexHull.size() - 2].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each 
rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Find the total perimeter of the convex hulls on all ranks\n\tdouble totalPerimeter = 0;\n\tfor (int i = 0; i < size; i++) {\n\t\tif (i == rank) {\n\t\t\tfor (int j = 0; j < convexHull.size(); j++) {\n\t\t\t\ttotalPerimeter += distance(convexHull[j], convexHull[(j + 1) % convexHull.size()]);\n\t\t\t}\n\t\t}\n\t\tMPI_Reduce(&totalPerimeter, &totalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\t}\n\n\t// Return the total perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn totalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (auto const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Reduce the convex hulls from all ranks to rank 0\n\tstd::vector globalConvexHull;\n\tMPI_Reduce(&convexHull[0], &globalConvexHull[0], convexHull.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Compute the perimeter of the global convex hull on rank 0\n\tif (rank == 0) {\n\t\tdouble perimeter = 0;\n\t\tfor (size_t i = 0; i < globalConvexHull.size(); i++) {\n\t\t\tperimeter += distance(globalConvexHull[i], globalConvexHull[(i + 1) % globalConvexHull.size()]);\n\t\t}\n\t\treturn perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (auto it = points.begin(); it != points.end(); ++it) {\n\t\twhile (convexHull.size() >= 2 && (it->y - convexHull[convexHull.size() - 2].y) * (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) < (it->x - convexHull[convexHull.size() - 2].x) * (convexHull[convexHull.size() - 1].y - convexHull[convexHull.size() - 2].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(*it);\n\t}\n\tconvexHull.pop_back();\n\tfor (auto it = points.rbegin(); it != points.rend(); ++it) {\n\t\twhile (convexHull.size() >= 2 && (it->y - convexHull[convexHull.size() - 2].y) * (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) < (it->x - convexHull[convexHull.size() - 2].x) * (convexHull[convexHull.size() - 1].y - convexHull[convexHull.size() - 2].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(*it);\n\t}\n\tconvexHull.pop_back();\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2 && (points[next] - convexHull.back()).cross(points[i] - convexHull.back()) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& p : points) {\n\t}\n\n\t// Compute the total perimeter\n\tdouble totalPerimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t}\n\n\t// Reduce the total perimeter to rank 0\n\tdouble finalPerimeter;\n\tMPI_Reduce(&totalPerimeter, &finalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the final perimeter on rank 0\n\tif (rank == 0) {\n\t}\n}\n", + "Hull: 1. Synag", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || convexHull.back() != point) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t\twhile (convexHull.size() >= 3 &&\n\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 3].x) *\n\t\t\t\t(point.y - convexHull[convexHull.size() - 2].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 3].y) *\n\t\t\t\t(point.x - convexHull[convexHull.size() - 2].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + 
"parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distance(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Store the result in perimeter.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\n__global__ void convexHullPerimeter(const Point *points, size_t numPoints, double *perimeter) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint numBlocks = gridDim.x;\n\tint numThreads = blockDim.x;\n\tint start = tid + bid * numThreads;\n\tdouble localPerimeter = 0;\n\tfor (int i = start; i < numPoints - 1; i++) {\n\t\tlocalPerimeter += distance(points[i], points[i + 1]);\n\t}\n\tlocalPerimeter += distance(points[numPoints - 1], points[0]);\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tdouble globalPerimeter = 0;\n\t\tfor (int i = 0; i < numBlocks; i++) {\n\t\t\tdouble blockPerimeter;\n\t\t\tcudaMemcpy(&blockPerimeter, &localPerimeter, sizeof(double), cudaMemcpyDeviceToHost);\n\t\t\tglobalPerimeter += blockPerimeter;\n\t\t}\n\t\t*perimeter = globalPerimeter;\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\tPoint *convexHull;\n\tsize_t convexHullSize;\n\tconvexHull2(points, numPoints, &convexHull, &convexHullSize);\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < convexHullSize; i++) {\n\t\t*perimeter += distance(convexHull[i], convexHull[(i + 1) % convexHullSize]);\n\t}\n\tfree(convexHull);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tPoint p = points[tid];\n\tdouble minDist = DBL_MAX;\n\tPoint closestPoint;\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tif (i == tid) {\n\t\t\tcontinue;\n\t\t}\n\t\tPoint q = points[i];\n\t\tdouble dist = distance(p, q);\n\t\tif (dist < minDist) {\n\t\t\tminDist = 
dist;\n\t\t\tclosestPoint = q;\n\t\t}\n\t}\n\t*perimeter += minDist;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tPoint *sortedPoints = new Point[numPoints];\n\tthrust::copy(points, points + numPoints, sortedPoints);\n\tthrust::sort(sortedPoints, sortedPoints + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\twhile (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], sortedPoints[i]) <= 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(sortedPoints[i]);\n\t\twhile (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], sortedPoints[i]) >= 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(sortedPoints[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= numPoints) return;\n\n\tPoint p = points[i];\n\tdouble minDist = DBL_MAX;\n\tPoint minPoint;\n\tfor (int j = 0; j < numPoints; j++) {\n\t\tif (i == j) continue;\n\t\tPoint q = points[j];\n\t\tdouble dist = distance(p, q);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tminPoint = q;\n\t\t}\n\t}\n\t*perimeter += minDist;\n}\n", + "Hull.2)", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tPoint *sortedPoints = new Point[numPoints];\n\tmemcpy(sortedPoints, points, numPoints * sizeof(Point));\n\tstd::sort(sortedPoints, sortedPoints + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector 
upperHull;\n\tstd::vector lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], sortedPoints[i]) <= 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(sortedPoints[i]);\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], sortedPoints[i]) >= 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(sortedPoints[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size() - 1; i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tperimeter += distance(upperHull[upperHull.size() - 1], lowerHull[lowerHull.size() - 1]);\n\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tPoint p = points[tid];\n\t\tdouble minDist = DBL_MAX;\n\t\tPoint closestPoint;\n\t\tfor (int i = 0; i < numPoints; i++) {\n\t\t\tif (i == tid) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tPoint q = points[i];\n\t\t\tdouble dist = distance(p, q);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t\tclosestPoint = q;\n\t\t\t}\n\t\t}\n\t\tatomicAdd(perimeter, minDist);\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. 
The final result is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (int i = 0; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\tint next = i + 1;\n\t\tif (next == points.size()) {\n\t\t\tnext = 0;\n\t\t}\n\t\tint prev = i - 1;\n\t\tif (prev == -1) {\n\t\t\tprev = points.size() - 1;\n\t\t}\n\t\tdouble area = (points[next].x - points[prev].x) * (points[i].y - points[prev].y) - (points[next].x - points[prev].x) * (points[i].y - points[next].y);\n\t\tif (area >= 0) {\n\t\t\tconvexHull.push_back(points[i]);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = i + 1;\n\t\tif (next == convexHull.size()) {\n\t\t\tnext = 0;\n\t\t}\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < 
p2.x; });\n\tfor (int i = 0; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint n = points.size();\n\tif (n <= 2) {\n\t\tconvexHull = points;\n\t} else {\n\t\tint m = 0;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\twhile (m >= 2 && (points[i].x - convexHull[m-1].x) * (convexHull[m-2].y - convexHull[m-1].y) - (points[i].y - convexHull[m-1].y) * (convexHull[m-2].x - convexHull[m-1].x) < 0) {\n\t\t\t\tm--;\n\t\t\t}\n\t\t\tconvexHull[m++] = points[i];\n\t\t}\n\t\tfor (int i = n-2, t = m+1; i >= 0; i--) {\n\t\t\twhile (m >= t && (points[i].x - convexHull[m-1].x) * (convexHull[m-2].y - convexHull[m-1].y) - (points[i].y - convexHull[m-1].y) * (convexHull[m-2].x - convexHull[m-1].x) < 0) {\n\t\t\t\tm--;\n\t\t\t}\n\t\t\tconvexHull[m++] = points[i];\n\t\t}\n\t\tconvexHull.resize(m);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham scan algorithm\n\tstd::vector convexHull = grahamScan(points);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else 
{\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t\t (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) *\n\t\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t(convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham scan\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y); });\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size()-2].y - hull[hull.size()-1].y) * (p.x - hull[hull.size()-1].x) < (hull[hull.size()-1].x - hull[hull.size()-2].x) * (p.y - hull[hull.size()-1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\tint n = hull.size();\n\tfor (int i = n-2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size()-2].y - hull[hull.size()-1].y) * (p.x - hull[hull.size()-1].x) < (hull[hull.size()-1].x - hull[hull.size()-2].x) * (p.y - hull[hull.size()-1].y)) 
{\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < n; i++) {\n\t\tperimeter += distance(hull[i], hull[(i+1)%n]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the hull in parallel\n\tdouble perimeter = 0.0;\n\t#pragma omp parallel for reduction(+:perimeter)\n\tfor (int i = 0; i < hull.size(); ++i) {\n\t\tint next = (i + 1) % hull.size();\n\t\tperimeter += distance(hull[i], hull[next]);\n\t}\n\n\t// Return the perimeter\n\treturn perimeter;\n}\n\n", + "\n\n\tint const rank = 0;\n\tint const size = MPI::COMM_WORLD.Get_size();\n\tint const points_per_rank = points.size() / size;\n\tint const remainder = points.size() % size;\n\n\tstd::vector local_points = points;\n\tlocal_points.resize(points_per_rank + (rank < remainder));\n\n\tstd::vector convex_hull;\n\tconvexHull(local_points, convex_hull);\n\n\tdouble local_perimeter = 0;\n\tfor (size_t i = 0; i < convex_hull.size(); ++i) {\n\t\tlocal_perimeter += distance(convex_hull[i], convex_hull[(i+1)%convex_hull.size()]);\n\t}\n\n\tdouble global_perimeter;\n\tMPI::COMM_WORLD.Reduce(&local_perimeter, &global_perimeter, 1, MPI_DOUBLE, MPI_SUM, rank);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint n = points.size();\n\tfor (int i = 0; i < n; i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <= (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = n - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <= (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of 
the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint left = 0, right = 0;\n\twhile (left < points.size()) {\n\t\twhile (right < points.size() && points[right].x == points[left].x) {\n\t\t\tright++;\n\t\t}\n\t\tint n = right - left;\n\t\tif (n > 1) {\n\t\t\tstd::sort(points.begin() + left, points.begin() + right, [](Point const& p1, Point const& p2) { return p1.y < p2.y; });\n\t\t\tfor (int i = 0; i < n; i++) {\n\t\t\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[left + i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) * (points[left + i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\t\t\tconvexHull.pop_back();\n\t\t\t\t}\n\t\t\t\tconvexHull.push_back(points[left + i]);\n\t\t\t}\n\t\t}\n\t\tleft = right;\n\t}\n\tconvexHull.push_back(convexHull[0]);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size() - 1; i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[i + 1]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = points.size();\n\tstd::vector hull;\n\n\t// Find the convex hull of the points using Graham's scan.\n\tfor (int i = 0; i < n; i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points[i].x - hull[hull.size() - 1].x) <\n\t\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tint m = hull.size();\n\tfor (int i = n - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= m &&\n\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points[i].x - hull[hull.size() - 1].x) <\n\t\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\thull.pop_back();\n\n\t// Compute the perimeter of the convex hull in parallel.\n\tdouble perimeter = 0;\n\t#pragma omp parallel for reduction(+ : perimeter)\n\tfor (int i = 0; i < m; i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % m]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble global_perimeter;\n\tMPI_Reduce(&perimeter, &global_perimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 
-1;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (Point const& p : points) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size()-2].y - convexHull[convexHull.size()-1].y) * (p.x - convexHull[convexHull.size()-1].x) < (convexHull[convexHull.size()-1].x - convexHull[convexHull.size()-2].x) * (p.y - convexHull[convexHull.size()-1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(p);\n\t}\n\tconvexHull.pop_back();\n\tfor (int i = points.size()-2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size()-2].y - convexHull[convexHull.size()-1].y) * (points[i].x - convexHull[convexHull.size()-1].x) < (convexHull[convexHull.size()-1].x - convexHull[convexHull.size()-2].x) * (points[i].y - convexHull[convexHull.size()-1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tconvexHull.pop_back();\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t\t (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) *\n\t\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 
0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint const rank = MPI::COMM_WORLD.Get_rank();\n\tint const size = MPI::COMM_WORLD.Get_size();\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convex_hull;\n\tconvex_hull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tconvex_hull.push_back(points[0]);\n\tconvex_hull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (convex_hull.size() >= 2 &&\n\t\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t\t (convex_hull[convex_hull.size() - 1].y - points[i].y) -\n\t\t\t\t\t (convex_hull[convex_hull.size() - 1].x - points[i].x) *\n\t\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) <\n\t\t\t\t 0) {\n\t\t\tconvex_hull.pop_back();\n\t\t}\n\t\tconvex_hull.push_back(points[i]);\n\t}\n\tint const convex_hull_size = convex_hull.size();\n\twhile (convex_hull.size() >= 2 &&\n\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t (convex_hull[convex_hull.size() - 1].y - convex_hull[0].y) -\n\t\t\t\t (convex_hull[convex_hull.size() - 1].x - convex_hull[0].x) *\n\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) <\n\t\t\t 0) {\n\t\tconvex_hull.pop_back();\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convex_hull_size; ++i) {\n\t\tperimeter += distance(convex_hull[i], convex_hull[(i + 1) % convex_hull_size]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble global_perimeter;\n\tMPI::COMM_WORLD.Allreduce(&perimeter, &global_perimeter, 1, MPI::DOUBLE, MPI::SUM);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. 
A is an undirected graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &size) {\n if (visited(i) == 0) {\n size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n });\n }\n }, size);\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices.\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices.\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0.\n int largestComponentSize = 0;\n\n // Iterate over all vertices.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a depth-first search to find the size of its component.\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n\n // Update the largest component size if necessary.\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n // Return the largest component size.\n return largestComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int componentSize = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &componentSize) {\n if (visited[i] == 0) {\n componentSize++;\n visited[i] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[i][j] == 1 && visited[j] == 0) {\n visited[j] = 1;\n }\n }\n }\n });\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n return maxComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n depthFirstSearch(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a 
depth-first search to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n\n // Update the largest component size if necessary\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited(i) == 0) {\n component_size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n });\n }\n }, component_size);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, false);\n int largest = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int size = 0;\n std::stack s;\n s.push(i);\n while (!s.empty()) {\n size_t v = s.top();\n s.pop();\n if (!visited[v]) {\n visited[v] = true;\n size++;\n for (size_t j = 0; j < N; j++) {\n if (A(i, j) && !visited[j]) {\n s.push(j);\n }\n }\n }\n }\n largest = std::max(largest, size);\n }\n }\n return largest;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited[i] == 0) {\n component_size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n visited[j] = 1;\n }\n });\n }\n }, component_size);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n using namespace Kokkos;\n\n // Initialize the visited array to false for all vertices\n View visited(\"visited\", N);\n visited() = false;\n\n // Initialize the component size array to 0 for all vertices\n View componentSize(\"componentSize\", N);\n componentSize() = 0;\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component containing it\n if (!visited(i)) {\n int componentSize = 0;\n DFS(A, i, visited, componentSize);\n\n // Update the largest component size if necessary\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n return largestComponentSize;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n int max_component = 0;\n Kokkos::parallel_reduce(\n \"largestComponent\",\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_component) {\n // Perform a depth-first search to count the number of vertices in the\n // connected component containing vertex i.\n std::vector visited(N, false);\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n int v = stack.top();\n stack.pop();\n if (!visited[v]) {\n visited[v] = true;\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A(i, j) && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n },\n max_component);\n return 
max_component;\n}\n", + "\n\n // Initialize the visited array to false\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n\n // Initialize the component size array to 0\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize() = 0;\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited(i)) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n\n // Update the largest component size\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n return largestComponentSize;\n}\n", + "Kokok", + "\n\n // Create a vector to store the size of each component.\n Kokkos::View component_sizes(\"component_sizes\", N);\n\n // Initialize the component sizes to 1.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n component_sizes[i] = 1;\n });\n\n // Iterate over the adjacency matrix.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) {\n // If there is an edge between i and j, then they are in the same component.\n if (A(i, j) == 1) {\n // Find the smaller component size.\n int smaller_component_size = std::min(component_sizes[i], component_sizes[j]);\n\n // Update the component sizes.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int k) {\n if (component_sizes[k] == smaller_component_size) {\n component_sizes[k] = std::max(component_sizes[i], component_sizes[j]);\n }\n });\n }\n }\n });\n\n // Find the largest component size.\n int largest_component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &largest_component_size) {\n largest_component_size = std::max(largest_component_size, component_sizes[i]);\n });\n\n return largest_component_size;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component_size = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int &j, int &local_component_size) {\n if (A(i, j) == 1) {\n local_component_size += 1;\n }\n }, component_size);\n max_component_size = std::max(max_component_size, component_size);\n visited(i) = 1;\n }\n });\n return max_component_size;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t j, int &component_size) {\n if (visited(j) == 0 && A(i, j) == 1) {\n component_size++;\n visited(j) = 1;\n }\n });\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n int max_component = 0;\n Kokkos::parallel_reduce(\n \"Largest Component\",\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_component) {\n int component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n int current = q.front();\n q.pop();\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A(current, j) && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n },\n max_component);\n\n return max_component;\n}\n", + 
"\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited(i) == 0) {\n component_size += 1;\n visited(i) = 1;\n for (size_t j = 0; j < N; ++j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n }\n }\n });\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::TeamPolicy;\n using Kokkos::Work;\n\n // Initialize the visited array\n Kokkos::View visited(\"visited\", N, N);\n parallel_for(TeamPolicy(Kokkos::DefaultExecutionSpace(), N), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) { visited(i, j) = false; }\n });\n\n // Perform DFS on each unvisited node\n int max_component = 0;\n parallel_for(TeamPolicy(Kokkos::DefaultExecutionSpace(), N), KOKKOS_LAMBDA(const int i) {\n if (!visited(i, i)) {\n int component_size = 0;\n dfs(i, i, visited, A, component_size);\n max_component = std::max(max_component, component_size);\n }\n });\n\n return max_component;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::View componentSize(\"componentSize\", N);\n Kokkos::View componentId(\"componentId\", N);\n Kokkos::View queue(\"queue\", N);\n\n int maxComponentSize = 0;\n int componentCount = 0;\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (visited(i) == 0) {\n int start = 0;\n int end = 0;\n queue(end++) = i;\n visited(i) = 1;\n componentId(i) = componentCount;\n while (start != end) {\n int current = queue(start++);\n componentSize(componentId(current))++;\n for (int j = 0; j < N; j++) {\n if (A(current, j) == 1 && visited(j) == 0) {\n queue(end++) = j;\n visited(j) = 1;\n componentId(j) = componentCount;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize(componentCount));\n componentCount++;\n }\n });\n\n return maxComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n int max_component_size = 0;\n\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(Kokkos::View &A, size_t i, Kokkos::View &visited, int &component_size) {\n visited[i] = true;\n component_size++;\n\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(i, j) && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n\n // If the vertex has not been visited, perform a depth-first search to find the size of its component\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit in the stack\n while (!stack.empty()) {\n // Pop the next vertex from the stack\n size_t vertex = stack.top();\n stack.pop();\n\n // If the vertex has not been visited, mark it as visited and add it to the component\n if (!visited[vertex]) {\n visited[vertex] = true;\n componentSize++;\n\n // Push all of the vertex's neighbors onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the largest component size if the current component is larger\n if (componentSize > largestComponentSize) {\n largestComponentSize = componentSize;\n }\n\n // Update the component size array with the size of the current component\n componentSize[i] = componentSize;\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n int max_component_size = 0;\n int component_size = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n #pragma omp parallel for reduction(max:max_component_size)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n delete[] visited;\n return max_component_size;\n}\n", + "\n\n // Create a vector to store the visited nodes.\n std::vector visited(N, false);\n\n // Create a vector to store the component sizes.\n std::vector componentSizes(N, 0);\n\n // Create a parallel region.\n #pragma omp parallel\n {\n // Loop over the nodes.\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, perform a depth-first search to find all the nodes in its component.\n if (! visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n while (! stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (! 
visited[node]) {\n componentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[node * N + j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n componentSizes[i] = componentSize;\n }\n }\n }\n\n // Find the largest component size.\n int largestComponentSize = *std::max_element(componentSizes.begin(), componentSizes.end());\n\n // Return the largest component size.\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, 0);\n int largest_component_size = 0;\n #pragma omp parallel\n {\n int local_largest_component_size = 0;\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n ++component_size;\n visited[vertex] = 1;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n local_largest_component_size = std::max(local_largest_component_size, component_size);\n }\n }\n #pragma omp critical\n largest_component_size = std::max(largest_component_size, local_largest_component_size);\n }\n return largest_component_size;\n}\n", + "\n\n int* visited = new int[N];\n int* componentSize = new int[N];\n int maxComponentSize = 0;\n int componentCount = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = 0;\n dfs(A, i, visited, componentSize, size);\n componentCount++;\n maxComponentSize = std::max(maxComponentSize, size);\n }\n }\n delete[] visited;\n delete[] componentSize;\n return maxComponentSize;\n}\n", + "\n\n int max_size = 0;\n int max_component = -1;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = dfs(A, i, visited);\n if (size > max_size) {\n max_size = size;\n max_component = i;\n }\n }\n }\n delete[] visited;\n return max_size;\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n", + "\n\n int max_component = 0;\n int visited[N];\n memset(visited, 0, sizeof(visited));\n\n #pragma omp parallel for reduction(max:max_component)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n 
component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n\n return max_component;\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n", + "\n\n int N = A.size();\n std::vector component(N, -1);\n int maxComponent = 0;\n int currentComponent = 0;\n #pragma omp parallel for reduction(max:maxComponent) reduction(+:currentComponent)\n for (int i = 0; i < N; i++) {\n if (component[i] == -1) {\n currentComponent = 0;\n dfs(A, i, component, currentComponent);\n maxComponent = std::max(maxComponent, currentComponent);\n }\n }\n return maxComponent;\n}\n\n", + "\n\n int max_component_size = 0;\n int component_size = 0;\n int visited[N];\n memset(visited, 0, sizeof(visited));\n\n #pragma omp parallel for reduction(max: max_component_size)\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0) {\n component_size = 0;\n dfs(i, A, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(int i, std::vector const& A, int visited[], int& component_size) {\n visited[i] = 1;\n component_size++;\n for (int j = 0; j < A.size(); j++) {\n if (A[i][j] == 1 && visited[j] == 0) {\n dfs(j, A, visited, component_size);\n }\n }\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n int max_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n visited[i] = true;\n component_size++;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a 
depth-first search to find the size of the component containing the vertex\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // While the stack is not empty, pop a vertex from the stack and visit all its unvisited neighbors\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n componentSize++;\n\n // Iterate over all neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is unvisited and there is an edge between the vertex and the neighbor, push the neighbor onto the stack and mark it as visited\n if (!visited[j] && A[vertex][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Update the largest component size if the current component size is larger\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Create a vector to store the size of each component.\n std::vector componentSize(N, 0);\n\n // Create a vector to store the component ID for each vertex.\n std::vector componentID(N, -1);\n\n // Initialize the component ID for each vertex.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n componentID[i] = i;\n }\n\n // Iterate over the adjacency matrix.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n // If there is an edge between vertex i and vertex j, then they are in the same component.\n if (A[i][j] == 1) {\n // Find the component ID for vertex i.\n int iComponentID = componentID[i];\n\n // Find the component ID for vertex j.\n int jComponentID = componentID[j];\n\n // If the component IDs are different, then merge the two components.\n if (iComponentID != jComponentID) {\n // Find the smaller component ID.\n int smallerComponentID = std::min(iComponentID, jComponentID);\n\n // Find the larger component ID.\n int largerComponentID = std::max(iComponentID, jComponentID);\n\n // Update the component ID for each vertex in the smaller component.\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n if (componentID[k] == smallerComponentID) {\n componentID[k] = largerComponentID;\n }\n }\n\n // Update the size of the larger component.\n componentSize[largerComponentID] += componentSize[smallerComponentID];\n }\n }\n }\n }\n\n // Find the largest component.\n int largestComponentSize = 0;\n for (size_t i = 0; i < N; i++) {\n largestComponentSize = std::max(largestComponentSize, componentSize[i]);\n }\n\n // Return the size of the largest component.\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component containing the vertex\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // While there are vertices in the stack, pop a vertex and visit all its unvisited neighbors\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n componentSize++;\n\n // Iterate over all neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is unvisited and 
there is an edge between the vertex and the neighbor, push the neighbor onto the stack and mark it as visited\n if (!visited[j] && A[vertex][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Update the largest component size if the current component size is larger\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n int nComponents = 0;\n std::vector visited(N, 0);\n std::vector componentSize(N, 0);\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int size = dfs(A, i, visited, componentSize);\n nComponents++;\n if (size > componentSize[0]) {\n componentSize[0] = size;\n }\n }\n }\n return componentSize[0];\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "serial", + "prompt": "/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector parent(N);\n std::vector size(N);\n for (size_t i = 0; i < N; ++i) {\n parent[i] = i;\n size[i] = 1;\n }\n\n auto find = [&](int x) {\n if (parent[x] != x) {\n parent[x] = find(parent[x]);\n }\n return parent[x];\n };\n\n auto union_ = [&](int x, int y) {\n int rootX = find(x);\n int rootY = find(y);\n if (rootX != rootY) {\n if (size[rootX] < size[rootY]) {\n parent[rootX] = rootY;\n size[rootY] += size[rootX];\n } else {\n parent[rootY] = rootX;\n size[rootX] += size[rootY];\n }\n }\n };\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n if (A[i][j] == 1) {\n union_(i, j);\n }\n }\n }\n\n int maxSize = 0;\n for (size_t i = 0; i < N; ++i) {\n maxSize = std::max(maxSize, size[find(i)]);\n }\n\n return maxSize;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n std::vector parent(N);\n std::vector size(N);\n for (size_t i = 0; i < N; i++) {\n parent[i] = i;\n size[i] = 1;\n }\n\n auto find = [&](int x) {\n if (parent[x] != x) {\n parent[x] = find(parent[x]);\n }\n return parent[x];\n };\n\n auto union_ = [&](int a, int b) {\n int 
rootA = find(a);\n int rootB = find(b);\n if (rootA != rootB) {\n if (size[rootA] > size[rootB]) {\n parent[rootB] = rootA;\n size[rootA] += size[rootB];\n } else {\n parent[rootA] = rootB;\n size[rootB] += size[rootA];\n }\n }\n };\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i][j] == 1) {\n union_(i, j);\n }\n }\n }\n\n int maxSize = 0;\n for (size_t i = 0; i < N; i++) {\n maxSize = std::max(maxSize, size[find(i)]);\n }\n\n return maxSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int max_component_size = 0;\n\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n\n visited[i] = true;\n component_size++;\n\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; i++) {\n int size = 0;\n std::queue q;\n if (visited[i] == 0) {\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 0) {\n visited[v] = 1;\n size++;\n for (size_t j = 0; j < N; j++) {\n if (A[v][j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int max_component_size = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component containing it\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\n// Perform a DFS to find the size of the component containing vertex i\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n // Mark the vertex as visited\n visited[i] = true;\n component_size++;\n\n // Iterate over all adjacent vertices\n for (size_t j = 0; j < A.size(); j++) {\n // If there is an edge between vertex i and vertex j and vertex j has not been visited, perform a DFS on vertex j\n if (A[i][j] == 1 && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = dfs(i, A, visited);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n\nint dfs(size_t i, std::vector const& A, std::vector& visited) {\n if (visited[i] != 0) {\n return 0;\n }\n visited[i] = 1;\n int component_size = 1;\n for (size_t j = 0; j < A.size(); 
++j) {\n if (A[i][j] == 1) {\n component_size += dfs(j, A, visited);\n }\n }\n return component_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int largest = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int size = dfs(A, i, visited);\n largest = std::max(largest, size);\n }\n }\n\n return largest;\n}\n", + "\n\n // Create a vector to store the size of each component.\n std::vector componentSize(N, 0);\n\n // Create a vector to store the component ID for each vertex.\n std::vector componentID(N, -1);\n\n // Initialize the component ID for each vertex.\n for (size_t i = 0; i < N; ++i) {\n componentID[i] = i;\n }\n\n // Iterate over each vertex.\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, perform a DFS to find all vertices in the same component.\n if (componentID[i] == i) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit in the stack, perform a DFS.\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n\n // If the vertex has not been visited, mark it as visited and add it to the stack.\n if (componentID[vertex] == i) {\n componentSize++;\n componentID[vertex] = i;\n\n // Iterate over the neighbors of the vertex.\n for (size_t j = 0; j < N; ++j) {\n // If the neighbor is connected to the vertex and has not been visited, add it to the stack.\n if (A[i][j] == 1 && componentID[j] == i) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the size of the component.\n componentSize[i] = componentSize;\n }\n }\n\n // Return the size of the largest component.\n return *std::max_element(componentSize.begin(), componentSize.end());\n}\n", + "\n\n std::vector visited(N, 0);\n int largest = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = 0;\n dfs(A, visited, i, size);\n largest = std::max(largest, size);\n }\n }\n return largest;\n}\n\nvoid dfs(std::vector const& A, std::vector& visited, size_t i, int& size) {\n if (visited[i] == 1) {\n return;\n }\n visited[i] = 1;\n size++;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] == 1) {\n dfs(A, visited, j, size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n component_size++;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n component_size++;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n dfs(A, i, visited, size);\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited,\n int& size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i * A.size() + j] == 1) {\n dfs(A, j, visited, size);\n }\n }\n}\n", + "\n\n // Perform DFS on each vertex to find all connected vertices\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if 
(!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++component_size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) {\n continue;\n }\n visited[u] = 1;\n size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (int i = 0; i < N; i++) {\n int size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 1) continue;\n visited[v] = 1;\n size++;\n for (int j = 0; j < N; j++) {\n if (A[v][j] == 1 && visited[j] == 0) q.push(j);\n }\n }\n max_size = std::max(max_size, size);\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n if (visited[current] == 0) {\n visited[current] = 1;\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n ++size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++component_size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "hip", + "prompt": "/* Compute the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n Store the result in 
largestComponentSize.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void largestComponent(const int *A, size_t N, int *largestComponentSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n int count = 0;\n if (tid == 0) {\n count = 1;\n for (int i = 1; i < N; i++) {\n if (A[bid * N + i] == 1) {\n count++;\n }\n }\n }\n __syncthreads();\n if (tid == 0) {\n *largestComponentSize = count;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0758423044, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00513503217697 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint local_max = 0;\n\tint global_max = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[tid_y] == 0) {\n\t\tint count = 0;\n\t\tvisited[tid_y] = 1;\n\t\tcount++;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] != 0 && visited[i] == 0) {\n\t\t\t\tcount += dfs(A, N, i, visited);\n\t\t\t}\n\t\t}\n\t\tlocal_max = count;\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (local_max > global_max) {\n\t\t\t\tglobal_max = local_max;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = global_max;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tidy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tif (tidx >= N || tidy >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[tidx] == 0) {\n\t\tcount++;\n\t\tvisited[tidx] = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tidx * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcount += bfs(A, N, i, visited);\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tidx == 0 && tidy == 0) {\n\t\t*largestComponentSize = count;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tidx = blockIdx.x + blockIdx.y * blockDim.x;\n\tint tidx_x = tidx % N;\n\tint tidx_y = tidx / N;\n\tint componentSize = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tidx * N + i] == 1) {\n\t\t\tcomponentSize++;\n\t\t}\n\t}\n\tatomicAdd(largestComponentSize, componentSize);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int localComponentSize = 0;\n int globalComponentSize = 0;\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int start = tid;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (int i = start; i < N; i += stride) {\n if (A[i * N + i] == 0) {\n continue;\n }\n localComponentSize++;\n for (int j = i + 1; j < N; j++) {\n if (A[i * N + j] == 1) {\n localComponentSize++;\n }\n }\n }\n hipAtomicMax(&globalComponentSize, &localComponentSize);\n *largestComponentSize = globalComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tidy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tint size = 0;\n\tif (tidx < N && tidy < N) {\n\t\tif (A[tidx * N + tidy] == 1) {\n\t\t\tsize = 1;\n\t\t\tint stack[N];\n\t\t\tint top = 0;\n\t\t\tstack[top++] = tidx;\n\t\t\twhile (top > 0) {\n\t\t\t\tint v = stack[--top];\n\t\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\t\tif (A[v * N + i] == 1 && i != v) {\n\t\t\t\t\t\tif (i == tidy) {\n\t\t\t\t\t\t\tsize++;\n\t\t\t\t\t\t} else {\n\t\t\t\t\t\t\tstack[top++] = i;\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tint max = 0;\n\thipReduceBlockMax(&max, &size, 1);\n\tif (tidx == 0 && tidy == 0) {\n\t\t*largestComponentSize = max;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint blockId = hipBlockIdx_x;\n\tint numBlocks = hipGridDim_x;\n\tint numThreads = hipBlockDim_x * hipGridDim_x;\n\tint visited[N];\n\tint componentSize[N];\n\tint maxComponentSize = 0;\n\tint maxComponentId = -1;\n\tint componentId = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t\tcomponentSize[i] = 0;\n\t}\n\twhile (componentId < N) {\n\t\tint startVertex = -1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (!visited[i]) {\n\t\t\t\tstartVertex = i;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tif (startVertex == -1) {\n\t\t\tbreak;\n\t\t}\n\t\tint queue[N];\n\t\tint head = 0;\n\t\tint tail = 0;\n\t\tqueue[tail++] = startVertex;\n\t\tvisited[startVertex] = 1;\n\t\tcomponentSize[componentId] = 1;\n\t\twhile (head != tail) {\n\t\t\tint vertex = queue[head++];\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tif (A[vertex * N + i] && !visited[i]) {\n\t\t\t\t\tqueue[tail++] = i;\n\t\t\t\t\tvisited[i] = 1;\n\t\t\t\t\tcomponentSize[componentId]++;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tif (componentSize[componentId] > maxComponentSize) {\n\t\t\tmaxComponentSize = componentSize[componentId];\n\t\t\tmaxComponentId = componentId;\n\t\t}\n\t\tcomponentId++;\n\t}\n\t*largestComponentSize = maxComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int local_max = 0;\n int global_max = 0;\n int i = blockIdx.x * blockDim.x + 
threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (A[i * N + j] == 1) {\n local_max = 1;\n for (int k = 0; k < N; k++) {\n if (A[i * N + k] == 1) {\n local_max++;\n }\n if (A[k * N + j] == 1) {\n local_max++;\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx.x == 0 && threadIdx.y == 0) {\n for (int k = 0; k < N * N; k++) {\n if (local_max > global_max) {\n global_max = local_max;\n }\n local_max = 0;\n }\n *largestComponentSize = global_max;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0827385024, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.68166037597656 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint numThreads = hipGridDim_x * hipBlockDim_x;\n\tint visited[N];\n\tint componentSize = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[tid] == 0) {\n\t\tvisited[tid] = 1;\n\t\tcomponentSize = 1;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tif (A[tid * N + j] == 1 && visited[j] == 0) {\n\t\t\t\tcomponentSize += largestComponent(A, N, &visited[j]);\n\t\t\t}\n\t\t}\n\t}\n\tint maxComponentSize = 0;\n\thipMemcpy(&maxComponentSize, largestComponentSize, sizeof(int), hipMemcpyDeviceToHost);\n\tif (componentSize > maxComponentSize) {\n\t\t*largestComponentSize = componentSize;\n\t}\n\treturn;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int *visited = (int *)malloc(N * N * sizeof(int));\n int *componentSize = (int *)malloc(N * N * sizeof(int));\n int maxComponentSize = 0;\n int componentId = 0;\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n visited[i * N + j] = 0;\n componentSize[i * N + j] = 0;\n }\n }\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (visited[i * N + j] == 0 && A[i * N + j] == 1) {\n componentId++;\n int currentComponentSize = 0;\n __syncthreads();\n dfs(A, i, j, N, visited, componentSize, ¤tComponentSize, componentId);\n __syncthreads();\n maxComponentSize = max(maxComponentSize, currentComponentSize);\n }\n }\n }\n\n *largestComponentSize = maxComponentSize;\n free(visited);\n free(componentSize);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = max(*largestComponentSize, count);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0772127508, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00613275556564 + } + ] + }, + { + "generated_output": "\n\n int myRow = blockIdx.x * blockDim.x + threadIdx.x;\n int myCol = 
blockIdx.y * blockDim.y + threadIdx.y;\n int myComponentSize = 0;\n if (myRow < N && myCol < N) {\n if (A[myRow * N + myCol] == 1) {\n myComponentSize = 1;\n for (int i = 0; i < N; i++) {\n if (A[myRow * N + i] == 1) {\n myComponentSize++;\n }\n if (A[i * N + myCol] == 1) {\n myComponentSize++;\n }\n }\n }\n }\n atomicMax(largestComponentSize, myComponentSize);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0794138504, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.80325086669922 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == 0) {\n *largestComponentSize = 0;\n }\n __shared__ int shared_component_size;\n shared_component_size = 0;\n bool visited = false;\n int component_size = 0;\n while (!visited) {\n visited = true;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (A[bid * N + i] == 1) {\n visited = false;\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n A[i * N + j] = 0;\n A[j * N + i] = 0;\n }\n }\n }\n }\n __syncthreads();\n }\n atomicAdd(largestComponentSize, component_size);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i, j, k, count = 0;\n\tif (tid < N) {\n\t\tbool visited[N] = {false};\n\t\tvisited[tid] = true;\n\t\tfor (i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1 && !visited[i]) {\n\t\t\t\tcount++;\n\t\t\t\tfor (j = 0; j < N; j++) {\n\t\t\t\t\tif (A[i * N + j] == 1) {\n\t\t\t\t\t\tfor (k = 0; k < N; k++) {\n\t\t\t\t\t\t\tif (A[j * N + k] == 1 && !visited[k]) {\n\t\t\t\t\t\t\t\tcount++;\n\t\t\t\t\t\t\t\tvisited[k] = true;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == 0) {\n *largestComponentSize = 0;\n }\n __syncthreads();\n int localComponentSize = 0;\n for (int i = 0; i < N; i++) {\n if (A[tid + i * N] == 1) {\n localComponentSize++;\n }\n }\n __syncthreads();\n if (tid == 0) {\n if (localComponentSize > *largestComponentSize) {\n *largestComponentSize = localComponentSize;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0798139332, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.24253557434082 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i 
* N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0785656348, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00615579462051 + } + ] + }, + { + "generated_output": "\n\n int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int size = 0;\n if (tid < N && tid_y < N) {\n if (A[tid * N + tid_y] == 1) {\n size = 1;\n int *visited = new int[N];\n for (int i = 0; i < N; i++) {\n visited[i] = 0;\n }\n visited[tid] = 1;\n int *stack = new int[N];\n stack[0] = tid;\n while (stack[0] != -1) {\n int v = stack[0];\n stack[0] = -1;\n for (int i = 0; i < N; i++) {\n if (A[v * N + i] == 1 && visited[i] == 0) {\n visited[i] = 1;\n size++;\n stack[0] = i;\n }\n }\n }\n delete[] visited;\n delete[] stack;\n }\n }\n atomicAdd(largestComponentSize, size);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint myComponentSize = 0;\n\tint myId = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (A[N * myId + myId] == 1) {\n\t\tmyComponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[N * myId + i] == 1 && i != myId) {\n\t\t\t\tmyComponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(&*largestComponentSize, myComponentSize);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0795381516, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.01222504081726 + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_x;\n\tint componentSize = 0;\n\tif (tid == 0) {\n\t\tcomponentSize = 1;\n\t\tlargestComponentSize[bid] = 0;\n\t}\n\t__syncthreads();\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tif (A[bid * N + i] == 1) {\n\t\t\tatomicAdd(&componentSize, 1);\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tif (componentSize > largestComponentSize[bid]) {\n\t\t\tlargestComponentSize[bid] = componentSize;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0749058542, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.34332628173828 + } + ] + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency 
matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Perform a breadth-first search (BFS) from each vertex to find the connected components.\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the max_component_size across all ranks to find the global maximum.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "_t.", + "\n\n // Perform a breadth-first search from each vertex to find the size of each component.\n std::vector visited(N, false);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++componentSize;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Allreduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n std::queue q;\n int componentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n q.push(i);\n visited[i] = 1;\n componentSize = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize++;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest one\n int globalComponentSize;\n MPI_Reduce(&componentSize, &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) 
{\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(my_rank);\n visited[my_rank] = true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n local_component_size++;\n for (size_t v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return global_component_size;\n }\n return -1;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector component_size(num_ranks, 0);\n\n // Perform a DFS from each rank\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int local_component_size = 0;\n std::stack s;\n s.push(i);\n visited[i] = true;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n local_component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n visited[j] = true;\n }\n }\n }\n component_size[my_rank] = std::max(component_size[my_rank], local_component_size);\n }\n }\n\n // Reduce the component size array to find the largest component\n int global_component_size = 0;\n MPI_Reduce(&component_size[0], &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_max = 0;\n int global_max = 0;\n\n // Perform a DFS on the graph to find the size of the largest component\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n local_max = std::max(local_max, component_size);\n }\n }\n\n // Reduce the local maximum values to find the global maximum\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n std::queue q;\n 
q.push(i);\n visited[i] = true;\n my_component_size = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n ++my_component_size;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&my_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int max_component = 0;\n int my_component = 0;\n int* visited = new int[N];\n for (size_t i = 0; i < N; i++) {\n visited[i] = 0;\n }\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n my_component = 0;\n dfs(i, A, N, visited, my_component);\n max_component = std::max(max_component, my_component);\n }\n }\n delete[] visited;\n int global_max;\n MPI_Allreduce(&max_component, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n // Perform a breadth-first search from each vertex to find the connected components.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 0) {\n visited[v] = 1;\n componentSize++;\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Allreduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(rank);\n visited[rank] = true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n local_component_size++;\n for (int v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of vertices in its component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n local_component_size = std::max(local_component_size, dfs(A, i, visited));\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n\n// Depth-first search to compute the number of vertices in a component\nint dfs(std::vector const& A, size_t i, std::vector& visited) {\n if (visited[i]) {\n return 0;\n }\n\n 
visited[i] = true;\n int component_size = 1;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n component_size += dfs(A, j, visited);\n }\n }\n\n return component_size;\n}\n", + "_t.", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n int local_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_component_size = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n ++local_component_size;\n }\n }\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n int local_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_component_size = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n local_component_size++;\n }\n }\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its component\n std::vector visited(N, 0);\n std::queue q;\n int componentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n q.push(i);\n visited[i] = 1;\n componentSize = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize++;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest one\n int globalComponentSize;\n MPI_Reduce(&componentSize, &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, false);\n int largest_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t v = stack.top();\n stack.pop();\n if (!visited[v]) {\n ++component_size;\n visited[v] = true;\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n largest_component_size = std::max(largest_component_size, component_size);\n }\n }\n\n // Reduce the largest component size across all ranks.\n int global_largest_component_size;\n MPI_Allreduce(&largest_component_size, &global_largest_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the largest component size 
on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the largest component size\n int largestComponentSize = 0;\n\n // Loop over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "cuda", + "prompt": "/* Compute the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n Store the result in largestComponentSize.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void largestComponent(const int *A, size_t N, int *largestComponentSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myComponentSize = 0;\n int myId = blockIdx.x * blockDim.x + threadIdx.x;\n if (A[myId * N + myId] == 0) {\n *largestComponentSize = 0;\n return;\n }\n int *visited = new int[N];\n for (int i = 0; i < N; i++) { visited[i] = 0; }\n visited[myId] = 1;\n myComponentSize++;\n for (int i = 0; i < N; i++) {\n if (A[myId * N + i] == 1 && visited[i] == 0) {\n myComponentSize += dfs(A, N, visited, i);\n }\n }\n *largestComponentSize = max(*largestComponentSize, myComponentSize);\n}\n", + "\n\n int localComponentSize = 0;\n int globalComponentSize = 0;\n int threadIdx = threadIdx.x + blockIdx.x * blockDim.x;\n int threadIdy = threadIdx.y + blockIdx.y * blockDim.y;\n if (threadIdx < N && threadIdy < N) {\n if (A[threadIdx * N + threadIdy] == 1) {\n localComponentSize++;\n for (int i = 0; i < N; i++) {\n if (A[threadIdx * N + i] == 1) {\n localComponentSize++;\n }\n if (A[i * N + threadIdy] == 1) {\n localComponentSize++;\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx == 0 && threadIdy == 0) {\n globalComponentSize = localComponentSize;\n }\n for (int i = 1; i < N; i++) {\n if (localComponentSize > globalComponentSize) {\n globalComponentSize = localComponentSize;\n }\n localComponentSize = 0;\n if (threadIdx < N && threadIdy + i < N) {\n if (A[threadIdx * N + threadIdy + i] == 1) {\n localComponentSize++;\n for (int j = 0; j < N; j++) {\n if (A[threadIdx * N + j] == 1) {\n localComponentSize++;\n }\n if (A[(threadIdy + i) * N + j] == 1) {\n localComponentSize++;\n }\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx == 0 && threadIdy == 0) {\n if (localComponentSize > globalComponentSize) {\n globalComponentSize = localComponentSize;\n }\n *largestComponentSize = globalComponentSize;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int tid_row = tid / N;\n int tid_col = tid % N;\n int local_max = 0;\n int global_max = 0;\n\n // Count the number of adjacent vertices\n for (int i = 0; i < N; i++) {\n if (A[tid_row * N + i] == 1) {\n local_max++;\n }\n }\n\n // Find the maximum number of adjacent vertices across all threads\n __syncthreads();\n for (int i = 0; i < N; 
i++) {\n if (local_max > global_max) {\n global_max = local_max;\n }\n }\n\n *largestComponentSize = global_max;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tint count = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n // Each thread computes the size of its component\n int myComponentSize = 0;\n for (int i = 0; i < N; i++) {\n if (A[i * N + blockIdx.x] == 1) {\n myComponentSize++;\n }\n }\n\n // Reduce the component sizes across all threads\n __shared__ int sharedComponentSize[32];\n sharedComponentSize[threadIdx.x] = myComponentSize;\n __syncthreads();\n for (int i = 1; i < 32; i *= 2) {\n if (threadIdx.x % (2 * i) == 0) {\n sharedComponentSize[threadIdx.x] += sharedComponentSize[threadIdx.x + i];\n }\n __syncthreads();\n }\n\n // Store the largest component size in the output variable\n if (threadIdx.x == 0) {\n *largestComponentSize = sharedComponentSize[0];\n }\n}\n", + "\n\n\t// Initialize the largest component size to 0\n\t*largestComponentSize = 0;\n\n\t// Iterate over each vertex\n\tfor (int i = 0; i < N; i++) {\n\t\t// If the vertex is not visited, perform a depth-first search to find the size of the component\n\t\tif (A[i * N + i] == 0) {\n\t\t\tint componentSize = 0;\n\t\t\tdepthFirstSearch(A, N, i, &componentSize);\n\n\t\t\t// Update the largest component size if necessary\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\n\tint count = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcount = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*largestComponentSize = max(*largestComponentSize, count);\n\t}\n}\n", + "\n\n // Each thread checks if it is in the largest component.\n int myComponentSize = 0;\n for (int i = blockIdx.x; i < N; i += blockDim.x) {\n for (int j = blockIdx.y; j < N; j += blockDim.y) {\n if (A[i * N + j] == 1) {\n myComponentSize++;\n }\n }\n }\n\n // Each thread writes its component size to shared memory.\n __shared__ int sharedComponentSize[blockDim.x * blockDim.y];\n sharedComponentSize[threadIdx.x * blockDim.y + threadIdx.y] = myComponentSize;\n\n // Each thread block finds the maximum component size in shared memory.\n __syncthreads();\n int maxComponentSize = 0;\n for (int i = 0; i < blockDim.x * blockDim.y; i++) {\n maxComponentSize = max(maxComponentSize, sharedComponentSize[i]);\n }\n\n // The thread with the maximum component size writes it to global memory.\n if (threadIdx.x == 0 && threadIdx.y == 0) {\n *largestComponentSize = maxComponentSize;\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 0) {\n\t\treturn;\n\t}\n\tint componentSize = 0;\n\tint visited[N];\n\tfor (int k = 0; k < N; k++) {\n\t\tvisited[k] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[i] == 0) {\n\t\tcomponentSize = dfs(A, N, i, visited);\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tif (componentSize > *largestComponentSize) {\n\t\t\t*largestComponentSize = componentSize;\n\t\t}\n\t}\n}\n", + "\n\n int myComponentSize = 0;\n int myComponentId = threadIdx.x + 
blockIdx.x * blockDim.x;\n if (myComponentId < N) {\n bool visited[N];\n for (int i = 0; i < N; i++) { visited[i] = false; }\n visited[myComponentId] = true;\n for (int i = 0; i < N; i++) {\n if (A[myComponentId * N + i] && !visited[i]) {\n myComponentSize += 1;\n dfs(A, N, visited, i);\n }\n }\n }\n __syncthreads();\n int maxComponentSize = 0;\n for (int i = 0; i < N; i++) {\n maxComponentSize = max(maxComponentSize, myComponentSize);\n }\n *largestComponentSize = maxComponentSize;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\n\tint row = tid / N;\n\tint col = tid % N;\n\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\n\tint componentSize = 1;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\n\tvisited[row] = 1;\n\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] == 1 && visited[i] == 0) {\n\t\t\tcomponentSize += dfs(A, N, i, visited);\n\t\t}\n\t}\n\n\tif (componentSize > *largestComponentSize) {\n\t\t*largestComponentSize = componentSize;\n\t}\n}\n", + "\n\n\tint myRow = blockIdx.x * blockDim.x + threadIdx.x;\n\tint myCol = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (myRow < N && myCol < N) {\n\t\tif (A[myRow * N + myCol] == 1) {\n\t\t\tint componentSize = 0;\n\t\t\tint visited[N];\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tvisited[i] = 0;\n\t\t\t}\n\t\t\tdfs(myRow, myCol, A, N, visited, &componentSize);\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint componentSize = 0;\n\tint maxComponentSize = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tfor (int i = 0; i < N; i++) {\n\t\tif (visited[i] == 0) {\n\t\t\tcomponentSize = 0;\n\t\t\tdfs(A, i, visited, &componentSize);\n\t\t\tmaxComponentSize = max(maxComponentSize, componentSize);\n\t\t}\n\t}\n\t*largestComponentSize = maxComponentSize;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid_row == tid_col) {\n\t\tint componentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_row * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = max(*largestComponentSize, componentSize);\n\t}\n}\n", + "\n\n\tint myComponentSize = 0;\n\tint myId = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (A[N * myId + myId] == 1) {\n\t\tmyComponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[N * myId + i] == 1 && i != myId) {\n\t\t\t\tmyComponentSize += 1;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, myComponentSize);\n}\n", + "\n\n\tint threadIdx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint threadIdy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (threadIdx < N && threadIdy < N) {\n\t\tif (A[threadIdx * N + threadIdy] == 1) {\n\t\t\t// Found a 1 in the adjacency matrix.\n\t\t\t// Perform a breadth-first search to find the size of the component.\n\t\t\tint componentSize = 0;\n\t\t\tstd::queue q;\n\t\t\tq.push(threadIdx);\n\t\t\twhile (!q.empty()) {\n\t\t\t\tint v = q.front();\n\t\t\t\tq.pop();\n\t\t\t\tcomponentSize++;\n\t\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\t\tif (A[v * N + i] == 1 && i != v) {\n\t\t\t\t\t\tq.push(i);\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t\t// Update the largest component size if necessary.\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", 
+ "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, componentSize);\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, componentSize);\n}\n", + "\n\n\tint localLargestComponentSize = 0;\n\tint globalLargestComponentSize = 0;\n\tint myRow = blockIdx.y * blockDim.y + threadIdx.y;\n\tint myCol = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (myRow < N && myCol < N) {\n\t\tif (A[myRow * N + myCol] == 1) {\n\t\t\tlocalLargestComponentSize++;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (threadIdx.x == 0 && threadIdx.y == 0) {\n\t\tfor (int i = 1; i < gridDim.x * gridDim.y; i++) {\n\t\t\tif (localLargestComponentSize < globalLargestComponentSize) {\n\t\t\t\tlocalLargestComponentSize = globalLargestComponentSize;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = localLargestComponentSize;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\t*largestComponentSize = max(*largestComponentSize, componentSize);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce to find the largest component size\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform DFS on each rank\n std::vector visited(N, false);\n int max_component_size = 0;\n #pragma omp parallel\n {\n int local_max_component_size = 0;\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n std::stack s;\n s.push(i);\n int component_size = 0;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n if (!visited[u]) {\n visited[u] = true;\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n }\n }\n }\n }\n local_max_component_size = std::max(local_max_component_size, component_size);\n }\n }\n #pragma omp critical\n {\n max_component_size = std::max(max_component_size, local_max_component_size);\n }\n }\n\n // Gather the maximum component size from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int recv_max_component_size;\n MPI_Recv(&recv_max_component_size, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n max_component_size = std::max(max_component_size, recv_max_component_size);\n }\n } else {\n MPI_Send(&max_component_size, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n\n return max_component_size;\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int largest_component_size;\n MPI_Reduce(&my_component_size, &largest_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find the connected 
components.\n std::vector visited(N, 0);\n std::vector component_size(N, 0);\n int max_component_size = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n int component_size = 0;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n component_size[i] = component_size;\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n MPI_Allreduce(&max_component_size, &max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int largest_component_size;\n MPI_Reduce(&my_component_size, &largest_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the size of its own component\n int local_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_size = std::max(local_size, size);\n }\n }\n\n // Reduce the local sizes to find the global maximum\n int global_size;\n MPI_Reduce(&local_size, &global_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return 
global_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n // Perform a breadth-first search from each vertex to find all the vertices in the same component.\n int max_component_size = 0;\n #pragma omp parallel for reduction(max : max_component_size)\n for (size_t i = 0; i < N; ++i) {\n if (A[i][i] == 0) {\n continue;\n }\n\n // Initialize the queue with the current vertex.\n std::queue queue;\n queue.push(i);\n\n // Mark the current vertex as visited.\n A[i][i] = 0;\n\n // Perform a breadth-first search to find all the vertices in the same component.\n int component_size = 0;\n while (!queue.empty()) {\n size_t current_vertex = queue.front();\n queue.pop();\n component_size++;\n\n // Add all the unvisited neighbors of the current vertex to the queue.\n for (size_t j = 0; j < N; ++j) {\n if (A[current_vertex][j] == 1 && A[j][j] != 0) {\n queue.push(j);\n A[j][j] = 0;\n }\n }\n }\n\n max_component_size = std::max(max_component_size, component_size);\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n componentSize++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Reduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (rank == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(size, 0);\n\n // Perform a DFS on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int localComponentSize = 0;\n std::stack s;\n s.push(i);\n visited[i] = true;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n localComponentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n visited[j] = true;\n }\n }\n }\n componentSize[rank] = std::max(componentSize[rank], localComponentSize);\n }\n }\n\n // Reduce the component size array to find the largest component\n int globalComponentSize;\n MPI_Reduce(&componentSize[0], &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform a breadth-first search (BFS) from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n componentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Reduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (rank == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n // Perform a breadth-first search to find the largest component.\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the max_component_size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the result on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the number of vertices in its largest component\n int local_max = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int count = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n count++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_max = std::max(local_max, count);\n }\n }\n\n // Reduce the local maxima to find the global maximum\n int global_max;\n MPI_Reduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the global maximum on rank 0\n if (my_rank == 0) {\n return global_max;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size\n int largestComponentSize = 0;\n\n // Loop over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n\n // Update the largest component size\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Reduce the largest component size to rank 0\n int globalLargestComponentSize;\n MPI_Reduce(&largestComponentSize, &globalLargestComponentSize, 1, MPI_INT, MPI_MAX, 0, 
MPI_COMM_WORLD);\n\n // Return the largest component size\n return globalLargestComponentSize;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(N, 0);\n\n // Initialize the maximum component size\n int maxComponentSize = 0;\n\n // Loop over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited\n if (!visited[i]) {\n // Perform a depth-first search to find the size of the component containing vertex i\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n componentSize++;\n // Push all adjacent vertices onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the maximum component size\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Return the maximum component size\n return maxComponentSize;\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int max_component_size = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int global_component_size;\n MPI_Reduce(&my_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n std::vector componentSize(N, 0);\n int largest = 0;\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n componentSize[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N 
+ j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize[i]++;\n }\n }\n }\n largest = std::max(largest, componentSize[i]);\n }\n }\n\n int globalLargest = 0;\n MPI_Reduce(&largest, &globalLargest, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalLargest;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector component_size(num_ranks, 0);\n\n // Perform a parallel depth-first search to find the largest component\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int local_component_size = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n ++local_component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex * N + j] && !visited[j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n component_size[my_rank] = std::max(component_size[my_rank], local_component_size);\n }\n }\n\n // Reduce the component size array to find the largest component\n int global_component_size = 0;\n MPI_Reduce(component_size.data(), &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n\t Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n if (visited[i] == 0) {\n visited[i] = 1;\n Kokkos::View stack(\"stack\", N);\n size_t stack_size = 0;\n stack[stack_size++] = i;\n while (stack_size > 0) {\n size_t j = stack[--stack_size];\n for (size_t k = 0; k < N; k++) {\n if (A(j, k) == 1 && visited[k] == 0) {\n visited[k] = 1;\n stack[stack_size++] = k;\n }\n }\n }\n }\n });\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n count += (visited[i] == 1);\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n componentCount(A, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n dfs(A, visited, j, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::atomic_fetch_add(&visited(j), 1);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t k) {\n if (A(j, k) == 1 && visited(k) == 0) {\n Kokkos::atomic_fetch_add(&visited(k), 1);\n }\n });\n }\n });\n }\n }\n return count;\n}\n", + "\n\n KokConfigService::getSingleton().setNumThreads(N);\n KokConfigService::getSingleton().setParallelMode(KokConfigService::ParallelMode::OMP);\n KokConfigService::getSingleton().setOmpNumThreads(N);\n\n Kok::View counts(N);\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) { counts[i] = i; });\n\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) {\n for (size_t j = 0; j < N; j++) {\n if (A(i, j) == 1 && i != j) { counts[j] = counts[i]; }\n }\n });\n\n Kok::View unique_counts(N);\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) { unique_counts[i] = counts[i]; });\n Kok::sort(unique_counts);\n int num_components = 0;\n for (size_t i = 0; i < N; i++) {\n if (i == 0 || unique_counts[i] != unique_counts[i - 1]) { num_components++; }\n }\n return num_components;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n count++;\n dfs(A, i, visited);\n }\n }\n\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n 
visit(A, i, visited);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited[k] == 0) { visited[k] = 1; }\n });\n visited[j] = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n ++count;\n dfs(A, visited, i, count);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View labels(\"labels\", N);\n int num_components = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (labels(i) == 0) {\n int new_label = ++num_components;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &j) {\n if (A(i, j) == 1 && labels(j) == 0) {\n labels(j) = new_label;\n }\n });\n }\n });\n return num_components;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n dfs(A, visited, j, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n dfs(A, visited, i, N);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited[k] == 0) { visited[k] = 1; }\n });\n visited[j] = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::atomic_fetch_add(&visited(j), 1);\n componentCount(A, j);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) 
{\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n componentCount(A, N, j);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1) visited(k) = 1;\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int nComponents = 0;\n std::vector visited(N, 0);\n\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n nComponents++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t curr = q.front();\n q.pop();\n if (visited[curr] == 0) {\n visited[curr] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[curr][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return nComponents;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) continue;\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n count++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n bool visited = false;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n if (!visited) {\n count++;\n visited = true;\n }\n break;\n }\n }\n }\n return count;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t curr = q.front();\n q.pop();\n if (visited[curr] == 0) {\n visited[curr] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[curr][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false for all nodes\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over 
all nodes\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all nodes in the component\n dfs(A, i, visited);\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int* visited = new int[N];\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n delete[] visited;\n return count;\n}\n", + "\n\n int nComponents = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n nComponents++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n delete[] visited;\n return nComponents;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n count++;\n }\n }\n return count;\n}\n", + "\n\n int num_components = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n visited[current] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[current * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n delete[] visited;\n return num_components;\n}\n", + "\n\n int num_components = 0;\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n bool is_new_component = true;\n for (size_t j = 0; j < i; j++) {\n if (A[i * N + j] == 1) {\n is_new_component = false;\n break;\n }\n }\n if (is_new_component) { num_components++; }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) { q.push(j); }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all the vertices in the 
component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t v = stack.top();\n stack.pop();\n if (!visited[v]) {\n visited[v] = true;\n // Push all the unvisited neighbors of v onto the stack\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n num_components++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n num_components++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false for all nodes\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all nodes\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over all neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the number of connected components\n return componentCount;\n}\n", + "_", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n#pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t cur = q.front();\n q.pop();\n if (visited[cur] == 0) {\n visited[cur] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[cur][j] == 1) { q.push(j); }\n }\n }\n }\n }\n }\n delete[] visited;\n return num_components;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, false);\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n dfs(A, visited, i);\n count++;\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": 
"graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "serial", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(i, A, visited);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(size_t i, std::vector const& A, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(j, A, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, visited, i, N);\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, visited, i);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, std::vector& visited, size_t i) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, visited, j);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + ":", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 
0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(i, A, visited);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(size_t i, std::vector const& A, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(j, A, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, visited, i);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n dfs(A, i, visited);\n count++;\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "hip", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void componentCount(const int *A, size_t N, int *numComponents) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tif (count == 0) {\n\t\tcount = 1;\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.079436085, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0061179110527 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0765729902, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00095279679894 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n\tint tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\tif (tid >= N || tid_y >= N) {\n\t\treturn;\n\t}\n\tint component_id = tid;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] 
== 1) {\n\t\t\tcomponent_id = min(component_id, i);\n\t\t}\n\t}\n\tatomicAdd(&numComponents[component_id], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0771943954, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 1.04999035644531 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tint root = tid;\n\twhile (A[root * N + bid] != root) {\n\t\troot = A[root * N + bid];\n\t}\n\tif (tid == 0) {\n\t\tA[bid * N + bid] = root;\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0767606496, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00657877340317 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint numThreads = hipGridDim_x * hipBlockDim_x;\n\tint label = tid;\n\tint numLabels = numThreads;\n\tbool changed = true;\n\twhile (changed) {\n\t\tchanged = false;\n\t\tfor (int i = tid; i < N; i += numThreads) {\n\t\t\tfor (int j = 0; j < N; j++) {\n\t\t\t\tif (A[i * N + j] && label > j) {\n\t\t\t\t\tlabel = j;\n\t\t\t\t\tchanged = true;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t\tif (tid == 0) {\n\t\t\tif (label < numLabels) {\n\t\t\t\tnumLabels = label;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n\t*numComponents = numLabels;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0794034514, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 1.40326540527344 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] != 0) {\n count++;\n for (int j = i + 1; j < N; j++) {\n if (A[tid * N + j] != 0) {\n count++;\n }\n }\n break;\n }\n }\n atomicAdd(numComponents, count);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0777021782, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.89140794677734 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint component = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcomponent = 1;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tatomicAdd(numComponents, component);\n\t}\n}\n", + "source_write_success": true, + 
"did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0769585606, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00035545379519 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\tif (A[i * N + j] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0768043386, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00659000711441 + } + ] + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N || tid_y >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tbool visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = false;\n\t}\n\tvisited[tid] = true;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1 && !visited[i]) {\n\t\t\tcount++;\n\t\t\t__syncblocks();\n\t\t\tDFS(A, i, visited, N);\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0768217568, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.14889228820801 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.079121363, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00659668998718 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0757125026, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00035890999436 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx.x + hipBlockIdx.x * hipBlockDim.x;\n\tint bid = hipBlockIdx.y + hipBlockIdx.z * hipGridDim.z;\n\tint i = tid + bid * hipBlockDim.x * hipGridDim.x;\n\tint count = 0;\n\twhile (i < N * N) {\n\t\tif (A[i] == 1) 
{\n\t\t\tcount++;\n\t\t}\n\t\ti += N * N;\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint id = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (id >= N) return;\n\tint count = 0;\n\tbool visited[N];\n\tmemset(visited, 0, sizeof(visited));\n\tfor (int i = 0; i < N; i++) {\n\t if (visited[i]) continue;\n\t visited[i] = true;\n\t count++;\n\t for (int j = 0; j < N; j++) {\n\t if (A[i * N + j] == 0) continue;\n\t visited[j] = true;\n\t }\n\t}\n\tnumComponents[id] = count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tint count = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.076933827, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 1.05980915527344 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.076594546, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00035801400244 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0767185722, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00036425360441 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tint n = hipBlockDim_x * hipGridDim_x;\n\tint count = 0;\n\tif (tid == 0) {\n\t\tcount = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*numComponents = count;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0827557046, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00516623125076 + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = 
tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.076392283, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00035545379817 + } + ] + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The component count is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n int global_count = 0;\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n local_count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM);\n if (rank == 0) {\n return global_count;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its own subset of rows.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N];\n std::fill(visited, visited + N, false);\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && !visited[j]) {\n ++localCount;\n std::queue q;\n q.push(j);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n for (size_t v = 0; v < N; ++v) {\n if (A[u * N + v] && !visited[v]) {\n q.push(v);\n }\n }\n }\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int component_count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n component_count++;\n }\n }\n\n int global_component_count;\n MPI_Reduce(&component_count, &global_component_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_component_count;\n }\n return 0;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all vertices\n 
for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit\n while (!stack.empty()) {\n // Pop the top vertex from the stack\n size_t vertex = stack.top();\n stack.pop();\n\n // Mark the vertex as visited\n visited[vertex] = true;\n\n // Iterate over all adjacent vertices\n for (size_t j = 0; j < N; j++) {\n // If the adjacent vertex is connected and has not been visited, push it onto the stack\n if (A[vertex][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n count++;\n std::queue q;\n q.push(i);\n\n // Perform a breadth-first search to visit all nodes in the component\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n\n // Iterate over the neighbors of u\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all the unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of connected components in its portion of the graph.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n localCount++;\n break;\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n count++;\n while (!q.empty()) {\n int cur = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[cur * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) return global_count;\n return -1;\n}\n", + "\n\n int n_components = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n n_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n\n int global_n_components;\n MPI_Allreduce(&n_components, &global_n_components, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_n_components;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n\n int globalCount;\n MPI_Reduce(&count, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n 
stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then push the neighbor onto the stack\n if (!visited[j] && A[i][j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Count the number of connected components\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) continue;\n visited[u] = true;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its local rows\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited = false;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] && !visited) {\n localCount++;\n visited = true;\n }\n }\n }\n\n // Reduce the local counts to get the global count\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to 
visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t u = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[u] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then push the neighbor onto the stack\n if (!visited[j] && A[i][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n // Iterate over the neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its local part of the graph\n int local_count = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N] = {false};\n for (size_t j = 0; j < N; j++) {\n if (!visited[j] && A[i][j] == 1) {\n // Found a new component\n local_count++;\n std::queue q;\n q.push(j);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n for (size_t v = 0; v < N; v++) {\n if (!visited[v] && A[u][v] == 1) {\n q.push(v);\n }\n }\n }\n }\n }\n }\n\n // Reduce the local counts to get the global count\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // 
If the neighbor is connected and has not been visited, then add it to the stack\n if (A[node][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of connected components in its own subset of rows.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N] = {false};\n for (size_t j = 0; j < N; j++) {\n if (!visited[j] && A[i][j] == 1) {\n localCount++;\n dfs(A, visited, j, N);\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "cuda", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void componentCount(const int *A, size_t N, int *numComponents) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tint row = tid / N;\n\tint col = tid % N;\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\tatomicAdd(numComponents, 1);\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] != 0) {\n\t\t\tA[row * N + i] = 0;\n\t\t\tA[i * N + row] = 0;\n\t\t\tcomponentCount<<<1, N>>>(A, N, numComponents);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = -1;\n\t\t} else {\n\t\t\tnumComponents[tid] = tid;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && numComponents[tid] != -1) {\n\t\t\tnumComponents[k] = numComponents[tid];\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid < N && tid_y < N) {\n\t if (A[tid * N + tid_y] == 1) {\n\t atomicAdd(numComponents, 1);\n\t }\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tint row = tid / N;\n\tint col = tid % N;\n\tif (A[tid] == 0) {\n\t\tnumComponents[tid] = 0;\n\t\treturn;\n\t}\n\tint component = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] == 1) {\n\t\t\tcomponent = i;\n\t\t\tbreak;\n\t\t}\n\t}\n\tnumComponents[tid] = component;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tbool visited = false;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * 
N + k] == 1) {\n\t\t\t\tvisited = true;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tif (visited) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\t// Each thread checks if the current element is non-zero.\n\t// If so, it increments the component count.\n\tint row = blockIdx.x;\n\tint col = blockIdx.y;\n\tif (A[row * N + col] != 0) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint bid = blockIdx.y * blockDim.y + blockIdx.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[bid] == 0) {\n\t\tvisited[bid] = 1;\n\t\t(*numComponents)++;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcomponentCount(A, N, numComponents);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = 0;\n\t\t} else {\n\t\t\tnumComponents[tid] = 1;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tint numComponents_local = numComponents[tid];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && numComponents_local != -1) {\n\t\t\tnumComponents[k] = numComponents_local;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (i == 0) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (numComponents[k] != -1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t*numComponents = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint bid = blockIdx.y * blockDim.y + blockIdx.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[bid] == 0) {\n\t\tvisited[bid] = 1;\n\t\t*numComponents += 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcomponentCount(A, N, numComponents);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tidx = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tidy = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tidx >= N || tidy >= N) {\n\t\treturn;\n\t}\n\tif (A[tidx * N + tidy] == 0) {\n\t\treturn;\n\t}\n\tatomicAdd(numComponents, 1);\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[tid] == 0) {\n\t\tvisited[tid] = 1;\n\t\t__syncthreads();\n\t\tint count = 1;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tif 
(A[tid * N + j] == 1 && visited[j] == 0) {\n\t\t\t\tvisited[j] = 1;\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t\tatomicAdd(numComponents, count);\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\tint component = tid;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tcomponent = min(component, i);\n\t\t}\n\t}\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tA[tid_y * N + i] = component;\n\t\t}\n\t}\n\tif (tid == component) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = 0;\n\t\t} else {\n\t\t\tnumComponents[tid] = 1;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tint numComponents_local = numComponents[tid];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1) {\n\t\t\tnumComponents_local = min(numComponents_local, numComponents[k * N + j]);\n\t\t}\n\t}\n\tnumComponents[tid] = numComponents_local;\n\t__syncthreads();\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tcount += (numComponents[k * N + j] == numComponents_local);\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid_row == tid_col) {\n\t\tif (A[tid] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j && A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tidx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tidy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (tidx >= N || tidx >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tidx * N + i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The component count is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n local_count++;\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+: count)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top vertex from the stack\n size_t v = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and is connected to the vertex, add it to the stack\n if (!visited[j] && A[i][j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n count++;\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a visited array to keep track of which nodes have been visited\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over all the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then add the neighbor to the stack\n if (!visited[j] && A[node * N + j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) 
{\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int count = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (visited[u] == 0) {\n count++;\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) { q.push(j); }\n }\n }\n }\n local_count += count > 0;\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) { return global_count; }\n return -1;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If the neighbor has not been visited and is connected to the node, then visit the neighbor\n if (!visited[j] && A[i][j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n count++;\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n 
visited[node] = true;\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Visit all the unvisited neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && !visited[j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n MPI_Reduce(&componentCount, &componentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a DFS to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // Perform a DFS to visit all connected nodes\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the 
neighbor has not been visited, push the neighbor onto the stack and mark it as visited\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n while (!stack.empty()) {\n // Get the current node\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the current node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and is connected to the current node, then add it to the stack\n if (!visited[j] && A[i][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n MPI_Reduce(&componentCount, nullptr, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return componentCount;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n int global_count = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n int my_start = rank * (N / size);\n int my_end = (rank + 1) * (N / size);\n if (rank == size - 1) {\n my_end = N;\n }\n #pragma omp parallel for reduction(+ : local_count)\n for (int i = my_start; i < my_end; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_count++;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n delete[] visited;\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[j] = true;\n\n // Push all the unvisited neighbors of the node onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[i * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not 
been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all the neighbors of the node onto the stack\n for (size_t j = 0; j < N; ++j) {\n if (A[node * N + j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n // Push all the unvisited neighbors of the vertex onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a 
depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n count++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; k++) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count across all ranks\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the connected components for a subset of the rows\n int start_row = rank * (N / size);\n int end_row = (rank + 1) * (N / size);\n if (rank == size - 1) {\n end_row = N;\n }\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Count the number of connected components\n int count = 0;\n for (int i = start_row; i < end_row; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n\n // Reduce the count across all ranks\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "&" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A represents a directed graph.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) { count += A(i, j); });\n });\n return count;\n}\n", + "\n\n int edge_count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { edge_count += A(i, j); }\n });\n return edge_count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(\"edgeCount\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) {\n count += A(i, j);\n }\n });\n return count;\n}\n", + "\n\n Kokkos::View edgeCount(\"edgeCount\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n edgeCount(i) = 0;\n for (size_t j = 0; j < N; ++j) { edgeCount(i) += A(i, j); }\n });\n int totalEdgeCount = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &total) { total += edgeCount(i); }, totalEdgeCount);\n return totalEdgeCount;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, int &edgeCount) {\n for (size_t j = 0; j < N; ++j) {\n edgeCount += A(i, j);\n }\n },\n edgeCount);\n return edgeCount;\n}\n", + "\n\n int edge_count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n if (A(i, j) > 0) {\n ++edge_count;\n }\n }\n });\n return edge_count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n 
Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) { count += A(i, j); });\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n count += A(i, j);\n }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA (const int i) {\n for (int j = 0; j < N; j++) {\n if (A(i, j) != 0) {\n count++;\n }\n }\n });\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) { count += A[i * N + j] > 0; }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int edge_count = 0;\n#pragma omp parallel for reduction(+ : edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n edge_count += A[i * N + j];\n }\n }\n return edge_count;\n}\n", + "\n\n int count = 0;\n#pragma opm parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n 
for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i][j] > 0) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "serial", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n 
for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "hip", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n Store the result in numEdges. A represents a directed graph.\n A is an NxN adjacency matrix stored in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void edgeCount(const int *A, size_t N, int *numEdges) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0794302947998 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.6e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.07948468322754 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int count = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n hipAtomicAdd(numEdges, count);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[i * N + j];\n }\n }\n __syncthreads();\n atomicAdd(numEdges, numEdgesPerThread);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.07941094360352 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x;\n\tint numEdgesLocal = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tnumEdgesLocal += A[tid * N + i];\n\t}\n\thipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[i * N + j] > 0;\n }\n }\n atomicAdd(numEdges, numEdgesPerThread);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 5.8e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0795041809082 + } + ] + }, + { + 
"generated_output": "\n\n int tid = threadIdx.x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.6e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02078815803528 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02078274688721 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02078400230408 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += (A[tid * N + i] != 0);\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 5e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08296484375 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n for (int j = 0; j < N; j++) {\n if (A[tid * N + j] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01762799911499 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n 
int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 1.68e-07, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02846225204468 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[tid * N + j];\n }\n hipAtomicAdd(numEdges, numEdgesPerThread);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01898258323669 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 2.28e-07, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.01888646011353 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.8e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0794271774292 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int row = tid;\n int col = tid;\n int count = 0;\n while (row < N) {\n count += A[row * N + col];\n row += hipBlockDim_x;\n col += hipBlockDim_x;\n }\n hipAtomicAdd(numEdges, count);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": 
"15_graph_edge_count", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n for 
(size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + ", \u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2 Saxe\u51a2 Saxe\u51a2\u51a2\u51a2 Saxe\u51a2\u51a2\u51a2\u51a2", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] != 0) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n" + ] + }, + { + 
"problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "cuda", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n Store the result in numEdges. A represents a directed graph.\n A is an NxN adjacency matrix stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void edgeCount(const int *A, size_t N, int *numEdges) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x;\n int count = 0;\n for (int col = 0; col < N; col++) {\n count += A[row * N + col];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int rowStart = blockIdx.x * blockDim.x + threadIdx.x;\n if (rowStart >= N) {\n return;\n }\n int rowEnd = rowStart + blockDim.x;\n if (rowEnd > N) {\n rowEnd = N;\n }\n int count = 0;\n for (int i = rowStart; i < rowEnd; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] != 0) {\n count++;\n }\n }\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int j = 0; j < N; j++) {\n if (A[tid * N + j] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\nint rowStart = blockIdx.x * blockDim.x + threadIdx.x;\nif (rowStart >= N) {\n return;\n}\nint rowEnd = rowStart + blockDim.x;\nif (rowEnd > N) {\n rowEnd = N;\n}\nint count = 0;\nfor (int i = rowStart; i < rowEnd; ++i) {\n for (int j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n}\natomicAdd(numEdges, count);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; ++col) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row < N && col < N) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] == 1) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = threadIdx.x;\n int row = 
blockIdx.x * blockDim.x + tid;\n if (row >= N) {\n return;\n }\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; ++col) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma parallel for reduction (+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Allgather(&local_count, 1, MPI_INT, &global_count, 1, MPI_INT, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n int nthreads;\n omp_set_num_threads(nthreads);\n int local_count = 0;\n int global_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) 
{\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (myrank == 0) {\n return global_count;\n } else {\n return 0;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n int global_count;\n\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "\n\n int local_edge_count = 0;\n int global_edge_count;\n #pragma omp parallel for reduction(+: local_edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_edge_count += A[i * N + j];\n }\n }\n MPI_Reduce(&local_edge_count, &global_edge_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_edge_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_edge_count = 0;\n #pragma omp parallel for reduction(+: local_edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_edge_count += A[i * N + 
j];\n }\n }\n int global_edge_count;\n MPI_Reduce(&local_edge_count, &global_edge_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_edge_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n int global_count = 0;\n #pragma omp parallel for reduction(+: local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max = 0;\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; ++i) {\n // Iterate over the columns of the adjacency matrix\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between node i and node j, increment the degree of node i\n if (A(i, j) == 1) {\n ++max;\n }\n }\n }\n // Return the maximum degree\n return max;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n Kokkos::View maxDegree(\"maxDegree\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n maxDegree(i) = degree;\n });\n return *Kokkos::max_element(maxDegree);\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &max_degree) {\n int degree = 0;\n for (int j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A(i, j);\n }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &maxDegree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n maxDegree = Kokkos::max(maxDegree, degree);\n });\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, 
KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::View;\n\n int max_degree = 0;\n Kokkos::parallel_reduce(\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_degree) {\n int degree = 0;\n for (int j = 0; j < N; j++) {\n degree += A(i, j);\n }\n max_degree = std::max(max_degree, degree);\n },\n max_degree);\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n if (degree > max_degree) { max_degree = degree; }\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n Kokkos::View maxDegree(\"maxDegree\", N);\n Kokkos::parallel_for(\"maxDegree\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A(i, j);\n }\n maxDegree(i) = degree;\n });\n int max = 0;\n for (size_t i = 0; i < N; i++) {\n if (maxDegree(i) > max) {\n max = maxDegree(i);\n }\n }\n return max;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "_t>", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n 
max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "serial", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_deg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n max_deg = std::max(max_deg, deg);\n }\n return max_deg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) { deg += A[i * N + j]; }\n maxDeg = std::max(maxDeg, deg);\n }\n return maxDeg;\n}\n", + "\n\n int max_deg = 0;\n for (int i = 0; i < N; i++) {\n int deg = 0;\n for (int j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n max_deg = std::max(max_deg, deg);\n }\n return max_deg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i][j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int 
max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) { deg += A[i * N + j]; }\n maxDeg = std::max(maxDeg, deg);\n }\n return maxDeg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "hip", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Store the result in maxDegree.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void maxDegree(const int *A, size_t N, int *maxDegree) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n localMax = max(localMax, *maxDegree);\n if (threadIdx.x == 0) {\n *maxDegree = localMax;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02005820999146 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, 
degree);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n atomicMax(&(*maxDegree), max);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n hipAtomicMax(&(*maxDegree), degree);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n hipAtomicMax(maxDegree, degree);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int myDegree = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.6e-08, + 
"runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02082821960449 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; ++i) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.01804700508118 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint row = bid * hipBlockDim_x + tid;\n\tint degree = 0;\n\tif (row < N) {\n\t\tfor (int col = 0; col < N; col++) {\n\t\t\tdegree += A[row * N + col];\n\t\t}\n\t}\n\thipAtomicMax(maxDegree, degree);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + 
"generated_output": "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n *maxDegree = hipAtomicMax(*maxDegree, degree);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n }\n int max_degree;\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "\n\n int my_max = 0;\n for (size_t i = 0; i < N; i++) {\n int row_sum = 0;\n for (size_t j = 0; j < N; j++) {\n row_sum += A[i * N + j];\n }\n my_max = std::max(my_max, row_sum);\n }\n\n int global_max;\n MPI_Allreduce(&my_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int rowSum = 0;\n for (size_t j = 0; j < N; j++) { rowSum += A[i * N + j]; }\n myDegree = std::max(myDegree, rowSum);\n }\n\n int globalDegree;\n MPI_Allreduce(&myDegree, &globalDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] != 0) {\n myDegree++;\n }\n }\n }\n int maxDegree = 0;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 
0; j < N; j++) {\n deg += A[i * N + j];\n }\n maxDeg = std::max(maxDeg, deg);\n }\n int globalMaxDeg;\n MPI_Allreduce(&maxDeg, &globalMaxDeg, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDeg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n myDegree++;\n }\n }\n }\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int local_max = 0;\n for (size_t i = 0; i < N; i++) {\n int local_degree = 0;\n for (size_t j = 0; j < N; j++) {\n local_degree += A[i * N + j];\n }\n local_max = std::max(local_max, local_degree);\n }\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n if (degree > maxDegree) {\n maxDegree = degree;\n }\n }\n return maxDegree;\n}\n", + "\n\n int local_max = 0;\n for (size_t i = 0; i < N; i++) {\n int row_sum = 0;\n for (size_t j = 0; j < N; j++) {\n row_sum += A[i * N + j];\n }\n local_max = std::max(local_max, row_sum);\n }\n\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int my_max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n my_max_degree = std::max(my_max_degree, degree);\n }\n\n int max_degree;\n MPI_Allreduce(&my_max_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "\n\n int my_degree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n\n int max_degree;\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "Degree)", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n myDegree += A[i * N + j];\n }\n }\n\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n\n int globalMaxDegree;\n MPI_Allreduce(&myDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n myDegree += A[i * N + j];\n }\n }\n\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n if 
(A[i * N + j] == 1) {\n degree++;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, °ree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n }\n return maxDegree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "cuda", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph. \n Store the result in maxDegree.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void maxDegree(const int *A, size_t N, int *maxDegree) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n if (localMax > *maxDegree) {\n *maxDegree = localMax;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n if (tid == 0) {\n *maxDegree = max;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, myDegree);\n}\n", + "\n\n int tid = threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n maxDegree[tid] = max;\n}\n", + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = max(*maxDegree, localMax);\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint max = 
0;\n\tif (tid < N) {\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tmax += A[tid * N + i];\n\t\t}\n\t}\n\tmaxDegree[tid] = max;\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tint max_degree = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tmax_degree = max(max_degree, maxDegree[i]);\n\t\t}\n\t\t*maxDegree = max_degree;\n\t}\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = myDegree;\n for (size_t i = 1; i < gridDim.x; i++) {\n int otherDegree;\n cudaMemcpy(&otherDegree, &A[i * blockDim.x * N], sizeof(int), cudaMemcpyDeviceToHost);\n *maxDegree = max(*maxDegree, otherDegree);\n }\n }\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = max(*maxDegree, myDegree);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "Degree,", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(&*maxDegree, myDegree);\n}\n", + "\n\n\tint myMaxDegree = 0;\n\tint rowStart = blockIdx.x * blockDim.x + threadIdx.x;\n\tfor (int i = rowStart; i < N; i += blockDim.x * gridDim.x) {\n\t\tint degree = 0;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tdegree += A[i * N + j];\n\t\t}\n\t\tmyMaxDegree = max(myMaxDegree, degree);\n\t}\n\tatomicAdd(maxDegree, myMaxDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n myDegree++;\n }\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n int my_degree = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk_size = N / nprocs;\n int start_row = rank * chunk_size;\n int end_row = start_row + chunk_size;\n if (rank == nprocs - 1) {\n end_row = N;\n }\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_degree;\n }\n return -1;\n}\n", + "\n\n int max_degree = 0;\n int my_max_degree = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel for reduction(max : my_max_degree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_max_degree += A[i * N + j];\n }\n }\n\n MPI_Allreduce(&my_max_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return max_degree;\n } else {\n return -1;\n }\n}\n", + "\n\n int maxDegree = 0;\n int localMaxDegree = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel for reduction(max : localMaxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n localMaxDegree += A[i * N + j];\n }\n }\n\n MPI_Allreduce(&localMaxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int local_max_degree = 0;\n int global_max_degree = 0;\n\n // Compute the local maximum degree\n #pragma omp parallel for reduction(max : local_max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n local_max_degree = std::max(local_max_degree, degree);\n }\n\n // Reduce the local maximum degrees to find the global maximum degree\n MPI_Allreduce(&local_max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max_degree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n\n #pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, my_degree);\n my_degree = 0;\n }\n\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int localMaxDegree = 0;\n #pragma omp parallel for reduction(max:localMaxDegree)\n for (size_t i = 0; i < N; i++) {\n 
int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n localMaxDegree = std::max(localMaxDegree, degree);\n }\n MPI_Reduce(&localMaxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int myDegree = 0;\n #pragma omp parallel for reduction(max:myDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n myDegree = std::max(myDegree, degree);\n }\n\n int globalDegree;\n MPI_Reduce(&myDegree, &globalDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return globalDegree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n #pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int local_max = 0;\n for (int i = 0; i < N; i++) {\n int local_count = 0;\n for (int j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n local_max = std::max(local_max, local_count);\n }\n\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n #pragma omp parallel for reduction(max: maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n #pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, my_degree);\n my_degree = 0;\n }\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n MPI_Reduce(&maxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n int max_degree = 
0;\n #pragma opm parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int num_threads;\n omp_set_num_threads(4);\n #pragma omp parallel\n {\n #pragma omp for reduction(+:my_degree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n }\n MPI_Reduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(Kokkos::View &A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n visited[i] = 0;\n });\n\n dist[source] = 0;\n visited[source] = 1;\n\n int minDist = INT_MAX;\n\n while (visited[dest] == 0) {\n int minDistNode = -1;\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0 && (minDistNode == -1 || dist[i] < dist[minDistNode])) {\n minDistNode = i;\n }\n }\n\n if (minDistNode == -1) {\n break;\n }\n\n visited[minDistNode] = 1;\n\n for (int i = 0; i < N; i++) {\n if (A(minDistNode, i) == 1 && visited[i] == 0) {\n if (dist[i] == -1 || dist[i] > dist[minDistNode] + 1) {\n dist[i] = dist[minDistNode] + 1;\n }\n }\n }\n }\n\n if (dist[dest] == -1) {\n return -1;\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n Kokkos::View visited(\"visited\", N);\n Kokkos::parallel_for(\"bfs\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n visited[i] = false;\n });\n visited[source] = true;\n Kokkos::View queue(\"queue\", N);\n int head = 0;\n int tail = 0;\n queue[tail++] = source;\n while (head < tail) {\n int u = queue[head++];\n for (int v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n queue[tail++] = v;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { distances[i] = std::numeric_limits::max(); });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int current = q.front();\n q.pop();\n\n // Iterate over all neighbors of the current node\n for (size_t i = 0; i < N; i++) {\n if (A(current, i) == 1 && distances[i] == std::numeric_limits::max()) {\n distances[i] = distances[current] + 1;\n q.push(i);\n }\n }\n }\n\n // Return the distance to the destination\n return distances[dest];\n}\n", + "\n\n 
Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n dist() = -1;\n dist(source) = 0;\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n visited(source) = true;\n Kokkos::View queue(\"queue\", N);\n queue() = -1;\n int head = 0;\n int tail = 0;\n queue(tail++) = source;\n while (head != tail) {\n int u = queue(head++);\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited(v)) {\n dist(v) = dist(u) + 1;\n visited(v) = true;\n queue(tail++) = v;\n }\n }\n }\n return dist(dest);\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) { distances[i] = INT_MAX; });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Queue of nodes to visit\n std::queue q;\n q.push(source);\n\n // While there are nodes to visit\n while (!q.empty()) {\n // Get the current node\n int current = q.front();\n q.pop();\n\n // If the current node is the destination, return the distance\n if (current == dest) {\n return distances[current];\n }\n\n // Visit all neighbors of the current node\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A(current, i) == 1) {\n // If the distance to the neighbor is greater than the distance to the current node plus 1\n if (distances[i] > distances[current] + 1) {\n // Update the distance to the neighbor\n distances[i] = distances[current] + 1;\n // Add the neighbor to the queue\n q.push(i);\n }\n }\n }\n }\n\n // If there is no path from the source to the destination, return -1\n return -1;\n}\n", + "\n\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = Kokkos::TeamMember;\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::View distance(\"distance\", N);\n Kokkos::View queue(\"queue\", N);\n\n visited[source] = 1;\n distance[source] = 0;\n queue[0] = source;\n\n int head = 0;\n int tail = 1;\n\n while (head < tail) {\n int u = queue[head++];\n for (int v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n visited[v] = 1;\n distance[v] = distance[u] + 1;\n queue[tail++] = v;\n }\n }\n }\n\n return distance[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) { distances[i] = std::numeric_limits::max(); });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Initialize the queue with the source\n std::queue q;\n q.push(source);\n\n // While the queue is not empty\n while (!q.empty()) {\n // Get the current vertex\n int current = q.front();\n q.pop();\n\n // If the current vertex is the destination, return the distance\n if (current == dest) {\n return distances[current];\n }\n\n // For each neighbor of the current vertex\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current vertex and the neighbor\n if (A(current, i) == 1) {\n // If the distance to the neighbor is greater than the distance to the current vertex plus 1\n if (distances[i] > distances[current] + 1) {\n // Update the distance to the neighbor\n distances[i] = distances[current] + 1;\n\n // Add the neighbor to the queue\n 
q.push(i);\n }\n }\n }\n }\n\n // If the destination is not reachable, return -1\n return -1;\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n size_t head = 0;\n size_t tail = 0;\n queue[tail++] = source;\n\n while (head != tail) {\n int u = queue[head++];\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n queue[tail++] = v;\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist(i) = -1;\n visited(i) = false;\n queue(i) = false;\n });\n\n dist(source) = 0;\n queue(source) = true;\n\n while (true) {\n int min_dist = -1;\n int min_idx = -1;\n for (int i = 0; i < N; i++) {\n if (queue(i) && (min_dist == -1 || dist(i) < min_dist)) {\n min_dist = dist(i);\n min_idx = i;\n }\n }\n\n if (min_idx == -1) {\n break;\n }\n\n queue(min_idx) = false;\n visited(min_idx) = true;\n\n for (int i = 0; i < N; i++) {\n if (A(min_idx, i) > 0 && !visited(i)) {\n if (dist(i) == -1 || dist(i) > dist(min_idx) + 1) {\n dist(i) = dist(min_idx) + 1;\n queue(i) = true;\n }\n }\n }\n }\n\n return dist(dest);\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t i = 0; i < N; i++) {\n if (A(u, i) && dist[i] == -1) {\n dist[i] = dist[u] + 1;\n q.push(i);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View distance = Kokkos::View(\"distance\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) { distance[i] = -1; });\n distance[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && distance[v] == -1) {\n distance[v] = distance[u] + 1;\n q.push(v);\n }\n }\n }\n return distance[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = INT_MAX;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n q.push(v);\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && dist[v] == -1) {\n 
dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n Kokkos::View queue_head(\"queue_head\", 1);\n Kokkos::View queue_tail(\"queue_tail\", 1);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n queue_head[0] = 0;\n queue_tail[0] = 1;\n queue[0] = source;\n\n while (queue_head[0] < queue_tail[0]) {\n int u = queue[queue_head[0]];\n queue_head[0]++;\n for (int v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n visited[v] = true;\n dist[v] = dist[u] + 1;\n queue[queue_tail[0]] = v;\n queue_tail[0]++;\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n dist() = -1;\n dist(source) = 0;\n\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n visited(source) = true;\n\n Kokkos::View queue(\"queue\", N);\n queue() = -1;\n int head = 0;\n int tail = 0;\n queue(tail++) = source;\n\n while (head != tail) {\n int u = queue(head++);\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && !visited(v)) {\n dist(v) = dist(u) + 1;\n visited(v) = true;\n queue(tail++) = v;\n }\n }\n }\n\n return dist(dest);\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n distances() = std::numeric_limits::max();\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n distances(source) = 0;\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t i = 0; i < N; i++) {\n if (A(u, i) == 1 && distances(i) == std::numeric_limits::max()) {\n q.push(i);\n distances(i) = distances(u) + 1;\n }\n }\n }\n\n // Return distance to destination\n return distances(dest);\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n distances() = std::numeric_limits::max();\n\n // Initialize queue with source node\n std::queue queue;\n queue.push(source);\n distances(source) = 0;\n\n // Perform BFS\n while (!queue.empty()) {\n int current = queue.front();\n queue.pop();\n\n // Iterate over neighbors of current node\n for (size_t i = 0; i < N; i++) {\n if (A(current, i) == 1 && distances(i) == std::numeric_limits::max()) {\n distances(i) = distances(current) + 1;\n queue.push(i);\n }\n }\n }\n\n // Return distance to destination\n return distances(dest);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector dist(N, -1);\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[u] + 1 < dist[v]) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n 
// Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] > dist[u] + 1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize the distance to all nodes to infinity\n std::vector distance(N, INT_MAX);\n\n // Set the distance to the source node to 0\n distance[source] = 0;\n\n // Create a queue to store the nodes to be visited\n std::queue queue;\n\n // Enqueue the source node\n queue.push(source);\n\n // While there are still nodes to be visited\n while (!queue.empty()) {\n // Dequeue the next node\n int current = queue.front();\n queue.pop();\n\n // Visit all the neighbors of the current node\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A[current][i] > 0) {\n // If the distance to the neighbor is greater than the distance to the current node plus the weight of the edge\n if (distance[i] > distance[current] + A[current][i]) {\n // Update the distance to the neighbor\n distance[i] = distance[current] + A[current][i];\n\n // Enqueue the neighbor\n queue.push(i);\n }\n }\n }\n }\n\n // Return the distance to the destination node\n return distance[dest];\n}\n", + "\n\n int dist[N];\n std::fill(dist, 
dist + N, -1);\n dist[source] = 0;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n if (u == dest) {\n return dist[u];\n }\n\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int min_dist = INT_MAX;\n std::vector dist(N, INT_MAX);\n std::vector visited(N, false);\n\n dist[source] = 0;\n visited[source] = true;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n if (A[source][i] == 1) {\n dist[i] = 1;\n visited[i] = true;\n }\n }\n\n #pragma omp for\n for (size_t k = 0; k < N; k++) {\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i][k] == 1 && A[k][j] == 1 && dist[i] != INT_MAX && dist[j] > dist[i] + 1) {\n dist[j] = dist[i] + 1;\n visited[j] = true;\n }\n }\n }\n }\n }\n\n min_dist = dist[dest];\n return min_dist == INT_MAX ? -1 : min_dist;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "serial", + "prompt": "/* 
Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, 0);\n std::queue> q;\n q.push({source, 0});\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n if (visited[node] == 1) {\n continue;\n }\n visited[node] = 1;\n for (size_t i = 0; i < N; i++) {\n if (A[node][i] == 1) {\n q.push({i, dist + 1});\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n visited[source] = true;\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n for (int i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n visited[i] = true;\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n visited[source] = 
true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n dist[v] = dist[u] + 1;\n visited[v] = true;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) return dist;\n if (visited[node]) continue;\n visited[node] = true;\n for (size_t i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n if (v == dest) {\n return dist[v];\n }\n }\n }\n }\n return -1;\n}\n", + "=", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n q.push(v);\n dist[v] = dist[u] + 1;\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == -1) {\n q.push(v);\n dist[v] = dist[u] + 1;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push(std::make_pair(source, 0));\n while (!q.empty()) {\n std::pair p = q.front();\n q.pop();\n if (p.first == dest) {\n return p.second;\n }\n if (visited[p.first]) {\n continue;\n }\n visited[p.first] = true;\n for (size_t i = 0; i < N; ++i) {\n if (A[p.first][i] == 1) {\n q.push(std::make_pair(i, p.second + 1));\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n visited[source] = true;\n\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n for (int i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n visited[i] = true;\n }\n }\n }\n\n return -1;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "hip", + "prompt": "/* Compute the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Store the result in pathLength.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\n__global__ void shortestPathLength(const int *A, size_t N, int source, int dest, int *pathLength) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\n if (tid == source) {\n pathLength[bid] = 0;\n } else {\n pathLength[bid] = -1;\n }\n\n __syncthreads();\n\n for (int i = 0; i < N; i++) {\n if (pathLength[bid] >= 0 && A[tid * N + i] == 1) {\n if (i == dest) {\n pathLength[bid] = pathLength[bid] + 1;\n break;\n } else if (pathLength[i * N + dest] == -1 ||\n pathLength[i * N + dest] > pathLength[bid] + 1) {\n pathLength[i * N + dest] = pathLength[bid] + 1;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0088043114, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.18021502075195 + } + ] + }, + { + "generated_output": "\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint distance = INT_MAX;\n\tif (tid == source) {\n\t\tdistance = 0;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[tid * N + k] == 1) {\n\t\t\tif (distance + 1 < pathLength[k]) {\n\t\t\t\tpathLength[k] = distance + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n\tif (tid == dest) {\n\t\t*pathLength = distance;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0084633122, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.83676007080078 + } + ] + }, + { + "generated_output": "\n\n int n = hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x;\n int m = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x;\n if (n == source) {\n pathLength[n] = 0;\n __syncthreads();\n for (int i = 0; i < N; ++i) {\n if (A[n * N + i] == 1) {\n pathLength[i] = 1;\n }\n }\n } else {\n __syncthreads();\n for (int i = 0; i < N; ++i) {\n if (A[n * N + i] == 1) {\n if (pathLength[i] != -1) {\n pathLength[n] = pathLength[i] + 1;\n }\n }\n }\n }\n __syncthreads();\n if (n == dest) {\n *pathLength = pathLength[n];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int k = 0; k < N; k++) {\n if (A[tid * N + k] == 1 && tid != k) {\n if (pathLength[tid] == -1 || pathLength[k] + 1 < pathLength[tid]) {\n pathLength[tid] = pathLength[k] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + 
"did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0082452072, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.86717590332031 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n if (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n pathLength[i] = pathLength[tid] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0092930176, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.13950854797363 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0085804972, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.13973889160156 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n if (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n pathLength[i] = pathLength[tid] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.009485044, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.13583089599609 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == source) {\n pathLength[bid] = 0;\n } else {\n pathLength[bid] = -1;\n }\n __syncthreads();\n int minPath = pathLength[bid];\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] && pathLength[i] != -1) {\n minPath = min(minPath, pathLength[i] + 1);\n }\n }\n pathLength[bid] = minPath;\n __syncthreads();\n if (tid == dest) {\n *pathLength = minPath;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0084927186, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 1.12962565917969 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t} else if (A[i * N + j] == 1) 
{\n\t\t*pathLength = 2;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0064786774, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00036159779429 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tif (tid == source && bid == 0) {\n\t\t*pathLength = 0;\n\t} else if (tid == dest && bid == N - 1) {\n\t\t*pathLength = -1;\n\t} else {\n\t\t*pathLength = INT_MAX;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tif (bid + 1 < N) {\n\t\t\t\tint next = i + (bid + 1) * N;\n\t\t\t\tatomicMin(&pathLength[next], *pathLength + 1);\n\t\t\t}\n\t\t\tif (bid - 1 >= 0) {\n\t\t\t\tint next = i + (bid - 1) * N;\n\t\t\t\tatomicMin(&pathLength[next], *pathLength + 1);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] == 1 && tid != i) {\n if (pathLength[tid] == -1) {\n pathLength[tid] = pathLength[i] + 1;\n } else {\n pathLength[tid] = min(pathLength[tid], pathLength[i] + 1);\n }\n }\n }\n __syncthreads();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.012171926, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 1.36658303222656 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0076309776, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.13831879272461 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int i = tid / N;\n int j = tid % N;\n if (i == source) {\n pathLength[j] = 1;\n } else {\n pathLength[j] = -1;\n }\n __syncthreads();\n for (int k = 0; k < N; ++k) {\n if (pathLength[k] == -1) {\n continue;\n }\n if (A[i * N + k] == 1) {\n if (pathLength[j] == -1 || pathLength[j] > pathLength[k] + 1) {\n pathLength[j] = pathLength[k] + 1;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + 
"did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0056066672, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.22808157958984 + } + ] + }, + { + "generated_output": "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tridy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tint i = tidx;\n\tint j = tridy;\n\tif (i == source) {\n\t\tpathLength[i] = 0;\n\t} else {\n\t\tpathLength[i] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0072581898, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.93913726806641 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (tid == source && bid == dest) {\n *pathLength = 0;\n } else if (tid == dest && bid == source) {\n *pathLength = 0;\n } else if (A[bid * N + tid] == 1) {\n *pathLength = 1;\n } else {\n *pathLength = -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.007964566, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00041961379647 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int distance = INT_MAX;\n if (tid == source) {\n distance = 0;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] && distance + 1 < pathLength[i]) {\n pathLength[i] = distance + 1;\n }\n }\n __syncthreads();\n if (tid == dest) {\n *pathLength = distance;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0103562418, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 1.13222331542969 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int distance = INT_MAX;\n if (tid == source) {\n distance = 0;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] == 1 && tid != i) {\n if (distance + 1 < distance) {\n distance = distance + 1;\n }\n }\n __syncthreads();\n }\n if (tid == dest) {\n *pathLength = distance;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.010084751, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.04780999908447 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0072353938, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00036406179965 + } + ] + }, + { + "generated_output": "= Synagoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesogues Synag Synag Synag Synagoguesoguesoguesoguesogues", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0101595636, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.13880662231445 + } + ] + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The shortest path length is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform relaxation\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector gathered_dist(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, gathered_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, gathered_dist[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return shortest path length on rank 0\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 
1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length from source to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distance to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // If we have not reached the destination, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // If u is the destination, return the distance\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // If no path exists, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int dist[N];\n for (int i = 0; i < N; i++) {\n dist[i] = INT_MAX;\n }\n dist[source] = 0;\n\n int next[N];\n for (int i = 0; i < N; i++) {\n next[i] = -1;\n }\n\n int visited[N];\n for (int i = 0; i < N; i++) {\n visited[i] = false;\n }\n\n int u;\n while (true) {\n u = -1;\n for (int i = 0; i < N; i++) {\n if (!visited[i] && (u == -1 || dist[i] < dist[u])) {\n u = i;\n }\n }\n if (u == -1 || dist[u] == INT_MAX) {\n break;\n }\n visited[u] = true;\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] != 0 && dist[u] + 1 < dist[v]) {\n dist[v] = dist[u] + 1;\n next[v] = u;\n }\n }\n }\n\n if (rank == 0) {\n return dist[dest];\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n 
while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == std::numeric_limits::max()) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * num_ranks);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (my_rank == 0) {\n for (int i = 0; i < N * num_ranks; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector dist(N, -1);\n dist[source] = 0;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int curr = q.front();\n q.pop();\n\n if (curr == dest) {\n return dist[curr];\n }\n\n for (int i = 0; i < N; i++) {\n if (A[curr][i] == 1 && dist[i] == -1) {\n dist[i] = dist[curr] + 1;\n q.push(i);\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) 
{\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return shortest path length if destination is reachable\n if (dist[dest] != INT_MAX) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Iterate over all vertices\n for (int i = 0; i < N; ++i) {\n // Each rank computes the shortest path from source to all other vertices\n for (int j = 0; j < N; ++j) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n\n // Gather the distances from all ranks\n std::vector all_dist(N * num_ranks);\n MPI_Allgather(dist.data(), N, MPI_INT, all_dist.data(), N, MPI_INT, MPI_COMM_WORLD);\n\n // Update the distances on each rank\n for (int j = 0; j < N; ++j) {\n for (int k = 0; k < num_ranks; ++k) {\n if (all_dist[k * N + j] < dist[j]) {\n dist[j] = all_dist[k * N + j];\n }\n }\n }\n }\n\n // Return the shortest path length on rank 0\n if (my_rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == std::numeric_limits::max()) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "cuda", + "prompt": "/* Compute the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Store the result in pathLength.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\n__global__ void shortestPathLength(const int *A, size_t N, int source, int dest, int *pathLength) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\n\tint count = 0;\n\twhile (pathLength[dest] == -1) {\n\t\t__syncthreads();\n\t\tif (pathLength[tid] == count) {\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tif (A[tid * N + i] && pathLength[i] == -1) {\n\t\t\t\t\tpathLength[i] = count + 1;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tcount++;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tif (pathLength[j] == -1) {\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] && pathLength[k] != -1) {\n\t\t\t\tpathLength[j] = pathLength[k] + 1;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "_t = 0, 1, 2, 3", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\n\tif (i == source && j == dest) {\n\t\t*pathLength = 0;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 1;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = (A[i * N + j] == 1) ? 
1 : INT_MAX;\n\t}\n\telse {\n\t\tpathLength[j] = INT_MAX;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t}\n\telse if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t}\n\telse {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] != 0) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid_row == source) {\n\t\tpathLength[tid_col] = 1;\n\t} else {\n\t\tpathLength[tid_col] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid_row] > 0 && A[tid_row * N + i] > 0) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid_row] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_row * N + i] == 1 && tid_row != tid_col) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid_row] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1) {\n\t\t\tif (A[tid * N + i] == 1 && pathLength[i] == -1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = (A[i * N + j] == 1) ? 
1 : INT_MAX;\n\t} else {\n\t\tpathLength[j] = INT_MAX;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] != INT_MAX && A[k * N + j] == 1) {\n\t\t\tpathLength[j] = min(pathLength[j], pathLength[k] + 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\n\t__syncthreads();\n\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_row * N + i] == 1 && pathLength[tid_row] != -1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid_row] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\n\t__syncthreads();\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint distance = 0;\n\tif (tid == source) {\n\t\tdistance = 0;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tif (distance + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = distance + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == dest) {\n\t\t*pathLength = distance;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] == -1) {\n\t\t\tcontinue;\n\t\t}\n\t\tif (A[k * N + j] == 1) {\n\t\t\tif (pathLength[j] == -1 || pathLength[j] > pathLength[k] + 1) {\n\t\t\t\tpathLength[j] = pathLength[k] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid_row == source) {\n\t\tpathLength[tid_col] = 1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[tid_row * N + k] && pathLength[tid_row] > 0) {\n\t\t\tpathLength[tid_col] = min(pathLength[tid_col], pathLength[tid_row] + 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tidx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tidy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (tidx == source && tidx == tidy) {\n\t\tpathLength[tidx] = 0;\n\t} else if (tidx == tidy) {\n\t\tpathLength[tidx] = -1;\n\t} else if (A[tidx * N + tidy] == 1) {\n\t\tpathLength[tidx] = 1;\n\t} else {\n\t\tpathLength[tidx] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[tidx] == -1 || pathLength[tidy] == -1) {\n\t\t\tcontinue;\n\t\t}\n\t\tif (pathLength[tidx] + pathLength[tidy] + 1 < pathLength[k]) {\n\t\t\tpathLength[k] = pathLength[tidx] + pathLength[tidy] + 1;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = A[i * N + j];\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tint minPath = pathLength[j];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] >= 0 && A[j * N + k] > 0) {\n\t\t\tint newPath = pathLength[k] + A[j * N + k];\n\t\t\tif (newPath < minPath) {\n\t\t\t\tminPath = newPath;\n\t\t\t}\n\t\t}\n\t}\n\tpathLength[j] = minPath;\n\t__syncthreads();\n\tif (j == dest) {\n\t\t*pathLength = pathLength[j];\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in 
row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The shortest path length is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the distance array to all infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize the queue with the source node\n std::queue q;\n q.push(source);\n\n // Perform a breadth-first search to find the shortest path\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to the destination\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Use OpenMP for parallel BFS\n #pragma omp parallel\n {\n while (true) {\n bool updated = false;\n\n // Relax all edges in parallel\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n updated = true;\n }\n }\n }\n\n // Check if any rank has updated distances\n int global_updated;\n MPI_Allreduce(&updated, &global_updated, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n // If no rank has updated distances, break\n if (!global_updated) {\n break;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n 
#pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length on rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector recv_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &recv_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return recv_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Return the shortest path length\n return shortest_path_length;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n for (int i = tid; i < N; i += num_threads) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Use OpenMP for parallelization within each rank\n #pragma omp parallel\n {\n // Perform BFS using a queue\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // For each neighbor of u\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n }\n\n // Gather the distances from all ranks to rank 0\n std::vector all_dist(N * MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Allgather(&dist[0], N, MPI::INT, &all_dist[0], N, MPI::INT);\n\n // Find the minimum distance to dest\n int min_dist = INT_MAX;\n for (int i = 0; i < N * MPI::COMM_WORLD.Get_size(); ++i) {\n if (all_dist[i] != INT_MAX && i / N == dest) {\n min_dist = std::min(min_dist, all_dist[i]);\n }\n }\n\n // Return the minimum distance\n return min_dist;\n}\n", + "\n\n // Initialize the distance to the source to 0 and all other distances to infinity\n std::vector distance(N, std::numeric_limits::max());\n distance[source] = 0;\n\n // Initialize the queue with the source\n std::queue queue;\n queue.push(source);\n\n // While the queue is not empty\n while (!queue.empty()) {\n // Dequeue the next node\n int current = queue.front();\n queue.pop();\n\n // If the current node is the destination, return the distance\n if (current == dest) {\n return distance[current];\n }\n\n // For each neighbor of the current node\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A[current * N + i] == 1) {\n // If the distance to the neighbor is greater than the distance to the current node plus 1\n if (distance[i] > distance[current] + 1) {\n // Set the distance to the neighbor to the distance to the current node plus 1\n distance[i] = distance[current] + 1;\n // Enqueue the neighbor\n queue.push(i);\n }\n }\n }\n }\n\n // If the destination is not reachable, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n 
std::vector global_dist(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i++) {\n if (i % N == dest) {\n shortest_path_length = std::min(shortest_path_length, global_dist[i]);\n }\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n for (int i = tid; i < N; i += num_threads) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances from all ranks to rank 0\n std::vector global_dist(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length on rank 0\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, global_dist[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int curr = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (curr == dest) {\n return dist[curr];\n }\n\n // Relax neighbors\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n if (A[curr * N + i] && dist[i] > dist[curr] + 1) {\n dist[i] = dist[curr] + 1;\n q.push(i);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n // Initialize the distance array.\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation.\n for (int i = 0; i < N - 1; i++) {\n#pragma omp parallel for\n for (int j = 0; j < N; j++) {\n for (int k = 0; k < N; k++) {\n if (A[j * N + k] != 0 && dist[j] + A[j * N + 
k] < dist[k]) {\n dist[k] = dist[j] + A[j * N + k];\n }\n }\n }\n }\n\n // Check for negative-weight cycles.\n for (int i = 0; i < N - 1; i++) {\n#pragma omp parallel for\n for (int j = 0; j < N; j++) {\n for (int k = 0; k < N; k++) {\n if (A[j * N + k] != 0 && dist[j] + A[j * N + k] < dist[k]) {\n return -1;\n }\n }\n }\n }\n\n // Return the shortest path length.\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n if (A[source][i] == 1) {\n dist[i] = 1;\n }\n }\n\n #pragma omp barrier\n\n for (int j = 1; j < N; j++) {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n if (dist[i] == j) {\n #pragma omp for\n for (int k = 0; k < N; k++) {\n if (A[i][k] == 1 && dist[k] > j + 1) {\n dist[k] = j + 1;\n }\n }\n }\n }\n\n #pragma omp barrier\n }\n }\n\n // Gather distances on rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length on rank 0\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n int min_odd = INT_MAX;\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n\n return min_odd;\n}\n", + "&", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n },\n min_odd);\n return min_odd;\n}\n", + "\n\n int result = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& result) {\n if (x(i) % 2 == 1 && x(i) < result) {\n result = x(i);\n }\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n Kokkos::View y(\"y\", x.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n y(i) = (x(i) % 2 == 1) ? 
x(i) : std::numeric_limits::max();\n });\n int min = std::numeric_limits::max();\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, int& min) {\n min = std::min(min, y(i));\n });\n return min;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int n = x.extent(0);\n View> y(\"y\", n);\n\n parallel_for(\"smallestOdd\", n, KOKKOS_LAMBDA(const int i) {\n if (i % 2 == 1) {\n y(i) = x(i);\n } else {\n y(i) = std::numeric_limits::max();\n }\n });\n\n int min_odd = std::numeric_limits::max();\n for (int i = 0; i < n; ++i) {\n min_odd = std::min(min_odd, y(i));\n }\n\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n\n int min_odd = INT_MAX;\n\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n\n return min_odd;\n}\n", + "\n\n int min_odd = Kokkos::parallel_reduce(Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1) {\n min_odd = std::min(min_odd, x(i));\n }\n }, min_odd);\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto min_odd = INT_MAX;\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int smallest = Kokkos::Max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& smallest) {\n if (x(i) % 2 == 1 && x(i) < smallest) {\n smallest = x(i);\n }\n });\n return smallest;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Policy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) { min_odd = x(i); }\n }, min_odd);\n return min_odd;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Policy = Kokkos::RangePolicy;\n\n int smallest = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(Policy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& smallest) {\n if (x(i) % 2 == 1 && x(i) < smallest) {\n smallest = x(i);\n }\n });\n\n return smallest;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n int min_odd = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(\n policy_t(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n },\n min_odd);\n\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + 
"do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min:min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n min_odd = std::min(min_odd, x[i]);\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n#pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int minOdd = INT_MAX;\n #pragma omp parallel for reduction(min:minOdd)\n for (int i = 0; i < static_cast(x.size()); ++i) {\n if (x[i] % 2 != 0 && x[i] < minOdd) {\n minOdd = x[i];\n }\n }\n return minOdd;\n}\n", + ": 1", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n min_odd = std::min(min_odd, x[i]);\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n#pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel for reduction(min:smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = 
INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min:min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "serial", + "prompt": "/* Return the value of the smallest odd number in the vector x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 != 0) {\n return x[i];\n }\n }\n return -1;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest_odd) {\n smallest_odd = x[i];\n }\n }\n return smallest_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest 
= INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "smallest&", + "\n\n int smallest = -1;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && (smallest == -1 || x[i] < smallest)) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "hip", + "prompt": "/* Find the value of the smallest odd number in the vector x. Store it in smallest.\n Use AMD HIP to compute in parallel. The kernel is launched with the same number of threads as elements in x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\n__global__ void smallestOdd(const int *x, size_t N, int *smallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0769118138, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.97798607177734 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0767437068, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.97781462402344 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0771416206, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.97799722900391 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0770484274, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00038246179819 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.077221307, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.98510096435547 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0765412444, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.98506774902344 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0771424082, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.98470732421875 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.077211238, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.98486014404297 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.077226228, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.98475676269531 + } + ] + }, + { + "generated_output": 
"\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0763547346, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.98505590820313 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0769114774, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.98484287109375 + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0783248636, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00036412580013 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0769783986, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.98539520263672 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0773252158, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.98521640625 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n 
__syncthreads();\n if (smallest_local % 2 == 0) {\n return;\n }\n if (smallest_local < *smallest) {\n atomicMin(smallest, smallest_local);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0787654452, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053478199244 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0765942802, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.9794833984375 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0771494038, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.97941965332031 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n\n // Find the smallest odd number in the local workgroup\n __shared__ int smallest_shared;\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n // Reduce the local smallest values to a single global smallest value\n __syncthreads();\n if (tid == 0) {\n smallest_shared = smallest_local;\n for (int i = 1; i < hipBlockDim_x; i++) {\n if (smallest_shared > smallest_local) {\n smallest_shared = smallest_local;\n }\n }\n *smallest = smallest_shared;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0796923054, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.97160509033203 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.076697346, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.97922575683594 + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return 
the value of the smallest odd number in the vector x.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = std::numeric_limits::max();\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < my_smallest) {\n my_smallest = x[i];\n }\n }\n\n int smallest;\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] 
< local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (auto v : x) {\n if (v % 2 == 1) {\n my_smallest = std::min(my_smallest, v);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int min_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int n = x.size();\n int smallest = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "cuda", + "prompt": "/* Find the value of the smallest odd number in the vector x. Store it in smallest.\n Use CUDA to compute in parallel. 
The kernel is launched with the same number of threads as elements in x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\n__global__ void smallestOdd(const int *x, size_t N, int *smallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (threadIdx.x == 0) {\n *smallest = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "(", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (threadIdx.x == 0) {\n *smallest = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n atomicMin(smallest, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = start + block_size;\n int min = INT_MAX;\n for (int i = start; i < end && i < N; i++) {\n if (x[i] % 2 == 1 && x[i] < min) {\n min = x[i];\n }\n }\n smallest[tid] = min;\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < block_size; i++) {\n if (smallest[i] < smallest[0]) {\n smallest[0] = smallest[i];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] < smallest_local && x[i] % 2) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n if (tid == 0 || x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = block.thread.x;\n if (tid >= N) {\n return;\n }\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 
2 == 1) {\n atomicMin(smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1 && (x[tid] < *smallest || *smallest == -1)) {\n *smallest = x[tid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n #pragma omp parallel for reduction(min: my_smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n\n int smallest;\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on this rank\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int result = INT_MAX;\n #pragma omp parallel for reduction(min:result)\n 
for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < result) {\n result = x[i];\n }\n }\n return result;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel\n {\n int local_smallest = INT_MAX;\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_smallest) {\n local_smallest = x[i];\n }\n }\n #pragma omp critical\n if (local_smallest < smallest) {\n smallest = local_smallest;\n }\n }\n return smallest;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on this rank\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Reduce to find the global minimum\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n // Find the smallest odd number on each rank\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int smallest_odd = INT_MAX;\n #pragma omp parallel for reduction(min: smallest_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest_odd) {\n smallest_odd = x[i];\n }\n }\n return smallest_odd;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on each rank\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n #pragma omp parallel for reduction(min:my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = std::numeric_limits::max();\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 != 0) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // 
Find the smallest odd number on this rank\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min : local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n #pragma omp parallel for reduction(min: my_smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n int my_smallest = INT_MAX;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n #pragma omp parallel for reduction(min:my_smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return smallest;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use Kokkos to compute product in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n double product = 1.0;\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (i % 2 == 0) {\n product *= x(i);\n } else {\n product /= x(i);\n }\n });\n return product;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n\n double product = 1.0;\n\n parallel_reduce(\n RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n\n return product;\n}\n", + "\n\n // Create a Kokkos view for the output\n Kokkos::View product = Kokkos::View(1);\n\n // Initialize the product to 1\n Kokkos::parallel_for(\"initialize_product\", Kokkos::RangePolicy(0, 1), KOKKOS_LAMBDA(const int& i) {\n product[i] = 1.0;\n });\n\n // Compute the product in parallel\n Kokkos::parallel_for(\"compute_product\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n if (i % 2 == 1) {\n product[0] *= 1.0 / x[i];\n } else {\n product[0] *= x[i];\n }\n });\n\n // Return the product\n return product[0];\n}\n", + "\n\n using namespace Kokkos;\n\n double product = 1.0;\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double product = 1.0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& result) {\n if (i % 2 == 1) {\n result *= 1.0 / x[i];\n } else {\n result *= x[i];\n }\n });\n\n return result;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, 
double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n },\n product);\n return product;\n}\n", + "\n\n double result = 1.0;\n for (int i = 0; i < x.extent(0); i += 2) {\n result *= x(i);\n }\n for (int i = 1; i < x.extent(0); i += 2) {\n result /= x(i);\n }\n return result;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x(i);\n } else {\n product *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n return product;\n}\n", + "_", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n },\n product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x(i);\n } else {\n product *= x(i);\n }\n }, product);\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. 
x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use OpenMP to compute product in parallel.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n int n = x.size();\n #pragma omp parallel for reduction(*: product)\n for (int i = 0; i < n; i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*: product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n int n = x.size();\n #pragma omp parallel for reduction(*:product)\n for (int i = 0; i < n; i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1.0 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for 
reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i += 2) {\n product *= x[i] / x[i + 1];\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "serial", + "prompt": "/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); 
i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i += 2) {\n product = product * x[i] / (x[i + 1] + 0.000000001);\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n for (size_t i = 1; i < x.size(); i += 2) {\n product /= x[i];\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "&", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "hip", + "prompt": "/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Store the result in product.\n Use AMD HIP to compute product in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\n__global__ void productWithInverses(const double *x, size_t N, double *product) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 1) {\n product[0] *= 1.0 / x[tid];\n } else {\n product[0] *= x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0800234404, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00028818999827 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.079749169, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00028470200002 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0817943698, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00028428580165 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.080432477, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00028751779497 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (idx % 2 == 0) {\n product[0] *= x[idx];\n } else {\n product[0] /= x[idx];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0794381712, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0002893737942 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = 
hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n atomicAdd(product, x[gid]);\n } else {\n atomicAdd(product, 1.0 / x[gid]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0789227702, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00028975800276 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n product[tid] = x[tid];\n } else {\n product[tid] = 1 / x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0784012876, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00024415780157 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 0) {\n product[0] *= x[tid];\n } else {\n product[0] /= x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.079176009, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0002785899967 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double prod = 1.0;\n if (gid % 2 == 0) {\n prod *= x[gid];\n } else {\n prod /= x[gid];\n }\n\n hipAtomicAdd(product, prod);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0795201558, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00028367779553 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + 
"is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0793783184, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00028537380099 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.079641131, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00028214180171 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0801421238, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00028025399745 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 1) {\n product[0] *= 1.0 / x[tid];\n } else {\n product[0] *= x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0795280406, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00028156579435 + } + ] + }, + { + "generated_output": ".", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 0) {\n atomicAdd(product, x[tid]);\n } else {\n atomicAdd(product, 1.0 / x[tid]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": 
"cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use MPI to compute product in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the product on all ranks.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a new vector to store the product\n std::vector product(n);\n\n // Initialize the product to 1\n product[0] = 1;\n\n // Compute the product in parallel\n for (int i = 1; i < n; i++) {\n if (i % 2 == 1) {\n product[i] = product[i - 1] * (1 / x[i]);\n } else {\n product[i] = product[i - 1] * x[i];\n }\n }\n\n // Return the product\n return product[n - 1];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD,\n MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return 
global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i 
= rank; i < x.size(); i += size) {\n if (i % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Reduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 0, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "cuda", + "prompt": "/* Compute the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Store the result in product.\n Use CUDA to compute product in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\n__global__ void productWithInverses(const double *x, size_t N, double *product) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n *product *= x[gid];\n } else {\n *product /= x[gid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n product[0] *= x[gid];\n } else {\n product[0] /= x[gid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n product[0] *= x[gid];\n } else {\n product[0] /= x[gid];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n product[idx] = 1.0;\n for (int i = 0; i < N; i += 2) {\n product[idx] *= x[i] / x[i + 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n product[0] *= x[idx];\n } else {\n product[0] /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n if (i % 2 == 1) {\n product[0] *= 1.0 / x[i];\n } else {\n product[0] *= x[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int 
tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n *product *= x[gid];\n } else {\n *product /= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use MPI and OpenMP to compute product in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the product on all ranks.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*: local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n#pragma omp parallel for reduction(* : local_product)\n for (int i = start_index; i < end_index; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Reduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 0, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 
MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*: local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size();\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < local_size; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n #pragma omp parallel for reduction(* : local_product)\n for (int i = start_index; i < end_index; i++) {\n if ((i + 1) % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n double product = 1.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the product of the even indexed elements\n #pragma omp parallel for reduction(*: product)\n for (int i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n\n // Compute the product of the odd indexed elements\n double oddProduct = 1.0;\n #pragma omp parallel for reduction(*: oddProduct)\n for (int i = 1; i < x.size(); i += 2) {\n oddProduct *= 1.0 / x[i];\n }\n\n // Reduce the odd product across all ranks\n double globalOddProduct;\n MPI_Allreduce(&oddProduct, &globalOddProduct, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n // Multiply the even and odd products to get the final result\n product *= globalOddProduct;\n\n return product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for reduction(*:local_product)\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size();\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < local_size; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n double product = 1.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the product of the even indexed elements\n #pragma omp parallel for 
reduction(*:product)\n for (int i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n\n // Compute the product of the odd indexed elements\n double oddProduct = 1.0;\n #pragma omp parallel for reduction(*:oddProduct)\n for (int i = 1; i < x.size(); i += 2) {\n oddProduct *= 1.0 / x[i];\n }\n\n // Reduce the odd product across all ranks\n double globalOddProduct;\n MPI_Allreduce(&oddProduct, &globalOddProduct, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n // Multiply the even and odd products\n product *= globalOddProduct;\n\n return product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for reduction(*:local_product)\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use Kokkos to sum in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(Kokkos::View const& x, Kokkos::View const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Create a Kokkos view to store the minimum values at each index\n Kokkos::View min_values(\"min_values\", x.extent(0));\n\n // Compute the minimum values at each index in parallel\n Kokkos::parallel_for(\"min_values_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n min_values[i] = std::min(x[i], y[i]);\n });\n\n // Reduce the minimum values in parallel to get the sum\n double sum = Kokkos::parallel_reduce(\"sum_min_values_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += min_values[i];\n }, 0.0);\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Min;\n\n auto min_values = Kokkos::parallel_reduce(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, Reducer& red) {\n red.init(std::min(x(i), y(i)));\n });\n\n return min_values.get_value();\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMemberFunction = Kokkos::TeamMemberFunction;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::RangePolicy>(0, x.extent(0))),\n TeamMemberFunction([&](const int& i) -> double { return std::min(x(i), y(i)); }),\n 0.0);\n return sum;\n}\n", + "\n\n using execution_space = Kokkos::DefaultExecutionSpace;\n using l_double = Kokkos::LayoutRight;\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n\n Kokkos::View min_values(\"min_values\", x.extent(0));\n\n Kokkos::parallel_scan(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update = std::min(x(i), y(i));\n }, min_values);\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), 
KOKKOS_LAMBDA(const int& i, double& update) {\n update += min_values(i);\n }, sum);\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_left;\n using Reducer = Kokkos::Min;\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, Reducer& sum) {\n sum += std::min(x(i), y(i));\n });\n\n return sum.template get<0>();\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_left;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = typename TeamPolicy::member_type;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::DefaultExecutionSpace(), x.extent(0)), KOKKOS_LAMBDA(const TeamMember& team, double& sum) {\n const auto i = team.league_rank();\n sum += std::min(x(i), y(i));\n });\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = Kokkos::TeamMember;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::RangePolicy>(0, x.extent(0))),\n KOKKOS_LAMBDA(TeamMember, sum, x, y) {\n sum += std::min(x(TeamMember::team_rank()), y(TeamMember::team_rank()));\n });\n return sum;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n const auto min_xy = parallel_reduce(\n RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& sum) {\n sum += std::min(x(i), y(i));\n },\n 0.0);\n\n return min_xy;\n}\n", + "\n\n Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::make_span(x.data(), x.size()), 0.0,\n [](double a, double b) { return std::min(a, b); },\n [](double a, double b) { return a + b; });\n}\n", + "\n\n using namespace Kokkos;\n double sum = 0;\n auto min_lambda = KOKKOS_LAMBDA(const double& a, const double& b) { return std::min(a, b); };\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += min_lambda(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n // Create a Kokkos view of the sum\n Kokkos::View sum(\"sum\", x.extent(0));\n\n // Parallel loop to compute the sum of the minimum values\n Kokkos::parallel_for(\"sum_of_minimum_elements\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n sum(i) = std::min(x(i), y(i));\n });\n\n // Reduce the sum in parallel\n double sum_value = Kokkos::parallel_reduce(\n \"sum_of_minimum_elements_reduce\", Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& sum) { sum += sum(i); }, 0.0);\n\n return sum_value;\n}\n", + "Kokkoss", + "\n\n const int numElements = x.extent(0);\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::Range(0, numElements), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n auto min_xy = Kokkos::min(x, y);\n auto sum = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, min_xy.extent(0)),\n KOKKOS_LAMBDA(const int& i) -> double { return min_xy(i); },\n (double)0);\n return sum;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_lambda = 
KOKKOS_LAMBDA(const double& x, const double& y) { return std::min(x, y); };\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += min_lambda(x(i), y(i));\n }, sum);\n\n return sum;\n}\n", + "\n\n const int N = x.extent(0);\n double sum = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n auto min_values = min(x, y);\n double sum = 0;\n Kokkos::parallel_reduce(policy_type(0, min_values.extent(0)), KOKKOS_LAMBDA(int i, double& sum) {\n sum += min_values(i);\n }, sum);\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use OpenMP to sum in parallel.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n double sum = 0;\n\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + ".", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 
0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma openmp parallel for reduction (+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "serial", + "prompt": "/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i 
= 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "hip", + "prompt": "/* Compute the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Store the result in sum.\n Use AMD HIP to sum in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\n__global__ void sumOfMinimumElements(const double *x, const double *y, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int idx = hipGetGlobalIndex();\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n sum[tid] = min(x[gid], y[gid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n sum[gid] = min(x[gid], y[gid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n sum[tid] = min(x[tid], y[tid]);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use MPI to sum in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Return the sum on all ranks.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; 
i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double sum = 0;\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n 
MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "cuda", + "prompt": "/* Compute the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Store the result in sum.\n Use CUDA to sum in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\n__global__ void sumOfMinimumElements(const double *x, const double *y, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ", Synagogues_", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[0] += fmin(x[tid], y[tid]);\n }\n}\n", + "{", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n sum[tid] = fmin(x[tid], y[tid]);\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n sum[tid] = fmin(x[i], y[i]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use MPI and OpenMP to sum in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Return the sum on all ranks.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+ : local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n double sum = 0.0;\n int n = x.size();\n #pragma opm parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n double sum = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int chunk = n / nprocs;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == nprocs - 1) {\n end = n;\n }\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+ : local_sum)\n for (int i = start; i < end; i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, 
MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n int n = x.size();\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < n; i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n double sum = 0.0;\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int num_elements = x.size();\n int elements_per_rank = num_elements / num_ranks;\n int start_index = my_rank * elements_per_rank;\n int end_index = start_index + elements_per_rank;\n\n for (int i = start_index; i < end_index; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n double sum = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double sum;\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel 
for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::sum;\n\n const int N = x.extent(0);\n\n double sum_x = 0.0;\n parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) { sum += x(i); }, sum_x);\n\n return sum_x / N;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n double sum = 0;\n parallel_reduce(RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); }, sum);\n\n double count = x.extent(0);\n return sum / count;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n using namespace Kokkos;\n\n double sum = 0;\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n auto sum = Kokkos::parallel_reduce(Policy(0, x.extent(0)), Reducer(0.0),\n KOKKOS_LAMBDA(const int& i, Reducer& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n using namespace Kokkos;\n double sum = 0;\n auto policy = Kokkos::RangePolicy<>(0, x.size());\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.size();\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n using l_double = Kokkos::LDDouble;\n\n auto sum = Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, l_double& sum) {\n sum += x(i);\n },\n l_double(0.0));\n\n return static_cast(sum) / x.extent(0);\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n policy_type policy(x.extent(0));\n\n double sum = 0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n const auto sum = 
Kokkos::parallel_reduce(Policy(0, x.size()), Reducer(0.0),\n KOKKOS_LAMBDA(const int& i, const Reducer& sum) {\n return sum + x(i);\n });\n\n return sum / x.size();\n}\n", + "\n\n typedef Kokkos::RangePolicy policy_type;\n policy_type policy(x.extent(0));\n\n double sum = 0.0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n return Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::Kok\u30b3\u30af_EXEC_SPACE, x, 0.0, Kok\u30b3\u30af::plus(), Kok\u30b3\u30af::divide(x.size()));\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n auto sum = parallel_reduce(\n RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); }, 0.0);\n\n return sum / x.size();\n}\n", + "\n\n using execution_space = typename Kokkos::View::execution_space;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto sum = Kokkos::parallel_reduce(\n policy_type(x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n },\n (double)0.0, tag_type());\n\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.size();\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n double sum = 0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(int i, double& sum) { sum += x(i); }, sum);\n return sum / x.size();\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using reduction_type = Kokkos::Sum;\n\n auto sum = Kokkos::parallel_reduce(policy_type(0, x.size()), reduction_type(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); });\n\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use OpenMP to compute in parallel.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma 
omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "serial", + "prompt": "/* Return the average of the vector x.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (double n : x) {\n sum += n;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / 
x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double num : x) {\n sum += num;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "hip", + "prompt": "/* Compute the average of the vector x. Store the result in average.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Examples:\n \n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\n__global__ void average(const double *x, size_t N, double *average) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (size_t i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double shared_sum;\n shared_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = shared_sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0089884986, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00037628579736 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (int i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double block_sum;\n block_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = block_sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": 
true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0089749954, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00037971000373 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = get_global_id(0);\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n atomicAdd(average, x[idx]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n size_t idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum += x[idx];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *average = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.00906509, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00037820599973 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (int i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double sharedSum;\n sharedSum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = sharedSum / N;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0098392328, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00036454200149 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n average[0] += x[gid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.009047966, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00044991780519 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = block.x;\n if (tid == 0) {\n *x = 0;\n for (int i = 0; i < N; i++) {\n *x += x[i];\n }\n *x = *x / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": 
null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n double sum = 0;\n for (int i = tid; i < N; i += blockDim.x) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0090330352, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 2.1967798828125 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the average on all ranks.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg = sum / x.size();\n\n double global_avg;\n MPI_Allreduce(&avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n\n return global_avg;\n}\n", + "\n\n double sum = 0.0;\n for (double xi : x) {\n sum += xi;\n }\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0.0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / size;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double 
avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (auto xi : x) {\n sum += xi;\n }\n double avg = sum / x.size();\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (auto v : x) {\n sum += v;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg = sum / x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / x.size();\n double global_avg;\n MPI_Allreduce(&avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n return global_avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double local_sum = 0;\n for (double xi : x) {\n local_sum += xi;\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "cuda", + "prompt": "/* Compute the average of the vector x. Store the result in average.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\n__global__ void average(const double *x, size_t N, double *average) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n average[0] += x[idx];\n }\n}\n", + "\n\n double sum = 0;\n for (size_t i = block_idx.x * block_dim.x + thread_idx.x; i < N; i += block_dim.x * grid_dim.x) {\n sum += x[i];\n }\n * average = sum / N;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n __shared__ double sum;\n sum = 0;\n if (i < N) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n\n double sum = 0;\n for (int i = tid; i < N; i += bid_size * grid_size) {\n sum += x[i];\n }\n\n __shared__ double shared_sum;\n shared_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = shared_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n double sum = 0;\n for (int i = tid + bid * bid_size; i < N; i += grid_size * bid_size) {\n sum += x[i];\n }\n double avg = sum / N;\n *average = avg;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n __shared__ double sum;\n if (i < N) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n double sum = 0;\n if (tid < N) {\n sum = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n double sum_all = 0;\n for (int i = 0; i < gridDim.x; i++) {\n sum_all += sum;\n }\n *average = sum_all / N;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + 
"parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the average of the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the average on all ranks.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / size;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_avg = 0.0;\n #pragma omp parallel for reduction(+:local_avg)\n for (int i = 0; i < x.size(); ++i) {\n local_avg += x[i];\n }\n local_avg /= x.size();\n\n double global_avg;\n MPI_Allreduce(&local_avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n\n return global_avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a sum variable to store the sum of the elements in the vector\n double sum = 0.0;\n\n // Use OpenMP to parallelize the loop over the elements in the vector\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n // Add the current element to 
the sum\n sum += x[i];\n }\n\n // Get the average by dividing the sum by the number of elements\n double avg = sum / n;\n\n // Return the average\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n double local_avg = 0;\n int local_size = x.size();\n double sum = 0;\n\n#pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < local_size; i++) {\n sum += x[i];\n }\n\n local_avg = sum / local_size;\n\n double global_avg = 0;\n MPI_Allreduce(&local_avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= MPI_Comm_size(MPI_COMM_WORLD);\n\n return global_avg;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n", + "\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / x.size();\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n double global_sum = 0;\n\n #pragma omp parallel for reduction(+: local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n return sum / n;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n" + ] + }, + 
{ + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use Kokkos to reduce in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n auto xor_reduce = reduce(x, (bool)false, [](bool a, bool b) { return a ^ b; });\n return xor_reduce;\n}\n", + "\n\n // Create a Kokkos::TeamPolicy for the reduction.\n Kokkos::TeamPolicy policy(x.extent(0));\n\n // Define the reduction functor.\n struct XORFunctor {\n bool result = false;\n\n KOKKOS_INLINE_FUNCTION\n void operator()(const bool& x) const { result ^= x; }\n };\n\n // Create a Kokkos::ParallelReduce object to perform the reduction.\n Kokkos::ParallelReduce reduce(result);\n\n // Invoke the reduction.\n Kokkos::parallel_for(policy, reduce);\n\n // Return the result.\n return reduce.result;\n}\n", + "\n\n bool result = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& result) { result ^= x(i); }, result);\n return result;\n}\n", + "\n\n using namespace Kokkos;\n\n bool result = false;\n auto policy = Kokkos::RangePolicy(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n }, result);\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy>;\n using TeamMemberType = typename Policy::member_type;\n\n bool result = false;\n\n Kokkos::parallel_reduce(Policy(1), KOKKOS_LAMBDA(const TeamMemberType& team) {\n bool local_result = false;\n const bool* x_data = x.data();\n for (int i = team.league_rank(); i < x.size(); i += team.league_size()) {\n local_result ^= x_data[i];\n }\n team.team_reduce(KOKKOS_LAMBDA(const bool& a, const bool& b) { return a ^ b; }, local_result);\n }, result);\n\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x[i];\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x[i];\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy;\n using TeamMemberType = typename Policy::member_type;\n\n auto reduce = KOKKOS_LAMBDA(TeamMemberType team) {\n bool local_result = false;\n for (int i = team.league_rank(); i < x.extent(0); i += team.league_size()) {\n local_result ^= x(i);\n }\n return local_result;\n };\n\n bool global_result = Kokkos::parallel_reduce(Policy(x.extent(0)), reduce);\n return global_result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n return 
Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(x), Kokkos::Scan);\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i) {\n result ^= x(i);\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(false), x);\n}\n", + "\n\n using namespace Kokkos;\n bool result = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& result) {\n if (x(i)) {\n result = !result;\n }\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy;\n using Reducer = Kokkos::LogicalXor;\n\n return parallel_reduce(Policy(x.extent(0)), Reducer(false), x);\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(false),\n KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n}\n", + "\n\n using namespace Kokkos;\n\n auto reduce_functor = KOKKOS_LAMBDA(const bool& a, const bool& b) {\n return a ^ b;\n };\n\n auto reduce_result = reduce(x, reduce_functor, false);\n\n return reduce_result;\n}\n", + "\n\n bool result = x(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(1, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x(i);\n });\n return result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use OpenMP to reduce in parallel.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n {\n for (auto const& xi : x) {\n result ^= xi;\n }\n }\n return result;\n}\n", + ")", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(+:result)\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= 
x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel for reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "serial", + "prompt": "/* Return the logical XOR reduction of the vector of bools x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "(", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + ")", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n 
for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "hip", + "prompt": "/* Compute the logical XOR reduction of the vector of bools x. Store the result in output.\n Use AMD HIP to reduce in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\n__global__ void reduceLogicalXOR(const bool *x, size_t N, bool *output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n bool local_result = x[gid];\n __syncthreads();\n while (blockDim.x <= N) {\n if (tid == 0) {\n local_result ^= x[gid + blockDim.x];\n }\n __syncthreads();\n blockDim.x *= 2;\n __syncthreads();\n }\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int size = blockDim.x;\n bool local_result = x[bid * size + tid];\n __syncthreads();\n for (int i = size / 2; i > 0; i /= 2) {\n if (tid < i) {\n local_result ^= x[bid * size + tid + i];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + 
"generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int warp_size = warpSize;\n int num_warps = bid_size / warp_size;\n int warp_id = tid / warp_size;\n int lane_id = tid % warp_size;\n bool warp_result = x[bid * bid_size + tid];\n for (int i = 1; i < warp_size; i++) {\n warp_result ^= x[bid * bid_size + tid + i];\n }\n __shared__ bool shared_result[num_warps];\n shared_result[warp_id] = warp_result;\n __syncthreads();\n if (warp_id == 0) {\n bool result = shared_result[0];\n for (int i = 1; i < num_warps; i++) {\n result ^= shared_result[i];\n }\n *output = result;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n bool local_result = x[tid];\n __syncthreads();\n while (blockDim.x <= N) {\n if (tid < N) { local_result ^= x[tid + blockDim.x]; }\n __syncthreads();\n blockDim.x *= 2;\n __syncthreads();\n }\n if (tid == 0) { *output = local_result; }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n bool local_result = x[gid];\n\n __syncthreads();\n\n while (bid > 0) {\n if (tid == 0) {\n local_result ^= x[gid + hipBlockDim_x];\n }\n bid >>= 1;\n gid >>= 1;\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n bool local_result = x[gid];\n\n while (gid < N) {\n local_result ^= x[gid];\n gid += hipBlockDim_x;\n }\n\n output[tid] = local_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + 
"did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n bool result = false;\n while (gid < N) {\n result ^= x[gid];\n gid += hipBlockDim_x * hipGridDim_x;\n }\n output[tid] = result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n output[0] ^= output[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use MPI to reduce in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on all ranks.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = false;\n for (bool b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool result = x[rank];\n MPI_Allreduce(&result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return 
global_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n MPI_Allxor(&result, &result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = x[0];\n for (size_t i = 1; i < x.size(); i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "cuda", + "prompt": "/* Compute the logical XOR reduction of the vector of bools x. Store the result in output.\n Use CUDA to reduce in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\n__global__ void reduceLogicalXOR(const bool *x, size_t N, bool *output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Each thread reduces a single element of the output.\n int tid = threadIdx.x;\n output[tid] = x[tid];\n __syncthreads();\n\n // Reduce the output in place.\n while (tid < N) {\n output[tid] ^= output[tid + blockDim.x];\n tid += blockDim.x * 2;\n __syncthreads();\n }\n}\n", + "\n\n // Each thread reduces a single element of the output\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the output in place\n for (int stride = 1; stride < N; stride *= 2) {\n for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < N; tid += 2 * stride) {\n output[tid] ^= output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n // Each thread reduces a subset of the input.\n int tid = threadIdx.x;\n int stride = blockDim.x;\n bool local_result = false;\n for (int i = tid; i < N; i += stride) {\n local_result ^= x[i];\n }\n\n // Reduce the local results using a shared memory reduction.\n __shared__ bool shared_result[blockDim.x];\n shared_result[tid] = local_result;\n __syncthreads();\n\n // The first thread writes the final result to output.\n if (tid == 0) {\n bool final_result = false;\n for (int i = 0; i < blockDim.x; ++i) {\n final_result ^= shared_result[i];\n }\n *output = final_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n // Each thread reduces a single element of the output\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the output in place\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid] ^= output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncbarriers();\n while (N > 1) {\n if (tid < N / 2) {\n output[tid] ^= output[tid + N / 2];\n }\n __syncbarriers();\n N /= 2;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = 
blockDim.x;\n bool result = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n result ^= x[i];\n }\n output[bid] = result;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n bool result = x[bid * stride + tid];\n __syncthreads();\n while (stride > 1) {\n if (tid < stride) {\n result ^= x[bid * stride + tid + stride];\n }\n stride /= 2;\n __syncthreads();\n }\n if (tid == 0) {\n *output = result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int num_elements = N;\n int num_blocks = (num_elements + bid_size - 1) / bid_size;\n\n bool local_result = x[tid];\n __syncthreads();\n while (bid_size > 1) {\n if (tid < bid_size / 2) {\n local_result ^= x[tid + bid_size / 2];\n }\n bid_size /= 2;\n __syncthreads();\n }\n if (bid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n // Each thread reduces a single element of the vector\n int tid = threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the vector in parallel\n while (N > 1) {\n if (tid < N / 2) {\n output[tid] ^= output[tid + N / 2];\n }\n N /= 2;\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = blockDim.x; i < N; i += blockDim.x) {\n output[0] ^= output[i];\n }\n }\n}\n", + "\n\n // Each thread reduces a single element of the output.\n int idx = threadIdx.x;\n output[idx] = x[idx];\n\n // Each thread reduces a pair of elements of the output.\n while (idx < N) {\n output[idx] ^= x[idx + 1];\n idx += 2;\n }\n\n // Reduce the remaining elements of the output.\n for (int i = 1; i < N; i *= 2) {\n for (int j = 0; j < N; j += 2 * i) {\n output[j] ^= output[j + i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n bool local_result = x[tid];\n\n for (int i = tid + stride; i < N; i += stride) {\n local_result ^= x[i];\n }\n\n output[bid] = local_result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int num_blocks = gridDim.x;\n\n bool local_result = x[tid];\n\n __syncthreads();\n\n while (bid_size > 1) {\n if (tid < bid_size / 2) {\n local_result = local_result ^ x[tid + bid_size / 2];\n }\n bid_size /= 2;\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n bool result = x[bid * stride + tid];\n\n __syncthreads();\n\n for (int offset = stride; offset < N; offset *= 2) {\n if (tid < offset) {\n result ^= x[bid * stride + tid + offset];\n }\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x * gridDim.x;\n\n bool local_result = x[tid + bid * stride];\n\n for (int i = 2 * stride; i <= N; i *= 2) {\n if (tid < N / i) {\n local_result ^= 
x[tid + bid * i];\n }\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use MPI and OpenMP to reduce in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(^: local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int nranks;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n\n bool local_result = false;\n for (auto b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Reduce on each rank\n bool local_result = false;\n #pragma omp parallel for reduction(^:local_result)\n for (size_t i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n // Reduce across all ranks\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n\n int nranks;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n\n bool local_result = false;\n for (bool b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR,\n MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for 
reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n }\n\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int nranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(+:local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n #pragma omp parallel for reduction(^:result)\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool local_result = x[my_rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + ".", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool local_result = x[my_rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n\n bool result;\n MPI_Reduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, 0, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n bool result = x[0];\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (int i = 1; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = false;\n #pragma omp parallel for reduction(^:local_result)\n for (size_t i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n // Reduce the vector of bools using MPI_Reduce.\n bool result;\n MPI_Allreduce(MPI_IN_PLACE, &x[0], x.size(), 
MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n // Return the result.\n return result;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the array x into output.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(Kokkos::View const& x, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA (const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int i) {\n output(i) = 0;\n for (int j = i; j >= 0; j--) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n output(i) = 0;\n for (int j = i; j < n; ++j) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::View temp = output.slice(0, output.size() - 1);\n Kok\u30b3\u30af::View temp2 = output.slice(1, output.size());\n Kok\u30b3\u30af::View x_rev = x.reversed();\n Kok\u30b3\u30af::View temp3 = x_rev.slice(1, x_rev.size());\n Kok\u30b3\u30af::View temp4 = x_rev.slice(0, x_rev.size() - 1);\n Kok\u30b3\u30af::View temp5 = temp4 + temp3;\n temp = temp5;\n temp2 = temp5 + temp;\n\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n output(i) = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, i),\n KOKKOS_LAMBDA(const int& j, const int& val) { return val + x(i - j - 1); },\n 0);\n });\n}\n", + "(", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(\"reversePrefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i, int& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = x(i) + output(i - 1);\n }\n },\n output(n - 1));\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i) { output(i) = output(n - 1) - output(i); });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i, int& sum) {\n sum = (i == 0) ? 
x(i) : sum + x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n // Compute the reverse prefix sum\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n Kokkos::ScanSum(Kokkos::Reverse));\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n output(N - 1));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n output(i) = x(x.extent(0) - 1 - i);\n if (i > 0) output(i) += output(i - 1);\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA (const int i) {\n output(i) = 0;\n for (int j = n - 1; j >= i; j--) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n output(i) = (i == 0) ? x(i) : x(i) + output(i - 1);\n });\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Tag = Kokkos::TagType;\n\n Kokkos::parallel_for(Policy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n output(i) = x(x.extent(0) - 1 - i);\n if (i > 0) output(i) += output(i - 1);\n }, Tag());\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n 0);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use OpenMP to compute in parallel.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n 
for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n output[i] = x[i];\n } else {\n output[i] = output[i + 1] + x[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n output[i] = 0;\n for (int j = n - 1; j >= i; --j) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n output[i] = 0;\n for (int j = i; j < n; j++) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "serial", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 
1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output.back() = x.back();\n for (int i = x.size() - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n for (int i = x.size() - 1; i >= 0; --i) {\n if (i + 1 < x.size()) {\n output[i] = x[i] + output[i + 1];\n } else {\n output[i] = x[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output.back() = x.back();\n for (int i = x.size() - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\n__global__ void reversePrefixSum(const int *x, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n if (tid > 0) {\n output[tid] += output[tid - 1];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.013464959, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00030041460991 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0135125562, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00033065439761 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n output[gid] = x[N - 1 - gid];\n if (gid > 0) { output[gid] += output[gid - 1]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.01340837, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00026470320523 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0133002272, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00032825480402 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[N - 1 - gid] = x[gid];\n }\n __syncthreads();\n for (int i = N - 2; i >= 0; i--) {\n if (gid < N) {\n output[i] = output[i + 1] + x[gid];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, 
+ "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0133014904, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00024006360471 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0133417618, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0002535679996 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0134285848, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00024659160227 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n int i = tid + bid * blockSize;\n if (i < N) {\n output[i] = 0;\n for (int j = N - 1; j >= i; j--) { output[i] += x[j]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = blockDim.x / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0135931718, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00023503839821 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0134655482, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00020745459795 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int offset = bid * hipBlockDim_x + tid;\n if (offset < N) {\n output[offset] = x[N - 1 - offset];\n if (offset > 0) {\n output[offset] += output[offset - 1];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0134653944, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00022025460452 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < 
i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.013511805, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00030166399777 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.01302965, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00022316799313 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0132866666, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00026102398783 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0134020046, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00031491199732 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0126516376, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00019923159927 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0134275442, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00021916799396 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0135124514, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00023174399436 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n 
__syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0133566512, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00033283219934 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum in reverse order\n std::vector prefix_sum(x.size());\n prefix_sum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n prefix_sum[i] = prefix_sum[i + 1] + x[i];\n }\n\n // Gather the prefix sums from all ranks\n std::vector gathered_prefix_sums(x.size() * size);\n MPI_Gather(prefix_sum.data(), x.size(), MPI_INT, gathered_prefix_sums.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gathered_prefix_sums[i * size + size - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_sum = 0;\n for (int i = n - 1; i >= 0; i--) {\n local_sum += x[i];\n if (rank == 0) {\n output[i] = local_sum;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n, 0);\n std::vector global_sums(n, 0);\n\n // Compute the partial sums on each rank\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n if (i > 0) {\n partial_sums[i] += partial_sums[i - 1];\n }\n }\n\n // Gather the partial sums from all ranks\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n global_sums[i] += global_sums[i + 1];\n }\n output[i] = global_sums[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.reserve(n);\n std::iota(std::execution::par, output.begin(), output.end(), 0);\n std::transform(std::execution::par, x.rbegin(), x.rend(), output.rbegin(),\n [](int a, int b) { return a + b; });\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n std::vector prefix_sum(n);\n 
MPI_Allreduce(x.data(), prefix_sum.data(), n, MPI_INT, MPI_SUM, comm);\n\n if (rank == 0) {\n output.resize(n);\n for (int i = 0; i < n; i++) {\n output[i] = prefix_sum[n - 1 - i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n local_output[i] = x[i];\n } else {\n local_output[i] = x[i] + local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n for (int i = 0; i < n; i++) {\n if (i == 0) {\n output[i] = global_output[i];\n } else {\n output[i] = global_output[i] - global_output[i - 1];\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n for (int i = n - 1; i >= 0; i--) {\n if (rank == 0) {\n output[i] = local_x[i];\n }\n MPI_Allreduce(MPI_IN_PLACE, &local_x[i], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n local_output[i] = output[i];\n }\n }\n if (rank == 0) {\n output = local_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_sum = 0;\n for (int i = x.size() - 1; i >= 0; i--) {\n local_sum += x[i];\n if (rank == 0) {\n output[i] = local_sum;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector global_sums(n);\n\n // Compute the partial sums on each rank\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n if (i > 0) {\n partial_sums[i] += partial_sums[i - 1];\n }\n }\n\n // Gather the partial sums from all ranks\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n global_sums[i] += global_sums[i + 1];\n }\n output[i] = global_sums[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector partial_sums(n);\n std::partial_sum(x.rbegin(), x.rend(), partial_sums.rbegin());\n\n std::vector global_sums(n);\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n local_output[i] = x[i];\n } else {\n local_output[i] = x[i] + local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Reverse the global prefix sum\n for (int i = 0; i < n / 2; i++) {\n std::swap(global_output[i], global_output[n - i - 1]);\n }\n\n // Store the result in output on rank 0\n if (rank == 0) {\n 
output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the reverse prefix sum\n std::vector reverse_prefix_sum(x.size());\n reverse_prefix_sum[x.size() - 1] = prefix_sum[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] - x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n output = reverse_prefix_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefix_sum(n);\n std::vector reverse_prefix_sum(n);\n\n // Compute the prefix sum\n prefix_sum[0] = x[0];\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the reverse prefix sum\n reverse_prefix_sum[n - 1] = prefix_sum[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] + prefix_sum[i];\n }\n\n // Gather the reverse prefix sum on rank 0\n MPI_Gather(&reverse_prefix_sum[0], n, MPI_INT, &output[0], n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n local_output[i] = x[i];\n if (i + 1 < n) {\n local_output[i] += local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i + 1 < n) {\n global_output[i] -= global_output[i + 1];\n }\n }\n\n // Store the result on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector global_sums(n);\n\n // Compute the partial sums on each rank\n partial_sums[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n partial_sums[i] = partial_sums[i + 1] + x[i];\n }\n\n // Gather the partial sums into the global_sums vector on rank 0\n MPI_Gather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n output[0] = global_sums[0];\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + global_sums[n - i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x in reverse order\n std::vector reversePrefixSum(x.size());\n reversePrefixSum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reversePrefixSum[i] = reversePrefixSum[i + 1] + x[i];\n }\n\n // Gather the reverse prefix sums from all ranks\n std::vector gatheredReversePrefixSum(x.size() * size);\n MPI_Gather(reversePrefixSum.data(), x.size(), MPI_INT, gatheredReversePrefixSum.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the final reverse prefix sum\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = 
gatheredReversePrefixSum[i * size];\n }\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Compute the prefix sum of the vector x in reverse order.\n std::vector reverse_prefix_sum(x.size());\n reverse_prefix_sum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] + x[i];\n }\n\n // Gather the reverse prefix sums from all ranks.\n std::vector gathered_reverse_prefix_sums(x.size() * num_ranks);\n MPI_Gather(reverse_prefix_sum.data(), x.size(), MPI_INT, gathered_reverse_prefix_sums.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the output vector.\n if (my_rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gathered_reverse_prefix_sums[i * num_ranks + my_rank];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n for (int i = 0; i < n; i++) {\n local_output[i] = x[n - i - 1];\n }\n\n std::vector global_output(n);\n MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector recvbuf(n);\n std::vector sendbuf(n);\n for (int i = 0; i < n; i++) {\n sendbuf[i] = x[n - i - 1];\n }\n MPI_Allgather(sendbuf.data(), n, MPI_INT, recvbuf.data(), n, MPI_INT, MPI_COMM_WORLD);\n int sum = 0;\n for (int i = 0; i < n; i++) {\n sum += recvbuf[i];\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\n__global__ void reversePrefixSum(const int *x, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (int j = block_size / 2; j > 0; j /= 2) {\n if (tid < j) {\n output[tid] += output[tid + j];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n for (int i = tid + 1; i < N; i++) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n int offset = bid * stride;\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i + offset];\n output[i + offset] = sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N - 1; i >= 0; i--) {\n if (tid == i) {\n output[i] += output[i + 1];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid == 0) {\n output[0] = x[0];\n } else {\n output[tid] = output[tid - 1] + x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int i = blockIdx.x * blockDim.x + tid;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (int j = i + blockDim.x; j < N; j += blockDim.x) {\n output[j] = output[j - blockDim.x] + x[j];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int sum = 0;\n for (int i = tid; i < N; i += stride) {\n sum += x[i];\n output[N - 1 - i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n output[i] = 0;\n for (int j = i + 1; j < N; j++) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n int offset = bid * stride;\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i + offset];\n output[i + offset] = sum;\n }\n}\n", + "\n\n 
int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n if (i > 0) { output[i] += output[i - 1]; }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n int offset = block * stride + tid;\n\n if (offset < N) {\n output[offset] = x[offset];\n }\n\n __syncthreads();\n\n for (int i = offset + stride; i < N; i += stride) {\n output[i] = output[i - stride] + x[i];\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n if (tid == 0) {\n output[0] = x[0];\n } else {\n output[tid] = output[tid - 1] + x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_size * block_id + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int stride = blockDim * gridDim;\n int start = tid + blockId * blockDim;\n int end = min(start + stride, N);\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector reverse_partial_sums(n);\n\n // Compute the partial sums in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n partial_sums[i] = x[i];\n for (int j = i + 1; j < n; ++j) {\n partial_sums[j] += partial_sums[j - 1];\n }\n }\n\n // Compute the reverse partial sums in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; --i) {\n reverse_partial_sums[i] = partial_sums[i];\n if (i > 0) {\n reverse_partial_sums[i] += reverse_partial_sums[i - 1];\n }\n }\n\n // Gather the reverse partial sums on rank 0\n MPI_Gather(&reverse_partial_sums[0], n, MPI_INT, &output[0], n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local prefix sum\n std::vector local_prefix_sum(n);\n local_prefix_sum[0] = x[0];\n for (int i = 1; i < n; i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n\n // Compute the global prefix sum\n std::vector global_prefix_sum(n);\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n if (rank == 0) {\n output.resize(n);\n output[n - 1] = global_prefix_sum[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = global_prefix_sum[i] - global_prefix_sum[i + 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n 
std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_output[i] = local_x[n - 1 - i];\n }\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n std::vector local_prefix_sum(x.size());\n local_prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n\n // Compute the global prefix sum\n std::vector global_prefix_sum(x.size());\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), x.size(), MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = global_prefix_sum[x.size() - 1] - global_prefix_sum[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int n = x.size();\n std::vector partial_sums(n);\n\n // Compute the partial sums in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n partial_sums[j] += partial_sums[j - 1];\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sums(n);\n MPI_Reduce(partial_sums.data(), global_sums.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum on rank 0\n if (my_rank == 0) {\n output.resize(n);\n output[n - 1] = global_sums[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = global_sums[i] - global_sums[i + 1];\n }\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local reverse prefix sum\n std::vector local_output(n);\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (my_rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n std::vector global_output(n);\n MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the prefix sum in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_output[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n local_output[i] += x[j];\n }\n }\n\n // Reduce the local prefix sums to the global prefix sum on rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Reverse the prefix sum on rank 0\n if (rank == 0) {\n for (int i = 0; i < n / 2; i++) {\n std::swap(output[i], output[n - i - 1]);\n }\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n prefix_sum[i] += prefix_sum[i + 1];\n }\n }\n\n // Gather the results on rank 0\n if (my_rank == 0) {\n output = prefix_sum;\n }\n}\n", + "\n\n int n = x.size();\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &n_procs);\n int local_n = n / n_procs;\n int remainder = n % n_procs;\n if (my_rank == 0) {\n output.resize(n);\n }\n std::vector local_x(local_n + (my_rank < remainder ? 1 : 0));\n MPI_Scatter(x.data(), local_n, MPI_INT, local_x.data(), local_n, MPI_INT, 0,\n MPI_COMM_WORLD);\n std::vector local_output(local_n + (my_rank < remainder ? 1 : 0));\n local_output[local_n - 1] = local_x[local_n - 1];\n #pragma omp parallel for\n for (int i = local_n - 2; i >= 0; --i) {\n local_output[i] = local_x[i] + local_output[i + 1];\n }\n MPI_Gather(local_output.data(), local_n, MPI_INT, output.data(), local_n, MPI_INT,\n 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Reverse the prefix sum in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n output[i] = prefix_sum[n - 1 - i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nranks; i++) {\n std::vector recv_buf(n);\n MPI_Recv(recv_buf.data(), n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n output[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; 
i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n prefix_sum[i] -= prefix_sum[i + 1];\n }\n output[i] = prefix_sum[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = x.size() - 1; i >= 0; i--) {\n if (i < x.size() - 1) {\n prefix_sum[i] -= prefix_sum[i + 1];\n }\n output[i] = prefix_sum[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n local_output[i] = local_x[i];\n if (i + 1 < n) {\n local_output[i] += local_output[i + 1];\n }\n }\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of the vector x\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum of the vector x\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n output[i] = prefix_sum[x.size() - 1] - prefix_sum[i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(&output[0], x.size(), MPI_INT, &output[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + 
"parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n using namespace Kokkos;\n\n const size_t n = x.extent(0);\n View y(\"y\", n);\n\n parallel_for(n, KOKKOS_LAMBDA(const size_t i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return sum(y);\n}\n", + "\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n using l_d_type = Kokkos::LayoutLeft;\n using l_d_view_type = Kokkos::View;\n\n const size_t n = x.extent(0);\n l_d_view_type x_copy(\"x_copy\", n);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::parallel_scan(policy_type(0, n), KOKKOS_LAMBDA(const size_t& i, l_d_view_type& x_copy) {\n if (i > 0) { x_copy(i) += x_copy(i - 1); }\n }, tag_type());\n double sum = 0;\n for (size_t i = 0; i < n; ++i) { sum += x_copy(i); }\n return sum;\n}\n", + "\n using namespace Kokkos;\n\n const int N = x.extent(0);\n View y(\"y\", N);\n\n parallel_for(N, KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return reduce(N, KOKKOS_LAMBDA(int i, double sum) { return sum + y(i); }, 0.0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_type;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = typename TeamPolicy::member_type;\n\n double sum = 0.0;\n Kokkos::parallel_reduce(TeamPolicy(Kokkos::Experimental::partition_by_coordinates(x)),\n KOKKOS_LAMBDA(TeamMember& team) {\n double local_sum = 0.0;\n for (auto i = team.league_rank(); i < x.extent(0);\n i += team.league_size()) {\n local_sum += x(i);\n }\n team.team_reduce(KOKKOS_LAMBDA(double& tsum,\n double& lsum) { tsum += lsum; });\n if (team.team_rank() == 0) { sum = team.team_reduce(); }\n });\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using execution_space = Kokkos::DefaultExecutionSpace;\n using policy_type = Kokkos::RangePolicy;\n\n // Compute the prefix sum array\n View prefix_sum(\"prefix_sum\", x.size());\n parallel_for(policy_type(0, x.size()), KOKKOS_LAMBDA(const int i) {\n prefix_sum(i) = (i == 0) ? 
x(i) : x(i) + prefix_sum(i - 1);\n });\n\n // Compute the sum of the prefix sum array\n double sum = 0;\n parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int i, double& sum) {\n sum += prefix_sum(i);\n }, sum);\n\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n using execution_space = Kokkos::DefaultExecutionSpace;\n\n const int n = x.extent(0);\n auto y = Kokkos::View(\"y\", n);\n\n Kokkos::parallel_for(\n \"prefix_sum\", policy_t(0, n), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return Kokkos::parallel_reduce(\n \"sum_prefix_sum\", policy_t(0, n), KOKKOS_LAMBDA(const double& a,\n const double& b) {\n return a + b;\n },\n y);\n}\n", + "\n using namespace Kokkos;\n\n using policy_type = Kokkos::TeamPolicy;\n policy_type policy(x.extent(0));\n\n double sum = 0.0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n double prefix_sum = 0.0;\n for (int j = 0; j <= i; ++j) { prefix_sum += x(j); }\n sum += prefix_sum;\n });\n\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n auto prefix_sum = Kokkos::scan(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n },\n Reducer(0));\n\n return Kokkos::parallel_reduce(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += prefix_sum(i);\n },\n Reducer(0));\n}\n", + "\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::TeamPolicy;\n\n double sum = 0;\n Kokkos::parallel_reduce(\n TeamPolicy<>(x.extent(0), Kokkos::AUTO),\n KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n sum += update;\n },\n sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a prefix sum view\n View prefix_sum = create_mirror_view(x);\n\n // Compute the prefix sum\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = prefix_sum(i - 1) + x(i);\n }\n prefix_sum(i) = sum;\n });\n\n // Compute the sum of the prefix sum array\n double sum_of_prefix_sum = 0.0;\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += prefix_sum(i);\n }, sum_of_prefix_sum);\n\n return sum_of_prefix_sum;\n}\n", + "\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using ViewType = Kokkos::View;\n\n const size_t n = x.extent(0);\n ViewType prefix_sum(\"prefix_sum\", n);\n\n parallel_for(\"prefix_sum\", Policy(0, n), KOKKOS_LAMBDA(const size_t& i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n\n return prefix_sum(n - 1);\n}\n", + "\n Kokkos::View prefixSum = Kokkos::View(x.extent(0));\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = prefixSum(i - 1) + x(i);\n }\n }, prefixSum);\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n update += prefixSum(i);\n }, sum);\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.extent(0));\n Kokkos::parallel_for(\"prefix_sum_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n return Kokkos::parallel_reduce(\"sum_prefix_sum\", 
Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double& sum) {\n sum += prefix_sum(i);\n }, 0.0);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto prefix_sum = Kokkos::scan(policy_type(0, x.size()), tag_type(), x,\n [=](const double& a, const double& b) { return a + b; });\n\n return Kokkos::parallel_reduce(policy_type(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += prefix_sum(i); },\n 0.0);\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n prefix_sum(i) = (i == 0) ? x(i) : x(i) + prefix_sum(i - 1);\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n double sum = 0;\n Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n sum += update;\n },\n tag_type());\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = prefix_sum(i - 1) + x(i);\n }\n prefix_sum(i) = sum;\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reduction = Kokkos::Sum;\n\n // Compute the prefix sum array\n View y = Kokkos::scan(Policy(0, x.size()), Reduction(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n // Compute the sum of the prefix sum array\n double sum = Kokkos::reduce(Policy(0, y.size()), Reduction(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += y(i);\n });\n\n return sum;\n}\n", + "\n using Kokkos::parallel_reduce;\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n }, sum);\n\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n std::vector prefixSum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefixSum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for 
reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = (i > 0 ? prefix_sum[i - 1] : 0) + x[i];\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n, 0);\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n sum += x[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) prefix_sum[i] += prefix_sum[i - 1];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n }\n for (double v : x) {\n sum += v;\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n }\n for (double v : x) {\n sum += v;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n #pragma omp parallel for reduction(+: sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n std::vector prefixSum(x.size(), 0.0);\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) { prefixSum[i] += prefixSum[i - 1]; }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 
1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n // Compute the prefix sum array\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sum array\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n return sum;\n}\n", + "\n\n // Create a vector to store the prefix sum\n std::vector prefix_sum(x.size());\n\n // Compute the prefix sum in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sum\n double sum = 0;\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n return sum;\n}\n", + "\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = (i > 0 ? prefix_sum[i - 1] : 0) + x[i];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "serial", + "prompt": "/* Compute the prefix sum array of the vector x and return its sum.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (auto xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "(", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double 
sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "hip", + "prompt": "/* Compute the prefix sum array of the vector x and compute its sum. Store the result in sum.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\n__global__ void sumOfPrefixSum(const double *x, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n sum[tid + stride] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n\n double sum_local = 0;\n for (int i = gid; i < N; i += bid_size) {\n sum_local += x[i];\n }\n\n double sum_global;\n hipReduce(&sum_global, &sum_local, 1, hipReduceSum, 0);\n *sum = sum_global;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid < N) {\n sum[gid] = x[gid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < hipBlockDim_x; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n double prefix_sum = 0;\n for (int i = 0; i <= gid; i++) {\n prefix_sum += x[i];\n }\n sum[gid] = prefix_sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + 
"did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n sum[tid] += sum[tid + i];\n }\n __syncthreads();\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum_local = x[tid];\n __syncthreads();\n for (int i = 1; i < blockDim.x; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n sum_local += x[tid + i];\n }\n __syncthreads();\n }\n sum[tid] = sum_local;\n }\n __syncthreads();\n if (tid == 0) {\n double sum_global = 0;\n for (int i = 0; i < N; i++) {\n sum_global += sum[i];\n }\n *sum = sum_global;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n int tid = blockIdx.x * 
blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n sum[bid] += x[gid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0268498218, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00018127860129 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n size_t stride = blockDim.x * gridDim.x;\n double localSum = 0;\n for (size_t i = tid; i < N; i += stride) {\n localSum += x[i];\n }\n __syncthreads();\n double globalSum = localSum;\n for (size_t i = 1; i < blockDim.x; i++) {\n globalSum += localSum;\n }\n __syncthreads();\n *sum = globalSum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0259179052, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00398834028244 + } + ] + }, + { + "generated_output": "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n sum[tid + stride] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n if (tid > 0) { sum[tid] += sum[tid - 1]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n __syncthreads();\n if (tid > 0) { sum[tid] += sum[tid - 1]; }\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 0; i < N; i++) { sum[0] += sum[i]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "1\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sum\n double sum = 0;\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), 
prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = std::accumulate(prefix_sum.begin(), prefix_sum.end(), 0.0);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n\n double prefix_sum = 0;\n MPI_Allreduce(&sum, &prefix_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return prefix_sum;\n } else {\n return 0;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefixSum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum array\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sum array\n double sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n MPI_Allreduce(&sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n }\n return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "cuda", + "prompt": "/* Compute the prefix sum array of the vector x and compute its sum. Store the result in sum.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\n__global__ void sumOfPrefixSum(const double *x, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n double sum_local = 0;\n for (int i = tid; i < N; i += stride) {\n sum_local += x[i];\n }\n __shared__ double sum_shared[blockDim.x];\n sum_shared[tid] = sum_local;\n __syncthreads();\n if (tid == 0) {\n double sum_block = 0;\n for (int i = 0; i < blockDim.x; i++) {\n sum_block += sum_shared[i];\n }\n atomicAdd(sum, sum_block);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n const int bid = blockIdx.x;\n const int bid_size = blockDim.x;\n const int gid = bid * bid_size + tid;\n\n if (gid < N) {\n double partial_sum = x[gid];\n if (gid > 0) {\n partial_sum += x[gid - 1];\n }\n sum[gid] = partial_sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = tid + bid * bid_size;\n if (i < N) {\n sum[i] = x[i];\n }\n __syncthreads();\n for (int s = bid_size / 2; s > 0; s >>= 1) {\n if (i < N && tid < s) {\n sum[i] += sum[i + s];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum_prefix = 0;\n for (int i = 0; i <= tid; i++) {\n sum_prefix += x[i];\n }\n double sum_suffix = 0;\n for (int i = tid + 1; i < N; i++) {\n sum_suffix += x[i];\n }\n sum[tid] = sum_prefix + sum_suffix;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * 
blockDim.x + threadIdx.x;\n if (tid < N) {\n double partial_sum = 0;\n for (int i = 0; i <= tid; i++) { partial_sum += x[i]; }\n sum[tid] = partial_sum;\n }\n __syncthreads();\n if (tid == 0) {\n double total_sum = 0;\n for (int i = 0; i < N; i++) { total_sum += sum[i]; }\n *sum = total_sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = 1; i < N; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double prefixSum = 0;\n for (int i = 0; i <= tid; i++) {\n prefixSum += x[i];\n }\n sum[tid] = prefixSum;\n }\n __syncthreads();\n if (tid == 0) {\n double totalSum = 0;\n for (int i = 0; i < N; i++) {\n totalSum += sum[i];\n }\n *sum = totalSum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = bid * bid_size + tid;\n\n double prefix_sum = 0;\n while (i < N) {\n prefix_sum += x[i];\n x[i] = prefix_sum;\n i += grid_size * bid_size;\n }\n\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) { *sum += x[i]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n double s = 0;\n for (int i = 0; i <= gid; i++) {\n s += x[i];\n }\n sum[gid] = s;\n if (gid > 0) {\n sum[gid] += sum[gid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(sum, x[tid]);\n }\n __syncthreads();\n if (tid == 0) {\n double sum_temp = *sum;\n atomicAdd(sum, sum_temp);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(prefixSum.data(), &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) return sum;\n else return 0.0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[0], &sum, x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefix_sum[x.size() - 1], 1, MPI_DOUBLE, &sum, 1, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n double global_sum = 0.0;\n\n // Compute the prefix sum of the local vector\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 1; i < x.size(); i++) {\n x[i] += x[i - 1];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n 
MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum of the global prefix sum on rank 0\n if (rank == 0) {\n return global_sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n double sum = 0.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); i++) {\n double local_sum = 0.0;\n for (int j = 0; j <= i; j++) {\n local_sum += x[j];\n }\n prefix_sum[i] = local_sum;\n }\n\n MPI_Reduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < prefixSum.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) prefix_sum[i] += prefix_sum[i - 1];\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) return sum;\n else return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefixSum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < n; ++i) {\n sum += prefixSum[i];\n }\n\n if (rank == 0) return sum;\n return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sumOfPrefixSum = 0.0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sumOfPrefixSum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sumOfPrefixSum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(prefix_sum.data(), &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < n; ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of each rank's portion of x\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Reduce the prefix sums across all ranks\n std::vector global_prefix_sum(x.size());\n MPI_Allreduce(prefix_sum.data(), global_prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the sum of the prefix sum array\n double sum = 0.0;\n for (int i = 0; i < x.size(); i++) {\n sum += global_prefix_sum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefixSum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n double sum = 0;\n MPI_Reduce(&prefixSum[n - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n 
std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < prefixSum.size(); i++) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector prefix_sum(n);\n std::vector global_prefix_sum(n);\n\n // Compute the prefix sum on each rank\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Reduce the prefix sums from all ranks\n MPI_Allreduce(prefix_sum.data(), global_prefix_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum of the prefix sum on rank 0\n if (rank == 0) {\n return global_prefix_sum[n - 1];\n } else {\n return 0.0;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace the i-th element of the array x with the minimum value from indices 0 through i.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 0; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n\n Kokkos::parallel_for(\n Kokkos::RangePolicy>(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n x(i) = x(i);\n } else {\n x(i) = Kokkos::min(x(i), x(i - 1));\n }\n },\n Layout());\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n 
}\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = Kokkos::min(x(i), Kokkos::min(x, i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n float min = x[0];\n for (int j = 1; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = 
min;\n }\n}\n", + "", + "\n\n int n = x.size();\n float min = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for shared(min)\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n std::vector mins(n);\n mins[0] = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n mins[i] = std::min(mins[i - 1], x[i]);\n }\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = mins[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n std::vector min(n);\n min[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min[i] = std::min(min[i - 1], x[i]);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = min[i];\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "serial", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], *std::min_element(x.begin(), x.begin() + i));\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "<, <, 
\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:>\u51a2\u51a2\u51a28\u51a2\u51a2:>\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:>:>", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "s\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2cernecernecerne,\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "hip", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\n__global__ void partialMinimums(float *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026857392, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 7.61220810546875 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014804954, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 8.696470703125 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027156002, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 8.3361791015625 + } + ] + }, + { + "generated_output": "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026818184, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 7.59862216796875 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026834284, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 7.5977640625 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026274598, + "runs": [ + { + "did_run": true, + 
"is_valid": true, + "runtime": 7.579813671875 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014720738, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 7.760426171875 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027182784, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 7.6536849609375 + } + ] + }, + { + "generated_output": "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n float min = x[0];\n for (int j = 1; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014600696, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 8.7125384765625 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n if (x[i] < min) { min = x[i]; }\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014667176, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00251468801498 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027255458, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 7.6124767578125 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015099466, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 7.77039462890625 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + 
"are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026533812, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 7.6030248046875 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014865992, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 8.68248359375 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027155364, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 8.35333984375 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014982022, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 8.6712833984375 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014492472, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 8.681598046875 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027343214, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 7.59679736328125 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026864798, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 8.3507845703125 + } + ] + }, + { + "generated_output": "\n\n int tid = 
blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026915858, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 7.63622275390625 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n x[i] = min;\n }\n if (rank == 0) {\n MPI_Reduce(&x[0], &x[0], x.size(), MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(&x[0], nullptr, x.size(), MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n } else {\n MPI_Send(&x[rank], 1, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = x[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < size; i++) {\n MPI_Send(&x[i - 1], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n float min;\n MPI_Recv(&min, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x[i 
- 1] = std::min(x[i - 1], min);\n }\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Bcast(&x[i - 1], 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks onto rank 0\n std::vector min_values(x.size());\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &min_values[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Update the values on rank 0\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], min_values[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < 
x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = x[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < size; i++) {\n MPI_Send(&x[i - 1], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n float min;\n MPI_Recv(&min, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x[i - 1] = std::min(x[i - 1], min);\n }\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Bcast(&x[i - 1], 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "cuda", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\n__global__ void partialMinimums(float *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n float min = x[0];\n for (size_t j = 1; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n __shared__ float min;\n\n if (tid == 0) {\n min = x[bid * bid_size];\n }\n\n __syncthreads();\n\n for (int i = bid * bid_size + tid; i < N; i += bid_size) {\n min = fminf(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < size) {\n float min = x[i];\n for (int j = i + 1; j < size; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, 
x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n if (tid == 0) {\n x[bid * bid_size] = x[bid * bid_size];\n } else {\n x[bid * bid_size + tid] = fminf(x[bid * bid_size + tid], x[bid * bid_size + tid - 1]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank], 1, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks to rank 0\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &x[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct partial minimums\n if (rank != 0) {\n x.clear();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks to rank 0\n MPI_Gather(&x[0], n, MPI_FLOAT, &x[0], n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete list of partial minimums\n if (rank == 0) {\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float *x_arr = &x[0];\n float *min_arr = new float[n];\n\n // Compute the minimums for each rank\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n float min = 
x_arr[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x_arr[j]);\n }\n min_arr[i] = min;\n }\n\n // Gather the minimums from all ranks\n MPI_Gather(min_arr, n, MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n x_arr[i] = min_arr[i];\n }\n }\n\n delete[] min_arr;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector min_values(n);\n#pragma omp parallel for\n for (int i = 0; i < n; i++) {\n min_values[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n min_values[i] = std::min(min_values[i], x[j]);\n }\n }\n MPI_Allgather(&min_values[0], n, MPI_FLOAT, &x[0], n, MPI_FLOAT, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min = x[0];\n\n #pragma omp parallel for schedule(static)\n for (int i = 1; i < n; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_FLOAT, x.data(), n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(x.data(), n, MPI_FLOAT, NULL, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min_val = x[0];\n\n #pragma omp parallel for reduction(min : min_val)\n for (int i = 1; i < n; i++) {\n min_val = std::min(min_val, x[i]);\n }\n\n MPI_Bcast(&min_val, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::min(x[i], min_val);\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_FLOAT, x.data(), n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(x.data(), n, MPI_FLOAT, NULL, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n std::vector globalMinimums(x.size());\n MPI_Allgather(x.data(), x.size(), MPI_FLOAT, globalMinimums.data(), x.size(), MPI_FLOAT, MPI_COMM_WORLD);\n\n // Update the minimums on rank 0\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], globalMinimums[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n std::vector global_min(n);\n\n // Compute local minimums\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n // Reduce local minimums to global minimums\n MPI_Allreduce(local_min.data(), global_min.data(), n, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n // Store global minimums on rank 0\n if (rank == 0) {\n x = global_min;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n MPI_Reduce(local_min.data(), x.data(), n, MPI_FLOAT, MPI_MIN, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector localMin(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n localMin[i] = x[i];\n for (int j = 0; j < i; j++) {\n localMin[i] = std::min(localMin[i], x[j]);\n }\n }\n\n MPI_Allgather(localMin.data(), n, MPI_FLOAT, x.data(), n, MPI_FLOAT, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n MPI_Reduce(local_min.data(), x.data(), n, MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &x[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank != 0) {\n x.clear();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float localMin = x[0];\n float globalMin;\n\n #pragma omp parallel for reduction(min : localMin)\n for (int i = 1; i < n; i++) {\n localMin = std::min(localMin, x[i]);\n }\n\n MPI_Allreduce(&localMin, &globalMin, 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n for (int i = 1; i < n; i++) {\n x[i] = globalMin;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of ranks\n int rank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the partial minimums in parallel\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n x[i] = min;\n }\n\n // Gather the partial minimums from all ranks\n std::vector partialMins(x.size() * numRanks);\n MPI_Gather(x.data(), x.size(), MPI_FLOAT, partialMins.data(), x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // If rank 0, store the result in x\n if (rank == 0) {\n x.clear();\n for (int i = 0; i < partialMins.size(); i += numRanks) {\n x.push_back(partialMins[i]);\n }\n }\n}\n", + "<, Synag", + "\n\n int n = x.size();\n int nthreads = omp_get_num_threads();\n int tid = omp_get_thread_num();\n int chunk = n / nthreads;\n int start = tid * chunk;\n int end = start + chunk;\n if (tid == nthreads - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n x[i] = std::min(x[i], *std::min_element(x.begin(), x.begin() + i));\n }\n omp_barrier();\n if (tid == 0) {\n for (int i = chunk; i < n; i += chunk) {\n for (int j = 0; j < chunk; j++) {\n x[i + j] = 
std::min(x[i + j], x[i + j - chunk]);\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector localMin(n);\n std::vector globalMin(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n localMin[i] = x[i];\n for (int j = 0; j <= i; j++) {\n localMin[i] = std::min(localMin[i], x[j]);\n }\n }\n\n MPI_Allreduce(localMin.data(), globalMin.data(), n, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n x = globalMin;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n std::vector min_values(x.size(), 0.0);\n MPI_Allgather(&x[0], x.size(), MPI_FLOAT, &min_values[0], x.size(), MPI_FLOAT, MPI_COMM_WORLD);\n\n // Update the values on rank 0\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::min(x[i], min_values[i]);\n }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n const int n = x.extent(0);\n View max_so_far = \"max_so_far\";\n View max_ending_here = \"max_ending_here\";\n Kokkos::parallel_for(\"max_subarray\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i) {\n if (i == 0) {\n max_ending_here[i] = x[i];\n max_so_far[i] = x[i];\n } else {\n max_ending_here[i] = std::max(x[i], max_ending_here[i - 1] + x[i]);\n max_so_far[i] = std::max(max_so_far[i - 1], max_ending_here[i]);\n }\n });\n return max_so_far[n - 1];\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::View dp = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n dp(i) = x(i);\n } else {\n dp(i) = std::max(dp(i - 1) + x(i), x(i));\n }\n });\n return *Kokkos::max_element(dp);\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n for (int i = 1; i < x.extent(0); ++i) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int max_sum = 0;\n int current_sum = 0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, int& max_sum) {\n current_sum = std::max(x[i], current_sum + x[i]);\n max_sum = std::max(max_sum, current_sum);\n }, max_sum, tag_type());\n return max_sum;\n}\n", + "\n\n using namespace Kokkos;\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n Kokkos::parallel_scan(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far, int& max_ending_here) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n });\n\n return 
max_so_far;\n}\n", + "\n\n // Initialize the maximum sum to the first element of the array.\n int max_sum = x(0);\n\n // Iterate over the remaining elements of the array.\n for (int i = 1; i < x.extent(0); i++) {\n // If the current element is greater than the sum of the previous element and\n // the maximum sum, then set the maximum sum to the current element.\n if (x(i) > max_sum + x(i - 1)) {\n max_sum = x(i);\n } else {\n // Otherwise, add the current element to the maximum sum.\n max_sum += x(i);\n }\n }\n\n // Return the maximum sum.\n return max_sum;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_for(\n \"maximumSubarray\", Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_so_far);\n\n return max_so_far;\n}\n", + "\n\n Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::Kok\u30b3\u30af_VectorRange(x.extent(0)),\n Kok\u30b3\u30af::plus(),\n Kok\u30b3\u30af::make_iota_view(x.extent(0)),\n [&](int i) {\n return Kok\u30b3\u30af::parallel_reduce(\n Kok\u30b3\u30af::Kok\u30b3\u30af_VectorRange(i, x.extent(0)),\n Kok\u30b3\u30af::plus(),\n x[i],\n [&](int j) { return x[j]; });\n });\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Max;\n using reduction_functor_type = Kokkos::Max;\n using reduction_policy_type = Kokkos::Scan;\n\n int max_so_far = 0;\n int max_ending_here = 0;\n\n Kokkos::parallel_reduce(\n reduction_policy_type(x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_so_far) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_so_far);\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_ending_here, int& max_so_far) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_ending_here, 
max_so_far);\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(int i, int& max_so_far, int& curr_max) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n },\n max_so_far, curr_max);\n\n return max_so_far;\n}\n", + "\n\n // Initialize the maximum sum to the first element of the array\n int max_sum = x(0);\n\n // Initialize the current sum to the first element of the array\n int current_sum = x(0);\n\n // Loop through the remaining elements of the array\n for (int i = 1; i < x.extent(0); i++) {\n // Update the current sum by adding the current element\n current_sum = std::max(x(i), current_sum + x(i));\n\n // Update the maximum sum if the current sum is greater\n max_sum = std::max(max_sum, current_sum);\n }\n\n // Return the maximum sum\n return max_sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. 
if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_sum = INT_MIN;\n int sum = 0;\n #pragma omp parallel for reduction(max:max_sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n max_sum = std::max(max_sum, sum);\n if (sum < 0) {\n sum = 0;\n }\n }\n return max_sum;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n max_ending_here = std::max(x[i], max_ending_here + x[i]);\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n int n = x.size();\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n max_ending_here = std::max(x[i], max_ending_here + x[i]);\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < 
max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (size_t i = 0; i < x.size(); ++i) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0, max_ending_here = 0;\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (size_t i = 0; i < x.size(); ++i) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_sum = INT_MIN;\n int current_sum = 0;\n #pragma omp parallel for reduction(max:max_sum)\n for (int i = 0; i < x.size(); ++i) {\n current_sum = std::max(x[i], current_sum + x[i]);\n max_sum = std::max(max_sum, current_sum);\n }\n return max_sum;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n int n = x.size();\n #pragma parallel for reduction(+:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n" + ] 
+ }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "serial", + "prompt": "/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN;\n int 
max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "ontiguous, Synag", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Store the result in sum.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\n__global__ void maximumSubarray(const int *x, size_t N, int *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = hipMax(x[i], max_ending_here + x[i]);\n max_so_far = hipMax(max_so_far, max_ending_here);\n }\n hipAtomicMax(sum, &max_so_far);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = get_global_id(0);\n if (tid >= N) return;\n\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.6e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00064313539267 + } + ] + }, + { + "generated_output": "\n\n int tid = block.thread.x;\n int stride = block.thread.x + block.thread.y * block.dim.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int stride = hipBlockDim_x * hipGridDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0001415356027 + } + ] + }, 
+ { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int s = 0;\n int max = 0;\n for (size_t i = tid; i < N; i += blockDim.x * gridDim.x) {\n s = max(x[i], s + x[i]);\n max = max(max, s);\n }\n atomicAdd(sum, max);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.8e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00061270339489 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int stride = bid_size * hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = start; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n *sum = max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00014099160582 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int tid_offset = tid + bid * bid_size;\n int sum_local = 0;\n int max_sum = 0;\n if (tid_offset < N) {\n sum_local = x[tid_offset];\n }\n __syncthreads();\n for (int i = tid_offset + bid_size; i < N; i += bid_size) {\n sum_local = max(sum_local + x[i], x[i]);\n max_sum = max(max_sum, sum_local);\n }\n if (tid == 0) {\n *sum = max_sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00015187140033 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.6e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00058688060045 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x + 
hipBlockIdx_x * hipBlockDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n __syncthreads();\n if (tid == 0) *sum = max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00014659160003 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.6e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00058995098472 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (i < N) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n i += hipBlockDim_x * hipGridDim_x;\n }\n *sum = max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.8e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00014367960226 + } + ] + }, + { + "generated_output": "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int stride = blockDim.x * gridDim.x;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = tid; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n }\n atomicAdd(sum, max_so_far);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00016937599406 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.8e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00613110332489 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x;\n int bsize = 
hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int stride = bsize * hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N && i < start + stride; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n if (tid == 0) *sum = max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00015142359808 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipBlockDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n hipAtomicAdd(sum, max_so_far);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n hipAtomicAdd(sum, max_so_far);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int stride = bid_size * hipGridDim_x;\n\n int local_max = 0;\n int global_max = 0;\n\n for (int i = tid; i < N; i += stride) {\n local_max = max(x[i], local_max + x[i]);\n global_max = max(global_max, local_max);\n }\n\n if (tid == 0) {\n *sum = global_max;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.6e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00019408000074 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the largest sum of any 
contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n int max_so_far = 0, max_ending_here = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_max;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_sum = x[0];\n int sum = x[0];\n for (int i = 1; i < n; i++) {\n sum = std::max(x[i], sum + x[i]);\n max_sum = std::max(max_sum, sum);\n }\n return max_sum;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n int global_max;\n MPI_Allreduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + 
x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "(4, -1, 2, 1) = 6.", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n int maxSum = INT_MIN;\n int sum = 0;\n for (int i = myRank; i < x.size(); i += numRanks) {\n sum += x[i];\n maxSum = std::max(maxSum, sum);\n if (sum < 0) {\n sum = 0;\n }\n }\n\n MPI_Allreduce(&maxSum, &maxSum, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxSum;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n\n return max_so_far;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the largest sum of any contiguous subarray 
in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Store the result in sum.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\n__global__ void maximumSubarray(const int *x, size_t N, int *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Each thread computes the sum of its subarray\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int partialSum = 0;\n for (int i = tid; i < N; i += blockDim.x * gridDim.x) {\n partialSum += x[i];\n }\n\n // Each thread writes its partial sum to global memory\n __shared__ int partialSums[blockDim.x];\n partialSums[threadIdx.x] = partialSum;\n __syncthreads();\n\n // Each thread computes the maximum of its partial sum and the partial sums of the previous threads\n for (int i = blockDim.x / 2; i > 0; i >>= 1) {\n if (threadIdx.x < i) {\n partialSums[threadIdx.x] = max(partialSums[threadIdx.x], partialSums[threadIdx.x + i]);\n }\n __syncthreads();\n }\n\n // The first thread writes the maximum sum to global memory\n if (threadIdx.x == 0) {\n *sum = partialSums[0];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n\n int maxSoFar = x[tid];\n int maxEndingHere = x[tid];\n\n for (int i = tid + blockDim; i < N; i += blockDim * gridDim) {\n maxEndingHere = maxEndingHere + x[i];\n if (maxSoFar < maxEndingHere) {\n maxSoFar = maxEndingHere;\n }\n if (maxEndingHere < 0) {\n maxEndingHere = 0;\n }\n }\n\n int maxOverall = maxSoFar;\n\n if (blockId == 0) {\n for (int i = 1; i < gridDim; i++) {\n int temp = maxOverall;\n maxOverall = max(maxOverall, maxSoFar);\n maxSoFar = max(temp, maxSoFar);\n }\n }\n\n if (blockId == 0) {\n *sum = maxOverall;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n int max_sum = INT_MIN;\n int current_sum = 0;\n for (int i = start; i < end; i++) {\n current_sum = max(x[i], current_sum + x[i]);\n max_sum = max(max_sum, current_sum);\n }\n sum[tid] = max_sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int block_start = block_id * block_size;\n int block_end = min(block_start + block_size, N);\n int max_so_far = x[block_start];\n int max_ending_here = x[block_start];\n for (int i = block_start + 1; i < block_end; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n __shared__ int shared_max[block_size];\n shared_max[tid] = max_so_far;\n __syncthreads();\n if (tid == 0) {\n int max_overall = shared_max[0];\n for (int i = 1; i < block_size; i++) {\n max_overall = max(max_overall, shared_max[i]);\n }\n *sum = max_overall;\n }\n}\n", + "\n\n int max_ending_here = x[0];\n int max_so_far = x[0];\n for (size_t i = 1; i < N; ++i) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n int start = tid + block * stride;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (start < N) {\n max_ending_here = max_ending_here + x[start];\n 
if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n start += stride;\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int sum = 0;\n int maxSum = -2147483648;\n for (int i = tid; i < N; i += blockIdx.x * blockDim.x) {\n sum += x[i];\n maxSum = max(maxSum, sum);\n if (sum < 0) sum = 0;\n }\n sum = maxSum;\n *sum = maxSum;\n}\n", + "\n\n int my_sum = 0;\n int max_sum = 0;\n int max_sum_so_far = 0;\n int start = 0;\n int end = 0;\n int s = threadIdx.x;\n int e = threadIdx.x + blockDim.x - 1;\n if (e >= N) e = N - 1;\n for (int i = s; i <= e; i++) {\n my_sum += x[i];\n if (my_sum > max_sum_so_far) {\n max_sum_so_far = my_sum;\n start = s;\n end = i;\n }\n if (my_sum < 0) {\n my_sum = 0;\n }\n }\n if (max_sum_so_far > max_sum) {\n max_sum = max_sum_so_far;\n *sum = max_sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = tid; i < N; i += blockDim.x) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; ++i) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int stride = blockDim * gridDim;\n int start = tid + blockId * blockDim;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (start < N) {\n max_ending_here = max_ending_here + x[start];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n start += stride;\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n }\n *sum = max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n for (size_t i = 1; i < N; ++i) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = 
blockIdx.x * blockDim.x + threadIdx.x;\n int sum_so_far = 0;\n int max_sum = 0;\n if (tid < N) {\n sum_so_far = x[tid];\n max_sum = x[tid];\n }\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n sum_so_far = fmax(x[i], sum_so_far + x[i]);\n max_sum = fmax(max_sum, sum_so_far);\n __syncthreads();\n }\n if (tid == 0) { *sum = max_sum; }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int max_so_far = 0;\n int max_ending_here = 0;\n\n for (int i = tid; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n sum[bid] = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n int chunk_size = n / num_ranks;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == num_ranks - 1) {\n end = n;\n }\n#pragma omp parallel for reduction(max : max_ending_here)\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n MPI_Allreduce(&max_so_far, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_so_far;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk = n / nprocs;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == nprocs - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int max_global;\n MPI_Reduce(&max_so_far, &max_global, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_global;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk_size = n / nprocs;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == nprocs - 1) {\n end = n;\n }\n#pragma omp parallel\n {\n int max_so_far_local = 0, max_ending_here_local = 0;\n#pragma omp for\n for (int i = start; i < end; i++) {\n max_ending_here_local = max_ending_here_local + x[i];\n if (max_so_far_local < max_ending_here_local) {\n max_so_far_local = max_ending_here_local;\n }\n if (max_ending_here_local < 0) {\n max_ending_here_local = 0;\n }\n }\n#pragma omp critical\n {\n if (max_so_far < max_so_far_local) {\n max_so_far = max_so_far_local;\n }\n }\n }\n MPI_Reduce(&max_so_far, &max_ending_here, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_ending_here;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallel for\n#pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // MPI reduction\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel\n {\n int local_max_so_far = INT_MIN, local_max_ending_here = 0;\n #pragma omp for\n for (int i = rank; i < n; i += nprocs) {\n local_max_ending_here = local_max_ending_here + x[i];\n if (local_max_so_far < 
local_max_ending_here)\n local_max_so_far = local_max_ending_here;\n if (local_max_ending_here < 0)\n local_max_ending_here = 0;\n }\n #pragma omp critical\n {\n if (max_so_far < local_max_so_far)\n max_so_far = local_max_so_far;\n }\n }\n return max_so_far;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int max_local = 0;\n int max_global = 0;\n\n #pragma omp parallel for reduction(max : max_local)\n for (int i = 0; i < n; i++) {\n max_local = std::max(x[i], max_local + x[i]);\n }\n\n MPI_Allreduce(&max_local, &max_global, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return max_global;\n } else {\n return 0;\n }\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel num_threads(4)\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n\n int global_max = max_so_far;\n MPI_Allreduce(&global_max, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallelization\n #pragma omp parallel\n {\n // Each thread computes the maximum subarray sum for its portion of the array\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n\n // MPI reduction to find the maximum subarray sum across all ranks\n int global_max;\n MPI_Allreduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n int chunk_size = n / num_threads;\n int start = tid * chunk_size;\n int end = start + chunk_size;\n if (tid == num_threads - 1) {\n end = n;\n }\n\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n int chunk_size = n / num_ranks;\n int start = my_rank * chunk_size;\n int end = start + chunk_size;\n if (my_rank == num_ranks - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if 
(max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // Reduce the maximum subarray sum across all ranks\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallel for loop\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // MPI reduce to get the maximum subarray sum\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_max;\n } else {\n return 0;\n }\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = std::min(start + chunk_size, n);\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n 
}\n\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int nthreads = omp_get_num_threads();\n int chunk = n / nthreads;\n int start = tid * chunk;\n int end = (tid + 1) * chunk;\n if (tid == nthreads - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_max;\n }\n return 0;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(Kokkos::View const& x, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n const int N = x.extent(0);\n parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = sum + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output);\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = output(i - 1) + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i, double& update) {\n update += x(i);\n output(i) = update;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = sum + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output(0));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) 
{\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n Kokkos::parallel_for(\"prefix_sum\", Kokkos::RangePolicy(0, output.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = x(i);\n } else {\n output(i) = output(i - 1) + x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output(0));\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n const size_t n = x.extent(0);\n output(0) = x(0);\n parallel_for(policy_type(1, n), KOKKOS_LAMBDA(const size_t i) {\n output(i) = output(i - 1) + x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"prefixSum\", output.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "Kok", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefix_sum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n const size_t size = x.extent(0);\n output(0) = x(0);\n parallel_for(policy_type(1, size), KOKKOS_LAMBDA(const size_t i) {\n output(i) = output(i - 1) + x(i);\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::Range(0, N), KOKKOS_LAMBDA (const int i) {\n if (i == 0) {\n output(i) = x(i);\n } else {\n output(i) = output(i - 1) + x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use OpenMP to compute in parallel.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp 
parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "<", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n double sum = 0;\n for (int j = 0; j <= i; j++) {\n sum += x[j];\n }\n output[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "serial", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (std::size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "<", + "<", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for 
(size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\n__global__ void prefixSum(const double* x, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n if (tid > 0) { output[tid] += output[tid - 1]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0092749302, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00040034999251 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0091662002, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00112962859869 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0092201076, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00112514878511 + } + ] + }, + { + 
"generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0091344984, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00299580221176 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.009122367, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00299996175766 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0088843742, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00300373740196 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n \n if (gid < N) {\n output[gid] = x[gid];\n }\n \n __syncthreads();\n \n for (int stride = 1; stride < N; stride *= 2) {\n if (gid < N && gid + stride < N) {\n output[gid + stride] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0091412284, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00301663422585 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0091266306, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00299737000465 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bsize = blockDim.x;\n int stride = bsize * bid;\n int i = stride + tid;\n if (i < N) {\n output[i] = x[i];\n if (i > 0) { output[i] += output[i - 1]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0093951422, + "runs": [ + { + "did_run": true, + 
"is_valid": false, + "runtime": 0.00038310180306 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0091493008, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00302255177498 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0090737474, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030436712265 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (gid % (2 * i) == 0 && gid + i < N) {\n output[gid + i] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0091099676, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00113340499401 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n output[idx] = x[idx];\n if (idx > 0) {\n output[idx] += output[idx - 1];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0091942522, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00038233379424 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0089908616, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00300997681618 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0089389764, + "runs": [ + { + "did_run": 
true, + "is_valid": false, + "runtime": 0.00112025160789 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.009347706, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0011239320159 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0091175176, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00113455640078 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0090686362, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00300197739601 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0091869082, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00045932558775 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0091518588, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00301842460632 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Store the result in output on rank 0.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n double global_sum = 0;\n\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n output.push_back(global_sum);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n MPI_Allreduce(&local_sum, &output[i], 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n 
MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partialSum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n partialSum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partialSums(size);\n MPI_Gather(&partialSum, 1, MPI_DOUBLE, partialSums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partialSums[0];\n for (size_t i = 1; i < size; ++i) {\n output[i] = output[i - 1] + partialSums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // 
Compute the prefix sum on each rank\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local sums to rank 0\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n for (int i = 1; i < size; ++i) {\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allgather(&local_sum, 1, MPI_DOUBLE, &global_sum, 1, MPI_DOUBLE, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector partial_sum(n);\n MPI_Allreduce(x.data(), partial_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n output = partial_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] - x[i - 1];\n }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\n__global__ void prefixSum(const double *x, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (gid % (2 * i) == 0 && gid + i < N) {\n output[gid] += output[gid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n output[idx] = 0;\n for (int i = 0; i <= idx; i++) {\n output[idx] += x[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 2; stride <= N; stride *= 2) {\n if (tid % stride == 0 && tid + stride / 2 < N) {\n output[tid] += output[tid + stride / 2];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n // Compute the prefix sum of the first bid_size elements in x\n double sum = 0;\n for (int i = tid; i < bid_size; i++) {\n sum += x[i];\n }\n\n // Store the prefix sum in the output array\n output[tid] = sum;\n\n // Synchronize the threads in the block\n __syncthreads();\n\n // Compute the prefix sum of the remaining elements in x\n for (int i = bid_size; i < N; i += bid_size) {\n // Load the prefix sum of the previous block\n sum = output[i - bid_size];\n\n // Compute the prefix sum of the current block\n for (int j = tid; j < bid_size; j++) {\n sum += x[i + j];\n }\n\n // Store the prefix sum in the output array\n output[i + tid] = sum;\n\n // Synchronize the threads in the block\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n // Compute the prefix sum\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n if (gid < N) {\n output[gid] = x[gid];\n }\n\n __syncthreads();\n\n for (int offset = 1; offset < N; offset *= 2) {\n if (gid + offset < N) {\n output[gid + offset] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n 
__syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n // Each thread computes the prefix sum of a subset of the elements in x.\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n // Compute the prefix sum of the current block\n double block_sum = 0;\n for (int i = tid; i < N; i += bid_size) {\n block_sum += x[i];\n output[i] = block_sum;\n }\n\n // Synchronize the threads in the block\n __syncthreads();\n\n // Compute the prefix sum of the previous blocks\n if (tid == 0) {\n for (int i = bid + 1; i < N; i += bid_size) {\n output[i] += output[i - 1];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "(", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int size = blockDim.x;\n int stride = size * gridDim.x;\n int start = tid + bid * stride;\n double sum = 0;\n for (int i = start; i < N; i += stride) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "_t", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the prefix sum of the vector x into output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Store the result in output on rank 0.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+: local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Gather the local prefix sums\n std::vector global_sum(x.size(), 0);\n MPI_Allreduce(output.data(), global_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM);\n\n // Store the global prefix sum on rank 0\n if (rank == 0) {\n output = global_sum;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n // Compute the local prefix sum\n std::vector local_prefix_sum(n);\n local_prefix_sum[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n // Compute the global prefix sum\n std::vector global_prefix_sum(n);\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n // Store the result on rank 0\n if (rank == 0) {\n output = global_prefix_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Reduce(partial_sum.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Bcast(output.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n #pragma omp parallel for\n for (int j = 1; j < n; j++) {\n output[j] += output[j - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes the sum of its local data\n double local_sum = 0;\n #pragma omp parallel for reduction(+: local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n // Gather the local sums into the output vector on rank 0\n MPI_Gather(&local_sum, 1, MPI_DOUBLE, output.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n \n // Rank 0 computes the prefix sum\n if (rank == 0) {\n for (size_t i = 1; i < size; ++i) {\n output[i] += output[i - 1];\n }\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the partial sum on each rank\n double partialSum = 0;\n #pragma omp parallel for reduction(+:partialSum)\n for (int i = 0; i < x.size(); i++) {\n partialSum += x[i];\n }\n\n // Gather the partial sums from all ranks\n std::vector partialSums(numRanks);\n MPI_Gather(&partialSum, 1, MPI_DOUBLE, partialSums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (myRank == 0) {\n output[0] = 0;\n for (int i = 1; i < numRanks; i++) {\n output[i] = output[i - 1] + partialSums[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce local prefix sums to get global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce local prefix sums to global prefix sum\n MPI_Reduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n, 0);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n if (i > 0) {\n partial_sum[i] += partial_sum[i - 1];\n }\n }\n\n std::vector global_sum(n, 0);\n MPI_Allreduce(partial_sum.data(), 
global_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0.0;\n #pragma omp parallel for reduction(+:partial_sum)\n for (int i = 0; i < x.size(); ++i) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; ++i) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n if (i > 0) {\n partial_sum[i] += partial_sum[i - 1];\n }\n }\n\n MPI_Reduce(partial_sum.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": 
"transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) *= -1;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n int &xi = x(i);\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n int &xi = x(i);\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n int val = x(i);\n if (val % 2 == 0) {\n x(i) = val / 2;\n } else {\n x(i) = -val;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) = x(i) / 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + 
"\n\n Kok koste::parallel_for(0, x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "omp", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + ":", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma\u00a0omp\u00a0parallel\u00a0for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] *= -1;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "serial", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "negate, Synag", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (auto &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "negate, Synag", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "hip", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\n__global__ void negateOddsAndHalveEvens(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018897542, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023017480373 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018822848, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002032308042 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001909224, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023721459657 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0019053082, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019599879831 + } + ] + }, + { + "generated_output": "\n\n int tid = hipGetGlobalThreadIdx();\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018378092, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023737460673 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0)\n x[tid] /= 2;\n else\n x[tid] = -x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + 
"are_all_valid": true, + "best_sequential_runtime": 0.001900732, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00026985400617 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018866104, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00024255860597 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001862959, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019340660572 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018827286, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00027398240268 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018790528, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019743859768 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0017653526, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019977460504 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001879903, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00026483060122 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018003742, + "runs": [ + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.00026985459626 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0019004968, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00026787039936 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0019171324, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00026156639606 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0017621248, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00022582260519 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001891625, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019507060349 + } + ] + }, + { + "generated_output": "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0019016448, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002007981956 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0017844644, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00021689440608 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "E", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0,\n MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0,\n MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[0], n / size, MPI_INT, &x[0], n / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Negate odd values\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n }\n }\n\n // Halve even values\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n }\n }\n\n // Gather results on rank 0\n if (rank == 0) {\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_INT, result.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n x = result;\n } else {\n MPI_Gather(nullptr, 0, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather 
the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its portion of x.\n for (int i = rank; 
i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(x.data(), x.size(), MPI_INT, x.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data(), x.size(), MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its portion of the vector.\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "cuda", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\n__global__ void negateOddsAndHalveEvens(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0)\n x[tid] /= 2;\n else\n x[tid] = -x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int n_local = n / nprocs;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], n_local, MPI_INT, &x[0], n_local, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its part of the vector\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], end - start, MPI_INT, &x[0], end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int local_size = x.size() / size;\n int remain = x.size() % size;\n int local_start = rank * local_size;\n int local_end = local_start + local_size;\n if (rank < remain) {\n local_end++;\n }\n\n#pragma omp parallel for\n for (int i = local_start; i < local_end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(MPI_IN_PLACE, local_end - local_start, MPI_INT, x.data(), local_end - local_start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int myRank, nProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &nProcs);\n\n int n = x.size();\n int chunkSize = (n + nProcs - 1) / nProcs;\n int begin = myRank * chunkSize;\n int end = std::min(begin + chunkSize, n);\n\n #pragma omp parallel for\n for (int i = begin; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[begin], end - begin, MPI_INT, &x[0], end - begin, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] = x[i] / 2;\n }\n }\n MPI_Gather(&x[start], chunk, MPI_INT, &x[0], chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its part of the vector\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int local_n = n / nprocs;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if 
(x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, n_local, MPI_INT, x.data(), n_local, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], chunk, MPI_INT, &x[0], chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "kokkos", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(Kokkos::View const& x, Kokkos::View &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKok\u96f9::ParallelFor(Kok\u96f9::Range(x.size()), [&](int i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t});\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const int& i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t});\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", mask.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + ",", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "omp", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t#pragma omp parallel 
for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "serial", + "prompt": "bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t mask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tmask.reserve(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + 
"\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.resize(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tmask.reserve(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t mask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "hip", + "prompt": "__device__ bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\n__global__ void mapPowersOfTwo(const int *x, bool *mask, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0040990948, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001694069989 + } + ] + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041250026, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00017919900417 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041144064, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00017865500003 + } + ] + }, + { + "generated_output": "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00414569, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001652470015 + } + ] + }, + { + "generated_output": "\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041288884, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018118279949 + } + ] + }, + { + "generated_output": "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004097723, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018547099903 + } + ] + }, + { + "generated_output": "\n\n\tint tid = get_global_id(0);\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint tid = 
hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041213876, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016572720259 + } + ] + }, + { + "generated_output": "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041202594, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00017711899877 + } + ] + }, + { + "generated_output": "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041183854, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020713479593 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041171728, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00017865519971 + } + ] + }, + { + "generated_output": "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041029312, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019782319888 + } + ] + }, + { + "generated_output": "\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041257388, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018419119641 + } + ] + }, + { + "generated_output": "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0040808594, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019238259643 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0040979312, + "runs": [ + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.0001797431998 + } + ] + }, + { + "generated_output": "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041280706, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018815920204 + } + ] + }, + { + "generated_output": "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0040248686, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019743959606 + } + ] + }, + { + "generated_output": ", 0, 9, 7, 15, 64, 3]", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": ", 0, 9, 7, 15, 64, 3]", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0040846056, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002016631946 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "mpi", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored in mask on rank 0.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint chunk_size = (n + size - 1) / size;\n\tint start = rank * chunk_size;\n\tint end = std::min(start + chunk_size, n);\n\n\tstd::vector local_mask(end - start);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), end - start, MPI_BOOL, mask.data(), end - start, MPI_BOOL, 0);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tmask.resize(n);\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), n / size, MPI_BOOL, mask.data(), n / size, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_C_BOOL, mask.data(), local_n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tmask.resize(x.size());\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), mask.size(), MPI_C_BOOL, nullptr, 0, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_C_BOOL, mask.data(), local_n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tstd::vector localMask(x.size());\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tif (rank == 0) {\n\t\tmask = localMask;\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tstd::vector recvMask(x.size());\n\t\t\tMPI_Recv(&recvMask[0], x.size(), MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t\tfor (size_t j = 0; j < x.size(); j++) {\n\t\t\t\tmask[j] = mask[j] || recvMask[j];\n\t\t\t}\n\t\t}\n\t} else {\n\t\tMPI_Send(&localMask[0], x.size(), MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the result\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), mask.size(), MPI_BOOL, mask.data(), mask.size(), MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the mask\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t}\n\telse {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the mask\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint 
rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_BOOL, mask.data(), local_n, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "cuda", + "prompt": "__device__ bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\n__global__ void mapPowersOfTwo(const int *x, bool *mask, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\twhile (i < N) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t\ti += blockDim.x * gridDim.x;\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored in mask on rank 0.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own mask\n\tstd::vector local_mask(x.size());\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the local masks into the final mask on rank 0\n\tif (rank == 0) {\n\t\tmask.resize(x.size());\n\t}\n\tMPI_Gather(local_mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < static_cast(x.size()); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) 
{\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own mask\n\tstd::vector localMask(x.size());\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Rank 0 gathers all the local masks\n\tif (rank == 0) {\n\t\tmask.resize(x.size());\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tstd::vector recvMask(x.size());\n\t\t\tMPI_Recv(recvMask.data(), x.size(), MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t\tfor (int j = 0; j < x.size(); j++) {\n\t\t\t\tmask[j] |= recvMask[j];\n\t\t\t}\n\t\t}\n\t} else {\n\t\tMPI_Send(localMask.data(), x.size(), MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint n = x.size();\n\tmask.resize(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), n, MPI_BOOL, mask.data(), n, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its local part of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < 
n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the result\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the result\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * mask.size() / size], mask.size() / size, MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * mask.size() / size], mask.size() / size, MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "serial", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &element : x) {\n element = 1 - 1 / element;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &i : x) {\n i = 1 - 1 / i;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &val : x) {\n val = 1 - 1 / val;\n }\n}\n", + "\n\n for (double &i : x) {\n i = 1 - 1 / i;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": 
"57_transform_inverse_offset", + "parallelism_model": "hip", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\n__global__ void oneMinusInverse(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016492486, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00022262299508 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0017548628, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019987099841 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016943376, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023551820442 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016947114, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001836469993 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001664936, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023987080529 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016528284, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002386229977 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 
0.0016761738, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00017564679757 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016511602, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00028019040525 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016982194, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016457500383 + } + ] + }, + { + "generated_output": "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016657502, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00021027079672 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n x[gid] = 1 - 1 / x[gid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0017046632, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00022230259851 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0017103294, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019449479654 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001680972, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019791880026 + } + ] + }, + { + "generated_output": "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016933988, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019878300279 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": 
true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0017003288, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020415900424 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016976006, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00024271880537 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016916388, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019385479614 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016409894, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019487920254 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016458934, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00021174260378 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016567252, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00025366280377 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, gathered_x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], x.size() / size, MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i 
< n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = 1 - 1 / x[i];\n }\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE, NULL, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 has the final result\n if (rank == 0) {\n x.resize(x.size() * size);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], 
x.size() / size,\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the inverse of its elements\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], x.size() / size,\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its part of the result\n for (size_t i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "cuda", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\n__global__ void oneMinusInverse(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) { x[tid] = 1.0 - 1.0 / x[tid]; }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int local_size = x.size() / num_procs;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n if (rank == num_procs - 1) {\n end = x.size();\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n MPI_Reduce(x.data(), x.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank == size - 1) ? 
n : start + chunk_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * chunk_size;\n int end = (i == size - 1) ? n : start + chunk_size;\n MPI_Recv(&x[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = global_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n const int n = x.size();\n const int chunk = (n + size - 1) / size;\n const int start = rank * chunk;\n const int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; ++i) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. 
Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "serial", + "prompt": "/* Compute the ReLU function on every element of x. 
Elements less than zero become zero,\n while elements greater than zero stay the same.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "hip", + "prompt": "/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\n__global__ void relu(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0021641718, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023500639945 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002201829, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00028828659654 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? 
x[idx] : 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0022079992, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00026857420206 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002214735, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001694066003 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0022114944, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00026556639522 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0021750254, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00021791839451 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0021668038, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00024451000094 + } + ] + }, + { + "generated_output": "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = fmax(0.0, x[idx]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0021492318, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023433459848 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0021672872, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023769439459 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? 
x[idx] : 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0021902338, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023167859763 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002167244, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00028652619421 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0021731802, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023663819581 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0021728762, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003064621985 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0021905454, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023212800473 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0021802694, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023148800135 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002183313, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023760000318 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? 
x[idx] : 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0021719916, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002510079965 + } + ] + }, + { + "generated_output": ": [-1.8, 24.0, 1.2, 0.0, 5.1, 0.2, 4.5]", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002205875, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023155199587 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0021736654, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00024352039546 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n std::vector recv_buf(x.size());\n for (int i = 1; i < MPI_COMM_WORLD.size(); i++) {\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector temp(x.size());\n MPI_Recv(temp.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += temp[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector 
recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks to rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n 
// Compute the ReLU function on every element of x.\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), 
x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "cuda", + "prompt": "/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\n__global__ void relu(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = fmax(x[tid], 0.0);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = fmax(0, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = fmax(0, x[idx]);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = fmax(0.0, x[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] < 0 ? 0 : x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = fmax(0, x[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int index = blockIdx.x * blockDim.x + threadIdx.x;\n if (index < N) {\n x[index] = fmax(0, x[index]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int tid = threadId.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = fmax(0.0, x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma opm parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Reduce(&x[0], &x[0], x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int chunk_size = n / nprocs;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == nprocs - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, local_size, MPI_DOUBLE, x.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n #pragma omp parallel for\n for (int i = start_index; i < end_index; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start_index], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n if (rank == size - 1) {\n end = x.size();\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < 
x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n MPI_Gather(&x[start], chunk_size, MPI_DOUBLE, &x[0], chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the ReLU function on its portion of x\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each 
rank computes the ReLU function on its own copy of x\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks onto rank 0\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 has the final result\n if (rank == 0) {\n x.resize(x.size() * size);\n MPI_Gatherv(nullptr, 0, MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf;\n MPI_Recv(&recv_buf, x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv_buf.begin(), recv_buf.end());\n }\n } else {\n MPI_Send(&x, x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the ReLU function on every element of x\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < MPI_COMM_WORLD; i++) {\n std::vector recv_buf(n);\n MPI_Recv(recv_buf.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, x.extent(0)), KOK_LAMBDA(int i) {\n x[i] = x[i] * x[i];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n #pragma simd\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "serial", + "prompt": "/* Replace every element of x with the square of its value.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + ":", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &xi : x) {\n xi = xi * xi;\n }\n}\n", + "\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "hip", + "prompt": "/* Replace every element of x with the square of its value.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\n__global__ void squareEach(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0011124368, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00027574219704 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010942218, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020358239412 + } + ] + }, + { + "generated_output": "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0011061452, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020857460648 + } + ] + }, + { + "generated_output": "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010969356, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019811019897 + } + ] + }, + { + "generated_output": "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0011022252, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000203613998 + } + ] + }, + { + "generated_output": "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001122127, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020489480346 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0011124312, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020755040199 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x + hipBlockIdx_x * 
hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0011177928, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020438259989 + } + ] + }, + { + "generated_output": "\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001120181, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020172659606 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001119027, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020239839703 + } + ] + }, + { + "generated_output": "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0011109812, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020124660432 + } + ] + }, + { + "generated_output": "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001120781, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020217439979 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010751754, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018633439988 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001099598, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019468799531 + } + ] + }, + { + "generated_output": "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001093414, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023004819602 + } + ] + }, + { + 
"generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001111888, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019302419573 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0011075762, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00058620921373 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0011085336, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019635059536 + } + ] + }, + { + "generated_output": "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0011069714, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00021727839559 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0011107674, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00021679859608 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data(), n / size, MPI_INT, x.data(), n / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_Comm_world, &rank);\n\n // Get the number of processes\n int size;\n MPI_Comm_size(MPI_Comm_world, &size);\n\n // Each process squares its own elements\n for (int i = rank; i < x.size(); i += size) {\n x[i] = x[i] * x[i];\n }\n\n // Gather all the squared elements on rank 0\n MPI_Allgather(MPI_IN_PLACE, 0, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_Comm_world);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank squares its elements\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the squared elements\n std::vector squared;\n if (rank == 0) {\n squared.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, squared.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Rank 0 stores the final result\n if (rank == 0) {\n x = squared;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Status status;\n MPI_Request request;\n MPI_Irecv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (int i = rank; i < x.size(); i += size) {\n x[i] *= x[i];\n }\n MPI_Gather(x.data(), x.size() / size, MPI_INT, x.data(), x.size() / size,\n MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Square each element of x\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Gather all the squared elements on rank 0\n std::vector squared_x;\n if (rank == 0) {\n squared_x.resize(x.size() * 
MPI::COMM_WORLD.Get_size());\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, squared_x.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Copy the squared elements back to x on rank 0\n if (rank == 0) {\n x = squared_x;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector temp(x.size());\n MPI_Recv(&temp[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += temp[j];\n }\n }\n }\n}\n", + "(", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n MPI_Gather(x.data(), n / size, MPI_INT, x.data(), n / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector tmp(n);\n MPI_Recv(tmp.data(), n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n 
}\n\n MPI_Gather(&x[0], n / size, MPI_INT, &x[0], n / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "cuda", + "prompt": "/* Replace every element of x with the square of its value.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\n__global__ void squareEach(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = thread_idx_1;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= 
x[idx];\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace every element of x with the square of its value.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank squares its own elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(n);\n MPI_Recv(&recv[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, chunk, MPI_INT, x.data(), chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = 
x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n int recv_buf[n];\n MPI_Recv(&recv_buf[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv_buf[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank squares its own elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector temp(n);\n MPI_Recv(&temp[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += temp[j];\n }\n }\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Each rank squares its elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < MPI_COMM_WORLD; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; 
i++) {\n std::vector recv(n);\n MPI_Recv(&recv[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Square each element of x in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Gather the squared elements from all ranks to rank 0\n std::vector squared_x(x.size());\n MPI_Gather(x.data(), x.size(), MPI_INT, squared_x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = squared_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_size, MPI_INT, &x[0], local_size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_size = x.size() / size;\n int start = my_rank * local_size;\n int end = start + local_size;\n if (my_rank == size - 1) {\n end = x.size();\n }\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n MPI_Gather(&x[start], local_size, MPI_INT, &x[0], local_size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use Kokkos to parallelize. 
Assume Kokkos has already been initialized.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, Kokkos::View const& A, Kokkos::View const& x, double beta, Kokkos::View &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(i); j < A.extent(i+1); ++j) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(i); j > 0; --j) {\n y(i) += alpha * A(i, j - 1).value * x(A(i, j - 1).column);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i, double &y_i) {\n y_i = beta * y_i;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n y_i += alpha * A(j).value * x(A(j).column);\n }\n }\n }, y);\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n auto spvmv_lambda = KOKKOS_LAMBDA (const COOElement *a, double *y) {\n y[a->row] += a->value * alpha * x[a->column];\n };\n\n KOKKOS_PARALLEL_EXEC(KOKKOS_RANGE_POLICY(0, A.size), spvmv_lambda, A.data, y);\n\n KOKKOS_PARALLEL_FOR(KOKKOS_RANGE_POLICY(0, M), KOKKOS_LAMBDA(size_t i) {\n y[i] = y[i] * alpha + 
x[i] * alpha;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", M, KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i, double &y_i) {\n y_i = beta * y_i;\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n y_i += alpha * A(j).value * x(A(j).column);\n }\n }\n }, y);\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.row_map(i); j < A.row_map(i + 1); ++j) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, M), [&](size_t i) {\n y[i] = alpha * Kok\u30b3\u30af::reduce(Kok\u30b3\u30af::Range(0, A.size()), 0.0, [&](size_t j, double sum) {\n if (A[j].row == i) {\n return sum + A[j].value * x[A[j].column];\n }\n return sum;\n }) + uyi;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use OpenMP to parallelize.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * 
x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for schedule(static)\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n 
}\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n if (e.row < M && e.column < N) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] = y[i] * alpha + y[i] *beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n if (e.row < M && e.column < N) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : 
A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format with sizeA elements.\n A has dimensions MxN, x has N values, and y has M values.\n Use AMD HIP to parallelize. The kernel will be launched with at least sizeA threads.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\n__global__ void spmv(double alpha, const COOElement *A, size_t sizeA, const double *x, double beta, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001340286, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08288803405762 + } + ] + }, + { + "generated_output": "\n\n size_t tid = hipThreadIdx_x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n size_t tid = hipThreadIdx_x;\n if (tid < M) {\n atomicMul(&y[tid], beta);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.001359022, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 1.30071828613281 + } + ] + }, + { + "generated_output": "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < M) {\n double dot = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == tid) {\n dot += A[i].value * x[A[i].column];\n }\n }\n y[tid] = alpha * dot + \u3079\u305f * y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int row = hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t 
i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013267742, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.31499799194336 + } + ] + }, + { + "generated_output": "\n\n int row = hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013432462, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.31688477783203 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[blockIdx.x], beta);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0013171084, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.03688785324097 + } + ] + }, + { + "generated_output": "\n\n size_t i = hipThreadIdx_x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n atomicAdd(&y[row], alpha * val * x[col]);\n }\n atomicMul(&y[hipThreadIdx_x], beta);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0013424502, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 1.30212722167969 + } + ] + }, + { + "generated_output": "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013416896, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08270039978027 + } + ] + }, + { + "generated_output": "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = beta * y[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += alpha * A[i].value * x[A[i].column];\n }\n }\n y[row] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001331144, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08268932952881 + } + ] + }, + { + "generated_output": "\nfor (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {\n double dot = 
0.0;\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == i) {\n dot += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * dot + \u3079\u305f * y[i];\n }\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0013410736, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00664258403778 + } + ] + }, + { + "generated_output": "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013516676, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08262218475342 + } + ] + }, + { + "generated_output": "\n\n int i = hipThreadIdx_x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[i], beta);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0013511322, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 1.3138130859375 + } + ] + }, + { + "generated_output": "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013268706, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0825705947876 + } + ] + }, + { + "generated_output": "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013154704, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08281484375 + } + ] + }, + { + "generated_output": "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double dot = 
0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n dot += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * dot + beta * y[row];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013238164, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08280071105957 + } + ] + }, + { + "generated_output": "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013453504, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0824926651001 + } + ] + }, + { + "generated_output": "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013493444, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08253198547363 + } + ] + }, + { + "generated_output": "\n\n size_t tid = hipThreadIdx_x;\n size_t block_size = hipBlockDim_x;\n size_t block_id = hipBlockIdx_x;\n size_t start = block_id * block_size + tid;\n size_t end = min(start + block_size, sizeA);\n\n for (size_t i = start; i < end; i++) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n\n y[tid] *= beta;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0013216586, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.01200734634399 + } + ] + }, + { + "generated_output": "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013376022, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08253574523926 + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use MPI to parallelize. 
Assume MPI has already been initialized.\n Every rank has a complete copy of A, x, and y. Store the result in y on rank 0.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the result\n for (size_t i = rank; i < M; i += size) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[i * M], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[rank * M], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per rank\n size_t elements_per_rank = A.size() / size;\n\n // Compute the starting index for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = beta * y[i];\n }\n\n // Compute the local portion of A*x\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local contribution to y\n for (size_t i = 0; i < A.size(); ++i) {\n if (A[i].row == rank) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(MPI_IN_PLACE, y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n for (size_t i = rank; i < M; i += size) {\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to the result on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; i++) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (M + size - 1) / size;\n\n // Compute the start and end rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, M);\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (auto const& element : A) {\n if (element.row >= start_row && element.row < end_row) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + ":", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the size of the local part of y\n size_t local_M = M / MPI::COMM_WORLD.Get_size();\n size_t start_row = local_M * MPI::COMM_WORLD.Get_rank();\n size_t end_row = start_row + local_M;\n\n // Compute the local part of y\n for (size_t i = start_row; i < end_row; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Gather the local parts of y to rank 0\n MPI::COMM_WORLD.Gather(&y[start_row], local_M, MPI::DOUBLE, &y[0], local_M, MPI::DOUBLE, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own partial sum\n std::vector partial_y(M, 0.0);\n for (auto const& element : A) {\n if (element.row < M / size * rank || element.row >= M / size * (rank + 1)) {\n continue;\n }\n partial_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce partial sums to rank 0\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 scales the result by beta\n if (rank == 0) {\n for (auto& value : y) {\n value *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (COOElement const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n}\n", + "\n\n // Compute the local portion of the result\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the results from all ranks to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Scale the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (auto& value : y) {\n value *= beta;\n }\n }\n}\n", + "\n\n // COO format:\n // A[i] = {row, column, value}\n // where A[i] is the i-th non-zero element in the matrix\n\n // y = alpha*A*x + y\n std::iota(y.begin(), y.end(), 0);\n std::transform(y.begin(), y.end(), x.begin(), y.begin(),\n [alpha, &A](double y, double x) {\n double sum = 0;\n for (auto const& a : A) {\n sum += a.value * (alpha * x);\n }\n return y + sum;\n });\n\n // y = y + (beta*y)\n std::transform(y.begin(), y.end(), y.begin(),\n [beta](double y) {\n 
return y + (beta * y);\n });\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local contribution to y\n std::vector local_y(M, 0.0);\n for (auto const& element : A) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Add beta*y to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] += ampi * y[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements each rank is responsible for\n size_t elements_per_rank = A.size() / size;\n\n // Compute the starting index of the elements for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index of the elements for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n for (size_t i = rank; i < M; i += size) {\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to the result on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Compute the number of elements per rank\n size_t num_elements_per_rank = A.size() / MPI::COMM_WORLD.Get_size();\n\n // Compute the starting index for this rank\n size_t start_index = num_elements_per_rank * MPI::COMM_WORLD.Get_rank();\n\n // Compute the ending index for this rank\n size_t end_index = start_index + num_elements_per_rank;\n\n // Initialize the local portion of y to beta*y\n std::vector local_y(M, beta * y[MPI::COMM_WORLD.Get_rank()]);\n\n // Perform the sparse matrix-vector multiplication for this rank\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI::COMM_WORLD.Reduce(local_y.data(), y.data(), M, MPI::DOUBLE, MPI::SUM, 0);\n}\n", + "\n\n // Broadcast alpha, beta, M, and N to all ranks\n MPI_Bcast(&alpha, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&beta, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&M, 1, MPI_INT, 0, MPI_COMM_WORLD);\n MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the number of elements per rank\n int num_elements_per_rank = A.size() / MPI_COMM_SIZE;\n\n // Scatter A to all ranks\n std::vector local_A(num_elements_per_rank);\n MPI_Scatter(A.data(), num_elements_per_rank, MPI_BYTE, local_A.data(), num_elements_per_rank, MPI_BYTE, 0, MPI_COMM_WORLD);\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (const auto& element : local_A) {\n local_y[element.row] += alpha * 
element.value * x[element.column];\n }\n\n // Gather the local contributions to y\n std::vector global_y(M);\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, global_y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Apply beta to y\n if (beta != 0.0) {\n for (auto& element : global_y) {\n element += beta * element;\n }\n }\n\n // Replace y with the global result\n y = std::move(global_y);\n}\n", + "\n\n // Initialize y on all ranks to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute the local contribution of each rank to the result\n for (auto const& element : A) {\n if (element.row < M) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n }\n\n // Reduce the local contributions to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format with sizeA elements.\n A has dimensions MxN, x has N values, and y has M values.\n Use CUDA to parallelize. The kernel will be launched with at least sizeA threads.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\n__global__ void spmv(double alpha, const COOElement *A, size_t sizeA, const double *x, double beta, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n y[blockIdx.x] *= beta;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n y[tid] *= beta;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (gid < sizeA) {\n size_t row = A[gid].row;\n size_t column = A[gid].column;\n double value = A[gid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n\n if (gid < M) {\n atomicAdd(&y[gid], beta * y[gid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n const COOElement &e = A[tid];\n y[e.row] += alpha * e.value * x[e.column];\n }\n y[threadIdx.x] *= 
beta;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n atomicMul(&y[i], beta);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[threadIdx.x], beta);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[blockIdx.x], beta);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (gid < sizeA) {\n const COOElement &a = A[gid];\n y[a.row] += alpha * a.value * x[a.column];\n }\n\n if (gid < M) {\n y[gid] *= beta;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n y[row] += alpha * value * x[column];\n }\n y[threadIdx.x] *= beta;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n y[row] += alpha * value * x[column];\n }\n y[blockIdx.x] *= beta;\n}\n", + "{" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse 
matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use MPI and OpenMP to parallelize. Assume MPI has already been initialized.\n Every rank has a complete copy of A, x, and y. Store the result in y on rank 0.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its own partial sum\n std::vector partial_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n partial_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the partial sums to rank 0\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 scales the result by beta\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the result\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (auto const& element : A) {\n if (element.row == i) {\n sum += element.value * x[element.column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&y[0], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[0], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the sparse matrix that each rank is responsible for\n size_t num_elements_per_rank = A.size() / size;\n\n // Compute the starting index of the elements in the sparse matrix that each rank is responsible for\n size_t start_index = rank * num_elements_per_rank;\n\n // Compute the ending index of the elements in the sparse matrix that each rank is responsible for\n size_t end_index = (rank + 1) * num_elements_per_rank;\n\n // Compute the local portion of y that each rank is responsible for\n std::vector local_y(M);\n\n // Compute the local portion of y that each rank is responsible for\n #pragma omp parallel for\n for (size_t i = start_index; i < end_index; i++) {\n const COOElement& element = A[i];\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the local portions of y into the global y vector on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); 
++i) {\n const COOElement& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n}\n", + "\n\n // Initialize y on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::fill(y.begin(), y.end(), 0.0);\n }\n\n // Parallelize over rows of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n COOElement const& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n\n // Parallelize over elements of y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n\n // Gather y from all ranks to rank 0\n MPI::COMM_WORLD.Gather(y.data(), M, MPI::DOUBLE, y.data(), M, MPI::DOUBLE, 0);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of elements each rank is responsible for\n size_t elements_per_rank = A.size() / nprocs;\n\n // Compute the starting index for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local portion of y\n std::vector local_y(M, 0.0);\n\n // Perform the sparse matrix-vector multiplication in parallel\n #pragma omp parallel for\n for (size_t i = start_index; i < end_index; ++i) {\n COOElement element = A[i];\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Compute the local portion of the result\n std::vector local_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Calculate the number of rows per rank\n size_t num_rows_per_rank = (M + size - 1) / size;\n\n // Calculate the starting and ending rows for the current rank\n size_t start_row = rank * num_rows_per_rank;\n size_t end_row = std::min((rank + 1) * num_rows_per_rank, M);\n\n // Initialize the local portion of y to beta*y\n std::vector local_y(num_rows_per_rank, beta * y[rank]);\n\n // OpenMP parallelization\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n local_y[i - start_row] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n\n // Gather the local portions of y into the global y on 
rank 0\n MPI_Gather(local_y.data(), num_rows_per_rank, MPI_DOUBLE, y.data(), num_rows_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of rows per rank\n size_t num_rows_per_rank = (M + MPI_Comm_size(MPI_COMM_WORLD) - 1) / MPI_Comm_size(MPI_COMM_WORLD);\n\n // Compute the starting row for this rank\n size_t start_row = num_rows_per_rank * MPI_Comm_rank(MPI_COMM_WORLD);\n\n // Compute the ending row for this rank\n size_t end_row = std::min(num_rows_per_rank * (MPI_Comm_rank(MPI_COMM_WORLD) + 1), M);\n\n // Initialize the local portion of y to beta*y\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n y[i] = beta * y[i];\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n\n if (row >= start_row && row < end_row) {\n y[row] += alpha * value * x[column];\n }\n }\n\n // Reduce the results from all ranks to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(MPI_IN_PLACE, y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / nprocs; i < (rank + 1) * M / nprocs; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local contribution to y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double local_y = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n local_y += alpha * element.value * x[element.column];\n }\n }\n y[i] = local_y;\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&y[0], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[0], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i 
< A.size(); ++i) {\n const COOElement& element = A[i];\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather y from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_M = M / size;\n size_t start_row = rank * local_M;\n size_t end_row = start_row + local_M;\n\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[i * local_M], local_M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[start_row], local_M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(Kokkos::View &A, Kokkos::View &X, Kokkos::View &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; k++) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::Range(0, M), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.extent(0); ++l) {\n if (A(l).row == i && A(l).column == k) {\n for (size_t m = 0; m < X.extent(0); ++m) {\n if (X(m).row == k && X(m).column == j) {\n sum += A(l).value * X(m).value;\n }\n }\n }\n }\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n // Create a parallel reduction to sum up the sparse matrix multiplication.\n auto policy = KOKKOS_RANGE_POLICY(0, M);\n auto sum_lambda = KOKKOS_LAMBDA(const size_t i, double** sum) {\n for (size_t j = 0; j < N; ++j) {\n sum[i][j] = 0.0;\n }\n };\n auto reduce_lambda = KOKKOS_LAMBDA(const size_t i, const size_t j, const COOElement a, const COOElement b, double** sum) {\n sum[a.row][j] += a.value * b.value;\n };\n auto view = KokKOS::parallel_reduce(\n \"Sparse matrix multiplication\", policy, sum_lambda, reduce_lambda, A, X);\n view.wait();\n\n // Copy the result to the output matrix.\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = view(i, j);\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int &i) {\n for (size_t j = 0; j < N; j++) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; k++) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + ".", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, M), [&](size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i][j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i][j] += A[i][k].value * X[k][j].value;\n }\n }\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n 
#pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; k++) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + ",", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n sum += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n sum += A[l].value * X[m].value;\n break;\n }\n }\n break;\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (auto const& a : A) {\n if (a.row == i && a.column == k) {\n for (auto const& x : X) {\n if (x.row == k && x.column == j) {\n sum += a.value * x.value;\n }\n }\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; k++) {\n for (size_t l = 0; l < A.size(); l++) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); m++) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Y.resize(M * N, 0);\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n for (auto const& e : X) {\n if (A_map.count(e.row) > 0) {\n for (auto const& [column, value] : A_map[e.row]) {\n Y[e.column * M + column] += e.value * value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> row_map;\n for (auto const& element : A) {\n row_map[element.row].push_back(element.value);\n }\n for (auto const& element : X) {\n if (row_map.count(element.column)) {\n for (size_t i = 0; i < row_map[element.column].size(); ++i) {\n Y[element.row * N + i] += row_map[element.column][i] * element.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto& e : A) {\n for (auto& f : X) {\n if (e.column == f.row) {\n Y[e.row * N + f.column] += e.value * f.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M*N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n Y.resize(M * N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot = 0;\n for (size_t k = 0; k < K; ++k) {\n dot += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = dot;\n }\n }\n}\n", + "\n\n Y.resize(M*N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.assign(M * N, 0);\n for (COOElement e : A) {\n for (COOElement f : X) {\n if (e.column == f.row) {\n Y[e.row * N + f.column] += e.value * f.value;\n 
}\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n std::unordered_map> A_row_major;\n for (auto const& e : A) {\n A_row_major[e.row][e.column] = e.value;\n }\n\n std::unordered_map> X_col_major;\n for (auto const& e : X) {\n X_col_major[e.column][e.row] = e.value;\n }\n\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot_product = 0;\n for (size_t k = 0; k < K; ++k) {\n dot_product += A_row_major[i][k] * X_col_major[j][k];\n }\n Y[i * N + j] = dot_product;\n }\n }\n}\n", + "\n\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format with sizeA elements.\n X is a sparse KxN matrix in COO format with sizeX elements. Y is a dense MxN matrix in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\n__global__ void spmm(const COOElement *A, size_t sizeA, const COOElement *X, size_t sizeX, double *Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.000479943, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00131551998854 + } + ] + }, + { + "generated_output": "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t k = 0; k < K; k++) {\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == k) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0004668232, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00565707912445 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000489165, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03066071929932 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n int i = tid + bid * bdim;\n if (i < sizeA) {\n size_t rowA = A[i].row;\n size_t colA = A[i].column;\n double valA = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == colA) {\n Y[rowA * N + X[j].row] += valA * X[j].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.000480031, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00040428718626 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n int i = tid + bid * bdim;\n if (i < sizeA) {\n size_t rowA = A[i].row;\n size_t colA = A[i].column;\n double valA = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == colA) {\n Y[rowA * N + X[j].row] += valA * X[j].value;\n }\n }\n }\n}\n", + "source_write_success": true, + 
"did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0004792832, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00034678300321 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0004704612, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00142617359161 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipGridDim_x;\n int tid_per_block = hipBlockDim_x;\n\n for (int i = tid; i < sizeA; i += tid_per_block * bid_size) {\n int row = A[i].row;\n int col = A[i].column;\n double val = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.000482275, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00038201519847 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column < K) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0004745432, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00124389979839 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x * bid_size;\n\n for (size_t i = tid; i < sizeA; i += num_threads) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.000466253, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00036223939955 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n Y[row * N + col] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t a = 0; a < sizeA; ++a) {\n if (A[a].row == row && A[a].column == k) {\n for (size_t x = 0; x < sizeX; ++x) {\n if (X[x].row == k && X[x].column == col) {\n Y[row * N + col] += A[a].value * X[x].value;\n }\n }\n }\n }\n }\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0004930968, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.04565872039795 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0004882768, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03073452186584 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.000471553, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00136486136913 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].column) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0004690714, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00129827001095 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0004703914, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00145961360931 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n for (int i = tid + bid * bdim; i < sizeA; i += gdim * bdim) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 
0.000482803, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00037695939541 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0004710534, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00132063860893 + } + ] + }, + { + "generated_output": "\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0004704552, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00133340477943 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n int rowA = A[i].row;\n int colA = A[i].column;\n double valA = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n int rowX = X[j].row;\n int colX = X[j].column;\n double valX = X[j].value;\n if (colA == rowX) {\n atomicAdd(&Y[rowA * N + colX], valA * valX);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0004783532, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00103955100775 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x;\n\n for (size_t i = tid; i < sizeA; i += num_threads * bid_size) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == column) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0004775592, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00036420039237 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0004831288, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00134239838123 + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": 
"46_sparse_la_spmm", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and X. Store the result in Y on rank 0.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recv;\n MPI::COMM_WORLD.Recv(recv, M * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < M * N; ++j) {\n Y[j] += recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(Y.data(), M * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0.\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its local contribution to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the local contributions to Y on rank 0.\n if (rank == 0) {\n Y.resize(M * N);\n MPI_Gather(localY.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(localY.data(), M * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ",", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the local portion of Y\n std::vector localY(M*N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n \n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M*N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double dot_product = 0;\n for (size_t k = 0; k < K; k++) {\n dot_product += A[i * K + k].value * X[k * N + j].value;\n }\n Y[i * N + j] = dot_product;\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buffer(M * N);\n MPI_Recv(recv_buffer.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n Y[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(Y.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in each row of A and X\n std::vector A_row_counts(M, 0);\n std::vector X_row_counts(K, 0);\n for (auto const& element : A) {\n A_row_counts[element.row]++;\n }\n for (auto const& element : X) {\n X_row_counts[element.row]++;\n }\n\n // Compute the offsets for each row of A and 
X\n std::vector A_offsets(M, 0);\n std::vector X_offsets(K, 0);\n for (size_t i = 1; i < M; i++) {\n A_offsets[i] = A_offsets[i - 1] + A_row_counts[i - 1];\n }\n for (size_t i = 1; i < K; i++) {\n X_offsets[i] = X_offsets[i - 1] + X_row_counts[i - 1];\n }\n\n // Create a buffer to store the local results\n std::vector local_Y(M * N, 0.0);\n\n // Compute the local results\n for (auto const& element : A) {\n for (size_t j = X_offsets[element.column]; j < X_offsets[element.column] + X_row_counts[element.column]; j++) {\n local_Y[element.row * N + X[j].column] += element.value * X[j].value;\n }\n }\n\n // Reduce the local results to the global result\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of elements in the resulting matrix\n size_t num_elements = M * N;\n\n // Initialize the result matrix to zero\n Y.resize(num_elements, 0.0);\n\n // Scatter the result matrix to all ranks\n MPI_Scatter(Y.data(), num_elements, MPI_DOUBLE, Y.data(), num_elements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local part of the result matrix\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local results to the result matrix on rank 0\n MPI_Reduce(Y.data(), num_elements, MPI_DOUBLE, Y.data(), num_elements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double dot_product = 0;\n for (size_t k = 0; k < K; k++) {\n dot_product += A[i * K + k].value * X[k * N + j].value;\n }\n Y[i * N + j] = dot_product;\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buffer(M * N);\n MPI_Recv(recv_buffer.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n Y[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(Y.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.assign(M * N, 0);\n\n // Compute the partial sum on each rank\n std::vector partial_sum(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n partial_sum[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sum(M * N, 0);\n MPI_Allreduce(partial_sum.data(), global_sum.data(), M * N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n Y = global_sum;\n }\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement& a = A[i];\n for (size_t j = 0; j < X.size(); j++) {\n const COOElement& 
x = X[j];\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0.\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format with sizeA elements.\n X is a sparse KxN matrix in COO format with sizeX elements. Y is a dense MxN matrix in row-major.\n Use CUDA to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\n__global__ void spmm(const COOElement *A, size_t sizeA, const COOElement *X, size_t sizeX, double *Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n atomicAdd(&Y[row * N + X[j].column], A[i].value * X[j].value);\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= M) return;\n for (int i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (int j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < K; k++) {\n for (size_t j = 0; j < sizeA; j++) 
{\n if (A[j].row == row && A[j].column == k) {\n for (size_t l = 0; l < sizeX; l++) {\n if (X[l].row == k && X[l].column == i) {\n Y[row * N + i] += A[j].value * X[l].value;\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int tx = threadIdx.x;\n int bx = blockIdx.x;\n int tid = tx + bx * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t i = 0; i < sizeX; i++) {\n if (X[i].column == column) {\n Y[row * N + X[i].row] += value * X[i].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == A[k].column) {\n Y[row * N + X[j].row] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t k = 0; k < K; ++k) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (X[j].column == k) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t k = 0; k < K; ++k) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row && A[i].column == k) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (X[j].row == k && X[j].column == j) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + k] = sum;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n size_t rowA = A[blockIdx.x].row;\n size_t columnA = A[blockIdx.x].column;\n double valueA = A[blockIdx.x].value;\n\n for (size_t i = 0; i < N; i++) {\n if (columnA == 
X[threadIdx.x].row) {\n Y[rowA * N + i] += valueA * X[threadIdx.x].value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n atomicAdd(&Y[row * N + X[j].column], A[i].value * X[j].value);\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and X. 
Store the result in Y on rank 0.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n for (size_t j = 0; j < X.size(); ++j) {\n if (X[j].row == column) {\n localY[row * N + X[j].column] += value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n if (rank == 0) {\n Y.resize(M * N);\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(localY.data(), nullptr, M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n}\n", + ",", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localY[i * N + j] += A[i * K + k].value * X[k * N + j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "_t,", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for 
(size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local row range for this rank\n size_t local_m = M / nprocs;\n size_t start_row = rank * local_m;\n size_t end_row = (rank == nprocs - 1) ? M : start_row + local_m;\n\n // Initialize the local part of Y to zero\n std::vector local_Y(local_m * N, 0.0);\n\n // Perform the matrix multiplication in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_Y[i * N + j] += A[i * K + k].value * X[k * N + j].value;\n }\n }\n }\n\n // Gather the local results into Y on rank 0\n MPI_Gather(local_Y.data(), local_m * N, MPI_DOUBLE, Y.data(), local_m * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector local_Y(M * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n local_Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of rows of A and X owned by each rank\n size_t nrows_A = A.size() / nprocs;\n size_t nrows_X = X.size() / nprocs;\n\n // Compute the starting row index of A and X owned by each rank\n size_t start_row_A = rank * nrows_A;\n size_t start_row_X = rank * nrows_X;\n\n // Compute the local portion of Y owned by each rank\n std::vector local_Y(M * N, 0.0);\n\n // Compute the local portion of Y owned by each rank\n #pragma omp parallel for\n for (size_t i = 0; i < nrows_A; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n if (A[start_row_A + i].column == X[start_row_X + k].row) {\n local_Y[A[start_row_A + i].row * N + j] += A[start_row_A + i].value * X[start_row_X + k].value;\n }\n }\n }\n }\n\n // Reduce the local 
portions of Y to rank 0\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix-matrix product in parallel\n #pragma mpi_for_schedule(static)\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& a = A[i];\n for (size_t j = 0; j < X.size(); ++j) {\n const COOElement& x = X[j];\n if (a.column == x.row) {\n #pragma openmp atomic\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(MPI_IN_PLACE, Y.size(), MPI_DOUBLE, Y.data(), Y.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the partial sum for each row of Y on each rank\n std::vector partial_Y(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n partial_Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_Y(M * N, 0.0);\n MPI_Reduce(partial_Y.data(), global_Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Store the result in Y on rank 0\n if (rank == 0) {\n Y = global_Y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t k = 0; k < K; ++k) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n\n // Reduce the result to rank 0\n MPI_Reduce(MPI_IN_PLACE, Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int 
rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n for (size_t j = 0; j < X.size(); j++) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(Kokkos::View &A, Kokkos::View &b, Kokkos::View &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View r(\"r\", N);\n Kokkos::View z(\"z\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n r[i] = b[i];\n z[i] = 0.0;\n });\n\n for (size_t iter = 0; iter < N; ++iter) {\n for (size_t i = 0; i < A.extent(0); ++i) {\n const auto &e = A[i];\n r[e.row] -= e.value * z[e.column];\n }\n for (size_t i = 0; i < A.extent(0); ++i) {\n const auto &e = A[i];\n z[e.column] += e.value * r[e.row];\n }\n }\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) { x[i] = r[i]; });\n}\n", + "\n\n Kokkos::parallel_for(\"solve_linear_system\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::View r(\"r\", N);\n Kokkos::View z(\"z\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n r[i] = b[i];\n z[i] = 0.0;\n });\n for (size_t k = 0; k < A.extent(0); ++k) {\n const COOElement &e = A[k];\n r[e.column] -= e.value * z[e.row];\n }\n for (size_t k = 0; k < A.extent(0); ++k) {\n const COOElement &e = A[k];\n z[e.row] += e.value * r[e.column];\n }\n Kokkos::parallel_for(\"copy\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = z[i];\n });\n}\n", + "\n\n Kok\uf09e::View A_h(A.data(), A.extent(0));\n Kok\uf09e::View b_h(b.data(), b.extent(0));\n Kok\uf09e::View x_h(x.data(), x.extent(0));\n\n Kok\uf09e::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n x_h[i] = b_h[i];\n for (size_t j = A_h(i, 0); j < A_h(i, 1); ++j) {\n x_h[i] -= A_h(i, 2 + j) * x_h[A_h(i, 2 + j)];\n }\n x_h[i] /= A_h(i, 1);\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_coo\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::View x_host(\"x_host\", N);\n Kokkos::parallel_for(\"solve\", N, KOKKOS_LAMBDA(size_t i) {\n x_host(i) = 0.0;\n });\n 
Kokkos::deep_copy(x, x_host);\n Kokkos::sparse_gmres(A_sparse, x, b, 1e-6, 100);\n\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View A_sparse(\"A_sparse\", N, N);\n Kokkos::parallel_for(\"init_A\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n A_sparse(i, j) = 0.0;\n }\n });\n Kokkos::parallel_for(\"fill_A\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n A_sparse(A(i).row, A(i).column) = A(i).value;\n });\n\n // Solve the linear system\n Kokkos::View x_temp(\"x_temp\", N);\n Kokkos::sparse::cg(A_sparse, b, x_temp, 1e-6, 100);\n\n // Copy the solution to x\n Kokkos::parallel_for(\"copy_x\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x(i) = x_temp(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * b(A(j).column);\n }\n }\n x(i) = sum;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_COO\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::View A_sparse(\"A_sparse\", N, N);\n Kokkos::parallel_for(\"initialize_A\", N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; j++) {\n A_sparse(i, j) = 0.0;\n }\n });\n Kokkos::parallel_for(\"populate_A\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n A_sparse(row_indices(i), column_indices(i)) = values(i);\n });\n\n // Solve the linear system\n Kokkos::View x_temp(\"x_temp\", N);\n Kokkos::sparse::CrsMatrix A_crs(N, N, A_sparse);\n Kokkos::sparse::solve(\"CrsMatrix\", A_crs, b, x_temp);\n\n // Copy the result to x\n Kokkos::parallel_for(\"copy_result\", N, KOKKOS_LAMBDA(size_t i) {\n x(i) = x_temp(i);\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values = Kokkos::View(A.data(), A.extent(0));\n Kokkos::View row_indices = Kokkos::View(A.data() + 1, A.extent(0));\n Kokkos::View column_indices = Kokkos::View(A.data() + 2, A.extent(0));\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::SparseSolveCG(A_sparse, x, b, 1e-6, 1000);\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x(i) = b(i);\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x(A(j).column) -= A(j).value * x(i);\n }\n }\n x(i) /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View col_indices(\"col_indices\", A.extent(0));\n Kokkos::parallel_for(A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values[i] = A(i).value;\n row_indices[i] = A(i).row;\n col_indices[i] = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, col_indices);\n\n // Solve the linear system\n 
Kokkos::View x_copy(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::SparseSolveCG>(A_sparse, x_copy, b, 1e-6, 1000);\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(j).row == i) {\n x[A(j).column] -= A(j).value * x[i];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n using team_member = KokKOS::TeamMember;\n using team_view = KokKOS::TeamView;\n using team_scheduler = KokKOS::TeamScheduler;\n\n team_scheduler team(N);\n\n team.parallel_for(\n \"solve_linear_system\",\n KOKKOS_LAMBDA(const team_member member) {\n const size_t row = member.league_rank();\n double sum = 0;\n for (size_t i = 0; i < A.size(); ++i) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_coo\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::View x_copy(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::sparse_solve(A_sparse, x_copy, b);\n\n // Copy the solution back to x\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. 
x and b are dense vectors with N elements.\n Use OpenMP to compute in parallel.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + ".", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n COOElement const& e = A[i];\n x[e.column] -= e.value * x[e.row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t k = 0; k < A.size(); ++k) {\n const COOElement& e = A[k];\n x[e.column] -= e.value * x[e.row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + 
"\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] -= element.value * x[element.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] -= element.value * x[element.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n // Initialize x to 0\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = 0;\n }\n\n // Iterate over the elements of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n // Add A[i].value * b[A[i].column] to x[A[i].row]\n x[A[i].row] += A[i].value * b[A[i].column];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. 
x and b are dense vectors with N elements.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector y(N, 0);\n for (auto const& e : A) {\n y[e.column] += e.value * b[e.row] / row_sums[e.row];\n }\n x = y;\n}\n", + "\n\n std::vector r(N, 0);\n std::vector row_start(N + 1, 0);\n for (auto const& e : A) {\n row_start[e.row + 1]++;\n }\n for (size_t i = 1; i <= N; i++) {\n row_start[i] += row_start[i - 1];\n }\n for (auto const& e : A) {\n r[row_start[e.row]++] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = row_start[i]; j < row_start[i + 1]; j++) {\n sum += r[j] * x[j];\n }\n x[i] = (b[i] - sum) / r[row_start[i]];\n }\n}\n", + "\n\n std::vector r(N, 0);\n for (auto const& e : A) {\n r[e.row] += e.value * x[e.column];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = (b[i] - r[i]) / A[i].value;\n }\n}\n", + "\n\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / rowSum[i];\n }\n}\n", + "\n\n std::vector r(N, 0);\n for (auto const& e : A) {\n r[e.row] += e.value * b[e.column];\n }\n x.resize(N);\n for (size_t i = 0; i < N; ++i) {\n x[i] = r[i] / A[i].value;\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (const COOElement& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector y(N, 0);\n for (const COOElement& e : A) {\n y[e.row] += e.value * b[e.column] / row_sum[e.row];\n }\n x = y;\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector diag(N, 0);\n std::vector offdiag(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] += e.value;\n } else {\n offdiag[e.row] += e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / (diag[i] - offdiag[i]);\n }\n for (auto const& e : A) {\n if (e.row != e.column) {\n x[e.column] -= e.value * x[e.row] / (diag[e.column] - offdiag[e.column]);\n }\n }\n}\n", + "\n\n std::vector> A_dense(N, std::vector(N, 0));\n for (auto const& e : A) {\n A_dense[e.row][e.column] = e.value;\n }\n std::vector L(N, 0), U(N, 0);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n L[i] += A_dense[i][j] * U[j];\n }\n U[i] = b[i] - L[i];\n }\n x[N - 1] = U[N - 1] / A_dense[N - 1][N - 1];\n for (size_t i = N - 2; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_dense[i][j] * x[j];\n }\n x[i] = (U[i] - sum) / A_dense[i][i];\n }\n}\n", + "\n\n std::vector diag(N, 0);\n std::vector lower(N, 0);\n std::vector upper(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n } else if (e.row < e.column) {\n lower[e.row] += e.value;\n } else {\n upper[e.row] += e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n double d = diag[i];\n if (d == 0) {\n throw std::runtime_error(\"Matrix is singular\");\n }\n x[i] = (b[i] - lower[i] - upper[i]) / d;\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sum[i];\n }\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector diag(N, 0);\n for 
(auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / diag[i];\n for (auto const& e : A) {\n if (e.row == i && e.column != i) {\n x[e.column] -= e.value * x[i] / row_sums[e.column];\n }\n }\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& element : A) {\n row_sum[element.row] += element.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sum[i];\n }\n}\n", + "\n\n std::vector diag(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n }\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n for (size_t j = e.column + 1; j < N; j++) {\n x[j] -= e.value * x[e.column];\n }\n }\n }\n x[i] /= diag[i];\n }\n for (size_t i = N - 1; i > 0; i--) {\n for (auto const& e : A) {\n if (e.column == i) {\n for (size_t j = 0; j < e.row; j++) {\n x[j] -= e.value * x[e.row];\n }\n }\n }\n }\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sums[i];\n }\n}\n", + "\n\n std::vector rowSum(N, 0.0);\n for (COOElement const& e : A) {\n rowSum[e.row] += e.value;\n }\n std::vector y(N, 0.0);\n for (COOElement const& e : A) {\n y[e.row] += e.value * b[e.column] / rowSum[e.row];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] - y[i];\n }\n}\n", + "\n\n std::vector> A_dense(N, std::vector(N, 0));\n for (auto const& e : A) {\n A_dense[e.row][e.column] = e.value;\n }\n std::vector x_dense(N, 0);\n solveLinearSystemDense(A_dense, b, x_dense);\n x = x_dense;\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n\n std::vector y(N, 0);\n for (auto const& e : A) {\n y[e.column] += e.value * b[e.row] / row_sums[e.row];\n }\n\n x = y;\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector diag_inv(N, 0);\n for (size_t i = 0; i < N; ++i) {\n diag_inv[i] = 1.0 / row_sums[i];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] * diag_inv[i];\n for (auto const& e : A) {\n if (e.row == i) continue;\n x[e.row] -= e.value * x[i] * diag_inv[e.row];\n }\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector x_tmp(N, 0);\n for (auto const& e : A) {\n x_tmp[e.column] += e.value * b[e.row] / row_sum[e.row];\n }\n x = x_tmp;\n}\n", + "\n\n std::vector diag(N, 0);\n std::vector offdiag(N, 0);\n std::vector rowptr(N + 1, 0);\n for (auto const& e : A) {\n rowptr[e.row + 1]++;\n if (e.row == e.column)\n diag[e.row] += e.value;\n else\n offdiag[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n rowptr[i + 1] += rowptr[i];\n }\n std::vector L(rowptr[N]);\n std::vector U(rowptr[N]);\n for (auto const& e : A) {\n if (e.row == e.column)\n continue;\n if (e.row < e.column)\n L[rowptr[e.row]++] = {e.row, e.column, e.value};\n else\n U[rowptr[e.column]++] = {e.row, e.column, e.value};\n }\n std::vector y(N, 0);\n for (size_t i = 0; i < N; i++) {\n y[i] = b[i];\n for (size_t j = rowptr[i]; j < rowptr[i + 1]; j++) {\n y[i] -= L[j].value * x[L[j].column];\n }\n y[i] /= diag[i];\n }\n for (size_t i = N - 1; i < N; i--) {\n x[i] = y[i];\n for (size_t j = rowptr[i + 1]; j < rowptr[i + 2]; j++) {\n x[i] -= U[j].value * x[U[j].column];\n }\n x[i] /= diag[i];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", 
+ "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format with sizeA elements. x and b are dense vectors with N elements.\n Use AMD HIP to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\n__global__ void solveLinearSystem(const COOElement *A, size_t sizeA, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.2379689966, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02652939872742 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t col = A[tid].column;\n double val = A[tid].value;\n atomicAdd(&x[col], val * b[row]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.2368290116, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00072156620026 + } + ] + }, + { + "generated_output": "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.2370378526, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02601959686279 + } + ] + }, + { + "generated_output": "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n x[i] = b[i];\n }\n __syncthreads();\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].column == i) {\n x[A[j].row] -= A[j].value * x[i];\n }\n }\n __syncthreads();\n if (i < N) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * 
blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = b[i];\n }\n __syncthreads();\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == i) {\n x[A[k].column] -= A[k].value * x[i];\n }\n }\n __syncthreads();\n if (i < N) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.2358835936, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02631212501526 + } + ] + }, + { + "generated_output": "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.2353197518, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02584297103882 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.2399325434, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02191806488037 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for 
(size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.238349716, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02203754997253 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&x[column], value * b[row]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.2381259114, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00042489519715 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.238619072, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0220099937439 + } + ] + }, + { + "generated_output": "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.2380829238, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02647564353943 + } + ] + }, + { + "generated_output": "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.2402367984, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02655477905273 + } + ] + }, + { + "generated_output": "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.2382034802, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02204614524841 + } + ] + }, + { + "generated_output": ".", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.2375138442, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02201574935913 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = 0;\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == i) {\n x[i] += A[j].value * b[A[j].column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.2364678214, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02340918922424 + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. Store the result in x on rank 0.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n \n // Create a CSR matrix from the COO format\n std::vector row_offsets(N + 1, 0);\n std::vector column_indices;\n std::vector values;\n for (auto const& element : A) {\n row_offsets[element.row + 1]++;\n column_indices.push_back(element.column);\n values.push_back(element.value);\n }\n for (size_t i = 1; i < N + 1; i++) {\n row_offsets[i] += row_offsets[i - 1];\n }\n \n // Create a dense vector for the solution\n x.resize(N);\n \n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(values[row_offsets[j] + i]) > std::abs(values[row_offsets[pivot_row] + i])) {\n pivot_row = j;\n }\n }\n \n // Swap the pivot row with the current row\n if (pivot_row != i) {\n std::swap(row_offsets[pivot_row], row_offsets[i]);\n std::swap(column_indices[row_offsets[pivot_row] - 1], column_indices[row_offsets[i] - 1]);\n std::swap(values[row_offsets[pivot_row] - 1], values[row_offsets[i] - 1]);\n }\n \n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = values[row_offsets[j] + i] / values[row_offsets[i] + i];\n for (size_t k = i; k < N; k++) {\n values[row_offsets[j] + k] -= factor * values[row_offsets[i] + k];\n }\n b[j] -= factor * b[i];\n }\n }\n \n // Back-substitute 
to solve for x\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= values[row_offsets[i] + j] * x[j];\n }\n x[i] /= values[row_offsets[i] + i];\n }\n \n // Gather the solution from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, comm);\n }\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a vector object from the dense vector\n Eigen::VectorXd vector_b(N);\n for (size_t i = 0; i < N; ++i) {\n vector_b(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd vector_x = matrix.ldlt().solve(vector_b);\n\n // Copy the result to the output vector\n for (size_t i = 0; i < N; ++i) {\n x[i] = vector_x(i);\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a dense vector from b\n Eigen::VectorXd b_vector(N);\n for (size_t i = 0; i < N; ++i) {\n b_vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_vector = matrix.ldlt().solve(b_vector);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_vector(i);\n }\n }\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n\n // Create a vector object from the dense vector\n Eigen::VectorXd vector_b(N);\n for (size_t i = 0; i < N; ++i) {\n vector_b(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd vector_x = matrix.ldlt().solve(vector_b);\n\n // Copy the result to the output vector\n for (size_t i = 0; i < N; ++i) {\n x[i] = vector_x(i);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a sparse matrix object\n MPI_Sparse_matrix A_mpi;\n MPI_Sparse_matrix_create_coo(MPI_COMM_WORLD, N, N, A.size(), &A[0], &A_mpi);\n\n // Create a dense vector object for b\n MPI_Vector b_mpi;\n MPI_Vector_create(MPI_COMM_WORLD, N, &b_mpi);\n MPI_Vector_set_values(b_mpi, N, &b[0]);\n\n // Create a dense vector object for x\n MPI_Vector x_mpi;\n MPI_Vector_create(MPI_COMM_WORLD, N, &x_mpi);\n\n // Solve the linear system\n MPI_Sparse_solve(A_mpi, b_mpi, x_mpi);\n\n // Get the result from rank 0\n if (rank == 0) {\n MPI_Vector_get_values(x_mpi, N, &x[0]);\n }\n\n // Free the MPI objects\n MPI_Sparse_matrix_free(&A_mpi);\n MPI_Vector_free(&b_mpi);\n MPI_Vector_free(&x_mpi);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Scatter b to all ranks\n std::vector local_b(b.begin() + rank * b.size() / size, b.begin() + (rank + 1) * b.size() / size);\n\n // Solve the local system Ax=b for x\n std::vector local_x(N, 0.0);\n for (auto const& e : A) {\n local_x[e.row] += e.value * local_b[e.column];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_x(N, 
0.0);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n \n int *rows = new int[A.size()];\n int *cols = new int[A.size()];\n double *vals = new double[A.size()];\n for (size_t i = 0; i < A.size(); i++) {\n rows[i] = A[i].row;\n cols[i] = A[i].column;\n vals[i] = A[i].value;\n }\n \n // Create a sparse matrix object\n int nrows = N;\n int ncols = N;\n int nnz = A.size();\n int *ia = new int[nrows + 1];\n int *ja = new int[nnz];\n double *a = new double[nnz];\n \n // Convert the COO format to CSR format\n ia[0] = 0;\n for (int i = 0; i < nnz; i++) {\n ja[i] = cols[i];\n a[i] = vals[i];\n }\n for (int i = 1; i < nrows + 1; i++) {\n ia[i] = ia[i - 1];\n for (int j = ia[i - 1]; j < nnz; j++) {\n if (ja[j] >= i) {\n ia[i] = j;\n break;\n }\n }\n }\n \n // Create a parallel sparse matrix object\n Mat A_csr(nrows, ncols, nnz, ia, ja, a);\n \n // Create a parallel dense vector for b\n Vec b_vec(nrows);\n for (int i = 0; i < nrows; i++) {\n b_vec[i] = b[i];\n }\n \n // Create a parallel dense vector for x\n Vec x_vec(nrows);\n \n // Solve the linear system\n A_csr.solve(b_vec, x_vec);\n \n // Gather the solution from all ranks to rank 0\n if (rank == 0) {\n for (int i = 0; i < nrows; i++) {\n x[i] = x_vec[i];\n }\n }\n \n // Clean up\n delete[] rows;\n delete[] cols;\n delete[] vals;\n delete[] ia;\n delete[] ja;\n delete[] a;\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the number of elements in each row of A\n std::vector rowCounts(N, 0);\n for (auto const& e : A) {\n rowCounts[e.row]++;\n }\n\n // Compute the offsets of each row in the CSR format\n std::vector rowOffsets(N, 0);\n for (size_t i = 1; i < N; i++) {\n rowOffsets[i] = rowOffsets[i - 1] + rowCounts[i - 1];\n }\n\n // Convert A to CSR format\n std::vector colIndices(A.size());\n std::vector values(A.size());\n for (auto const& e : A) {\n colIndices[rowOffsets[e.row] + rowCounts[e.row] - 1] = e.column;\n values[rowOffsets[e.row] + rowCounts[e.row] - 1] = e.value;\n rowCounts[e.row]--;\n }\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivotRow = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(values[rowOffsets[j] + rowCounts[j] - 1]) > std::abs(values[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1])) {\n pivotRow = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivotRow != i) {\n std::swap(rowOffsets[i], rowOffsets[pivotRow]);\n std::swap(rowCounts[i], rowCounts[pivotRow]);\n std::swap(values[rowOffsets[i] + rowCounts[i] - 1], values[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1]);\n std::swap(colIndices[rowOffsets[i] + rowCounts[i] - 1], colIndices[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = values[rowOffsets[j] + rowCounts[j] - 1] / values[rowOffsets[i] + rowCounts[i] - 1];\n for (size_t k = rowOffsets[j] + rowCounts[j] - 1; k >= rowOffsets[j]; k--) {\n values[k] -= factor * values[rowOffsets[i] + rowCounts[i] - 1 + k - rowOffsets[j]];\n colIndices[k] = 
colIndices[rowOffsets[i] + rowCounts[i] - 1 + k - rowOffsets[j]];\n }\n rowCounts[j] = rowCounts[i] - 1;\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = rowOffsets[i] + rowCounts[i] - 1; j < rowOffsets[i] + rowCounts[i]; j++) {\n x[i] -= values[j] * x[colIndices[j]];\n }\n x[i] /= values[rowOffsets[i] + rowCounts[i] - 1];\n }\n\n // Gather the solution on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N / size], N / size, MPI_DOUBLE, i, 0, comm, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N / size], N / size, MPI_DOUBLE, 0, 0, comm);\n }\n}\n", + "\n\n // Create a new MPI datatype for COOElement\n MPI_Datatype coo_element_type;\n MPI_Type_contiguous(3, MPI_DOUBLE, &coo_element_type);\n MPI_Type_commit(&coo_element_type);\n\n // Scatter the COO matrix and b vector to all ranks\n std::vector local_A(A.size() / MPI::COMM_WORLD.Get_size());\n std::vector local_b(b.size() / MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Scatter(A.data(), A.size(), coo_element_type, local_A.data(), local_A.size(), coo_element_type, 0);\n MPI::COMM_WORLD.Scatter(b.data(), b.size(), MPI_DOUBLE, local_b.data(), local_b.size(), MPI_DOUBLE, 0);\n\n // Solve the local linear system\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (auto& element : local_A) {\n if (element.row == i) {\n sum += element.value * local_b[element.column];\n }\n }\n local_x[i] = sum;\n }\n\n // Gather the local solutions to rank 0\n MPI::COMM_WORLD.Gather(local_x.data(), local_x.size(), MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the number of elements per rank\n size_t num_elements_per_rank = A.size() / size;\n // Compute the starting index of the elements for this rank\n size_t start_index = rank * num_elements_per_rank;\n // Compute the ending index of the elements for this rank\n size_t end_index = (rank + 1) * num_elements_per_rank;\n\n // Create a local copy of the elements for this rank\n std::vector local_A(A.begin() + start_index, A.begin() + end_index);\n\n // Create a local copy of the vector b for this rank\n std::vector local_b(b.begin() + start_index, b.begin() + end_index);\n\n // Create a local copy of the vector x for this rank\n std::vector local_x(N);\n\n // Solve the local linear system\n for (size_t i = 0; i < local_A.size(); i++) {\n const COOElement& element = local_A[i];\n local_x[element.row] += element.value * local_b[element.column];\n }\n\n // Reduce the local solutions to the global solution on rank 0\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local row indices and values for this rank\n std::vector local_rows;\n std::vector local_values;\n for (auto const& element : A) {\n if (element.row / N == rank) {\n local_rows.push_back(element.row % N);\n local_values.push_back(element.value);\n }\n }\n\n // Create a sparse matrix in CSR format\n std::vector csr_row_offsets(size + 1);\n std::vector csr_column_indices(A.size());\n std::vector csr_values(A.size());\n MPI_Allgather(&local_rows[0], local_rows.size(), MPI_UNSIGNED_LONG, &csr_column_indices[0], local_rows.size(), MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n MPI_Allgather(&local_values[0], local_values.size(), MPI_DOUBLE, 
&csr_values[0], local_values.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n for (size_t i = 0; i < size; ++i) {\n csr_row_offsets[i + 1] = csr_row_offsets[i] + local_rows.size();\n }\n\n // Solve the linear system using the CSR matrix\n std::vector local_x(N);\n if (rank == 0) {\n x.resize(N);\n }\n sparse_solve(csr_row_offsets, csr_column_indices, csr_values, b, local_x);\n\n // Gather the solution from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the local number of rows and the local row indices\n size_t local_N = N / size;\n size_t start_row = rank * local_N;\n size_t end_row = (rank + 1) * local_N;\n\n // Compute the local contribution to the solution\n std::vector local_x(local_N);\n for (size_t i = start_row; i < end_row; ++i) {\n local_x[i - start_row] = b[i];\n for (COOElement const& e : A) {\n if (e.row == i) {\n local_x[i - start_row] -= e.value * x[e.column];\n }\n }\n local_x[i - start_row] /= A[i].value;\n }\n\n // Gather the local solutions into the global solution\n MPI_Gather(local_x.data(), local_N, MPI_DOUBLE, x.data(), local_N, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a vector object from the dense vector\n Eigen::VectorXd vector(N);\n for (size_t i = 0; i < N; ++i) {\n vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd result = matrix.ldlt().solve(vector);\n\n // Copy the result to the output vector\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = result(i);\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n int nrows = N / size + (rank < N % size);\n std::vector localA(A.begin() + rank * (N / size), A.begin() + rank * (N / size) + nrows);\n std::vector localb(b.begin() + rank * (N / size), b.begin() + rank * (N / size) + nrows);\n std::vector localx(nrows);\n \n // Solve the local system\n for (size_t i = 0; i < nrows; i++) {\n double sum = 0;\n for (size_t j = 0; j < localA.size(); j++) {\n if (localA[j].row == i) {\n sum += localA[j].value * localx[localA[j].column];\n }\n }\n localx[i] = (localb[i] - sum) / localA[i].value;\n }\n \n // Gather the local solutions\n MPI_Gather(localx.data(), nrows, MPI_DOUBLE, x.data(), nrows, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the local part of the matrix\n size_t local_size = A.size() / size;\n\n // Compute the local part of the matrix\n std::vector local_A(A.begin() + rank * local_size, A.begin() + (rank + 1) * local_size);\n\n // Compute the local part of the vector b\n std::vector local_b(b.begin() + rank * local_size, b.begin() + (rank + 1) * local_size);\n\n // Compute the local part of the solution vector\n std::vector local_x(N);\n\n // Solve the local linear system\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (auto& element : local_A) {\n if (element.column == i) {\n sum += element.value * local_x[element.row];\n }\n }\n local_x[i] = (local_b[i] - sum) / local_A[i].value;\n }\n\n // Gather the 
local solutions into the global solution vector\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the number of elements per rank\n size_t elementsPerRank = A.size() / size;\n \n // Compute the starting index of the elements for this rank\n size_t startIndex = rank * elementsPerRank;\n \n // Compute the ending index of the elements for this rank\n size_t endIndex = (rank + 1) * elementsPerRank;\n \n // Create a local copy of the elements for this rank\n std::vector localA(A.begin() + startIndex, A.begin() + endIndex);\n \n // Create a local copy of the vector b for this rank\n std::vector localb(b.begin() + startIndex, b.begin() + endIndex);\n \n // Create a local copy of the vector x for this rank\n std::vector localx(N);\n \n // Solve the local linear system\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < localA.size(); j++) {\n if (localA[j].column == i) {\n sum += localA[j].value * localb[j];\n }\n }\n localx[i] = sum;\n }\n \n // Gather the local solutions into the global solution\n MPI_Gather(localx.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_x(N, 0.0);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the local part of the matrix\n size_t local_size = A.size() / size;\n size_t start_index = rank * local_size;\n size_t end_index = (rank + 1) * local_size;\n\n // Compute the local part of the matrix-vector product\n std::vector local_Ax(N, 0.0);\n for (size_t i = start_index; i < end_index; ++i) {\n local_Ax[A[i].column] += A[i].value * b[A[i].row];\n }\n\n // Reduce the local results to the global result\n MPI_Allreduce(local_Ax.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Rank 0 stores the final result\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Each rank computes its part of the result\n std::vector local_x(N);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n \n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format with sizeA elements. x and b are dense vectors with N elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\n__global__ void solveLinearSystem(const COOElement *A, size_t sizeA, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = tid / N;\n int col = tid % N;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column == col) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[tid].value;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n // Each thread computes one element of x.\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = A[row].column[col]; k < A[row].column[col + 1]; ++k) {\n sum += A[row].value[k] * x[A[row].column[k]];\n }\n b[row] = (b[row] - sum) / A[row].value[A[row].column[col + 1] - 1];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n 
x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n double sum = 0;\n for (int k = 0; k < size; ++k) {\n if (A[k].row == row && A[k].column == col) {\n sum += A[k].value * x[A[k].column];\n }\n }\n x[row] = (b[row] - sum) / A[size - 1].value;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int tid_offset = tid + bid * blockDim.x;\n if (tid_offset < sizeA) {\n size_t row = A[tid_offset].row;\n size_t column = A[tid_offset].column;\n double value = A[tid_offset].value;\n atomicAdd(&x[column], value * b[row]);\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = tid + bid * blockDim.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "&", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the products of A and b for each row\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value * b[A[i].column];\n }\n\n // Gather the row sums to rank 0\n std::vector globalRowSums(N);\n MPI_Allgather(rowSums.data(), N, MPI_DOUBLE, globalRowSums.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = globalRowSums[i];\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a dense vector for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; ++i) {\n b_eigen(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_eigen = matrix.ldlt().solve(b_eigen);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_eigen(i);\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix A_sparse(N, N);\n for (auto const& e : A) {\n A_sparse.insert(e.row, e.column) = e.value;\n }\n A_sparse.makeCompressed();\n\n // Create dense vectors for x and b\n Eigen::VectorXd x_dense(N), b_dense(N);\n for (size_t i = 0; i < N; ++i) {\n x_dense[i] = x[i];\n b_dense[i] = b[i];\n }\n\n // Solve the linear system using Eigen\n x_dense = A_sparse.colPivHouseholderQr().solve(b_dense);\n\n // Copy the result back to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_dense[i];\n }\n }\n}\n", + "\n\n // Get the number of processes and the rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per process\n int elementsPerProcess = N / size;\n\n // Create a buffer to store the local part of A and b\n std::vector localA(elementsPerProcess);\n std::vector localb(elementsPerProcess);\n\n // Scatter A and b to all processes\n MPI_Scatter(A.data(), elementsPerProcess, MPI_DOUBLE, localA.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(b.data(), elementsPerProcess, MPI_DOUBLE, localb.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Create a local copy of x\n std::vector localx(N, 0.0);\n\n // Perform the Gaussian elimination in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n // Find the pivot element\n int pivotRow = i;\n double maxPivot = std::abs(localA[i].value);\n for (int j = i + 1; j < elementsPerProcess; j++) {\n if (std::abs(localA[j].value) > maxPivot) {\n pivotRow = j;\n maxPivot = std::abs(localA[j].value);\n }\n }\n\n // Swap the pivot row with the first row\n std::swap(localA[i], localA[pivotRow]);\n std::swap(localb[i], localb[pivotRow]);\n\n // Eliminate the non-zero elements below the pivot\n for (int j = i + 1; j < elementsPerProcess; j++) {\n double factor = localA[j].value / localA[i].value;\n localA[j].value -= 
factor * localA[i].value;\n localb[j] -= factor * localb[i];\n }\n }\n\n // Gather the local solutions into a global solution\n MPI_Gather(localx.data(), elementsPerProcess, MPI_DOUBLE, x.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform the back substitution to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (int j = i + 1; j < N; j++) {\n sum += localA[j].value * x[j];\n }\n x[i] = (localb[i] - sum) / localA[i].value;\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the local contribution of each rank to the solution\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column];\n }\n\n // Reduce the local contributions to the global solution on rank 0\n MPI_Reduce(x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0);\n\n // Compute the sum of the products of A and x for each row\n std::vector rowSums(N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value * x[A[i].column];\n }\n\n // Gather the row sums to rank 0\n std::vector globalRowSums(N);\n MPI_Allgather(rowSums.data(), N, MPI_DOUBLE, globalRowSums.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Solve the system of equations on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = (b[i] - globalRowSums[i]) / A[i].value;\n }\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Iterate over the elements of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement& element = A[i];\n x[element.row] += element.value * b[element.column];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(&rowSums[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n\n // Gather x from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "<}", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Step 1: Distribute the rows of A and b to each rank\n std::vector localA;\n std::vector localb;\n distributeRows(A, b, localA, localb, rank, size, N);\n\n // Step 2: Solve the local system on each rank\n std::vector localx(N, 0.0);\n for (auto& element : localA) {\n localx[element.row] += element.value * localb[element.column];\n }\n\n // Step 3: Gather the local solutions to rank 0\n MPI_Gather(localx.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(rowSums.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if 
(MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n}\n", + "\n\n // Initialize x to 0\n x.resize(N);\n std::fill(x.begin(), x.end(), 0.0);\n\n // Perform Gauss-Seidel iteration\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n x[i] = (b[i] - sum) / A[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.resize(N);\n std::fill(x.begin(), x.end(), 0.0);\n\n // Perform Gauss-Jordan elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i].value) > std::abs(A[pivot_row * N + i].value)) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the ith row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the ith column\n double pivot = A[i * N + i].value;\n for (size_t j = 0; j < N; ++j) {\n if (j != i) {\n double factor = A[j * N + i].value / pivot;\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k].value -= factor * A[i * N + k].value;\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j].value * x[j];\n }\n x[i] /= A[i * N + i].value;\n }\n\n // Gather the solution on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n MPI::COMM_WORLD.Gather(&x[0], N, MPI::DOUBLE, &x[0], N, MPI::DOUBLE, 0);\n } else {\n MPI::COMM_WORLD.Gather(&x[0], N, MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n\n // Create a dense vector for b\n Eigen::VectorXd b_vector(N);\n for (size_t i = 0; i < N; ++i) {\n b_vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_vector = matrix.ldlt().solve(b_vector);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_vector(i);\n }\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0);\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] += element.value * b[element.column];\n }\n }\n }\n\n // Gather the results to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the local contribution of each rank to the solution\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n x[e.row] += e.value * b[e.column];\n }\n\n // Reduce the local contributions to the global solution on rank 0\n MPI_Reduce(x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix A_sparse(N, N);\n for (auto const& element : A) {\n A_sparse.insert(element.row, 
element.column) = element.value;\n }\n\n // Create dense vectors for x and b\n Eigen::VectorXd x_dense(N), b_dense(N);\n for (size_t i = 0; i < N; ++i) {\n x_dense[i] = x[i];\n b_dense[i] = b[i];\n }\n\n // Solve the linear system\n Eigen::SimplicialLDLT> solver;\n solver.compute(A_sparse);\n x_dense = solver.solve(b_dense);\n\n // Copy the result back to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_dense[i];\n }\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(rowSums.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n\n // Gather x from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Perform Gauss-Jordan elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i].value) > std::abs(A[pivot_row * N + i].value)) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j].value, A[pivot_row * N + j].value);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n double pivot = A[i * N + i].value;\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j].value /= pivot;\n }\n b[i] /= pivot;\n for (size_t j = 0; j < N; ++j) {\n if (j != i) {\n double factor = A[j * N + i].value;\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k].value -= factor * A[i * N + k].value;\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(Kokkos::View &A, Kokkos::View &L, Kokkos::View &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Iterate over the rows of A\n Kokkos::parallel_for(A.extent(0), KOKKOS_LAMBDA(size_t i) {\n // Get the row of A\n const COOElement* row = &A(i);\n\n // Iterate over the columns of the row\n for (size_t j = 0; j < row->column; ++j) {\n // Get the value of the element\n double value = row->value;\n\n // Update L and U\n for (size_t k = 0; k < i; ++k) {\n L(i, k) -= value * U(k, j);\n }\n U(i, j) = value;\n }\n });\n}\n", + "\n\n // Create a parallel for loop to iterate over the rows of A\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n // Initialize the ith row of L and U to the identity matrix\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n\n // Iterate over the non-zero elements in the ith row of A\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n // Update the ith row of L and U\n L(i, A(j).column) = A(j).value;\n U(i, A(j).column) = A(j).value;\n\n // Iterate over the rows below the ith row\n for (size_t k = i + 1; k < N; ++k) {\n // Update the kth row of L and U\n L(k, A(j).column) -= A(j).value * L(k, i);\n U(k, A(j).column) -= A(j).value * U(k, i);\n }\n }\n }\n });\n}\n", + "\n\n // Initialize L and U\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = A(i, i).value;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n L(i, k) = A(i, k).value / U(k, k);\n for (size_t j = k + 1; j < N; ++j) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Iterate over the rows of A\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n // Iterate over the columns of A\n for (size_t j = 0; j < N; j++) {\n // If A(i, j) is non-zero, then update L and U\n if (A(i, j).value != 0.0) {\n // Update L\n L(i, j) = A(i, j).value / U(j, j);\n\n // Update U\n for (size_t k = j + 1; k < N; k++) {\n U(j, k) -= L(i, j) * A(i, k).value;\n }\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 
0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy>(0, N), KOKKOS_LAMBDA(const int &i, const int &j) {\n if (i == j) {\n L(i, j) = 1.0;\n U(i, j) = A(i, j);\n } else if (i > j) {\n L(i, j) = A(i, j) / U(j, j);\n U(i, j) = 0.0;\n } else {\n U(i, j) = A(i, j) / L(i, i);\n L(i, j) = 0.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n L(j, i) -= L(j, k) * U(k, i);\n }\n L(j, i) /= U(i, i);\n }\n });\n}\n", + "\n\n // Create a copy of A\n auto Acopy = KokKOS::make_view(A);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n Acopy(j, i) -= Acopy(j, i) * Acopy(i, i);\n }\n }\n\n // Store L and U\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n L(i, j) = 1.0;\n } else if (i < j) {\n L(i, j) = Acopy(i, j);\n } else {\n U(i, j) = Acopy(i, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0.0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n L(i, j) -= L(i, k) * U(k, j);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n U(i, j) /= L(i, i);\n }\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n U(i, j) += A(i, j).value;\n } else {\n L(i, j) += A(i, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0;\n U(i, j) = 0;\n }\n L(i, i) = 1;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), 
KOKKOS_LAMBDA(const Kokkos::TeamPolicy::member_type &team) {\n const size_t i = team.league_rank();\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < i; k++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n if (i == j) {\n L(i, i) = 1 / U(i, i);\n } else {\n L(i, j) = -U(i, j) * L(j, j);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L(i, k) = A(i, k) / U(k, k);\n for (size_t j = k + 1; j < N; j++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA(const size_t k) {\n const auto &e = A(k);\n const double a_kk = U(e.row, e.row);\n const double l_ik = L(e.column, e.row);\n U(e.row, e.column) = e.value - l_ik * a_kk;\n for (size_t i = e.row + 1; i < N; i++) {\n const double l_ik = L(e.column, e.row);\n const double a_ki = U(e.row, i);\n L(i, e.column) = (e.value - a_ki * a_kk) / l_ik;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t row) {\n double sumL = 0.0;\n double sumU = 0.0;\n for (size_t i = 0; i < row; i++) {\n sumL += L(row, i) * U(i, row);\n sumU += L(i, row) * U(row, i);\n }\n L(row, row) = 1.0;\n U(row, row) = A(row, row) - sumU;\n for (size_t col = row + 1; col < N; col++) {\n L(row, col) = (A(row, col) - sumL) / U(row, row);\n U(row, col) = A(row, col) - sumU;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA(const int& k) {\n const COOElement& element = A(k);\n const size_t i = element.row;\n const size_t j = element.column;\n const double value = element.value;\n U(i, j) = value;\n for (size_t p = 0; p < i; p++) {\n L(i, p) += value * L(j, p);\n U(j, p) -= value * U(i, p);\n }\n L(i, i) -= value * L(j, i);\n U(j, j) += value * U(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; ++j) {\n double sum = 0;\n for (size_t k = 0; k < j; ++k) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n 
Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U using the k-th row of A\n for (size_t i = k + 1; i < N; i++) {\n L(i, k) = A(i, k) / U(k, k);\n for (size_t j = k + 1; j < N; j++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sumL = 0.0, sumU = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization in parallel\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k].value / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * U[i * N + k];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n U[i * N + j] = A[i * N + j] - L[i * N + k] * U[k 
* N + j];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n #pragma opm parallel for\n for (size_t k = 0; k < A.size(); ++k) {\n size_t row = A[k].row;\n size_t column = A[k].column;\n double value = A[k].value;\n #pragma opm atomic\n L[row * N + column] += value;\n #pragma opm atomic\n U[column * N + row] += value;\n }\n #pragma opm parallel for\n for (size_t i = 0; i < N; ++i) {\n #pragma opm atomic\n L[i * N + i] = 1;\n }\n}\n", + "<=====,", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n if (row == column)\n L[row * N + column] = 1;\n if (row < column)\n U[row * N + column] = value;\n if (row > column)\n L[row * N + column] = value;\n }\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = 0; i < k; ++i) {\n U[k * N + j] -= L[k * N + i] * U[i * N + j];\n }\n }\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = 0; j < k; ++j) {\n L[i * N + k] -= U[i * N + j] * L[j * N + k];\n }\n L[i * N + k] /= U[k * N + k];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement &e = A[i];\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1.0;\n U[e.row * N + e.column] = e.value;\n } else if (e.row < e.column) {\n L[e.row * N + e.column] = e.value / U[e.row * N + e.row];\n U[e.column * N + e.row] = 0.0;\n } else {\n U[e.row * N + e.column] = e.value;\n L[e.column * N + e.row] = 0.0;\n }\n }\n\n #pragma omp parallel for\n for (size_t k = 1; k < N; k++) {\n for (size_t i = k; i < N; i++) {\n for (size_t j = 0; j < k; j++) {\n U[i * N + k] -= L[i * N + j] * U[j * N + k];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n L[k * N + j] -= L[k * N + k] * U[k * N + j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sumL = 0.0, sumU = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n U[i * N + j] = A[i * N + j] - L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L[i * N + k] * U[k * N + j];\n }\n if (i == j) {\n L[i * N + i] = 1;\n U[i * N + i] = A[i].value - sum;\n } else {\n L[i * N + j] = (A[i].value - sum) / U[j * N + j];\n U[i * N + j] = 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel 
for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform the factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n }\n\n // Update U\n for (size_t j = k + 1; j < N; j++) {\n for (size_t i = k + 1; i < N; i++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n A[i * N + j] -= U[k * N + j] * A[i * N + k];\n L[i * N + j] = A[i * N + j] / L[k * N + k];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n double sum = 0;\n for (size_t k = 0; k < j; ++k) {\n sum += L[i][k] * U[k][j];\n }\n L[i][j] = (A[i][j] - sum) / U[j][j];\n }\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[i][k] * U[k][i];\n }\n U[i][i] = A[i][i] - sum;\n for (size_t j = i + 1; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[j][k] * U[k][j];\n }\n U[i][j] = (A[i][j] - sum) / L[i][i];\n }\n }\n\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n A[i * N + j] -= U[k * N + j] * A[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + 
"do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.emplace_back(e.row, e.column, 1);\n U.emplace_back(e.row, e.column, e.value);\n } else {\n L.emplace_back(e.row, e.column, e.value / rowSum[e.row]);\n U.emplace_back(e.row, e.column, 0);\n }\n }\n for (size_t i = 1; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n for (size_t k = 0; k < N; k++) {\n U[i * N + k] -= L[i * N + j] * U[j * N + k];\n }\n }\n }\n}\n", + "\n\n std::vector row(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1;\n U[e.row * N + e.column] = e.value;\n } else {\n row[e.column] = e.value / U[e.row * N + e.row];\n for (size_t i = e.column; i < N; ++i) {\n L[e.row * N + i] = row[i];\n U[e.row * N + i] -= row[i] * U[e.column * N + i];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& e : A) {\n L[e.row * N + e.column] = (e.row == e.column) ? 1 : 0;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = 0; i < k; ++i) {\n U[k * N + j] -= L[k * N + i] * U[i * N + j];\n }\n L[j * N + k] = U[j * N + k] / U[k * N + k];\n for (size_t i = k + 1; i < N; ++i) {\n U[j * N + i] -= L[j * N + k] * U[k * N + i];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& element : A) {\n L[element.row * N + element.column] = element.value;\n U[element.row * N + element.column] = element.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (L[i * N + j] != 0) {\n double factor = L[i * N + j] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= factor * L[i * N + k];\n U[i * N + k] -= factor * U[j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n L[i * N + j] = 1;\n } else if (i > j) {\n L[i * N + j] = L_temp[i][j] / L_temp[j][j];\n for (size_t k = 0; k < N; k++) {\n U_temp[i][k] -= L[i * N + j] * U_temp[j][k];\n }\n } else {\n U[i * N + j] = U_temp[i][j];\n }\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.push_back(1);\n U.push_back(e.value / rowSum[e.row]);\n } else if (e.row < e.column) {\n L.push_back(0);\n U.push_back(e.value / rowSum[e.row]);\n } else {\n L.push_back(e.value / rowSum[e.column]);\n U.push_back(0);\n }\n }\n}\n", + "\n\n // Initialize L and U to be identity matrices\n L.resize(N * N);\n U.resize(N * N);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n U[i * N + i] = 1;\n }\n\n // Iterate over the elements of A\n for (auto const& element : A) {\n // Update L and U\n for (size_t i = 0; i < element.row; i++) {\n L[element.row * N + i] -= L[element.column * N + i] * element.value;\n }\n for (size_t j = element.column + 1; j < N; j++) {\n U[element.row * N + j] -= U[element.column * N + j] * element.value;\n }\n }\n}\n", + "\n\n std::vector> row_ptr(N + 1);\n std::vector col_idx(A.size());\n 
std::vector values(A.size());\n for (size_t i = 0; i < A.size(); i++) {\n row_ptr[A[i].row + 1]++;\n col_idx[i] = A[i].column;\n values[i] = A[i].value;\n }\n for (size_t i = 1; i <= N; i++) {\n row_ptr[i] += row_ptr[i - 1];\n }\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < A.size(); i++) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double value = A[i].value;\n U[row_ptr[row] + col - 1] = value;\n for (size_t j = row_ptr[row]; j < row_ptr[row + 1]; j++) {\n if (col_idx[j] < col) {\n L[row_ptr[col] + col_idx[j] - 1] += value * U[row_ptr[row] + col_idx[j] - 1];\n } else if (col_idx[j] > col) {\n U[row_ptr[row] + col_idx[j] - 1] -= value * L[row_ptr[col] + col_idx[j] - 1];\n }\n }\n }\n for (size_t i = 0; i < N; i++) {\n L[row_ptr[i] + i - 1] = 1;\n }\n}\n", + "\n\n std::vector L_row, L_col, U_row, U_col;\n std::vector L_val, U_val;\n L_row.reserve(A.size());\n L_col.reserve(A.size());\n L_val.reserve(A.size());\n U_row.reserve(A.size());\n U_col.reserve(A.size());\n U_val.reserve(A.size());\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < A.size(); k++) {\n if (A[k].row == i && A[k].column == j) {\n sumU = A[k].value;\n U_row.push_back(i);\n U_col.push_back(j);\n U_val.push_back(sumU);\n }\n if (A[k].row == i && A[k].column < j) {\n sumL += A[k].value;\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(sumL);\n }\n if (A[k].row < i && A[k].column == j) {\n sumL += A[k].value;\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(sumL);\n }\n }\n if (sumU == 0) {\n U_row.push_back(i);\n U_col.push_back(j);\n U_val.push_back(0);\n }\n if (sumL == 0) {\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(0);\n }\n }\n }\n L.resize(L_row.size());\n U.resize(U_row.size());\n for (size_t i = 0; i < L_row.size(); i++) {\n L[i].row = L_row[i];\n L[i].column = L_col[i];\n L[i].value = L_val[i];\n }\n for (size_t i = 0; i < U_row.size(); i++) {\n U[i].row = U_row[i];\n U[i].column = U_col[i];\n U[i].value = U_val[i];\n }\n}\n", + "\n\n std::vector> L_matrix(N, std::vector(N, 0.0));\n std::vector> U_matrix(N, std::vector(N, 0.0));\n\n for (auto const& element : A) {\n L_matrix[element.row][element.column] = element.value;\n U_matrix[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n L_matrix[i][j] = 1.0;\n } else if (i > j) {\n L_matrix[i][j] = 0.0;\n } else {\n U_matrix[i][j] = 0.0;\n }\n }\n }\n\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n double factor = L_matrix[i][k] / L_matrix[k][k];\n for (size_t j = k; j < N; ++j) {\n L_matrix[i][j] -= factor * U_matrix[k][j];\n U_matrix[i][j] -= factor * L_matrix[k][j];\n }\n }\n }\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n L[i * N + j] = L_matrix[i][j];\n U[i * N + j] = U_matrix[i][j];\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n\n for (size_t i = 0; i < A.size(); i++) {\n L[A[i].row * N + A[i].column] = A[i].value;\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = L[i * N + k] / L[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] = U[i * N + j] - L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector row(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) 
{\n L.emplace_back(COOElement{e.row, e.column, 1});\n row[e.column] = e.value;\n } else {\n U.emplace_back(COOElement{e.row, e.column, e.value});\n }\n }\n for (auto const& e : A) {\n if (e.row != e.column) {\n L.emplace_back(COOElement{e.row, e.column, e.value / row[e.column]});\n for (auto& u : U) {\n if (u.row == e.row && u.column == e.column) {\n u.value -= e.value * L.back().value;\n }\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n U[row * N + column] = value;\n for (size_t j = 0; j < row; ++j) {\n L[row * N + j] -= L[i * N + j] * value / U[j * N + j];\n }\n for (size_t j = row + 1; j < N; ++j) {\n U[row * N + j] -= L[i * N + j] * value / U[j * N + j];\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n for (size_t i = 0; i < N; ++i) {\n L_temp[i][i] = 1;\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n L_temp[i][k] = L_temp[i][k] / L_temp[k][k];\n for (size_t j = k + 1; j < N; ++j) {\n U_temp[k][j] = U_temp[k][j] - L_temp[i][k] * U_temp[i][j];\n }\n }\n }\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n L.push_back(L_temp[i][j]);\n U.push_back(U_temp[i][j]);\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n rowSum[e.row] += e.value;\n }\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.push_back({e.row, e.column, 1});\n U.push_back({e.row, e.column, e.value / rowSum[e.row]});\n } else if (e.row < e.column) {\n L.push_back({e.row, e.column, e.value / rowSum[e.row]});\n U.push_back({e.row, e.column, 0});\n } else {\n L.push_back({e.row, e.column, 0});\n U.push_back({e.row, e.column, e.value});\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n L_temp[i][j] = 1;\n } else if (i > j) {\n L_temp[i][j] = 0;\n }\n }\n }\n\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n double factor = U_temp[i][k] / U_temp[k][k];\n L_temp[i][k] = factor;\n for (size_t j = k; j < N; j++) {\n U_temp[i][j] -= factor * U_temp[k][j];\n }\n }\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n L.push_back(L_temp[i][j]);\n U.push_back(U_temp[i][j]);\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto &e : A) {\n L[e.row * N + e.column] = e.value;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (L[j * N + i] != 0) {\n double factor = L[j * N + i] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= factor * L[i * N + k];\n U[i * N + k] -= factor * U[j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n std::vector rowSum(N, 0);\n for (auto const& element : A) {\n rowSum[element.row] += element.value;\n }\n\n for (auto const& element : A) {\n if (element.row == 
element.column) {\n L.push_back(1);\n U.push_back(element.value / rowSum[element.row]);\n } else if (element.row < element.column) {\n L.push_back(0);\n U.push_back(element.value / rowSum[element.row]);\n } else {\n L.push_back(element.value / rowSum[element.column]);\n U.push_back(0);\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& element : A) {\n L[element.row * N + element.column] = element.value;\n U[element.row * N + element.column] = element.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = L[j * N + i] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= multiplier * L[i * N + k];\n U[i * N + k] -= multiplier * U[j * N + k];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (auto const& e : A) {\n L[e.row * N + e.column] = e.value;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n L[j * N + i] = U[j * N + i] / U[i * N + i];\n for (size_t k = i; k < N; k++) {\n U[j * N + k] -= L[j * N + i] * U[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format with sizeA elements.\n Use AMD HIP to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\n__global__ void luFactorize(const COOElement *A, size_t sizeA, double *L, double *U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[row * N + col];\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n COOElement element = A[tid];\n int row = element.row;\n int col = element.column;\n double val = element.value;\n\n if (row == col) {\n L[row * N + row] = 1.0;\n U[row * N + col] = val;\n } else if (row < col) {\n L[row * N + col] = val / U[col * N + col];\n U[row * N + col] = 0.0;\n } else {\n U[row * N + col] -= L[row * N + row] * val;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.5598989826, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00016147159748 + } + ] + }, + { + "generated_output": "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x;\n int start = tid + bid * num_threads;\n for (int i = start; i < sizeA; i += bid_size * 
num_threads) {\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = A[i].value / U[A[i].column];\n U[i] = 0.0;\n }\n }\n __syncthreads();\n for (int i = start; i < sizeA; i += bid_size * num_threads) {\n if (A[i].row > A[i].column) {\n U[i] -= L[i] * U[A[i].column];\n } else {\n L[i] -= L[i] * U[A[i].column];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.5417934714, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0001530232003 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n COOElement e = A[i];\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1.0;\n U[e.row * N + e.column] = e.value;\n for (int k = e.row + 1; k < N; k++) {\n L[k * N + e.column] = A[i].value / e.value;\n U[e.row * N + k] -= L[k * N + e.column] * e.value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.4936607272, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00049721500278 + } + ] + }, + { + "generated_output": "\n\n size_t tid = threadIdx.x;\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1;\n } else {\n U[row * N + column] = value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n for (size_t i = tid; i < sizeA; i += stride) {\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = 0.0;\n U[i] = 0.0;\n }\n }\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n for (size_t j = tid; j < sizeA; j += stride) {\n if (A[j].row == i && A[j].column == k) {\n L[j] = A[j].value / U[k];\n U[j] = 0.0;\n }\n }\n for (size_t j = tid; j < sizeA; j += stride) {\n if (A[j].row == i && A[j].column > k) {\n U[j] -= L[j] * U[A[j].column];\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.5271523426, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 3.046237890625 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else {\n L[tid] = 0.0;\n U[tid] = 0.0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.4828862244, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00015065540429 + } + ] + }, + { + "generated_output": "\n\n int tid = 
blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t i = A[tid].row;\n size_t j = A[tid].column;\n double v = A[tid].value;\n\n if (i == j) {\n L[i * N + i] = 1.0;\n U[i * N + j] = v;\n } else if (i < j) {\n L[i * N + j] = v;\n } else {\n U[i * N + j] = v;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.5143683184, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00015641560033 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[col * N + row];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n\n if (row == column) {\n L[row * N + row] = 1.0;\n U[row * N + column] = value;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n U[row * N + column] = 0.0;\n } else {\n U[row * N + column] -= L[row * N + row] * value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.4857771188, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00015507160574 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n if (A[tid].row == A[tid].column) {\n L[tid] = 1;\n U[tid] = A[tid].value;\n } else if (A[tid].row < A[tid].column) {\n L[tid] = 0;\n U[tid] = A[tid].value;\n } else {\n L[tid] = A[tid].value / U[A[tid].column];\n U[tid] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.4756149238, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00015286339913 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n const COOElement &a = A[tid];\n if (a.row == a.column) {\n L[a.row * N + a.column] = 1.0;\n U[a.row * N + a.column] = a.value;\n } else if (a.row < a.column) {\n L[a.row * N + a.column] = 0.0;\n U[a.row * N + a.column] = a.value;\n } else {\n L[a.row * N + a.column] = a.value / U[a.column * N + a.column];\n U[a.row * N + a.column] = 0.0;\n for (int k = a.column + 1; k < a.row; k++) {\n U[a.row * N + k] -= L[a.row * N + a.column] * U[a.column * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.4982896864, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00015865560286 + } + ] + }, + { + "generated_output": "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n size_t col = blockIdx.y * blockDim.y + threadIdx.y;\n 
if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row].value;\n } else if (row < col) {\n L[row * N + col] = 0;\n U[row * N + col] = A[row].value;\n } else {\n L[row * N + col] = A[row].value / U[col * N + col];\n U[row * N + col] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.503998334, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00015430359542 + } + ] + }, + { + "generated_output": "\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= sizeA) return;\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = 0.0;\n U[i] = 0.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.4916985792, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00014943960533 + } + ] + }, + { + "generated_output": "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n U[row * N + col] = 0.0;\n }\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column == col) {\n U[row * N + col] = A[i].value;\n } else if (A[i].row == row) {\n L[row * N + col] -= A[i].value * U[A[i].column * N + col];\n } else if (A[i].column == col) {\n U[row * N + col] -= A[i].value * L[row * N + A[i].row];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.4900187472, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00751640329361 + } + ] + }, + { + "generated_output": "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n }\n U[row * N + col] = A[row * N + col].value;\n }\n __syncthreads();\n for (int k = 0; k < row; k++) {\n if (U[k * N + col] != 0.0) {\n L[row * N + k] = A[row * N + k].value / U[k * N + col];\n for (int j = col; j < N; j++) {\n U[row * N + j] -= L[row * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n if (A[tid].row == A[tid].column) {\n L[tid * N + A[tid].column] = 1;\n U[tid * N + A[tid].column] = A[tid].value;\n } else {\n L[tid * N + A[tid].column] = 0;\n U[tid * N + A[tid].column] = A[tid].value;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + 
"generated_output": "\n\n int tid = threadIdx.x;\n int i = A[tid].row;\n int j = A[tid].column;\n double v = A[tid].value;\n if (i == j) {\n L[i * N + i] = 1;\n U[i * N + i] = v;\n } else if (i < j) {\n L[i * N + j] = v / U[j * N + j];\n U[i * N + j] = 0;\n } else {\n U[i * N + j] -= L[i * N + i] * v;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) {\n return;\n }\n const COOElement &a = A[tid];\n if (a.row == a.column) {\n L[a.row * N + a.column] = 1.0;\n U[a.row * N + a.column] = a.value;\n } else if (a.row > a.column) {\n L[a.row * N + a.column] = a.value / U[a.column * N + a.column];\n U[a.row * N + a.column] = 0.0;\n } else {\n U[a.row * N + a.column] = a.value;\n L[a.row * N + a.column] = 0.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.4953199172, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00045276761614 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else if (row < column) {\n L[tid] = 0.0;\n U[tid] = value;\n } else {\n L[tid] = value / U[column * N + column];\n U[tid] = 0.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.539659532, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00014745559841 + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
Store the result in L and U on rank 0.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Find the pivot element in column k\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k * N + k].value);\n for (size_t i = k + 1; i < N; ++i) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows k and pivot_row in L and U\n for (size_t j = 0; j < N; ++j) {\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n std::swap(U[k * N + j], U[pivot_row * N + j]);\n }\n\n // Update the remaining rows in column k\n for (size_t i = k + 1; i < N; ++i) {\n double multiplier = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j].value -= multiplier * A[k * N + j].value;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector L_recv(N * N, 0.0);\n std::vector U_recv(N * N, 0.0);\n MPI::COMM_WORLD.Recv(L_recv.data(), N * N, MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), N * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; ++j) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), N * N, MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector L_recv, U_recv;\n MPI::COMM_WORLD.Recv(L_recv.data(), L_recv.size(), MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), U_recv.size(), MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; j++) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), L.size(), MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), U.size(), MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Compute L and U in parallel\n for (size_t i = 0; i < N; ++i) {\n // Compute L(i, :)\n for (size_t j = 0; j < i; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < j; ++k) {\n sum += L[i * N + k] * U[k * N + j];\n }\n L[i * N + j] = (A[i].value - sum) / U[j * N + j];\n }\n\n // Compute U(i, :)\n for (size_t j = i; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[i * N + k] * U[k * N + j];\n }\n U[i * N + j] = A[i].value - sum;\n }\n }\n\n // Gather L and U on rank 0\n if (MPI::COMM_WORLD.Get_rank() 
== 0) {\n MPI::COMM_WORLD.Gather(&L[0], N * N, MPI::DOUBLE, &L[0], N * N, MPI::DOUBLE, 0);\n MPI::COMM_WORLD.Gather(&U[0], N * N, MPI::DOUBLE, &U[0], N * N, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j].value, A[pivot_row * N + j].value);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Eliminate the elements below the pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = factor;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= factor * A[k * N + j].value;\n }\n }\n }\n\n // Copy the upper triangular matrix to U\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i; j < N; j++) {\n U[i * N + j] = A[i * N + j].value;\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A and store the result in L\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k; j < N; ++j) {\n L[i * N + k] -= U[i * N + j] * U[k * N + j];\n }\n }\n\n // For each column j > k, subtract the k-th column of L from the j-th column of A and store the result in U\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = k; i < N; ++i) {\n U[k * N + j] -= L[i * N + k] * U[i * N + j];\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A\n for (size_t i = k + 1; i < N; i++) {\n double factor = U[i * N + k] / U[k * N + k];\n for (size_t j = k; j < N; j++) {\n U[i * N + j] -= factor * U[k * N + j];\n }\n L[i * N + k] = factor;\n }\n }\n\n // Gather the L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n std::vector L(A);\n std::vector U(A);\n\n // Perform Gaussian elimination\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the pivot row\n MPI_Bcast(&A[k], sizeof(COOElement) * A[k].size(), MPI_BYTE, k, 
MPI_COMM_WORLD);\n\n // Eliminate the pivot column in the remaining rows\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n double factor = A[i][k].value / A[k][k].value;\n for (size_t j = k; j < N; ++j) {\n A[i][j].value -= factor * A[k][j].value;\n }\n }\n\n // Update L and U\n L[k].value = 1.0;\n for (size_t j = k + 1; j < N; ++j) {\n L[j].value = A[j][k].value;\n U[k].value = A[k][j].value;\n }\n }\n\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A\n for (size_t i = k + 1; i < N; ++i) {\n double factor = U[i * N + k] / U[k * N + k];\n for (size_t j = k; j < N; ++j) {\n U[i * N + j] -= factor * U[k * N + j];\n }\n L[i * N + k] = factor;\n }\n }\n\n // Gather the L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot row\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (A[i * N + k] > A[pivot_row * N + k]) {\n pivot_row = i;\n }\n }\n\n // Swap the pivot row with the k-th row in L and U\n for (size_t j = 0; j < N; j++) {\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n std::swap(U[k * N + j], U[pivot_row * N + j]);\n }\n\n // Eliminate the k-th column in the remaining rows\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n L[i * N + k] = factor;\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(A.size());\n U.resize(A.size());\n for (size_t i = 0; i < A.size(); ++i) {\n L[i].row = A[i].row;\n L[i].column = A[i].column;\n L[i].value = 0;\n U[i].row = A[i].row;\n U[i].column = A[i].column;\n U[i].value = 
0;\n }\n\n // Perform LU factorization\n for (size_t i = 0; i < N; ++i) {\n // Update L\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i && A[j].column < i) {\n L[j].value = A[j].value / U[j].value;\n }\n }\n\n // Update U\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row < i && A[j].column == i) {\n U[j].value = A[j].value / L[j].value;\n }\n }\n\n // Update A\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row >= i && A[j].column >= i) {\n A[j].value -= L[j].value * U[j].value;\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recvL(A.size());\n std::vector recvU(A.size());\n MPI::COMM_WORLD.Recv(recvL.data(), A.size(), MPI_DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(recvU.data(), A.size(), MPI_DOUBLE, i, 0);\n for (size_t j = 0; j < A.size(); ++j) {\n L[j].value += recvL[j].value;\n U[j].value += recvU[j].value;\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), A.size(), MPI_DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), A.size(), MPI_DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < A.size(); i++) {\n if (std::abs(A[i].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i].value);\n }\n }\n\n // Swap the pivot row with the k-th row in A\n std::swap(A[k], A[pivot_row]);\n\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n double multiplier = A[i].value / A[k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; j++) {\n U[k * N + j] -= multiplier * A[i].value;\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n\n // Initialize the diagonal of L to 1\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Iterate over the rows of A\n for (size_t k = 0; k < A.size(); k++) {\n // Get the row, column, and value of the current element\n size_t row = A[k].row;\n size_t column = A[k].column;\n double value = A[k].value;\n\n // Update the elements of L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = value / U[k * N + k];\n U[i * N + column] -= L[i * N + k] * U[k * N + column];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i > k\n for (size_t i = k + 1; i < N; i++) {\n // Calculate the multiplier\n double multiplier = A[i].value / U[k * N + k];\n\n // Update the i-th row of L\n for (size_t j = 0; j < N; j++) {\n L[i * N + j] += multiplier * U[k * N + j];\n }\n\n // Update the i-th row of U\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= multiplier * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform 
LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = A[k].value;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > std::abs(pivot_value)) {\n pivot_row = i;\n pivot_value = A[i * N + k].value;\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j].value, A[pivot_row * N + j].value);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Eliminate the elements below the pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k].value / pivot_value;\n L[i * N + k] = factor;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= factor * A[k * N + j].value;\n U[i * N + j] = A[i * N + j].value;\n }\n }\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector L_recv(N * N, 0.0);\n std::vector U_recv(N * N, 0.0);\n MPI::COMM_WORLD.Recv(L_recv.data(), N * N, MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), N * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; j++) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), N * N, MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each rank, update the k-th row of L and the remaining rows of U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n\n // Gather the updated L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each rank, update the k-th row of L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "<,", + "_t* 0." + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format with sizeA elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\n__global__ void luFactorize(const COOElement *A, size_t sizeA, double *L, double *U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[row * N + col].value;\n for (int k = 0; k < row; k++) {\n U[row * N + col] -= L[row * N + k] * U[k * N + col];\n }\n L[row * N + col] = U[row * N + col] / U[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < i; j++) {\n sum += L[row * N + j] * U[j * N + A[i].column];\n }\n U[row * N + A[i].column] = A[i].value - sum;\n }\n if (A[i].column == row) {\n L[row * N + A[i].row] = A[i].value;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else if (row < column) {\n L[tid] = 0.0;\n U[tid] = value;\n } else {\n L[tid] = value / U[column * N + column];\n U[tid] = 0.0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + column] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) {\n return;\n }\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n }\n }\n}\n", + "\n\n int row = block.y * block.x + thread.y * N + thread.x;\n if (row < sizeA) {\n if (A[row].row == A[row].column) {\n L[A[row].row * N + A[row].column] = 1.0;\n U[A[row].row * N + A[row].column] = A[row].value;\n } else {\n L[A[row].row * N + A[row].column] = A[row].value / U[A[row].column * N + A[row].column];\n U[A[row].row * N + A[row].column] = 0.0;\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n double sum = 0.0;\n for (int k = col; k < N; k++) {\n sum += L[row * N + k] * U[k * N + col];\n }\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < 
sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row < col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n for (size_t k = col + 1; k < N; k++) {\n U[row * N + k] -= L[row * N + col] * U[col * N + k];\n }\n }\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n double sum = 0.0;\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row && A[k].column == col) {\n sum += A[k].value * U[col * N + A[k].column];\n }\n }\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row >= N || col >= N) return;\n\n double sum = 0;\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row && A[k].column < col) {\n sum += L[row * N + A[k].column] * U[A[k].column * N + col];\n }\n }\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row * N + col] - sum;\n } else {\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n U[row * N + col] = 0;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n }\n U[row * N + col] = A[row * N + col].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n } else if (row < column) {\n L[tid] = 0.0;\n } else {\n U[tid] = value / L[column * N + column];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0;\n U[row * N + col] = A[row * N + 
col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n U[row * N + column] = value;\n } else if (row < column) {\n L[row * N + column] = value;\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < sizeA; ++i) {\n const auto row = A[i].row;\n const auto column = A[i].column;\n const auto value = A[i].value;\n if (row == column) {\n L[row * N + column] = 1;\n } else if (row < column) {\n U[row * N + column] = value;\n } else {\n L[row * N + column] = value;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. Store the result in L and U on rank 0.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n 
// Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Initialize the diagonal elements of L to 1\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update the k-th row of L and U\n #pragma omp parallel for\n for (size_t j = k + 1; j < N; j++) {\n L[k * N + j] = A[k].value / U[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n U[i * N + j] -= L[k * N + j] * A[i].value;\n }\n }\n\n // Update the remaining rows of L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n for (size_t j = k + 1; j < N; j++) {\n L[i * N + j] -= L[i * N + k] * L[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Factorize A\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n 
L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n A[i].value -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Update U\n #pragma omp parallel for\n for (size_t j = k + 1; j < N; ++j) {\n U[k * N + j] = A[k].value / L[k * N + k];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; ++j) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform Gauss elimination\n for (size_t k = 0; k < N; ++k) {\n // Rank 0 broadcasts the k-th row of U to all other ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 updates the k-th row of L\n for (size_t j = k + 1; j < N; ++j) {\n L[k * N + j] = A[j].value / U[k * N + k];\n }\n\n // All ranks update 
the remaining rows of U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[k * N + j] * U[k * N + i];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k * N + k].value);\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n double multiplier = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= multiplier * A[k * N + j].value;\n U[k * N + j].value += multiplier * A[i * N + j].value;\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < N; ++i) {\n if (std::abs(A[i].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i].value);\n }\n }\n\n // Swap the pivot row with the k-th row in A, L, and U\n if (pivot_row != k) {\n std::swap(A[k], A[pivot_row]);\n for (size_t i = 0; i < N; ++i) {\n std::swap(L[k * N + i], L[pivot_row * N + i]);\n std::swap(U[k * N + i], U[pivot_row * N + i]);\n }\n }\n\n // Eliminate the elements below the pivot element\n for (size_t i = k + 1; i < N; ++i) {\n double multiplier = A[i].value / A[k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; ++j) {\n A[i].value -= multiplier * A[k].value;\n U[i * N + j] -= multiplier * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 
0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, Kokkos::View &x, Kokkos::View &y, Kokkos::View &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x[j].index == i) sum += alpha * x[j].value;\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y[j].index == i) sum += y[j].value;\n }\n z[i] = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA (const int i) {\n z[i] = alpha * x[i].value + y[i].value;\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(\"sparse_py\", 0, z.extent(0), KOKKOK_LAMBDA(size_t i) {\n z[i] = alpha * x[i].value + y[i].value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n sum += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n sum += y[j].value;\n }\n }\n z[i] = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) sum += alpha * x(j).value;\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) sum += y(j).value;\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n // Create a Kokkos kernel to perform the sparse axpy operation\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum 
= 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x(j).index == i) sum += alpha * x(j).value;\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y(j).index == i) sum += y(j).value;\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy<>(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i, double &sum) {\n sum += alpha * x(i).value + y(i).value;\n }, z(0));\n}\n", + "\n\n const size_t num_elements_x = x.extent(0);\n const size_t num_elements_y = y.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, num_elements_x + num_elements_y),\n KOKKOS_LAMBDA(const size_t i) {\n if (i < num_elements_x) {\n const size_t index = x(i).index;\n z(index) += alpha * x(i).value;\n } else {\n const size_t index = y(i - num_elements_x).index;\n z(index) += y(i - num_elements_x).value;\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i, double &sum) {\n sum += alpha * x(i).value + y(i).value;\n }, z(0));\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. 
Store the result in z.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t n = x.size();\n size_t m = y.size();\n z.resize(std::max(n, m));\n #pragma omp parallel for\n for (size_t i = 0; i < std::max(n, m); i++) {\n z[i] = 0;\n }\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n z[x[i].index] += alpha * x[i].value;\n }\n #pragma omp parallel for\n for (size_t i = 0; i < m; i++) {\n z[y[i].index] += y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n 
}\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "serial", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. 
Store the result in z.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0, j = 0; i < x.size() || j < y.size();) {\n if (j == y.size() || (i < x.size() && x[i].index < y[j].index)) {\n z[x[i].index] += alpha * x[i].value;\n i++;\n } else if (i == x.size() || (j < y.size() && y[j].index < x[i].index)) {\n z[y[j].index] += y[j].value;\n j++;\n } else {\n z[x[i].index] += alpha * x[i].value + y[j].value;\n i++;\n j++;\n }\n }\n}\n", + "\n\n z.clear();\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z.push_back(alpha * x[i].value);\n i++;\n } else {\n z.push_back(y[j].value);\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(alpha * x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n z.clear();\n z.resize(x.size() + y.size());\n auto itx = x.begin();\n auto ity = y.begin();\n auto itz = z.begin();\n while (itx != x.end() && ity != y.end()) {\n if (itx->index < ity->index) {\n *itz = itx->value;\n ++itx;\n } else if (itx->index > ity->index) {\n *itz = ity->value;\n ++ity;\n } else {\n *itz = alpha * itx->value + ity->value;\n ++itx;\n ++ity;\n }\n ++itz;\n }\n while (itx != x.end()) {\n *itz = itx->value;\n ++itx;\n ++itz;\n }\n while (ity != y.end()) {\n *itz = ity->value;\n ++ity;\n ++itz;\n }\n}\n", + ".", + "\n\n // Initialize z with zeros\n z.assign(z.size(), 0);\n\n // Iterate over x and y\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n // If the indices match, add the values and increment both i and j\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n }\n // If the index of x is smaller, add the value to z and increment i\n else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n // If the index of y is smaller, add the value to z and increment j\n else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n\n // Add the remaining elements of x\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n\n // Add the remaining elements of y\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n z.clear();\n z.resize(x.size() + y.size());\n size_t i = 0, j = 0, k = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z[k].index = x[i].index;\n z[k].value = alpha * x[i].value;\n ++i;\n } else if (x[i].index > y[j].index) {\n z[k].index = y[j].index;\n z[k].value = y[j].value;\n ++j;\n } else {\n z[k].index = x[i].index;\n z[k].value = alpha * x[i].value + y[j].value;\n ++i;\n ++j;\n }\n ++k;\n }\n while (i < x.size()) {\n z[k].index = x[i].index;\n 
z[k].value = alpha * x[i].value;\n ++i;\n ++k;\n }\n while (j < y.size()) {\n z[k].index = y[j].index;\n z[k].value = y[j].value;\n ++j;\n ++k;\n }\n z.resize(k);\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n size_t m = z.size();\n for (size_t i = 0, j = 0, k = 0; i < m; ++i) {\n if (j < x.size() && x[j].index == i) {\n z[i] += alpha * x[j].value;\n ++j;\n }\n if (k < y.size() && y[k].index == i) {\n z[i] += y[k].value;\n ++k;\n }\n }\n}\n", + "\n\n for (size_t i = 0, j = 0; i < x.size() || j < y.size();) {\n if (j == y.size() || (i < x.size() && x[i].index < y[j].index)) {\n z[x[i].index] += alpha * x[i].value;\n ++i;\n } else if (i == x.size() || (j < y.size() && y[j].index < x[i].index)) {\n z[y[j].index] += y[j].value;\n ++j;\n } else {\n z[x[i].index] += alpha * x[i].value + y[j].value;\n ++i;\n ++j;\n }\n }\n}\n", + "\n\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n } else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n i++;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n j++;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n z.clear();\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z.push_back({x[i].index, alpha * x[i].value + y[j].value});\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z.push_back({x[i].index, alpha * x[i].value});\n i++;\n } else {\n z.push_back({y[j].index, y[j].value});\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back({x[i].index, alpha * x[i].value});\n i++;\n }\n while (j < y.size()) {\n z.push_back({y[j].index, y[j].value});\n j++;\n }\n}\n", + "\n\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n } else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n i++;\n } else if (x[i].index > y[j].index) {\n 
z.push_back(y[j].value);\n j++;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n size_t x_size = x.size();\n size_t y_size = y.size();\n size_t z_size = 0;\n size_t x_index = 0;\n size_t y_index = 0;\n while (x_index < x_size && y_index < y_size) {\n if (x[x_index].index == y[y_index].index) {\n z.push_back(alpha * x[x_index].value + y[y_index].value);\n z_size++;\n x_index++;\n y_index++;\n } else if (x[x_index].index < y[y_index].index) {\n z.push_back(alpha * x[x_index].value);\n z_size++;\n x_index++;\n } else {\n z.push_back(y[y_index].value);\n z_size++;\n y_index++;\n }\n }\n while (x_index < x_size) {\n z.push_back(alpha * x[x_index].value);\n z_size++;\n x_index++;\n }\n while (y_index < y_size) {\n z.push_back(y[y_index].value);\n z_size++;\n y_index++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n auto itx = x.begin();\n auto ity = y.begin();\n while (itx != x.end() && ity != y.end()) {\n if (itx->index == ity->index) {\n z.push_back(alpha * itx->value + ity->value);\n ++itx;\n ++ity;\n } else if (itx->index < ity->index) {\n z.push_back(alpha * itx->value);\n ++itx;\n } else {\n z.push_back(ity->value);\n ++ity;\n }\n }\n while (itx != x.end()) {\n z.push_back(alpha * itx->value);\n ++itx;\n }\n while (ity != y.end()) {\n z.push_back(ity->value);\n ++ity;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "hip", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors of size Nx and Ny. Store the result in z.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x or y.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\n__global__ void sparseAxpy(double alpha, const Element *x, const Element *y, double *z, size_t Nx, size_t Ny, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < Nx) {\n size_t index = x[tid].index;\n z[index] = alpha * x[tid].value + y[index];\n } else if (tid < Nx + Ny) {\n size_t index = y[tid - Nx].index;\n z[index] = alpha * 0 + y[tid - Nx].value;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = 0;\n\t\tfor (size_t j = 0; j < Nx; j++) {\n\t\t\tif (x[j].index == i) {\n\t\t\t\tz[i] += alpha * x[j].value;\n\t\t\t}\n\t\t}\n\t\tfor (size_t j = 0; j < Ny; j++) {\n\t\t\tif (y[j].index == i) {\n\t\t\t\tz[i] += y[j].value;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n int i;\n for (i = tid; i < N; i += blockDim.x * gridDim.x) {\n if (i < Nx) {\n z[x[i].index] += alpha * x[i].value;\n }\n if (i < Ny) {\n z[y[i].index] += y[i].value;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": 
true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double val = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) val += alpha * x[i].value;\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) val += y[i].value;\n }\n z[tid] = val;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double val = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n val += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n val += y[i].value;\n }\n }\n z[tid] = val;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = alpha * x[i].value + y[i].value;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < Nx) {\n\t\tz[x[tid].index] += alpha * x[tid].value;\n\t}\n\telse if (tid < Nx + Ny) {\n\t\tz[y[tid - Nx].index] += y[tid - Nx].value;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + 
"did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) z[i] += alpha * x[j].value;\n }\n for (size_t j = 0; j < N; j++) {\n if (y[j].index == i) z[i] += y[j].value;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n\tsize_t tid = blockIdx.x * blockDim.x + 
threadIdx.x;\n\tif (tid < N) {\n\t\tdouble sum = 0;\n\t\tfor (size_t i = 0; i < Nx; i++) {\n\t\t\tif (x[i].index == tid) {\n\t\t\t\tsum += alpha * x[i].value;\n\t\t\t}\n\t\t}\n\t\tfor (size_t i = 0; i < Ny; i++) {\n\t\t\tif (y[i].index == tid) {\n\t\t\t\tsum += y[i].value;\n\t\t\t}\n\t\t}\n\t\tz[tid] = sum;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Store the result in z on rank 0.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n \n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& e : x) {\n z_size = std::max(z_size, e.index + 1);\n }\n for (auto const& e : y) {\n z_size = std::max(z_size, e.index + 1);\n }\n \n // Allocate memory for the output vector\n z.resize(z_size);\n \n // Compute the output vector on rank 0\n if (rank == 0) {\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n }\n \n // Broadcast the output vector to all other ranks\n MPI_Bcast(&z[0], z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the partial sums\n std::vector partial_sums(z_size);\n for (auto const& element : x) {\n partial_sums[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_sums[element.index] += element.value;\n }\n\n // Reduce the partial sums\n std::vector global_sums(z_size);\n MPI_Allreduce(partial_sums.data(), 
global_sums.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (my_rank == 0) {\n z = global_sums;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the size of the result vector\n size_t zSize = 0;\n for (auto const& element : x) {\n zSize = std::max(zSize, element.index + 1);\n }\n for (auto const& element : y) {\n zSize = std::max(zSize, element.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(zSize);\n\n // Compute the result vector\n for (auto const& element : x) {\n z[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n z[element.index] += element.value;\n }\n\n // Gather the result vector on rank 0\n if (myRank == 0) {\n for (int i = 1; i < numRanks; ++i) {\n std::vector recvBuffer;\n MPI_Recv(&recvBuffer[0], recvBuffer.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recvBuffer.size(); ++j) {\n z[j] += recvBuffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n std::vector local_x, local_y;\n \n // Scatter x and y to all ranks\n MPI_Scatter(&x[0], x.size(), MPI_DOUBLE, &local_x[0], local_x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(&y[0], y.size(), MPI_DOUBLE, &local_y[0], local_y.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n \n // Compute local z\n std::vector local_z(z.size(), 0);\n for (size_t i = 0; i < local_x.size(); ++i) {\n local_z[local_x[i].index] += alpha * local_x[i].value;\n }\n for (size_t i = 0; i < local_y.size(); ++i) {\n local_z[local_y[i].index] += local_y[i].value;\n }\n \n // Gather local z to rank 0\n MPI_Gather(&local_z[0], local_z.size(), MPI_DOUBLE, &z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of elements in the sparse vector\n size_t num_elements = x.size();\n\n // Create a vector of zeros to store the result\n z.resize(num_elements);\n\n // Loop over the elements in the sparse vector\n for (size_t i = 0; i < num_elements; i++) {\n // Get the index and value of the element\n size_t index = x[i].index;\n double value = x[i].value;\n\n // Add the value of the element to the result\n z[index] += alpha * value;\n }\n\n // Loop over the elements in the sparse vector\n for (size_t i = 0; i < num_elements; i++) {\n // Get the index and value of the element\n size_t index = y[i].index;\n double value = y[i].value;\n\n // Add the value of the element to the result\n z[index] += value;\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements = x.size();\n\n // Create a buffer to store the result on each rank\n std::vector local_z(num_elements, 0.0);\n\n // Compute the local result\n for (size_t i = 0; i < num_elements; ++i) {\n local_z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_z.data(), z.data(), num_elements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n std::vector result;\n\n // Iterate over the elements in x and y\n for (size_t i = 0; i < num_elements_x; i++) {\n // Get the index and value of the current element in x\n size_t index_x = x[i].index;\n double value_x = x[i].value;\n\n 
// Iterate over the elements in y\n for (size_t j = 0; j < num_elements_y; j++) {\n // Get the index and value of the current element in y\n size_t index_y = y[j].index;\n double value_y = y[j].value;\n\n // If the indices of the current elements in x and y are equal, then add the values of the elements to the result\n if (index_x == index_y) {\n result.push_back({index_x, alpha * value_x + value_y});\n }\n }\n }\n\n // Sort the result vector by index\n std::sort(result.begin(), result.end(), [](Element const& a, Element const& b) { return a.index < b.index; });\n\n // Store the result in z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(result.size());\n for (size_t i = 0; i < result.size(); i++) {\n z[i] = result[i].value;\n }\n }\n}\n", + "\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the output vector\n z.resize(z_size, 0.0);\n\n // Compute the partial sums\n std::vector partial_sums(z_size, 0.0);\n for (auto const& element : x) {\n partial_sums[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_sums[element.index] += element.value;\n }\n\n // Reduce the partial sums\n std::vector global_sums(z_size, 0.0);\n MPI_Allreduce(partial_sums.data(), global_sums.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result in z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t resultSize = 0;\n for (auto const& element : x) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n for (auto const& element : y) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n\n // Create a vector to store the result on rank 0\n std::vector result(resultSize, 0);\n\n // Compute the result on each rank\n for (auto const& elementX : x) {\n result[elementX.index] += alpha * elementX.value;\n }\n for (auto const& elementY : y) {\n result[elementY.index] += elementY.value;\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recvBuffer(resultSize);\n MPI_Recv(recvBuffer.data(), resultSize, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < resultSize; j++) {\n result[j] += recvBuffer[j];\n }\n }\n z = result;\n } else {\n MPI_Send(result.data(), resultSize, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t result_size = 0;\n for (auto const& element : x) {\n result_size = std::max(result_size, element.index + 1);\n }\n for (auto const& element : y) {\n result_size = std::max(result_size, element.index + 1);\n }\n\n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(result_size, 0);\n }\n\n // Compute the partial result on each rank\n std::vector partial_result(result_size, 0);\n for (auto const& element : x) {\n partial_result[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_result[element.index] += element.value;\n }\n\n // Reduce the partial results to rank 0\n MPI_Reduce(partial_result.data(), z.data(), result_size, MPI_DOUBLE, 
MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the partial sum on each rank\n for (auto const& elem : x) {\n z[elem.index] += alpha * elem.value;\n }\n for (auto const& elem : y) {\n z[elem.index] += elem.value;\n }\n\n // Reduce the partial sums to rank 0\n MPI_Reduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the number of elements in the result vector\n size_t num_elements = x.size() + y.size();\n \n // Allocate memory for the result vector\n z.resize(num_elements);\n \n // Compute the local indices of the elements in the result vector\n std::vector local_indices(num_elements);\n for (size_t i = 0; i < num_elements; ++i) {\n local_indices[i] = i;\n }\n \n // Compute the local values of the elements in the result vector\n std::vector local_values(num_elements);\n for (size_t i = 0; i < num_elements; ++i) {\n local_values[i] = 0.0;\n }\n \n // Compute the global indices of the elements in the result vector\n std::vector global_indices(num_elements);\n MPI_Allgatherv(&local_indices[0], local_indices.size(), MPI_UNSIGNED_LONG, &global_indices[0], nullptr, nullptr, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n \n // Compute the global values of the elements in the result vector\n std::vector global_values(num_elements);\n MPI_Allgatherv(&local_values[0], local_values.size(), MPI_DOUBLE, &global_values[0], nullptr, nullptr, MPI_DOUBLE, MPI_COMM_WORLD);\n \n // Compute the final values of the elements in the result vector\n for (size_t i = 0; i < num_elements; ++i) {\n z[global_indices[i]] = alpha * x[global_indices[i]].value + y[global_indices[i]].value;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n \n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(z_size, 0);\n }\n \n // Compute the partial sums on each rank\n for (auto const& element : x) {\n if (rank == 0) {\n z[element.index] += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (rank == 0) {\n z[element.index] += element.value;\n }\n }\n \n // Reduce the partial sums to rank 0\n MPI_Reduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t result_size = 0;\n for (auto const& element : x) {\n result_size = std::max(result_size, element.index + 1);\n }\n for (auto const& element : y) {\n result_size = std::max(result_size, element.index + 1);\n }\n\n // Create a vector to store the result on rank 0\n std::vector result(result_size, 0.0);\n\n // Compute the result on each rank\n for (auto const& element : x) {\n result[element.index] += alpha * element.value;\n }\n for (auto 
const& element : y) {\n result[element.index] += element.value;\n }\n\n // Gather the results from all ranks on rank 0\n MPI_Gather(result.data(), result_size, MPI_DOUBLE, z.data(), result_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n int num_elements = x.size();\n\n // Create a buffer to store the results\n std::vector z(num_elements);\n\n // Compute the results in parallel\n #pragma omp parallel for\n for (int i = 0; i < num_elements; i++) {\n z[i] = alpha * x[i].value * y[i].value;\n }\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = x.size() + y.size();\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the local size of the output vector\n size_t local_z_size = z_size / size;\n\n // Compute the starting index of the local output vector\n size_t local_z_start = rank * local_z_size;\n\n // Compute the local output vector\n for (size_t i = local_z_start; i < local_z_start + local_z_size; ++i) {\n z[i] = 0;\n }\n\n // Compute the local contribution to the output vector\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value;\n }\n\n for (size_t i = 0; i < y.size(); ++i) {\n z[y[i].index] += y[i].value;\n }\n\n // Reduce the local output vectors to the global output vector\n MPI_Allreduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 stores the result\n if (rank == 0) {\n // Sort the output vector by index\n std::sort(z.begin(), z.end(), [](const double& a, const double& b) { return a.index < b.index; });\n\n // Remove duplicate elements from the output vector\n z.erase(std::unique(z.begin(), z.end()), z.end());\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the size of the result vector\n size_t resultSize = 0;\n for (auto const& element : x) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n for (auto const& element : y) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n\n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(resultSize, 0);\n }\n\n // Compute the partial sums on each rank\n for (auto const& element : x) {\n if (rank == 0) {\n z[element.index] += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (rank == 0) {\n z[element.index] += element.value;\n }\n }\n\n // Reduce the partial sums to rank 0\n MPI_Allreduce(MPI_IN_PLACE, z.data(), resultSize, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a new vector z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(&z[0], z.size(), MPI::DOUBLE, &z[0], z.size(), MPI::DOUBLE, 0);\n}\n", + "\n\n // Create a new vector z with the same size as x\n z.resize(x.size());\n\n // Loop over the elements of x and y and perform the operation z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[i].index = x[i].index;\n z[i].value = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks into rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a new vector z on rank 0\n if 
(MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(&z[0], z.size(), MPI::DOUBLE, &z[0], z.size(), MPI::DOUBLE, 0);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "cuda", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors of size Nx and Ny. Store the result in z.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x or y.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\n__global__ void sparseAxpy(double alpha, const Element *x, const Element *y, double *z, size_t Nx, size_t Ny, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) sum += alpha * x[j].value;\n }\n for (size_t j = 0; j < Ny; j++) {\n if (y[j].index == i) sum += y[j].value;\n }\n z[i] = sum;\n }\n}\n", + "sparseSparseArray(1,", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) { sum += alpha * x[i].value; }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) { sum += y[i].value; }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = alpha * x[i].value + y[i].value;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n\tsize_t i = threadIdx.x;\n\tif (i < Nx) {\n\t\tz[x[i].index] += alpha * x[i].value;\n\t}\n\tif (i < Ny) {\n\t\tz[y[i].index] += y[i].value;\n\t}\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] 
= alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tz[tid].value = alpha * x[tid].value + y[tid].value;\n\t}\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n size_t i = tid;\n double sum = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) {\n sum += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < Ny; j++) {\n if (y[j].index == i) {\n sum += y[j].value;\n }\n }\n z[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) sum += alpha * x[i].value;\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) sum += y[i].value;\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == idx) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == idx) {\n sum += y[i].value;\n }\n }\n z[idx] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < Nx) {\n atomicAdd(&z[x[tid].index], alpha * x[tid].value);\n } else if (tid < Nx + Ny) {\n atomicAdd(&z[y[tid - Nx].index], y[tid - Nx].value);\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the output vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the corresponding element in x\n for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n break;\n }\n }\n\n // Find the corresponding element in y\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n break;\n }\n }\n\n // Compute the output value\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the output vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buffer;\n MPI_Recv(&recv_buffer[0], recv_buffer.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_buffer.size(); ++j) {\n z[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n }\n }\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n }\n }\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the result vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buffer;\n MPI_Recv(&recv_buffer[0], z_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < z_size; ++j) {\n z[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n z.resize(num_elements_x + num_elements_y);\n\n // Loop over the elements in x and y and add them to the result\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x + num_elements_y; i++) {\n if (i < num_elements_x) {\n z[i] += alpha * x[i].value;\n }\n if (i < num_elements_y) {\n z[i] += y[i].value;\n }\n }\n\n // Reduce the result to rank 0\n MPI_Reduce(&z[0], z.size(), MPI_DOUBLE, &z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize z on rank 0\n if (rank == 0) {\n z.resize(x.size());\n }\n\n // Compute the result in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (rank == 0) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n }\n\n // Gather the results to rank 0\n MPI_Gather(z.data(), x.size(), MPI_DOUBLE, z.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n z.resize(num_elements_x + num_elements_y);\n\n // Perform the sparse axpy operation in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x + num_elements_y; ++i) {\n // Initialize the result to 0\n z[i] = 0.0;\n\n // Add the corresponding elements from x and y\n for (size_t j = 0; j < num_elements_x; ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < num_elements_y; ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Initialize the output vector\n z.resize(z_size, 0.0);\n\n // Compute the partial sums\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value;\n }\n\n #pragma omp parallel for\n for (size_t i = 0; i < y.size(); ++i) {\n z[y[i].index] += y[i].value;\n }\n\n // Reduce the partial sums\n MPI_Allreduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a temporary vector to store the partial results on each rank\n std::vector partial_z(z.size(), 0.0);\n\n // Compute the partial results in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n for (auto& element : x) {\n if (element.index == i) {\n partial_z[i] += alpha * element.value;\n }\n }\n for (auto& element : y) {\n if (element.index == i) {\n partial_z[i] += element.value;\n }\n }\n }\n\n // Reduce the partial results to rank 0 using MPI\n MPI_Reduce(partial_z.data(), z.data(), z.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = x.size() + y.size();\n\n // Allocate the output vector on rank 0\n if (rank == 0) {\n z.resize(z_size);\n }\n\n // Compute the local size of the output vector\n size_t local_z_size = (z_size + size - 1) / size;\n\n // Allocate the local output vector\n std::vector local_z(local_z_size);\n\n // Compute the local indices of the output vector\n std::vector local_z_indices(local_z_size);\n for (size_t i = 0; i < local_z_size; i++) {\n local_z_indices[i] = i + rank * local_z_size;\n }\n\n // Compute the local contributions to the output vector\n #pragma omp parallel for\n for (size_t i = 0; i < local_z_size; i++) {\n size_t index = local_z_indices[i];\n double value = 0.0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == index) {\n value += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j 
< y.size(); j++) {\n if (y[j].index == index) {\n value += y[j].value;\n }\n }\n local_z[i] = value;\n }\n\n // Gather the local contributions to the output vector on rank 0\n MPI_Gather(local_z.data(), local_z_size, MPI_DOUBLE, z.data(), local_z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements = x.size() + y.size();\n\n // Create a vector to store the result\n std::vector result(num_elements);\n\n // Merge the two sparse vectors into the result vector\n size_t i = 0, j = 0, k = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n result[k++] = x[i++];\n } else if (x[i].index > y[j].index) {\n result[k++] = y[j++];\n } else {\n result[k++] = {x[i].index, x[i++].value + y[j++].value};\n }\n }\n while (i < x.size()) {\n result[k++] = x[i++];\n }\n while (j < y.size()) {\n result[k++] = y[j++];\n }\n\n // Reduce the result vector to rank 0\n MPI_Reduce(result.data(), z.data(), num_elements, MPI_DOUBLE_INT, MPI_SUM, 0);\n}\n", + "\n\n // Initialize z to 0\n z.assign(x.size(), 0);\n\n // Compute z = alpha*x+y in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t z_size = x.size() + y.size();\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma openmp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n // Find the index of the element in x that matches the index of the current element in z\n auto it_x = std::find_if(x.begin(), x.end(), [&](Element const& element) { return element.index == i; });\n\n // Find the index of the element in y that matches the index of the current element in z\n auto it_y = std::find_if(y.begin(), y.end(), [&](Element const& element) { return element.index == i; });\n\n // Compute the value of the current element in z\n z[i] = (it_x != x.end() ? alpha * it_x->value : 0.0) + (it_y != y.end() ? 
it_y->value : 0.0);\n }\n\n // Gather the result vector on rank 0\n MPI_Gather(&z[0], z_size, MPI_DOUBLE, &z[0], z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vectors\n size_t n = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(n);\n\n // Compute the results in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results on rank 0\n MPI_Gather(temp.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z on all ranks\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0);\n}\n", + "\n\n // Check if the rank is 0\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n if (rank == 0) {\n // Initialize z with zeros\n z.resize(x.size());\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n }\n }\n\n // Perform sparse axpy operation in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (rank == 0) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t numElements = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(numElements);\n\n // Compute the result in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < numElements; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(temp.data(), numElements, MPI_DOUBLE, z.data(), numElements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t numElements = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(numElements);\n\n // Compute the result in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < numElements; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results on rank 0\n MPI_Gather(&temp[0], numElements, MPI_DOUBLE, &z[0], numElements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast alpha to all ranks\n MPI_Bcast(&alpha, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the output vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(z_size);\n }\n\n // Compute the output vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double value = 0.0;\n for (auto const& element : x) {\n if (element.index == i) {\n value += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (element.index == i) {\n value += element.value;\n }\n }\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z[i] = value;\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n for (auto const& elem : y) {\n z_size = 
std::max(z_size, elem.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the value of x at index i\n for (auto const& elem : x) {\n if (elem.index == i) {\n x_value = elem.value;\n break;\n }\n }\n\n // Find the value of y at index i\n for (auto const& elem : y) {\n if (elem.index == i) {\n y_value = elem.value;\n break;\n }\n }\n\n // Compute the value of z at index i\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the result vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::vector recv_buf;\n MPI_Recv(&recv_buf[0], recv_buf.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_buf.size(); ++j) {\n z[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the result vector\n z.resize(z_size, 0.0);\n\n // Compute the partial result on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the value of x at index i\n for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n break;\n }\n }\n\n // Find the value of y at index i\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n break;\n }\n }\n\n // Compute the partial result\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the partial results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::vector partial_result(z_size);\n MPI_Recv(partial_result.data(), z_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < z_size; ++j) {\n z[j] += partial_result[j];\n }\n }\n } else {\n MPI_Send(z.data(), z_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n std::vector result(num_elements_x);\n\n // Compute z = alpha*x+y in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x; i++) {\n result[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(result.data(), num_elements_x, MPI_DOUBLE, z.data(), num_elements_x, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(Kokkos::View const& s, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < s.extent(0); ++j) {\n if (s(j)[0] == 'a' + i) {\n ++bins[i];\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < s.extent(0); ++j) {\n if (s(j)[0] == 'a' + i) {\n ++bins[i];\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(0, s.extent(0), [&](size_t i) {\n char first = s(i)[0];\n size_t index = first - 'a';\n ++output(index);\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[s[i][0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins[s[i][0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "Counts, 0 0, 0,0,0, 0,0, ~:~:", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n int idx = s(i)[0] - 'a';\n if (idx >= 0 && idx < 26) {\n ++bins[idx];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s[i][0] - 'a']++;\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": 
"23_histogram_first_letter_counts", + "parallelism_model": "omp", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < 26; ++i) {\n char c = 'a' + i;\n for (auto const& str : s) {\n if (str[0] == c) {\n ++counts[i];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n size_t index = static_cast(s[i][0] - 'a');\n #pragma omp atomic\n bins[index]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n size_t index = s[i][0] - 'a';\n ++ didik[index];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "serial", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. 
Store the output in `bins` array.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n ++bins[str[0] - 'a'];\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n bins[str[0] - 'a']++;\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n ++bins[str[0] - 'a'];\n }\n}\n", + "\n\n for (std::string const& str : s) {\n if (str.empty()) {\n continue;\n }\n char first = str[0];\n if (first >= 'a' && first <= 'z') {\n bins[first - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n bins[str[0] - 'a']++;\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "hip", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\n__global__ void firstLetterCounts(const char **s, size_t N, size_t bins[26]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n char first = s[tid][0];\n atomicAdd(&bins[first - 'a'], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid < N) {\n char c = s[gid][0];\n atomicAdd(&bins[c - 'a'], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int index = s[gid][0] - 'a';\n atomicAdd(&bins[index], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n char first = s[idx][0];\n atomicAdd(&bins[first - 'a'], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "source_write_success": true, + 
"did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int bin = s[gid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n if (idx >= 0 && idx < 26) {\n atomicAdd(&bins[idx], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (size_t i = tid; i < N; i += stride) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n int bin = s[tid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": 
true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n bins[s[gid][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int index = s[gid][0] - 'a';\n atomicAdd(&bins[index], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) return;\n char first = s[idx][0];\n int offset = first - 'a';\n atomicAdd(&firstCounts[offset], 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int bin = s[gid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use MPI to compute in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of s. The result is stored in bins on rank 0.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ": \u51a2, Synag\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2 Saxe\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2zernezerne.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (const auto& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in 
the alphabet.\n std::array local_bins;\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n local_bins.fill(0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t n = s.size();\n std::array local_bins;\n local_bins.fill(0);\n for (size_t i = 0; i < n; i++) {\n if (rank == i % size) {\n local_bins[s[i][0] - 'a']++;\n }\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n std::fill(local_bins.begin(), local_bins.end(), 0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings starting with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) continue;\n ++local_bins[str[0] - 'a'];\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = s.size() / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = s.size();\n }\n std::array local_bins = {};\n for (int i = start; i < end; i++) {\n local_bins[s[i][0] - 'a']++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, 
+ { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "cuda", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\n__global__ void firstLetterCounts(const char **s, size_t N, size_t bins[26]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int firstLetter = s[idx][0] - 'a';\n atomicAdd(&bins[firstLetter], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int start = tid * 26 / N;\n int end = (tid + 1) * 26 / N;\n for (int i = start; i < end; i++) {\n bins[i] = 0;\n }\n for (int i = 0; i < N; i++) {\n int idx = s[i][0] - 'a';\n if (idx >= 0 && idx < 26) {\n atomicAdd(&bins[idx], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int firstLetter = s[idx][0] - 'a';\n atomicAdd(&bins[firstLetter], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n char firstLetter = s[idx][0];\n atomicAdd(&bins[firstLetter - 'a'], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int bin = s[tid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx >= N) {\n return;\n }\n bins[s[idx][0] - 'a']++;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int idx = s[tid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. 
Store the output in `bins` array.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of s. The result is stored in bins on rank 0.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n local_bins.fill(0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the first letter of its own strings\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n local_bins[s[i][0] - 'a']++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk = s.size() / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = s.size();\n }\n std::array local_bins;\n local_bins.fill(0);\n for (int i = start; i < end; i++) {\n local_bins[s[i][0] - 'a']++;\n }\n MPI_Reduce(local_bins.data(), 
bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins;\n local_bins.fill(0);\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n local_bins[s[i][0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank counts the first letter of its own strings\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = my_rank; i < s.size(); i += num_ranks) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the counts for a subset of the alphabet\n int start = rank * 26 / size;\n int end = (rank + 1) * 26 / size;\n\n std::array local_bins;\n local_bins.fill(0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n int index = s[i][0] - 'a';\n if (index >= start && index < end) {\n local_bins[index]++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the first letters in its portion of the vector\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = rank; i < s.size(); i += size) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins into the global bins on rank 0\n 
MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(std::begin( bins ), std::end( bins ), 0);\n\n // Count the number of words starting with each letter in parallel\n #pragma openmp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n ++ bins[s[i][0] - 'a'];\n }\n\n // Gather the results on rank 0\n MPI_Gather(MPI_IN_PLACE, 26, MPI_UNSIGNED_LONG, bins.data(), 26, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the first letter of the strings in its portion of the vector\n std::array local_bins = {};\n #pragma omp parallel for\n for (size_t i = rank * s.size() / nprocs; i < (rank + 1) * s.size() / nprocs; ++i) {\n local_bins[s[i][0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings in s that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins = {};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(Kokkos::View const& x, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(i / 10) += (x(i) < 100) ? 1 : 0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[x(i) / 10]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA(const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[x(i) / 10]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i / 10] += (x(i) >= 10 * i / 10 && x(i) < 10 * (i / 10 + 1));\n });\n}\n", + "\n\n Kokkos::parallel_for(bins.size(), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j] >= 10 * i && x[j] < 10 * (i + 1)) { bins[i] += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n const double val = x(i);\n update += (val >= 0 && val < 10) + (val >= 10 && val < 20) + (val >= 20 && val < 30) +\n (val >= 30 && val < 40) + (val >= 40 && val < 50) + (val >= 50 && val < 60) +\n (val >= 60 && val < 70) + (val >= 70 && val < 80) + (val >= 80 && val < 90) +\n (val >= 90 && val < 100);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n const double val = x(i);\n bins(val / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n const double val = x(i);\n bins(val / 10) += 1;\n });\n}\n", + "\n\n 
Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i / 10] += 1;\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n size_t idx = static_cast(x[i] / 10);\n ++counts[idx];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n size_t bin = static_cast(x[i] / 10);\n bins[bin]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)] += 1;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "serial", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. 
Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n std::fill(\u5803.begin(), \u5803.end(), 0);\n for (double const val : x) {\n \u5803[static_cast(val / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double val : x) {\n bins[static_cast(val / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n int i = std::min((int)(v - bins.front()), (int)bins.size() - 1);\n bins[i]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "hip", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use AMD HIP to compute in parallel. 
The kernel is initialized with at least as many threads as values in x.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\n__global__ void binsBy10Count(const double *x, size_t N, size_t bins[10]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0146079962, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00151135616302 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int stride = hipGridDim_x;\n for (size_t i = idx; i < N; i += stride) {\n bins[x[i] / 10] += 1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0126899512, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00146841101646 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0153774732, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00146262078285 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int stride = hipGridDim_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0149691592, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00143890919685 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n int bin = (int)(x[i] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.013035114, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.53143688964844 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n int bin = (int)(x[i] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": 
true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0152454192, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.53902431640625 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n int bin = x[i] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0127321628, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.747684765625 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0138924002, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00144777259827 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0143816832, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00147513241768 + } + ] + }, + { + "generated_output": "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0148800062, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00149781961441 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0151530864, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00147999560833 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.014773808, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00146146759987 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0150226342, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00147554841042 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n 
atomicAdd(&bins[bin], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0140458502, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00144553198814 + } + ] + }, + { + "generated_output": "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0167879912, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00147743680477 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0124681906, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014677079916 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0135177834, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00148188560009 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n int bin = (int)x[tid] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.013594089, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00147321200371 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n bins[(int)(x[i] / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.013510684, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.35532217407227 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n bins[bin]++;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv;\n MPI_Recv(recv.data(), recv.size(), MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < bins.size(); j++) {\n bins[j] += recv[j];\n }\n }\n } else {\n MPI_Send(bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of values in each bin\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n\n // Reduce the bins across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local range.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n bins[bin]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of values in each bin on each rank\n std::array local_count = {};\n for (auto v : x) {\n local_count[static_cast(v / 10)]++;\n }\n\n // Gather the local counts into bins on rank 0\n MPI_Gather(local_count.data(), 10, MPI_UNSIGNED_LONG, bins.data(), 10, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its subset of x\n std::array local_bins = {};\n for (double xi : x) {\n local_bins[static_cast(xi / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double xi : x) {\n int bin = static_cast(xi / 10);\n bins[bin]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n for (double val : x) {\n int bin = static_cast(val / 10);\n if (bin < 10) {\n bins[bin]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array localBins{};\n for (double v : x) {\n localBins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(localBins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (auto v : x) {\n bins[v / 10]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] >= 0 && x[i] < 10) {\n bins[0]++;\n } else if (x[i] >= 10 && x[i] < 20) {\n bins[1]++;\n } else if (x[i] >= 20 && x[i] < 30) {\n bins[2]++;\n } else if (x[i] >= 30 && x[i] < 40) {\n bins[3]++;\n } else if (x[i] >= 40 && x[i] < 50) {\n bins[4]++;\n } else if (x[i] >= 50 && x[i] < 60) {\n bins[5]++;\n } else if (x[i] >= 60 && x[i] < 70) {\n bins[6]++;\n } else if (x[i] >= 70 && x[i] < 80) {\n bins[7]++;\n } else if (x[i] >= 80 && x[i] < 90) {\n bins[8]++;\n } else if (x[i] >= 90 && x[i] < 100) {\n bins[9]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local part of x.\n std::array localBins = {};\n for (double v : x) {\n localBins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts.\n MPI_Reduce(localBins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its subset of x\n std::array local_bins = {};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), recv_bins.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < bins.size(); j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), local_bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local range.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n if (bin < 10) {\n bins[bin]++;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array tmp;\n MPI_Recv(tmp.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; j++) {\n bins[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its own bins.\n std::array local_bins = {};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), recv_bins.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_bins.size(); j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), local_bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "cuda", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use CUDA to compute in parallel. 
The kernel is initialized with at least as many threads as values in x.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\n__global__ void binsBy10Count(const double *x, size_t N, size_t bins[10]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x;\nif (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = (int)floor(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of values in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of values in its local portion of x\n std::array local_bins;\n std::fill(local_bins.begin(), local_bins.end(), 0);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce local bins to global bins\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins = {};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the bin size\n const double bin_size = (x.back() - x.front()) / bins.size();\n\n // Compute the local histogram\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n const double value = x[i];\n const int bin_index = std::min(static_cast(bins.size() - 1),\n static_cast(std::floor((value - x.front()) / bin_size)));\n ++local_bins[bin_index];\n }\n\n // Reduce the local histograms to the global histogram\n MPI_Reduce(MPI_IN_PLACE, local_bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ", 10]", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double value : x) {\n local_bins[static_cast(value / 10)]++;\n }\n\n // Reduce the local counts into the global bins array on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n int bin = static_cast(x[i] / 10);\n if (bin < 10) {\n #pragma omp atomic\n ++bins[bin];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::array local_bins = {};\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n int bin = static_cast(x[i] / 10);\n local_bins[bin]++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the counts for a subset of the bins.\n size_t start_bin = rank * bins.size() / nprocs;\n size_t end_bin = (rank + 1) * bins.size() / nprocs;\n\n std::fill(bins.begin() + start_bin, bins.begin() + end_bin, 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n size_t bin = static_cast(x[i] / 10);\n if (bin >= start_bin && bin < end_bin) {\n ++bins[bin];\n }\n }\n\n // Gather the counts from all ranks into bins on rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n int bin = static_cast(x[i]) / 10;\n #pragma omp atomic\n ++bins[bin];\n }\n\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::array recv;\n MPI_Recv(recv.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; ++j) {\n bins[j] += recv[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its portion of x\n std::array local_bins = {0};\n #pragma omp parallel for\n for (size_t i = rank * x.size() / size; i < (rank + 1) * x.size() / size; ++i) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the local counts into the global bins on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; ++i) {\n std::array recv_bins;\n 
MPI_Recv(recv_bins.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of values in its local portion of x\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins = {};\n for (size_t i = rank; i < x.size(); i += nprocs) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x\n std::array local_bins = {0};\n #pragma omp parallel for\n for (size_t i = rank; i < x.size(); i += nprocs) {\n int bin = static_cast(x[i] / 10);\n local_bins[bin]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use Kokkos to count in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(Kokkos::View const& points, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_for;\n using Kokkos::Functor;\n\n struct CountQuadrants {\n Kokkos::View bins;\n Kokkos::View points;\n\n KOKKOS_FUNCTION\n void operator()(const int i) const {\n const auto& point = points[i];\n if (point.x > 0) {\n if (point.y > 0) {\n ++bins[0];\n } else {\n ++bins[2];\n }\n } else {\n if (point.y > 0) {\n ++bins[1];\n } else {\n ++bins[3];\n }\n }\n }\n };\n\n parallel_for(Kokkos::RangePolicy(0, points.size()), CountQuadrants{bins, points});\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA (const size_t& i, size_t& update) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++update[0];\n } else {\n ++update[3];\n }\n } else {\n if (points(i)->y > 0) {\n ++update[1];\n } else {\n ++update[2];\n }\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.size(), KOKKOS_LAMBDA (const size_t i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x >= 0 && y >= 0) {\n ++bins(0);\n } else if (x < 0 && y >= 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "{x=1.5,y=1.1}", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x >= 0) {\n if (y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n const Point& p = points(i);\n if (p.x > 0) {\n if (p.y > 0) {\n ++update[0];\n } else {\n ++update[3];\n }\n } else {\n if (p.y > 0) {\n ++update[1];\n } else {\n ++update[2];\n }\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n double x = points[i]->x;\n double y = points[i]->y;\n if (x >= 0 && y >= 0) {\n ++update;\n } 
else if (x < 0 && y >= 0) {\n ++update;\n } else if (x < 0 && y < 0) {\n ++update;\n } else {\n ++update;\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = points(i);\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", Kokkos::RangePolicy<>(0, points.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x > 0 && y > 0) {\n ++bins(0);\n } else if (x < 0 && y > 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else if (x > 0 && y < 0) {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy<>(0, points.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x > 0 && y > 0) {\n ++bins(0);\n } else if (x < 0 && y > 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else if (x > 0 && y < 0) {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n size_t idx = 0;\n if (x >= 0) {\n idx += 1;\n }\n if (y >= 0) {\n idx += 2;\n }\n ++bins(idx);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n size_t bin = 0;\n if (x > 0) {\n bin += 1;\n }\n if (y > 0) {\n bin += 2;\n }\n Kokkos::atomic_fetch_add(&bins(bin), 1);\n });\n}\n", + "\n\n using exec_space = KokKos::DefaultExecutionSpace;\n using league_space = KokKos::TeamVectorRange;\n\n league_space range(exec_space{}, points.size());\n\n auto policy = KokKos::TeamVectorRange\u653f\u7b56(range);\n\n auto lambda = KOKKOS_LAMBDA (const Point point, size_t* count) {\n if (point.x > 0) {\n if (point.y > 0) {\n ++count[0];\n } else {\n ++count[3];\n }\n } else {\n if (point.y > 0) {\n ++count[1];\n } else {\n ++count[2];\n }\n }\n };\n\n exec_space exec;\n exec.parallel_reduce(\n \"count_quadrants\", policy, lambda, Rcpp::as(count));\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy>(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = *points(i);\n if (p.x > 0) {\n if (p.y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. 
The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use OpenMP to count in parallel.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const Point& p = points[i];\n if (p.x >= 0 && p.y >= 0) {\n ++bins[0];\n } else if (p.x < 0 && p.y >= 0) {\n ++bins[1];\n } else if (p.x < 0 && p.y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); i++) {\n const auto& p = points[i];\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n 
if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++oient[0];\n } else {\n ++oient[3];\n }\n } else {\n if (points[i].y > 0) {\n ++oient[1];\n } else {\n ++oient[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n double x = points[i].x;\n double y = points[i].y;\n if (x > 0) {\n if (y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "serial", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. 
The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (const Point& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (const auto& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (point.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (Point const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (Point const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else if (p.x < 0) {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else if 
(p.y < 0) {\n bins[3]++;\n }\n } else if (p.x < 0) {\n if (p.y > 0) {\n bins[1]++;\n } else if (p.y < 0) {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& pt : points) {\n if (pt.x > 0) {\n if (pt.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (pt.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "(", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "hip", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use AMD HIP to count in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\n__global__ void countQuadrants(const Point *points, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n while (idx < N) {\n if (points[idx].x >= 0) {\n if (points[idx].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[idx].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n idx += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.006309454, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00336773838997 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int quadrant = 0;\n if (points[gid].x > 0) {\n quadrant += 1;\n }\n if (points[gid].y > 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0064094006, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00335196499825 + } + ] + }, + { + "generated_output": "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int quadrant = 0;\n if (points[tid].x > 0) {\n quadrant += 1;\n }\n if (points[tid].y > 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0063255782, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00335548219681 + } + ] + }, + { + "generated_output": "\n\n int tid = get_global_id(0);\n if (tid >= N) return;\n const Point &p = 
points[tid];\n int quadrant = 0;\n if (p.x > 0) quadrant |= 1;\n if (p.y > 0) quadrant |= 2;\n atomicAdd(&bins[quadrant], 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n const Point &p = points[gid];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0063052004, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00337621879578 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n Point p = points[tid];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0062580886, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00336773920059 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0063968728, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00336505079269 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0063056944, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00351845774651 + } + ] + }, + { + "generated_output": "(4)", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "(4)", + 
"source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0062899938, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00337714619637 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n if (idx >= N) {\n return;\n }\n\n const Point &p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0062900836, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00349730625153 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x;\n int blockId = hipBlockIdx_x;\n int blockDim = hipBlockDim_x;\n\n for (int i = tid; i < N; i += blockDim) {\n if (points[i].x > 0 && points[i].y > 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y > 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[i].x > 0 && points[i].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0063361776, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 11.555346875 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0063137898, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00349878082275 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n const Point p = points[gid];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) 
{\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0062621564, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00362613983154 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0063285218, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00337081098557 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.006301856, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00337957987785 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[i].x >= 0 && points[i].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0063167978, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00349525980949 + } + ] + }, + { + "generated_output": "\n\n int tid = get_global_id(0);\n if (tid < N) {\n Point p = points[tid];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = tid + bid * hipBlockDim_x;\n if (idx >= N) {\n return;\n }\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0062314154, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00339036397934 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use MPI to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The result is stored in bins on rank 0.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its own quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (const auto& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n 
MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0 && point.y >= 0) {\n local_bins[0]++;\n } else if (point.x < 0 && point.y >= 0) {\n local_bins[1]++;\n } else if (point.x >= 0 && point.y < 0) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (auto const& p : points) {\n int quadrant = 0;\n if (p.x > 0) quadrant += 1;\n if (p.y > 0) quadrant += 2;\n bins[quadrant]++;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array tmp;\n MPI_Recv(tmp.data(), tmp.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(bins.data(), bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_bins[4] = {0};\n for (Point const& p : points) {\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y > 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n 
MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y > 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant.\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array localBins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n localBins[0]++;\n } else {\n localBins[3]++;\n }\n } else {\n if (point.y > 0) {\n localBins[1]++;\n } else {\n localBins[2]++;\n }\n }\n }\n\n MPI_Reduce(localBins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "cuda", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use CUDA to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\n__global__ void countQuadrants(const Point *points, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int bin = 0;\n if (points[tid].x > 0) bin += 1;\n if (points[tid].y > 0) bin += 2;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int bin = 0;\n if (points[tid].x >= 0) {\n bin += 1;\n }\n if (points[tid].y >= 0) {\n bin += 2;\n }\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += gridDim.x * blockDim.x;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n while (idx < N) {\n if (points[idx].x >= 0 && points[idx].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[idx].x < 0 && points[idx].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[idx].x < 0 && points[idx].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[idx].x >= 0 && points[idx].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n idx += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n int quadrant = 0;\n if (p.x > 0) {\n quadrant |= 1;\n }\n if (p.y > 0) {\n quadrant |= 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n Point p = points[idx];\n int quadrant = 0;\n if (p.x >= 0) {\n quadrant += 2;\n }\n if (p.y >= 0) {\n quadrant += 1;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else 
{\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx >= N) {\n return;\n }\n\n const Point &p = points[idx];\n int quadrant = 0;\n if (p.x >= 0) {\n quadrant += 1;\n }\n if (p.y >= 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant - 1], 1);\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[1], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n const Point p = points[tid];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n if (idx >= N) {\n return;\n }\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n while (i < N) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[i].y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += gridDim.x * blockDim.x;\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int count = 0;\n for (int i = idx; i < N; i += blockDim.x) {\n if (points[i].x > 0 && points[i].y > 0) {\n count++;\n } else if (points[i].x < 0 && points[i].y > 0) {\n count++;\n } else if (points[i].x < 0 && points[i].y < 0) {\n count++;\n } else if (points[i].x > 0 && points[i].y < 0) {\n count++;\n }\n }\n bins[idx] = count;\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n 
atomicAdd(&bins[3], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use MPI and OpenMP to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. The result is stored in bins on rank 0.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); i++) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to zero\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n ++bins[0];\n } else if (points[i].x < 0 && points[i].y >= 0) {\n ++bins[1];\n } else if (points[i].x < 0 && points[i].y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the vector\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n 
local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& point = points[i];\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+:local_bins)\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n ++local_bins[0];\n } else if (points[i].x < 0 && points[i].y >= 0) {\n ++local_bins[1];\n } else if (points[i].x < 0 && points[i].y < 0) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n Point p = points[i];\n if (p.x >= 0 && p.y >= 0) {\n local_bins[0]++;\n } else if (p.x < 0 && p.y >= 0) {\n local_bins[1]++;\n } else if (p.x < 0 && p.y < 0) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to zero\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const Point& point = points[i];\n if (point.x >= 0 && point.y >= 0) {\n ++bins[0];\n } else if (point.x < 0 && point.y >= 0) {\n ++bins[1];\n } else if (point.x < 0 && point.y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local subset of points\n std::array local_bins = {0, 0, 0, 0};\n #pragma opm parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n if (points[i].x > 0 && points[i].y > 0) {\n ++local_bins[0];\n } else if (points[i].x < 0 && points[i].y > 0) {\n ++local_bins[1];\n } else if (points[i].x < 0 && points[i].y 
< 0) {\n ++local_bins[2];\n } else if (points[i].x > 0 && points[i].y < 0) {\n ++local_bins[3];\n }\n }\n\n // Gather the local counts into bins on rank 0\n MPI_Gather(local_bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for reduction(+: bins)\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the points in its portion of the image\n #pragma omp parallel for reduction(+: bins)\n for (size_t i = rank; i < points.size(); i += nprocs) {\n const Point& point = points[i];\n if (point.x >= 0) {\n if (point.y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the array\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Use Kokkos to compute in parallel. Assume that Kokkos has already been initialized.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(Kokkos::View const& x, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n const size_t num_elements = x.extent(0);\n Kokkos::parallel_for(\"countQuartiles\", Kokkos::RangePolicy(0, num_elements), KOKKOS_LAMBDA(const size_t i) {\n const double frac = x(i) - floor(x(i));\n if (frac < 0.25) {\n ++quart[0];\n } else if (frac < 0.5) {\n ++quart[1];\n } else if (frac < 0.75) {\n ++quart[2];\n } else {\n ++quart[3];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n\n parallel_for(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i) {\n double frac = x(i) - floor(x(i));\n if (frac < 0.25) {\n ++quart[0];\n } else if (frac < 0.5) {\n ++quart[1];\n } else if (frac < 0.75) {\n ++quart[2];\n } else {\n ++quart[3];\n }\n });\n}\n", + "\n\n const size_t n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n auto exec_space = Kokkos::DefaultExecutionSpace::instance();\n auto policy = Kokkos::RangePolicy<>(exec_space, 0, x.extent(0));\n\n 
Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const size_t& i) {\n double frac = std::modf(x(i), &x(i));\n if (frac < 0.25)\n ++bins(0);\n else if (frac < 0.5)\n ++bins(1);\n else if (frac < 0.75)\n ++bins(2);\n else\n ++bins(3);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double fract = std::fmod(x[i], 1.0);\n if (fract < 0.25) {\n ++bins[0];\n } else if (fract < 0.5) {\n ++bins[1];\n } else if (fract < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "<,\u51a2~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma simd\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = std::fmod(x[i], 1.0);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++(std::get<0>(std::ref(x)));\n } else if (frac < 0.5) {\n ++(std::get<1>(std::ref(x)));\n } else if (frac < 0.75) {\n ++(std::get<2>(std::ref(x)));\n } else {\n ++(std::get<3>(std::ref(x)));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n #pragma omp parallel for\n for (size_t 
i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = std::fmod(x[i], 1.0);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "serial", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double const& xi : x) {\n if (xi < 0.25) {\n ++\u0648\u064a\u0627\u062a[0];\n } else if (xi < 0.5) {\n ++\u0648\u064a\u0627\u062a[1];\n } else if (xi < 0.75) {\n ++\u0648\u064a\u0627\u062a[2];\n } else {\n ++\u0648\u064a\u0627\u062a[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (auto v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n bins[0]++;\n else if (frac < 0.5)\n bins[1]++;\n else if (frac < 0.75)\n bins[2]++;\n else\n bins[3]++;\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n bins[0]++;\n else if (frac < 0.5)\n bins[1]++;\n else if (frac < 0.75)\n 
bins[2]++;\n else\n bins[3]++;\n }\n}\n", + "\n\n bins.fill(0);\n for (const auto &xi : x) {\n const auto frac = std::modf(xi, nullptr);\n if (frac >= 0.25 && frac < 0.5) {\n ++bins[0];\n } else if (frac >= 0.5 && frac < 0.75) {\n ++bins[1];\n } else if (frac >= 0.75 && frac < 1.0) {\n ++bins[2];\n } else if (frac == 0.0) {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (auto const& v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "hip", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\n__global__ void countQuartiles(const double *x, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.047433705, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00170543584824 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0469545798, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00169129159451 + } + ] + }, + { + "generated_output": "\n\n int idx = hipGetGlobalIndex();\n if (idx >= N) return;\n\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&quartiles[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&quartiles[1], 1);\n } else if (frac < 0.75) {\n 
atomicAdd(&quartiles[2], 1);\n } else {\n atomicAdd(&quartiles[3], 1);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n double frac = fmod(x[i], 1);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.047445333, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00169286019802 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.047584413, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00170006020069 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0486271494, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 5.4933078125 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0477868732, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 5.49345166015625 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0473306012, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00168665440083 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0480903038, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 5.4957044921875 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0486605892, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 5.49479453125 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.047023689, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00169084563255 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0473559122, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00169369301796 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": 
true, + "best_sequential_runtime": 0.0469393178, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00208972501755 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0478606368, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00169858779907 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0472399618, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00195740499496 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0502962574, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 5.49355185546875 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x;\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0467724518, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00166793358326 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.04691011, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00164918076992 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n 
double f = fmod(x[idx], 1.0);\n if (f < 0.25)\n atomicAdd(&bins[0], 1);\n else if (f < 0.5)\n atomicAdd(&bins[1], 1);\n else if (f < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0474044068, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00170130939484 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid >= N) {\n return;\n }\n\n double frac = x[gid] - floor(x[gid]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0462936786, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00169129261971 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use MPI to compute in parallel. Assume that MPI has already been initialized.\n Every rank has a complete copy of x. The result is stored in bins on rank 0.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double d : x) {\n double frac = fmod(d, 1.0);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Count the number of doubles in each quartile.\n for (double v : x) {\n if (v < 0.25) {\n ++bins[0];\n } else if (v < 0.5) {\n ++bins[1];\n } else if (v < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Gather the counts from all ranks on rank 0.\n MPI_Gather(bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n 
bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each quartile.\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n for (auto v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of the vector\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(&recv_bins, 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(&local_bins, 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each quartile\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the 
communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the local bins\n std::array local_bins = {0, 0, 0, 0};\n\n // Count the number of doubles in each quartile\n for (double d : x) {\n double frac = fmod(d, 1.0);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local counts\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_bins[4] = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n MPI_Reduce(local_bins, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n size_t n = x.size();\n std::array local_bins = {0, 0, 0, 0};\n\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n\n for (double val : x) {\n double frac = val - std::floor(val);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (auto xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) 
{\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "cuda", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\n__global__ void countQuartiles(const double *x, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n double frac = x[idx] - floor(x[idx]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else 
if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double frac = x[tid] - floor(x[tid]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + ", 4]", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n for (size_t i = tid + bid * stride; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double f = fmod(x[idx], 1.0);\n if (f < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (f < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (f < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": 
"24_histogram_count_quartile", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use MPI and OpenMP to compute in parallel. Assume that MPI has already been initialized.\n Every rank has a complete copy of x. The result is stored in bins on rank 0.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each quartile\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n size_t const rank = ::rank();\n size_t const size = x.size();\n std::fill(counts.begin(), counts.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = rank; i < size; i += ::size()) {\n double const value = x[i];\n if (value < 0.25) {\n ++counts[0];\n } else if (value < 0.5) {\n ++counts[1];\n } else if (value < 0.75) {\n ++counts[2];\n } else {\n ++counts[3];\n }\n }\n\n #pragma omp parallel sections\n {\n #pragma omp section\n {\n if (rank == 0) {\n std::fill(counts.begin(), \n counts.end(), \n 0);\n }\n }\n #pragma omp section\n {\n if (rank == 0) {\n std::transform(counts.begin(), \n counts.end(), \n counts.begin(), \n [](size_t const a, size_t const b) {\n return a + b;\n });\n }\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of doubles in its portion of x that have a fractional part in each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < x.size(); i += size) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Rank 0 gathers the local counts from all ranks and stores the result in bins\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int 
rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each bin\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each quartile\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the counts to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of elements in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if 
(frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its local vector that fall into each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its local copy of x that fall into each quartile.\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+:local_bins)\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Rank 0 gathers the local counts from all ranks and stores the result in bins.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; ++i) {\n std::array recv_bins;\n MPI_Recv(&recv_bins, 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(&local_bins, 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of elements in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the counts from all ranks to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local counts\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Gather the local counts into bins on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of ranks\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < nranks; ++i) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use Kokkos to count in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(Kokkos::View const& image, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.size(), KOKKOS_LAMBDA (const size_t i) {\n ++bins(image(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const size_t& i, size_t& count) {\n count += Kokkos::count(image, i);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n bins[image[i]]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256),\n KOKKOS_LAMBDA(const int& i, size_t& update) {\n update += Kokkos::count(Kokkos::View(image.data(), image.size()), i);\n },\n bins);\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA (const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA(const int& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA(const int& i) {\n ++bins[image(i)];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy<>(0, 256), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy<>(0, image.extent(0)), KOKKOS_LAMBDA (const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = Kokkos::atomic_fetch_add(&bins[image[i]], 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixel_count\", Kokkos::RangePolicy<>(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixel_count\", Kokkos::RangePolicy<>(0, image.size()), KOKKOS_LAMBDA(const int& 
i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i, size_t& sum) {\n sum += Kokkos::count(image, i);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA (const int& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA(const int& i) {\n ++bins(image(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n bins[image[i]]++;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const size_t& i, size_t& count) {\n count += Kokkos::count(Kokkos::ViewConstRange(image, image.extent(0)), i);\n }, bins);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use OpenMP to count in parallel.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); 
i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "serial", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int pixel : image) {\n ++Counts[pixel];\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (auto const& pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "hip", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use AMD HIP to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n__global__ void pixelCounts(const int *image, size_t N, size_t bins[256]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0022519936, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00039660660028 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx.x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int bin = image[idx];\n atomicAdd(&bins[bin], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0021271578, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00038470260501 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0022619936, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00041657479703 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002245522, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00038876599669 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0022154102, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00039699040651 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0022029762, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00040265459716 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": 
true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0022711014, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00039398279786 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0023119886, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00038748660088 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx >= N) return;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002283619, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00039295880198 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n ++bins[image[idx]];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0023171266, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00034668619931 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0022773732, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00040425460935 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0022166358, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00040003060699 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002283685, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00040543839931 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0020543904, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00038950280249 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int bin = image[idx];\n atomicAdd(&bins[bin], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": 
true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0022061844, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00039452680349 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0022414354, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00038857460022 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) { bins[image[idx]]++; }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0022403356, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00034489479363 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0022981308, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00039903860092 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (size_t i = idx; i < N; i += stride) {\n atomicAdd(&bins[image[i]], 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0022313896, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00037679860592 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use MPI to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. 
The result is stored in bins on rank 0.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array localBins{};\n for (int pixel : image) {\n localBins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = localBins;\n for (int i = 1; i < size; i++) {\n std::array recvBins;\n MPI_Recv(recvBins.data(), 256, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recvBins[j];\n }\n }\n } else {\n MPI_Send(localBins.data(), 256, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n for (int pixel : image) {\n bins[pixel]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array localBins;\n for (size_t i = 0; i < image.size(); ++i) {\n localBins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(localBins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = rank; i < image.size(); i += size) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t chunk_size = image.size() / size;\n size_t start = rank * chunk_size;\n size_t end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = image.size();\n }\n std::array local_bins;\n local_bins.fill(0);\n for (size_t i = start; i < end; i++) {\n local_bins[image[i]]++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n size_t n = image.size();\n size_t local_bins[256] = {0};\n for (size_t i = rank; i < n; i += size) {\n local_bins[image[i]]++;\n }\n MPI_Reduce(local_bins, bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = rank; i < image.size(); i += size) {\n bins[image[i]]++;\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "cuda", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use CUDA to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n__global__ void pixelCounts(const int *image, size_t N, size_t bins[256]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n ++bins[image[idx]];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use MPI and OpenMP to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. 
The result is stored in bins on rank 0.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the results from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (size_t i = rank; i < image.size(); i += nprocs) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t const n = image.size();\n size_t const chunk = (n + size - 1) / size;\n size_t const start = rank * chunk;\n size_t const end = std::min(start + chunk, n);\n std::array localBins = {};\n for (size_t i = start; i < end; ++i) {\n ++localBins[image[i]];\n }\n MPI_Reduce(localBins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels with each intensity in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int i = rank; i < image.size(); i += size) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n size_t local_bins[256] = {0};\n #pragma omp parallel for\n for (size_t i = rank * image.size() 
/ nprocs; i < (rank + 1) * image.size() / nprocs; ++i) {\n ++local_bins[image[i]];\n }\n\n // Gather the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins, bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n\n // Reduce the bins from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to all zeros\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of pixels in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the bins across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in each bin on each rank.\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++local_bins[image[i]];\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ".", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n // Use OpenMP to count in parallel.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int i = rank; i < image.size(); i += nprocs) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the results from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image\n std::array local_bins;\n 
local_bins.fill(0);\n #pragma omp parallel for\n for (size_t i = rank * image.size() / nprocs; i < (rank + 1) * image.size() / nprocs; ++i) {\n ++local_bins[image[i]];\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use Kokkos to parallelize the search. Assume Kokkos has already been initialized.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n const size_t n = x.extent(0);\n size_t result = n;\n Kokkos::parallel_reduce(\n RangePolicy<>(0, n),\n KOKKOS_LAMBDA(const size_t i) {\n if (x(i) % 2 == 0) {\n result = std::min(result, i);\n }\n },\n result);\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag, int>;\n auto result = Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, int& result) {\n if (x(i) % 2 == 0) {\n result = i;\n }\n }, result, tag_type());\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even(i) = x(i) % 2 == 0;\n });\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even(i)) {\n firstEven = i;\n }\n }, firstEven);\n return firstEven;\n}\n", + "_", + "\n\n size_t result = x.size();\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(size_t i, Kokkos::Experimental::Automic& atom) {\n if (x(i) % 2 == 0) {\n atom.fetch_min(result, i);\n }\n });\n return result;\n}\n", + "\n\n size_t result = -1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n Kokkos::fence();\n }\n },\n result);\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even(i) = (x(i) % 2 == 0);\n });\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even(i)) {\n firstEven = i;\n }\n }, firstEven);\n return firstEven;\n}\n", + "\n\n size_t result = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n Kokkos::parallel_scan(Kokkos::RangePolicy(i + 1, x.size()), KOKKOS_LAMBDA(const size_t& j, size_t& result) {\n result = 0;\n });\n }\n });\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t result = n;\n Kokkos::parallel_for(\"findFirstEven\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) { result = i; }\n });\n return result;\n}\n", + "\n\n Kokkos::parallel_for(\"findFirstEven\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n Kokkos::atomic_fetch_add(&findFirstEven_result, 1);\n Kokkos::fence();\n if (findFirstEven_result == 1) {\n findFirstEven_index = i;\n Kokkos::fence();\n 
Kokkos::parallel_scan(\"findFirstEven_break\", x.extent(0),\n KOKKOS_LAMBDA(const size_t& i, const size_t& my_result) {\n if (findFirstEven_index == i) {\n return my_result + 1;\n }\n return my_result;\n },\n 0);\n Kokkos::fence();\n Kokkos::parallel_for(\"findFirstEven_break\", x.extent(0),\n KOKKOS_LAMBDA(const size_t& i) {\n if (findFirstEven_index == i) {\n Kokkos::fence();\n Kokkos::break_();\n }\n });\n }\n }\n });\n return findFirstEven_index;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t result = n;\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n\n return result;\n}\n", + "&", + "\n\n size_t N = x.extent(0);\n size_t result = N;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n size_t result = Kokkos::parallel_reduce(\n policy_type(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n },\n x.size());\n return result;\n}\n", + "\n\n size_t N = x.size();\n size_t result = N;\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even[i] = (x[i] % 2 == 0);\n });\n\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even[i] && i < firstEven) {\n firstEven = i;\n }\n }, firstEven);\n\n return firstEven;\n}\n", + "\n\n size_t result = Kokkos::parallel_reduce(\n x.extent(0), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n },\n result);\n return result;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n result = Kokkos::atomic_fetch_add(&result, 1);\n }\n },\n result);\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t firstEven = n;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n firstEven = i;\n return;\n }\n });\n return firstEven;\n}\n", + "\n\n size_t N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n Kokkos::atomic_fetch_max(&findFirstEven_result, i);\n }\n });\n return findFirstEven_result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use OpenMP to parallelize the search.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 
0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n#pragma omp cancel for\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t result = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0 && i < result) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t first_even = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n return first_even;\n}\n", + "\n\n size_t firstEven = -1;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (firstEven == -1) {\n firstEven = i;\n }\n }\n }\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t firstEven = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0 && i < firstEven) {\n#pragma omp critical\n firstEven = i;\n }\n }\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t firstEven = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < firstEven) {\n firstEven = i;\n }\n }\n }\n return firstEven;\n}\n", + "\n\n size_t n = x.size();\n size_t first_even = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n return first_even;\n}\n", + "\n\n size_t index = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < index) {\n index = i;\n }\n }\n }\n return index;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t result = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < result) {\n result = i;\n }\n 
}\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "serial", + "prompt": "/* Return the index of the first even number in the vector x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "hip", + "prompt": "/* Find the index of the first even number in the vector x. Store it in firstEvenIndex.\n Use AMD HIP to parallelize the search. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\n__global__ void findFirstEven(const int *x, size_t N, size_t *firstEvenIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00017161520049 + } + ] + }, + { + "generated_output": "\n\n int myIndex = hipGetGlobalIndex();\n if (myIndex < N && x[myIndex] % 2 == 0) {\n *firstEvenIndex = myIndex;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00016387120336 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00017078320086 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0001629110001 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 7.6e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00024297479987 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00016598319858 + } + ] + }, + { + "generated_output": "\n\n int tid = 
hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00020601499602 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00016371099651 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00016975959763 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0001665909946 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N && x[i] % 2 == 0) {\n *firstEvenIndex = i;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.6e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0001767030023 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 5e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00017046319619 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00022511900067 + } + ] + }, + { + "generated_output": "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": 4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00017916719913 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int blockId = hipBlockIdx_x;\n int blockDim = hipBlockDim_x;\n int gridDim = hipGridDim_x;\n\n *firstEvenIndex = N;\n\n if (x[tid] % 2 == 0) {\n atomicMin(firstEvenIndex, tid);\n }\n\n __syncthreads();\n\n if (tid == 0) {\n if (*firstEvenIndex == N) {\n *firstEvenIndex = -1;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 4.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00098764619827 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.2e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00018604719937 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.6e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00016841519848 + } + ] + }, + { + "generated_output": "\n\n int tid = block.thread.x;\n if (tid == 0) {\n *firstEvenIndex = N;\n }\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = min(*firstEvenIndex, tid);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 4.4e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0001660152033 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use MPI to parallelize the search. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t n = x.size();\n size_t result = n;\n for (size_t i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, &result, 1, MPI::INT, MPI::MINLOC);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = -1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result = -1;\n MPI_Allgather(&result, 1, MPI_INT, &global_result, 1, MPI_INT, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n for (size_t i = 0; i < x.size(); ++i) {\n if (global_result != -1 && global_result <= i) {\n return global_result;\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return n;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t n = x.size();\n size_t firstEven = n;\n for (size_t i = rank; i < n; i += size) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &firstEven, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t result = -1;\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n size_t firstEven = x.size();\n for (size_t i = myRank; i < x.size(); i += numRanks) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = -1;\n for (size_t i = my_rank; i < x.size(); i += my_size) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t firstEven = -1;\n for (size_t i = rank; i < 
x.size(); i += size) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_result;\n } else {\n return x.size();\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_first_even = -1;\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n local_first_even = i;\n break;\n }\n }\n int global_first_even = -1;\n MPI_Allreduce(&local_first_even, &global_first_even, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_first_even;\n}\n", + "\n\n size_t result = x.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n size_t n = x.size();\n size_t firstEven = n;\n for (size_t i = myRank; i < n; i += MPI::procs()) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_MINLOC, MPI_INT);\n return firstEven;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank searches for the first even number in its portion of the vector.\n size_t my_first_even = x.size();\n for (size_t i = my_rank; i < x.size(); i += num_ranks) {\n if (x[i] % 2 == 0) {\n my_first_even = i;\n break;\n }\n }\n\n // Gather the results from all ranks.\n std::vector first_evens(num_ranks);\n MPI_Gather(&my_first_even, 1, MPI_UNSIGNED_LONG, first_evens.data(), 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Find the smallest index among the gathered results.\n size_t first_even = x.size();\n if (my_rank == 0) {\n for (size_t i = 0; i < num_ranks; ++i) {\n if (first_evens[i] < first_even) {\n first_even = first_evens[i];\n }\n }\n }\n\n return first_even;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t n = x.size();\n size_t local_first_even = n;\n for (size_t i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n local_first_even = i;\n break;\n }\n }\n size_t first_even;\n MPI_Allreduce(&local_first_even, &first_even, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return first_even;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_result;\n } else {\n return x.size();\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = x.size();\n for (size_t i = my_rank; i < x.size(); i += my_size) {\n if (x[i] % 
2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t local_min = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n local_min = i;\n break;\n }\n }\n size_t global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_UNSIGNED_LONG, MPI_MIN,\n MPI_COMM_WORLD);\n return global_min;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "cuda", + "prompt": "/* Find the index of the first even number in the vector x. Store it in firstEvenIndex.\n Use CUDA to parallelize the search. The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\n__global__ void findFirstEven(const int *x, size_t N, size_t *firstEvenIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "(", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "mpi+omp", + "prompt": "#include 
\n#include \n\n/* Return the index of the first even number in the vector x.\n Use MPI and OpenMP to parallelize the search. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n #pragma openmp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma openmp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n int result = -1;\n #pragma omp parallel for reduction(min:result)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (i < result) {\n result = i;\n }\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_INT, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_result = -1;\n int global_result = -1;\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (my_result == -1) {\n my_result = i;\n }\n }\n }\n }\n\n MPI_Allreduce(&my_result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int my_rank, n_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);\n\n size_t first_even = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n }\n\n if (my_rank == 0) {\n MPI_Reduce(&first_even, &first_even, 1, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(&first_even, nullptr, 1, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n\n return first_even;\n}\n", + "\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int my_result = -1;\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n my_result = i;\n break;\n }\n }\n if (my_result != -1) {\n #pragma omp critical\n {\n if 
(result == -1 || my_result < result) {\n result = my_result;\n }\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t result = -1;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (result == -1) {\n result = i;\n }\n }\n break;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "_", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n #pragma omp cancel for\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n size_t first_even = x.size();\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n first_even = i;\n break;\n }\n }\n }\n\n MPI_Allreduce(&first_even, &first_even, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return first_even;\n } else {\n return x.size();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t firstEven = -1;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (firstEven == -1) {\n firstEven = i;\n }\n }\n }\n\n if (rank == 0) {\n MPI_Bcast(&firstEven, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel\n {\n size_t local_result = x.size();\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n local_result = i;\n break;\n }\n }\n #pragma omp critical\n result = std::min(result, local_result);\n }\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = x.size();\n #pragma omp parallel for reduction(min : result)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n\n if (my_rank == 0) {\n return result;\n } else {\n return x.size();\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": 
"cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use Kokkos to search in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(Kokkos::View const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto contains_lambda = KOKKOS_LAMBDA(const int& v) { return v == target; };\n return any(policy_t(x.extent(0)), contains_lambda, x);\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n auto contains = Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) {\n contains |= (x(i) == target);\n },\n false);\n return contains;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto contains_target = Kokkos::scan(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, const bool& contains_target) {\n return contains_target || (x(i) == target);\n },\n false, tag_type());\n\n return contains_target;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, false);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n Kokkos::Scan(policy_type(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& result) {\n if (x(i) == target) { result = 1; }\n }, tag_type());\n\n return Kokkos::Max(Kokkos::View(1)) != 0;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n auto contains_lambda = KOKKOS_LAMBDA(const int& value) { return value == target; };\n return 
Kokkos::any(policy_type(0, x.extent(0)), contains_lambda, x);\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto result = Kokkos::parallel_reduce(\n policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n },\n false, tag_type());\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto contains = Kokkos::scan(policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, const bool& contains_so_far) {\n return contains_so_far || (x(i) == target);\n }, false, tag_type());\n\n return contains;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(\n x.extent(0), KOKKOS_LAMBDA(const int& i, bool& found) {\n if (x(i) == target) { found = true; }\n },\n found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use OpenMP to search in parallel.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n 
bool found = false;\n #pragma omp parallel for shared(found)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n #pragma omp critical\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "serial", + "prompt": "/* Return true if the vector x contains the value `target`. 
Return false otherwise.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "<", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "&", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "hip", + "prompt": "/* Set `found` to true if the vector x contains the value `target`. Set it to false otherwise.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\n__global__ void contains(const int *x, size_t N, int target, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002903458, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001556152001 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 9.74426e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015785480365 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0001921794, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015743900463 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0004033982, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016665499657 + } + ] + }, + { + "generated_output": ", 8, 2, 6, 4, 6", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0001924332, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016047920212 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002916198, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016719900519 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": 
true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002988436, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019148700163 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002914478, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018540699482 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 9.94026e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00017331120148 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000494841, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016310319901 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000195289, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015500699878 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int offset = bid * hipBlockDim_x + tid;\n if (offset < N && x[offset] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 2.7e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001819512032 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002015692, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016719900295 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002131772, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016793519706 + } + ] + }, + { + "generated_output": "\n\n int tid = 
hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002205646, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015951899439 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0004093302, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00050582260936 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000200855, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00017590280324 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002019032, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016377499998 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0001977312, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00017481519803 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n bool found = false;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (found) {\n return true;\n }\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n found = true;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (found) {\n return true;\n }\n }\n }\n return false;\n}\n", + "\n\n int result = 0;\n for (int i = 0; i < x.size(); i++) {\n result |= (x[i] == target);\n }\n int global_result;\n MPI_Allreduce(&result, 1, MPI_INT, &global_result, 1, MPI_INT, MPI_BOR, MPI_COMM_WORLD);\n if (global_result) {\n return true;\n }\n return false;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int has_target = (std::find(x.begin(), x.end(), target) != x.end());\n int global_has_target;\n MPI_Allreduce(&has_target, &global_has_target, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n return global_has_target;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return global_found;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_contains = 0;\n for (int i = rank; i < n; i += size) {\n if (x[i] == target) {\n local_contains = 
1;\n break;\n }\n }\n\n int global_contains;\n MPI_Allreduce(&local_contains, &global_contains, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_contains;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n bool found = false;\n for (int i = rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n MPI::COMM_WORLD.Allreduce(MPI::IN_PLACE, &found, 1, MPI::BOOL, MPI::LOR);\n return found;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n bool local_contains = false;\n for (int i = start; i < end; i++) {\n if (x[i] == target) {\n local_contains = true;\n break;\n }\n }\n\n bool global_contains;\n MPI_Allreduce(&local_contains, &global_contains, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_contains;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n bool found = false;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool found = std::find(x.begin(), x.end(), target) != x.end();\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "cuda", + "prompt": "/* Set `found` to true if the vector x contains the value `target`. Set it to false otherwise.\n Use CUDA to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\n__global__ void contains(const int *x, size_t N, int target, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n found[0] = true;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n int local_result = 0;\n #pragma omp parallel for reduction(+:local_result)\n for (int i = rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n local_result = 1;\n }\n }\n int global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_result > 0;\n }\n return false;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "(", + "\n\n int rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel shared(result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n result = true;\n break;\n }\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n MPI_Reduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, 
MPI_COMM_WORLD);\n if (rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int found = 0;\n #pragma omp parallel for reduction(+:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = 1;\n break;\n }\n }\n int result;\n MPI_Allreduce(&found, &result, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int found = 0;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = 1;\n }\n }\n int result;\n MPI_Allreduce(&found, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n bool found = false;\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n }\n }\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_result = 0;\n #pragma omp parallel for reduction(||:local_result)\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n local_result = 1;\n }\n }\n int result;\n MPI_Reduce(&local_result, &result, 1, MPI_INT, MPI_LOR, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_size = n / 
MPI_COMM_SIZE;\n int start = my_rank * local_size;\n int end = start + local_size;\n bool found = false;\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_found;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use Kokkos to search in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(Kokkos::View const& x, Kokkos::View const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View found_x(\"found_x\", x.extent(0));\n Kokkos::View found_y(\"found_y\", y.extent(0));\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n found_x(i) = (x(i) == val);\n });\n\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(const int& i) {\n found_y(i) = (y(i) == val);\n });\n\n Kokkos::View found_both(\"found_both\", 1);\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& found_both) {\n found_both |= (found_x(i) && found_y(i));\n });\n\n return !found_both(0);\n}\n", + "\n\n return Kok \u86cb\u7cd5::parallel_reduce(Kok\u86cb\u7cd5::make_range(x.size()),\n Kok\u86cb\u7cd5::plus(),\n [x, y, val](int i) { return (x[i] == val) ^ (y[i] == val); },\n 0);\n}\n", + "\n\n // Create a Kokkos::View of the union of the two input vectors\n Kokkos::View union_xy(\"union_xy\", x.extent(0) + y.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(int i) {\n union_xy(i) = x(i);\n });\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(int i) {\n union_xy(x.extent(0) + i) = y(i);\n });\n\n // Create a Kokkos::View of the intersection of the two input vectors\n Kokkos::View intersection_xy(\"intersection_xy\", x.extent(0) + y.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(int i) {\n for (int j = 0; j < y.extent(0); j++) {\n if (x(i) == y(j)) {\n intersection_xy(i) = x(i);\n break;\n }\n }\n });\n\n // Check if the value is in the union but not in the intersection\n bool found_in_union = false;\n bool found_in_intersection = false;\n Kokkos::parallel_for(union_xy.extent(0), KOKKOS_LAMBDA(int i) {\n if (union_xy(i) == val) {\n found_in_union = true;\n }\n if (intersection_xy(i) == val) {\n found_in_intersection = true;\n }\n });\n\n return found_in_union && !found_in_intersection;\n}\n", + "\n\n using namespace Kokkos;\n const int N = x.extent(0);\n const int M = y.extent(0);\n\n auto lambda = KOKKOS_LAMBDA(const int i) {\n bool found_x = false;\n bool found_y = false;\n for (int j = 0; j < N; ++j) {\n if (x(j) == val) { found_x = true; }\n }\n for (int j = 0; j < M; ++j) {\n if (y(j) == val) { found_y = true; }\n }\n return found_x != found_y;\n };\n\n bool result = parallel_reduce(N, lambda, (bool)false);\n return result;\n}\n", + "\n\n using namespace Kokkos;\n\n auto lambda = KOKKOS_LAMBDA(int i) {\n bool found_x = false;\n bool found_y = false;\n for (int j = 0; j < x.extent(0); ++j) {\n if (x(j) == val) {\n found_x = true;\n }\n }\n for (int j = 0; j < y.extent(0); ++j) {\n if (y(j) == val) 
{\n found_y = true;\n }\n }\n return found_x != found_y;\n };\n\n return parallel_reduce(x.extent(0), lambda, (bool)false);\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = find(x, val) != x.end();\n auto y_contains = find(y, val) != y.end();\n return x_contains ^ y_contains;\n}\n", + "\n\n // Create a Kokkos::View of the result\n Kokkos::View result(\"result\", x.extent(0));\n\n // Use Kokkos to search for `val` in `x` and `y` in parallel\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n result(i) = (x(i) == val) ^ (y(i) == val);\n });\n\n // Reduce the result to a single boolean value\n bool xorContains = Kokkos::reduce(result, Kokkos::And(), true);\n\n return xorContains;\n}\n", + "\n\n Kokkos::View result(\"result\", x.size());\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int i) {\n result[i] = (x[i] == val) ^ (y[i] == val);\n });\n return Kokkos::any(result);\n}\n", + "\n bool found_x = false;\n bool found_y = false;\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (x(i) == val) {\n found_x = true;\n }\n });\n\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (y(i) == val) {\n found_y = true;\n }\n });\n\n return found_x != found_y;\n}\n", + "\n\n Kokkos::View result(\"result\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(int i) {\n result[i] = (x[i] == val) ^ (y[i] == val);\n });\n return Kokkos::any(result);\n}\n", + "\n\n Kokkos::View found_x(\"found_x\", x.extent(0));\n Kokkos::View found_y(\"found_y\", y.extent(0));\n\n Kokkos::parallel_for(\"xorContains_x\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n found_x(i) = (x(i) == val);\n });\n\n Kokkos::parallel_for(\"xorContains_y\", y.extent(0), KOKKOS_LAMBDA(const int i) {\n found_y(i) = (y(i) == val);\n });\n\n bool found_in_x = false;\n bool found_in_y = false;\n\n Kokkos::parallel_reduce(\"xorContains_reduce\", x.extent(0), KOKKOS_LAMBDA(const int i, bool& update) {\n update = update || found_x(i);\n }, found_in_x);\n\n Kokkos::parallel_reduce(\"xorContains_reduce\", y.extent(0), KOKKOS_LAMBDA(const int i, bool& update) {\n update = update || found_y(i);\n }, found_in_y);\n\n return (found_in_x != found_in_y);\n}\n", + "\n\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = Kokkos::parallel_reduce(\n policy_t(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) { contains |= (x(i) == val); },\n false);\n auto y_contains = Kokkos::parallel_reduce(\n policy_t(0, y.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) { contains |= (y(i) == val); },\n false);\n return x_contains ^ y_contains;\n}\n", + "\n\n using namespace Kokkos;\n\n auto lambda = KOKKOS_LAMBDA(const int& x_i, const int& y_i) {\n return (x_i == val) ^ (y_i == val);\n };\n\n auto result = reduce(x.extent(0), lambda, (bool)false);\n\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n auto x_contains = find(x, val, policy_type(0, x.extent(0)));\n auto y_contains = find(y, val, policy_type(0, y.extent(0)));\n return (x_contains != x.extent(0)) ^ (y_contains != y.extent(0));\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& contains) {\n contains |= (x(i) == val);\n }, (int)0);\n\n auto y_contains = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, y.extent(0)), KOKKOS_LAMBDA(int i, int& contains) {\n contains |= (y(i) == val);\n }, (int)0);\n\n return 
(x_contains && !y_contains) || (!x_contains && y_contains);\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int sum) {\n return sum + (x(i) == val);\n }, Kokkos::Sum(0));\n\n auto y_contains = Kokkos::parallel_reduce(y.extent(0), KOKKOS_LAMBDA(int i, int sum) {\n return sum + (y(i) == val);\n }, Kokkos::Sum(0));\n\n return x_contains != y_contains;\n}\n", + "\n\n // Create a Kokkos::View to store the results of the search\n Kokkos::View result(\"result\", x.extent(0));\n\n // Perform the search in parallel\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i) {\n result(i) = (x(i) == val) ^ (y(i) == val);\n });\n\n // Reduce the results to a single boolean value\n bool found = false;\n Kokkos::parallel_reduce(\"xorContains_reduce\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i, bool& found) {\n found |= result(i);\n }, found);\n\n return found;\n}\n", + "\n\n // Create a Kokkos view for the result\n Kokkos::View result(\"result\", x.extent(0));\n\n // Parallel for loop to check if val is in only one of x or y\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n bool found_in_x = false;\n bool found_in_y = false;\n for (int j = 0; j < x.extent(0); ++j) {\n if (x(j) == val) {\n found_in_x = true;\n }\n if (y(j) == val) {\n found_in_y = true;\n }\n }\n result(i) = (found_in_x != found_in_y);\n });\n\n // Reduce the result to a single boolean value\n bool final_result = Kokkos::reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(bool a, bool b) { return a || b; }, result);\n\n return final_result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = find(x, val, policy_t(0, x.extent(0)));\n auto y_contains = find(y, val, policy_t(0, y.extent(0)));\n return (x_contains != x.extent(0)) ^ (y_contains != y.extent(0));\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& contains) {\n if (x(i) == val) { contains = 1; }\n }, Kokkos::Scan::plus());\n\n auto y_contains = Kokkos::parallel_reduce(y.extent(0), KOKKOS_LAMBDA(int i, int& contains) {\n if (y(i) == val) { contains = 1; }\n }, Kokkos::Scan::plus());\n\n return (x_contains == 1) ^ (y_contains == 1);\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use OpenMP to search in parallel.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n #pragma omp critical\n {\n found_x = true;\n }\n }\n }\n #pragma omp for\n for (int i = 0; i < 
y.size(); ++i) {\n if (y[i] == val) {\n #pragma omp critical\n {\n found_y = true;\n }\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n #pragma omp critical\n found_x = true;\n }\n }\n\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n #pragma omp critical\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n #pragma omp critical\n {\n found_x = true;\n }\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n #pragma omp critical\n {\n found_y = true;\n }\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n return x_count != y_count;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return (x_count + y_count) == 1;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == 
val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool in_x = false;\n bool in_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n in_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n in_y = true;\n }\n }\n }\n\n return in_x ^ in_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return x_count != y_count;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n } else {\n found = false;\n #pragma omp cancel for\n }\n }\n }\n }\n #pragma omp parallel for\n for (size_t i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n } else {\n found = false;\n #pragma omp cancel for\n }\n }\n }\n }\n return found;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (size_t i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return x_count != y_count;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n return x_count != y_count;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "serial", + "prompt": "/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n return std::find(x.begin(), x.end(), val) 
!= x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) !=\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) !=\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n bool in_x = false;\n bool in_y = false;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n in_x = true;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n in_y = true;\n }\n }\n return in_x != in_y;\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "hip", + "prompt": "/* Set `found` to true if `val` is only in one of vectors x or y.\n Set it to false if it is in both or neither.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\n__global__ void xorContains(const int *x, const int *y, size_t N, int val, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = tid + block * blockDim;\n int end = min(start + blockDim, N);\n bool found_x = false;\n bool found_y = false;\n for (int i = start; i < end; i++) {\n found_x |= (x[i] == val);\n found_y |= (y[i] == val);\n }\n found[tid] = found_x ^ found_y;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n *found = found_x ^ found_y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.14e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02431666488647 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipGridSize_x) {\n if (x[i] == val) found_x = true;\n if (y[i] == val) found_y = true;\n }\n found[tid] = found_x ^ found_y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n found[tid] = inX ^ inY;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.14e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00014812679738 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool x_contains = x[tid] == val;\n bool y_contains = y[tid] == val;\n found[tid] = x_contains ^ y_contains;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.2e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00017027079538 + } + ] + }, + { + "generated_output": ".", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool x_has = (x[tid] == 
val);\n bool y_has = (y[tid] == val);\n found[tid] = (x_has ^ y_has);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 5.68e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00073439860791 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 5.84e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015494239554 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int block_size = hipBlockDim_x;\n int block_id = hipBlockIdx_x;\n int i = block_id * block_size + tid;\n\n if (i < N) {\n bool x_contains = x[i] == val;\n bool y_contains = y[i] == val;\n found[i] = x_contains ^ y_contains;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.18e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019619060308 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n *found = inX ^ inY;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 4.78e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015391860083 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n bool found_x = (x[i] == val);\n bool found_y = (y[i] == val);\n found[i] = (found_x != found_y);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 4.04e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001778545998 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool found_x = (x[tid] == val);\n bool found_y = (y[tid] == val);\n found[tid] = (found_x ^ found_y);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 5.96e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016895879805 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int idx = tid + N;\n if (idx < 2 * N) {\n *found = (x[tid] == val) || (y[idx - N] == val);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 3.1e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00017350280285 + } + ] + }, + { + 
"generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 6.678e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015174279585 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n found[tid] = (found_x != found_y);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 2.76e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02446261863708 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n *found = (x[gid] == val) ^ (y[gid] == val);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 2.64e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018527839556 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = (x_has_val != y_has_val);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 4.14e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015209479928 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool found_x = (x[tid] == val);\n bool found_y = (y[tid] == val);\n found[tid] = (found_x != found_y);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 4.66e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020438280106 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = (in_x != in_y);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 5.36e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015379060581 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n *found = (x[gid] == val) ^ (y[gid] == val);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 
3.94e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019993480071 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Return the result on rank 0.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int myRank, nRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &nRanks);\n\n bool xContains = std::find(x.begin(), x.end(), val) != x.end();\n bool yContains = std::find(y.begin(), y.end(), val) != y.end();\n\n int xContainsInt = xContains ? 1 : 0;\n int yContainsInt = yContains ? 1 : 0;\n\n int xorContainsInt = xContainsInt ^ yContainsInt;\n\n int globalXorContainsInt;\n MPI_Reduce(&xorContainsInt, &globalXorContainsInt, 1, MPI_INT, MPI_LOR, 0, MPI_COMM_WORLD);\n\n bool globalXorContains = globalXorContainsInt != 0;\n\n if (myRank == 0) {\n return globalXorContains;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int sum;\n MPI_Allreduce(&x_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return sum == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int total_count;\n MPI_Allreduce(&x_count, &total_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &total_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return total_count == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return 
(global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_contains = std::find(x.begin(), x.end(), val) != x.end();\n int y_contains = std::find(y.begin(), y.end(), val) != y.end();\n\n int result = x_contains ^ y_contains;\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int sum;\n MPI_Allreduce(&x_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return sum == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n\n int global_x_count;\n int global_y_count;\n\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) x_count++;\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) y_count++;\n }\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (global_x_count == 0 && global_y_count > 0) return true;\n if (global_x_count > 0 && global_y_count == 0) return 
true;\n return false;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) x_count++;\n if (y[i] == val) y_count++;\n }\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n\n int x_count_sum;\n int y_count_sum;\n\n MPI_Allreduce(&x_count, &x_count_sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &y_count_sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (x_count_sum + y_count_sum) % 2 == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int global_x_count;\n int global_y_count;\n\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (global_x_count == 1 || global_y_count == 1);\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "cuda", + "prompt": "/* Set `found` to true if `val` is only in one of vectors x or y.\n Set it to false if it is in both or neither.\n Use CUDA to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\n__global__ void xorContains(const int *x, const int *y, size_t N, int val, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool x_has = false;\n bool y_has = false;\n for (int i = 0; i < N; i++) {\n if (x[i] == val) x_has = true;\n if (y[i] == val) y_has = true;\n }\n found[tid] = x_has != y_has;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *found = (*found) ^ (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n found[tid] = (inX ^ inY);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n *found = *found || (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *found = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = x_has_val ^ y_has_val;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + "=cernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecerne,cernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecerne)cerne)cerne)cerne)", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = x_has_val ^ y_has_val;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = (in_x ^ in_y);\n 
}\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = (x_has_val != y_has_val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = 0; i < N; i++) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n found[tid] = (found_x != found_y);\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Return the result on rank 0.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int xor_count = x_count + y_count;\n\n int result;\n MPI_Allreduce(&xor_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (global_x_count == 1 || global_y_count == 1) {\n return true;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count;\n int global_y_count;\n MPI_Reduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_x_count != global_y_count;\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n 
x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int result;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < n; i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int j = 0; j < m; j++) {\n if (y[j] == val) {\n y_count++;\n }\n }\n }\n\n int result = x_count + y_count;\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return result == 1;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel for reduction(+:x_count)\n for (int i = 0; i < n; i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = 0; i < m; i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result = x_count + y_count;\n bool ans = (result == 1);\n\n MPI_Reduce(&ans, &ans, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n return ans;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int local_x_count = 0;\n int local_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n local_x_count++;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n local_y_count++;\n }\n }\n #pragma omp critical\n {\n x_count += local_x_count;\n y_count += local_y_count;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(MPI_IN_PLACE, &y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return (x_count == 1 || y_count == 1);\n }\n return false;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n#pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int nt = omp_get_num_threads();\n for (int i = tid; i < n; i += nt) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = tid; i < m; i += nt) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int x_count_global;\n int y_count_global;\n MPI_Allreduce(&x_count, &x_count_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &y_count_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (x_count_global == 1 || y_count_global == 1) {\n return true;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int n_threads = omp_get_num_threads();\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel for\n for (int i = rank; i < n; i += n_threads) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found_x, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n MPI_Allreduce(&found_y, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank searches for `val` in its portion of x and y.\n int my_start = my_rank * n / nprocs;\n int my_end = (my_rank + 1) * n / nprocs;\n bool found_in_x = false;\n bool found_in_y = 
false;\n #pragma omp parallel for\n for (int i = my_start; i < my_end; i++) {\n if (x[i] == val) {\n found_in_x = true;\n }\n if (y[i] == val) {\n found_in_y = true;\n }\n }\n\n // Reduce the results from all ranks to rank 0.\n bool result;\n MPI_Reduce(&found_in_x, &result, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n MPI_Reduce(&found_in_y, &result, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n\n // Rank 0 returns the final result.\n if (my_rank == 0) {\n return result != (found_in_x && found_in_y);\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n ++x_count;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n ++y_count;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int count_x = 0;\n int count_y = 0;\n\n #pragma omp parallel for reduction(+:count_x)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n count_x++;\n }\n }\n\n #pragma omp parallel for reduction(+:count_y)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n count_y++;\n }\n }\n\n int result = count_x + count_y;\n bool ans = (result == 1);\n\n MPI_Allreduce(&ans, &ans, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n return ans;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n #pragma omp parallel\n {\n int my_x_count = 0, my_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) my_x_count++;\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) my_y_count++;\n }\n #pragma omp critical\n {\n x_count += my_x_count;\n y_count += my_y_count;\n }\n }\n int result;\n MPI_Reduce(&x_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (result == 1) return true;\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result = x_count + y_count;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result == 1;\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel reduction(+:x_count, y_count)\n {\n int tid = omp_get_thread_num();\n int nthreads = omp_get_num_threads();\n for (int i = tid; i < x.size(); i += nthreads) {\n if (x[i] == val) {\n 
x_count++;\n }\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count, global_y_count;\n MPI_Reduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return (global_x_count == 1 || global_y_count == 1);\n }\n return false;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n bool result = false;\n MPI_Allreduce(&found_x, &result, 1, MPI_C_BOOL, MPI_BXOR, MPI_COMM_WORLD);\n MPI_Allreduce(&found_y, &result, 1, MPI_C_BOOL, MPI_BXOR, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return result;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel for reduction(+:x_count)\n for (int i = rank; i < x.size(); i += nprocs) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = rank; i < y.size(); i += nprocs) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result;\n MPI_Reduce(&x_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result == 1;\n }\n return false;\n}\n", + "\n\n int x_contains = 0;\n int y_contains = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_contains)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_contains = 1;\n }\n }\n\n #pragma omp for reduction(+:y_contains)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_contains = 1;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_contains, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n MPI_Allreduce(&y_contains, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int local_x_count = 0;\n int local_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n local_x_count++;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n local_y_count++;\n }\n }\n #pragma omp critical\n {\n x_count += local_x_count;\n y_count += local_y_count;\n }\n }\n if (my_rank == 0) {\n return x_count != y_count;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Book {\n const char *title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use Kokkos to search in parallel. 
Assume Kokkos is already initialized.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(Kokkos::View const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using policy_t = Kokkos::RangePolicy;\n using range_t = Kokkos::Range;\n\n size_t result = books.size();\n Kokkos::parallel_reduce(range_t(0, books.size()), KOKKOS_LAMBDA(size_t i, size_t& result) {\n if (books(i)->pages < 100) {\n result = i;\n }\n });\n return result;\n}\n", + "\n\n size_t last_short_book_index = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_short_book_index = books.size() - 1;\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& last_short_book_index) {\n if (books[i]->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_for(books.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t result = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& result) {\n if (books(i)->pages < 100) {\n result = i;\n }\n });\n return result;\n}\n", + "\n\n size_t last_short_book = books.size() - 1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book) {\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n });\n return last_short_book;\n}\n", + "\n\n Kokkos::View lastShortBook(\"lastShortBook\", books.extent(0));\n Kokkos::parallel_for(\"findLastShortBook\", books.extent(0), KOKKOS_LAMBDA(const size_t i) {\n if (books(i).pages < 100) {\n lastShortBook(i) = i;\n }\n });\n size_t result = books.extent(0);\n for (size_t i = books.extent(0) - 1; i >= 0; i--) {\n if (lastShortBook(i) != 0) {\n result = lastShortBook(i);\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t last_short_book = books.size();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& last) {\n if (books[i].pages < 100) {\n last = i;\n }\n }, last_short_book);\n return last_short_book;\n}\n", + "\n\n size_t last_idx = books.size();\n Kokkos::parallel_for(books.size(), KOKKOS_LAMBDA(const size_t& i) {\n if (books[i]->pages < 100) {\n last_idx = i;\n }\n });\n return last_idx;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const int i, const size_t& last) {\n if (books(i).pages < 100) {\n return i;\n }\n return last;\n }, 0);\n}\n", + "\n\n size_t last_index = books.size() - 1;\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books(i).pages >= 100) {\n last_index = i - 1;\n }\n }\n return last_index;\n}\n", + "\n\n size_t last_short_book = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last) {\n if (books(i)->pages < 100) {\n last = i;\n }\n }, last_short_book);\n return last_short_book;\n}\n", + "\n\n size_t last_short_book = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& last_short_book) 
{\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n });\n return last_short_book;\n}\n", + "\n\n size_t last_short_book = books.size();\n\n Kokkos::parallel_reduce(books.extent(0), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book) {\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n }, last_short_book);\n\n return last_short_book;\n}\n", + "\n\n size_t lastShortBook = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& lastShortBook) {\n if (books[i]->pages < 100) {\n lastShortBook = i;\n }\n });\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBook = books.size() - 1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& lastShortBook) {\n if (books(i)->pages < 100) {\n lastShortBook = i;\n }\n });\n return lastShortBook;\n}\n", + "\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_index) {\n if (books(i)->pages < 100) {\n last_index = i;\n }\n });\n return last_index;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n using l_type = long;\n\n auto last_short_book = Kokkos::parallel_scan(\n policy_type(0, books.size()),\n KOKKOS_LAMBDA(const size_t& i, l_type& last) {\n if (books[i]->pages < 100) {\n last = i;\n }\n },\n last_short_book);\n\n return last_short_book;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_scan(policy_type(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n }, tag_type());\n\n return last_short_book_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use OpenMP to search in parallel.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for 
(size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t last_short_book_index = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n last_short_book_index = i;\n }\n }\n return last_short_book_index;\n}\n", + "\n\n size_t last_short = books.size();\n #pragma omp parallel for reduction(max:last_short)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n last_short = i;\n }\n }\n return last_short;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n#pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "_", + "\n\n size_t last_idx = books.size() - 1;\n#pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n last_idx = i - 1;\n break;\n }\n }\n return last_idx;\n}\n", + "\n\n size_t last_short_book_index = books.size() - 1;\n#pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages >= 100) {\n#pragma omp critical\n last_short_book_index = i - 1;\n }\n }\n return last_short_book_index;\n}\n", + "\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n#pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n" + ] + }, + { + "problem_type": "search", + 
"language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "serial", + "prompt": "struct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "hip", + "prompt": "struct Book {\n const char* title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books 
where Book.pages is less than 100.\n Store the result in lastShortBookIndex.\n Use AMD HIP to search in parallel. The kernel is launched with one thread for every book element.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\n__global__ void findLastShortBook(const Book *books, size_t N, size_t *lastShortBookIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.1e-07, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.000162750002 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 1.92e-07, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00015814200044 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 1.84e-07, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00015094180256 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 2.84e-07, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0001583019942 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 1.98e-07, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00015430180356 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.88e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015804579705 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + 
"did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 2.72e-07, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00015398200229 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.98e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015887780488 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 1.98e-07, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00016754980534 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.08e-07, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00015935779437 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 2.78e-07, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00016751779988 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.12e-07, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00016073400304 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.58e-07, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00024393380135 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 1.98e-07, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00016249380186 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && 
books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 1.5e-07, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00019628579542 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 2.86e-07, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00017439779863 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 2.06e-07, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00016681440026 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 1.58e-07, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00019084579572 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 2.94e-07, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00017743779495 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 2.82e-07, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00017602999806 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use MPI to search in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of books. 
Return the result on rank 0.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return lastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "_", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = -1;\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return lastShortBook;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n lastShortBook = i - 1;\n break;\n }\n }\n\n size_t globalLastShortBook;\n MPI_Allreduce(&lastShortBook, &globalLastShortBook, 1, MPI_UNSIGNED_LONG, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalLastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n for (size_t i = result + 1; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n result = i - 1;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = 0;\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n int globalResult;\n MPI_Allreduce(&result, &globalResult, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalResult;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = 
books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n if (result == books.size()) {\n result = -1;\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "cuda", + "prompt": "struct Book {\n const char* title;\n int pages;\n};\n\n/* Find the index of the last Book item in the vector books where Book.pages is less than 100.\n Store the result in lastShortBookIndex.\n Use CUDA to search in parallel. The kernel is launched with one thread for every book element.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\n__global__ void findLastShortBook(const Book *books, size_t N, size_t *lastShortBookIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n if (books[idx].pages < 100) {\n *lastShortBookIndex = idx;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 
100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid >= N) return;\n\tif (books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx >= N) {\n return;\n }\n if (books[idx].pages < 100) {\n *lastShortBookIndex = idx;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use MPI and OpenMP to search in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of books. Return the result on rank 0.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = 0;\n #pragma omp parallel for reduction(max:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = 0;\n #pragma omp parallel for reduction(max:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n if (rank == 0) {\n return lastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 
100) {\n result = i - 1;\n }\n }\n return result;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_UNSIGNED_LONG, MPI_MAX, MPI_COMM_WORLD);\n return lastShortBook;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n size_t local_result = books.size();\n\n #pragma omp parallel reduction(min: local_result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n local_result = i;\n }\n }\n\n MPI_Allreduce(&local_result, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min: result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n int myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n size_t result = books.size();\n #pragma omp parallel for reduction(min: result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n if (myRank == 0) {\n return result;\n }\n return books.size();\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n size_t rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n\n if (rank == 0) {\n return result;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "_", + "\n\n size_t result = books.size() - 1;\n size_t myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n #pragma openmp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n MPI_Allgather(&result, 1, MPI_UNSIGNED_LONG, &result, 1, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n if (myRank == 0) {\n result = books.size() - 1;\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use Kokkos to search in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n auto min_distance = Kokkos::Max();\n size_t min_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n },\n min_distance, min_index);\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n size_t N = x.extent(0);\n size_t closest_idx = 0;\n double closest_diff = std::abs(x(0) - M_PI);\n Kokkos::parallel_for(\n policy_type(0, N), KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n });\n return closest_idx;\n}\n", + "\n\n const double pi = M_PI;\n Kokkos::View closest_distance = Kokkos::View(1);\n Kokkos::View closest_index = Kokkos::View(1);\n closest_distance[0] = std::numeric_limits::max();\n closest_index[0] = 0;\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double distance = std::abs(x[i] - pi);\n if (distance < closest_distance[0]) {\n closest_distance[0] = distance;\n closest_index[0] = i;\n }\n });\n\n return closest_index[0];\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n size_t closest_index = 0;\n double closest_distance = std::abs(x(0) - M_PI);\n Kokkos::parallel_reduce(policy_t(0, x.size()), KOKKOS_LAMBDA(size_t i, double& closest_distance, size_t& closest_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }, closest_distance, closest_index);\n return closest_index;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n const double pi = M_PI;\n size_t N = x.extent(0);\n size_t closest_idx = 0;\n double min_dist = std::abs(x(0) - pi);\n\n Kokkos::parallel_for(\n policy_t(0, N), KOKKOS_LAMBDA(const size_t& i) {\n double dist = std::abs(x(i) - pi);\n if (dist < min_dist) {\n closest_idx = i;\n min_dist = dist;\n }\n });\n\n return closest_idx;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::Functor;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n struct findToPiFunctor {\n Kokkos::View const x;\n double min_diff;\n int min_index;\n\n findToPiFunctor(Kokkos::View const x) : x(x), min_diff(std::numeric_limits::max()), min_index(-1) {}\n\n KOKKOS_INLINE_FUNCTION\n void operator()(const Kokkos::Work& work) const {\n const int i = work.league_rank();\n const double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n };\n\n findToPiFunctor f(x);\n\n Kokkos::parallel_for(RangePolicy<>(0, x.size()), f);\n\n return f.min_index;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View min_diff = Kokkos::View(1);\n Kokkos::View min_index = Kokkos::View(1);\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff(0) || (diff == min_diff(0) && x(i) > x(min_index(0)))) {\n min_diff(0) = diff;\n min_index(0) = i;\n }\n });\n\n return min_index(0);\n}\n", + "\n\n using namespace Kokkos;\n\n auto closest_idx = 0;\n auto min_diff 
= std::numeric_limits::max();\n auto pi = M_PI;\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& closest_idx) {\n auto diff = std::abs(x(i) - pi);\n if (diff < min_diff) {\n min_diff = diff;\n closest_idx = i;\n }\n }, min_diff, closest_idx);\n\n return closest_idx;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Policy = Kokkos::RangePolicy;\n\n auto min_distance = Kokkos::Max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(Policy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, double& min_distance_local) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < min_distance_local) {\n min_distance_local = distance;\n min_index = i;\n }\n }, min_distance);\n\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n using namespace std;\n\n const double pi = M_PI;\n const size_t n = x.extent(0);\n double min_diff = numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(const size_t& i, double& min_diff_local, size_t& min_idx_local) {\n double diff = abs(x(i) - pi);\n if (diff < min_diff_local) {\n min_diff_local = diff;\n min_idx_local = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n\n size_t N = x.extent(0);\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& min_idx) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n\n size_t N = x.extent(0);\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& min_idx) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x(0) - M_PI);\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < closest_diff) {\n closest_idx = i;\n closest_diff = diff;\n }\n });\n\n return closest_idx;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n auto min_distance = std::numeric_limits::max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(policy_t(0, x.size()), KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n }, min_distance, min_index);\n\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_distance = std::numeric_limits::max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n },\n min_distance, min_index);\n\n return min_index;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::value_type;\n\n size_t N = x.size();\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const size_t& i, value_type& min_diff_local, value_type& min_idx_local) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff_local) 
{\n min_diff_local = diff;\n min_idx_local = i;\n }\n },\n min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n size_t n = x.extent(0);\n size_t closest_index = 0;\n double closest_distance = std::abs(x(0) - M_PI);\n\n Kokkos::parallel_for(\n policy_type(0, n), KOKKOS_LAMBDA(const size_t& i) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n });\n\n return closest_index;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Min;\n\n auto min_dist = Reducer(x.data(), x.size());\n double min_val = min_dist.template reduce();\n\n auto dist_from_pi = Reducer(x.data(), x.size(),\n [min_val](double& a, const double& b) {\n a = std::min(a, std::abs(b - M_PI));\n });\n double closest_dist = dist_from_pi.template reduce();\n\n auto closest_idx = Reducer(x.data(), x.size(),\n [closest_dist, min_val](size_t& a, const double& b) {\n if (std::abs(b - M_PI) == closest_dist &&\n std::abs(b - M_PI) < std::abs(min_val - M_PI)) {\n a = a + 1;\n }\n });\n size_t idx = closest_idx.template reduce();\n\n return idx - 1;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_dist = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(size_t i, size_t& min_idx, double& min_dist) {\n double dist = std::abs(x(i) - M_PI);\n if (dist < min_dist) {\n min_dist = dist;\n min_idx = i;\n }\n }, min_idx, min_dist);\n\n return min_idx;\n}\n", + "\n\n auto closest = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(size_t i, size_t closest) {\n auto diff = std::abs(x(i) - M_PI);\n if (closest == -1 || diff < std::abs(x(closest) - M_PI)) {\n closest = i;\n }\n return closest;\n },\n -1);\n return closest;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use OpenMP to search in parallel.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n#pragma omp critical\n {\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n #pragma omp parallel for reduction(min:min_diff) reduction(min:min_idx)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n\n return min_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::numeric_limits::max();\n\n#pragma omp parallel for\n 
for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n#pragma omp critical\n {\n if (diff < closest_diff) {\n closest_idx = i;\n closest_diff = diff;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_dist = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double dist = std::abs(x[i] - M_PI);\n if (dist < closest_dist) {\n #pragma omp critical\n {\n if (dist < closest_dist) {\n closest_dist = dist;\n closest_idx = i;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma for reduction(min:closest_diff)\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n // Find the index of the value in the vector x that is closest to the math constant PI\n // Use M_PI for the value of PI\n // Use OpenMP to search in parallel\n\n // Get the number of threads to use\n int num_threads = omp_get_num_threads();\n\n // Create a reduction variable to store the index of the closest value\n int closest_index = 0;\n\n // Create a reduction variable to store the smallest difference between a value in x and PI\n double smallest_difference = std::numeric_limits::max();\n\n // Parallel for loop to search for the closest value\n #pragma omp parallel for reduction(min : smallest_difference) reduction(min : closest_index)\n for (int i = 0; i < static_cast(x.size()); i++) {\n // Calculate the difference between the current value and PI\n double difference = std::abs(x[i] - M_PI);\n\n // If the difference is smaller than the smallest difference found so far, update the closest index and smallest difference\n if (difference < smallest_difference) {\n closest_index = i;\n smallest_difference = difference;\n }\n }\n\n // Return the index of the closest value\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n\n#pragma omp parallel for reduction(min : min_diff) reduction(min : min_index)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n#pragma omp parallel for reduction(min : min_diff) reduction(min : min_idx)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp 
parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n #pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double min_diff = std::abs(x[0] - M_PI);\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n min_diff = diff;\n closest_index = i;\n }\n }\n }\n }\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n }\n }\n\n return min_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n #pragma omp critical\n {\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n }\n }\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double min_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n#pragma omp critical\n {\n if (diff < min_diff) {\n closest_index = i;\n min_diff = diff;\n }\n }\n }\n }\n\n return closest_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "serial", + "prompt": "/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t 
findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double minDiff = std::numeric_limits::max();\n size_t minDiffIndex = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n return minDiffIndex;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + ": 2", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_dist = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double dist = std::abs(x[i] - M_PI);\n if (dist < min_dist) {\n min_dist = dist;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", 
+ "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double closest = std::abs(x[0] - M_PI);\n size_t closest_index = 0;\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n return closest_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "hip", + "prompt": "/* Find the index of the value in the vector x that is closest to the math constant PI. Store the index in closestToPiIndex.\n Use M_PI for the value of PI.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\n__global__ void findClosestToPi(const double *x, size_t N, size_t *closestToPiIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = get_global_id(0);\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n double minDiff = abs(x[tid] - M_PI);\n int minDiffIndex = tid;\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n double diff = abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n\n if (tid == 0) {\n *closestToPiIndex = minDiffIndex;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 3.8e-08, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02341337089539 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n double diff = fabs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < minDiff) {\n minDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "losest", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double diff = abs(x[gid] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = gid;\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + 
"is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n double diff = abs(x[gid] - M_PI);\n if (gid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = gid;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = 
hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double diff = fabs(x[gid] - M_PI);\n if (gid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = gid;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n double closest = std::numeric_limits::max();\n size_t closest_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n\n // Reduce the closest values to rank 0\n MPI_Reduce(&closest_index, &closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return closest_index;\n } else {\n return 0;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n size_t closest_index = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_index = i;\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(size);\n MPI_Gather(&closest_index, 1, MPI_UNSIGNED_LONG, closest_indices.data(), 1, MPI_UNSIGNED_LONG, 0);\n\n // Find the overall closest index\n size_t overall_closest_index = 0;\n double overall_closest_diff = std::abs(x[closest_indices[0]] - M_PI);\n for (size_t i = 1; i < size; ++i) {\n double diff = std::abs(x[closest_indices[i]] - M_PI);\n if (diff < overall_closest_diff) {\n overall_closest_diff = diff;\n overall_closest_index = closest_indices[i];\n }\n }\n\n // Return the overall closest index\n return overall_closest_index;\n}\n", + "\n\n double 
min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n\n int global_min_diff_index;\n MPI_Allreduce(&min_diff_index, &global_min_diff_index, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n\n return global_min_diff_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n size_t n = x.size();\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n for (size_t i = 1; i < n; ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t n = x.size();\n size_t closest = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < n; i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest = i;\n }\n }\n return closest;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n int result;\n MPI_Reduce(&min_index, &result, 1, MPI_INT, MPI_MINLOC, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Find the index of the value in the vector that is closest to PI on each rank\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n // Reduce the closest indices to find the global closest index\n size_t globalClosestIndex;\n MPI_Reduce(&closestIndex, &globalClosestIndex, 1, MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the global closest index on rank 0\n if (rank == 0) {\n return globalClosestIndex;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n return result;\n}\n", + "\n\n double closest = std::numeric_limits::max();\n size_t closest_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double 
diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_idx = i;\n }\n }\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::numeric_limits::max();\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t closest = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest = i;\n }\n }\n return closest;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n int global_min_idx;\n MPI_Allreduce(&min_idx, &global_min_idx, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return global_min_idx;\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n double closest = std::numeric_limits::max();\n size_t closest_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n\n // Reduce the closest values to rank 0\n MPI_Reduce(&closest_index, &closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return closest_index;\n } else {\n return 0;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "cuda", + "prompt": "/* Find the index of the value in the vector x that is closest to the math constant PI. Store the index in closestToPiIndex.\n Use M_PI for the value of PI.\n Use CUDA to search in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\n__global__ void findClosestToPi(const double *x, size_t N, size_t *closestToPiIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n double minDiff = abs(x[tid] - M_PI);\n int minDiffIndex = tid;\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n double diff = abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n if (tid == 0) {\n *closestToPiIndex = minDiffIndex;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = fabs(x[i] - M_PI);\n if (i == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if 
(diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < minDiff) {\n minDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double diff = abs(x[idx] - M_PI);\n if (idx == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = idx;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestDistance = distance;\n closestIndex = i;\n }\n }\n\n int globalClosestIndex;\n MPI_Allreduce(&closestIndex, &globalClosestIndex, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return globalClosestIndex;\n}\n", + "\n\n // Find the closest value to PI in parallel\n size_t closest_index = 0;\n double min_diff = std::numeric_limits::max();\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest_index = i;\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Allgather(&closest_index, 1, MPI::UNSIGNED_LONG, &closest_indices[0], 1, MPI::UNSIGNED_LONG);\n\n // Find the global closest index\n size_t global_closest_index = 0;\n min_diff = std::numeric_limits::max();\n for (size_t i = 0; i < closest_indices.size(); ++i) {\n double diff = std::abs(x[closest_indices[i]] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n global_closest_index = closest_indices[i];\n }\n }\n\n return global_closest_index;\n}\n", + "\n\n // Find the closest value to PI on each rank\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_index = i;\n closest_distance = distance;\n }\n }\n\n // Reduce the closest index to rank 0\n size_t global_closest_index;\n MPI_Reduce(&closest_index, &global_closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the global closest index on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double min_diff = std::numeric_limits::max();\n int min_diff_index = -1;\n\n #pragma omp parallel for reduction(min : min_diff) reduction(min : min_diff_index)\n for (int i = 0; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n\n return min_diff_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_distance)\n for (size_t i = 0; i < x.size(); ++i) {\n double distance 
= std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n\n return result;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the index of the value in the vector x that is closest to the math constant PI.\n size_t index = 0;\n double min_diff = std::numeric_limits::max();\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n index = i;\n }\n }\n\n // Return the result on rank 0.\n if (rank == 0) {\n return index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n // Get the number of ranks\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Find the closest value to PI on this rank\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_index = i;\n closest_distance = distance;\n }\n }\n\n // Reduce the closest indices and distances to find the overall closest value\n size_t global_closest_index;\n double global_closest_distance;\n MPI_Reduce(&closest_index, &global_closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n MPI_Reduce(&closest_distance, &global_closest_distance, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_distance)\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double minDiff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < minDiff) {\n 
#pragma omp critical\n if (diff < minDiff) {\n closestIndex = i;\n minDiff = diff;\n }\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t closest_idx = 0;\n double min_diff = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Initialize the index of the closest value to -1\n size_t closest_index = -1;\n\n // Initialize the minimum distance to a large value\n double min_distance = std::numeric_limits::max();\n\n // Loop over the elements of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n // Calculate the distance between the current element and PI\n double distance = std::abs(x[i] - M_PI);\n\n // Update the closest index and minimum distance if the current distance is smaller\n #pragma omp critical\n {\n if (distance < min_distance) {\n closest_index = i;\n min_distance = distance;\n }\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(num_ranks);\n MPI_Gather(&closest_index, 1, MPI_UNSIGNED_LONG, closest_indices.data(), 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Find the index with the smallest distance\n if (rank == 0) {\n size_t global_closest_index = -1;\n double global_min_distance = std::numeric_limits::max();\n for (size_t i = 0; i < num_ranks; ++i) {\n if (closest_indices[i] != -1 && x[closest_indices[i]] < global_min_distance) {\n global_closest_index = closest_indices[i];\n global_min_distance = x[closest_indices[i]];\n }\n }\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t closest_idx = 0;\n double min_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n closest_idx = i;\n min_diff = diff;\n }\n }\n }\n }\n\n return closest_idx;\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(Kokkos::View*> const& x, Kokkos::View &r, Kokkos::View &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n // Compute the Fourier transform of x\n auto y = fft_transform(x);\n\n // Extract the real and imaginary parts of the result\n r = real(y);\n i = imag(y);\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& n) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double angle = 2 * M_PI * k * n / x.extent(0);\n real += x(k).real() * cos(angle) + x(k).imag() * sin(angle);\n imag += x(k).imag() * cos(angle) - x(k).real() * sin(angle);\n }\n r(n) = real;\n i(n) = imag;\n });\n}\n", + "\n\n // Compute the FFT of x\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2.0 * M_PI * j * i / x.extent(0);\n sum += x(j) * std::exp(-std::complex(0.0, angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n real += x(j).real() * cos(angle) - x(j).imag() * sin(angle);\n imag += x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA (const int& n) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double angle = 2 * M_PI * k * n / x.extent(0);\n real += x(k).real() * cos(angle) - x(k).imag() * sin(angle);\n imag += x(k).real() * sin(angle) + x(k).imag() * cos(angle);\n }\n r(n) = real;\n i(n) = imag;\n });\n}\n", + "\n\n // Compute the DFT of x\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& n) {\n double re = 0.0;\n double im = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double theta = 2 * M_PI * k * n / x.extent(0);\n re += x(k).real() * cos(theta) + x(k).imag() * sin(theta);\n im += x(k).real() * sin(theta) - x(k).imag() * cos(theta);\n }\n r(n) = re;\n i(n) = im;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double re = 0.0;\n double im = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n re += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n im += x(j).real() * 
sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = re;\n i(i) = im;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += -x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n\n const int n = x.extent(0);\n const int n_half = n / 2;\n\n // Create a plan for the FFT\n FFTPlan plan(n);\n\n // Create a view of the input and output arrays\n View x_real = create_view(x, 0);\n View x_imag = create_view(x, 1);\n View r_view = create_view(r);\n View i_view = create_view(i);\n\n // Execute the FFT\n plan.execute(x_real, x_imag, r_view, i_view);\n\n // Normalize the output\n r_view /= n;\n i_view /= n;\n}\n", + "\nKokkos::parallel_for(Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA (const int i) {\n r(i) = 0;\n i(i) = 0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n r(i) += x(j) * cos(angle);\n i(i) -= x(j) * sin(angle);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += -x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n\n // Create a plan for the FFT\n auto plan = FFTPlan(x.extent(0), true);\n\n // Create a workspace for the FFT\n auto workspace = plan.get_workspace();\n\n // Execute the FFT\n plan.execute(x, workspace);\n\n // Extract the real and imaginary parts of the results\n r = plan.get_output_real();\n i = plan.get_output_imag();\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n // Create a team policy for parallel execution\n TeamPolicy policy(x.extent(0));\n\n // Define the kernel lambda to compute the FFT\n KernelFunctor kernel;\n kernel.template run(policy, [=](const int& i) {\n 
std::complex sum = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sum += x(j) * std::exp(-std::complex(0, angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) - x[t].imag() * sin(angle);\n sumi += x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int N = x.size();\n r.resize(N);\n i.resize(N);\n\n #pragma omp parallel for\n for (int k = 0; k < N; ++k) {\n std::complex sum = 0;\n for (int n = 0; n < N; ++n) {\n double angle = 2 * M_PI * n * k / N;\n sum += x[n] * std::polar(1.0, -angle);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const size_t n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (size_t j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const size_t n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute the fourier transform of x.\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (size_t j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = 
sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::exp(-std::complex(0, 2 * M_PI * k * t / n));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n int num_threads = omp_get_max_threads();\n int chunk_size = n / num_threads;\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int start = tid * chunk_size;\n int end = start + chunk_size;\n if (tid == num_threads - 1) {\n end = n;\n }\n\n for (int j = start; j < end; j++) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < n; k++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[k].real() * cos(angle) + x[k].imag() * sin(angle);\n imag += -x[k].real() * sin(angle) + x[k].imag() * cos(angle);\n }\n r[j] = real;\n i[j] = imag;\n }\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n std::complex omega = std::polar(1.0, -2 * M_PI * t * k / n);\n sum += x[t] * omega;\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, -angle);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumi += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute the DFT of x\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::exp(-2 * M_PI * std::complex(0, 1) * k * t / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::exp(std::complex(0, -angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n 
}\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n 
return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumi += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sumr += x[j].real() * cos(angle) + x[j].imag() * sin(angle);\n sumi += -x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int k = 0; k < n / 2; k++) {\n even[k] = x[2 * k];\n odd[k] = x[2 * k + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int k = 0; k < n / 2; k++) {\n std::complex t = w * odd[k];\n r[k] = r[k] + t.real();\n i[k] = i[k] + t.imag();\n r[k + n / 2] = r[k] - t.real();\n i[k + n / 2] = i[k] - t.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n 
if (n == 0) {\n return;\n }\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2);\n std::vector> odd(n / 2);\n for (int k = 0; k < n / 2; k++) {\n even[k] = x[2 * k];\n odd[k] = x[2 * k + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double pi = std::acos(-1);\n for (int k = 0; k < n / 2; k++) {\n double t = r[k];\n r[k] = t + std::cos(2 * pi * k / n) * r[k + n / 2] - std::sin(2 * pi * k / n) * i[k + n / 2];\n i[k] = t - std::cos(2 * pi * k / n) * r[k + n / 2] + std::sin(2 * pi * k / n) * i[k + n / 2];\n }\n}\n", + "\n\n int n = x.size();\n if (n == 0) {\n return;\n }\n r.resize(n);\n i.resize(n);\n for (int k = 0; k < n; k++) {\n double real = 0;\n double imag = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[j].real() * cos(angle) - x[j].imag() * sin(angle);\n imag += x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = real;\n i[k] = imag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "fft, 4,", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double real = 0;\n double imag = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[j].real() * cos(angle) - x[j].imag() * sin(angle);\n imag += x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r.push_back(real);\n i.push_back(imag);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sumr += x[j].real() * cos(angle) + x[j].imag() * sin(angle);\n sumi += -x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n std::vector> X(n);\n for (int k = 0; k < n; ++k) {\n for (int j = 0; j < n; ++j) {\n X[k] += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n }\n for (int i = 0; i < n; ++i) {\n r[i] = X[i].real();\n i[i] = X[i].imag();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\n__global__ void fft(const hipDoubleComplex *x, double *r, double *i, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n hipDoubleComplex sum = {0, 0};\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n if (i < N) {\n hipDoubleComplex sum = hipCreateDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex rot = hipCreateDoubleComplex(cos(angle), sin(angle));\n sum = hipCadd(sum, hipCmul(x[j], rot));\n }\n r[i] = hipCreal(sum);\n i[i] = hipCimag(sum);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; k++) {\n double arg = -2 * M_PI * k * gid / N;\n hipDoubleComplex w = {cos(arg), sin(arg)};\n sum += x[k] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "fft:", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[i] = sum.x;\n i[i] = sum.y;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n size_t N2 = N / 2;\n if (tid < N2) {\n r[tid] = x[tid].x + x[tid + N2].x;\n i[tid] = x[tid].y + x[tid + N2].y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * gid / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[i] = sum.x;\n i[i] = sum.y;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int id = blockIdx.x * blockDim.x + threadIdx.x;\n if (id < N) {\n r[id] = x[id].real();\n i[id] = x[id].imag();\n }\n __syncbarriers();\n for (size_t n = 2; n <= N; n *= 2) {\n double a = 2 * M_PI / n;\n for (size_t k = 0; k < N; k += n) {\n double c = cos(a * k);\n double s = sin(a * k);\n for (size_t j = 0; j < n / 2; j++) {\n size_t p = k + j;\n size_t q = p + n / 2;\n double t = r[q] * c - i[q] * s;\n i[q] = r[q] * s + i[q] * c;\n r[q] = r[p] - t;\n i[p] = i[p] - t;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * i / N;\n hipDoubleComplex w = hipMakeDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n if (i < N) {\n r[i] = x[i].x;\n i[i] = x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n r[gid] = x[gid].x;\n i[gid] = x[gid].y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n 
size_t gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * gid * j / N});\n sum += x[j] * w;\n }\n r[gid] = hipCreal(sum);\n i[gid] = hipCimag(sum);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx >= N) return;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * idx * k / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[k] * w;\n }\n r[idx] = sum.x;\n i[idx] = sum.y;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t k = blockIdx.x * blockDim.x + threadIdx.x;\n if (k < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0, -2 * M_PI * j * k / N));\n sum += x[j] * w;\n }\n r[k] = hipCreal(sum);\n i[k] = hipCimag(sum);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx >= N) return;\n hipDoubleComplex sum = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * idx * k / N;\n hipDoubleComplex w = hipMakeDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[idx] = creal(sum);\n i[idx] = cimag(sum);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Compute the fourier transform of x.\n hipDoubleComplex *X = hipMalloc(N);\n hipMemcpy(X, x, N * sizeof(hipDoubleComplex), hipMemcpyHostToDevice);\n hipfftPlan plan;\n hipfftPlanCreate(&plan, N, HIPFFT_C2C, HIPFFT_FORWARD);\n hipfftExecC2C(plan, X, X, HIPFFT_FORWARD);\n hipfftDestroyPlan(plan);\n hipMemcpy(r, X, N * sizeof(double), hipMemcpyDeviceToHost);\n hipMemcpy(i, X + 1, N * sizeof(double), hipMemcpyDeviceToHost);\n hipFree(X);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": 
null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n int stride = hipBlockDim_x * hipGridDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * k * gid / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[k] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "fft,", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y);\n\n // Gather the results on rank 0\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT of z on rank 0\n if (rank == 0) {\n fft_local(z, y);\n }\n\n // Scatter the results to all ranks\n MPI_Scatter(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the results\n for (int j = 0; j < n; j++) {\n r[j] = z[j].real();\n i[j] = z[j].imag();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n y[k] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Send the DFT of x to rank 0\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse DFT of z\n if (rank == 0) {\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n x[k] += z[j] * std::exp(2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n x[k] /= n;\n r[k] = x[k].real();\n i[k] = x[k].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n 
MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the global result\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector r_local(n), i_local(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y, r_local, i_local);\n\n // Gather the results on rank 0\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < n; i++) {\n r[i] = r_local[i];\n i[i] = i_local[i];\n }\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n), i_recv(n);\n MPI_Recv(r_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n r[j] += r_recv[j];\n i[j] += i_recv[j];\n }\n }\n } else {\n MPI_Send(r_local.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i_local.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_fft(n);\n fft_local(x, local_fft);\n\n // Gather local FFTs\n std::vector> global_fft(n * size);\n MPI_Gather(local_fft.data(), n, MPI_DOUBLE, global_fft.data(), n, MPI_DOUBLE, 0);\n\n // Compute global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter global FFT\n MPI_Scatter(global_fft.data(), n, MPI_DOUBLE, local_fft.data(), n, MPI_DOUBLE, 0);\n\n // Extract real and imaginary parts\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = local_fft[j].real();\n i[j] = local_fft[j].imag();\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n);\n std::vector> local_r(local_n), local_i(local_n);\n fft_recursive(local_x, local_r, local_i);\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int 
rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].real();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n std::vector> Z(n);\n\n for (int j = 0; j < n; j++) {\n X[j] = x[j];\n }\n\n MPI_Alltoall(X.data(), 1, MPI_DOUBLE_INT, Y.data(), 1, MPI_DOUBLE_INT, MPI_COMM_WORLD);\n\n for (int j = 0; j < n; j++) {\n Z[j] = Y[j] * std::exp(-2.0 * M_PI * std::complex(0.0, 1.0) * j / n);\n }\n\n MPI_Alltoall(Z.data(), 1, MPI_DOUBLE_INT, Y.data(), 1, MPI_DOUBLE_INT, MPI_COMM_WORLD);\n\n for (int j = 0; j < n; j++) {\n std::complex sum = std::complex(0.0, 0.0);\n for (int k = 0; k < n; k++) {\n sum += Y[k] * std::exp(2.0 * M_PI * std::complex(0.0, 1.0) * j * k / n);\n }\n X[j] = sum;\n }\n\n for (int j = 0; j < n; j++) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n if (rank == 0) {\n for (int j = 1; j < size; j++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Status status;\n MPI_Request request;\n MPI_Irecv(r_recv.data(), n, MPI_DOUBLE, j, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n MPI_Irecv(i_recv.data(), n, MPI_DOUBLE, j, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_impl(local_x, local_r, 
local_i);\n\n // Gather the results\n std::vector> r_all(x.size());\n std::vector> i_all(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_all.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_all.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r = std::vector(r_all.size());\n i = std::vector(i_all.size());\n for (size_t j = 0; j < r_all.size(); ++j) {\n r[j] = r_all[j].real();\n i[j] = r_all[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y);\n\n // Gather the results on rank 0\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < size; i++) {\n std::vector> y_i(n);\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &y_i[0], n, MPI_DOUBLE_COMPLEX, i, MPI_COMM_WORLD);\n for (int j = 0; j < n; j++) {\n r[j] += y_i[j].real();\n i[j] += y_i[j].imag();\n }\n }\n } else {\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); j++) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n if (n != (1 << (int)log2(n))) {\n throw std::invalid_argument(\"x must have a power of 2 length\");\n }\n\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = x[i];\n }\n\n for (int s = 1; s <= (int)log2(n); s++) {\n int m = 1 << s;\n int m2 = m / 2;\n double theta = 2 * M_PI / m;\n std::complex w(cos(theta), sin(theta));\n std::complex wn(1, 0);\n for (int j = 0; j < m2; j++) {\n for (int k = j; k < n; k += m) {\n std::complex t = wn * y[k + m2];\n y[k + m2] = y[k] - t;\n y[k] = y[k] + t;\n }\n wn = wn * w;\n }\n }\n\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n r[i] = y[i].real();\n i[i] = y[i].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_impl(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_r(n_local);\n std::vector> local_i(n_local);\n\n // Compute local FFT\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(n);\n std::vector> global_i(n);\n MPI_Gather(local_r.data(), n_local, MPI_DOUBLE, global_r.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n_local, MPI_DOUBLE, global_i.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of the transform\n std::vector> local_r(n), local_i(n);\n for (int j = 0; j < n; ++j) {\n std::complex sum_r(0, 0), sum_i(0, 0);\n for (int k = 0; k < n; ++k) {\n double angle = -2 * M_PI * j * k / n;\n sum_r += std::exp(std::complex(0, angle)) * x[k];\n sum_i += std::exp(std::complex(0, angle)) * x[k] * std::complex(0, 1);\n }\n local_r[j] = sum_r;\n local_i[j] = sum_i;\n }\n\n // Gather the local transforms into a global array\n std::vector> global_r(n), global_i(n);\n MPI_Allgather(local_r.data(), n, MPI_DOUBLE_COMPLEX, global_r.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n MPI_Allgather(local_i.data(), n, MPI_DOUBLE_COMPLEX, global_i.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the global transform\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * n / size, x.begin() + (rank + 1) * n / size);\n std::vector> local_r(n / size), local_i(n / size);\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(n), i_gathered(n);\n MPI_Gather(local_r.data(), n / size, MPI_DOUBLE, r_gathered.data(), n / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n / size, MPI_DOUBLE, i_gathered.data(), n / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(n);\n i = std::vector(n);\n for (int j = 0; j < n; j++) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n\n // Compute the local Fourier transform\n std::vector> local_x(n);\n std::vector> local_r(n);\n std::vector> local_i(n);\n for (int j = 0; j < n; ++j) {\n local_x[j] = x[j];\n }\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(n * size);\n std::vector> i_gathered(n * size);\n MPI_Gather(local_r.data(), n, MPI_DOUBLE_COMPLEX, r_gathered.data(), n, 
MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n, MPI_DOUBLE_COMPLEX, i_gathered.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n for (int j = 0; j < n; ++j) {\n r[j] = r_gathered[j].real();\n i[j] = i_gathered[j].real();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Allgather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), local_r.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n MPI_Allgather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), local_i.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].real();\n }\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\n__global__ void fft(const cuDoubleComplex *x, double *r, double *i, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) {\n return;\n }\n double sum_r = 0.0;\n double sum_i = 0.0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * idx / N;\n sum_r += x[k].x * cos(angle) + x[k].y * sin(angle);\n sum_i += -x[k].x * sin(angle) + x[k].y * cos(angle);\n }\n r[idx] = sum_r;\n i[idx] = sum_i;\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t k = blockIdx.x * blockDim.x + threadIdx.x;\n if (k < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * k * j / N;\n cuDoubleComplex rot = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * rot;\n }\n r[k] = creal(sum);\n i[k] = cimag(sum);\n }\n}\n", + "\n\n // Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n // Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n // Example:\n\n // input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n // output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[gid] = creal(sum);\n i[gid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = block.thread.x + block.block.x * block.grid.x;\n if (tid >= N) {\n return;\n }\n double re = 0, im = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * tid * k / N;\n re += x[k] * cos(angle);\n im += x[k] * sin(angle);\n }\n r[tid] = re;\n i[tid] = im;\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i0 = tid + bid * blockDim.x;\n if (i0 < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i0 * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i0] = creal(sum);\n i[i0] = cimag(sum);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n int stride = blockDim.x * gridDim.x;\n for (int k = tid; k < N; k += stride) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * k / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(x[j], term));\n }\n r[k] = sum.x;\n i[k] = sum.y;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * k * tid / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx >= N) return;\n\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * idx * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[idx] = creal(sum);\n i[idx] = cimag(sum);\n}\n", + "\n\n size_t tid = 
threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = sum.x;\n i[tid] = sum.y;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n size_t offset = tid % N;\n double sumr = 0, sumi = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * offset / N;\n sumr += real(x[k]) * cos(angle) + imag(x[k]) * sin(angle);\n sumi += -real(x[k]) * sin(angle) + imag(x[k]) * cos(angle);\n }\n r[tid] = sumr;\n i[tid] = sumi;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid >= N) return;\n\n double re = 0, im = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n re += x[k].x * cos(angle) - x[k].y * sin(angle);\n im += x[k].x * sin(angle) + x[k].y * cos(angle);\n }\n r[tid] = re;\n i[tid] = im;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i0 = tid + bid * blockDim.x;\n if (i0 < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * i0 / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[i0] = creal(sum);\n i[i0] = cimag(sum);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> x_local(x.begin() + start, x.begin() + end);\n std::vector> y_local(n_local);\n std::vector> y(n);\n\n // Compute the DFT of the local data\n fft_local(x_local, y_local);\n\n // Gather the results from all ranks\n MPI_Allgather(y_local.data(), n_local, MPI_DOUBLE_COMPLEX, y.data(), n_local, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Compute the global DFT\n fft_global(y, n);\n\n // Extract the real and imaginary parts of the result\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = y[j].real();\n i[j] = y[j].imag();\n }\n\n // Send the result to rank 0\n if (rank != 0) {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n // Receive the results from all ranks\n for (int i = 1; i < size; i++) {\n MPI_Recv(r.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_fft(n);\n fftw_plan plan = fftw_plan_dft_1d(n, reinterpret_cast(&x[0]), reinterpret_cast(&local_fft[0]), FFTW_FORWARD, FFTW_ESTIMATE);\n fftw_execute(plan);\n fftw_destroy_plan(plan);\n\n // Gather local FFT results\n std::vector> global_fft(n * size);\n MPI_Gather(local_fft.data(), n, MPI_DOUBLE, global_fft.data(), n, MPI_DOUBLE, 0);\n\n // Compute global FFT\n fftw_plan plan2 = fftw_plan_dft_1d(n * size, reinterpret_cast(&global_fft[0]), reinterpret_cast(&global_fft[0]), FFTW_BACKWARD, FFTW_ESTIMATE);\n fftw_execute(plan2);\n fftw_destroy_plan(plan2);\n\n // Extract real and imaginary parts\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = real(global_fft[j] / n);\n i[j] = imag(global_fft[j] / n);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(local_n);\n std::vector> local_r(local_n);\n std::vector> local_i(local_n);\n std::copy(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n, local_x.begin());\n fft_serial(local_x, local_r, local_i);\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int n_local_padded = n_local * 2;\n\n std::vector> x_local(n_local);\n std::vector> x_local_padded(n_local_padded);\n std::vector> y_local(n_local_padded);\n std::vector> y(n);\n\n MPI_Scatter(x.data(), n_local, MPI_DOUBLE_COMPLEX, x_local.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Pad with zeros\n for (int i = 0; i < n_local; i++) {\n x_local_padded[i] = x_local[i];\n x_local_padded[i + n_local] = 0;\n }\n\n // Compute FFT in parallel\n fft_recursive(x_local_padded, y_local);\n\n // Gather results\n MPI_Gather(y_local.data(), n_local_padded, MPI_DOUBLE_COMPLEX, y.data(), n_local_padded, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < n; i++) {\n r[i] = y[i].real();\n i[i] = y[i].imag();\n }\n }\n}\n", + "\n\n // Compute the FFT of x.\n std::vector> X(x);\n fft_mpi(X);\n\n // Store the real and imaginary parts of the FFT in r and i.\n r.resize(X.size());\n i.resize(X.size());\n for (size_t j = 0; j < X.size(); ++j) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n // Gather the results on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector r_all(X.size() * MPI::COMM_WORLD.Get_size());\n std::vector i_all(X.size() * MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Gather(&r[0], X.size(), MPI::DOUBLE, &r_all[0], X.size(), MPI::DOUBLE, 0);\n MPI::COMM_WORLD.Gather(&i[0], X.size(), MPI::DOUBLE, &i_all[0], X.size(), MPI::DOUBLE, 0);\n r = r_all;\n i = i_all;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_all(x.size());\n std::vector> 
i_all(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_all.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_all.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); j++) {\n r[j] = r_all[j].real();\n i[j] = r_all[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n std::complex sum = std::complex(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n X[k] = sum;\n }\n\n // Compute the inverse DFT of X\n for (int k = 0; k < n; k++) {\n std::complex sum = std::complex(0, 0);\n for (int j = 0; j < n; j++) {\n sum += X[j] * std::exp(2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n Y[k] = sum / n;\n }\n\n // Store the real and imaginary parts of Y in r and i\n for (int k = 0; k < n; k++) {\n r[k] = Y[k].real();\n i[k] = Y[k].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Recv(&r_recv[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(&i_recv[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(&r[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(&i[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n y[k] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j] += y[k] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Store the real and imaginary parts of the DFT in r and i\n for (int j = 0; j < n; j++) {\n r[j] = z[j].real();\n i[j] = z[j].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_i(n);\n std::vector i_i(n);\n MPI_Recv(r_i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n r[j] += r_i[j];\n i[j] += i_i[j];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute local FFTs\n std::vector> local_r(n);\n std::vector> local_i(n);\n fft_local(x, local_r, local_i);\n\n // Gather local FFTs\n std::vector> global_r(n * size);\n std::vector> global_i(n * size);\n MPI_Gather(local_r.data(), n, MPI_DOUBLE_COMPLEX, global_r.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n, MPI_DOUBLE_COMPLEX, global_i.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute global FFT\n if (rank == 0) {\n 
fft_global(global_r, global_i);\n }\n\n // Scatter global FFT\n MPI_Scatter(global_r.data(), n, MPI_DOUBLE_COMPLEX, r.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Scatter(global_i.data(), n, MPI_DOUBLE_COMPLEX, i.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local partial sums\n std::vector> local_r(n, 0.0);\n std::vector> local_i(n, 0.0);\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n double angle = -2 * M_PI * j * k / n;\n local_r[k] += x[j] * std::cos(angle);\n local_i[k] += x[j] * std::sin(angle);\n }\n }\n\n // Reduce partial sums to rank 0\n std::vector> global_r(n, 0.0);\n std::vector> global_i(n, 0.0);\n MPI_Reduce(local_r.data(), n, MPI_DOUBLE, global_r.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Reduce(local_i.data(), n, MPI_DOUBLE, global_i.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts on rank 0\n if (rank == 0) {\n for (int k = 0; k < n; k++) {\n r[k] = global_r[k].real();\n i[k] = global_i[k].imag();\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n\n // Compute the FFT of x\n #pragma opm parallel for\n for (int j = 0; j < n; j++) {\n X[j] = x[j];\n }\n fft(X);\n\n // Compute the real and imaginary parts of the FFT\n #pragma opm parallel for\n for (int j = 0; j < n; j++) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int j = 1; j < size; j++) {\n std::vector r_j(n);\n std::vector i_j(n);\n MPI_Gather(&r[0], n, MPI_DOUBLE, &r_j[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&i[0], n, MPI_DOUBLE, &i_j[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (int k = 0; k < n; k++) {\n r[k] += r_j[k];\n i[k] += i_j[k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Allgather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n 
MPI_Allgather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_r(n_local);\n std::vector> local_i(n_local);\n\n // Compute the local FFT\n fft_local(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> r_gathered(n);\n std::vector> i_gathered(n);\n MPI_Gather(local_r.data(), n_local, MPI_DOUBLE, r_gathered.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n_local, MPI_DOUBLE, i_gathered.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r = std::vector(n);\n i = std::vector(n);\n for (int j = 0; j < n; j++) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(local_n);\n std::vector> local_r(local_n);\n std::vector> local_i(local_n);\n\n std::copy(x.begin() + local_start, x.begin() + local_end, local_x.begin());\n\n #pragma omp parallel for\n for (int j = 0; j < local_n; j++) {\n double sum_r = 0.0;\n double sum_i = 0.0;\n for (int k = 0; k < n; k++) {\n double theta = 2 * M_PI * k / n;\n sum_r += x[k] * cos(theta * j);\n sum_i += x[k] * sin(theta * j);\n }\n local_r[j] = sum_r;\n local_i[j] = sum_i;\n }\n\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> 
i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n y[k] = sum;\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j * n + k] = y[k * n + j];\n }\n }\n\n // Compute the DFT of z\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += z[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n y[k] = sum;\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j * n + k] = y[k * n + j];\n }\n }\n\n // Store the real and imaginary parts of the DFT in r and i\n for (int k = 0; k < n; k++) {\n r[k] = z[k].real();\n i[k] = z[k].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Recv(r_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const size_t n = x.extent(0);\n const size_t n_half = n / 2;\n\n // Compute the FFT of the first half of x.\n auto x_half = x.slice(0, n_half);\n fft(x_half);\n\n // Compute the FFT of the second half of x.\n auto x_half_plus = x.slice(n_half, n_half);\n fft(x_half_plus);\n\n // Compute the imaginary conjugate of the second half of x.\n for (size_t i = 0; i < n_half; ++i) {\n x_half_plus(i) = conj(x_half_plus(i));\n }\n\n // Swap the first and second halves of x.\n auto temp = x_half;\n x_half = x_half_plus;\n x_half_plus = temp;\n\n // Compute the inverse FFT of x.\n fft(x, Inverse);\n\n // Normalize the FFT by dividing by n.\n for (size_t i = 0; i < n; ++i) {\n x(i) /= n;\n }\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n 
Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const size_t n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n std::vector> w(n);\n std::vector> u(n);\n std::vector> t(n);\n\n#pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n y[i] = x[i];\n }\n\n for (size_t i = 0; i < n; i++) {\n for (size_t j = 0; j < n; j++) {\n w[j] = std::exp(std::complex(0, -2 * M_PI * i * j / n));\n }\n for (size_t j = 0; j < n; j++) {\n z[j] = y[j] * w[j];\n }\n for (size_t j = 0; j < n; j++) {\n u[j] = z[j];\n }\n for (size_t j = 0; j < n / 2; j++) {\n t[j] = u[j] + std::conj(u[n - j - 1]);\n t[n - j - 1] = std::complex(0, 1) * (u[j] - std::conj(u[n - j - 1]));\n }\n for (size_t j = 0; j < n; j++) {\n y[j] = t[j];\n }\n }\n\n#pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n x[i] = y[i];\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // Compute the fourier transform of x in-place.\n fft(x);\n\n // Compute the imaginary conjugate of each value.\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n int size = 1 << log_n;\n std::vector> y(size);\n for (int i = 0; i < n; i++) {\n y[i] = x[i];\n }\n for (int i = n; i < size; i++) {\n y[i] = 0;\n }\n std::vector> roots(size / 2);\n for (int i = 0; i < size / 2; i++) {\n double angle = 2 * M_PI * i / size;\n roots[i] = std::complex(cos(angle), sin(angle));\n }\n for (int i = 0; i < log_n; i++) {\n int step = 1 << i;\n int half_step = step / 2;\n for (int j = 0; j < size; j += step) {\n for (int k = 0; k < half_step; k++) {\n std::complex u = y[j + k];\n std::complex v = y[j + k + half_step] * roots[size / (2 * step) * k];\n y[j + k] = u + v;\n y[j + k + half_step] = u - v;\n }\n }\n }\n for (int i = 0; i < size; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int numThreads = omp_get_max_threads();\n const int chunkSize = n / numThreads;\n\n #pragma omp parallel\n {\n #pragma omp for schedule(static, chunkSize)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n }\n}\n", + "\n\n const int n = x.size();\n const int lg = __lg(n);\n int i0 = 0;\n for (int l = 2; l <= n; l <<= 1) {\n for (int i = 0; i < n; i += l) {\n for (int j = 0; j < l / 2; j++) {\n std::complex u = x[i0 + j];\n std::complex v = x[i0 + j + l / 2];\n x[i0 + j] = u + v;\n x[i0 + j + l / 2] = std::complex(u.real() - v.real(), -v.imag());\n }\n i0 += l;\n }\n i0 = 0;\n }\n for (int i = 0; i < n; i++) {\n int j = __builtin_reverse(i, lg);\n if (i < j) {\n std::swap(x[i], x[j]);\n }\n }\n}\n", + "\n\n int n = x.size();\n std::vector> y(n);\n for (int i 
= 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n fft(y);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n // Compute the Fourier transform of x in-place.\n fft(x);\n\n // Compute the imaginary conjugate of each value.\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x\n fft(x, y);\n\n // Compute the conjugate of each value in y\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n\n // Copy the conjugate values back into x\n x = z;\n}\n", + "\n\n const int n = x.size();\n if (n == 1) return;\n std::vector> even(n / 2), odd(n / 2);\n#pragma omp parallel for\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int n_threads = omp_get_max_threads();\n const int n_threads_per_group = 4;\n const int n_groups = n_threads / n_threads_per_group;\n\n std::vector> y(n);\n std::vector> z(n);\n\n #pragma omp parallel num_threads(n_threads)\n {\n const int tid = omp_get_thread_num();\n const int group_id = tid / n_threads_per_group;\n const int local_id = tid % n_threads_per_group;\n\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n y[rev] = x[i];\n }\n\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const double omega = 2 * M_PI / m;\n const std::complex w(cos(omega), sin(omega));\n\n #pragma omp for schedule(static)\n for (int k = 0; k < n; k += m) {\n std::complex wn(1, 0);\n for (int j = 0; j < m / 2; j++) {\n const std::complex t1 = y[k + j];\n const std::complex t2 = wn * y[k + j + m / 2];\n y[k + j] = t1 + t2;\n y[k + j + m / 2] = t1 - t2;\n wn *= w;\n }\n }\n }\n\n #pragma omp for schedule(static)\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n }\n\n x = z;\n}\n", + "\n\n const int n = x.size();\n const int n_half = n / 2;\n const double pi = 3.14159265358979323846;\n\n // Reverse-bit permutation\n for (int i = 0; i < n; i++) {\n int j = 0;\n for (int k = 0; k < n_half; k++) {\n if (i & (1 << k)) {\n j |= (1 << (n_half - k - 1));\n }\n }\n if (i < j) {\n std::swap(x[i], x[j]);\n }\n }\n\n // Compute the FFT in-place\n for (int s = 1; s <= n_half; s *= 2) {\n const double theta = pi / s;\n const std::complex wn(std::cos(theta), std::sin(theta));\n for (int i = 0; i < n; i += 2 * s) {\n std::complex w(1, 0);\n for (int j = 0; j < s; j++) {\n const std::complex u = x[i + j];\n const std::complex v = x[i + j + s] * w;\n x[i + j] = u + v;\n x[i + j + s] = u - v;\n w *= wn;\n }\n }\n }\n\n // Conjugate the imaginary part of each value\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x\n fft(x, y);\n\n // Compute the imaginary conjugate of each value in y\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n\n // Copy the values of z back into x\n x = z;\n}\n", + "\n\n const int n = x.size();\n const int num_threads = omp_get_max_threads();\n const int 
chunk_size = n / num_threads;\n\n #pragma omp parallel for schedule(static, chunk_size)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int nthreads = omp_get_max_threads();\n const int chunksize = n / nthreads;\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n if (i < rev) {\n std::swap(x[i], x[rev]);\n }\n }\n\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const double wn = std::polar(1.0, -2.0 * M_PI / m);\n const double w = 1.0;\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int j = 0; j < n; j += m) {\n double wr = w;\n double wi = 0.0;\n for (int k = 0; k < m / 2; k++) {\n const std::complex t = x[j + k + m / 2] * std::complex(wr, wi);\n x[j + k + m / 2] = x[j + k] - t;\n x[j + k] += t;\n const double newwr = wr * w.real() - wi * w.imag();\n const double newwi = wr * w.imag() + wi * w.real();\n wr = newwr;\n wi = newwi;\n }\n }\n }\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n std::vector> y(n);\n std::vector> w(n);\n for (int i = 0; i < n; i++) {\n w[i] = std::polar(1.0, -2 * M_PI * i / n);\n }\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * w[j * k % n];\n }\n y[k] = sum;\n }\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Take the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int n_threads = omp_get_max_threads();\n\n // Reverse bits\n #pragma omp parallel for schedule(static)\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n if (i < rev) {\n std::swap(x[i], x[rev]);\n }\n }\n\n // Compute the FFT\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const int m2 = m >> 1;\n const double theta = 2 * M_PI / m;\n std::complex w(1, 0);\n std::complex wn(std::cos(theta), std::sin(theta));\n\n #pragma omp parallel for schedule(static)\n for (int k = 0; k < n; k += m) {\n for (int j = 0; j < m2; j++) {\n std::complex t = w * x[k + j + m2];\n x[k + j + m2] = x[k + j] - t;\n x[k + j] += t;\n }\n w *= wn;\n }\n }\n\n // Conjugate the imaginary part\n #pragma omp parallel for schedule(static)\n for (int i = 0; i < n; i++) {\n x[i].imag(-x[i].imag());\n }\n}\n", + "\n\n // Compute the FFT in-place\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int num_threads = omp_get_max_threads();\n const int chunk_size = n / num_threads;\n\n #pragma omp parallel for schedule(static, chunk_size)\n for (int i = 0; i < n; ++i) {\n x[i] = std::conj(x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x in-place. 
Return the imaginary conjugate of each value.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) return;\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::polar(1.0, ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n x[i][1] = -x[i][1];\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex tmp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(tmp);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n return;\n }\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n fft(y);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) return;\n std::vector> even = x;\n std::vector> odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const size_t n = x.size();\n if (n <= 1) return;\n\n std::vector> even = x[std::slice(0, n / 2, 2)];\n std::vector> odd = x[std::slice(1, n / 2, 2)];\n\n fftConjugate(even);\n fftConjugate(odd);\n\n for (size_t i = 0; i < n / 2; 
++i) {\n std::complex t = std::exp(std::complex(0, -2 * M_PI * i / n)) * odd[i];\n x[i] = even[i] + t;\n x[i + n / 2] = even[i] - t;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n for (auto &v : x) {\n v = std::conj(v);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fftConjugate(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[N - 1 - i].x;\n x[i].y = -x[N - 1 - i].y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = 
-x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = hipThreadIdx.x;\n if (i < size) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + 
"is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[N - 1 - i].x;\n x[i].y = -x[N - 1 - i].y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); ++i) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector> y_recv(x.size());\n MPI_Recv(y_recv.data(), y_recv.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < y_recv.size(); ++j) {\n y[j] += y_recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT of x in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int log2n = 0;\n while ((1 << log2n) < n) log2n++;\n\n std::vector> y(n);\n for (int i = 0; i < n; i++) y[i] = x[i];\n\n for (int s = 1; s <= log2n; s++) {\n int m = 1 << s;\n int m2 = m / 2;\n double theta = 2 * M_PI / m;\n std::complex w(cos(theta), sin(theta));\n std::complex wn(1, 0);\n for (int j = 0; j < m2; j++) {\n for (int k = j; k < n; k += m) {\n std::complex t = wn * y[k + m2];\n y[k + m2] = y[k] - t;\n y[k] = y[k] + t;\n }\n wn = wn * w;\n }\n }\n\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n }\n}\n", + ".", + "\n\n size_t n = x.size();\n std::transform(x.begin(), x.end(), x.begin(), [](std::compelex z) { return std::compelex(z.re, -z.im); });\n std::sort(x.begin(), x.end(), [](std::compelex a, std::compelex b) { return std::im(a) < std::im(b); });\n std::transform(x.begin(), x.end(), x.begin(), [](std::compelex z) { return std::compelex(z.re, -z.im); });\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs\n 
std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the global FFT\n if (rank == 0) {\n fft(global_fft, x);\n }\n\n // Scatter the global FFT\n MPI_Scatter(x.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the conjugate of the local FFT\n for (size_t i = 0; i < local_fft.size(); i++) {\n local_fft[i] = std::conj(local_fft[i]);\n }\n\n // Gather the conjugate FFTs\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the inverse global FFT\n if (rank == 0) {\n ifft(global_fft, x);\n }\n\n // Scatter the inverse global FFT\n MPI_Scatter(x.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_fft = fft(x);\n\n // Gather the local Fourier transforms on rank 0\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (auto &value : global_fft) {\n value = std::conj(value);\n }\n }\n\n // Scatter the imaginary conjugate Fourier transforms to all ranks\n MPI_Scatter(global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse Fourier transform\n x = ifft(local_fft);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv(x.size());\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n y[j] += recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of the result\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results from all ranks\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place on each rank\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if 
(rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x, y);\n\n // Conjugate the local FFT\n for (int i = 0; i < n_local; i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the local FFTs\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the result to the input vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place on each rank\n std::fft(x.begin(), x.end());\n\n // Conjugate the imaginary part of each value\n for (auto &v : x) {\n v.imag() = -v.imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, y.data(), x.size(), MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n x = std::move(y);\n } else {\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, nullptr, 0, MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft(x);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (auto &value : global_fft) {\n value = std::conj(value);\n }\n }\n\n // Scatter the global FFT back to each rank\n MPI_Scatter(global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the local FFT on rank 0\n if (rank == 0) {\n x = local_fft;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> y_recv(x.size());\n MPI_Recv(y_recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < y.size(); j++) {\n y[j] += y_recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n MPI_FFT(x.data(), 1, x.size(), MPI_COMPLEX, MPI_FORWARD, MPI_COMM_WORLD, y.data(), &y.size());\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n x = y;\n for (int i = 1; i 
< size; i++) {\n std::vector> tmp(x.size());\n MPI_Recv(tmp.data(), tmp.size(), MPI_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(y.data(), y.size(), MPI_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv(x.size());\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n y[j] += recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT of x in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of the local data\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the local data to rank 0\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the gathered data to the output vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft(x);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (int i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the global FFT back to each rank\n MPI_Scatter(global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = local_fft;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fftConjugate(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[N - i - 1].x;\n x[i].y = -x[N - i - 1].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n // Compute the FFT\n fft(x, N);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < N; i++) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "fft{4,0}", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid].x = x[tid].x;\n x[tid].y = -x[tid].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = cuConjf(x[i]);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[N - i - 1].x;\n x[i].y = -x[N - i - 1].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = make_cuDoubleComplex(creal(x[i]), -cimag(x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data in parallel\n fft(x, start, end);\n\n // Conjugate the imaginary part of the local data\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i].imag() = -x[i].imag();\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[start], n_local, MPI_DOUBLE_COMPLEX, &x[0], n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + ".4.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x.begin() + rank * n_local, x.begin() + rank * n_local + n_local, y.begin());\n\n // Conjugate the local FFT\n for (int i = 0; i < n_local; i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the local FFTs\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the result to rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel using MPI and OpenMP\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag() = -x[i].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform local FFT on each rank\n std::vector> localX = x;\n fft(localX);\n\n // Gather all the local FFT results on rank 0\n std::vector> globalX(x.size());\n MPI_Gather(localX.data(), localX.size(), MPI_DOUBLE_COMPLEX, globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value on rank 0\n if (rank == 0) {\n for (auto &value : globalX) {\n value = std::conj(value);\n }\n }\n\n // Scatter the imaginary conjugate results back to each rank\n MPI_Scatter(globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, x.data(), 
x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform local FFT on each rank\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather all the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (int i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the imaginary conjugate FFTs back to each rank\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Perform inverse FFT on each rank\n ifft(local_fft, x);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of the global FFT\n if (rank == 0) {\n for (size_t i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the imaginary conjugate of the global FFT to all ranks\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT\n ifft(local_fft, x);\n\n // Gather the local FFTs on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel 
for\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results on rank 0\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the results to the input vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n MPI_Gather(y.data(), n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> localX = x;\n fft(localX);\n\n // Conjugate the local FFT\n for (auto &v : localX) {\n v = std::conj(v);\n }\n\n // Gather the local FFTs on rank 0\n std::vector> globalX(x.size());\n MPI_Gather(localX.data(), localX.size(), MPI_DOUBLE_COMPLEX, globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, store the global FFT in x\n if (rank == 0) {\n x = globalX;\n }\n}\n", + "\n\n // Compute the FFT in parallel using MPI and OpenMP\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(&recv, x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x, x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(recv.data(), 
x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT on each rank\n std::vector> y(x.size());\n fft(x, y);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < y.size(); i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n x.resize(y.size());\n MPI_Gather(y.data(), y.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the conjugate of the global FFT\n if (rank == 0) {\n for (size_t i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the conjugate global FFT\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT\n ifft(local_fft, x);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary parts\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(Kokkos::View*> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x[i] = x[i] / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x[i] = std::conj(x[i]);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / static_cast(x.extent(0));\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n // compute fourier transform\n fft(x);\n\n // compute inverse fourier transform\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", 
x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use OpenMP to compute in parallel.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n fft(x);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugated input\n fft(x);\n\n // compute the conjugate of the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "-", + "\n\n // Compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated input\n fft(x);\n\n // Conjugate the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Divide the output by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n // conjugate the complex numbers\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute forward FFT\n fft(x);\n\n // conjugate the complex numbers again\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide by the size of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n fft(x);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n #pragma openmp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n fft(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n fft(x);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // divide by n to normalize\n const double n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] /= n;\n }\n}\n", + "\n\n // compute the forward FFT\n fft(x);\n\n // compute the inverse FFT\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute the forward FFT again to get the inverse FFT\n fft(x);\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n #pragma omp 
parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n // compute the conjugate of each element\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugated vector\n fft(x);\n\n // divide each element by the size of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // compute the conjugate of each element\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n const size_t n = x.size();\n std::vector> y(n);\n std::copy(x.begin(), x.end(), y.begin());\n fft(y);\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n y[i] = std::conj(y[i]) / n;\n }\n std::reverse(y.begin(), y.end());\n fft(y);\n std::copy(y.begin(), y.end(), x.begin());\n}\n", + "\n\n fft(x);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute forward fft\n fft(x);\n\n // compute inverse fft\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute forward fft again to get inverse fft\n fft(x);\n}\n", + "\n\n // compute the conjugate of x\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugate of x\n fft(x);\n\n // compute the conjugate of x again\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide each element of x by the size of x\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n fft(x);\n const double n = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // Compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the forward FFT\n fft(x);\n\n // Divide each element by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Compute the conjugate of the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "serial", + "prompt": "/* forward declare fft. 
computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n for (auto &xi : x) {\n xi /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v) / x.size();\n }\n std::reverse(x.begin(), x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi) / x.size();\n }\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v = std::conj(v) / x.size();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "hip", + "prompt": "/* Compute the inverse fourier transform of x in-place.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\n__global__ void ifft(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i >= N) return;\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipDoubleComplex(cos(2 * M_PI * i * j / N),\n -sin(2 * M_PI * i * j / N));\n sum += w * x[j];\n }\n x[i] = sum / N;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n hipDoubleComplex c = {cos(angle), sin(angle)};\n sum = hipCadd(sum, hipCmul(x[j], c));\n }\n x[i] = hipCdiv(sum, N);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (int j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0.0, 0.0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex c = {cos(angle), sin(angle)};\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": 
true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i >= N) return;\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex t = {cos(angle), sin(angle)};\n sum += x[j] * t;\n }\n x[i] = sum / N;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n hipDoubleComplex w = hipDoubleComplex(cos(2 * M_PI * i * j / N),\n sin(2 * M_PI * i * j / N));\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0.0, 0.0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0.0, -2.0 * M_PI * i * j / N));\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + 
hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex t = {cos(angle), sin(angle)};\n sum = hipCadd(sum, hipCmul(x[j], t));\n }\n x[i] = hipCdiv(sum, N);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex term = hipExp(hipCreateDoubleComplex(0, -2 * M_PI * i * j / N));\n sum = hipAdd(sum, hipMul(x[j], term));\n }\n x[i] = hipDiv(sum, hipCreateDoubleComplex(N, 0));\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0, -2 * M_PI * i * j / N));\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex c = hipDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n size_t j = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (i < N && j < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; 
++k) {\n double theta = 2 * M_PI * (i * k / N);\n sum.x += x[k].x * cos(theta) + x[k].y * sin(theta);\n sum.y += -x[k].x * sin(theta) + x[k].y * cos(theta);\n }\n sum.x /= N;\n sum.y /= N;\n x[i * N + j] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(recv_buffer.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n 
MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, y.data(), x.size(),\n MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, nullptr, 0,\n MPI_COMPLEX_DOUBLE, 
0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the FFT\n fft(x);\n\n // compute the inverse FFT\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n\n // normalize the result\n int n = x.size();\n for (auto &xi : x) {\n xi /= n;\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(n);\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, y.data(), n, MPI::DOUBLE, 0);\n x = y;\n } else {\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, y.data(), x.size(),\n MPI::DOUBLE, 0);\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= num_ranks;\n }\n\n // gather the results on rank 0\n if (my_rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n\n // normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> result(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, result.data(),\n x.size(), MPI_DOUBLE_COMPLEX, 0);\n x = result;\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&recv_buffer[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute the inverse fft in parallel\n fft(x);\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fourier transform\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fourier transform\n fft(x);\n\n // normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(n);\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, y.data(), n, MPI::DOUBLE, 0);\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": 
"05_fft_inverse_fft", + "parallelism_model": "cuda", + "prompt": "/* Compute the inverse fourier transform of x in-place.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\n__global__ void ifft(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n sum += x[j] * cuCexp(make_cuDoubleComplex(0, -2 * M_PI * i * j / N));\n }\n x[i] = sum / make_cuDoubleComplex(N, 0);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = cuCdiv(x[i], N);\n x[i] = cuCmulf(x[i], cuCexp(cuCmulf(cuCmul(cuCimagf(x[i]), -2 * M_PI), i / N)));\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= size) {\n return;\n }\n cuDoubleComplex result = cuCexp(cuCmul(cuDoubleComplex(0, -2 * M_PI * idx / size), x[idx]));\n x[idx] = result;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n cuDoubleComplex term = x[j] * cexp(-2 * M_PI * i * j / N);\n sum = cuCadd(sum, term);\n }\n x[i] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= N) return;\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double re = x[i].x;\n 
double im = x[i].y;\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n x[i].x += re * cos(angle) - im * sin(angle);\n x[i].y += re * sin(angle) + im * cos(angle);\n }\n x[i].x /= N;\n x[i].y /= N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = x[i] / N;\n x[i] = cuCmul(x[i], make_cuDoubleComplex(1.0, 0.0));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * term;\n }\n x[i] = sum / N;\n }\n}\n", + "_t_ = 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= N) return;\n cuDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = {cos(angle), sin(angle)};\n sum += c * x[j];\n }\n x[i] = sum / N;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform FFT on each rank\n fft(x);\n\n // Gather results from all ranks to rank 0\n std::vector> gathered_x;\n if (rank == 0) {\n gathered_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n // Compute the conjugate of x\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated x\n fft(x);\n\n // Normalize the result\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double factor = 1.0 / size;\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= factor;\n }\n\n // If rank 0, gather the results from all ranks\n if (rank == 0) {\n std::vector> recv_buffer(x.size() * size);\n MPI_Allgather(x.data(), x.size(), MPI_COMPLEX, recv_buffer.data(), x.size(), MPI_COMPLEX, MPI_COMM_WORLD);\n x = recv_buffer;\n }\n\n // Compute the inverse FFT of the normalized result\n fft(x);\n\n // Conjugate the result again to get the original values\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // compute the fourier transform\n fft(x);\n\n // compute the inverse fourier transform\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, y.data(), x.size(), MPI::DOUBLE, 0);\n x = y;\n } else {\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide the result among the ranks\n std::vector> recv(x.size() / size);\n MPI_Allgather(&x[rank * x.size() / size], x.size() / size, MPI_COMPLEX, &recv[0], x.size() / size, MPI_COMPLEX, MPI_COMM_WORLD);\n\n // only rank 0 has the complete result\n if (rank == 0) {\n x = recv;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide by the number of ranks to normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buffer[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the inverse fourier transform of x in-place.\n // Use MPI and OpenMP to compute in parallel.\n // Assume MPI has already been 
initialized.\n // Every rank has a complete copy of x.\n // The final result is stored on rank 0.\n\n // Get the number of ranks and the rank of the current process.\n int num_procs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local size of the vector.\n int local_size = x.size() / num_procs;\n\n // Compute the local inverse fourier transform.\n fft(x);\n\n // Reduce the local inverse fourier transforms to the root process.\n std::vector> reduced_x(x.size());\n MPI_Reduce(x.data(), reduced_x.data(), x.size(), MPI_COMPLEX, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // If the current process is the root process, store the reduced inverse fourier transform in the output vector.\n if (rank == 0) {\n x = reduced_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform\n fft(x);\n\n // divide by the number of ranks to normalize\n for (auto &xi : x) {\n xi /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // normalize the result\n std::transform(x.begin(), x.end(), x.begin(), [size](std::complex c) { return c / size; });\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the 
fft in-place\n fft(x);\n\n // compute the inverse fft in-place\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide the result by the number of ranks\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results from all ranks to rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += y[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n // Compute the conjugate of x\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated x\n fft(x);\n\n // Normalize the result\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double factor = 1.0 / size;\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= factor;\n }\n\n // If rank 0, gather the results from all ranks\n if (rank == 0) {\n std::vector> recv_buffer(x.size() * size);\n MPI_Allgather(x.data(), x.size(), MPI_COMPLEX, recv_buffer.data(), x.size(), MPI_COMPLEX, MPI_COMM_WORLD);\n x = recv_buffer;\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv(x.size() * size);\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, recv.data(), x.size(), MPI_DOUBLE_COMPLEX, 0);\n x = recv;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft of x\n fft(x);\n\n // compute the inverse fft of x\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) 
{\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT of the conjugated result\n fft(x);\n\n // Normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv(n);\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n MPI::COMM_WORLD.Recv(&recv[0], n, MPI::DOUBLE_COMPLEX, i, 0);\n for (int j = 0; j < n; ++j) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], n, MPI::DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the inverse Fourier transform in-place\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fourier transform\n fft(x);\n\n // compute the inverse fourier transform\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size() * size);\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = recv_buf;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(Kokkos::View &x, Kokkos::View*> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * std::exp(Kokkos::complex(0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * Kokkos::Math::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(std::cos(angle), std::sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const double PI = 3.14159265358979323846;\n\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n sum += x(j) * std::exp(-2 * PI * std::complex(0, 1) * i * j / N);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = -2 * Kokkos::Math::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(std::cos(angle), std::sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const double PI = 3.14159265358979323846;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * PI * j * i / N;\n sum += x[j] * Kokkos::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::Range(0, N), [=](const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * j * i / N;\n sum += x(j) * std::exp(std::complex(0.0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n const int n = x.size();\n Kokkos::parallel_for(\"DFT\", Kokkos::RangePolicy<>(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 
0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < N; j++) {\n sum += x(j) * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / N);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n const int N = x.extent(0);\n Kokkos::parallel_for(\"DFT\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const auto n = x.size();\n const auto pi = 3.14159265358979323846;\n Kok\u30b3\u30af::Range range(0, n);\n Kok\u30b3\u30af::ParallelFor(\"dft\", range, KOKKOK_LAMBDA(const int i) {\n Kok\u30b3\u30af::\u8907 sum(0.0, 0.0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2.0 * pi * i * j / n);\n }\n output[i] = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n const double pi = 3.14159265358979323846;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x(j) * Kokkos::complex(cos(-2 * pi * i * j / n), sin(-2 * pi * i * j / n));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * j * i / N;\n sum += x(j) * std::polar(1.0, angle);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "(121:1:1:1:1:1:1:", + "\n\n const int n = x.extent(0);\n const double two_pi = 2.0 * Kokkos::Pi;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < n; ++j) {\n sum += x(j) * std::exp(-two_pi * Kokkos::complex(0.0, 1.0) * i * j / n);\n }\n output(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. 
Store the result in output.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n output.resize(n);\n #pragma opm parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (size_t j = 0; j < n; ++j) {\n const double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp 
parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (size_t j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + ")", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "serial", + "prompt": "/* Compute the discrete fourier transform of x. 
Store the result in output.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "<", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "<", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::polar(1.0, -2 * M_PI * k * t / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, -angle);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::polar(1.0, -2 * M_PI * k * t / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int i = 0; i < n; ++i) {\n for (int j = 0; j < n; ++j) {\n output[i] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, angle);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n 
output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "hip", + "prompt": "/* Compute the discrete fourier transform of x. Store the result in output.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\n__global__ void dft(const double *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "(x,y)", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + 
"source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum_real = 0;\n double sum_imag = 0;\n\n for (size_t i = 0; i < N; i++) {\n double angle = 2 * M_PI * gid * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n\n output[gid] = make_cuDoubleComplex(sum_real, sum_imag);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCadd(0, 0);\n for (int k = 0; k < N; k++) {\n double angle = -2 * M_PI * i * k / N;\n output[i] = cuCadd(output[i], cuCmulf(x[k], cuCexp(cuCmulf(0, angle))));\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int id = blockIdx.x * blockDim.x + threadIdx.x;\n if (id < N) {\n double sum_real = 0, sum_imag = 0;\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * id * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n output[id] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double real = 0;\n double imag = 0;\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n real += x[i] * cos(angle);\n imag += x[i] * sin(angle);\n }\n output[tid] = make_cuDoubleComplex(real, imag);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleReal sum_real = 0;\n cuDoubleReal sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplexf(sum_real, sum_imag);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + 
"did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid >= N) {\n return;\n }\n\n double real = 0;\n double imag = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * gid / N;\n real += x[k] * cos(angle);\n imag += x[k] * sin(angle);\n }\n\n output[gid] = make_cuDoubleComplex(real, imag);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, 
+ "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum_real = 0;\n double sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double real = 0;\n double imag = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * gid / N;\n real += x[k] * cos(angle);\n imag += x[k] * sin(angle);\n }\n output[gid] = make_cuDoubleComplex(real, imag);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n double angle = 2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[i] * term;\n }\n output[tid] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI to compute in parallel. 
Assume MPI is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n local_output[i] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / x.size());\n }\n }\n\n // Gather the local DFTs\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes a part of the output\n int start = rank * n / size;\n int end = (rank + 1) * n / size;\n\n for (int i = start; i < end; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * i * j / n;\n sum += std::complex(x[j], 0) * std::exp(std::complex(0, angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT for each rank\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n\n // Gather the results from all ranks\n std::vector> all_output(n * size);\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, all_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the gathered results to output\n if (rank == 0) {\n output = all_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n 
output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local contribution of each process\n for (int i = rank; i < n; i += size) {\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n output[j] += x[i] * std::complex(std::cos(angle), std::sin(angle));\n }\n }\n\n // Reduce the local contributions to the global result\n MPI_Allreduce(MPI_IN_PLACE, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[k] = sum;\n 
}\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local DFT\n for (int i = rank; i < n; i += size) {\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n output[i] += x[j] * std::complex(cos(angle), sin(angle));\n }\n }\n\n // Reduce the local DFTs to the global DFT\n MPI_Allreduce(MPI_IN_PLACE, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "cuda", + "prompt": "/* Compute the discrete fourier transform of x. Store the result in output.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\n__global__ void dft(const double *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[i];\n sum += term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int i = 0; i < N; 
i++) {\n double angle = -2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[i];\n sum += term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int k = 0; k < N; ++k) {\n double angle = -2 * M_PI * i * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t kBlockSize = 256;\n const size_t kGridSize = (N + kBlockSize - 1) / kBlockSize;\n const size_t kSharedMemorySize = 2 * kBlockSize;\n\n dim3 blockDim(kBlockSize, 1, 1);\n dim3 gridDim(kGridSize, 1, 1);\n shared double sdata[2 * kBlockSize];\n\n for (size_t i = 0; i < N; ++i) {\n output[i] = make_cuDoubleComplex(0, 0);\n }\n\n for (size_t i = 0; i < N; ++i) {\n sdata[2 * threadIdx.x + 0] = x[i];\n sdata[2 * threadIdx.x + 1] = 0;\n }\n\n __syncthreads();\n\n for (size_t k = 0; k < N; ++k) {\n double e = -2 * M_PI * k / N;\n double c = cos(e);\n double s = sin(e);\n double t0 = 1;\n double t1 = 0;\n for (size_t i = 0; i < N / 2; ++i) {\n double a = sdata[2 * i + 0];\n double b = sdata[2 * i + 1];\n double ta = t0 * a - t1 * b;\n double tb = t0 * b + t1 * a;\n sdata[2 * i + 0] = ta;\n sdata[2 * i + 1] = tb;\n t0 = c;\n t1 = s;\n c = t0 * c - t1 * s;\n s = t0 * s + t1 * c;\n }\n __syncthreads();\n output[k] = make_cuDoubleComplex(sdata[0], sdata[1]);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = threadIdx.x;\n double sum_real = 0;\n double sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplex(sum_real, sum_imag);\n}\n", + "\n const int tid = threadIdx.x + blockIdx.x * blockDim.x;\n const int stride = blockDim.x * gridDim.x;\n for (size_t i = tid; i < N; i += stride) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n const double angle = 2 * M_PI * j * i / N;\n const cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(x[j], term));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * 
blockDim.x;\n if (gid < N) {\n double sum_real = 0;\n double sum_imag = 0;\n for (size_t i = 0; i < N; i++) {\n double angle = -2 * M_PI * gid * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n output[gid] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += w * x[j];\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(c, make_cuDoubleComplex(x[j], 0)));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI and OpenMP to compute in parallel. 
Assume MPI is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(local_n);\n std::vector> local_output(local_n);\n std::copy(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n, local_x.begin());\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n sum += local_x[j] * std::exp(-2 * M_PI * i * j / local_n);\n }\n local_output[i] = sum;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_REAL, output.data(), local_n, MPI_DOUBLE_REAL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT for each frequency\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = rank; i < n; i += nprocs) {\n for (int j = 0; j < n; j++) {\n std::complex sum(0, 0);\n for (int k = 0; k < n; k++) {\n sum += x[k] * std::exp(-2 * M_PI * std::complex(0, 1) * i * k / n);\n }\n local_output[i] = sum;\n }\n }\n\n // Gather the local results into the output vector\n MPI_Allgather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the DFT of each element of x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < 
n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial DFT for this rank\n std::vector> partial_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n partial_output[i] = sum;\n }\n\n // Gather the partial results into the output vector\n MPI_Gather(partial_output.data(), n, MPI_DOUBLE_COMPLEX,\n output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Allgather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the DFT of each element in x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n MPI_Allgather(MPI_IN_PLACE, n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double 
angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element of x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[0], n, MPI_DOUBLE_COMPLEX, &output[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n std::vector> global_output(n * size);\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is the root process, copy the results to the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX,\n output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store the result in output.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(Kokkos::View*> &x, Kokkos::View*> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n using namespace Kokkos::Impl;\n\n const int n = x.extent(0);\n const int n_half = n / 2;\n const int n_quarter = n / 4;\n const int n_eighth = n / 8;\n\n // Create a plan for the FFT\n auto plan = FFTPlan::create(n);\n\n // Execute the FFT\n plan->execute(x, output);\n\n // Reorder the output to be in the correct order\n parallel_for(n, KOKKOS_LAMBDA(int i) {\n if (i < n_half) {\n output(i) = output(i);\n } else {\n output(i) = output(n - i);\n }\n });\n\n // Compute the magnitude of the output\n parallel_for(n, KOKKOS_LAMBDA(int i) { output(i) = abs(output(i)); });\n\n // Normalize the output\n parallel_for(n, KOKKOS_LAMBDA(int i) { output(i) /= n; });\n}\n", + "\n\n int n = x.extent(0);\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n int m = 1 << log_n;\n Kokkos::View*> w(m);\n for (int i = 0; i < m; i++) {\n w(i) = Kokkos::complex(cos(2 * M_PI * i / m), sin(2 * M_PI * i / m));\n }\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int &i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < m; j++) {\n sum += x((i + j) % m) * w(j);\n }\n output(i) = sum;\n });\n}\n", + "\n using namespace Kokkos;\n using namespace std;\n const int N = x.extent(0);\n const int M = output.extent(0);\n const double pi = 3.14159265358979323846;\n parallel_for(\"Fourier Transform\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const int i) {\n output(i) = {0.0, 0.0};\n for (int j = 0; j < N; j++) {\n output(i) += x(j) * exp(complex(0, -2 * pi * i * j / N));\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n\n Kokkos::View*> temp(\"temp\", n);\n\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log_n; j++) {\n rev |= ((i >> j) & 1) << (log_n - j - 1);\n }\n temp(rev) = x(i);\n }\n\n for (int s = 1; s <= log_n; s++) {\n int m = 1 << s;\n int m2 = m >> 1;\n Kokkos::complex w(cos(2 * Kokkos::Pi / m), sin(2 * Kokkos::Pi / m));\n for (int k = 0; k < n; k += m) {\n Kokkos::complex wm(1, 0);\n for (int j = 0; j < m2; j++) {\n Kokkos::complex t = temp(k + j);\n Kokkos::complex u = temp(k + j + m2) * wm;\n temp(k + j) = t + u;\n temp(k + j + m2) = t - u;\n wm *= w;\n }\n }\n }\n\n output = temp;\n}\n", + "\n\n const int N = x.extent(0);\n const int logN = std::log2(N);\n const int halfN = N / 2;\n\n // Reverse bits\n Kokkos::View reversed = Kokkos::View(\"reversed\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) {\n int rev = 0;\n int bit = 1;\n for (int j = 0; j < logN; j++) {\n if (i & bit) {\n rev |= 1 << (logN - j - 1);\n }\n bit <<= 1;\n }\n reversed(i) = rev;\n });\n\n // Compute the FFT\n Kokkos::View*> y = Kokkos::View*>(\"y\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { y(i) = x(reversed(i)); });\n\n for (int s = 1; s <= logN; s++) {\n const int m = 1 << s;\n const int m2 = m / 2;\n const double theta = 2 * M_PI / m;\n Kokkos::complex w(std::cos(theta), std::sin(theta));\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) {\n if (i % m < m2) {\n const Kokkos::complex t = w * y(i + m2);\n y(i + m2) = y(i) - t;\n y(i) = y(i) + t;\n }\n });\n 
}\n\n // Copy the result to the output\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { output(i) = y(i); });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const int logN = std::log2(N);\n\n // Reverse bits\n Kokkos::View reversed(N);\n Kokkos::parallel_for(\"fft_reverse_bits\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int reversed_i = 0;\n int j = i;\n for (int k = 0; k < logN; ++k) {\n reversed_i |= ((j & 1) << (logN - k - 1));\n j >>= 1;\n }\n reversed(i) = reversed_i;\n });\n\n // Compute the FFT\n Kokkos::parallel_for(\"fft_compute\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int j = reversed(i);\n output(i) = x(j);\n });\n\n // Compute the twiddle factors\n Kokkos::View*> twiddle(N);\n Kokkos::parallel_for(\"fft_twiddle\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n double angle = -2 * M_PI * i / N;\n twiddle(i) = Kokkos::complex(std::cos(angle), std::sin(angle));\n });\n\n // Compute the FFT\n for (int s = 1; s <= logN; ++s) {\n const int m = 1 << s;\n const int m2 = m / 2;\n for (int k = 0; k < m2; ++k) {\n Kokkos::parallel_for(\"fft_compute_s\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int j = i & (m - 1);\n if (j < m2) {\n const Kokkos::complex t = twiddle(k * N / m) * output(i + m2);\n output(i + m2) = output(i) - t;\n output(i) += t;\n }\n });\n }\n }\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int N = x.extent(0);\n const int logN = Kokkos::ilog2(N);\n\n auto bit_reverse = [&](int i) {\n int r = 0;\n for (int j = 0; j < logN; ++j) {\n r |= ((i >> j) & 1) << (logN - 1 - j);\n }\n return r;\n };\n\n auto twiddle = [&](int k) {\n return Kokkos::complex(cos(2 * M_PI * k / N), sin(2 * M_PI * k / N));\n };\n\n auto fft_rec = [&](View*> x, int n) {\n if (n == 1) {\n return;\n }\n auto x_even = View*>(x, Range(0, n / 2));\n auto x_odd = View*>(x, Range(n / 2, n));\n fft_rec(x_even, n / 2);\n fft_rec(x_odd, n / 2);\n for (int k = 0; k < n / 2; ++k) {\n auto t = x_odd(k) * twiddle(k);\n x_odd(k) = x_even(k) - t;\n x_even(k) += t;\n }\n };\n\n auto x_bit_reversed = View*>(x.extent(0));\n for (int i = 0; i < N; ++i) {\n x_bit_reversed(bit_reverse(i)) = x(i);\n }\n fft_rec(x_bit_reversed, N);\n output = x_bit_reversed;\n}\n", + "\n\n int n = x.extent(0);\n output.resize(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n if (n == 1) {\n output(0) = x(0);\n return;\n }\n Kokkos::View*> even = Kokkos::subview(output, Kokkos::ALL(), Kokkos::Slice(0, n / 2));\n Kokkos::View*> odd = Kokkos::subview(output, Kokkos::ALL(), Kokkos::Slice(n / 2, n));\n Kokkos::parallel_for(n / 2, KOKKOS_LAMBDA(int i) {\n even(i) = x(2 * i);\n odd(i) = x(2 * i + 1);\n });\n fft(even, even);\n fft(odd, odd);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex w = Kokkos::complex(cos(2 * M_PI * i / n), sin(2 * M_PI * i / n));\n output(i) = even(i % (n / 2)) + w * odd(i % (n / 2));\n });\n}\n", + "\n\n // Compute the size of the input\n const int n = x.extent(0);\n\n // Create a scratchpad for the 
intermediate results\n Kokkos::View*> scratchpad(\"scratchpad\", n);\n\n // Compute the FFT in parallel\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n // Compute the FFT of the i-th element of x\n Kokkos::complex result = std::complex(0.0, 0.0);\n for (int j = 0; j < n; ++j) {\n result += x(j) * std::exp(-2.0 * M_PI * std::complex(0.0, 1.0) * i * j / n);\n }\n\n // Store the result in the scratchpad\n scratchpad(i) = result;\n });\n\n // Copy the results from the scratchpad to the output\n Kokkos::deep_copy(output, scratchpad);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n output(i) = Kokkos::complex(real, imag);\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n output[i] = Kokkos::complex(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n output[i] += x[j] * Kokkos::exp(Kokkos::complex(0.0, -2.0 * Kokkos::Pi * i * j / n));\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n output(0) = x(0);\n for (int k = 1; k < n; ++k) {\n Kokkos::complex w = std::polar(1.0, -2 * Kokkos::Math::pi * k / n);\n Kokkos::complex t = x(0);\n for (int j = 1; j < n; ++j) {\n t += w * x(j);\n w *= w;\n output(j) = t;\n }\n }\n}\n", + "\n\n int n = x.extent(0);\n output.resize(n);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * Kokkos::Impl::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * std::exp(Kokkos::complex(0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n // Get the size of the input\n int n = x.extent(0);\n\n // Create a Kokkos view for the output\n Kokkos::View*> output_view(\"output\", n);\n\n // Compute the FFT in parallel\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) {\n output_view(i) = fft_helper(x, i, n);\n });\n\n // Copy the output view to the output array\n Kokkos::deep_copy(output, output_view);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x(j) * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy<>(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n output.resize(n);\n\n Kokkos::parallel_for(\"fft\", n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x. 
Store the result in output.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n output.resize(n);\n #pragma openmp parallel for\n for (int i = 0; i < n; ++i) {\n double real = 0, imag = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n real += x[j].first * cos(angle) + x[j].second * sin(angle);\n imag += x[j].first * sin(angle) - x[j].second * cos(angle);\n }\n output[i] = {real, imag};\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex arg(2 * M_PI * i * j / n, 0);\n sum += x[j] * std::exp(-arg);\n }\n output[i] = sum;\n }\n}\n", + "<", + "\n\n const int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::polar(1.0, angle);\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n const double pi = std::acos(-1);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = {0, 0};\n for (size_t t = 0; t < n; ++t) {\n double angle = 2 * pi * t * k / n;\n sum += x[t] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex omega = std::polar(1.0, -2 * M_PI * i * j / n);\n sum += x[j] * omega;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n std::complex arg(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * arg;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of x\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 
0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex omega = std::polar(1.0, -2 * M_PI * i * j / n);\n sum += x[j] * omega;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x. 
Store the result in output.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double pi = std::acos(-1);\n std::complex w(1), wn(std::cos(2 * pi / n), std::sin(2 * pi / n));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(cos(angle), sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n 
int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2);\n std::vector> odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n 
double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int i = 0; i < n; ++i) {\n for (int j = 0; j < n; ++j) {\n output[i] += x[j] * std::exp(-2 * std::complex(0, 1) * M_PI * i * j / n);\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x. Store the result in output.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fft(const hipDoubleComplex *x, hipDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid >= N) {\n return;\n }\n double arg = -2 * M_PI * gid / N;\n hipDoubleComplex w = hipDoubleComplex(cos(arg), sin(arg));\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n sum += x[i] * pow(w, i * gid);\n }\n output[gid] = sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(omega), sin(omega)};\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(theta), sin(theta));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex wp = hipDoubleComplex(1, 0);\n 
for (size_t k = j; k < j + m; k++) {\n hipDoubleComplex t = wp * output[k + m];\n output[k + m] = output[k] - t;\n output[k] += t;\n wp *= w;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = make_hipDoubleComplex(cos(theta), sin(theta));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex t = make_hipDoubleComplex(1.0, 0.0);\n for (size_t k = 0; k < m; k++) {\n hipDoubleComplex u = output[j + k];\n hipDoubleComplex v = output[j + k + m] * t;\n output[j + k] = u + v;\n output[j + k + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / s), sin(M_PI / s));\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex(1.0, 0.0);\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / m), sin(M_PI / m));\n hipDoubleComplex wm = hipDoubleComplex(1.0, 0.0);\n for (size_t j = 0; j < m; j++) {\n for (size_t k = j; k < N; k += s) {\n hipDoubleComplex t = wm * output[k + m];\n output[k + m] = output[k] - t;\n output[k] += t;\n }\n wm *= w;\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": 
null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n hipDoubleComplex wm = hipDoubleComplex(1, 0);\n for (size_t k = 0; k < m; k++) {\n for (size_t j = 0; j < N / s; j++) {\n size_t index = j * s + k;\n hipDoubleComplex t = wm * output[index + m];\n output[index + m] = output[index] - t;\n output[index] += t;\n }\n wm *= w;\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t half_s = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / half_s), sin(M_PI / half_s));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t k = 0; k < half_s; k++) {\n hipDoubleComplex u = output[j + k];\n hipDoubleComplex v = output[j + k + half_s] * t;\n output[j + k] = u + v;\n output[j + k + half_s] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const size_t tid = hipGetGlobalThreadIdx();\n const size_t N_2 = N / 2;\n const size_t N_4 = N / 4;\n const double arg = -2 * M_PI / N;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n hipDoubleComplex w = {cos(arg * k), sin(arg * k)};\n hipDoubleComplex term = hipCMul(x[k], w);\n sum = hipCAdd(sum, term);\n }\n\n const size_t index = tid;\n if (tid >= N_2) {\n index -= N_2;\n index = N_4 - index - 1;\n index += N_2;\n }\n\n output[index] = sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n double omega = M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(omega), sin(omega)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + s] * t;\n output[k + j] = u + v;\n output[k + j + s] = u - v;\n t = t * w;\n 
}\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(theta), sin(theta)};\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 1; s <= N; s *= 2) {\n for (size_t k = 0; k < N; k += 2 * s) {\n for (size_t j = k; j < k + s; j++) {\n hipDoubleComplex twiddle = hipDoubleComplex{cos(M_PI * j / s),\n -sin(M_PI * j / s)};\n hipDoubleComplex temp = output[j + s] * twiddle;\n output[j + s] = output[j] - temp;\n output[j] += temp;\n }\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = 
x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / m), sin(M_PI / m));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store the result in output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFT results\n std::vector> global_output(n);\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, global_output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the result in output\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_output(n_local);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), n_local, MPI_DOUBLE_COMPLEX, output.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(n / size);\n fft_local(x, local_fft);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(n);\n MPI_Gather(local_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, global_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter the global FFT to the local FFTs\n MPI_Scatter(global_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, local_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Convert the local FFTs to the output format\n output.resize(n);\n for (int i = 0; i < n; i++) {\n output[i] = local_fft[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = (rank + 1) * local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i / x); });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the output vector\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> 
local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFT results\n std::vector> global_output(n);\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, global_output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the global FFT result to the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(),\n [](std::complex const& x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) {\n return std::exp(std::complex(0, -2 * M_PI * x.real()));\n });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft_local(x, rank, size);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(x.size());\n MPI_Allgather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the global FFT in the output vector\n output = global_fft;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) {\n return std::exp(std::complex(0, -2 * M_PI * x.real()));\n });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform of x.\n std::vector> local_fft = fft(x, rank, size);\n\n // Gather the local Fourier transforms into output.\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX,\n output.data(), output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier 
transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int n_padded = n_local + 2;\n std::vector> padded_x(n_padded, 0.0);\n std::copy(x.begin() + rank * n_local, x.begin() + (rank + 1) * n_local, padded_x.begin() + 1);\n\n std::vector> padded_output(n_padded, 0.0);\n fft(padded_x, padded_output);\n\n std::vector> output_local(n_local, 0.0);\n std::copy(padded_output.begin() + 1, padded_output.begin() + n_local + 1, output_local.begin());\n\n MPI_Allgather(output_local.data(), n_local, MPI_DOUBLE_PRECISION, output.data(), n_local, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft(local_x, local_output);\n\n // Gather the results\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If we are not rank 0, we can return now\n if (rank != 0) {\n return;\n }\n\n // If we are rank 0, we need to combine the results\n for (int i = 1; i < size; i++) {\n int start = i * local_n;\n int end = start + local_n;\n for (int j = start; j < end; j++) {\n output[j] += output[j - start];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local FFT\n std::vector> local_fft(n / size);\n MPI_Scatter(x.data(), n / size, MPI_DOUBLE_COMPLEX, local_fft.data(), n / size, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n fft_local(local_fft);\n\n // Gather the local FFTs\n MPI_Gather(local_fft.data(), n / size, MPI_DOUBLE_COMPLEX, output.data(), n / size, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(output);\n }\n\n // Broadcast the global FFT\n MPI_Bcast(output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> 
local_fft = fft_local(x, rank, size);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(x.size());\n MPI_Allgather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the global FFT in the output vector\n output = global_fft;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x. Store the result in output.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fft(const cuDoubleComplex *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmul(2 * M_PI * i / N, cuCmulf(I, x[i])), -1));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(-2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmulf(2 * M_PI * i / N, cuCmulf(0, 1)), x[i]));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuDoubleComplex(0, -2 * M_PI * i / N), x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuDoubleComplex{0, -2 * M_PI * i / N}, x[i]));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmulf(2 * M_PI * i / N, cuDoubleComplex(0, 1)), x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", 
+ "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(-2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex y = cuCexp(cuCmul(cuCbrt(-1.0), 2.0 * M_PI * i / N));\n output[i] = cuCmulf(x[i], y);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCmul(x[i], cuCexp(cuCmulf(-2 * M_PI * i / N, cuDoubleComplex{0, 1})));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + ":", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local FFT results\n std::vector> global_output(x.size());\n MPI_Allgather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n global_output.data(), global_output.size(), MPI_DOUBLE_COMPLEX,\n MPI_COMM_WORLD);\n\n // Store the result in output\n output = global_output;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_C_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the global DFT into the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size() / size);\n fft_local(x, 
local_output);\n\n // Gather the local DFTs into the global output.\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT of the gathered local DFTs.\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local results\n std::vector> global_output(x.size());\n MPI_Allgather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, global_output.data(), global_output.size(), MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the result in output\n output = global_output;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(n * size);\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is the root process, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n // Get the size of the input vector.\n int n = x.size();\n\n // Create a MPI communicator for the FFT.\n MPI_Comm fft_comm;\n MPI_Comm_split(MPI_COMM_WORLD, 0, MPI_THREAD_SINGLE, &fft_comm);\n\n // Create an OpenMP parallel region for the FFT.\n#pragma openmp parallel\n {\n // Get the rank of the current thread.\n int tid = openmp_get_thread_num();\n\n // Compute the local FFT of the input vector.\n std::vector> local_fft(n);\n fft(x, local_fft);\n\n // Gather the local FFTs into the output vector.\n#pragma openmp critical\n {\n for (int i = 0; i < n; ++i) {\n output[i] += local_fft[i];\n }\n }\n }\n\n // Free the MPI communicator.\n MPI_Comm_free(&fft_comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_output(n_local);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < n_local; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n_local; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex 
w(cos(angle), sin(angle));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), n_local, MPI_DOUBLE_COMPLEX, output.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = (rank + 1) * local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n std::complex w(cos(2 * M_PI * i * j / local_n), sin(2 * M_PI * i * j / local_n));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local FFT results\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output.\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output\n MPI_Allgather(&local_output[0], n, MPI_DOUBLE_COMPLEX, &output[0], n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n double angle = 2 * M_PI * i * j / local_n;\n std::complex w(cos(angle), sin(angle));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX,\n output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft;\n fft_local(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX,\n global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, 0,\n 
MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter the global FFT\n MPI_ Scatter(global_fft.data(), global_fft.size() / size, MPI_DOUBLE_COMPLEX,\n local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0,\n MPI_COMM_WORLD);\n\n // Convert the local FFT to the output format\n output.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n output[i] = {local_fft[i].real(), local_fft[i].imag()};\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n local_output[i] = std::complex(0, 0);\n for (int j = 0; j < local_n; j++) {\n double angle = -2 * M_PI * i * j / local_n;\n local_output[i] += local_x[j] * std::exp(std::complex(0, angle));\n }\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(n * size);\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is rank 0, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, store the global DFT in output\n if (rank == 0) {\n output = global_output;\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, Kokkos::View &x, Kokkos::View &y, Kokkos::View &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", Kokkos::RangePolicy<>(0, z.extent(0)), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.size(), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int &i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z[i] = alpha * x[i] + y[i];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int &i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"axpy\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. 
Store the result in z.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma simd\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "serial", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. 
Store the result in z.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (std::size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (std::size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "hip", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\n__global__ void axpy(double alpha, const double *x, const double *y, double *z, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0186513914, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00042150159478 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0186238458, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00078825380802 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0185956904, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00079529359341 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0187250156, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00078124560118 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0192914314, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00081430159807 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0187418254, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00079247720242 + } + ] + }, + { + "generated_output": "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0188908532, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 
0.00079183739424 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.018986536, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00044921380877 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0192191264, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00081702181101 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01897659, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00081926279068 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0188417838, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00079170939922 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0190555946, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00081350160837 + } + ] + }, + { + "generated_output": "(x,y,z,t)", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0190940882, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008167656064 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0191729446, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00082412559986 + } + ] + }, + { + 
"generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0192871832, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00081516560316 + } + ] + }, + { + "generated_output": "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0186162608, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00079394940138 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0189100406, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00042028580904 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0189774056, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00078281339407 + } + ] + }, + { + "generated_output": "\n\n int i = hipThreadIdx_x;\n z[i] = alpha * x[i] + y[i];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0191271418, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004388137877 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of z\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n z.resize(x.size());\n for (size_t i = 0; i < x.size(); i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Barrier(MPI_COMM_WORLD);\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n\n // Compute the local portion of z\n std::vector local_z(n);\n for (int i = 0; i < n; ++i) {\n local_z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the local portions of z into the global vector z on 
rank 0\n if (rank == 0) {\n z.resize(n);\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its part of the result\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of the vectors\n int local_size = x.size() / size;\n\n // Allocate memory for the local portion of the vectors\n std::vector local_x(local_size);\n std::vector local_y(local_size);\n std::vector local_z(local_size);\n\n // Scatter the vectors to the ranks\n MPI_Scatter(&x[0], local_size, MPI_DOUBLE, &local_x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(&y[0], local_size, MPI_DOUBLE, &local_y[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local portion of the result\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * local_x[i] + local_y[i];\n }\n\n // Gather the results from the ranks\n MPI_Gather(&local_z[0], local_size, MPI_DOUBLE, &z[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n int n = x.size();\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n \n if (rank != 0) {\n MPI_Send(&z[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector tmp(n);\n MPI_Recv(&tmp[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n z[j] += tmp[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of z\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n 
MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "cuda", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\n__global__ void axpy(double alpha, const double *x, const double *y, double *z, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "{,", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, n_threads;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n #pragma omp parallel\n {\n n_threads = omp_get_num_threads();\n }\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "(", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector local_z(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n local_z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n z.resize(n);\n\n#pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the vector\n int local_size = x.size() / size;\n\n // Compute the local start index\n int local_start = rank * local_size;\n\n // Compute the local end index\n int local_end = (rank + 1) * local_size;\n\n // Compute the local part of the result\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_z[i] = alpha * x[local_start + i] + y[local_start + i];\n }\n\n // Gather the local results into the global result\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank == 0) {\n z.resize(x.size());\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the local portion of x and y\n int local_size = x.size() / nprocs;\n\n // Compute the starting index of the local portion of x and y\n int start_index = rank * local_size;\n\n // Compute the local portion of z\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_z[i] = alpha * x[start_index + i] + y[start_index + i];\n }\n\n // Gather the local portions of z into the global vector z on rank 0\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, 
size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of the vector\n int local_size = x.size() / size;\n\n // Compute the starting index of the local portion of the vector\n int start_index = rank * local_size;\n\n // Compute the local portion of the vector\n std::vector local_x(x.begin() + start_index, x.begin() + start_index + local_size);\n std::vector local_y(y.begin() + start_index, y.begin() + start_index + local_size);\n\n // Compute the local portion of the result\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * local_x[i] + local_y[i];\n }\n\n // Gather the local portions of the result into the global result\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of x and y\n int local_size = x.size() / size;\n\n // Compute the starting index of the local portion of x and y\n int start_index = rank * local_size;\n\n // Compute the local portion of z\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * x[start_index + i] + y[start_index + i];\n }\n\n // Gather the local portions of z into the global vector z on rank 0\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of each block\n int block_size = x.size() / size;\n\n // Compute the starting index of each block\n int 
start_index = rank * block_size;\n\n // Compute the ending index of each block\n int end_index = start_index + block_size;\n\n // If the rank is the last rank, then the ending index is the size of the vector\n if (rank == size - 1) {\n end_index = x.size();\n }\n\n // Compute the local result\n std::vector local_z(block_size);\n for (int i = start_index; i < end_index; i++) {\n local_z[i - start_index] = alpha * x[i] + y[i];\n }\n\n // Reduce the local results to the global result\n MPI_Allreduce(local_z.data(), z.data(), block_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n z.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of each chunk of data that each rank will process\n int chunk_size = x.size() / size;\n\n // Compute the starting index of the data that each rank will process\n int start_index = rank * chunk_size;\n\n // Compute the ending index of the data that each rank will process\n int end_index = (rank + 1) * chunk_size;\n\n // If the rank is the last rank, then the ending index is the size of the data\n if (rank == size - 1) {\n end_index = x.size();\n }\n\n // Compute the local result for this rank\n std::vector local_result(chunk_size);\n #pragma omp parallel for\n for (int i = start_index; i < end_index; i++) {\n local_result[i - start_index] = alpha * x[i] + y[i];\n }\n\n // Gather the local results from all ranks into the global result on rank 0\n MPI_Gather(local_result.data(), chunk_size, MPI_DOUBLE, z.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix. x and b have N elements.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(Kokkos::View &A, Kokkos::View &b, Kokkos::View &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::Range(0, N), [=](const int i) {\n double sum = 0;\n for (int j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n KokConfigService config;\n config.set_num_threads(KokConfigService::num_hardware_threads());\n KokConfigService::set_configuration(config);\n KokConfigService::print_configuration();\n\n KokConfigService::set_configuration(config);\n KokConfigService::print_configuration();\n\n KokKok::parallel_for(N, KOK_LAMBDA(size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A[i][j] * x[j]; }\n x[i] = (b[i] - sum) / A[i][i];\n });\n}\n", + "\n\n // Create a Kokkos view for the matrix A.\n Kokkos::View A_view = Kokkos::View(\"A\", N, N);\n Kokkos::deep_copy(A_view, A);\n\n // Create a Kokkos view for the vector b.\n Kokkos::View b_view = Kokkos::View(\"b\", N);\n Kokkos::deep_copy(b_view, b);\n\n // Create a Kokkos view for the vector x.\n Kokkos::View x_view = Kokkos::View(\"x\", N);\n\n // Solve the linear system using the Kokkos parallel backend.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A_view(i, j) * x_view(j);\n }\n x_view(i) = (b_view(i) - sum) / A_view(i, i);\n });\n\n // Copy the solution vector x back to the output view.\n Kokkos::deep_copy(x, x_view);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution.\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define a Kokkos kernel functor to perform the computation.\n Kokkos::KernelFunctor kernelFunctor([&](const int i) {\n // Initialize the solution vector x.\n x(i) = 0.0;\n\n // Perform forward substitution to solve the linear system.\n for (int j = 0; j < i; ++j) {\n x(i) -= A(i, j) * x(j);\n }\n\n // Perform backward substitution to solve the linear system.\n for (int j = i + 1; j < N; ++j) {\n x(i) -= A(i, j) * x(j);\n }\n\n // Divide by the diagonal element to get the final solution.\n x(i) /= A(i, i);\n });\n\n // Execute the kernel functor using the Kokkos team policy.\n Kokkos::parallel_for(policy, kernelFunctor);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution.\n Kokkos::TeamPolicy policy(N);\n\n // Create a Kokkos parallel loop to solve the linear system.\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Solve the ith equation of the linear system.\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy 
policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int& i) {\n // Initialize the solution vector x\n x[i] = 0.0;\n\n // Compute the dot product of the ith row of A and x\n double dot_product = 0.0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A(i, j) * x[j];\n }\n\n // Update the solution vector x\n x[i] = (b[i] - dot_product) / A(i, i);\n });\n}\n", + "\n\n Kokkos::View A_copy = Kokkos::View(\"A_copy\", N, N);\n Kokkos::deep_copy(A_copy, A);\n Kokkos::View b_copy = Kokkos::View(\"b_copy\", N);\n Kokkos::deep_copy(b_copy, b);\n Kokkos::View x_copy = Kokkos::View(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n\n for (size_t i = 0; i < N; i++) {\n double max_val = std::abs(A_copy(i, i));\n size_t max_idx = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A_copy(j, i)) > max_val) {\n max_val = std::abs(A_copy(j, i));\n max_idx = j;\n }\n }\n if (max_val == 0) {\n std::cout << \"The matrix is singular and cannot be inverted.\" << std::endl;\n return;\n }\n for (size_t j = 0; j < N; j++) {\n std::swap(A_copy(i, j), A_copy(max_idx, j));\n }\n std::swap(b_copy(i), b_copy(max_idx));\n for (size_t j = 0; j < N; j++) {\n if (i != j) {\n double factor = A_copy(j, i) / A_copy(i, i);\n for (size_t k = 0; k < N; k++) {\n A_copy(j, k) -= factor * A_copy(i, k);\n }\n b_copy(j) -= factor * b_copy(i);\n }\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A_copy(i, j) * x_copy(j);\n }\n x_copy(i) = (b_copy(i) - sum) / A_copy(i, i);\n }\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int& i) {\n // Initialize the solution vector x\n x[i] = 0.0;\n\n // Iterate over the rows of the matrix A\n for (size_t j = 0; j < N; ++j) {\n // Compute the dot product of the ith row of A and the solution vector x\n double dot_product = 0.0;\n for (size_t k = 0; k < N; ++k) {\n dot_product += A(i, k) * x[k];\n }\n\n // Update the ith element of the solution vector x\n x[i] = (b[i] - dot_product) / A(i, i);\n }\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to be executed in parallel\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Initialize the solution vector x\n x(i) = 0.0;\n\n // Compute the dot product of the ith row of A and x\n for (int j = 0; j < N; ++j) {\n x(i) += A(i, j) * x(j);\n }\n\n // Subtract the dot product from b(i) to get the solution\n x(i) = b(i) - x(i);\n });\n}\n", + "\n\n // Create a copy of A for in-place LU decomposition\n auto A_copy = Kokkos::subview(A, Kokkos::ALL(), Kokkos::ALL(), Kokkos::NO_OFFSET, Kokkos::NO_OFFSET);\n\n // Perform LU decomposition\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n for (size_t j = 0; j < N; ++j) {\n A_copy(i, j) /= A_copy(i, i);\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = 0; j < N; ++j) {\n if (k > i) {\n A_copy(k, j) -= A_copy(k, i) * A_copy(i, j);\n } else if (k < i) {\n A_copy(k, j) -= A_copy(k, i) * A_copy(i, j);\n }\n }\n }\n });\n\n // Solve the system\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n x(i) = b(i) / A_copy(i, i);\n for (size_t j = 0; 
j < N; ++j) {\n if (j > i) {\n x(j) -= x(i) * A_copy(j, i);\n }\n }\n });\n\n // Backward substitution\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n for (size_t j = 0; j < N; ++j) {\n if (j < i) {\n x(j) -= x(i) * A_copy(i, j);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Initialize the solution vector x\n x(i) = 0.0;\n\n // Compute the dot product of the ith row of A and x\n double dot_product = 0.0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A(i, j) * x(j);\n }\n\n // Update the solution vector x\n x(i) = (b(i) - dot_product) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "_t", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. 
x and b have N elements.\n Use OpenMP to compute in parallel.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n #pragma omp parallel for\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Forward elimination\n for (size_t j = i + 1; j < N; j++) {\n double factor = A_copy[j][i] / A_copy[i][i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j][k] -= factor * A_copy[i][k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Back substitution\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A_copy[i][j] * x[j];\n }\n x[i] /= A_copy[i][i];\n }\n}\n", + "\n\n // Solve the system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n x[N - 1] = b[N - 1] / A[(N - 1) * N + (N - 1)];\n for (int i = N - 2; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Solve the system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double inv_aii = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] *= inv_aii;\n }\n b[i] *= inv_aii;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n x[N - 1] = b[N - 1];\n for (int i = N - 2; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = b[i] - sum;\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Perform Gaussian elimination in parallel\n 
#pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A_local[i * N + i];\n for (size_t j = i; j < N; ++j) {\n A_local[i * N + j] *= factor;\n }\n b_local[i] *= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A_local[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A_local[j * N + k] -= multiplier * A_local[i * N + k];\n }\n b_local[j] -= multiplier * b_local[i];\n }\n }\n\n // Perform back substitution in parallel\n #pragma omp parallel for\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_local[i * N + j] * x[j];\n }\n x[i] = (b_local[i] - sum) / A_local[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = i; j < N; j++) {\n A_copy[i * N + j] *= factor;\n }\n b_copy[i] *= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A_copy[j * N + i];\n for (size_t k = i; k < N; k++) {\n A_copy[j * N + k] -= multiplier * A_copy[i * N + k];\n }\n b_copy[j] -= multiplier * b_copy[i];\n }\n }\n\n // Perform back substitution to solve for x\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (int j = i + 1; j < N; j++) {\n x[i] -= A_copy[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an NxN matrix in row-major. x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a copy of A and b for each thread.\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Solve the linear system for each thread.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Forward elimination.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_local[j][i] / A_local[i][i];\n for (size_t k = 0; k < N; ++k) {\n A_local[j][k] -= factor * A_local[i][k];\n }\n b_local[j] -= factor * b_local[i];\n }\n\n // Back substitution.\n x[i] = b_local[i] / A_local[i][i];\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A_local[i * N + j] * x[j];\n }\n x[i] = (b_local[i] - sum) / A_local[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b 
for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A_copy[i][i];\n for (size_t j = i; j < N; ++j) {\n A_copy[i][j] /= factor;\n }\n b_copy[i] /= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A_copy[j][i];\n for (size_t k = i; k < N; ++k) {\n A_copy[j][k] -= multiplier * A_copy[i][k];\n }\n b_copy[j] -= multiplier * b_copy[i];\n }\n }\n\n // Perform back substitution in parallel\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_copy[i][j] * x[j];\n }\n x[i] = (b_copy[i] - sum) / A_copy[i][i];\n }\n}\n", + "\n\n // Create a copy of A and b to avoid modifying the input\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A_copy[i * N + j] *= factor;\n }\n b_copy[i] *= factor;\n\n // Subtract the ith row from all other rows\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_copy[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A_copy[j * N + k] -= factor * A_copy[i * N + k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Perform back substitution\n x.resize(N);\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_copy[i * N + j] * x[j];\n }\n x[i] = (b_copy[i] - sum) / A_copy[i * N + i];\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an N*N matrix in row-major. 
x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a vector to store the solution.\n x.resize(N);\n\n // Create a vector to store the residual.\n std::vector r(N);\n\n // Initialize the residual.\n for (size_t i = 0; i < N; i++) {\n r[i] = b[i];\n }\n\n // Solve the linear system using the conjugate gradient method.\n for (size_t k = 0; k < N; k++) {\n // Compute the dot product of the residual and the residual.\n double r_dot_r = 0.0;\n #pragma omp parallel for reduction(+:r_dot_r)\n for (size_t i = 0; i < N; i++) {\n r_dot_r += r[i] * r[i];\n }\n\n // Check if the residual is small enough.\n if (r_dot_r < 1e-10) {\n break;\n }\n\n // Compute the dot product of the residual and the matrix-vector product.\n double r_dot_Ax = 0.0;\n #pragma omp parallel for reduction(+:r_dot_Ax)\n for (size_t i = 0; i < N; i++) {\n double tmp = 0.0;\n for (size_t j = 0; j < N; j++) {\n tmp += A[i * N + j] * x[j];\n }\n r_dot_Ax += r[i] * tmp;\n }\n\n // Compute the step size.\n double alpha = r_dot_r / r_dot_Ax;\n\n // Update the solution.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] += alpha * r[i];\n }\n\n // Update the residual.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n double tmp = 0.0;\n for (size_t j = 0; j < N; j++) {\n tmp += A[i * N + j] * x[j];\n }\n r[i] -= alpha * tmp;\n }\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Forward elimination\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_copy[j][i] / A_copy[i][i];\n for (size_t k = 0; k < N; ++k) {\n A_copy[j][k] -= factor * A_copy[i][k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Back substitution\n x.resize(N);\n for (int i = N - 1; i >= 0; --i) {\n x[i] = b_copy[i];\n for (int j = i + 1; j < N; ++j) {\n x[i] -= A_copy[i][j] * x[j];\n }\n x[i] /= A_copy[i][i];\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n #pragma omp parallel for\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel\n {\n // Each thread works on a different row of A\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A_local[i][i];\n for (size_t j = i; j < N; ++j) {\n A_local[i][j] /= factor;\n }\n b_local[i] /= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_local[j][i];\n for (size_t k = i; k < N; ++k) {\n A_local[j][k] -= factor * A_local[i][k];\n }\n b_local[j] -= factor * b_local[i];\n }\n }\n }\n\n // Back substitution to solve for x\n for (int 
i = N - 1; i >= 0; --i) {\n x[i] = b_local[i];\n for (int j = i + 1; j < N; ++j) {\n x[i] -= A_local[i][j] * x[j];\n }\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an NxN matrix in row-major. x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a vector to store the solution.\n x.resize(N);\n\n // Create a temporary vector to store the intermediate results.\n std::vector tmp(N);\n\n // Solve the linear system in parallel.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Compute the ith row of the solution.\n tmp[i] = b[i];\n for (size_t j = 0; j < i; ++j) {\n tmp[i] -= A[i * N + j] * x[j];\n }\n tmp[i] /= A[i * N + i];\n }\n\n // Copy the solution to the output vector.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = tmp[i];\n }\n}\n", + "\n\n // Create OpenMP parallel region\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Initialize x to 0.\n x.assign(N, 0);\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Forward substitution\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < i; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Backward substitution\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] -= sum;\n }\n}\n", + "\n\n // Create a copy of A and b for each thread\n std::vector> A_local(omp_get_num_threads(), std::vector(N * N));\n std::vector> b_local(omp_get_num_threads(), std::vector(N));\n\n // Copy A and b to the local arrays\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n A_local[tid][i * N + j] = A[i * N + j];\n }\n b_local[tid][i] = b[i];\n }\n }\n\n // Solve the linear system for each thread\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n std::vector x_local(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < i; j++) {\n sum += A_local[tid][i * N + j] * x_local[j];\n }\n x_local[i] = (b_local[tid][i] - sum) / A_local[tid][i * N + i];\n }\n\n // Copy the local solution to the global solution\n #pragma omp critical\n {\n for (size_t i = 0; i < N; i++) {\n x[i] += x_local[i];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "serial", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. 
x and b have N elements.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n x.resize(N);\n std::vector y(N);\n for (size_t i = 0; i < N; i++) {\n y[i] = b[i];\n }\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] -= sum;\n x[i] = y[i] / A[i * N + i];\n }\n for (size_t i = N - 1; i > 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] -= sum;\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(Ab[j * (N + 1) + i]) > std::abs(Ab[pivot_row * (N + 1) + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N + 1; ++j) {\n std::swap(Ab[i * (N + 1) + j], Ab[pivot_row * (N + 1) + j]);\n }\n }\n\n // Eliminate the current column below the pivot row\n for (size_t j = i + 1; j < N; ++j) {\n double factor = Ab[j * (N + 1) + i] / Ab[i * (N + 1) + i];\n for (size_t k = 0; k < N + 1; ++k) {\n Ab[j * (N + 1) + k] -= factor * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Check if the system is consistent\n for (size_t i = 0; i < N; ++i) {\n if (std::abs(Ab[i * (N + 1) + N]) < 1e-9) {\n 
throw std::runtime_error(\"The system is inconsistent.\");\n }\n }\n\n // Solve the system by back substitution\n x.resize(N);\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = Ab[i * (N + 1) + N];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= Ab[i * (N + 1) + j] * x[j];\n }\n x[i] /= Ab[i * (N + 1) + i];\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n std::vector a = A;\n std::vector bb = b;\n for (size_t i = 0; i < N; i++) {\n size_t pivot = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(a[j * N + i]) > std::abs(a[pivot * N + i])) {\n pivot = j;\n }\n }\n if (pivot != i) {\n std::swap(a[i * N + i], a[pivot * N + i]);\n std::swap(bb[i], bb[pivot]);\n }\n for (size_t j = i + 1; j < N; j++) {\n double factor = a[j * N + i] / a[i * N + i];\n for (size_t k = i; k < N; k++) {\n a[j * N + k] -= factor * a[i * N + k];\n }\n bb[j] -= factor * bb[i];\n }\n }\n for (size_t i = N - 1; i < N; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += a[i * N + j] * x[j];\n }\n x[i] = (bb[i] - sum) / a[i * N + i];\n }\n}\n", + "\n\n std::vector c(N);\n for (size_t i = 0; i < N; i++) {\n c[i] = b[i];\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double t = A[j][i] / A[i][i];\n for (size_t k = 0; k < N; k++) {\n A[j][k] -= t * A[i][k];\n }\n c[j] -= t * c[i];\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n double t = c[i];\n for (size_t j = i + 1; j < N; j++) {\n t -= A[i][j] * x[j];\n }\n x[i] = t / A[i][i];\n }\n}\n", + "\n\n // Create a copy of A\n std::vector A_copy(A);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A_copy[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A_copy[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j * N + k] -= factor * A_copy[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Perform back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A_copy[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A_copy[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find pivot\n size_t pivot = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot * N + i])) {\n pivot = j;\n }\n }\n // Swap rows\n if (pivot != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot * N + j]);\n }\n std::swap(b[i], b[pivot]);\n }\n // Eliminate\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] 
* x[j];\n }\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector> aug(N, std::vector(N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n aug[i][j] = A[i * N + j];\n }\n aug[i][N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double div = aug[i][i];\n for (size_t j = 0; j <= N; ++j) {\n aug[i][j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows\n for (size_t j = i + 1; j < N; ++j) {\n double mult = aug[j][i];\n for (size_t k = 0; k <= N; ++k) {\n aug[j][k] -= mult * aug[i][k];\n }\n }\n }\n\n // Extract the solution from the augmented matrix\n for (size_t i = 0; i < N; ++i) {\n x[i] = aug[i][N];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gauss-Jordan elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows to zero out the ith column\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; i++) {\n x[i] = Ab[i * (N + 1) + N];\n }\n}\n", + "\n\n std::vector A_copy = A;\n std::vector b_copy = b;\n for (size_t i = 0; i < N; i++) {\n double factor = A_copy[i][i];\n for (size_t j = 0; j < N; j++) {\n A_copy[i][j] /= factor;\n }\n b_copy[i] /= factor;\n for (size_t j = i + 1; j < N; j++) {\n double factor2 = A_copy[j][i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j][k] -= factor2 * A_copy[i][k];\n }\n b_copy[j] -= factor2 * b_copy[i];\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A_copy[i][j] * x[j];\n }\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = b[i] - sum;\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; i++) {\n x[i] = 
Ab[i * (N + 1) + N];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap pivot row with current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector augmentedMatrix(N * (N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n augmentedMatrix[i * (N + 1) + j] = A[i * N + j];\n }\n augmentedMatrix[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = augmentedMatrix[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; ++j) {\n augmentedMatrix[i * (N + 1) + j] /= factor;\n }\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = 0; j < N; ++j) {\n if (i == j) continue;\n double factor = augmentedMatrix[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; ++k) {\n augmentedMatrix[j * (N + 1) + k] -= factor * augmentedMatrix[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; ++i) {\n x[i] = augmentedMatrix[i * (N + 1) + N];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i 
* N + i];\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from all other rows\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution from the augmented matrix\n for (size_t i = 0; i < N; i++) {\n x[i] = Ab[i * (N + 1) + N];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "hip", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\n__global__ void solveLinearSystem(const double *A, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] -= sum * A[row * N + col] / A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.112025254, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00408466315269 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x + hipGridDim_x * hipBlockIdx_y;\n if (tid < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[bid * N + i] * x[i];\n }\n x[tid] = (b[bid] - sum) / A[bid * N + tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1123125832, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00262655639648 + } + ] + }, + { + "generated_output": "\n\n 
int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n x[i] += A[i * N + j] * b[j];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1121034056, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0004142072022 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n x[row] = b[row] / A[row * N + col];\n } else {\n x[row] -= A[row * N + col] * x[col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1127803142, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0004420474112 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1129556798, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00039801459908 + } + ] + }, + { + "generated_output": "\n\n int row = hipThreadIdx_x;\n int col = hipBlockIdx_x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n }\n __syncthreads();\n for (int k = row + 1; k < N; k++) {\n if (A[k * N + col] != 0) {\n double factor = A[k * N + col] / A[row * N + col];\n for (int j = 0; j < N; j++) {\n A[k * N + j] -= factor * A[row * N + j];\n }\n b[k] -= factor * b[row];\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n }\n __syncthreads();\n for (int i = row + 1; i < N; i++) {\n double factor = A[i * N + col] / A[row * N + col];\n for (int k = 0; k < N; k++) {\n A[i * N + k] -= factor * A[row * N + k];\n }\n b[i] -= factor * b[row];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1107408336, + "runs": [ + { + "did_run": true, 
+ "is_valid": false, + "runtime": 0.00038278340697 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1121974046, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00041158279777 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; ++k) {\n sum += A[row * N + k] * x[k];\n }\n A[row * N + col] -= A[col * N + col] * sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1114107838, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00037811140418 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1112922716, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.01803119354248 + } + ] + }, + { + "generated_output": "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1123356828, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.01740890731812 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1112891556, + "runs": [ + { + "did_run": true, + 
"is_valid": false, + "runtime": 0.0001743033953 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[col] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1135775926, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.01754983482361 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n x[i] += A[i * N + j] * b[j];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1129319264, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00033574299216 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.111818404, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00016252739988 + } + ] + }, + { + "generated_output": "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1108974874, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.01749309501648 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1098649182, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00038054319322 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1104121264, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00034051139653 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "mpi", + 
"prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. Store the result in x on rank 0.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the linear system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solutions from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n // Create a matrix object\n Eigen::MatrixXd A_eigen(N, N);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n A_eigen(i, j) = A[i * N + j];\n }\n }\n\n // Create a vector object for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; i++) {\n b_eigen(i) = b[i];\n }\n\n // Solve the system\n Eigen::VectorXd x_eigen = A_eigen.colPivHouseholderQr().solve(b_eigen);\n\n // Copy the solution to x\n for (size_t i = 0; i < N; i++) {\n x[i] = x_eigen(i);\n }\n }\n\n // Broadcast x to all ranks\n MPI_Bcast(x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "_t*", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector local_x(N, 0);\n std::vector local_b(N, 0);\n for (size_t i = 0; i < N; i++) {\n local_b[i] = b[i];\n }\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_x[i] += A[i * N + j] * local_b[j];\n }\n }\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct solution\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves for a portion of x\n size_t chunk_size = (N + size - 1) / size;\n size_t start_idx = rank * chunk_size;\n size_t end_idx = std::min(start_idx + chunk_size, N);\n\n std::vector local_x(chunk_size);\n for (size_t i = start_idx; i < end_idx; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n local_x[i - start_idx] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solutions from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Rank 0 gathers the solution\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n x.resize(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n }\n\n // Gather the solution from rank 0 to all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Solve the linear system on rank 0\n if (rank == 0) {\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i > 0; --i) {\n for (size_t j = i - 1; j >= 0; --j) {\n b[j] -= A[j * N + i] * b[i];\n }\n }\n\n // Store the solution in x\n x = b;\n }\n\n // Broadcast the solution to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system locally on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the solution\n std::vector local_x(N);\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on rank 0\n if (rank == 0) {\n // Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (int j = i + 1; j < N; j++) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n }\n\n // Broadcast the solution to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Each rank solves a part of the system\n for (size_t i = rank; i < N; i += size) {\n double factor = A[i * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Broadcast the updated row\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, rank, comm);\n MPI_Bcast(&b[i], 1, MPI_DOUBLE, rank, comm);\n\n // Update the remaining rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Gather the solution from all ranks\n MPI_Gather(&b[rank], 1, MPI_DOUBLE, &x[0], 1, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system locally\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions to rank 0\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system on rank 0\n if (rank == 0) {\n // Create a matrix object\n Eigen::MatrixXd A_eigen(N, N);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n A_eigen(i, j) = A[i * N + j];\n }\n }\n\n // Create a vector for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; i++) {\n b_eigen(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_eigen = A_eigen.colPivHouseholderQr().solve(b_eigen);\n\n // Copy the result to x\n for (size_t i = 0; i < N; i++) {\n x[i] = x_eigen(i);\n }\n }\n\n // Broadcast the result to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // 
Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Check if the input is valid\n if (A.size() != N * N || b.size() != N || x.size() != N) {\n throw std::invalid_argument(\"Invalid input\");\n }\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (int j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "cuda", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\n__global__ void solveLinearSystem(const double *A, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + ":", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * 
N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n for (int k = 0; k < N; k++) {\n x[row] -= A[row * N + k] * b[k];\n }\n x[row] /= A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int i = 0; i < col; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n } else if (row > col) {\n double sum = 0;\n for (int i = 0; i < col; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n for (int i = col + 1; i < row; i++) {\n A[row * N + i] -= A[row * N + col] * A[i * N + col] / A[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n } else {\n x[row] -= A[row * N + col] * x[col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n x[row] = b[row] / A[row * N + col];\n }\n for (int i = col + 1; i < N; i++) {\n x[row] -= A[row * N + i] * x[i] / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Each rank solves for a portion of x\n #pragma omp parallel for\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data() + rank * N / size, N / size, MPI_DOUBLE, x.data(), N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Compute the inverse of A on rank 0.\n std::vector invA;\n if (my_rank == 0) {\n invA.resize(N * N);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n invA[i * N + j] = (i == j) ? 
1.0 : 0.0;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n double inv = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n invA[i * N + j] *= inv;\n A[i * N + j] *= inv;\n }\n b[i] *= inv;\n }\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n invA[j * N + k] -= factor * invA[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n for (size_t i = N - 1; i > 0; --i) {\n for (size_t j = i - 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n invA[j * N + k] -= factor * invA[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Broadcast the inverse of A to all ranks.\n MPI_Bcast(invA.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on all ranks.\n x.resize(N);\n for (size_t i = 0; i < N; ++i) {\n x[i] = 0.0;\n for (size_t j = 0; j < N; ++j) {\n x[i] += invA[i * N + j] * b[j];\n }\n }\n\n // Gather the results to rank 0.\n if (my_rank == 0) {\n for (size_t i = 1; i < num_ranks; ++i) {\n std::vector recv_x(N);\n MPI_Recv(recv_x.data(), N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N; ++j) {\n x[j] += recv_x[j];\n }\n }\n } else {\n MPI_Send(x.data(), N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Rank 0 gathers the solution from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the solution\n std::vector local_x(N);\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_x[i] += A[i * N + j] * b[j];\n }\n }\n\n // Gather the local solutions into the global solution\n MPI_Allgather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete solution\n if (rank == 0) {\n for (size_t i = 1; i < size; i++) {\n for (size_t j = 0; j < N; j++) {\n x[j] += local_x[i * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves the system for a subset of the rows\n size_t rowsPerRank = (N + size - 1) / size;\n size_t startRow = rank * 
rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n // Solve the system for the subset of rows\n std::vector localX(endRow - startRow);\n for (size_t i = startRow; i < endRow; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n localX[i - startRow] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(localX.data(), endRow - startRow, MPI_DOUBLE, x.data(), endRow - startRow, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Each rank computes its part of the solution\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n \n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system on each rank\n std::vector local_x(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row.\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row.\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n\n // Eliminate the current column.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n std::vector local_x(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Check if the input is valid\n if (A.size() != N * N || b.size() != N || x.size() != N) {\n throw std::invalid_argument(\"Invalid input\");\n }\n\n // Initialize x to 0\n std::fill(x.begin(), x.end(), 0.0);\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for 
(size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(x.data(), N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the local portion of the solution\n std::vector local_x(N, 0.0);\n for (size_t i = my_rank; i < N; i += num_procs) {\n for (size_t j = 0; j < N; ++j) {\n local_x[i] += A[i * N + j] * b[j];\n }\n }\n\n // Reduce the local solutions to get the global solution\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves for a subset of the rows of x.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Solve for the subset of rows of x on this rank.\n for (size_t i = start_row; i < end_row; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks.\n MPI_Gather(x.data() + start_row, end_row - start_row, MPI_DOUBLE,\n x.data(), end_row - start_row, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the matrix\n size_t local_N = (N + size - 1) / size;\n\n // Allocate memory for the local matrix and vector\n std::vector local_A(local_N * N);\n std::vector local_b(local_N);\n\n // Scatter the matrix and vector to the local arrays\n MPI_Scatter(A.data(), N * N, MPI_DOUBLE, local_A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(b.data(), N, MPI_DOUBLE, local_b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the local linear system\n std::vector 
local_x(local_N);\n for (size_t i = 0; i < local_N; i++) {\n double sum = 0.0;\n for (size_t j = 0; j < N; j++) {\n sum += local_A[i * N + j] * local_x[j];\n }\n local_x[i] = (local_b[i] - sum) / local_A[i * N + i];\n }\n\n // Gather the local solutions to the global vector\n MPI_Gather(local_x.data(), local_N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Normalize the global solution\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n x[i] /= size;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n local_x[i] = b[i];\n for (size_t j = 0; j < i; ++j) {\n local_x[i] -= A[i * N + j] * x[j];\n }\n }\n for (size_t i = N - 1; i >= 0; --i) {\n for (size_t j = i + 1; j < N; ++j) {\n local_x[i] -= A[i * N + j] * local_x[j];\n }\n local_x[i] /= A[i * N + i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the linear system in parallel\n #pragma omp parallel\n {\n // Each thread solves a part of the system\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row.\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row.\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n\n // Eliminate the current column.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(Kokkos::View &A, Kokkos::View &B, Kokkos::View &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; ++j) {\n double sum = 0.0;\n for (int k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k) * B(k, j);\n }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n 
});\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. 
The matrices are stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma opm parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (int i = 0; i < M; ++i) {\n for (int j = 0; j < N; ++j) {\n C[i * N + j] = 0;\n for (int k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n 
}\n}\n", + "\n\n C.resize(M * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n C.resize(M * N);\n omp_set_num_threads(omp_get_max_threads());\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "serial", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. 
The matrices are stored in row-major.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n C[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A[i * K + k] * B[k * N + j]; }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 
0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "hip", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an MxN grid of threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\n__global__ void gemm(const double *A, const double *B, double *C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0040493338, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0002530226022 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0039677866, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00024758320302 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) { sum += A[row * K + k] * B[k * N + col]; }\n C[row * N + col] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0040388458, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00051628760695 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] 
* B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0040076284, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00024825499654 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.003988945, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00023987119943 + } + ] + }, + { + "generated_output": "\n\n size_t row = blockIdx.x;\n size_t col = blockIdx.y;\n C[row * N + col] = 0;\n for (size_t k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0045022218, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00034918319285 + } + ] + }, + { + "generated_output": "\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.004196058, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00035676699877 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0043125158, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00035292719901 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0039863866, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00024495940208 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0042066716, + "runs": [ + { + "did_run": true, + "is_valid": false, + 
"runtime": 0.000347519207 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n C[row * N + col] = 0;\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0043457158, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00036198299825 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0.0;\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0042081014, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00035459160507 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0042341232, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00034582299888 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0040143844, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00052911859751 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0039551252, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00024015939981 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0039683772, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00023107139915 + } + ] + }, + { + "generated_output": "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n size_t col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n 
double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0039968348, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00051657499671 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0043282402, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0003501750052 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0040501258, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053052799106 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0039484792, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0002393274039 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and B. 
Store the result in C on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the local portion of C.\n std::vector local_C(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to the global C on rank 0.\n MPI_Reduce(local_C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of processes and the rank of the current process\n int num_procs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the number of rows each process will compute\n int rows_per_proc = M / num_procs;\n\n // Compute the starting row for each process\n int start_row = rank * rows_per_proc;\n\n // Compute the ending row for each process\n int end_row = start_row + rows_per_proc;\n\n // Initialize the result matrix on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n\n // Compute the partial result on each process\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n double dot_product = 0;\n for (int k = 0; k < K; ++k) {\n dot_product += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = dot_product;\n }\n }\n\n // Gather the partial results on rank 0\n if (rank != 0) {\n MPI_Gather(&C[start_row * N], rows_per_proc * N, MPI_DOUBLE, &C[0], rows_per_proc * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast M, K, and N to all ranks.\n MPI_Bcast(&M, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n MPI_Bcast(&K, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n MPI_Bcast(&N, 1, MPI_UNSIGNED_LONG, 0, 
MPI_COMM_WORLD);\n\n // Compute the local portion of C.\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Reduce the local portions of C to rank 0.\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast A and B to all ranks\n MPI_Bcast(&A[0], M * K, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&B[0], K * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute C on each rank\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Gather C from all ranks to rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local part of C\n std::vector local_C(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n MPI_Gather(local_C.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_C.data(), M * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and the number of processes\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the local size of the matrix C\n size_t local_M = M / num_procs;\n if (rank == num_procs - 1) {\n local_M += M % num_procs;\n }\n\n // Compute the local part of the matrix C\n std::vector local_C(local_M * N);\n for (size_t i = 0; i < local_M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N + j] = sum;\n }\n }\n\n // Gather the local parts of the matrix C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 0; i < num_procs; ++i) {\n size_t local_M = M / num_procs;\n if (i == num_procs - 1) {\n local_M += M % num_procs;\n }\n MPI_Recv(&C[i * local_M * N], local_M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_C[0], local_M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the local portion of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to the root rank.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N, 0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * 
B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(M * N);\n MPI_Recv(recv_buf.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n C[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(C.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C\n std::vector localC(M * N, 0);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Each rank has a complete copy of A and B.\n // Store the result in C on rank 0.\n if (rank == 0) {\n C.clear();\n C.reserve(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C.push_back(sum);\n }\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result matrix\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local results into the global result matrix\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the matrix C\n size_t local_M = M / size;\n size_t local_N = N;\n\n // Compute the local part of the matrix C\n std::vector local_C(local_M * local_N);\n for (size_t i = 0; i < local_M; ++i) {\n for (size_t j = 0; j < local_N; ++j) 
{\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * local_N + j] = sum;\n }\n }\n\n // Gather the local parts of the matrix C into the global matrix C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(local_M * local_N);\n MPI_Recv(recv_buf.data(), local_M * local_N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < local_M * local_N; ++j) {\n C[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(local_C.data(), local_M * local_N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and the number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N / size);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N / size + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "cuda", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an MxN grid of threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\n__global__ void gemm(const double *A, const double *B, double *C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t row = blockIdx.x;\n size_t col = blockIdx.y;\n size_t i = threadIdx.x;\n size_t j = threadIdx.y;\n C[row * N + col] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * 
N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n double sum = 0;\n for (k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "(", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.y * blockDim.y + threadIdx.y;\n int j = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M && j < N) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and B. 
Store the result in C on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C to rank 0\n MPI_Gather(localC.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to zero\n C.assign(M * N, 0.0);\n\n // Compute the product of A and B in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(C.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the product of A and B\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n // Initialize C to zero\n C.assign(M * N, 0.0);\n\n // Compute C using MPI and OpenMP\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce C to rank 0\n MPI_Reduce(C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the local part of C\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the partial sum for each element of C\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the partial sums from all ranks to rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.resize(M * N);\n std::fill(C.begin(), C.end(), 0);\n\n // Compute the local part of C\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the 
local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; i++) {\n std::vector recvC(M * N / size);\n MPI_Recv(recvC.data(), M * N / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N / size; j++) {\n C[j] += recvC[j];\n }\n }\n } else {\n MPI_Send(localC.data(), M * N / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; ++i) {\n std::vector recvC(M * N / size);\n MPI_Recv(recvC.data(), M * N / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n std::copy(recvC.begin(), recvC.end(), C.begin() + i * M * N / size);\n }\n } else {\n MPI_Send(localC.data(), M * N / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n // Initialize C to 0\n C.assign(M * N, 0.0);\n\n // Perform the matrix multiplication in parallel\n #pragma opm parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result matrix\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local results into the global result matrix\n MPI_Gather(&local_C[0], M * N / size, MPI_DOUBLE, &C[0], M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot_product = 0.0;\n for (size_t k = 0; k < K; ++k) {\n dot_product += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = dot_product;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local portion of C\n std::vector local_C(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to rank 0\n MPI_Reduce(local_C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n 
double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix, x has N elements, and y has M elements.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(Kokkos::View &A, Kokkos::View &x, Kokkos::View &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) { y[i] += A(i, j) * x[j]; }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M),\n KOKKOS_LAMBDA(const size_t i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "mv", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n double dot = 0.0;\n for (size_t j = 0; j < N; ++j) { dot += A(i, j) * x(j); }\n y(i) = dot;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", 
+ "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA (const int i) {\n y(i) = 0;\n for (int j = 0; j < N; ++j) {\n y(i) += A(i, j) * x(j);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma openmp parallel for\n for (size_t i = 
0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n // Initialize y to 0\n y.assign(M, 0);\n\n // Compute the matrix-vector product in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double dot_product = 0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A[i * N + j] * x[j];\n }\n y[i] = dot_product;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0.0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "serial", + "prompt": "/* Multiply the matrix A by the vector x. 
Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) { dot += A[i * N + j] * x[j]; }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) {\n dot += A[i * N + j] * x[j];\n }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) {\n dot += A[i * N + j] * x[j];\n }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "hip", + "prompt": "/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least M threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\n__global__ void gemv(const double *A, const double *x, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int block_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = bid * block_size + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0657682064, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03470630645752 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int i = tid / N;\n int j = tid % N;\n if (i < M && j < N) {\n y[i] += A[i * N + j] * x[j];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0688671376, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00018342179861 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = bid * blockDim.x + tid;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0659746392, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.05516881103516 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; ++col) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.065532884, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.05762346343994 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0658074536, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03484842453003 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) {\n y[tid] += A[tid * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.065442741, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03448813323975 + } + ] + }, + { + 
"generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0654034518, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03471590957642 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.065751821, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0350957244873 + } + ] + }, + { + "generated_output": "\n\n int i = hipThreadIdx_x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0658167396, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02963634796143 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; ++col) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0658864146, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03417206802368 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0649350828, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03457290267944 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0656901098, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03518867797852 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0658584354, + "runs": [ + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.05885209197998 + } + ] + }, + { + "generated_output": "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0652771502, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03482179718018 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0.0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0655687918, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03558220825195 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n int i = tid + bid * blockSize;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.065271676, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03490352783203 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0655184704, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0342923576355 + } + ] + }, + { + "generated_output": "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) { sum += A[i * N + j] * x[j]; }\n y[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.06566617, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.05334672927856 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n\n double sum = 0;\n for (int i = tid; i < M; i += blockSize) {\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n }\n y[tid] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0652294666, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.60385516357422 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n\n y[tid] = 0;\n for (int j = 0; j < N; j++) {\n 
y[tid] += A[tid + bid * blockSize + j * M] * x[j];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0688820674, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00498344640732 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and x. Store the result in y on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector multiplication\n std::vector local_y(M / size);\n for (size_t i = rank * (M / size); i < (rank + 1) * (M / size); ++i) {\n local_y[i - rank * (M / size)] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i - rank * (M / size)] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the local results into the global vector y on rank 0\n MPI_Gather(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial dot product on each rank\n std::vector partial_y(M, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n partial_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the partial dot products to get the final result\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank != 0) {\n y.clear();\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product.\n std::vector local_y(M);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the local results into the global vector y on rank 0.\n MPI_Gather(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M / size);\n for (size_t i = 0; i < M / size; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + 
j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Each rank computes a partial sum of the dot product\n std::vector partial_sums(M, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n partial_sums[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sums(M, 0.0);\n MPI_Reduce(partial_sums.data(), global_sums.data(), M, MPI_DOUBLE, MPI_SUM, 0, comm);\n\n // Store the result in y on rank 0\n if (rank == 0) {\n y = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n y = local_y;\n for (int i = 1; i < size; ++i) {\n std::vector recv_y(M);\n MPI_Recv(recv_y.data(), M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M; ++j) {\n y[j] += recv_y[j];\n }\n }\n } else {\n MPI_Send(local_y.data(), M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", 
+ "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) { local_y[i] += A[i * N + j] * x[j]; }\n }\n\n MPI_Reduce(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the dot product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local dot products into the global vector y on rank 0\n MPI_Reduce(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "cuda", + "prompt": "/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least M threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\n__global__ void gemv(const double *A, const double *x, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n int tid = block.thread.x;\n int row = block.block.x;\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) {\n y[row] += A[row * N + col] * x[col];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int i = 0; i < N; i++) {\n y[tid] += A[tid * N + i] * x[i];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n for (int i = bid * bid_size + tid; i < M; i += bid_size) {\n double sum = 0;\n for (int j = 0; j < N; j++) { sum += A[i * N + j] * x[j]; }\n y[i] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n y[row] = 0;\n for (int k = 0; k < N; k++) {\n y[row] += A[row * N + k] * x[k];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n y[row] = 0;\n for (int i = 0; i < N; i++) {\n y[row] += A[row * N + i] * x[i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n\n double sum = 0;\n\n for (int i = bid * bid_size + tid; i < M; i += grid_size * bid_size) {\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n }\n\n y[tid] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * 
x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = bid * blockDim.x + tid;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and x. Store the result in y on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n\n // Gather the results on rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M, 0.0);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion 
of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root rank\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute local dot products\n std::vector local_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce local dot products to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n 
#pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n\n // Gather the results on rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize y to 0 on all ranks\n y.assign(M, 0.0);\n\n // Compute the local portion of y on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A(k, j) -= A(k, i) * A(i, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) -= A(j, i) * A(i, i);\n }\n });\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n // Iterate over the columns of the matrix\n for (size_t j = 0; j < N; ++j) {\n // Divide the elements of the current column by the diagonal element\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n A(i, j) /= A(j, j);\n });\n\n // Subtract the current column from the remaining columns\n for (size_t k = j + 1; k < N; ++k) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n A(i, k) -= A(i, j) * A(j, k);\n });\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A(i, j) /= A(i, i);\n } else if (i > j) {\n A(i, j) -= A(i, j) * A(j, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = 0; k < i; k++) {\n A(j, k) -= multiplier * A(i, k);\n }\n A(j, i) = multiplier;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = 
i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; ++k) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j][i] /= A[i][i];\n }\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::RangePolicy(i + 1, N),\n KOK_LAMBDA(const Kok\u30b3\u30af::IndexType &j) {\n for (size_t k = 0; k < i; k++) {\n A[j][i] -= A[j][k] * A[i][k];\n }\n });\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) = A(j, i) / A(i, i);\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A(k, j) = A(k, j) - A(k, i) * A(i, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n double m = A(j, i) / A(i, i);\n for (size_t k = i; k < N; k++) {\n A(j, k) -= m * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; ++k) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += A(i, k) * A(k, j);\n }\n A(i, j) -= sum;\n } else if (i > j) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A(i, k) * A(k, j);\n }\n A(i, j) = (A(i, j) - sum) / A(j, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. 
\n A is an NxN matrix stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; k++) {\n // Normalize the kth row\n double norm = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k; j < N; j++) {\n A[k * N + j] /= norm;\n }\n\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k];\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; ++k) {\n // Normalize the k-th row\n double norm = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k; j < N; ++j) {\n A[k * N + j] /= norm;\n }\n\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n double factor = A[i * N + k];\n for (size_t j = k; j < N; ++j) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n double factor = A[i * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= A[i * N + k] * factor;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n // Divide the current row by the diagonal element\n double factor = A[i * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n\n // Subtract multiples of the current row from all the rows below it\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + ".", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < 
N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n#pragma omp parallel for\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = i; j < N; j++) {\n#pragma omp parallel for\n for (size_t k = 0; k < i; k++) {\n A[j * N + i] -= A[j * N + k] * A[k * N + i];\n }\n A[j * N + i] /= A[i * N + i];\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; ++k) {\n // Compute the multiplier for row k\n double multiplier = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k + 1; j < N; ++j) {\n A[j * N + k] /= multiplier;\n }\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "serial", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. 
\n A is an NxN matrix stored in row-major.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + i] * A[j * N + j];\n } else {\n A[i * N + j] /= A[j * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; ++k) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] = A[j * N + i] / A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] = A[j * N + k] - A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + j];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + j] * A[j * N + j];\n } else {\n A[i * N + j] /= A[j * N + j];\n A[i * N + j] *= -1;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) 
{\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i * N + k] * A[k * N + j];\n }\n A[i * N + j] = (A[i * N + j] - sum) / A[j * N + j];\n }\n for (size_t j = i; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += A[i * N + k] * A[k * N + j];\n }\n A[i * N + j] = A[i * N + j] - sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A[j][i] / A[i][i];\n for (size_t k = i; k < N; k++) {\n A[j][k] -= multiplier * A[i][k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1;\n } else if (i > j) {\n A[i * N + j] = 0;\n }\n }\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "hip", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\n__global__ void luFactorize(double *A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.108934647, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00139353640079 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x + blockIdx.y * gridDim.x;\n int col = threadIdx.x + threadIdx.y * blockDim.x;\n if (row < N && col < N) {\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.108893928, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00021411200017 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double temp = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= temp * A[col * N + i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1084817048, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.03563882141113 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1095851822, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00134537498951 + } + ] + }, + { + "generated_output": "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = A[row * N + col] - sum;\n }\n}\n", + "source_write_success": true, + 
"did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1110854894, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00615987234116 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.111406045, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00119449360371 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x + blockIdx.y * gridDim.x;\n int col = threadIdx.x + threadIdx.y * blockDim.x;\n if (row < col) {\n A[row * N + col] = 0.0;\n } else if (row == col) {\n A[row * N + col] = 1.0;\n } else {\n A[row * N + col] = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= A[row * N + col] * A[col * N + k];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1108100414, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02581163635254 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x + blockIdx.y * gridDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.x;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double temp = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= temp * A[j * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1092421718, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02557184143066 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double temp = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= temp * A[j * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1072714304, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.03590777664185 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.108891437, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00135554940701 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1087659668, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00143378961086 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1073298212, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00140841083527 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.107767129, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00132435059547 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[col * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.107129838, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00028051080704 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[col * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.107922961, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.000199935399 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x + blockIdx.y * gridDim.x;\n int i = tid / 
N;\n int j = tid % N;\n if (i < j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; ++k) { A[k * N + j] -= A[k * N + i] * A[i * N + j]; }\n } else if (i == j) {\n A[i * N + j] = 1;\n for (int k = i + 1; k < N; ++k) { A[k * N + j] /= A[i * N + i]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1083860562, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.01278445930481 + } + ] + }, + { + "generated_output": "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1070680568, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00615659761429 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n\n if (row == col) {\n double pivot = A[row * N + col];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] /= pivot;\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col + 1; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1077995968, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00550817937851 + } + ] + }, + { + "generated_output": "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int k = row + 1; k < N; k++) {\n A[k * N + col] -= A[k * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.107337519, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0012813090086 + } + ] + }, + { + "generated_output": "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double factor = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= factor * A[j * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.107104128, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.03563785095215 + } + ] + } + ] + }, + { + "problem_type": 
"dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. Store the result in A on rank 0.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Forward elimination\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row to all other ranks\n MPI_Bcast(&A[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row below the k-th row\n for (size_t i = k + 1; i < N; i++) {\n // Compute the multiplier\n double multiplier = A[i * N + k] / A[k * N + k];\n\n // Subtract the k-th row from the i-th row\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= multiplier * A[k * N + j];\n }\n }\n }\n\n // Gather the result to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // For each column j in row i\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in the lower triangular matrix L\n if (i > j) {\n // Subtract the product of the ith row of L and the jth column of U from the ith row of A\n for (size_t k = 0; k < j; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n // If i <= j, then A[i][j] is in the upper triangular matrix U\n else {\n // Divide the ith row of A by the jth column of L\n for (size_t k = 0; k < j; k++) {\n A[i * N + j] /= A[j * N + k];\n }\n\n // Subtract the product of the ith row of L and the jth column of U from the ith row of A\n for (size_t k = j + 1; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations to eliminate elements below the diagonal\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * 
N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // For each column j\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in the lower triangular matrix L\n if (i > j) {\n // Compute A[i][j] as A[i][j] - A[i][0] * A[0][j]\n A[i * N + j] -= A[i * N] * A[j];\n }\n // Otherwise, A[i][j] is in the upper triangular matrix U\n else {\n // Compute A[i][j] as A[i][j] - A[i][0] * A[0][j] / A[0][0]\n A[i * N + j] -= A[i * N] * A[j] / A[0];\n }\n }\n\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N + i], 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // For each column j\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in L\n if (i > j) {\n // Compute A[i][j]\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i][k] * A[k][j];\n }\n A[i][j] = (A[i][j] - sum) / A[j][j];\n }\n // If i <= j, then A[i][j] is in U\n else {\n // Compute A[i][j]\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i][k] * A[k][j];\n }\n A[i][j] = A[i][j] - sum;\n }\n }\n\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization on rank 0\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; 
k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n\n // Gather the result from rank 0 to all other ranks\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Normalize the k-th row of A\n double pivot = A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[k * N + j] /= pivot;\n }\n\n // Update the remaining rows of A\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all ranks\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix A from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Forward elimination\n for (size_t i = 0; i < N; i++) {\n // Broadcast the pivot row to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all other ranks\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // Rank 0 broadcasts the ith row of L to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each column j\n for 
(size_t j = i; j < N; j++) {\n // Compute A[i][j] -= A[i][k] * A[k][j] for k < i\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n\n // Divide A[i][j] by A[i][i]\n A[i * N + j] /= A[i * N + i];\n }\n }\n\n // Gather the result from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + j];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + j] * A[j * N + j];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i in A\n for (size_t i = 1; i < N; i++) {\n // If rank 0, compute the multiplier for row i\n double multiplier = 0;\n if (rank == 0) {\n multiplier = A[i * N] / A[0];\n }\n\n // Broadcast the multiplier to all ranks\n MPI_Bcast(&multiplier, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each element in row i, subtract the corresponding element in row 0 multiplied by the multiplier\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] -= multiplier * A[j];\n }\n }\n\n // If rank 0, divide each element in row 0 by A[0][0]\n if (rank == 0) {\n for (size_t j = 0; j < N; j++) {\n A[j] /= A[0];\n }\n }\n\n // For each row i in A\n for (size_t i = 1; i < N; i++) {\n // For each element in row i, divide by A[i][i]\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n\n // For each row k after row i, subtract the corresponding element in row i multiplied by A[k][i]\n for (size_t k = i + 1; k < N; k++) {\n double multiplier = A[k * N + i];\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= multiplier * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(A.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Each rank computes its own row of L and U\n for (size_t i = 1; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A[i * N + j] /= A[j * N + j];\n }\n for (size_t j = i; j < N; j++) {\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks into rank 0\n MPI_Gather(A.data(), N * N, MPI_DOUBLE, A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "cuda", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\n__global__ void luFactorize(double *A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double temp = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= temp * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row < col) {\n A[row * N + col] = A[row * N + col] / A[row * N + row];\n } else if (row == col) {\n for (int k = 0; k < row; k++) {\n A[row * N + col] -= A[row * N + k] * A[k * N + col];\n }\n } else {\n for (int k = 0; k < col; k++) {\n A[row * N + col] -= A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] /= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i == j) {\n A[i * N + i] /= A[i * N + j];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n 
for (int k = j; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "(", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) return;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else if (row > col) {\n double factor = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= factor * A[col * N + k];\n }\n A[row * N + col] = factor;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else {\n double factor = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= factor * A[col * N + k];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; 
i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col + 1; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
Store the result in A on rank 0.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "(5.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n // Divide the ith row by A[i][i]\n double div = A[i * N + i];\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= div;\n }\n\n // Subtract the ith row from all rows below it\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n }\n\n // Broadcast the result to all ranks\n MPI_Bcast(A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization using Gaussian elimination\n for (size_t k = 0; k < N; k++) {\n // Pivot if necessary\n size_t max_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > std::abs(A[max_row * N + k])) {\n max_row = i;\n }\n }\n if (max_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[max_row * N + j]);\n }\n }\n\n // Eliminate the k-th column below the k-th row\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&A[i * N * N], N * N, MPI::DOUBLE, i, 0);\n }\n } else {\n MPI::COMM_WORLD.Send(&A[0], N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // MPI rank\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // MPI size\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Number of rows per rank\n int rows_per_rank = N / size;\n\n // Local matrix\n std::vector local_A(rows_per_rank * N);\n\n // Scatter the matrix to all ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Local lu factorization\n for (size_t i = 0; i < rows_per_rank; i++) {\n for (size_t j = i + 1; j < N; j++) {\n local_A[i * N + j] /= local_A[i * N + i];\n }\n for (size_t k = i + 1; k < rows_per_rank; k++) {\n for (size_t j = i + 1; j < N; j++) {\n local_A[k * N + j] -= local_A[k * N + i] * local_A[i * N + j];\n }\n }\n }\n\n // Gather the results back to rank 0\n MPI_Gather(&local_A[0], rows_per_rank * N, MPI_DOUBLE, &A[0], rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n 
if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= multiplier * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the rank and number of threads\n int rank = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n // Compute the block size for each thread\n size_t block_size = N / num_threads;\n\n // Each thread factorizes its own block of rows\n for (size_t i = rank * block_size; i < (rank + 1) * block_size; i++) {\n // Pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap pivot row with current row\n if (pivot_row != i) {\n for (size_t k = 0; k < N; k++) {\n std::swap(A[i * N + k], A[pivot_row * N + k]);\n }\n }\n\n // Eliminate lower triangular elements\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n }\n\n // Gather the factorized matrix from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix A from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Divide the current row by the pivot element\n double pivot = A[i * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= pivot;\n }\n\n // Subtract multiples of the current row from the remaining rows\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (MPI_Comm_rank(MPI_COMM_WORLD) == 0) {\n for (int i = 1; i < 
MPI_Comm_size(MPI_COMM_WORLD); i++) {\n MPI_Recv(&A[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the rank and number of threads\n int rank = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n // Compute the block size for each thread\n size_t block_size = N / num_threads;\n\n // Each thread factorizes its own block of the matrix\n for (size_t i = rank * block_size; i < (rank + 1) * block_size; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n }\n\n // Gather the results from all ranks and store them in A on rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform Gaussian elimination\n for (size_t k = 0; k < N; k++) {\n // Pivot row\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > std::abs(A[pivot_row * N + k])) {\n pivot_row = i;\n }\n }\n\n // Swap pivot row with row k\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n }\n }\n\n // Eliminate non-zero elements below pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Get the number of processes\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes a part of the matrix\n size_t chunk_size = N / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = start_row + chunk_size;\n\n // Perform LU factorization on the assigned part of the matrix\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < i; j++) {\n double multiplier = A[i][j] / A[j][j];\n for (size_t k = j; k < N; k++) {\n A[i][k] -= multiplier * A[j][k];\n }\n }\n for (size_t j = i; j < N; j++) {\n A[i][j] /= A[i][i];\n }\n }\n\n // Gather the results from all processes\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * chunk_size * N], chunk_size * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[start_row * N], chunk_size * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows per process\n int 
rowsPerProcess = N / size;\n\n // Compute the starting row index for this process\n int startRow = rank * rowsPerProcess;\n\n // Compute the ending row index for this process\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Factorize the submatrix of A owned by this process\n for (int i = startRow; i < endRow; ++i) {\n // Compute the pivot element\n double pivot = A[i][i];\n\n // Broadcast the pivot element to all other processes\n MPI_Bcast(&pivot, 1, MPI_DOUBLE, rank, MPI_COMM_WORLD);\n\n // Eliminate the elements below the pivot element\n for (int j = i + 1; j < N; ++j) {\n A[j][i] -= pivot * A[j][i] / A[i][i];\n }\n }\n\n // Reduce the matrix A to upper triangular form\n for (int k = 0; k < N - 1; ++k) {\n // Compute the pivot element\n double pivot = A[k][k];\n\n // Broadcast the pivot element to all other processes\n MPI_Bcast(&pivot, 1, MPI_DOUBLE, rank, MPI_COMM_WORLD);\n\n // Eliminate the elements above the pivot element\n for (int i = 0; i < k; ++i) {\n A[i][k] -= pivot * A[i][k] / A[k][k];\n }\n }\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Broadcast the pivot row to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all ranks\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 performs the row reduction on the ith row\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int myRank, numProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numProcs);\n\n // Each rank computes its own part of the matrix\n #pragma omp parallel for\n for (size_t i = myRank; i < N; i += numProcs) {\n for (size_t j = 0; j < i; j++) {\n A[i * N + j] /= A[j * N + j];\n for (size_t k = j + 1; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n }\n\n // Gather the 
results to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize L and U\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n double max = std::abs(A[k * N + k]);\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > max) {\n max = std::abs(A[i * N + k]);\n pivot_row = i;\n }\n }\n\n // Swap the pivot row with the k-th row\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n }\n }\n\n // Divide the pivot row by the pivot element\n double pivot = A[k * N + k];\n for (size_t j = 0; j < N; j++) {\n A[k * N + j] /= pivot;\n }\n\n // Subtract multiples of the pivot row from the other rows\n for (size_t i = 0; i < N; i++) {\n if (i != k) {\n double factor = A[i * N + k];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n }\n\n // Gather the result to rank 0\n if (rank != 0) {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(N * N);\n MPI_Recv(&recv_buf[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n A[j] += recv_buf[j];\n }\n }\n }\n}\n", + "\n\n // Copy the matrix A to rank 0\n MPI_Allgather(MPI_IN_PLACE, 0, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_WORLD);\n\n // Perform LU factorization on rank 0\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] -= A[i * N + i] * A[j * N + i];\n }\n\n for (size_t k = i + 1; k < N; k++) {\n for (size_t l = i + 1; l < N; l++) {\n A[k * N + l] -= A[k * N + i] * A[l * N + i];\n }\n }\n }\n\n // Scatter the factorized matrix A from rank 0 to all other processes\n MPI_Scatter(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_WORLD);\n}\n", + "\n for (size_t k = 0; k < N; ++k) {\n // MPI_Bcast the k-th row of A to all ranks.\n MPI_Bcast(&A[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Update the k-th row of A on all ranks.\n for (size_t j = k + 1; j < N; ++j) {\n A[k * N + j] /= A[k * N + k];\n }\n\n // Update the remaining rows of A on all ranks.\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n\n // MPI_Reduce the updated A from all ranks to rank 0.\n MPI_Reduce(&A[0], &A[0], N * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* For each value in the array x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(Kokkos::View const& x, Kokkos::View &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n auto rank = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n auto sorted_x = x;\n Kokkos::sort(sorted_x.data(), sorted_x.extent(0));\n ranks(i) = Kokkos::find(sorted_x, x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::find(sorted_x, x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n using policy_t = Kokkos::RangePolicy;\n Kokkos::parallel_for(policy_t(x.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n auto rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using rank_functor = Kokkos::Functor>;\n\n Kokkos::parallel_for(\n \"ranks\", Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = Kokkos::rank(x, x(i), Kokkos::greater());\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n 
Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) > x(j)) { ranks(i) += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) { ranks(i) += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) {\n ranks(i) += 1;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x);\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n std::vector sorted_x(x.data(), x.data() + x.extent(0));\n std::sort(sorted_x.begin(), sorted_x.end());\n ranks(i) = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x(i)));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = x;\n std::sort(sorted_x.data(), sorted_x.data() + sorted_x.extent(0));\n ranks(i) = std::distance(sorted_x.data(),\n std::lower_bound(sorted_x.data(), sorted_x.data() + sorted_x.extent(0), x(i)));\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "omp", + "prompt": "#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs;\n pairs.reserve(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs.push_back(std::make_pair(x[i], i));\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j] < x[i]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n for (size_t j = 0; j < n; ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> y(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n y[i] = std::make_pair(x[i], i);\n }\n std::sort(y.begin(), y.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[y[i].second] = i;\n }\n}\n", + "\n\n // Create a copy of the input vector\n std::vector sorted_x = x;\n\n // Sort the copy\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks in parallel\n #pragma omp 
parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "(", + "\n\n size_t n = x.size();\n std::vector> pairs(n);\n for (size_t i = 0; i < n; ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n ranks.resize(n);\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "ranks", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n std::vector idx(n);\n std::iota(idx.begin(), idx.end(), 0);\n std::sort(idx.begin(), idx.end(), [&x](size_t i1, size_t i2) { return x[i1] < x[i2]; });\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n ranks[i] = idx[i];\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] == x[j]) {\n ranks[i] = j;\n break;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector indices(x.size());\n std::iota(indices.begin(), indices.end(), 0);\n std::sort(indices.begin(), indices.end(), [&x](size_t i1, size_t i2) { return x[i1] < x[i2]; });\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = indices[i];\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "serial", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n // Sort the vector x\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Create a map from the sorted values to their original indices\n std::map 
sorted_index_map;\n for (size_t i = 0; i < sorted_x.size(); ++i) {\n sorted_index_map[sorted_x[i]] = i;\n }\n\n // Iterate over the original vector and store the rank of each value\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = sorted_index_map[x[i]];\n }\n}\n", + "\n\n std::map value_to_rank;\n for (size_t i = 0; i < x.size(); ++i) {\n auto it = value_to_rank.find(x[i]);\n if (it == value_to_rank.end()) {\n value_to_rank[x[i]] = i;\n }\n }\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = value_to_rank[x[i]] + 1;\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks.push_back(std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i])));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(), std::find(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(), std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector x_sorted(x);\n std::sort(x_sorted.begin(), x_sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::lower_bound(x_sorted.begin(), 
x_sorted.end(), x[i]) - x_sorted.begin();\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n std::iota(ranks.begin(), ranks.end(), 0);\n std::sort(ranks.begin(), ranks.end(), [&](size_t i, size_t j) { return x[i] < x[j]; });\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "hip", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use AMD HIP to compute in parallel. The kernel will be launched with at least as many threads as elements in x.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\n__global__ void ranks(const float *x, size_t *ranks, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n size_t tid = hipThreadIdx_x;\n if (tid < N) {\n float v = x[tid];\n size_t r = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < v) r++;\n }\n ranks[tid] = r;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0128935406, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02657790870667 + } + ] + }, + { + "generated_output": "\n\n // Get the index of the current thread\n const size_t tid = hipThreadIdx_x;\n\n // Compute the rank of the current element\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0127401986, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02659211006165 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0125802626, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02644220123291 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n int rank = 
0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0130473062, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02589314231873 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n float *ptr = x;\n size_t rank = 0;\n while (ptr < x + N && *ptr <= val) {\n if (*ptr == val) {\n rank++;\n }\n ptr++;\n }\n ranks[gid] = rank;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0128410072, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02647614517212 + } + ] + }, + { + "generated_output": "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float v = x[tid];\n int r = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < v) {\n ++r;\n }\n }\n ranks[tid] = r;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.012987173, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02586828117371 + } + ] + }, + { + "generated_output": "\n\n int idx = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (idx >= N) {\n return;\n }\n\n float my_value = x[idx];\n int my_rank = 1;\n for (int i = 0; i < N; i++) {\n if (x[i] < my_value) {\n my_rank++;\n }\n }\n ranks[idx] = my_rank;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.012768934, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02649993286133 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + 
"did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.012750494, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02649008712769 + } + ] + }, + { + "generated_output": "\n\n size_t idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.012808033, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02652662200928 + } + ] + }, + { + "generated_output": "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] < v) r++;\n }\n ranks[i] = r;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0127975832, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02647041282654 + } + ] + }, + { + "generated_output": "\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] < v) r++;\n }\n ranks[i] = r;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0128158806, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02639934120178 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0129009758, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02643481178284 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n float *sorted_x = (float *)hipMalloc(N * sizeof(float));\n hipMemcpy(sorted_x, x, N * sizeof(float), hipMemcpyHostToDevice);\n hipSort(sorted_x, sorted_x + N, hipSortAscending);\n for (int i = 0; i < N; i++) {\n if (sorted_x[i] == val) {\n ranks[gid] = i;\n break;\n }\n }\n hipFree(sorted_x);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get the index of the current thread\n int tid = hipThreadIdx_x;\n\n // Compute the rank of the current element\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "source_write_success": true, + "did_build": 
true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0128856518, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02654962844849 + } + ] + }, + { + "generated_output": "\n\n size_t idx = hipThreadIdx_x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0129134456, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02660499458313 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n ranks[tid] = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < x[tid]) {\n ranks[tid]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0128123112, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1249686920166 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0126941444, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02648770713806 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n float val = x[idx];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0129311352, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02587154693604 + } + ] + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every process has a complete copy of x. 
Store the result in ranks on process 0.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), ranks.size(), MPI_SIZE_T, MPI_MAX, 0,\n MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, ranks.size(), MPI_SIZE_T, MPI_MAX, 0,\n MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; i++) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Create a vector of ranks\n ranks.resize(n);\n\n // Get the rank of each value in x\n for (size_t i = 0; i < n; i++) {\n ranks[i] = std::find(indices.begin(), indices.end(), i) - indices.begin();\n }\n\n // Gather the ranks on process 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector ranks_i;\n MPI::COMM_WORLD.Recv(ranks_i.data(), ranks_i.size(), MPI::UNSIGNED_LONG, i, 0);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI::COMM_WORLD.Send(ranks.data(), ranks.size(), MPI::UNSIGNED_LONG, 0, 0);\n }\n}\n", + "\n\n // sort a copy of x on each process\n auto x_sorted = std::vector(x);\n std::sort(x_sorted.begin(), x_sorted.end());\n\n // find the rank of each value in x on each process\n std::vector> local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n auto it = std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]);\n local_ranks[i].push_back(std:: distance(x_sorted.begin(), it));\n }\n\n // gather the local_ranks on process 0\n std::gather(local_ranks, std::back_inserter(ranks), 0);\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; ++i) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Get the rank of each value in x\n ranks.resize(n);\n for (size_t i = 0; i < n; ++i) {\n ranks[i] = std::find(indices.begin(), indices.end(), i) - indices.begin();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, ranks.size(), MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n local_ranks[i]++;\n }\n }\n }\n\n std::vector global_ranks(x.size());\n MPI_Allgather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG,\n global_ranks.data(), x.size(), MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n ranks = global_ranks;\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; ++i) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Get the rank of each value in x\n ranks.resize(n);\n for (size_t i = 0; i < n; ++i) {\n ranks[indices[i]] = i;\n }\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n ranks = local_ranks;\n }\n\n MPI_Gather(&local_ranks[0], local_ranks.size(), MPI_UNSIGNED_LONG, &ranks[0],\n ranks.size(), MPI_UNSIGNED_LONG, 0, comm);\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the rank of each element in the sorted vector\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; ++i) {\n std::vector ranks_i;\n MPI_Recv(&ranks_i[0], ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Recv(recv_ranks.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n ranks[j] += recv_ranks[j];\n }\n }\n } else {\n MPI_Send(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = i;\n }\n\n std::sort(local_ranks.begin(), local_ranks.end(),\n [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n if (rank == 0) {\n ranks.resize(x.size());\n }\n\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(),\n MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector ranks_i;\n MPI_Recv(ranks_i.data(), ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // compute the ranks on each process\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // gather the ranks from all processes on process 0\n if (my_rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(),\n MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n 
MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG,\n 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the rank of each element in the sorted vector\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; ++i) {\n std::vector ranks_i;\n MPI_Recv(&ranks_i[0], ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector local_x = x;\n std::sort(local_x.begin(), local_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(local_x.begin(),\n std::lower_bound(local_x.begin(), local_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Recv(recv_ranks.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n ranks[j] += recv_ranks[j];\n }\n }\n } else {\n MPI_Send(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n size_t n = x.size();\n std::vector local_ranks(n);\n std::sort(x.begin(), x.end());\n for (size_t i = 0; i < n; ++i) {\n local_ranks[i] = std::lower_bound(x.begin(), x.end(), x[i]) - x.begin();\n }\n MPI_Gather(local_ranks.data(), n, MPI_UNSIGNED_LONG, ranks.data(), n, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "cuda", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use CUDA to compute in parallel. 
The kernel will be launched with at least as many threads as elements in x.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\n__global__ void ranks(const float *x, size_t *ranks, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n float v = x[i];\n int r = 0;\n for (size_t j = 0; j < N; ++j) {\n if (x[j] < v) {\n ++r;\n }\n }\n ranks[i] = r;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n ranks[i] = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n ranks[tid] = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ranks[tid]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float value = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n float *d_x = (float *)x;\n for (size_t i = 0; i < N; i++) {\n if (d_x[i] < val) {\n ranks[idx]++;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) rank++;\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t i = block.thread_id.x + block.block_id.x * block.block_dim.x;\n if (i < N) {\n float val = x[i];\n std::vector y(x, x + N);\n std::sort(y.begin(), y.end());\n ranks[i] = std::lower_bound(y.begin(), y.end(), val) - y.begin();\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] <= v) {\n r++;\n }\n }\n ranks[i] = r - 1;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n float *sorted_x = new float[N];\n thrust::copy(x, x + N, sorted_x);\n thrust::sort(sorted_x, sorted_x + N);\n ranks[idx] = thrust::lower_bound(sorted_x, sorted_x + N, val) - sorted_x;\n delete[] sorted_x;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n float *d_x = (float *)x;\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (d_x[i] < val) {\n 
++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = threadIdx.x;\n\n // Compute the rank of the current element\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n ranks[idx] = 0;\n for (int i = 0; i < N; ++i) {\n if (x[i] < x[idx]) {\n ranks[idx]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = threadIdx.x;\n\n // Compute the rank of the current element\n float current_element = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < current_element) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every process has a complete copy of x. 
Store the result in ranks on process 0.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the number of processes\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Create a vector of ranks on each process\n std::vector local_ranks(x.size());\n\n // Compute the ranks on each process\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n // Sort the local copy of x\n std::vector local_x = x;\n std::sort(local_x.begin(), local_x.end());\n\n // Find the rank of x[i] in the sorted vector\n auto it = std::lower_bound(local_x.begin(), local_x.end(), x[i]);\n local_ranks[i] = std::distance(local_x.begin(), it);\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG,\n ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]) - sorted_x.begin();\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector to store the ranks\n ranks.resize(n);\n\n // Sort the vector\n std::sort(x.begin(), x.end());\n\n // Find the rank of each element in the sorted vector\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n ranks[i] = std::distance(x.begin(), std::find(x.begin(), x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (ranks.size() > 0) {\n MPI_Gather(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG,\n &ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int world_size;\n MPI_Comm_size(MPI_COMM_WORLD, &world_size);\n\n std::vector local_x(x.begin() + my_rank * x.size() / world_size,\n x.begin() + (my_rank + 1) * x.size() / world_size);\n std::vector local_ranks(local_x.size());\n\n #pragma omp parallel for\n for (size_t i = 0; i < local_x.size(); 
++i) {\n for (size_t j = 0; j < local_x.size(); ++j) {\n if (local_x[i] < local_x[j]) {\n ++local_ranks[i];\n }\n }\n }\n\n MPI_Reduce(local_ranks.data(), ranks.data(), local_ranks.size(), MPI_SIZE_T, MPI_SUM, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector x_sorted = x;\n std::sort(x_sorted.begin(), x_sorted.end());\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n ranks[i] = std::distance(x_sorted.begin(), std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]));\n }\n if (my_rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), n, MPI_SIZE_T, MPI_MAX, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, n, MPI_SIZE_T, MPI_MAX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the ranks of each element in the sorted vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_ranks;\n MPI_Recv(&recv_ranks[0], x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), recv_ranks.begin(), recv_ranks.end());\n }\n } else {\n MPI_Send(&ranks[0], x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::sort(local_x.begin(), local_x.end());\n\n std::vector local_ranks(local_x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < local_x.size(); ++i) {\n local_ranks[i] = std::distance(x.begin(), std::lower_bound(x.begin(), x.end(), local_x[i]));\n }\n\n MPI_Gather(local_ranks.data(), local_ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), local_ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a copy of x on each process\n std::vector local_x = x;\n\n // Sort the local copy of x\n std::sort(local_x.begin(), local_x.end());\n\n // Create a vector to store the ranks\n std::vector local_ranks(x.size());\n\n // Compute the ranks of the elements in the local copy of x\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(local_x.begin(), std::find(local_x.begin(), local_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Only process 0 has the complete list of ranks\n if (rank == 0) {\n ranks = std::move(ranks);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]) - sorted_x.begin();\n }\n\n // Gather the ranks 
from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n ranks.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n if (my_rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), n, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, n, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n local_ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector local_ranks(x.size());\n #pragma opm parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(x.begin(), std::min_element(x.begin(), x.end()));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Status status;\n MPI_Probe(i, 0, MPI_COMM_WORLD, &status);\n MPI_Get_count(&status, MPI_BYTE, &ranks.size());\n MPI_Send(&local_ranks, ranks.size(), MPI_BYTE, i, 0, MPI_COMM_WORLD);\n }\n } else {\n MPI_Send(&local_ranks, local_ranks.size(), MPI_BYTE, 0, 0, 
MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t n = x.size();\n std::vector local_ranks(n);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n local_ranks[i] = std::distance(x.begin(), std::min_element(x.begin(), x.end()));\n }\n\n MPI_Gather(local_ranks.data(), n, MPI_UNSIGNED_LONG, ranks.data(), n, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int world_size;\n MPI_Comm_size(MPI_COMM_WORLD, &world_size);\n int local_size = x.size() / world_size;\n int start = my_rank * local_size;\n int end = start + local_size;\n if (my_rank == world_size - 1) {\n end = x.size();\n }\n std::vector local_ranks(local_size);\n std::vector local_x(local_size);\n std::copy(x.begin() + start, x.begin() + end, local_x.begin());\n std::sort(local_x.begin(), local_x.end());\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_ranks[i] = std::distance(local_x.begin(), std::lower_bound(local_x.begin(), local_x.end(), x[start + i]));\n }\n if (my_rank == 0) {\n ranks.resize(x.size());\n }\n MPI_Gather(local_ranks.data(), local_size, MPI_UNSIGNED_LONG, ranks.data(), local_size, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), x.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "(", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n // Each process sorts its own copy of x\n std::vector x_sorted = x;\n std::sort(x_sorted.begin(), x_sorted.end());\n\n // Each process computes the ranks of its own elements\n std::vector ranks_local(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks_local[i] = std::distance(x_sorted.begin(), std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]));\n }\n\n // Process 0 gathers the ranks from all processes\n if (my_rank == 0) {\n ranks = ranks_local;\n for (int i = 1; i < MPI_COMM_WORLD->size; ++i) {\n std::vector ranks_recv(x.size());\n MPI_Recv(ranks_recv.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_recv.begin(), ranks_recv.end());\n }\n } else {\n MPI_Send(ranks_local.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use Kokkos to sort in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Create a view of the input array that ignores zero valued elements\n auto x_nz = view(\"x_nz\", x, LayoutRight(), {x.extent(0), 0});\n // Sort the non-zero elements in ascending order\n parallel_sort(x_nz.extent(0), KOKKOS_LAMBDA(const int i) {\n x_nz(i) = x(i);\n });\n // Copy the sorted non-zero elements back to the input array\n parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x_nz(i);\n });\n}\n", + "\n\n auto exec_space = x.execution_space();\n auto view_size = x.extent(0);\n auto lambda = KOKKOS_LAMBDA(const int& i) {\n return x(i) != 0;\n };\n auto mask = Kokkos::create_mask(exec_space, view_size, lambda);\n auto sorted = Kokkos::sort(exec_space, mask, x);\n Kokkos::deep_copy(x, sorted);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = std::numeric_limits::max();\n }\n });\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i == std::numeric_limits::max()) {\n x_i = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sortIgnoreZero\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n // Create a mask to identify non-zero elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"init_mask\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the non-zero elements\n Kokkos::sort(mask, x);\n\n // Shift the non-zero elements to the beginning of the array\n int non_zero_count = 0;\n Kokkos::parallel_for(\"shift_non_zero\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (mask(i)) {\n x(non_zero_count++) = x(i);\n }\n });\n\n // Fill the remaining elements with zeros\n Kokkos::parallel_for(\"fill_zeros\", Kokkos::RangePolicy(non_zero_count, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = 0;\n });\n}\n", + "\n\n // Create a mask to identify non-zero elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"mask_init\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the non-zero elements\n Kokkos::sort(mask, x);\n\n // Shift the non-zero elements to the beginning of the array\n Kokkos::parallel_for(\"shift_non_zero\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (mask(i)) {\n Kokkos::atomic_fetch_add(&x(0), x(i));\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i != 0) {\n x_i = std::numeric_limits::max();\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i == 
std::numeric_limits::max()) {\n x_i = 0;\n }\n });\n}\n", + "\n\n using namespace Kokkos;\n\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n auto lambda = KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n x(i) = x(i) < 0 ? -x(i) : x(i);\n }\n };\n parallel_for(\"sortIgnoreZero\", policy, lambda);\n parallel_sort(policy, x.data(), x.data() + x.extent(0));\n lambda = KOKKOS_LAMBDA(const int &i) {\n if (x(i) < 0) {\n x(i) = -x(i);\n }\n };\n parallel_for(\"sortIgnoreZero\", policy, lambda);\n}\n", + "\n\n Kokkos::parallel_for(\"sortIgnoreZero\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::View y = x.filter([](int x) { return x != 0; });\n Kok\u30b3\u30af::sort(y);\n Kok\u30b3\u30af::View z = x.map([](int x) { return x == 0 ? 0 : y[0]; });\n y.copy_to(z);\n}\n", + "\n\n // Create a mask to ignore zero valued elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"mask\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the array using the mask\n Kokkos::sort(mask, x);\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i, int &count) {\n if (x(i) != 0) { count++; }\n });\n\n // Create a temporary array to store the non-zero elements\n Kokkos::View y(\"y\", count);\n\n // Copy the non-zero elements to the temporary array\n int j = 0;\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) { y(j++) = x(i); }\n });\n\n // Sort the temporary array\n Kokkos::sort(y);\n\n // Copy the sorted non-zero elements back to the original array\n j = 0;\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) { x(i) = y(j++); }\n });\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n using exec_space = Kokkos::DefaultExecutionSpace;\n using view_t = Kokkos::View;\n\n auto x_nonzero = view_t(\"x_nonzero\", x.extent(0));\n auto x_nonzero_count = 0;\n\n Kokkos::parallel_for(\n \"sortIgnoreZero\", policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) {\n x_nonzero(x_nonzero_count++) = x(i);\n }\n });\n\n Kokkos::sort(x_nonzero, x_nonzero_count);\n\n x_nonzero_count = 0;\n Kokkos::parallel_for(\n \"sortIgnoreZero\", policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) {\n x(i) = x_nonzero(x_nonzero_count++);\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i == 0) {\n x_i = x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), 
KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i == 0) {\n x_i = x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use OpenMP to sort x in parallel. \n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n ++count;\n }\n }\n\n // Create a vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j];\n j++;\n }\n }\n}\n", + "\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(non_zero.begin(), non_zero.end());\n }\n }\n\n // Insert the non-zero elements back into x\n int j = 0;\n for (int i = 
0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + ".", + "\n\n std::vector temp;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n temp.push_back(x[i]);\n }\n }\n std::sort(temp.begin(), temp.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = temp[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int n = 0;\n #pragma omp parallel for reduction(+:n)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n n++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(n);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[index++] = x[i];\n }\n }\n\n // Sort the new vector\n 
std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[index++];\n }\n }\n}\n", + "\n\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n std::sort(non_zero.begin(), non_zero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count non-zero elements\n int n = 0;\n #pragma omp parallel for reduction(+:n)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n n++;\n }\n }\n\n // Create a vector to store non-zero elements\n std::vector y(n);\n\n // Copy non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the non-zero elements\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n ++count;\n }\n }\n\n // Create a new vector to store the non-zero 
elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Create a vector to store the non-zero elements\n std::vector non_zero;\n\n // Iterate over the input vector and copy non-zero elements to the new vector\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Copy the sorted non-zero elements back to the input vector\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j++];\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "serial", + "prompt": "/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place.\n Example:\n\n\t input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a == 0 && b != 0) || a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return a != 0 && (b == 0 || a < b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a != 0 && b != 0) ? a < b : a == 0;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::stable_partition(x.begin(), x.end(),\n [](int a) { return a != 0; });\n std::sort(x.begin(), x.end());\n\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a == 0 && b != 0) || (a != 0 && b == 0) ? 
a > b : a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(),\n [](int a, int b) { return a != 0 && (a < b || b == 0); });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) {\n return false;\n } else if (b == 0) {\n return true;\n } else {\n return a < b;\n }\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::vector y;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) { y.push_back(x[i]); }\n }\n std::sort(y.begin(), y.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector sorted;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted.push_back(x[i]);\n }\n }\n std::sort(sorted.begin(), sorted.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sorted[j];\n j++;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "hip", + "prompt": "/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use AMD HIP to compute in parallel. The kernel will be launched with 1 thread per element.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\n__global__ void sortIgnoreZero(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0019056254, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00014521540459 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int temp = x[tid];\n int i = tid - 1;\n while (i >= 0 && x[i] > temp) {\n x[i + 1] = x[i];\n i--;\n }\n x[i + 1] = temp;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0019363132, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00014620740265 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.001946829, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.03984767227173 + } + ] + }, + { + "generated_output": "\n\n // sort non-zero elements\n 
__shared__ int sdata[256];\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n int stride = blockDim.x * gridDim.x;\n int offset = bid * stride;\n int count = 0;\n int val = x[gid];\n if (val != 0) {\n sdata[tid] = val;\n count++;\n }\n __syncthreads();\n int n = count;\n int i = tid;\n while (i < n) {\n sdata[i] = sdata[tid];\n i += stride;\n }\n __syncthreads();\n i = tid;\n while (i < n) {\n int j = i + 1;\n while (j < n) {\n if (sdata[i] > sdata[j]) {\n int tmp = sdata[i];\n sdata[i] = sdata[j];\n sdata[j] = tmp;\n }\n j += stride;\n }\n i += stride;\n }\n __syncthreads();\n i = tid;\n while (i < n) {\n x[offset + i] = sdata[i];\n i += stride;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Find the number of non-zero elements\n int num_non_zero = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n num_non_zero++;\n }\n }\n\n // Create a temporary array to store the non-zero elements\n int *temp = new int[num_non_zero];\n\n // Copy the non-zero elements to the temporary array\n int j = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n temp[j] = x[i];\n j++;\n }\n }\n\n // Sort the temporary array\n std::sort(temp, temp + num_non_zero);\n\n // Copy the sorted non-zero elements back to the original array\n j = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n x[i] = temp[j];\n j++;\n }\n }\n\n // Delete the temporary array\n delete[] temp;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the number of non-zero elements\n\tint count = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tcount++;\n\t\t}\n\t}\n\n\t// Create an array to store the non-zero elements\n\tint *y = new int[count];\n\n\t// Copy the non-zero elements to the new array\n\tint j = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\ty[j] = x[i];\n\t\t\tj++;\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tstd::sort(y, y + count);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tj = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = y[j];\n\t\t\tj++;\n\t\t}\n\t}\n\n\t// Delete the temporary array\n\tdelete[] y;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i - 1] > x[i]) {\n int temp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = temp;\n i--;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0019239232, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00015599939972 + } + ] + }, + { + "generated_output": 
"\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\tint j = tid + 1;\n\t\twhile (j < N && x[j] == 0) {\n\t\t\tj++;\n\t\t}\n\t\tif (j < N) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[j];\n\t\t\tx[j] = tmp;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0019880402, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00015635139607 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N && x[i] != 0) {\n int j = i + 1;\n while (j < N && x[j] == 0) {\n j++;\n }\n if (j < N) {\n int temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0019446192, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00016284719892 + } + ] + }, + { + "generated_output": "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0020284574, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.04031388015747 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i - 1] > x[i]) {\n int tmp = x[i - 1];\n x[i - 1] = x[i];\n x[i] = tmp;\n i--;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0019602552, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00014745520577 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = tmp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.001905286, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00014847920425 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint temp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0019646568, + "runs": [ + { + "did_run": true, + 
"is_valid": false, + "runtime": 0.00015478319712 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0019449292, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00014998319857 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n for (int i = tid + 1; i < N; i++) {\n if (x[i] != 0 && x[i] < x[tid]) {\n int tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.001946691, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.03922933502197 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.001953115, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00014847940505 + } + ] + }, + { + "generated_output": "\n\n // Sort the non-zero elements\n thrust::sort(thrust::device_ptr(x), thrust::device_ptr(x + N), thrust::greater());\n\n // Count the number of non-zero elements\n int num_non_zero = thrust::count(thrust::device_ptr(x), thrust::device_ptr(x + N), thrust::not_equal_to(0));\n\n // Move the non-zero elements to the beginning of the array\n thrust::copy(thrust::device_ptr(x), thrust::device_ptr(x + num_non_zero), thrust::device_ptr(x));\n\n // Fill the remaining elements with zeros\n thrust::fill(thrust::device_ptr(x + num_non_zero), thrust::device_ptr(x + N), 0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0019381972, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00015519920588 + } + ] + }, + { + "generated_output": "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tif (x[tid] != 0) {\n\t\t\tint i = tid;\n\t\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\t\tint temp = x[i];\n\t\t\t\tx[i] = x[i - 1];\n\t\t\t\tx[i - 1] = temp;\n\t\t\t\ti--;\n\t\t\t}\n\t\t}\n\t}\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0019190694, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00015302339569 + } + ] + }, + { + "generated_output": "\n\n\t// Sort the array in ascending order ignoring elements with value 0.\n\t// Leave zero valued elements in-place.\n\tint my_rank = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (my_rank < N && x[my_rank] != 0) {\n\t\tfor (int i = my_rank + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[my_rank]) {\n\t\t\t\tint temp = x[my_rank];\n\t\t\t\tx[my_rank] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0019573384, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.03999644088745 + } + ] + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use MPI to sort x in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n for (int i = count; i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n // Gather all non-zero elements to rank 0\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(nonZero.data(), nonZero.size(), MPI_INT, x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Get the size of the communicator\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the sorted elements\n std::vector sorted_x;\n\n // Gather all the non-zero elements from all the ranks\n std::vector non_zero_elements;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_elements.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n\n // 
Scatter the sorted non-zero elements to all the ranks\n MPI_Scatter(non_zero_elements.data(), non_zero_elements.size() / size, MPI_INT,\n sorted_x.data(), non_zero_elements.size() / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into the original vector\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sorted_x[index++];\n }\n }\n}\n", + "\n\n // Get the size and rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size() * size);\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n if (rank != 0) {\n non_zero.resize(sorted_non_zero.size() / size);\n }\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into the original vector\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[index++];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size());\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, x.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with zeros\n for (int i = non_zero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int n = x.size();\n std::vector y(n);\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n y[count] = x[i];\n count++;\n }\n }\n std::sort(y.begin(), 
y.begin() + count);\n int index = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = y[index];\n index++;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_elements[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(non_zero_elements.data(), count / size, MPI_INT, x.data(), count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n for (int i = count / size; i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector x_non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x_non_zero.push_back(x[i]);\n }\n }\n std::sort(x_non_zero.begin(), x_non_zero.end());\n int count = x_non_zero.size();\n MPI_Gather(&count, 1, MPI_INT, NULL, 0, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n int total_count = 0;\n for (int i = 0; i < size; i++) {\n int count_i;\n MPI_Gather(&count_i, 1, MPI_INT, &count_i, 1, MPI_INT, i, MPI_COMM_WORLD);\n total_count += count_i;\n }\n x.resize(total_count);\n }\n MPI_Gatherv(x_non_zero.data(), x_non_zero.size(), MPI_INT, x.data(), NULL, NULL, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sortedNonZero;\n if (rank == 0) {\n sortedNonZero.resize(nonZero.size() * size);\n }\n MPI_Gather(nonZero.data(), nonZero.size(), MPI_INT, sortedNonZero.data(), nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements to all ranks\n if (rank != 0) {\n sortedNonZero.resize(nonZero.size());\n }\n MPI_Scatter(sortedNonZero.data(), nonZero.size(), MPI_INT, sortedNonZero.data(), nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into x\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sortedNonZero[index++];\n }\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of non-zero elements\n std::vector 
non_zero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Insert the non-zero elements back into the original vector\n int j = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Gather the non-zero elements from all ranks\n MPI_Gather(&x[0], x.size(), MPI_INT, &y[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements\n if (rank == 0) {\n std::sort(y.begin(), y.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&y[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size());\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, x.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with zeros\n for (int i = non_zero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sortedNonZero;\n MPI_Gather(&nonZero[0], 
nonZero.size(), MPI_INT, &sortedNonZero[0], nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements to all ranks\n MPI_Scatter(&sortedNonZero[0], nonZero.size(), MPI_INT, &x[0], nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n for (int i = nonZero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int myrank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (myrank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero_x.begin(), non_zero_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(non_zero_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the 
sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n MPI_Gather(&non_zero[0], non_zero.size(), MPI_INT, &sorted_non_zero[0], non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&sorted_non_zero[0], non_zero.size(), MPI_INT, &x[0], non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the zero elements back into the vector\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = sorted_non_zero[index++];\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "cuda", + "prompt": "/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use CUDA to compute in parallel. The kernel will be launched with 1 thread per element.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\n__global__ void sortIgnoreZero(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the number of non-zero elements\n\tint num_non_zero = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnum_non_zero++;\n\t\t}\n\t}\n\n\t// Allocate memory for the non-zero elements\n\tint *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n\t// Copy the non-zero elements to the new array\n\tint non_zero_index = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnon_zero_elements[non_zero_index] = x[i];\n\t\t\tnon_zero_index++;\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tthrust::sort(non_zero_elements, non_zero_elements + num_non_zero);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tnon_zero_index = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = non_zero_elements[non_zero_index];\n\t\t\tnon_zero_index++;\n\t\t}\n\t}\n\n\t// Free the memory allocated for the non-zero elements\n\tfree(non_zero_elements);\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid >= N) return;\n\tif (x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[tid] > x[i]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = tmp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + ".", + "\n\n\tint tid = threadIdx.x;\n\tif (x[tid] != 0) {\n\t\tx[tid] = x[tid] > 0 ? 
x[tid] : -x[tid];\n\t}\n}\n", + "\n\n\t// Each thread sorts its own element\n\tif (x[threadIdx.x] != 0) {\n\t\tfor (int i = threadIdx.x + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[threadIdx.x] > x[i]) {\n\t\t\t\tint temp = x[threadIdx.x];\n\t\t\t\tx[threadIdx.x] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Find the number of non-zero elements\n\tint num_non_zero = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnum_non_zero++;\n\t\t}\n\t}\n\n\t// Allocate memory for the non-zero elements\n\tint *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n\t// Copy the non-zero elements to the new array\n\tint idx = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnon_zero_elements[idx++] = x[i];\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tthrust::sort(non_zero_elements, non_zero_elements + num_non_zero);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tidx = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = non_zero_elements[idx++];\n\t\t}\n\t}\n\n\t// Free the memory allocated for the non-zero elements\n\tfree(non_zero_elements);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = bid * blockDim.x + tid;\n\n\tif (i < N && x[i] != 0) {\n\t\tint j = i + 1;\n\t\twhile (j < N && x[j] == 0) {\n\t\t\tj++;\n\t\t}\n\t\tif (j < N) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[j];\n\t\t\tx[j] = tmp;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i - 1] > x[i]) {\n\t\t\tint temp = x[i - 1];\n\t\t\tx[i - 1] = x[i];\n\t\t\tx[i] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "Ignore the zero value in the list.", + "\n\n\t// Get the thread index\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\n\t// If the element is not zero, sort it\n\tif (x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint start = bid * bid_size + tid;\n\tint end = start + bid_size;\n\tif (start >= N) return;\n\tif (end > N) end = N;\n\tint count = 0;\n\tfor (int i = start; i < end; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[count++] = x[i];\n\t\t}\n\t}\n\tfor (int i = count; i < end; i++) {\n\t\tx[i] = 0;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tswap(x[i], x[i - 1]);\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint temp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\t// Sort the non-zero elements\n\tint *d_sorted = (int *)malloc(N * sizeof(int));\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\td_sorted[count] = x[i];\n\t\t\tcount++;\n\t\t}\n\t}\n\tthrust::sort(d_sorted, d_sorted + count);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tcount = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = d_sorted[count];\n\t\t\tcount++;\n\t\t}\n\t}\n\tfree(d_sorted);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint val = x[tid];\n\tif (val == 0) {\n\t\treturn;\n\t}\n\tint i = tid;\n\twhile (i > 0 && x[i - 1] > val) 
{\n\t\tx[i] = x[i - 1];\n\t\ti--;\n\t}\n\tx[i] = val;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tif (x[tid] != 0) {\n\t\t\tint i = tid;\n\t\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\t\tswap(x[i], x[i - 1]);\n\t\t\t\ti--;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n // Find the number of non-zero elements\n int num_non_zero = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n num_non_zero++;\n }\n }\n\n // Create an array to store the non-zero elements\n int *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n // Copy the non-zero elements to the new array\n int index = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n non_zero_elements[index] = x[i];\n index++;\n }\n }\n\n // Sort the non-zero elements\n qsort(non_zero_elements, num_non_zero, sizeof(int), compare);\n\n // Copy the sorted non-zero elements back to the original array\n index = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n x[i] = non_zero_elements[index];\n index++;\n }\n }\n\n // Free the memory allocated for the non-zero elements array\n free(non_zero_elements);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use MPI and OpenMP to sort x in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the number of processes\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Create a vector to store the sorted values\n std::vector sorted(x.size());\n\n // Gather the sorted values from each rank\n MPI_Gather(x.data(), x.size(), MPI_INT, sorted.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // If the current rank is 0, store the sorted values in x\n if (rank == 0) {\n x = sorted;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort x on each rank\n std::sort(x.begin(), x.end());\n\n // Gather the sorted x from all ranks to rank 0\n std::vector gathered_x;\n if (rank == 0) {\n gathered_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, gathered_x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the gathered x on rank 0\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end());\n x = gathered_x;\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Scatter the sorted non-zero elements to all ranks\n MPI_Scatter(non_zero.data(), non_zero.size() / nprocs, MPI_INT,\n x.data(), non_zero.size() / nprocs, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Gather the sorted non-zero elements from all ranks to rank 0\n MPI_Gather(x.data(), non_zero.size() / nprocs, MPI_INT,\n non_zero.data(), non_zero.size() / nprocs, MPI_INT, 0, MPI_COMM_WORLD);\n\n // If rank is 0, insert the sorted non-zero elements into x\n if (rank == 0) {\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n continue;\n }\n x[i] = non_zero[index++];\n }\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector x_nonzero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x_nonzero.push_back(x[i]);\n }\n }\n std::sort(x_nonzero.begin(), x_nonzero.end());\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = x_nonzero[count];\n count++;\n }\n }\n if (my_rank == 0) {\n x.resize(n);\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n std::vector x_local(n);\n std::vector x_sorted(n);\n std::copy(x.begin(), x.end(), x_local.begin());\n std::sort(x_local.begin(), x_local.end(), [](int a, int b) { return a != 0 && (a < b); });\n MPI_Gather(x_local.data(), n, MPI_INT, x_sorted.data(), n, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n x = x_sorted;\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector non_zero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n std::sort(non_zero.begin(), non_zero.end());\n int non_zero_size = non_zero.size();\n int recv_count[MPI_COMM_SIZE];\n int recv_displs[MPI_COMM_SIZE];\n MPI_Gather(&non_zero_size, 1, MPI_INT, recv_count, 1, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n recv_displs[0] = 0;\n for (int i = 1; i < MPI_COMM_SIZE; i++) {\n recv_displs[i] = recv_displs[i - 1] + recv_count[i - 1];\n }\n }\n MPI_Gatherv(non_zero.data(), non_zero_size, MPI_INT, x.data(), recv_count, recv_displs, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n int j = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] == 0) {\n continue;\n }\n x[j] = x[i];\n j++;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Allocate memory for the non-zero elements\n std::vector 
y(count);\n\n // Copy the non-zero elements to y\n int index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[index++] = x[i];\n }\n }\n\n // Sort y\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to x on rank 0\n if (rank == 0) {\n index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[index++];\n }\n }\n }\n}\n", + "\n\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n int n = x.size();\n std::vector x_nonzero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x_nonzero.push_back(x[i]);\n }\n }\n std::sort(x_nonzero.begin(), x_nonzero.end());\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = x_nonzero[count];\n count++;\n }\n }\n if (myrank == 0) {\n x.resize(n);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the gathered non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector nonZero;\n if (rank == 0) {\n nonZero.resize(count);\n }\n MPI_Gather(&x[0], x.size(), MPI_INT, &nonZero[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(nonZero.begin(), nonZero.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&nonZero[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted.begin(), sorted.end());\n\n // Scatter the 
sorted non-zero elements to rank 0\n MPI_Scatter(sorted.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Allocate memory for the sorted array\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n MPI_Gather(&x[0], x.size(), MPI_INT, &sorted_x[0], global_count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the gathered array on rank 0\n if (my_rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end());\n }\n\n // Scatter the sorted array back to all ranks\n MPI_Scatter(&sorted_x[0], global_count / nprocs, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 fills in the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Create a vector of size x.size() to store the sorted values\n std::vector sorted_x(x.size());\n\n // Create a vector of size x.size() to store the indices of the sorted values\n 
std::vector sorted_indices(x.size());\n\n // Sort the indices of x in ascending order\n std::iota(sorted_indices.begin(), sorted_indices.end(), 0);\n std::sort(sorted_indices.begin(), sorted_indices.end(),\n [&x](int i, int j) { return x[i] < x[j]; });\n\n // Create a vector of size x.size() to store the sorted values\n std::vector sorted_x(x.size());\n\n // Copy the sorted values from x into sorted_x\n for (int i = 0; i < x.size(); i++) {\n sorted_x[i] = x[sorted_indices[i]];\n }\n\n // Broadcast the sorted values to all ranks\n MPI_Bcast(&sorted_x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Copy the sorted values into x on rank 0\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to their original positions\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = i;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted.begin(), sorted.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) 
{\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) count++;\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) std::sort(non_zero.begin(), non_zero.end());\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) x[i] = -1;\n }\n\n // Sort the elements in-place on each rank\n std::sort(x.begin(), x.end());\n\n // Replace -1 with 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == -1) x[i] = 0;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(Kokkos::View const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View sorted_x(\"sorted_x\", x.size());\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i) {\n sorted_x(i) = x(i);\n });\n Kokkos::sort(sorted_x.data(), sorted_x.data() + sorted_x.size());\n return sorted_x(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a Kokkos view of the input vector\n auto x_view = Kokkos::create_mirror_view(x);\n\n // Create a Kokkos view of the output vector\n auto y_view = Kokkos::create_mirror_view(1);\n\n // Create a Kokkos kernel to find the k-th smallest element\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x_view.extent(0)),\n KOKKOS_LAMBDA(const int i, int& y) {\n if (i < k) {\n y = std::min(y, x_view(i));\n }\n },\n y_view);\n\n // Copy the output vector back to the host\n int y;\n Kokkos::deep_copy(y, y_view);\n\n return y;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n\n // Find the k-th smallest element using a parallel reduction.\n int kth_smallest = parallel_reduce(\n Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i < k) {\n kth_smallest = std::min(kth_smallest, x(i));\n }\n },\n k);\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n auto x_copy = Kokkos::View(n);\n Kokkos::deep_copy(x_copy, x);\n\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& result) {\n if (i == k - 1) {\n result = x_copy[i];\n }\n }, tag_type());\n\n return x_copy[k - 1];\n}\n", + "\n\n Kokkos::View y(\"y\", x.size());\n Kokkos::parallel_for(\"copy\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i) {\n y(i) = x(i);\n });\n Kokkos::sort(y);\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n using l_d_type = 
Kokkos::LayoutLeft;\n using l_d_view_type = Kokkos::View;\n\n const int n = x.extent(0);\n l_d_view_type y(\"y\", n);\n\n parallel_for(policy_type(0, n), KOKKOS_LAMBDA(const int& i) { y(i) = x(i); }, tag_type());\n\n Kokkos::sort(y);\n\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n auto x_copy = Kokkos::View(n);\n Kokkos::deep_copy(x_copy, x);\n\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& result) {\n if (i == k - 1) {\n result = x_copy(i);\n }\n }, tag_type());\n\n return x_copy(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n int n = x.extent(0);\n if (k > n) {\n throw std::invalid_argument(\"k cannot be greater than the size of the vector\");\n }\n\n auto kth_smallest = parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(const int& i, int& kth) {\n if (i < k) {\n kth = std::min(kth, x(i));\n }\n },\n std::numeric_limits::max());\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Tag = Kokkos::Tag;\n\n int n = x.extent(0);\n if (k > n) {\n throw std::invalid_argument(\"k must be less than or equal to the size of the vector\");\n }\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(Policy(0, n), x, Tag());\n\n // Return the k-th smallest element\n return x(k - 1);\n}\n", + "\n\n Kok\u30b3\u30af::View y = Kok\u30b3\u30af::sort(x);\n return y[k - 1];\n}\n", + "\n\n // Create a Kokkos view for the input vector x\n Kokkos::View x_host = Kokkos::create_mirror_view(x);\n\n // Create a Kokkos view for the output vector y\n Kokkos::View y_host = Kokkos::create_mirror_view(Kokkos::View(1));\n\n // Create a Kokkos parallel for loop to find the k-th smallest element\n Kokkos::parallel_for(\"findKthSmallest\", Kokkos::RangePolicy(0, x_host.size()), KOKKOS_LAMBDA(int i) {\n // Sort the input vector x\n std::sort(x_host.begin(), x_host.end());\n\n // Get the k-th smallest element\n y_host(0) = x_host(k - 1);\n });\n\n // Copy the output vector y back to the device\n Kokkos::deep_copy(x, y_host);\n\n // Return the k-th smallest element\n return y_host(0);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n int kth_smallest = 0;\n Kokkos::parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i == k - 1) {\n kth_smallest = x(i);\n }\n },\n kth_smallest, tag_type());\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a Kokkos view of the input vector\n View y = Kokkos::create_mirror_view(x);\n\n // Sort the vector in parallel\n parallel_sort(y.extent(0), KOKKOS_LAMBDA(int i, int j) { return y(i) < y(j); });\n\n // Return the k-th smallest element\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n\n int n = x.extent(0);\n int kth_smallest = 0;\n Kokkos::parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(const int& i, int& kth_smallest) {\n if (i < k) {\n kth_smallest = std::max(kth_smallest, x(i));\n }\n },\n kth_smallest);\n return kth_smallest;\n}\n", + "\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(int i, int j) { return x(i) < x(j); });\n\n // Return the k-th smallest element\n return x(k - 1);\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::Work;\n\n const int n = 
static_cast(x.extent(0));\n Kokkos::View y(\"y\", n);\n\n parallel_for(Work(n), KOKKOS_LAMBDA(int i) {\n y[i] = x[i];\n });\n\n std::sort(y.data(), y.data() + n);\n\n return y[k - 1];\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n int n = x.extent(0);\n int kth = 0;\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(policy_t(0, n), x);\n\n // Get the k-th smallest element\n kth = x(k - 1);\n\n return kth;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n int kth_smallest = -1;\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i == k - 1) {\n kth_smallest = x(i);\n }\n }, tag_type());\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a parallel reduction to find the k-th smallest element\n auto kth_smallest = reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& min) {\n if (i < k) {\n min = std::min(min, x(i));\n }\n }, min);\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using view_type = Kokkos::View;\n\n // Create a copy of the input vector\n view_type y = Kokkos::create_mirror_view(x);\n\n // Sort the copy in parallel\n Kokkos::parallel_sort(y.extent(0), KOKKOS_LAMBDA(int i, int j) { return y(i) < y(j); });\n\n // Return the k-th smallest element\n return y(k - 1);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n int kth = 0;\n #pragma omp parallel for reduction(min:kth)\n for (int i = 0; i < n; i++) {\n if (x[i] < kth || kth == 0) {\n kth = x[i];\n }\n }\n return kth;\n}\n", + "\n\n // Sort the vector in parallel\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n for (int j = i + 1; j < x.size(); ++j) {\n if (x[i] > x[j]) {\n std::swap(x[i], x[j]);\n }\n }\n }\n }\n\n // Return the k-th smallest element\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n // Create a shared variable to store the k-th smallest element\n int kth_smallest;\n\n // Create a parallel region\n #pragma omp parallel\n {\n // Each thread computes the k-th smallest element of its local subarray\n int local_kth_smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < 
local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n\n // Update the shared variable with the minimum of the local k-th smallest elements\n #pragma omp critical\n {\n if (local_kth_smallest < kth_smallest) {\n kth_smallest = local_kth_smallest;\n }\n }\n }\n\n // Return the k-th smallest element\n return kth_smallest;\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(sorted_x.begin(), sorted_x.end());\n }\n }\n return sorted_x[k - 1];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "serial", + "prompt": "/* Find the k-th smallest element of the vector x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th element of the sorted vector.\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th smallest element.\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 
1];\n}\n", + "\n\n int n = x.size();\n if (k < 1 || k > n) {\n throw std::invalid_argument(\"k must be between 1 and n\");\n }\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n if (k < 1 || k > n) {\n throw std::invalid_argument(\"k must be between 1 and the size of the vector.\");\n }\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "=4", + "\n\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n", + "\n\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n", + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th smallest element.\n return x[k - 1];\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "hip", + "prompt": "/* Find the k-th smallest element of the vector x.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\n__global__ void findKthSmallest(const int *x, size_t N, int k, int *kthSmallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int local_min = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n local_min = min(local_min, x[i]);\n }\n atomicMin(kthSmallest, local_min);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0073968902, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0002998712033 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int stride = blockDim.x;\n while (stride < N) {\n if (tid < N / stride) {\n if (kthSmallest[tid] > kthSmallest[tid + stride]) {\n int tmp = kthSmallest[tid];\n kthSmallest[tid] = kthSmallest[tid + stride];\n kthSmallest[tid + stride] = tmp;\n }\n }\n __syncthreads();\n stride *= 2;\n }\n if (tid == 0) {\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": 
true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0075533598, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00014959959984 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n hipSort(kthSmallest, kthSmallest + N, hipSortAscending);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0076256956, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02081744613647 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int local_min = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n local_min = min(local_min, x[i]);\n }\n atomicMin(kthSmallest, local_min);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0073550996, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00036316700876 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid == 0) {\n *kthSmallest = x[tid];\n } else {\n if (x[tid] < 
*kthSmallest) {\n *kthSmallest = x[tid];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0075386632, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00014297559932 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int numBlocks = hipGridDim_x;\n int numThreads = hipBlockDim_x;\n int numElements = numBlocks * numThreads;\n int numLevels = 0;\n while (numElements > 1) {\n numLevels++;\n numElements = (numElements + numThreads - 1) / numThreads;\n }\n for (int level = 0; level < numLevels; level++) {\n int stride = 1 << level;\n int offset = tid & (stride - 1);\n int other = tid ^ offset;\n if (offset < other && other < numElements) {\n if (kthSmallest[tid] > kthSmallest[other]) {\n int tmp = kthSmallest[tid];\n kthSmallest[tid] = kthSmallest[other];\n kthSmallest[other] = tmp;\n }\n }\n __syncthreads();\n }\n if (tid == 0) {\n *kthSmallest = kthSmallest[0];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0072752032, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0001625913972 + } + ] + }, + { + "generated_output": "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = blockId * blockDim + tid;\n int end = min(start + blockDim, N);\n int local_kthSmallest = x[start];\n for (int i = start + 1; i < end; i++) {\n if (x[i] < local_kthSmallest) {\n local_kthSmallest = x[i];\n }\n }\n atomicMin(kthSmallest, local_kthSmallest);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n __shared__ int s[hipBlockDim_x];\n s[tid] = x[tid];\n __syncthreads();\n if (tid == 0) {\n hipSort(s, hipBlockDim_x);\n *kthSmallest = s[k - 1];\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + 
"did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "K", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int *d_temp = new int[N];\n for (int i = 0; i < N; i += 2) {\n if (i + 1 < N) {\n if (kthSmallest[i] > kthSmallest[i + 1]) {\n d_temp[i / 2] = kthSmallest[i + 1];\n d_temp[i / 2 + N / 2] = kthSmallest[i];\n } else {\n d_temp[i / 2] = kthSmallest[i];\n d_temp[i / 2 + N / 2] = kthSmallest[i + 1];\n }\n } else {\n d_temp[i / 2] = kthSmallest[i];\n }\n }\n kthSmallest = d_temp;\n __syncthreads();\n if (N % 2 == 1) {\n N = N / 2 + 1;\n } else {\n N = N / 2;\n }\n while (N > 1) {\n for (int i = 0; i < N; i += 2) {\n if (i + 1 < N) {\n if (kthSmallest[i] > kthSmallest[i + 1]) {\n d_temp[i / 2] = kthSmallest[i + 1];\n d_temp[i / 2 + N / 2] = kthSmallest[i];\n } else {\n d_temp[i / 2] = kthSmallest[i];\n d_temp[i / 2 + N / 2] = kthSmallest[i + 1];\n }\n } else {\n d_temp[i / 2] = kthSmallest[i];\n }\n }\n kthSmallest = d_temp;\n __syncthreads();\n if (N % 2 == 1) {\n N = N / 2 + 1;\n } else {\n N = N / 2;\n }\n }\n *kthSmallest = kthSmallest[0];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n int local_kthSmallest = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < local_kthSmallest) {\n local_kthSmallest = x[i];\n }\n }\n __shared__ int shared_kthSmallest;\n shared_kthSmallest = local_kthSmallest;\n for (int i = hipBlockDim_x / 2; i > 0; i /= 2) {\n __syncthreads();\n if (hipThreadIdx_x < i) {\n if (shared_kthSmallest > x[tid + i]) {\n shared_kthSmallest = x[tid + i];\n }\n }\n }\n __syncthreads();\n if (hipThreadIdx_x == 0) {\n if (k == 0) {\n *kthSmallest = shared_kthSmallest;\n } else {\n k--;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0072634132, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00019503919855 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 
1];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_ranks);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_ranks);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int local_kth = x[k - 1];\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n MPI_Allreduce(&local_kth, &kth, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return kth;\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth_smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < 
local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n\n int global_kth_smallest;\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements to rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among all the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the k-th smallest elements from all ranks\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int n = x.size();\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kthSmallest = x[k - 1];\n MPI_Gather(&kthSmallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (myRank == 0) 
{\n std::sort(kthSmallest, kthSmallest + numRanks);\n return kthSmallest[k - 1];\n }\n return -1;\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + ".", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(nprocs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1; // Return -1 on all other ranks\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int n = x.size();\n int local_k = k / num_procs;\n int local_start = my_rank * local_k;\n int local_end = (my_rank + 1) * local_k;\n if (my_rank == num_procs - 1) {\n local_end = n;\n }\n\n std::nth_element(x.begin() + local_start, x.begin() + local_start + local_k - 1,\n x.begin() + local_end);\n int local_kth_smallest = x[local_start + local_k - 1];\n\n int global_kth_smallest;\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN,\n MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(comm_size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (comm_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else 
{\n return -1;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the k-th smallest element on each rank\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n int local_kth = x[k - 1];\n\n // Gather the k-th smallest elements from all ranks\n std::vector gathered_kth(size);\n MPI_Gather(&local_kth, 1, MPI_INT, gathered_kth.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element\n if (rank == 0) {\n std::nth_element(gathered_kth.begin(), gathered_kth.begin() + k - 1, gathered_kth.end());\n return gathered_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int n = x.size();\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes its local k-th smallest element\n std::sort(x.begin(), x.end());\n int local_kth_smallest = x[k - 1];\n\n // Gather all local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "cuda", + "prompt": "/* Find the k-th smallest element of the vector x.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\n__global__ void findKthSmallest(const int *x, size_t N, int k, int *kthSmallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n int val = x[gid];\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < val) {\n count++;\n }\n }\n if (count == k - 1) {\n *kthSmallest = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[0] = min(kthSmallest[0], x[tid]);\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n for (int i = 1; i < blockDim.x; i++) {\n kthSmallest[0] = min(kthSmallest[0], kthSmallest[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = tid + blockId * blockDim;\n int end = min(start + blockDim, N);\n int localKthSmallest = x[start];\n for (int i = start + 1; i < end; i++) {\n if (x[i] < localKthSmallest) {\n localKthSmallest = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n int globalKthSmallest = localKthSmallest;\n for (int i = 1; i < gridDim; i++) {\n int otherKthSmallest;\n cudaMemcpy(&otherKthSmallest, &globalKthSmallest, sizeof(int), cudaMemcpyDeviceToDevice);\n if (otherKthSmallest < globalKthSmallest) {\n globalKthSmallest = otherKthSmallest;\n }\n }\n *kthSmallest = globalKthSmallest;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockDim = blockDim.x;\n int blockIdx = blockIdx.x;\n int i = blockIdx * blockDim + tid;\n if (i < N) {\n kthSmallest[tid] = x[i];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + blockDim);\n if (blockIdx == 0) {\n *kthSmallest = kthSmallest[k - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n __shared__ int shared[blockDim.x];\n shared[threadIdx.x] = x[tid];\n __syncthreads();\n for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {\n if (threadIdx.x < stride) {\n shared[threadIdx.x] = min(shared[threadIdx.x], shared[threadIdx.x + stride]);\n }\n __syncthreads();\n }\n if (threadIdx.x == 0) {\n if (k == blockIdx.x) {\n *kthSmallest = shared[0];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n int temp;\n for (int i = 1; i < blockDim.x; i++) {\n temp = *kthSmallest;\n *kthSmallest = min(*kthSmallest, x[i]);\n x[i] = min(temp, x[i]);\n }\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n temp = *kthSmallest;\n *kthSmallest = min(*kthSmallest, x[i]);\n x[i] = min(temp, x[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int idx = bid * bid_size + tid;\n \n // Find the k-th smallest element in the current block\n int block_kth_smallest = x[idx];\n for (int i = idx + bid_size; i < N; i += grid_size * bid_size) {\n if (x[i] < block_kth_smallest) {\n block_kth_smallest = x[i];\n }\n }\n \n // Find the k-th smallest element among all 
blocks\n if (tid == 0) {\n for (int i = 1; i < grid_size; i++) {\n int other_block_kth_smallest;\n cudaMemcpy(&other_block_kth_smallest, &x[i * bid_size], sizeof(int), cudaMemcpyDeviceToDevice);\n if (other_block_kth_smallest < block_kth_smallest) {\n block_kth_smallest = other_block_kth_smallest;\n }\n }\n *kthSmallest = block_kth_smallest;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[0] = min(kthSmallest[0], x[tid]);\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < blockDim.x; i++) {\n kthSmallest[0] = min(kthSmallest[0], kthSmallest[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockDim = blockDim.x;\n int blockIdx = blockIdx.x;\n int idx = blockIdx * blockDim + tid;\n if (idx < N) {\n if (tid == 0) {\n *kthSmallest = x[idx];\n } else {\n if (x[idx] < *kthSmallest) {\n *kthSmallest = x[idx];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[0] = x[tid];\n }\n __syncthreads();\n int numThreads = blockDim.x * gridDim.x;\n while (numThreads > 1) {\n if (tid < numThreads / 2) {\n kthSmallest[tid] = min(kthSmallest[tid], kthSmallest[tid + numThreads / 2]);\n }\n __syncthreads();\n numThreads /= 2;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int val = x[tid];\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < val) {\n count++;\n }\n }\n if (count == k - 1) {\n *kthSmallest = val;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n __shared__ int shared[blockDim.x];\n shared[threadIdx.x] = x[tid];\n __syncthreads();\n for (int i = 0; i < blockDim.x; i++) {\n if (shared[threadIdx.x] > shared[i]) {\n shared[threadIdx.x] = shared[i];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n if (k == blockIdx.x + 1) {\n *kthSmallest = shared[threadIdx.x];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int my_kth_smallest = x[tid];\n for (int i = tid + 1; i < N; i++) {\n if (x[i] < my_kth_smallest) {\n my_kth_smallest = x[i];\n }\n }\n atomicMin(kthSmallest, my_kth_smallest);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n int tid_warp = tid % 32;\n int warp_id = tid / 32;\n int warp_kthSmallest = *kthSmallest;\n for (int i = 1; i < 32; i *= 2) {\n __syncthreads();\n if (tid_warp < i) {\n if (warp_kthSmallest > x[tid + i * blockDim.x]) {\n warp_kthSmallest = x[tid + i * blockDim.x];\n }\n }\n }\n __syncthreads();\n if (warp_id == 0) {\n *kthSmallest = warp_kthSmallest;\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n int other_kthSmallest;\n cudaMemcpy(&other_kthSmallest, &x[i * blockDim.x], sizeof(int), cudaMemcpyDeviceToDevice);\n if (*kthSmallest > other_kthSmallest) {\n *kthSmallest = other_kthSmallest;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n if (tid == k - 1) {\n *kthSmallest = x[tid];\n }\n }\n}\n", + "\n\n 
int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n int temp;\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (x[i] > x[j]) {\n temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n *kthSmallest = x[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int start_idx = bid * bid_size + tid;\n int end_idx = min(start_idx + bid_size, N);\n int local_kth_smallest = x[start_idx];\n for (int i = start_idx + 1; i < end_idx; i++) {\n if (x[i] < local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n atomicMin(kthSmallest, local_kth_smallest);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Find the k-th smallest element of the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth = x[k - 1];\n int global_kth;\n\n // Find the k-th smallest element on each rank\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < local_kth) {\n local_kth = x[i];\n }\n }\n }\n\n // Reduce the local k-th smallest elements to find the global k-th smallest element\n MPI_Allreduce(&local_kth, &global_kth, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n // Return the global k-th smallest element on rank 0\n if (rank == 0) {\n return global_kth;\n } else {\n return -1;\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector to store the sorted elements\n std::vector sorted_x(n);\n\n // Sort the vector in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n sorted_x[i] = x[i];\n }\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Get the k-th smallest element\n int kth_smallest = sorted_x[k - 1];\n\n // Return the k-th smallest element on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n return all_kth_smallest[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, 
nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the global k-th smallest element on rank 0\n return global_kth_smallest;\n}\n", + "\n\n int n = x.size();\n int kth = -1;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_k = k / size;\n int local_start = rank * local_k;\n int local_end = (rank + 1) * local_k;\n if (local_end > n) {\n local_end = n;\n }\n std::nth_element(x.begin() + local_start, x.begin() + local_start + local_k - 1, x.begin() + local_end);\n int local_kth = x[local_start + local_k - 1];\n MPI_Gather(&local_kth, 1, MPI_INT, &kth, 1, MPI_INT, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n std::vector kth_vec(size);\n for (int i = 0; i < size; i++) {\n kth_vec[i] = kth[i];\n }\n std::nth_element(kth_vec.begin(), kth_vec.begin() + k - 1, kth_vec.end());\n kth = kth_vec[k - 1];\n }\n return kth;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kth_smallest;\n MPI_Gather(&x[k - 1], 1, MPI_INT, &kth_smallest, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the local kth smallest element\n std::nth_element(x.begin() + rank * x.size() / size,\n x.begin() + rank * x.size() / size + x.size() / size - 1,\n x.begin() + rank * x.size() / size + k - 1);\n int local_kth = *(x.begin() + rank * x.size() / size + k - 1);\n\n // Gather all local kth smallest elements\n std::vector all_local_kth(size);\n MPI_Gather(&local_kth, 1, MPI_INT, all_local_kth.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global kth smallest element\n if (rank == 0) {\n std::nth_element(all_local_kth.begin(), all_local_kth.begin() + k - 1, all_local_kth.end());\n return all_local_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_k = k / size;\n int local_remainder = k % size;\n int local_start = rank * local_k;\n int local_end = local_start + local_k;\n if (rank < local_remainder) {\n local_start += rank;\n local_end += rank + 1;\n } else {\n local_start += local_remainder;\n local_end += local_remainder;\n }\n\n std::vector local_x(x.begin() + local_start, x.begin() + local_end);\n std::sort(local_x.begin(), local_x.end());\n\n int local_kth_smallest = local_x[local_k - 1];\n if (rank == 0) {\n std::vector kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest.begin(), kth_smallest.end());\n return kth_smallest[k - 1];\n } else {\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kth_smallest;\n MPI_Gather(&x[k - 1], 1, MPI_INT, &kth_smallest, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (rank == 0) {\n std::vector received_elements(size);\n MPI_Gather(&x[k - 1], 1, MPI_INT, received_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(received_elements.begin(), received_elements.end());\n return received_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the global k-th smallest element on rank 0\n return global_kth_smallest;\n}\n", + "\n\n // Find the local k-th smallest element on each rank.\n int local_kth_smallest = *std::max_element(x.begin(), x.end());\n int global_kth_smallest;\n\n // Find the global k-th smallest element.\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN);\n\n // Find the k-th smallest element on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n } else {\n return global_kth_smallest;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), 
all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the result on rank 0\n return global_kth_smallest;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int local_k = k / nprocs;\n int local_remainder = k % nprocs;\n\n // Find the local k-th smallest element on each rank\n std::nth_element(x.begin(), x.begin() + local_k + (rank == nprocs - 1 ? local_remainder : 0), x.end());\n int local_kth_smallest = x[local_k + (rank == nprocs - 1 ? local_remainder : 0)];\n\n // Gather all local k-th smallest elements on rank 0\n std::vector kth_smallest_elements(nprocs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (rank == 0) {\n std::nth_element(kth_smallest_elements.begin(), kth_smallest_elements.begin() + k - 1, kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth = x[0];\n int global_kth;\n\n // Find the k-th smallest element on each rank\n #pragma omp parallel reduction(min : local_kth)\n {\n #pragma omp for\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < local_kth) {\n local_kth = x[i];\n }\n }\n }\n\n // Gather the local k-th smallest elements from all ranks\n MPI_Gather(&local_kth, 1, MPI_INT, &global_kth, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element\n if (rank == 0) {\n std::vector gathered_kth(size);\n for (int i = 0; i < size; i++) {\n gathered_kth[i] = global_kth[i];\n }\n std::sort(gathered_kth.begin(), gathered_kth.end());\n return gathered_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector gathered_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, gathered_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(gathered_elements.begin(), gathered_elements.end());\n global_kth_smallest = gathered_elements[k - 1];\n }\n\n return global_kth_smallest;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x.\n std::sort(x.begin(), x.end());\n\n // Each rank 
sends its k-th smallest element to rank 0.\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element.\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n return all_kth_smallest[k - 1];\n }\n\n return -1; // Dummy return value for non-rank-0 processes.\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int local_k = k / num_ranks;\n int local_start = my_rank * local_k;\n int local_end = (my_rank + 1) * local_k;\n\n std::sort(x.begin() + local_start, x.begin() + local_end);\n\n int local_kth_smallest = x[local_start + local_k - 1];\n\n int global_kth_smallest;\n MPI_Reduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the k-th smallest element of its local data\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n int local_kth_smallest = x[k - 1];\n\n // Gather all local k-th smallest elements to rank 0\n std::vector all_local_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_local_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among all local k-th smallest elements\n if (rank == 0) {\n std::nth_element(all_local_kth_smallest.begin(), all_local_kth_smallest.begin() + k - 1, all_local_kth_smallest.end());\n return all_local_kth_smallest[k - 1];\n } else {\n return -1; // Dummy return value for non-rank-0 processes\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use Kokkos to sort in parallel. 
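Note on the 41_sort_k-th_smallest_element (mpi+omp) outputs above: most of them reduce a global minimum (which only answers k = 1), gather one identical value per rank and then index that buffer with k, or pass a scalar where a gather buffer is expected. A minimal sketch of one correct approach follows, assuming the prompt's signature with std::vector<int>, that every rank holds the full copy, and that k is 1-based; the index-slicing scheme and variable names are illustrative, not taken from any recorded output. It binary-searches the value range, counting elements no larger than the candidate with OpenMP inside each rank and summing the counts across ranks with MPI_Allreduce.

#include <mpi.h>
#include <omp.h>
#include <climits>
#include <vector>

int findKthSmallest(std::vector<int> const& x, int k) {
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Every rank holds all of x; each rank counts only its own index slice. */
    const long long n = (long long)x.size();
    const long long beginIdx = n * rank / size;
    const long long endIdx = n * (rank + 1) / size;

    long long lo = INT_MIN, hi = INT_MAX;
    while (lo < hi) {
        const long long mid = lo + (hi - lo) / 2;

        long long localCount = 0;
        #pragma omp parallel for reduction(+ : localCount)
        for (long long i = beginIdx; i < endIdx; ++i) {
            if (x[i] <= mid) localCount += 1;
        }

        long long globalCount = 0;
        MPI_Allreduce(&localCount, &globalCount, 1, MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);

        if (globalCount >= k) hi = mid;   /* k-th smallest is <= mid */
        else                  lo = mid + 1;
    }
    return (int)lo;   /* same value on every rank, so rank 0 returns the answer */
}

Every rank converges to the same value, so the requirement to return the result on rank 0 is met without an extra broadcast.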
Assume Kokkos has already been initialized.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(Kokkos::View &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA (const int& i, const int& j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n using exec_space = KokKOS::EXECSPACE;\n using device_type = KokKOS::DEVICE_TYPE;\n\n auto policy = KokKOS::parallel_sort_ascending(results,\n [](const Result &a, const Result &b) { return a.duration < b.duration; },\n exec_space(),\n device_type());\n auto result = KokKOS::sort_view(results, policy);\n results.copy(result);\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) -> bool { return results(i)->startTime < results(j)->startTime; });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int i, const int j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int& i, const int& j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::sort(results, Kokkos::Experimental::compare_fun(\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; 
}));\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, results.extent(0)),\n [results](const int &i, const int &j) { return results(i)->startTime < results(j)->startTime; });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use OpenMP to sort in parallel.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + ".", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = 0; j < results.size() - 1; j++) {\n if (results[j].startTime > results[j + 1].startTime) {\n std::swap(results[j], results[j + 1]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < static_cast(results.size()); i++) {\n for (int j = i + 1; j < static_cast(results.size()); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size() - 1; i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n#pragma opm parallel\n#pragma opm reduction(merge:results:opm_sort(results, [](const Result& a, const Result& b) { return a.startTime < b.startTime; }))\n{\n#pragma opm sections\n{\n#pragma opm section\n{\n std::sort(results.begin(), 
results.end(), [](const Result& a, const Result& b) { return a.startTime < b.startTime; });\n}\n}\n}\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size() - 1; i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma opm parallel\n {\n #pragma opm for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "serial", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n 
});\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "<*>", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.start < b.start;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "hip", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use AMD HIP to sort in parallel. 
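Note on the 43_sort_sort_an_array_of_structs_by_key (omp) outputs above: wrapping a nested compare-and-swap loop in #pragma omp parallel for lets many threads swap overlapping elements at once, so it is a data race rather than a parallel sort (a few outputs also misspell the pragma as "opm"). A minimal correct sketch follows, assuming std::vector<Result> as in the prompt; the chunking scheme is illustrative. Each thread sorts a disjoint chunk, then sorted runs are merged pairwise.

#include <omp.h>
#include <algorithm>
#include <vector>

struct Result { int startTime, duration; float value; };

void sortByStartTime(std::vector<Result> &results) {
    const auto cmp = [](const Result &a, const Result &b) { return a.startTime < b.startTime; };
    const size_t n = results.size();
    const int nthreads = omp_get_max_threads();
    const size_t chunk = (n + nthreads - 1) / nthreads;
    if (chunk == 0) return;   /* empty input */

    /* 1) Sort disjoint chunks in parallel. */
    #pragma omp parallel for schedule(static)
    for (int t = 0; t < nthreads; ++t) {
        const size_t lo = std::min(n, (size_t)t * chunk);
        const size_t hi = std::min(n, lo + chunk);
        std::sort(results.begin() + lo, results.begin() + hi, cmp);
    }

    /* 2) Merge sorted runs pairwise; each iteration touches disjoint ranges. */
    for (size_t width = chunk; width < n; width *= 2) {
        #pragma omp parallel for schedule(static)
        for (size_t lo = 0; lo < n; lo += 2 * width) {
            const size_t mid = std::min(n, lo + width);
            const size_t hi = std::min(n, lo + 2 * width);
            std::inplace_merge(results.begin() + lo, results.begin() + mid,
                               results.begin() + hi, cmp);
        }
    }
}

std::inplace_merge may allocate a temporary buffer, which is acceptable here; a strictly in-place merge would trade that allocation for extra element moves.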
The kernel is launched with at least as many threads as there are elements.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\n__global__ void sortByStartTime(Result *results, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0047061864, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.11676962585449 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0048283052, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.11937540740967 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.00488808, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.11922330474854 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0047276946, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.1230026550293 + } + ] + }, + { + "generated_output": "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) {\n return;\n }\n\n Result temp = results[idx];\n int minIdx = idx;\n for (int i = idx + 1; i < N; i++) {\n if (results[i].startTime < results[minIdx].startTime) {\n minIdx = i;\n }\n }\n\n results[idx] = results[minIdx];\n results[minIdx] = temp;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": 
true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0047486424, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0508768371582 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0047749938, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.12612623901367 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0047533956, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 16.3077357421875 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid >= N) return;\n for (int i = gid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0046502452, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00044361538887 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0046553848, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.20636590270996 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.004796339, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.12086191101074 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = 
gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.004855634, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.12051211242676 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0046389212, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.19221842346191 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0048187468, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.11909213256836 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.004670305, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.19249644165039 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0047661196, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.11681466217041 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0047647852, + "runs": [ + { + "did_run": true, + "is_valid": false, + 
"runtime": 0.12014090576172 + } + ] + }, + { + "generated_output": "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n __shared__ Result sharedResults[256];\n sharedResults[tid] = results[gid];\n __syncthreads();\n int stride = 256;\n while (stride > 0) {\n if (tid < stride && tid + stride < N && sharedResults[tid].startTime > sharedResults[tid + stride].startTime) {\n Result temp = sharedResults[tid];\n sharedResults[tid] = sharedResults[tid + stride];\n sharedResults[tid + stride] = temp;\n }\n __syncthreads();\n stride /= 2;\n }\n results[gid] = sharedResults[tid];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0047410536, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00023235200793 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0048290344, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.11943150024414 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0048597322, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.12297581176758 + } + ] + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid >= N) {\n return;\n }\n for (int i = gid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0046139856, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00044716739058 + } + ] + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use MPI to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of results. 
Store the output in results on rank 0.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n results = gatheredResults;\n } else {\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.start < b.start; });\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n std::vector gatheredResults;\n for (int i = 1; i < size; i++) {\n int numResults;\n MPI_Recv(&numResults, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n std::vector recvResults(numResults);\n MPI_Recv(recvResults.data(), numResults * sizeof(Result), MPI_BYTE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n gatheredResults.insert(gatheredResults.end(), recvResults.begin(), recvResults.end());\n }\n results = gatheredResults;\n } else {\n // Send the number of results and the results to rank 0\n int numResults = results.size();\n MPI_Send(&numResults, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);\n MPI_Send(results.data(), numResults * sizeof(Result), MPI_BYTE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return 
a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(size * results.size());\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the sorted results on rank 0\n if (rank == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort locally\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather all results to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_DOUBLE, gatheredResults.data(), results.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Sort gathered results\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const 
Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "cuda", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use CUDA to sort in parallel. The kernel is launched with at least as many threads as there are elements.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\n__global__ void sortByStartTime(Result *results, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[i].startTime < results[tid].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime 
> results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + ".", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result tmp = results[tid];\n results[tid] = results[i];\n results[i] = tmp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use MPI and OpenMP to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of results. 
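Note on the 43_sort cuda kernels above and the hip kernels earlier in this file: each thread runs a selection or bubble pass over the tail of the array while other threads swap the same elements, so the kernels race, which is consistent with the is_valid: false results recorded for the hip runs. Sorting correctly inside one plain kernel is awkward; a hedged alternative that steps outside the benchmark's single-kernel signature is a host-side wrapper around thrust::sort (CUDA) or the identical rocThrust API (HIP). The wrapper name below is illustrative, and d_results is assumed to already point to device memory.

#include <thrust/execution_policy.h>
#include <thrust/sort.h>

struct Result { int startTime, duration; float value; };

/* Comparator callable on the device. */
struct ByStartTime {
    __host__ __device__ bool operator()(const Result &a, const Result &b) const {
        return a.startTime < b.startTime;
    }
};

/* Host-side helper: sorts N Result elements living in device memory. */
void sortByStartTimeOnDevice(Result *d_results, size_t N) {
    thrust::sort(thrust::device, d_results, d_results + N, ByStartTime());
}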
Store the outputs in results on rank 0.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks on rank 0\n std::vector allResults(results.size() * numRanks);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, allResults.data(), results.size(), MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n\n // Sort all results on rank 0\n if (myRank == 0) {\n std::sort(allResults.begin(), allResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = allResults;\n }\n}\n", + "\n\n int numRanks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather all results to rank 0\n std::vector allResults;\n if (rank == 0) {\n allResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, allResults.data(), results.size(),\n MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort all results on rank 0\n if (rank == 0) {\n std::sort(allResults.begin(), allResults.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = allResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(size * results.size());\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, gatheredResults.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the gathered results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, gatheredResults.data(), results.size(), MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, nullptr, 0, MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), 
results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, gatheredResults.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results = gatheredResults;\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the outputs in results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result 
&b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Sort results by start time using MPI_Allgather and OpenMP\n int numRanks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort results on each rank\n #pragma openmp parallel\n {\n #pragma openmp for\n for (int i = 0; i < results.size(); i++) {\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n }\n }\n\n // Gather sorted results on rank 0\n std::vector sortedResults;\n if (rank == 0) {\n int totalSize = results.size() * numRanks;\n MPI_Gather(&results[0], results.size(), MPI_DATATYPE_RESULT, &results[0], results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results.resize(totalSize);\n } else {\n MPI_Gather(&results[0], results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Get the number of processes\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Get the rank of the process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the results on each process\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather the sorted results from all processes\n std::vector sorted_results(results.size() * num_procs);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT,\n sorted_results.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n\n // Copy the sorted results back to the input vector on rank 0\n if (rank == 0) {\n results = std::move(sorted_results);\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the gathered results in results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n 
std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recvResults;\n MPI_Recv(&recvResults, results.size(), MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n results.insert(results.end(), recvResults.begin(), recvResults.end());\n }\n } else {\n MPI_Send(&results, results.size(), MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Sort the array x of complex numbers by their magnitude in ascending order.\n Use Kokkos to sort in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), [&](int i, int j) -> bool {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n 
Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use OpenMP to sort in parallel.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Sort the vector in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the number of threads\n int num_threads = omp_get_num_threads();\n\n // Get the thread ID\n int thread_id = omp_get_thread_num();\n\n // Sort the portion of the vector assigned to this thread\n std::sort(x.begin() + thread_id, x.begin() + thread_id + num_threads);\n }\n\n // Merge the sorted portions of the vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "Magnitude:1. Moderato Moderato Moderato: 0. Moderato: Moderato: Moderato:forte forte forte piano piano piano piano piano piano piano: piano: piano: Moderato: piano: piano: forte forte forte: forte forte forte: forte: forte: piano piano piano piano piano piano: Piano: Piano: Moderato: Moderato: Moderato: piano: piano piano: forte forte forte: forte: forte: forte: Moderato:", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::polar(std::abs(x[i]), std::arg(x[i]));\n }\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n for (size_t j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::complex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "(x)", + "\n\n // Sort the vector x by the magnitude of its elements in ascending order.\n 
std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::complex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n // Sort the vector x by the magnitude of its elements in ascending order.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "serial", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + 
"\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + ":", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return abs(a) < abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "hip", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use AMD HIP to sort in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\n__global__ void sortComplexByMagnitude(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double mag = hipCrealf(x[tid]) * hipCrealf(x[tid]) + hipCrealf(x[tid]) * hipCrealf(x[tid]);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n double temp = x[tid].x;\n x[tid].x = x[i].x;\n x[i].x = temp;\n hipDoubleComplex temp_c = x[tid];\n x[tid] = x[i];\n x[i] = temp_c;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Compute the magnitude of each complex number\n float *mag = new float[N];\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes\n std::sort(mag, mag + N);\n\n // Sort the complex numbers by their magnitudes\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (mag[i] == sqrt(x[j].x * x[j].x + x[j].y * x[j].y)) {\n x[i] = x[j];\n break;\n }\n }\n }\n\n delete[] mag;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + 
"generated_output": "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Get the block index\n int bid = hipBlockIdx_x;\n\n // Calculate the index of the element to be sorted\n int i = bid * hipBlockDim_x + tid;\n\n // Check if the index is within the bounds of the array\n if (i < N) {\n // Get the magnitude of the complex number\n double magnitude = hip_abs(x[i]);\n\n // Sort the complex number by its magnitude\n for (int j = i + 1; j < N; j++) {\n double magnitude2 = hip_abs(x[j]);\n if (magnitude > magnitude2) {\n // Swap the complex numbers\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n\n // Update the magnitude of the complex number\n magnitude = magnitude2;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double mag = hip_abs(x[gid]);\n x[gid] = make_hipDoubleComplex(mag, 0.0);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double magnitude = hip_abs(x[tid]);\n x[tid] = make_hipDoubleComplex(magnitude, 0.0);\n }\n __syncthreads();\n hipSort(x, x + N, hipDoubleComplexGreater);\n __syncthreads();\n if (tid < N) {\n double real = hip_real(x[tid]);\n double imag = hip_imag(x[tid]);\n x[tid] = make_hipDoubleComplex(real, imag);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get thread and block indices\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n\n // Compute the index of the element to be processed by this thread\n int index = bid * bid_size + tid;\n\n // If the index is out of range, exit\n if (index >= N) {\n return;\n }\n\n // Get the current element\n hipDoubleComplex current_element = x[index];\n\n // Compute the magnitude of the current element\n double magnitude = hip_abs(current_element);\n\n // Find the index of the element with the smallest magnitude so far\n int min_index = index;\n for (int i = index + 1; i < N; i++) {\n double current_magnitude = hip_abs(x[i]);\n if (current_magnitude < magnitude) {\n min_index = i;\n magnitude = current_magnitude;\n }\n }\n\n // Swap the current element with the element with the smallest magnitude so far\n hipDoubleComplex temp = x[index];\n x[index] = x[min_index];\n x[min_index] = temp;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = hipMag(x[tid]);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n hipSort(x, x + N, hipDoubleComplexCompare);\n }\n}\n", + 
"source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n hipDoubleComplex c = x[tid];\n x[tid] = hipDoubleComplex{sqrt(c.x * c.x + c.y * c.y), 0.0};\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the array using the radix sort algorithm\n for (int i = 0; i < N; i++) {\n int key = abs(x[i]);\n int j = i - 1;\n\n while (j >= 0 && abs(x[j]) > key) {\n x[j + 1] = x[j];\n j--;\n }\n x[j + 1] = hipDoubleComplex{key, 0};\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Sort the local part of the array\n std::sort(x + tid, x + tid + hipBlockDim_x);\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted parts of the array\n for (int stride = hipBlockDim_x; stride < N; stride *= 2) {\n for (int i = 0; i < N; i += 2 * stride) {\n if (hipAbs(x[i]) > hipAbs(x[i + stride])) {\n std::swap(x[i], x[i + stride]);\n }\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Get the block index\n int bid = hipBlockIdx_x;\n\n // Calculate the index of the element to be sorted\n int index = bid * hipBlockDim_x + tid;\n\n // Check if the index is within the bounds of the array\n if (index < N) {\n // Get the magnitude of the complex number\n double magnitude = hipMag(x[index]);\n\n // Sort the complex number by its magnitude\n for (int i = index + 1; i < N; i++) {\n double magnitude_i = hipMag(x[i]);\n if (magnitude > magnitude_i) {\n // Swap the complex numbers\n hipDoubleComplex temp = x[index];\n x[index] = x[i];\n x[i] = temp;\n\n // Update the magnitude of the complex number\n magnitude = magnitude_i;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // 
Sort the local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + hipBlockDim_x; j < N; j += hipBlockDim_x) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Compute the magnitude of each complex number\n float *mag = new float[N];\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes in ascending order\n std::sort(mag, mag + N);\n\n // Sort the complex numbers by their magnitude\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (mag[i] == sqrt(x[j].x * x[j].x + x[j].y * x[j].y)) {\n x[i] = x[j];\n break;\n }\n }\n }\n\n delete[] mag;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Calculate the magnitude of each complex number\n float *mag = (float *)malloc(N * sizeof(float));\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrtf(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes using the hipSort function\n hipSort(mag, N, hipFloat);\n\n // Sort the complex numbers based on the sorted magnitudes\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (mag[i] > mag[j]) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n free(mag);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double magnitude = hipMagnitude(x[tid]);\n x[tid].x = magnitude;\n }\n __syncthreads();\n if (tid < N) {\n hipSort(x, N, sizeof(hipDoubleComplex), hipDoubleComplexMagnitudeCompare);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the vector x by magnitude in ascending order.\n // Use AMD HIP to sort in parallel. 
The kernel is launched with at least as many threads as elements in x.\n // Example:\n //\n // input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n // output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n\n // Get the thread index.\n int tid = hipThreadIdx_x;\n\n // Get the block index.\n int bid = hipBlockIdx_x;\n\n // Get the block size.\n int bsize = hipBlockDim_x;\n\n // Get the grid size.\n int gsize = hipGridDim_x;\n\n // Calculate the starting index for this thread.\n int start = tid + bid * bsize;\n\n // Calculate the ending index for this thread.\n int end = min(start + bsize, N);\n\n // Sort the elements in this thread's range.\n sort(x + start, x + end, [](hipDoubleComplex a, hipDoubleComplex b) { return abs(a) < abs(b); });\n\n // Synchronize all threads.\n hipDeviceSynchronize();\n\n // If this is the first thread in the grid, merge the sorted ranges.\n if (tid == 0) {\n for (int i = 1; i < gsize; i++) {\n // Get the starting index for this thread.\n int start = i * bsize;\n\n // Get the ending index for this thread.\n int end = min(start + bsize, N);\n\n // Merge the sorted ranges.\n merge(x, x + start, x + start, x + end, x);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Get the block ID\n int bid = hipBlockIdx_x;\n\n // Get the block size\n int bsize = hipBlockDim_x;\n\n // Get the number of blocks\n int nblocks = hipGridDim_x;\n\n // Calculate the starting index for this thread\n int start = tid + bid * bsize;\n\n // Calculate the ending index for this thread\n int end = min(start + bsize, N);\n\n // Sort the local portion of the array\n sort(x + start, x + end, [](hipDoubleComplex a, hipDoubleComplex b) {\n return abs(a) < abs(b);\n });\n\n // Synchronize all threads\n hipDeviceSynchronize();\n\n // Merge the sorted local portions of the array\n for (int i = 1; i < nblocks; i *= 2) {\n if (bid % (2 * i) == 0) {\n int partner = bid + i;\n if (partner < nblocks) {\n int start = i * bsize;\n int end = min(start + 2 * bsize, N);\n merge(x + start, x + start + bsize, x + start + bsize, x + end,\n x + start);\n }\n }\n hipDeviceSynchronize();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Sort the local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted local elements\n for (int s = 1; s < N; s *= 2) {\n for (int i = tid; i < N; i += 2 * s) {\n int j = i + s;\n if (j < N && hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n __syncthreads();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // 
Sort the elements in the current block\n __shared__ hipDoubleComplex block[1024];\n block[threadIdx.x] = x[threadIdx.x];\n __syncthreads();\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(block[i]) > hipAbs(block[j])) {\n hipDoubleComplex temp = block[i];\n block[i] = block[j];\n block[j] = temp;\n }\n }\n }\n __syncthreads();\n\n // Copy the sorted elements back to the global memory\n x[threadIdx.x] = block[threadIdx.x];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use MPI to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered;\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, store the gathered vector in x\n if (rank == 0) {\n x = gathered;\n }\n}\n", + "\n\n // Get the size and rank of the current process\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the local vector of complex numbers\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n 
std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector to rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vectors on rank 0\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vectors from rank 0 to all ranks\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n 
MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::sort(x.begin(), x.end(), [&magnitudes](std::complex a, std::complex b) {\n return magnitudes[a] < magnitudes[b];\n });\n\n // Gather the sorted vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector> recv;\n MPI::COMM_WORLD.Recv(recv, i, 0);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI::COMM_WORLD.Send(x, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Collect the sorted vectors on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(&recv, x.size(), MPI_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = std::move(sorted_x);\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const 
std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector to rank 0\n MPI_Scatter(gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Replace the original vector on rank 0 with the sorted vector\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the size and rank of the communicator\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted;\n if (rank == 0) {\n sorted.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, comm);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(sorted.begin(), sorted.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(sorted.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, comm);\n}\n", + "\n\n // Get the size and rank of the communicator\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + 
"\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::vector> sorted_x(n);\n for (int i = 0; i < n; i++) {\n for (int j = 0; j < n; j++) {\n if (std::abs(x[j]) == magnitudes[i]) {\n sorted_x[i] = x[j];\n break;\n }\n }\n }\n\n // Gather the sorted vector on rank 0\n MPI_Gather(&sorted_x[0], n, MPI_DOUBLE_COMPLEX, &x[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n // Get the magnitude of the complex number\n double magnitude = std::abs(x[i]);\n // Store the magnitude in the vector\n mag[i] = magnitude;\n }\n\n // Sort the vector of magnitudes\n std::sort(mag.begin(), mag.end());\n\n // Sort the vector of complex numbers by magnitude\n std::sort(x.begin(), x.end(), [&](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::vector> sorted_x(n);\n for (int i = 0; i < n; i++) {\n for (int j = 0; j < n; j++) {\n if (std::abs(x[j]) == magnitudes[i]) {\n sorted_x[i] = x[j];\n break;\n }\n }\n }\n\n // Gather the sorted vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector> recv_x(n);\n MPI::COMM_WORLD.Recv(recv_x.data(), n, MPI::DOUBLE, i, 0);\n sorted_x.insert(sorted_x.end(), recv_x.begin(), recv_x.end());\n }\n } else {\n MPI::COMM_WORLD.Send(sorted_x.data(), n, MPI::DOUBLE, 0, 0);\n }\n\n // Update the vector x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n x = sorted_x;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "cuda", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use CUDA to sort in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\n__global__ void sortComplexByMagnitude(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(mag, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n cuDoubleComplex tmp = x[tid];\n int i = tid;\n while (i > 0 && cuCabs(x[i - 1]) > cuCabs(tmp)) {\n x[i] = x[i - 1];\n i--;\n }\n x[i] = tmp;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid].x = x[tid].y;\n x[tid].y = 0.0;\n }\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ cuDoubleComplex block_data[256];\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int block_size = blockDim.x;\n int start = bid * block_size;\n int end = min(start + block_size, N);\n for (int i = start; i < end; i++) {\n block_data[tid] = x[i];\n __syncthreads();\n for (int j = tid + 1; j < block_size; j++) {\n if (abs(block_data[tid]) > abs(block_data[j])) {\n cuDoubleComplex temp = block_data[tid];\n block_data[tid] = block_data[j];\n block_data[j] = temp;\n }\n }\n __syncthreads();\n }\n\n // Copy the sorted elements back to the global memory\n for (int i = start; i < end; i++) {\n x[i] = block_data[tid];\n __syncthreads();\n }\n}\n", + "\n\n // Get the thread ID\n int tid = threadIdx.x;\n\n // Sort the local part of the array\n thrust::sort(x + tid, x + tid + min(N - tid, blockDim.x),\n [](cuDoubleComplex a, cuDoubleComplex b) { return cuCabs(a) < cuCabs(b); });\n\n // Synchronize threads\n __syncthreads();\n\n // Merge the sorted parts of the array\n for (int s = blockDim.x; s <= N; s *= 2) {\n if (tid < N - s) {\n cuDoubleComplex a = x[tid];\n cuDoubleComplex b = x[tid + s];\n if (cuCabs(a) > cuCabs(b)) {\n x[tid] = b;\n x[tid + s] = a;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = cuCabs(x[tid]);\n x[tid] = cuCmul(x[tid], cuCdiv(1.0, magnitude));\n }\n}\n", + "\n\n // Get the thread index\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n\n // If the thread index is within the bounds of the array, sort the element\n if (tid < N) {\n // Calculate the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n\n // Sort the element by magnitude\n for (int i = tid + 1; i < N; i++) {\n double otherMagnitude = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n if (magnitude > otherMagnitude) {\n // Swap the elements\n cuDoubleComplex temp = x[tid];\n x[tid] = x[i];\n x[i] = temp;\n\n // Update the magnitude of the current element\n magnitude = otherMagnitude;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = mag;\n x[tid].y = 0.0;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n cuDoubleComplex 
tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid].x = creal(x[tid]);\n x[tid].y = cimag(x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n while (i < N) {\n cuDoubleComplex temp;\n for (int j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = magnitude;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n double temp = x[tid].x;\n x[tid].x = x[i].x;\n x[i].x = temp;\n cuDoubleComplex temp2 = x[tid];\n x[tid] = x[i];\n x[i] = temp2;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n for (int i = tid + 1; i < N; i++) {\n double mag_i = cuCabs(x[i]);\n if (mag > mag_i) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n mag = mag_i;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= N) return;\n \n // Compute the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n \n // Store the magnitude in shared memory\n __shared__ double magnitudes[N];\n magnitudes[tid] = magnitude;\n \n // Synchronize threads\n __syncthreads();\n \n // Sort the magnitudes in shared memory\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (magnitudes[i] > magnitudes[j]) {\n double temp = magnitudes[i];\n magnitudes[i] = magnitudes[j];\n magnitudes[j] = temp;\n }\n }\n }\n \n // Synchronize threads\n __syncthreads();\n \n // Find the index of the complex number with the given magnitude\n int index = -1;\n for (int i = 0; i < N; i++) {\n if (magnitudes[i] == magnitude) {\n index = i;\n break;\n }\n }\n \n // Swap the complex numbers\n cuDoubleComplex temp = x[tid];\n x[tid] = x[index];\n x[index] = temp;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= N) return;\n \n double magnitude = cuCabs(x[tid]);\n x[tid] = cuCmul(x[tid], cuCdiv(1.0, magnitude));\n \n __shared__ double shared_magnitude[blockDim.x];\n shared_magnitude[threadIdx.x] = magnitude;\n __syncthreads();\n \n for (int i = 0; i < blockDim.x; i++) {\n if (tid + i + 1 < N && shared_magnitude[tid] > shared_magnitude[tid + i + 1]) {\n double temp = shared_magnitude[tid];\n shared_magnitude[tid] = shared_magnitude[tid + i + 1];\n shared_magnitude[tid + i + 1] = temp;\n \n cuDoubleComplex temp_complex = x[tid];\n x[tid] = x[tid + i + 1];\n x[tid + i + 1] = temp_complex;\n }\n }\n \n}\n", + "\n\n // Each thread sorts a single element of x.\n int tid = threadIdx.x;\n if (tid < N) {\n // Compute the magnitude of the complex number.\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n // Store the magnitude in the complex number.\n x[tid].x = magnitude;\n }\n // Sort the complex numbers by their magnitude.\n thrust::sort(x, x + N, thrust::less());\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ cuDoubleComplex block[32];\n block[threadIdx.x] = x[blockIdx.x * blockDim.x + threadIdx.x];\n __syncthreads();\n for (int i = 0; i < blockDim.x; i++) {\n for (int j = i + 1; j < blockDim.x; j++) {\n if (cuCabsf(block[i]) > cuCabsf(block[j])) {\n cuDoubleComplex temp = block[i];\n block[i] = 
block[j];\n block[j] = temp;\n }\n }\n }\n __syncthreads();\n\n // Copy the sorted elements back to the global memory\n x[blockIdx.x * blockDim.x + threadIdx.x] = block[threadIdx.x];\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n cuDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n // Each thread sorts a single element\n int tid = threadIdx.x;\n if (tid < N) {\n // Compute the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n // Store the magnitude in shared memory\n __shared__ double magnitudes[N];\n magnitudes[tid] = magnitude;\n // Synchronize threads\n __syncthreads();\n // Sort the magnitudes in shared memory\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (magnitudes[i] > magnitudes[j]) {\n double temp = magnitudes[i];\n magnitudes[i] = magnitudes[j];\n magnitudes[j] = temp;\n }\n }\n }\n // Synchronize threads\n __syncthreads();\n // Find the index of the sorted magnitude in the original array\n int index = -1;\n for (int i = 0; i < N; i++) {\n if (magnitudes[tid] == sqrt(x[i].x * x[i].x + x[i].y * x[i].y)) {\n index = i;\n break;\n }\n }\n // Swap the complex number at the current index with the complex number at the sorted index\n if (index != -1) {\n cuDoubleComplex temp = x[tid];\n x[tid] = x[index];\n x[index] = temp;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(magnitude, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n x[tid] = make_cuDoubleComplex(creal(x[tid]), cimag(x[tid]));\n }\n}\n", + "\n\n // Get the index of the current element\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n\n // If the index is out of bounds, return\n if (idx >= N) {\n return;\n }\n\n // Get the magnitude of the current element\n double magnitude = abs(x[idx]);\n\n // Find the index of the element with the smallest magnitude\n int minIdx = idx;\n for (int i = idx + 1; i < N; i++) {\n double otherMagnitude = abs(x[i]);\n if (otherMagnitude < magnitude) {\n minIdx = i;\n magnitude = otherMagnitude;\n }\n }\n\n // Swap the current element with the element with the smallest magnitude\n if (minIdx != idx) {\n double temp = x[idx];\n x[idx] = x[minIdx];\n x[minIdx] = temp;\n }\n}\n", + "\n\n // Sort the complex numbers by their magnitude in ascending order.\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n cuDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(mag, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n int i = tid;\n while (i > 0 && cuCreal(x[i]) < cuCreal(x[i - 1])) {\n swap(x[i], x[i - 1]);\n i--;\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid] = make_cuDoubleComplex(cuCreal(x[tid]), cuCimag(x[tid]));\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use MPI and OpenMP to sort in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector x on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector of complex numbers.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(size * x.size());\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector x on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector x.\n if (rank != 0) {\n x.clear();\n }\n\n // Copy the sorted vector to x on rank 0.\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Sort the vector x on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vector x from all ranks on rank 0\n std::vector> sorted_x(x.size() * nranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_PRECISION, sorted_x.data(), x.size(), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector x on rank 0\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Scatter the sorted vector x from rank 0 to all ranks\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_PRECISION, x.data(), x.size(), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n 
int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and 
store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the local vector.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all other ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int nprocs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * nprocs);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vector and store it in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Sort the vector x on each rank by magnitude in ascending order.\n 
std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> gathered_x(x.size() * MPI::world().size());\n MPI::world().all_gather(x.data(), x.size(), MPI::DOUBLE_REAL, gathered_x.data(), x.size(), MPI::DOUBLE_REAL);\n\n // Replace the vector x on rank 0 with the gathered sorted vectors.\n if (MPI::world().rank() == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, copy the sorted vector back to x.\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector x.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ii++) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); jj++) {\n if (input(ii, jj) == 1) { num_neighbors++; }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++num_ones;\n }\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ii++) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); jj++) {\n if (ii == i && jj == j) continue;\n num_ones += (input(ii, jj) == 1);\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++num_ones;\n }\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int numNeighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && input(ii, jj) == 1) {\n ++numNeighbors;\n }\n }\n }\n output(i, j) = (numNeighbors == 1) ? 
1 : 0;\n });\n}\n", + "Kokkoss_t_t", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy<>(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++count;\n }\n }\n }\n output(i, j) = (count == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const size_t i, const size_t j) {\n int sum = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input(ii, jj);\n }\n }\n }\n output(i, j) = (sum == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& j) {\n int count = 0;\n if (i > 0) count += input(i - 1, j);\n if (i < N - 1) count += input(i + 1, j);\n if (j > 0) count += input(i, j - 1);\n if (j < N - 1) count += input(i, j + 1);\n output(i, j) = (count == 1);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { sum += input(ii, jj); }\n }\n }\n output(i, j) = (sum == 1);\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (ii == i && jj == j) continue;\n num_neighbors += input(ii, jj);\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && input(ii, jj) == 1) {\n num_neighbors++;\n }\n }\n }\n output(i, j) = (num_neighbors == 1) ? 1 : 0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cells\", Kokkos::RangePolicy>(N, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++count;\n }\n }\n }\n output(i, j) = (count == 1);\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1][j];\n if (i < N - 1) num_ones += input[i + 1][j];\n if (j > 0) num_ones += input[i][j - 1];\n if (j < N - 1) num_ones += input[i][j + 1];\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) {\n count += input[i - 1][j];\n }\n if (i < N - 1) {\n count += input[i + 1][j];\n }\n if (j > 0) {\n count += 
input[i][j - 1];\n }\n if (j < N - 1) {\n count += input[i][j + 1];\n }\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma opm simd\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0 && input[i - 1][j] == 1) count++;\n if (i < N - 1 && input[i + 1][j] == 1) count++;\n if (j > 0 && input[i][j - 1] == 1) count++;\n if (j < N - 1 && input[i][j + 1] == 1) count++;\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n 
#pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1 + j * N];\n if (i < N - 1) num_neighbors += input[i + 1 + j * N];\n if (j > 0) num_neighbors += input[i + (j - 1) * N];\n if (j < N - 1) num_neighbors += input[i + (j + 1) * N];\n output[i + j * N] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j] == 1) ++num_neighbors;\n if (j > 0 && input[i][j - 1] == 1) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1] == 1) ++num_neighbors;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1][j];\n if (i < N - 1) num_ones += input[i + 1][j];\n if (j > 0) num_ones += input[i][j - 1];\n if (j < N - 1) num_ones += input[i][j + 1];\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n size_t num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "serial", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. 
Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0 && input[i - 1][j] == 1) ++count;\n if (i < N - 1 && input[i + 1][j] == 1) ++count;\n if (j > 0 && input[i][j - 1] == 1) ++count;\n if (j < N - 1 && input[i][j + 1] == 1) ++count;\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (i + di >= 0 && i + di < N && j + dj >= 0 && j + dj < N) {\n num_ones += input[(i + di) * N + j + dj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n 
}\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (size_t k = i - 1; k <= i + 1; k++) {\n for (size_t l = j - 1; l <= j + 1; l++) {\n if (k >= 0 && k < N && l >= 0 && l < N && (k != i || l != j)) {\n num_neighbors += input[k * N + l];\n }\n }\n }\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j] == 1) ++num_neighbors;\n if (j > 0 && input[i][j - 1] == 1) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1] == 1) ++num_neighbors;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; 
j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i][j] = (num_neighbors == 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "hip", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\n__global__ void cellsXOR(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0345185578, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00058719780445 + } + ] + }, + { + "generated_output": "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0339819072, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005741738081 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) { sum += input[ny * N + nx]; }\n }\n }\n output[y * N + x] = (sum == 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0342225496, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00080239801407 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if 
(x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = count == 1 ? 1 : 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0347624882, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00058226980567 + } + ] + }, + { + "generated_output": "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n if (x > 0) num_neighbors += input[y * N + x - 1];\n if (x < N - 1) num_neighbors += input[y * N + x + 1];\n if (y > 0) num_neighbors += input[(y - 1) * N + x];\n if (y < N - 1) num_neighbors += input[(y + 1) * N + x];\n output[y * N + x] = (num_neighbors == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0343637578, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00061894180179 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (count == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0347254704, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00080643000603 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0342151578, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00080514979362 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int count = 0;\n if (i > 0) count += input[i - 1 + j * N];\n if (i < N - 1) count += input[i + 1 + j * N];\n if (j > 0) count += input[i + (j - 1) * N];\n if (j < N - 1) count += input[i + (j + 1) * N];\n output[i + j * N] = (count == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0348724066, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00063545359373 + } + ] + }, + { + 
"generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1 + j * N];\n if (i < N - 1) num_ones += input[i + 1 + j * N];\n if (j > 0) num_ones += input[i + (j - 1) * N];\n if (j < N - 1) num_ones += input[i + (j + 1) * N];\n output[i + j * N] = (num_ones == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.034707981, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00058201358914 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0344755482, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00060850999951 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0343601476, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00064473400116 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (sum == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0344893698, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02619782409668 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int num_ones = 0;\n if (x - 1 >= 0) num_ones += input[(x - 1) * N + y];\n if (x + 1 < N) num_ones += input[(x + 1) * N + y];\n if (y - 1 >= 0) num_ones += input[x * N + y - 1];\n if (y + 1 < N) num_ones += input[x * N + y + 1];\n output[x * N + y] = (num_ones == 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.034698521, + "runs": [ + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.03399857940674 + } + ] + }, + { + "generated_output": "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0344910838, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00085903699398 + } + ] + }, + { + "generated_output": "\n\n int sum = 0;\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n int idy = blockIdx.y * blockDim.y + threadIdx.y;\n if (idx < N && idy < N) {\n int left = idx > 0 ? input[idx - 1 + idy * N] : 0;\n int right = idx < N - 1 ? input[idx + 1 + idy * N] : 0;\n int up = idy > 0 ? input[idx + (idy - 1) * N] : 0;\n int down = idy < N - 1 ? input[idx + (idy + 1) * N] : 0;\n sum = left + right + up + down;\n }\n output[idx + idy * N] = input[idx + idy * N] ? (sum == 1 ? 1 : 0) : 0;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0345757268, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00069526178837 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0345314154, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00065494180322 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.034296877, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00062415719628 + } + ] + }, + { + "generated_output": "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0)\n continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n 
}\n output[y * N + x] = (sum == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0341324848, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00079852517843 + } + ] + }, + { + "generated_output": "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n if (x > 0) {\n count += input[y * N + x - 1];\n }\n if (x < N - 1) {\n count += input[y * N + x + 1];\n }\n if (y > 0) {\n count += input[(y - 1) * N + x];\n }\n if (y < N - 1) {\n count += input[(y + 1) * N + x];\n }\n output[y * N + x] = (count == 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0341965522, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00061154979467 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0342157662, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005887016058 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t num_rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * num_rows_per_rank;\n size_t end_row = std::min((rank + 1) * num_rows_per_rank, N);\n\n // Each rank computes its subset of the output.\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks.\n MPI_Gather(output.data() + start_row * N, (end_row - start_row) * N, MPI_INT,\n output.data(), (end_row - start_row) * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n std::vector rowXOR(N, 0);\n for (size_t j = 0; j < N; ++j) {\n if (input[rank * N + j] == 1) {\n rowXOR[j] = 1;\n }\n }\n\n // Reduce the XORs of all rows to rank 0\n std::vector allRowXORs(N, 0);\n MPI_Reduce(rowXOR.data(), allRowXORs.data(), N, MPI_INT, MPI_BXOR, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the final output\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int numNeighbors = 0;\n if (i > 0 && allRowXORs[i - 1] == 1) {\n ++numNeighbors;\n }\n if (i < N - 1 && allRowXORs[i + 1] == 1) {\n ++numNeighbors;\n }\n if (j > 0 && input[i * N + j - 1] == 1) {\n ++numNeighbors;\n }\n if (j < N - 1 && input[i * N + j + 1] == 1) {\n ++numNeighbors;\n }\n output[i * N + j] = (numNeighbors == 1);\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv(N * N);\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] ^= recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, (rank + 1) * nRowsPerRank);\n std::vector localInput(nRowsPerRank * N);\n std::vector localOutput(nRowsPerRank * N);\n MPI_Scatter(input.data(), N * N, MPI_INT, localInput.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int numNeighbors = 0;\n if (i > 0) numNeighbors += localInput[(i - 1) * N + j];\n if (i < N - 1) numNeighbors += localInput[(i + 1) * N + j];\n if (j > 0) numNeighbors += localInput[i * N + j - 1];\n if (j < N - 1) numNeighbors += localInput[i * N + j + 1];\n localOutput[i * N + j] = (numNeighbors == 1);\n }\n }\n MPI_Gather(localOutput.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Every rank computes the XOR of its own cells\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (my_rank == 0) {\n for (int i = 1; i < num_ranks; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] |= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], 
(end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n // Check the neighbors\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] ^= recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n std::vector local_output(N, 0);\n for (size_t i = 0; i < N; ++i) {\n int count = 0;\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n continue;\n }\n count += input[i * N + j];\n }\n local_output[i] = count == 1 ? 
1 : 0;\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_output.data(), N, MPI_INT, output.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of the output\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Each rank computes its subset of output.\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int rows_per_rank = (N + size - 1) / size;\n int start_row = i * rows_per_rank;\n int end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min((rank + 1) * chunk_size, N);\n\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min((i + 1) * chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n 
int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Gather all the results to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] ^= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own rows\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] ^= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, 
MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of neighbors that are 1.\n std::vector num_neighbors(N * N, 0);\n MPI_Allreduce(MPI_IN_PLACE, num_neighbors.data(), num_neighbors.size(), MPI_INT, MPI_SUM, comm);\n\n // Set the output to 1 if there is exactly one neighbor that is 1.\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n output[i * N + j] = (num_neighbors[i * N + j] == 1);\n }\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes its local part of the output.\n for (size_t i = my_rank; i < N; i += num_procs) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; i++) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] |= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Each rank computes its subset of the output.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks onto rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "cuda", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\n__global__ void cellsXOR(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0 && input[y * N + x - 1]) num_ones++;\n if (x < N - 1 && input[y * N + x + 1]) num_ones++;\n if (y > 0 && input[(y - 1) * N + x]) num_ones++;\n if (y < N - 1 && input[(y + 1) * N + x]) num_ones++;\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int sum = 0;\n if (x > 0) {\n sum += input[y * N + x - 1];\n }\n if (x < N - 1) {\n sum += input[y * N + x + 1];\n }\n if (y > 0) {\n sum += input[(y - 1) * N + x];\n }\n if (y < N - 1) {\n sum += input[(y + 1) * N + x];\n }\n output[y * N + x] = (sum == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[(y * N) + (x - 1)];\n if (x < N - 1) sum += input[(y * N) + (x + 1)];\n if (y > 0) sum += input[((y - 1) * N) + x];\n if (y < N - 1) sum += input[((y + 1) * N) + x];\n output[(y * N) + x] = (sum == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[(y - 1) * N + x - 1];\n if (x < N - 1) sum += input[(y - 1) * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n if (y > 0 && x > 0) sum += input[(y - 1) * N + x - 1];\n if (y > 0 && x < N - 1) sum += input[(y - 1) * 
N + x + 1];\n if (y < N - 1 && x > 0) sum += input[(y + 1) * N + x - 1];\n if (y < N - 1 && x < N - 1) sum += input[(y + 1) * N + x + 1];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n sum += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n int num_ones = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n num_ones += input[r * N + c];\n }\n }\n }\n output[row * N + col] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1] == 1) count++;\n if (x < N - 1 && input[y * N + x + 1] == 1) count++;\n if (y > 0 && input[(y - 1) * N + x] == 1) count++;\n if (y < N - 1 && input[(y + 1) * N + x] == 1) count++;\n output[y * N + x] = (count == 1) ? 
1 : 0;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1]) count++;\n if (x < N - 1 && input[y * N + x + 1]) count++;\n if (y > 0 && input[(y - 1) * N + x]) count++;\n if (y < N - 1 && input[(y + 1) * N + x]) count++;\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1]) count++;\n if (x < N - 1 && input[y * N + x + 1]) count++;\n if (y > 0 && input[(y - 1) * N + x]) count++;\n if (y < N - 1 && input[(y + 1) * N + x]) count++;\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int count = 0;\n if (i > 0) count += input[i - 1 + j * N];\n if (i < N - 1) count += input[i + 1 + j * N];\n if (j > 0) count += input[i + (j - 1) * N];\n if (j < N - 1) count += input[i + (j + 1) * N];\n output[i + j * N] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t start = rank * N / size;\n size_t end = (rank + 1) * N / size;\n\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n // Check the neighbors of input_{i,j}.\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (count == 1);\n }\n }\n\n // Gather the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start = i * N / size;\n size_t end = (i + 1) * N / size;\n for (size_t j = start; j < end; ++j) {\n for (size_t k = 0; k < N; ++k) {\n output[j * N + k] += output[i * N * N + j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n int startRow = rank * nRows;\n int endRow = startRow + nRows;\n if (rank == size - 1) {\n endRow = N;\n }\n int nCols = N;\n int nNeighbors = 0;\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < nCols; j++) {\n nNeighbors = 0;\n if (i - 1 >= 0) {\n nNeighbors += input[(i - 1) * N + j];\n }\n if (i + 1 < N) {\n nNeighbors += input[(i + 1) * N + j];\n }\n if (j - 1 >= 0) {\n nNeighbors += input[i * N + j - 1];\n }\n if (j + 1 < N) {\n nNeighbors += input[i * N + j + 1];\n }\n if (nNeighbors == 1) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int nRowsRecv = N / size;\n int startRowRecv = i * nRowsRecv;\n int endRowRecv = startRowRecv + nRowsRecv;\n if (i == 
size - 1) {\n endRowRecv = N;\n }\n for (int j = startRowRecv; j < endRowRecv; j++) {\n MPI_Recv(&output[j * N], N * sizeof(int), MPI_INT, i, 0, MPI_STATUS_IGNORE);\n }\n }\n } else {\n MPI_Send(&output[startRow * N], (endRow - startRow) * N * sizeof(int), MPI_INT, 0, 0, MPI_STATUS_IGNORE);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nThreads = omp_get_max_threads();\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, startRow + nRowsPerRank);\n std::vector localOutput(N * nRowsPerRank);\n #pragma omp parallel num_threads(nThreads)\n {\n #pragma omp for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int numNeighbors = 0;\n if (i > 0) numNeighbors += input[i - 1][j];\n if (i < N - 1) numNeighbors += input[i + 1][j];\n if (j > 0) numNeighbors += input[i][j - 1];\n if (j < N - 1) numNeighbors += input[i][j + 1];\n localOutput[i * N + j] = (numNeighbors == 1);\n }\n }\n }\n MPI_Gather(&localOutput[0], N * nRowsPerRank, MPI_INT, &output[0], N * nRowsPerRank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + num_procs - 1) / num_procs;\n size_t start_row = my_rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of rows in parallel\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check neighbors\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row 
- start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its own part of the output\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += nprocs) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n std::vector recv(N * N);\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Each rank computes its subset of the output in parallel\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the XOR of its own rows\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += nprocs) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n MPI_Recv(&output[i * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different part of the output\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[(i - 1) * N + j];\n if (i < N - 1) num_neighbors += input[(i + 1) * N + j];\n if (j > 0) num_neighbors += input[i * N + j - 1];\n if (j < N - 1) num_neighbors += input[i * N + j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all other ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N], N * N, 
MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = (N + num_ranks - 1) / num_ranks;\n int start_row = my_rank * rows_per_rank;\n int end_row = std::min(N, (my_rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (my_rank == 0) {\n for (int i = 1; i < num_ranks; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = N / num_ranks;\n int start_row = my_rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j]) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j]) ++num_neighbors;\n if (j > 0 && input[i][j - 1]) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1]) ++num_neighbors;\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all other ranks.\n if (my_rank == 0) {\n for (int rank = 1; rank < num_ranks; ++rank) {\n int 
rows_per_rank = N / num_ranks;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row][0], rows_per_rank * N, MPI_INT, rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n // Other ranks send their results to rank 0.\n MPI_Send(&output[start_row][0], rows_per_rank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local part of the output.\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Each rank computes its subset of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows.\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of the rows in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all the other ranks.\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * 
rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n std::vector local_input(input.begin() + start_row * N, input.begin() + end_row * N);\n std::vector local_output(rows_per_rank * N, 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < rows_per_rank; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += local_input[(i - 1) * N + j];\n if (i < rows_per_rank - 1) count += local_input[(i + 1) * N + j];\n if (j > 0) count += local_input[i * N + j - 1];\n if (j < N - 1) count += local_input[i * N + j + 1];\n local_output[i * N + j] = (count == 1);\n }\n }\n\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows in output.\n size_t rows_per_rank = (N + num_procs - 1) / num_procs;\n size_t start_row = my_rank * rows_per_rank;\n size_t end_row = std::min(N, (my_rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of rows in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks into output on rank 0.\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // TODO: Implement cellsXOR\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows of output\n int rows_per_rank = N / num_procs;\n int start_row = my_rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes its subset of the rows of output in parallel\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. 
Otherwise set it to 0.\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) continue;\n num_neighbors += input(ii, jj);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output(i, j) = 0;\n else\n output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { count += input(ii, jj); }\n }\n }\n count -= input(i, j);\n if (input(i, j) == 1) {\n output(i, j) = (count == 2 || count == 3) ? 
1 : 0;\n } else {\n output(i, j) = (count == 3) ? 1 : 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int numNeighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n numNeighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (numNeighbors < 2 || numNeighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (numNeighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n count += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (count == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n 
}\n } else {\n if (count == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && !(ii == i && jj == j)) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (num_neighbors == 3) { output(i, j) = 1; } else { output(i, j) = 0; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) { num_neighbors += input(x, y); }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) output(i, j) = 0;\n else output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n else output(i, j) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (num_neighbors == 3) { output(i, j) 
= 1; } else { output(i, j) = 0; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n num_neighbors += input(x, y);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output(i, j) = 0;\n else\n output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x < 0 || x >= N || y < 0 || y >= N) continue;\n num_neighbors += input(x, y);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) { count += input(ii, jj); }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 
3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (count == 3) { output(i, j) = 1; } else { output(i, j) = 0; }\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n count += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n count += input[(i + ii) * N + j + jj];\n }\n }\n }\n count -= input[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (int)i + di;\n int jj = (int)j + dj;\n if (ii < 0 || ii >= (int)N || jj < 0 || jj >= (int)N) continue;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = num_neighbors == 2 || num_neighbors == 3;\n } else {\n output[i * N + j] = num_neighbors == 3;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } 
else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n neighbors += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = (i + dx + N) % N;\n int y = (j + dy + N) % N;\n num_neighbors += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "serial", + "prompt": "/* Simulate one generation of Game of Life on `input`. 
Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (size_t di = -1; di <= 1; di++) {\n for (size_t dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ni = i + di;\n int nj = j + dj;\n if (ni < 0 || ni >= N || nj < 0 || nj >= N) {\n continue;\n }\n num_neighbors += input[ni * N + nj];\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int x = -1; x <= 1; ++x) {\n for (int y = -1; y <= 1; ++y) {\n if (x == 0 && y == 0) continue;\n int nx = i + x;\n int ny = j + y;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[nx * N + ny];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if 
(ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = num_neighbors == 2 || num_neighbors == 3 ? 1 : 0;\n } else {\n output[i * N + j] = num_neighbors == 3 ? 1 : 0;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t x = i - 1; x <= i + 1; ++x) {\n for (size_t y = j - 1; y <= j + 1; ++y) {\n if (x >= 0 
&& x < N && y >= 0 && y < N && (x != i || y != j)) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n neighbors += input[(i + ii) * N + j + jj];\n }\n }\n }\n neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n size_t num_neighbors = 0;\n for (size_t x = i - 1; x <= i + 1; ++x) {\n for (size_t y = j - 1; y <= j + 1; ++y) {\n if (x >= 0 && x < N && y >= 0 && y < N && (x != i || y != j)) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int x = i + di;\n int y = j + dj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "hip", + "prompt": "/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\n__global__ void gameOfLife(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.138668087, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00086127659082 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.1393265524, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.02587054481506 + } + ] + }, + { + "generated_output": "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.13839987, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 
0.02678964653015 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n num_neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.14573338, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02584879379272 + } + ] + }, + { + "generated_output": "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1399020542, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00087999659777 + } + ] + }, + { + "generated_output": "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.140634694, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02654312324524 + } + ] + }, + { + "generated_output": "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N 
&& ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1405304392, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00089910060167 + } + ] + }, + { + "generated_output": "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.140565815, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00090364518166 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) return;\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1381286978, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00090780501366 + } + ] + }, + { + "generated_output": "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n count -= input[y * N + x];\n if (input[y * N + x] == 1) {\n output[y * N + x] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[y * N + x] = (count == 3) ? 
1 : 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.135662059, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00090729320049 + } + ] + }, + { + "generated_output": "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1402784924, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00092018860579 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1385208162, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009057246089 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0)\n continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output[y * N + x] = 0;\n else\n output[y * N + x] = 1;\n } else {\n if (num_neighbors == 3)\n output[y * N + x] = 1;\n else\n output[y * N + x] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1383902256, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00092678081989 + } + ] + }, + { + "generated_output": "\n\n int x = hipThreadIdx().x;\n int y = hipThreadIdx().y;\n int neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = (x + i + N) % N;\n int neighbor_y = (y + j + N) % N;\n neighbors += 
input[neighbor_y * N + neighbor_x];\n }\n }\n if (input[y * N + x] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1392154982, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088889280558 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1394411266, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00087497261763 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1395117178, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088889260292 + } + ] + }, + { + "generated_output": "\n\n int x = hipBlockIdx_x * hipBlockDim_x + 
hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1410762126, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00086460479498 + } + ] + }, + { + "generated_output": "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1400194808, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00085775719881 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n if (x + i >= 0 && x + i < N && y + j >= 0 && y + j < N) {\n count += input[(x + i) * N + y + j];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (count < 2 || count > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (count == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.14034836, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02631934204102 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. 
Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n_rows = N / size;\n int n_cols = N;\n int n_elements = n_rows * n_cols;\n std::vector local_input(n_elements);\n std::vector local_output(n_elements);\n MPI_Scatter(input.data(), n_elements, MPI_INT, local_input.data(), n_elements, MPI_INT, 0, MPI_COMM_WORLD);\n for (int i = 0; i < n_rows; i++) {\n for (int j = 0; j < n_cols; j++) {\n int n_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int ii_global = i + ii;\n int jj_global = j + jj;\n if (ii_global >= 0 && ii_global < n_rows && jj_global >= 0 && jj_global < n_cols) {\n n_neighbors += local_input[ii_global * n_cols + jj_global];\n }\n }\n }\n if (local_input[i * n_cols + j] == 1) {\n if (n_neighbors < 2 || n_neighbors > 3) {\n local_output[i * n_cols + j] = 0;\n } else {\n local_output[i * n_cols + j] = 1;\n }\n } else {\n if (n_neighbors == 3) {\n local_output[i * n_cols + j] = 1;\n } else {\n local_output[i * n_cols + j] = 0;\n }\n }\n }\n }\n MPI_Gather(local_output.data(), n_elements, MPI_INT, output.data(), n_elements, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of live neighbors for each cell.\n std::vector counts(N * N, 0);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n counts[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n }\n\n // Update the output based on the counts.\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = counts[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 
1 : 0;\n }\n }\n }\n\n // Gather the results on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recv_buf(N * N);\n MPI::COMM_WORLD.Recv(recv_buf.data(), N * N, MPI::INT, i, 0);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(output.data(), N * N, MPI::INT, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks.\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Compute the next generation for the subset of rows assigned to this rank\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = static_cast(i) + di;\n int jj = static_cast(j) + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = N / size;\n\n // Compute the start and end rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = (rank + 1) * rows_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(N * rows_per_rank);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n\n // Check the neighbors of the current cell\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n\n // Check if the neighbor is within the grid\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), N * rows_per_rank, MPI_INT,\n output.data(), N * rows_per_rank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of rows\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes the next generation for its subset of rows\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N && input[ni * N + nj]) {\n num_neighbors++;\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j]) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n for (size_t i = 
start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_input(input.begin() + rank * N * N, input.begin() + (rank + 1) * N * N);\n std::vector local_output(N * N, 0);\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += local_input[ii * N + jj];\n }\n }\n }\n if (local_input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n local_output[i * N + j] = 1;\n }\n }\n }\n }\n\n MPI_Gather(local_output.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output.\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the 8 neighbors of the cell.\n for 
(int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life.\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks to rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of the rows.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = static_cast(i) + di;\n int neighbor_j = static_cast(j) + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n if (rank != 0) {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n MPI_Recv(&output[start_row * N], rows_per_rank * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * N);\n\n // Compute the local results\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n\n // Check the neighbors of the current cell\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n\n // Check if the neighbor is within the grid\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT, output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (size_t ii = std::max(0, (int)i - 1); ii <= std::min((int)N - 1, (int)i + 1); ii++) {\n for (size_t jj = std::max(0, (int)j - 1); jj <= std::min((int)N - 1, (int)j + 1); jj++) {\n if (input[ii * N + jj] == 1) {\n num_neighbors++;\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of live neighbors for each cell.\n std::vector neighbor_counts(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t di = -1; di <= 1; di++) {\n for (size_t dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n neighbor_counts[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Update the output based on the neighbor counts.\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbor_counts[i * N + j] == 2 || neighbor_counts[i * N + j] == 3);\n } else {\n output[i * N + j] = (neighbor_counts[i * N + j] == 3);\n }\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbors == 2 || neighbors == 3);\n } else {\n output[i * N + j] = (neighbors == 3);\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n std::vector next_gen(N * N);\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j]) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of the output\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, 
MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = N / size;\n\n // Compute the starting and ending rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = (rank + 1) * rows_per_rank;\n\n // Compute the number of columns each rank is responsible for\n int cols_per_rank = N / size;\n\n // Compute the starting and ending columns for this rank\n int start_col = rank * cols_per_rank;\n int end_col = (rank + 1) * cols_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * N, 0);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = start_col; j < end_col; j++) {\n int num_neighbors = 0;\n // Check the neighbors of the current cell\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n\n // Apply the rules of the game of life\n if (input[i][j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT, output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "cuda", + "prompt": "/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\n__global__ void gameOfLife(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if 
(input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n 
}\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = threadIdx.x;\n int y = threadIdx.y;\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = (x + i + N) % N;\n int neighbor_y = (y + j + N) % N;\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "_", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n num_neighbors -= input[x * N + y];\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = 
blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) {\n return;\n }\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the rank of the current process.\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the number of rows that each process will handle.\n int rows_per_process = N / MPI_Comm_size(MPI_COMM_WORLD);\n\n // Compute the starting and ending rows for the current process.\n int start_row = rank * rows_per_process;\n int end_row = start_row + rows_per_process;\n\n // Compute the number of columns in the input grid.\n int num_cols = input.size() / N;\n\n // Create a buffer to store the results for the current process.\n std::vector buffer(N * num_cols);\n\n // Iterate over the rows assigned to the current process.\n for (int i = start_row; i < end_row; ++i) {\n // Iterate over the columns in the input grid.\n for (int j = 0; j < num_cols; ++j) {\n // Compute the number of live neighbors for the current cell.\n int num_live_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n // Skip the current cell.\n if (ii == i && jj == j) {\n continue;\n }\n\n // Check if the neighbor cell is alive.\n if (input[ii * num_cols + jj] == 1) {\n ++num_live_neighbors;\n }\n }\n }\n\n // Apply the rules of the Game of Life to the current cell.\n if (input[i * num_cols + j] == 1) {\n if (num_live_neighbors < 2 || num_live_neighbors > 3) {\n buffer[i * num_cols + j] = 0;\n } else {\n buffer[i * num_cols + j] = 1;\n }\n } else {\n if (num_live_neighbors == 3) {\n buffer[i * num_cols + j] = 1;\n } else {\n buffer[i * num_cols + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all processes.\n MPI_Gather(buffer.data(), N * num_cols, MPI_INT, output.data(), N * num_cols, MPI_INT, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = (N + size - 1) / size;\n int start = rank * local_size;\n int end = std::min(start + local_size, N);\n\n std::vector local_input(input.begin() + start * N, input.begin() + end * N);\n std::vector local_output(local_input.size());\n\n #pragma omp parallel for\n for (int i = 0; i < local_input.size(); i++) {\n int x = i / N;\n int y = i % N;\n int count = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int nx = x + dx;\n int ny = y + dy;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += local_input[nx * N + ny];\n }\n }\n }\n if (local_input[i] == 1) {\n if (count < 2 || count > 3) {\n local_output[i] = 0;\n } else {\n local_output[i] = 1;\n }\n } else {\n if (count == 3) {\n local_output[i] = 1;\n } else {\n local_output[i] = 0;\n }\n }\n }\n\n MPI_Gather(local_output.data(), local_output.size(), MPI_INT, output.data(), local_output.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of the rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute 
the next generation for the subset of rows\n std::vector next_gen(N * N, 0);\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Compute the next generation for the subset of rows\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n // Count the number of live neighbors\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N && input[x * N + y]) count++;\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j]) {\n if (count < 2 || count > 3) output[i * N + j] = 0;\n else output[i * N + j] = 1;\n } else {\n if (count == 3) output[i * N + j] = 1;\n else output[i * N + j] = 0;\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a chunk of the output\n size_t chunk_size = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min((rank + 1) * chunk_size, N);\n\n // Compute the chunk of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t ii = std::max(0, (int)i - 1); 
ii <= std::min(N - 1, (int)i + 1); ++ii) {\n for (size_t jj = std::max(0, (int)j - 1); jj <= std::min(N - 1, (int)j + 1); ++jj) {\n num_neighbors += input[ii * N + jj];\n }\n }\n num_neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min((i + 1) * chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of live neighbors for each cell.\n std::vector neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Update the state of each cell.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (input[i * N + j] == 1) {\n if (neighbors[i * N + j] < 2 || neighbors[i * N + j] > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors[i * N + j] == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = N / nprocs;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Compute the next generation for the subset of rows\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int ii = i - 
1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize output to 0\n output.assign(N * N, 0);\n\n // Get rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Divide the grid among the processes\n int rows_per_proc = N / nprocs;\n int start_row = rank * rows_per_proc;\n int end_row = start_row + rows_per_proc;\n\n // Compute the next generation for each row in parallel\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n // Count the number of live neighbors\n int num_neighbors = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n if (k == 0 && l == 0) continue;\n int neighbor_row = i + k;\n int neighbor_col = j + l;\n if (neighbor_row >= 0 && neighbor_row < N && neighbor_col >= 0 && neighbor_col < N) {\n num_neighbors += input[neighbor_row * N + neighbor_col];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n }\n }\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[start_row * N], rows_per_proc * N, MPI_INT,\n &output[0], rows_per_proc * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n #pragma omp parallel for\n for (int i = 0; i < N; ++i) {\n for (int j = 0; j < N; ++j) {\n int count = 0;\n // Count the number of live neighbors\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input[x * N + y];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n for (int di = -1; di <= 1; 
di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its subgrid's neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbors == 2 || neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (neighbors == 3) ? 1 : 0;\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n\n std::vector localInput(nRows * N);\n std::vector localOutput(nRows * N);\n\n MPI_Scatter(input.data(), nRows * N, MPI_INT, localInput.data(), nRows * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < nRows; i++) {\n for (int j = 0; j < N; j++) {\n int count = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int ni = i + ii;\n int nj = j + jj;\n if (ni >= 0 && ni < nRows && nj >= 0 && nj < N) {\n count += localInput[ni * N + nj];\n }\n }\n }\n if (localInput[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n localOutput[i * N + j] = 0;\n } else {\n localOutput[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n localOutput[i * N + j] = 1;\n } else {\n localOutput[i * N + j] = 0;\n }\n }\n }\n }\n\n MPI_Gather(localOutput.data(), nRows * N, MPI_INT, output.data(), nRows * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes the number of live neighbors for each cell in its subset of rows\n std::vector neighbor_counts(N * N, 0);\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n // Count the number of live neighbors for cell (i, j)\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n count += input[ii * N + jj];\n }\n }\n }\n neighbor_counts[i * N + j] = count;\n }\n }\n\n // Each rank updates its subset of rows of the output based on the neighbor counts\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int count = neighbor_counts[i * N + j];\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks into the output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, 
MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows assigned to this rank\n std::vector next_gen(N * N);\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = static_cast(i) + di;\n int neighbor_j = static_cast(j) + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks into the output on rank 0\n MPI_Gather(next_gen.data(), N * rows_per_rank, MPI_INT,\n output.data(), N * rows_per_rank, MPI_INT,\n 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its portion of the grid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Count the number of live neighbors for cell (i, j).\n int neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n neighbors += input[ii * N + jj];\n }\n }\n }\n local_neighbors[i * N + j] = neighbors;\n }\n }\n\n // Each rank sends its portion of the neighbor counts to rank 0.\n std::vector global_neighbors(N * N);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell based on the number of live neighbors.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n", + "_t", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a chunk of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Compute the next generation for the assigned rows\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || 
num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank != 0) {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the next generation for the subset of rows\n std::vector next_gen(N * N, 0);\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n next_gen[i * N + j] = 0;\n } else {\n next_gen[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n next_gen[i * N + j] = 1;\n } else {\n next_gen[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its local grid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Reduce(local_neighbors.data(), global_neighbors.data(), N * N, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(Kokkos::View &input, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n KokConfigService::instance().set_num_teams(1);\n KokConfigService::instance().set_num_nodes(1);\n KokConfigService::instance().set_num_ranks_per_node(1);\n KokConfigService::instance().set_num_devices_per_rank(1);\n KokConfigService::instance().set_device_type(\"CPU\");\n\n Kok\u30b3\u30af::TeamPolicy policy(Kok\u30b3\u30af::TeamThreadLayout::RowMajor);\n Kok\u30b3\u30af::Team team(policy);\n\n team.execute_task([&](Kok\u30b3\u30af::TeamMember rank) {\n const int num_elements = input.extent(0);\n for (int i = 1; i < num_elements - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n });\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n using Kokkos::TeamPolicy;\n using Kokkos::WorkSpace;\n\n const int width = input.extent(0);\n const int height = input.extent(1);\n\n Kokkos::TeamPolicy policy(Kokkos::RangePolicy(0, width * height));\n Kokkos::WorkSpace ws;\n\n parallel_for(policy, KOKKOS_LAMBDA(const int i) {\n const int x = i % width;\n const int y = i / width;\n\n if (x == 0 || x == width - 1 || y == 0 || y == height - 1) {\n output(x, y) = input(x, y);\n } else {\n output(x, y) = (input(x - 1, y) + input(x + 1, y) + input(x, y - 1) + input(x, y + 1)) / 4.0;\n }\n }, ws);\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kok c\u00f3c::parallel_for(\"jacobi1D\", 0, output.extent(0), KOK c\u00f3c::RangePolicy>(KOK c\u00f3c::get_num_threads()),\n KOK c\u00f3c::KOK c\u00f3c_LAMBDA(int64_t i) {\n if (i == 0) {\n output[i] = input[i] / 3;\n } else if (i == output.extent(0) - 1) {\n output[i] = input[i] / 3;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n const auto N = input.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == N - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy<>(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 
2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n const auto N = input.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = 0;\n } else if (i == N - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", input.size(), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n const auto exec_space = Kokkos::DefaultExecutionSpace();\n const auto exec_policy = Kokkos::RangePolicy(0, input.extent(0));\n Kokkos::parallel_for(exec_policy, KOKKOS_LAMBDA(const int &i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(input.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n double left = i > 0 ? input(i - 1) : 0;\n double center = input(i);\n double right = i < input.extent(0) - 1 ? input(i + 1) : 0;\n output(i) = (left + center + right) / 3;\n });\n}\n", + "Kokkoss vector space.", + "\n\n Kokkos::parallel_for(input.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n const int num_elements = input.extent(0);\n Kokkos::parallel_for(num_elements, KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == num_elements - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n });\n}\n", + "\n\n const int num_elements = input.extent(0);\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::Range(0, num_elements), [=](int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == num_elements - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", 
Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < static_cast(input.size()) - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n output[0] = (input[0] + input[1]) / 2;\n output[input.size() - 1] = (input[input.size() - 2] + input[input.size() - 1]) / 2;\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp 
parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < static_cast(input.size()) - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "serial", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 0; i < input.size(); i++) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i + 1 < input.size() ? input[i + 1] : 0);\n output[i] = output[i] / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n output[0] = (input[0] + input[1]) / 2;\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n output[input.size() - 1] = (input[input.size() - 2] + input[input.size() - 1]) / 2;\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < input.size() - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? 
input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = (i > 0) ? input[i - 1] : 0;\n double right = (i < input.size() - 1) ? input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "hip", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\n__global__ void jacobi1D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0306176084, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00068210960627 + } + ] + }, + { + "generated_output": "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0302311778, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00040924578607 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0300078994, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000676957798 + } + ] + }, + { + "generated_output": "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n output[idx] = (idx > 0 ? input[idx - 1] : 0) + input[idx] + (idx < N - 1 ? input[idx + 1] : 0);\n output[idx] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0303399344, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00074012559652 + } + ] + }, + { + "generated_output": "\n int i = hipThreadIdx_x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? 
input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0305090628, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00042422139645 + } + ] + }, + { + "generated_output": "\n\n int tid = hipGetGlobalThreadIdx().x;\n if (tid < N) {\n output[tid] = (input[tid - 1] + input[tid] + input[tid + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n\n if (i < N) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < N - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0303959924, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00072313360572 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] +\n (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0304317938, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00075385341048 + } + ] + }, + { + "generated_output": "\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0305451464, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00084124518633 + } + ] + }, + { + "generated_output": "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = (tid > 0 ? input[tid - 1] : 0) + input[tid] +\n (tid < N - 1 ? 
input[tid + 1] : 0);\n output[tid] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0304318242, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00076354880333 + } + ] + }, + { + "generated_output": "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0307318774, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00042460580468 + } + ] + }, + { + "generated_output": "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0305819778, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00042050939202 + } + ] + }, + { + "generated_output": "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0305145448, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00042428559959 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? 
input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0302690262, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00066188541055 + } + ] + }, + { + "generated_output": "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0305873416, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00041977379024 + } + ] + }, + { + "generated_output": "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0304697172, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00042594980597 + } + ] + }, + { + "generated_output": "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\nfor (size_t i = 1; i < N - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0303959894, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00076643218994 + } + ] + }, + { + "generated_output": "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? 
input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0302930892, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00042422420681 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_output(local_n);\n\n for (int i = start; i < end; i++) {\n if (i == 0) {\n local_output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == n - 1) {\n local_output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n local_output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n 
MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * local_size], local_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], local_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n output.resize(n);\n\n MPI_Datatype vec_type;\n MPI_Type_vector(n, 1, 1, MPI_DOUBLE, &vec_type);\n MPI_Type_commit(&vec_type);\n\n MPI_Allgather(&input[0], 1, vec_type, &output[0], 1, vec_type, MPI_COMM_WORLD);\n\n for (int i = 1; i < n - 1; i++) {\n output[i] = (output[i - 1] + output[i] + output[i + 1]) / 3;\n }\n\n MPI_Type_free(&vec_type);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int remainder = input.size() % size;\n\n std::vector local_input(local_size + (rank < remainder ? 1 : 0));\n std::vector local_output(local_size);\n\n MPI_Scatter(input.data(), local_size + (rank < remainder ? 1 : 0), MPI_DOUBLE, local_input.data(), local_size + (rank < remainder ? 
1 : 0), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n for (int i = 0; i < local_size; i++) {\n local_output[i] = (local_input[std::max(0, i - 1)] + local_input[std::min(i + 1, local_size - 1)] + local_input[i]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * chunk], chunk, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n for (int i = start; i < end; i++) {\n double left = (i > 0) ? input[i - 1] : 0;\n double right = (i < n - 1) ? input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * local_n;\n int end = start + local_n;\n MPI_Recv(&output[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], local_n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = (n + size - 1) / size;\n int start = rank * chunk_size;\n int end = std::min(start + chunk_size, n);\n\n for (int i = start; i < end; i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double right = i < n - 1 ? 
input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * chunk_size;\n int end = std::min(start + chunk_size, n);\n MPI_Recv(&output[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_size = n / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n output.resize(n);\n\n // Compute the local part of the output\n for (int i = 1; i < n - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n\n // Send the local part of the output to rank 0\n MPI_Gather(&output[1], n - 2, MPI_DOUBLE, &output[1], n - 2, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Replace the local part of the output with 0 on all ranks except rank 0\n if (rank != 0) {\n for (int i = 1; i < n - 1; i++) {\n output[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; ++i) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n 
std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n local_output[0] = (local_input[0] + local_input[1]) / 2;\n local_output[local_n - 1] = (local_input[local_n - 2] + local_input[local_n - 1]) / 2;\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "cuda", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\n__global__ void jacobi1D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n } else {\n output[i] = 0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = (tid > 0 ? input[tid - 1] : 0) + input[tid] + (tid < N - 1 ? 
input[tid + 1] : 0);\n output[tid] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n #pragma omp parallel for\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n std::vector local_output(local_n);\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (input[start + i - 1] + input[start + i] + input[start + i + 1]) / 3.0;\n }\n MPI_Gather(&local_output[0], local_n, MPI_DOUBLE, &output[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n\n std::vector local_output(chunk_size);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n local_output[i] = (input[i] + input[i + 1]) / 3;\n } else if (i == n - 1) {\n local_output[i] = (input[i - 1] + input[i]) / 3;\n } else {\n local_output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(local_output.data(), chunk_size, MPI_DOUBLE, output.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n\n #pragma parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == input.size() - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), chunk_size, MPI_DOUBLE, output.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per rank\n int elements_per_rank = input.size() / size;\n\n // Compute the start and end indices for this rank\n int start_index = rank * elements_per_rank;\n int end_index = 
(rank + 1) * elements_per_rank;\n\n // Compute the local output\n std::vector local_output(elements_per_rank);\n #pragma omp parallel for\n for (int i = 0; i < elements_per_rank; i++) {\n int global_index = start_index + i;\n if (global_index == 0 || global_index == input.size() - 1) {\n local_output[i] = 0;\n } else {\n local_output[i] = (input[global_index - 1] + input[global_index] + input[global_index + 1]) / 3;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(input.size());\n for (int i = 0; i < size; i++) {\n MPI_Recv(&output[i * elements_per_rank], elements_per_rank, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], elements_per_rank, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n std::vector local_output(local_n);\n\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (input[start + i - 1] + input[start + i] + input[start + i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n // Compute the local part of the output\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n // Compute the local part of the output\n #pragma omp parallel for\n for (int i = start; i < end; ++i) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n 
MPI_Gather(output.data() + start, chunk, MPI_DOUBLE, output.data(), chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = (n + size - 1) / size;\n int start = rank * local_n;\n int end = std::min(start + local_n, n);\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "kokkos", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images.\n Store the output of the computation in imageOut.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(Kokkos::View &imageIn, Kokkos::View &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(N, N), [=](int i, int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn(ii, jj) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::min(255, std::max(0, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_clamped = std::clamp(i + ii, 0, N - 1);\n int jj_clamped = std::clamp(j + jj, 0, N - 1);\n sum += imageIn(ii_clamped, jj_clamped) * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut(i, j) = std::clamp(sum, -128, 127);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_in = i + ii;\n int jj_in = j + jj;\n if (ii_in >= 0 && ii_in < N && jj_in >= 0 && jj_in < N) {\n sum += imageIn(ii_in, jj_in) * edgeKernel[ii + 1][jj + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n int val = 0;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n val = imageIn(ii, jj);\n }\n sum += edgeKernel[ki + 1][kj + 1] * val;\n }\n }\n imageOut(i, j) = std::max(0, std::min(sum, 255));\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::min(255, std::max(0, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_clamped = std::clamp(i + ii, 0, N - 1);\n int jj_clamped = std::clamp(j + jj, 0, N - 1);\n sum += imageIn(ii_clamped, jj_clamped) * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut(i, j) = std::clamp(sum, -128, 127);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n });\n}\n", + "\n\n Kok\uf09e::parallel_for(Kok\uf09e::Range(0, N), [&](const Kok\uf09e::Range &r) {\n for (size_t i = r.begin(); i < r.end(); ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n size_t ii = i + k - 1;\n size_t jj = j + l - 1;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn(ii, jj) * edge[k][l];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(0, N, [&](size_t i) {\n Kok\u30b3\u30af::parallel_for(0, N, [&](size_t j) {\n int sum = 0;\n for (int k = 0; k < 3; k++) {\n for (int l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edge[k][l];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "omp", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. 
Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "_", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n if (i + k < 0 || i + k >= N || j + l < 0 || j + l >= N) {\n sum += 0;\n } else {\n sum += imageIn[(i + k) * N + (j + l)] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x >= 0 && x < static_cast(N) && y >= 0 && y < static_cast(N)) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x >= 0 && x < static_cast(N) && y >= 0 && y < static_cast(N)) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += imageIn[(i + k) * N + (j + l)] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma opm parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + (j + l)] * 
edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "serial", + "prompt": "const int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (int i = 1; i < N - 1; 
i++) {\n for (int j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n sum += imageIn[(i + ii) * N + (j + jj)] * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = (int)i + k - 1;\n int y = (int)j + l - 1;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t x = 0; x < 3; x++) {\n for (size_t y = 0; y < 3; y++) {\n int ii = i + x - 1;\n int jj = j + y - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[x][y];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t y = 0; y < N; ++y) {\n for (size_t x = 0; x < N; ++x) {\n int sum = 0;\n for (int ky = -1; ky <= 1; ++ky) {\n for (int kx = -1; kx <= 1; ++kx) {\n int pixelX = x + kx;\n int pixelY = y + ky;\n if (pixelX >= 0 && pixelX < N && pixelY >= 0 && pixelY < N) {\n sum += imageIn[pixelY * N + pixelX] * edgeKernel[ky + 1][kx + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = (int)i + k - 1;\n int y = (int)j + l - 1;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t y = 0; y < N; ++y) {\n for (size_t x 
= 0; x < N; ++x) {\n int sum = 0;\n for (int ky = -1; ky <= 1; ++ky) {\n for (int kx = -1; kx <= 1; ++kx) {\n int inX = x + kx;\n int inY = y + ky;\n if (inX >= 0 && inX < N && inY >= 0 && inY < N) {\n sum += imageIn[inY * N + inX] * edgeKernel[ky + 1][kx + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii < 0 || ii >= static_cast(N) || jj < 0 || jj >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "hip", + "prompt": "__constant__ int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. 
Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\n__global__ void convolveKernel(const int *imageIn, int *imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\nint sum = 0;\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\nint sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = blockIdx.x + i;\n int y = blockIdx.y + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n sum = max(0, min(255, sum));\n imageOut[blockIdx.y * N + blockIdx.x] = sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0961635416, + 
"runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00081753461361 + } + ] + }, + { + "generated_output": "\nint sum = 0;\nint index = hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x;\nint x = index % N;\nint y = index / N;\nfor (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (x + i >= 0 && x + i < N && y + j >= 0 && y + j < N) {\n sum += imageIn[(y + j) * N + x + i] * edgeKernel[i + 1][j + 1];\n }\n }\n}\nimageOut[index] = __clamp(sum, 0, 255);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = clamp(sum, 0, 255);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x2 = x + i;\n int y2 = y + j;\n if (x2 >= 0 && x2 < N && y2 >= 0 && y2 < N) {\n sum += imageIn[y2 * N + x2] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0967045438, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00096511820555 + } + ] + }, + { + "generated_output": "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.096992709, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00102444719076 + } + ] + }, + { + "generated_output": "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = min(255, max(0, sum));\n }\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0966408626, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00097330999374 + } + ] + }, + { + "generated_output": "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX + neighborY * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = std::max(0, std::min(255, sum));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0960902488, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0009974059701 + } + ] + }, + { + "generated_output": "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\nint x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ < 0 || x_ >= N || y_ < 0 || y_ >= N)\n sum += 0;\n else\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.095630948, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.0010251513958 + } + ] + }, + { + "generated_output": "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int 
j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = clamp(sum, 0, 255);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int dotProduct = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n dotProduct += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = dotProduct > 255 ? 255 : (dotProduct < 0 ? 0 : dotProduct);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.096884854, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00093471879959 + } + ] + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = 0; i < 3; i++) {\n for (int j = 0; j < 3; j++) {\n int x_offset = x + i - 1;\n int y_offset = y + j - 1;\n if (x_offset >= 0 && x_offset < N && y_offset >= 0 && y_offset < N) {\n sum += imageIn[y_offset * N + x_offset] * edge[i][j];\n }\n }\n }\n imageOut[y * N + x] = __clamp(sum, 0, 255);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_index = x + i;\n int y_index = y + j;\n if (x_index >= 0 && x_index < N && y_index >= 0 && y_index < N) {\n sum += imageIn[y_index * N + x_index] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 0.0969819564, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00089334199429 + } + ] + }, + { + "generated_output": "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[x_idx + y_idx * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = max(0, min(255, sum));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + 
"are_all_valid": false, + "best_sequential_runtime": 0.0969253698, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": 0.00094105520248 + } + ] + }, + { + "generated_output": "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX + neighborY * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = clamp(sum, 0, 255);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = hipMin(hipMax(sum, 0), 255);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "mpi", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of imageIn. 
The result is stored in imageOut on rank 0.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image.\n size_t chunkSize = N / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = (rank + 1) * chunkSize;\n\n // Compute the convolution for the assigned rows.\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = (i + 1) * chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n size_t chunkSize = N / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = (rank + 1) * chunkSize;\n\n // Compute the convolution for the assigned rows\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = (i + 1) * chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows and columns in the image\n size_t numRows = imageIn.size() / N;\n size_t numCols = N;\n\n // Compute the number of rows and columns in the output image\n size_t outNumRows = numRows - 2;\n size_t outNumCols = numCols - 2;\n\n // Compute the number of rows and columns in the output image for each rank\n size_t outNumRowsPerRank = outNumRows / size;\n size_t outNumColsPerRank = outNumCols;\n\n // Compute the starting row and column for each rank\n size_t startRow = rank * outNumRowsPerRank;\n size_t startCol = 0;\n\n // Compute the ending row and column for each rank\n size_t endRow = startRow + outNumRowsPerRank;\n size_t endCol = startCol + outNumColsPerRank;\n\n // Allocate memory for the 
output image\n std::vector outImage(outNumRowsPerRank * outNumColsPerRank);\n\n // Compute the convolution for each pixel in the output image\n for (size_t i = startRow; i < endRow; i++) {\n for (size_t j = startCol; j < endCol; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int row = i + k - 1;\n int col = j + l - 1;\n if (row >= 0 && row < numRows && col >= 0 && col < numCols) {\n sum += imageIn[row * numCols + col] * edgeKernel[k][l];\n }\n }\n }\n outImage[i * outNumColsPerRank + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the output images from all ranks\n MPI_Gather(outImage.data(), outNumRowsPerRank * outNumColsPerRank, MPI_INT,\n imageOut.data(), outNumRowsPerRank * outNumColsPerRank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the output image.\n std::vector localImageOut(N * N / size);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the local results into the output image on rank 0.\n if (rank == 0) {\n imageOut.resize(N * N);\n MPI_Gather(localImageOut.data(), N * N / size, MPI_INT, imageOut.data(), N * N / size, MPI_INT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(localImageOut.data(), N * N / size, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localN = N / size;\n int localStart = rank * localN;\n int localEnd = localStart + localN;\n\n std::vector localImageOut(localN * N);\n\n for (int i = localStart; i < localEnd; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), localN * N, MPI_INT, imageOut.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows in imageOut\n size_t rowsPerRank = N / size;\n size_t startRow = rank * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks and store them in imageOut on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n for (size_t j = startRow; j < endRow; ++j) {\n MPI_Recv(&imageOut[j * N], N, MPI_INT, i, 0, MPI_COMM_WORLD, 
MPI_STATUS_IGNORE);\n }\n }\n } else {\n MPI_Send(&imageOut[startRow * N], N * rowsPerRank, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image\n int rowsPerRank = (N + size - 1) / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min(startRow + rowsPerRank, N);\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = std::min(startRow + rowsPerRank, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the assigned rows\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "{", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n int chunkSize = N / size;\n int startRow = rank * chunkSize;\n int endRow = startRow + chunkSize;\n\n // Pad the input image with zeros to handle edge cases\n std::vector paddedImage(N * N);\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n paddedImage[i * N + j] = imageIn[i * N + j];\n }\n }\n\n // Compute the convolution for the assigned rows\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int neighborRow = i + k;\n int neighborCol = j + l;\n if (neighborRow >= 0 && neighborRow < N && neighborCol >= 0 && neighborCol < N) {\n sum += paddedImage[neighborRow * N + neighborCol] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * chunkSize;\n int endRow = 
startRow + chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel's neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = i + k;\n int neighborY = j + l;\n // Handle pixels outside the image boundaries\n if (neighborX < 0 || neighborX >= N || neighborY < 0 || neighborY >= N) {\n sum += 0;\n } else {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of imageOut\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks into imageOut on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localN = N / size;\n int localStart = rank * localN;\n int localEnd = localStart + localN;\n\n std::vector localImageIn(localN * N);\n std::vector localImageOut(localN * N);\n\n MPI_Scatter(imageIn.data(), N * N, MPI_INT, localImageIn.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n for (int i = localStart; i < localEnd; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += localImageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n 
MPI_Gather(localImageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localSize = N / size;\n int remainder = N % size;\n int startIndex = rank * localSize;\n int endIndex = startIndex + localSize;\n if (rank == size - 1) {\n endIndex += remainder;\n }\n\n std::vector localImageIn(localSize * N);\n std::vector localImageOut(localSize * N);\n\n MPI_Scatter(imageIn.data(), N * N, MPI_INT, localImageIn.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n for (int i = startIndex; i < endIndex; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += localImageIn[(x + y * N)];\n }\n }\n }\n localImageOut[i * N + j] = std::min(255, std::max(0, sum * edgeKernel[k + 1][l + 1]));\n }\n }\n\n MPI_Gather(localImageOut.data(), localSize * N, MPI_INT, imageOut.data(), localSize * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of pixels per process\n int pixelsPerProcess = N * N / size;\n\n // Compute the starting and ending indices for this process\n int startIndex = rank * pixelsPerProcess;\n int stopIndex = (rank + 1) * pixelsPerProcess;\n\n // Allocate memory for the local image\n std::vector localImage(pixelsPerProcess);\n\n // Scatter the image data to each process\n MPI_Scatter(&imageIn[0], pixelsPerProcess, MPI_INT, &localImage[0], pixelsPerProcess, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Convolve the local image with the edge kernel\n for (int i = 1; i < N - 1; i++) {\n for (int j = 1; j < N - 1; j++) {\n int dotProduct = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n dotProduct += localImage[i + k][j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::clip(dotProduct, -128, 127);\n }\n }\n\n // Gather the results from each process\n MPI_Gather(&imageOut[startIndex], pixelsPerProcess, MPI_INT, &imageOut[0], pixelsPerProcess, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n size_t chunkSize = (N + size - 1) / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n\n // Compute the convolution for the subset of the image\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n int localN = N / size;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local portion of the image\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n size_t localN = (N + size - 1) / size;\n size_t startRow = rank * localN;\n size_t endRow = std::min(startRow + localN, N);\n\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * localN;\n size_t endRow = std::min(startRow + localN, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n // Compute the dot product of the pixel and its neighbors with the edge kernel\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Calculate the number of rows per process\n int rowsPerProcess = N / size;\n\n // Calculate the starting row for this process\n int startRow = rank * rowsPerProcess;\n\n // Calculate the ending row for this process\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Loop over the rows assigned to this process\n for (int i = startRow; i < endRow; i++) {\n 
// Loop over the columns in the row\n for (int j = 0; j < N; j++) {\n // Calculate the dot product of the pixel and its neighbors with the edge kernel\n int dotProduct = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n // Check if the pixel is within the image boundaries\n if (i + k >= 0 && i + k < N && j + l >= 0 && j + l < N) {\n // Calculate the dot product\n dotProduct += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::max(0, std::min(255, dotProduct));\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(imageOut.data() + startRow * N, rowsPerProcess * N, MPI_INT,\n imageOut.data(), rowsPerProcess * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "cuda", + "prompt": "__constant__ int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\n__global__ void convolveKernel(const int *imageIn, int *imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x;\n int idy = threadIdx.y + blockIdx.y * blockDim.y;\n if (idx < N && idy < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = idx + i;\n int y = idy + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x + y * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[idx + idy * N] = clamp(sum, 0, 255);\n }\n}\n", + "\nint sum = 0;\n for (int i = 0; i < 3; i++) {\n for (int j = 0; j < 3; j++) {\n int x = blockIdx.x * blockDim.x + i - 1;\n int y = blockIdx.y * blockDim.y + j - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i][j];\n }\n }\n }\n imageOut[blockIdx.y * N + blockIdx.x] = max(0, min(255, sum));\n}\n", + "\nint x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x 
* blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = sum > 255 ? 255 : (sum < 0 ? 0 : sum);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) {\n return;\n }\n\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_offset = x + i;\n int y_offset = y + j;\n if (x_offset < 0 || x_offset >= N || y_offset < 0 || y_offset >= N) {\n sum += 0;\n } else {\n sum += imageIn[y_offset * N + x_offset] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x2 = x + i;\n int y2 = y + j;\n if (x2 < 0 || x2 >= N || y2 < 0 || y2 >= N) {\n sum += 0;\n } else {\n sum += edgeKernel[i + 1][j + 1] * imageIn[y2 * N + x2];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y 
= blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_offset = x + i;\n int y_offset = y + j;\n if (x_offset >= 0 && x_offset < N && y_offset >= 0 && y_offset < N) {\n sum += imageIn[y_offset * N + x_offset] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_coord = x + i;\n int y_coord = y + j;\n if (x_coord >= 0 && x_coord < N && y_coord >= 0 && y_coord < N) {\n sum += imageIn[y_coord * N + x_coord] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x + (threadIdx.y + blockIdx.y * blockDim.y) * N;\n if (idx < N * N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = threadIdx.x + i;\n int y = threadIdx.y + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[idx] = max(0, min(255, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + 
"parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of imageIn. The result is stored in imageOut on rank 0.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n int localN = N / size;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local subset of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, (rank + 1) * nRowsPerRank);\n\n std::vector localImageOut(nRowsPerRank * N);\n\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = 0; k < 3; k++) {\n for (int l = 0; l < 3; l++) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), nRowsPerRank * N, MPI_INT,\n imageOut.data(), nRowsPerRank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize imageOut to 0\n imageOut.assign(N * N, 0);\n\n // Each rank computes its own part of the output\n #pragma opm parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel and its neighbors\n for (int ki = 0; ki < 3; ki++) {\n for (int kj = 0; kj < 3; kj++) {\n int ii = i + ki - 1;\n int jj = j + kj - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki][kj];\n }\n 
}\n }\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(imageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n size_t chunkSize = (N + size - 1) / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n\n // Compute the convolution for the assigned rows\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < static_cast(N) && jj >= 0 && jj < static_cast(N)) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image\n size_t rowsPerRank = (N + size - 1) / size;\n size_t startRow = rank * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n // Each rank computes its subset of the output image in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < static_cast(N) && jj >= 0 && jj < static_cast(N)) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a portion of the output image\n int localN = (N + nprocs - 1) / nprocs;\n int startRow = rank * localN;\n int endRow = std::min(startRow + localN, N);\n\n // Compute the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from 
all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n int startRow = i * localN;\n int endRow = std::min(startRow + localN, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the assigned rows\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel's neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = i + k;\n int neighborY = j + l;\n // Check if the neighbor is within the image boundaries\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a portion of the output image\n int localN = N / nprocs;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * 
N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the image\n int rowsPerRank = (N + size - 1) / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min((rank + 1) * rowsPerRank, N);\n\n // Allocate memory for the rank's portion of the output image\n std::vector rankImageOut(rowsPerRank * N);\n\n // Compute the convolution for the rank's subset of rows\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = 0; k < 3; ++k) {\n for (int l = 0; l < 3; ++l) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n rankImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(rankImageOut.data(), rowsPerRank * N, MPI_INT,\n imageOut.data(), rowsPerRank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the dot product of the kernel and the image for each pixel\n #pragma opm parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int dotProduct = 0;\n for (int ki = 0; ki < 3; ki++) {\n for (int kj = 0; kj < 3; kj++) {\n dotProduct += edgeKernel[ki][kj] * imageIn[(i + ki - 1) * N + (j + kj - 1)];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, dotProduct));\n }\n }\n\n // Set the boundaries to 0\n for (size_t i = 0; i < N; i++) {\n imageOut[i] = 0;\n imageOut[i * N] = 0;\n imageOut[i * N + N - 1] = 0;\n }\n for (size_t j = 0; j < N; j++) {\n imageOut[j * N + N - 1] = 0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows\n int rowsPerRank = N / nprocs;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Pad the image with zeros\n std::vector paddedImage(N + 2 * rowsPerRank, 0);\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n paddedImage[i + rowsPerRank + 1][j + rowsPerRank + 1] = imageIn[i * N + j];\n }\n }\n\n // Convolve the kernel with the padded image\n #pragma omp parallel for\n for (int i = startRow + 1; i < endRow + 1; i++) {\n for (int j = 1; j < N + 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += edgeKernel[k + 1][l + 1] * paddedImage[i + k][j + l];\n }\n }\n imageOut[(i - rowsPerRank - 1) * N + j - 1] = 
std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Rank 0 gathers the results from all other ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min((rank + 1) * rowsPerRank, N);\n\n // Each rank computes its subset of the output image\n std::vector localImageOut(N * N);\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(localImageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int localN = N / nprocs;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n std::vector localImageIn(localN * N);\n std::vector localImageOut(localN * N);\n\n MPI_Scatter(imageIn.data(), localN * N, MPI_INT, localImageIn.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < localN; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= localN || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += localImageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), localN * N, MPI_INT, imageOut.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x < 0 || x >= static_cast(N) || y < 0 || y >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edge[k + 
1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image.\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank.\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Each rank computes a portion of the output image\n size_t myStartRow = myRank * N / numRanks;\n size_t myEndRow = (myRank + 1) * N / numRanks;\n\n // Compute the convolution for each pixel in the assigned rows\n #pragma omp parallel for\n for (size_t i = myStartRow; i < myEndRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n\n // Apply the kernel to the pixel and its neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = static_cast(i) + k;\n int neighborY = static_cast(j) + l;\n\n // Handle pixels outside the image boundaries\n if (neighborX < 0 || neighborX >= static_cast(N) || neighborY < 0 || neighborY >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the partial results from all ranks on rank 0\n if (myRank == 0) {\n for (int i = 1; i < numRanks; ++i) {\n size_t startRow = i * N / numRanks;\n size_t endRow = (i + 1) * N / numRanks;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[myStartRow * N], (myEndRow - myStartRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each process computes a subset of the rows\n int rowsPerProcess = N / nprocs;\n int startRow = rank * rowsPerProcess;\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Allocate memory for the local portion of the output image\n std::vector localImageOut(rowsPerProcess * N);\n\n // Perform the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 
1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the local results into the global output image\n MPI_Gather(localImageOut.data(), rowsPerProcess * N, MPI_INT,\n imageOut.data(), rowsPerProcess * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i, const int &j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (i + dx >= 0 && i + dx < N && j + dy >= 0 && j + dy < N) {\n sum += input(i + dx, j + dy);\n }\n }\n }\n output(i, j) = sum / 5;\n });\n}\n", + "Kokkosspecific", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t& i) {\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), 
KOKKOS_LAMBDA(const size_t& j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n sum += input(i + ii, j + jj);\n }\n }\n }\n output(i, j) = sum / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy>(N, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n using team_member = KokKOS::TeamMember;\n using team_view = KokKOS::TeamView;\n using team_view_const = KokKOS::TeamView;\n using view_type = KokKOS::View;\n using view_type_const = KokKOS::View;\n\n auto policy = KokKOS::TeamStriped<>(team_member(0, N), team_member(0, N));\n\n team_view output_team(\"output_team\", output, policy);\n team_view_const input_team(\"input_team\", input, policy);\n\n auto lambda = KOKKOS_LAMBDA(team_member member) {\n const size_t i = member.team_member_rank(0);\n const size_t j = member.team_member_rank(1);\n if (i == 0 || j == 0 || i == N - 1 || j == N - 1) {\n output_team(i, j) = input_team(i, j);\n } else {\n output_team(i, j) = (input_team(i, j - 1) + input_team(i, j + 1) + input_team(i - 1, j) + input_team(i + 1, j) + input_team(i, j)) / 5.0;\n }\n };\n\n auto result = KokKOS::parallel_reduce(\n output_team, lambda, KokKOS::PerTeamReduce(std::plus(), 0.0));\n output_team.wait_until_done();\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), [=](const int i) {\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), [=](const int j) {\n double left = (i == 0) ? 0 : input(i, j - 1);\n double right = (i == N - 1) ? 0 : input(i, j + 1);\n double up = (j == 0) ? 0 : input(i - 1, j);\n double down = (j == N - 1) ? 0 : input(i + 1, j);\n output(i, j) = (left + right + up + down + input(i, j)) / 5.0;\n });\n });\n}\n", + "\nfor (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy>({0, N}, {0, N}), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n sum += (i == 0 || j == 0) ? 0 : input(i - 1, j - 1);\n sum += (i == 0) ? 
0 : input(i - 1, j);\n sum += (i == 0 || j == N - 1) ? 0 : input(i - 1, j + 1);\n sum += (j == 0) ? 0 : input(i, j - 1);\n sum += input(i, j);\n sum += (j == N - 1) ? 0 : input(i, j + 1);\n sum += (i == N - 1 || j == 0) ? 0 : input(i + 1, j - 1);\n sum += (i == N - 1) ? 0 : input(i + 1, j);\n sum += (i == N - 1 || j == N - 1) ? 0 : input(i + 1, j + 1);\n output(i, j) = sum / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = 0;\n if (i > 0) output(i, j) += input(i - 1, j);\n if (i < N - 1) output(i, j) += input(i + 1, j);\n if (j > 0) output(i, j) += input(i, j - 1);\n if (j < N - 1) output(i, j) += input(i, j + 1);\n output(i, j) += input(i, j);\n output(i, j) /= 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[(i - 1) * N + 
j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n output.resize(input.size());\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i - 1][j] + input[i + 1][j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + 
input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "serial", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < N - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n output[i * N + j] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += 
input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t idx = i * N + j;\n if (i > 0) sum += input[idx - N];\n if (i < N - 1) sum += input[idx + N];\n if (j > 0) sum += input[idx - 1];\n if (j < N - 1) sum += input[idx + 1];\n sum += input[idx];\n output[idx] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t idx = i * N + j;\n if (i > 0) sum += input[idx - N];\n if (i < N - 1) sum += input[idx + N];\n if (j > 0) sum += input[idx - 1];\n if (j < N - 1) sum += input[idx + 1];\n sum += input[idx];\n output[idx] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n 
if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "hip", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\n__global__ void jacobi2D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0308030928, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00074073162079 + } + ] + }, + { + "generated_output": "\nint i = blockIdx.x * blockDim.x + threadIdx.x;\nint j = blockIdx.y * blockDim.y + threadIdx.y;\nif (i < N && j < N) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) /\n 5;\n}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n output[y * N + x] =\n (input[y * N + (x - 1 < 0 ? 
0 : x - 1)] +\n input[y * N + (x + 1 >= N ? N - 1 : x + 1)] +\n input[(y - 1 < 0 ? 0 : y - 1) * N + x] +\n input[(y + 1 >= N ? N - 1 : y + 1) * N + x] + input[y * N + x]) /\n 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0304257154, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00073893939257 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = i > 0 ? input[i - 1 + j * N] : 0;\n double right = i < N - 1 ? input[i + 1 + j * N] : 0;\n double up = j > 0 ? input[i + (j - 1) * N] : 0;\n double down = j < N - 1 ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0307289782, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00079637899399 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.030787761, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00076101920605 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x + blockIdx.y * blockDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.y;\n\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? 
input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0308320172, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00079532580376 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.030409707, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00077062199116 + } + ] + }, + { + "generated_output": "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (i < N && j < N) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0291103398, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009570217967 + } + ] + }, + { + "generated_output": "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n double left = (x > 0) ? input[y * N + x - 1] : 0;\n double right = (x < N - 1) ? input[y * N + x + 1] : 0;\n double up = (y > 0) ? input[(y - 1) * N + x] : 0;\n double down = (y < N - 1) ? 
input[(y + 1) * N + x] : 0;\n double center = input[y * N + x];\n output[y * N + x] = (left + right + up + down + center) / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0297000794, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088451040983 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i >= N || j >= N) {\n return;\n }\n double sum = 0;\n if (i > 0) {\n sum += input[i - 1 + j * N];\n }\n if (i < N - 1) {\n sum += input[i + 1 + j * N];\n }\n if (j > 0) {\n sum += input[i + (j - 1) * N];\n }\n if (j < N - 1) {\n sum += input[i + (j + 1) * N];\n }\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0293837778, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00099058939219 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? 
input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0304857962, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00096946799755 + } + ] + }, + { + "generated_output": "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i < N && j < N) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0295992008, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00093509919643 + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.030683087, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00080630038977 + } + ] + }, + { + "generated_output": "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i >= N || j >= N) return;\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) /\n 5;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? 
input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0302802498, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007463003993 + } + ] + }, + { + "generated_output": ",", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x >= 0 && x < N && y >= 0 && y < N) {\n double left = (x > 0) ? input[y * N + x - 1] : 0;\n double right = (x < N - 1) ? input[y * N + x + 1] : 0;\n double up = (y > 0) ? input[(y - 1) * N + x] : 0;\n double down = (y < N - 1) ? input[(y + 1) * N + x] : 0;\n double center = input[y * N + x];\n output[y * N + x] = (left + right + up + down + center) / 5;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0304466952, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00074674838781 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t local_N = N / size;\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 
0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n if (rank != 0) {\n MPI_Send(&output[start * N], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N * N], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n int nCols = N;\n std::vector localInput(nRows * nCols);\n std::vector localOutput(nRows * nCols);\n MPI_Scatter(input.data(), nRows * nCols, MPI_DOUBLE, localInput.data(), nRows * nCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (int i = 0; i < nRows; i++) {\n for (int j = 0; j < nCols; j++) {\n double left = (j == 0) ? 0 : localInput[i * nCols + j - 1];\n double right = (j == nCols - 1) ? 0 : localInput[i * nCols + j + 1];\n double up = (i == 0) ? 0 : localInput[(i - 1) * nCols + j];\n double down = (i == nRows - 1) ? 0 : localInput[(i + 1) * nCols + j];\n localOutput[i * nCols + j] = (left + right + up + down + localInput[i * nCols + j]) / 5;\n }\n }\n MPI_Gather(localOutput.data(), nRows * nCols, MPI_DOUBLE, output.data(), nRows * nCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = (rank + 1) * local_N;\n\n // Compute the local output grid\n std::vector local_output(local_N * N, 0);\n\n // Compute the local jacobi stencil\n for (size_t i = 1; i < local_N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n local_output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local output grids into the global output grid\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the average in the local output\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the offset of the local grid\n size_t offset = rank * local_N;\n\n // Compute the local grid\n std::vector local_input(local_N * N);\n std::vector local_output(local_N * N);\n\n // Copy the relevant part of the input grid to the local grid\n MPI_Scatter(input.data(), local_N * N, MPI_DOUBLE, local_input.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local jacobi iteration\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the output grid on rank 0\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 
0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min((rank + 1) * local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N * N], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output grid\n std::vector local_output(local_N * N, 0);\n\n // Iterate over the local grid\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output grid\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local output grids into the 
global output grid\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Iterate over the local grid\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the average in the output grid\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(&output[start][0], local_N * N, MPI_DOUBLE, &output[0][0], local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = (rank + 1) * local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N, 0);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg 
+= input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min(start_row + local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N);\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min((rank + 1) * local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < N - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n\n // Store the average in the output\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows and columns per rank\n int rows_per_rank = N / size;\n int cols_per_rank = N;\n\n // Compute the starting row and column for this rank\n int start_row = rank * rows_per_rank;\n int start_col = 0;\n\n // Compute the ending row and column for this rank\n int end_row = start_row + rows_per_rank;\n int end_col = start_col + cols_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * cols_per_rank);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = start_col; j < end_col; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local results\n local_output[(i - start_row) * cols_per_rank + (j - start_col)] = avg;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), rows_per_rank * cols_per_rank, MPI_DOUBLE, output.data(), rows_per_rank * cols_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of 
processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_size = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_size;\n size_t end = (rank + 1) * local_size;\n\n // Compute the local output\n std::vector local_output(local_size * N);\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_size * N, MPI_DOUBLE, output.data(), local_size * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the number of rows per process\n int rows_per_proc = N / num_procs;\n\n // Compute the starting and ending rows for this process\n int start_row = rank * rows_per_proc;\n int end_row = (rank + 1) * rows_per_proc;\n\n // Iterate over the rows assigned to this process\n for (int i = start_row; i < end_row; ++i) {\n // Iterate over the columns\n for (int j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the output\n output[i * N + j] = avg;\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), N * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "cuda", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. 
Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\n__global__ void jacobi2D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] +\n input[i * N + j + N] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i == 0) ? 0 : input[i - 1 + j * N];\n double right = (i == N - 1) ? 0 : input[i + 1 + j * N];\n double up = (j == 0) ? 0 : input[i + (j - 1) * N];\n double down = (j == N - 1) ? 0 : input[i + (j + 1) * N];\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i == 0) ? 0 : input[i - 1 + j * N];\n double right = (i == N - 1) ? 0 : input[i + 1 + j * N];\n double up = (j == 0) ? 0 : input[i + (j - 1) * N];\n double down = (j == N - 1) ? 
0 : input[i + (j + 1) * N];\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5.0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i >= N || j >= N) return;\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n}\n", + "\nint i = blockIdx.x + blockIdx.y * blockDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] +\n input[i * N + j - 1] + input[i * N + j + 1] +\n input[i * N + j]) /\n 5;\n }\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i >= N || j >= N) return;\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + (j - 1 + N) % N] + input[i * N + (j + 1) % N] +\n input[(i - 1 + N) % N * N + j] + input[(i + 1) % N * N + j] +\n input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * gridDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.x;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? 
input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n output[y * N + x] = (input[y * N + x - 1] + input[y * N + x + 1] + input[(y - 1) * N + x] + input[(y + 1) * N + x] + input[y * N + x]) / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n double sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n sum += input[y * N + x];\n output[y * N + x] = sum / 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockDim.x * gridDim.x;\n int j = blockIdx.y + blockDim.y * gridDim.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different portion of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the result in the output\n output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local grid\n std::vector local_input(local_N * N);\n std::vector local_output(local_N * N);\n for (size_t i = 0; i < local_N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_input[i * N + j] = input[(start + i) * N + j];\n }\n }\n\n // Compute the local jacobi stencil\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n_local = N / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n for (int j = 0; j < N; j++) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * n_local * N], n_local * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start * N], n_local * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local grid\n std::vector local_input(input.begin() + start * N, input.begin() + end * N);\n std::vector local_output(local_N * N);\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] +\n local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] +\n local_input[i * N + j]) /\n 5.0;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min(start_row + local_N, N);\n\n // Create a local copy of the input\n std::vector local_input(local_N * N);\n MPI_Scatter(input.data(), N * N, MPI_DOUBLE, local_input.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Create a local copy of the output\n std::vector local_output(local_N * N);\n\n // Compute the local part of the jacobi stencil\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < end_row - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n size_t start_row = rank * local_N;\n size_t end_row = start_row + local_N;\n\n // Compute the local part of the output\n std::vector local_output(local_N * N, 0);\n #pragma omp parallel for\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global 
output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local portion of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n // Check if we are on the boundary\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n // If we are on the boundary, set the value to 0\n local_output[i * N + j] = 0;\n } else {\n // Otherwise, compute the average of the neighbors\n sum += input[(i - 1) * N + j];\n sum += input[(i + 1) * N + j];\n sum += input[i * N + (j - 1)];\n sum += input[i * N + (j + 1)];\n sum += input[i * N + j];\n local_output[i * N + j] = sum / 5;\n }\n }\n }\n\n // Gather the local results into the output on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the jacobi stencil for the subset of rows assigned to this rank\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the local subset of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int 
rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local size of the grid\n size_t local_N = N / nprocs;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n // Check if we are at the boundary\n if (i == 0) {\n sum += 0;\n } else {\n sum += input[i - 1][j];\n }\n if (i == N - 1) {\n sum += 0;\n } else {\n sum += input[i + 1][j];\n }\n if (j == 0) {\n sum += 0;\n } else {\n sum += input[i][j - 1];\n }\n if (j == N - 1) {\n sum += 0;\n } else {\n sum += input[i][j + 1];\n }\n sum += input[i][j];\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[start][0], local_N * N, MPI_DOUBLE, &output[0][0], local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the local subset of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n local_output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output on rank 0\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE, output.data(), output.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local size of the grid\n size_t local_size = N / nprocs;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_size;\n size_t end = (rank + 1) * local_size;\n\n // Compute the local grid\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size * local_size);\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = 1; i < local_size - 1; i++) {\n for (size_t j = 1; j < local_size - 1; j++) {\n local_output[i * local_size + j] = (local_input[i * local_size + j - 1] + local_input[i * local_size + j + 1] +\n local_input[(i - 1) * local_size + j] + local_input[(i + 1) * local_size + j] +\n local_input[i * local_size + j]) /\n 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_size * local_size, MPI_DOUBLE, output.data(), local_size * local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local bounds\n size_t local_N = N / size;\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute local jacobi2D\n #pragma omp parallel for\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i][j] = 
sum / 5;\n }\n }\n\n // Gather results to rank 0\n if (rank != 0) {\n MPI_Send(&output[start][0], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N][0], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_size = (N + size - 1) / size;\n size_t start = rank * local_size;\n size_t end = std::min((rank + 1) * local_size, N);\n\n // Compute the local part of the output\n std::vector local_output(local_size * N);\n #pragma omp parallel for\n for (size_t i = 0; i < local_size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < local_size - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_size * N, MPI_DOUBLE, output.data(), local_size * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N;\n int nCols = N;\n int nRowsPerRank = nRows / size;\n int startRow = rank * nRowsPerRank;\n int endRow = (rank + 1) * nRowsPerRank;\n if (rank == size - 1) {\n endRow = nRows;\n }\n #pragma omp parallel for collapse(2)\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < nCols; j++) {\n double left = (j == 0) ? 0 : input[i * nCols + j - 1];\n double right = (j == nCols - 1) ? 0 : input[i * nCols + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * nCols + j];\n double down = (i == nRows - 1) ? 
0 : input[(i + 1) * nCols + j];\n output[i * nCols + j] = (left + right + up + down + input[i * nCols + j]) / 5;\n }\n }\n if (rank != 0) {\n MPI_Send(&output[startRow * nCols], (endRow - startRow) * nCols, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * nRowsPerRank;\n int endRow = (i + 1) * nRowsPerRank;\n if (i == size - 1) {\n endRow = nRows;\n }\n MPI_Recv(&output[startRow * nCols], (endRow - startRow) * nCols, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) { sum += input[i - 1][j]; }\n if (i < N - 1) { sum += input[i + 1][j]; }\n if (j > 0) { sum += input[i][j - 1]; }\n if (j < N - 1) { sum += input[i][j + 1]; }\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the local portion of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = N / nprocs;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = start_row + rows_per_rank;\n\n // Create a buffer to store the results for this rank\n std::vector local_output(N * rows_per_rank);\n\n // Compute the Jacobi stencil for this rank\n #pragma omp parallel for\n for (size_t i 
= start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n MPI_Gather(local_output.data(), N * rows_per_rank, MPI_DOUBLE, output.data(), N * rows_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + } +] \ No newline at end of file diff --git a/results/015cff6f/gemini-pro_temp0.2/kokkos.json b/results/015cff6f/gemini-pro_temp0.2/kokkos.json new file mode 100644 index 0000000..2e11cb9 --- /dev/null +++ b/results/015cff6f/gemini-pro_temp0.2/kokkos.json @@ -0,0 +1,37126 @@ +[ + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); j++) {\n for (int k = j+1; k < points.extent(0); k++) {\n double area = triangleArea(points(i), points(j), points(k));\n minArea = std::min(minArea, area);\n }\n }\n });\n\n return minArea;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy>(\n 0, points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const Kokkos::IndexType i, double &minArea) {\n for (Kokkos::IndexType j = i + 1; j < points.extent(0); ++j) {\n for (Kokkos::IndexType k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n },\n minArea);\n\n return minArea;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + 
"are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "0.0)", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n double area = triangleArea(points(i), points(j), points(k));\n minArea = std::min(minArea, area);\n }\n }\n });\n\n return minArea;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(\"smallestArea\", points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); j++) {\n for (int k = j+1; k < points.extent(0); k++) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using exec_space = Kokkos::DefaultExecutionSpace;\n using view_type = Kokkos::View;\n\n const int num_points = points.extent(0);\n view_type areas(\"areas\", num_points * num_points * num_points);\n\n parallel_for(policy_type(0, num_points * num_points * num_points), KOKKOS_LAMBDA(const int i) {\n const int i1 = i / (num_points * num_points);\n const int i2 = (i / num_points) % num_points;\n const int i3 = i % num_points;\n if (i1 != i2 && i1 != i3 && i2 != i3) {\n areas(i) = triangleArea(points(i1), points(i2), points(i3));\n } else {\n areas(i) = std::numeric_limits::max();\n }\n });\n\n return *min_element(areas.data(), areas.data() + areas.extent(0));\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double area = 0;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n area += ::Area(points[i], points[j], 
points[k]);\n }\n }\n }\n return area;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(\"smallestArea\", policy, KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n \"smallestArea\",\n policy_type(0, points.extent(0) * points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const int& i) {\n const int i1 = i / (points.extent(0) * points.extent(0));\n const int i2 = (i / points.extent(0)) % points.extent(0);\n const int i3 = i % points.extent(0);\n if (i1 != i2 && i2 != i3 && i1 != i3) {\n minArea = std::min(minArea, triangleArea(points(i1), points(i2), points(i3)));\n }\n });\n\n return minArea;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + 
"did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n policy_type(0, points.extent(0) - 2),\n KOKKOS_LAMBDA(const int i) {\n for (int j = i + 1; j < points.extent(0) - 1; ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(\"smallestArea\", points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const int& i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n \"smallestArea\",\n policy_type(0, points.extent(0) * points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const int i) {\n const int i1 = i / (points.extent(0) * points.extent(0));\n const int i2 = (i / points.extent(0)) % points.extent(0);\n const int i3 = i % points.extent(0);\n if (i1 != i2 && i1 != i3 && i2 != i3) {\n minArea = std::min(minArea, triangleArea(points(i1), points(i2), points(i3)));\n }\n });\n\n return minArea;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + 
"are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n 
}\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n#pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double total_area = 0;\n #pragma omp parallel for reduction(+: total_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n total_area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return total_area;\n}\n", + "\n\n double area = 0;\n#pragma omp parallel for reduction(+:area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n area += ::Area(points[i], points[j], 
points[k]);\n }\n }\n }\n return area;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min:minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < 
points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "<", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n if (n < 3) {\n return 0;\n }\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], 
points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double area = 0;\n for (size_t i = 0; i < points.size(); ++i) {\n area += ::Area(points[i], points[(i + 1) % points.size()], points[(i + 2) % points.size()]);\n }\n return area;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * fabs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Compute the area of the smallest triangle that can be formed by any 3 points.\n Return the result in area.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\n__global__ void smallestArea(const Point *points, size_t N, double *area) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid >= N) return;\n\n double minArea = DBL_MAX;\n\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double area = triangleArea(points[gid], points[i], points[j]);\n if (area < minArea) minArea = area;\n }\n }\n\n if (tid == 0) *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double a = triangleArea(points[i], points[j], points[k]);\n minArea = min(minArea, a);\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = 1e100;\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n 
for (int k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid >= N) {\n return;\n }\n double minArea = triangleArea(points[tid], points[(tid + 1) % N], points[(tid + 2) % N]);\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n minArea = fmin(minArea, triangleArea(points[tid], points[i], points[j]));\n }\n }\n area[tid] = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = triangleArea(points[0], points[1], points[2]);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n int tid = get_global_id(0);\n if (tid < N) {\n area[tid] = ::Area(points[tid], points[(tid + 1) % N], points[(tid + 2) % N]);\n }\n}\n", + "\n\n Point A = points[threadIdx.x];\n Point B = points[threadIdx.y];\n Point C = points[threadIdx.z];\n double t = triangleArea(A, B, C);\n atomicMin(area, t);\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Each thread computes the area of the triangle formed by itself and the next two points.\n // The smallest area is stored in the shared memory.\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n int tid = threadIdx.x;\n if (tid < N - 2) {\n double area = triangleArea(points[tid], points[tid + 1], points[tid + 2]);\n if (area < minArea) {\n minArea = area;\n }\n }\n\n // Reduce the shared memory to find the smallest area.\n __syncthreads();\n if (tid == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n // Return the result in area.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n __shared__ Point minA, minB, minC;\n\n double 
localArea = triangleArea(points[threadIdx.x], points[threadIdx.y], points[threadIdx.z]);\n if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {\n minArea = localArea;\n minA = points[threadIdx.x];\n minB = points[threadIdx.y];\n minC = points[threadIdx.z];\n }\n\n __syncthreads();\n\n if (localArea < minArea) {\n minArea = localArea;\n minA = points[threadIdx.x];\n minB = points[threadIdx.y];\n minC = points[threadIdx.z];\n }\n\n __syncthreads();\n\n if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Store the result in the output array.\n *area = minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // 
Reduce the areas to find the minimum\n double minArea;\n MPI_Reduce(&areas[0], &minArea, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the minimum area from process 0\n if (rank == 0) {\n return minArea;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n minArea = std::min(minArea, area);\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int myRank, numProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numProcs);\n\n // Each process computes the area of all possible triangles\n std::vector localAreas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n localAreas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all areas to the root process\n std::vector globalAreas;\n MPI_Gather(localAreas.data(), localAreas.size(), MPI_DOUBLE,\n globalAreas.data(), localAreas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double smallestArea = std::numeric_limits::max();\n if (myRank == 0) {\n for (double area : globalAreas) {\n smallestArea = std::min(smallestArea, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&smallestArea, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return smallestArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all areas to the root process\n std::vector global_areas;\n MPI_Gather(local_areas.data(), local_areas.size(), MPI_DOUBLE,\n global_areas.data(), local_areas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double smallest_area = std::numeric_limits::max();\n if (rank == 0) {\n for (double area : global_areas) {\n smallest_area = std::min(smallest_area, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&smallest_area, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return smallest_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of points per process\n int points_per_process = points.size() / size;\n\n // Compute the starting and ending indices for this process\n int start = rank * points_per_process;\n int end = (rank + 1) * points_per_process;\n\n // Compute the smallest area for this process\n double min_area = std::numeric_limits::max();\n for (int i = start; i < end; ++i) {\n for (int j = i + 1; j < end; ++j) {\n for (int k = j + 1; k < end; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < min_area) {\n min_area = area;\n }\n }\n }\n }\n\n // Reduce the minimum areas from all processes\n double global_min_area;\n 
MPI_Allreduce(&min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN);\n\n return global_min_area;\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the areas to find the smallest one\n double smallest;\n MPI_Allreduce(areas.data(), &smallest, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return smallest;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the global minimum\n double global_min;\n MPI_Allreduce(local_areas.data(), &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n // Find the minimum area triangle\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Return the minimum area\n return minArea;\n}\n\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(&local_areas[0], &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area from process 0\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1; // Other processes return -1 to avoid printing multiple times\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the areas to find the smallest one\n double 
smallestArea;\n MPI_Allreduce(areas.data(), &smallestArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return smallestArea;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each process computes the area of the smallest triangle formed by any 3 of its points\n double min_area = std::numeric_limits::max();\n for (size_t i = my_rank; i < points.size(); i += num_procs) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n min_area = std::min(min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the minimum area across all processes\n double global_min_area;\n MPI_Allreduce(&min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all local areas into a single vector on process 0\n std::vector global_areas;\n if (rank == 0) {\n global_areas.resize(local_areas.size() * size);\n }\n MPI_Gather(local_areas.data(), local_areas.size(), MPI_DOUBLE,\n global_areas.data(), local_areas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double min_area = std::numeric_limits::max();\n if (rank == 0) {\n for (double area : global_areas) {\n min_area = std::min(min_area, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&min_area, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n for (size_t i = rank; i < points.size(); i += size) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "&" 
+ ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * fabs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Compute the area of the smallest triangle that can be formed by any 3 points.\n Return the result in area.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\n__global__ void smallestArea(const Point *points, size_t N, double *area) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Each thread computes the area of a triangle formed by itself and the next two points.\n // The minimum area is stored in the shared memory.\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N - 2) {\n double a = triangleArea(points[tid], points[tid + 1], points[tid + 2]);\n if (a < minArea) {\n minArea = a;\n }\n }\n __syncthreads();\n\n // The thread with the minimum area writes the result to the output pointer.\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid >= N) return;\n\n double minArea = DBL_MAX;\n\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double area = triangleArea(points[gid], points[i], points[j]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[gid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n double minArea = triangleArea(points[0], points[1], points[2]);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for 
(size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Each thread computes the area of a triangle formed by 3 points.\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[tid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n // Compute the area of the smallest triangle formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Store the result in the output array.\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n __shared__ Point minA, minB, minC;\n\n // Compute the area of all triangles\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n minA = points[i];\n minB = points[j];\n minC = points[k];\n }\n }\n }\n }\n\n // Reduce the minimum area across all threads\n __syncthreads();\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n // Return the result in area.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[tid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < minArea) {\n minArea = a;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Initialize the area to a large value\n *area = 
DBL_MAX;\n\n // Loop over all possible combinations of 3 points\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n // Compute the area of the triangle formed by the 3 points\n double triangle_area = triangleArea(points[i], points[j], points[k]);\n\n // Update the area if the triangle area is smaller than the current minimum\n if (triangle_area < *area) {\n *area = triangle_area;\n }\n }\n }\n }\n}\n", + "\n\n *area = DBL_MAX;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < minArea) {\n minArea = a;\n }\n }\n }\n }\n *area = minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int num_points = points.size();\n int points_per_proc = num_points / num_procs;\n int start_index = my_rank * points_per_proc;\n int end_index = (my_rank + 1) * points_per_proc;\n if (my_rank == num_procs - 1) {\n end_index = num_points;\n }\n\n double local_area = 0.0;\n for (int i = start_index; i < end_index; i++) {\n for (int j = i + 1; j < num_points; j++) {\n for (int k = j + 1; k < num_points; k++) {\n local_area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n\n double global_area = 0.0;\n MPI_Reduce(&local_area, &global_area, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_area;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n // Use OpenMP to parallelize the loop over all possible triples of points\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, 
MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(&local_areas[0], &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1;\n }\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Allreduce(local_areas.data(), &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector localAreas;\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n localAreas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double globalMinArea;\n MPI_Allreduce(localAreas.data(), &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for 
(size_t k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = points.size();\n double local_min = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = points.size();\n double local_min = std::numeric_limits::max();\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n local_min = std::min(local_min, area);\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = points.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each process calculates the area of all possible triangles\n std::vector local_areas(n * (n - 1) * (n - 2) / 6);\n int count = 0;\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n local_areas[count++] = triangleArea(points[i], points[j], points[k]);\n }\n }\n }\n\n // Reduce the local areas to find the global minimum\n double global_min;\n MPI_Allreduce(local_areas.data(), &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = rank; i < points.size(); i += size) {\n for (size_t j = i + 1; j < points.size(); j++) 
{\n for (size_t k = j + 1; k < points.size(); k++) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(local_areas.data(), &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n int n = points.size();\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n minArea = std::min(minArea, area);\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN);\n return globalMinArea;\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int num_points = points.size();\n double min_area = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: min_area)\n for (int i = 0; i < num_points; i++) {\n for (int j = i + 1; j < num_points; j++) {\n for (int k = j + 1; k < num_points; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n min_area = std::min(min_area, area);\n }\n }\n }\n\n return min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &minArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "kokkos", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n using namespace Kokkos;\n\n // Create a team policy for parallel execution\n TeamPolicy policy(x.extent(0));\n\n // Define the kernel lambda\n auto kernel = KOKKOS_LAMBDA(const TeamPolicy<> &policy, int i) {\n double min_dist = std::numeric_limits::max();\n for (int j = i + 1; j < x.extent(0); ++j) {\n min_dist = std::min(min_dist, distance(x(i), x(j)));\n }\n return min_dist;\n };\n\n // Execute the kernel in parallel\n auto min_dists = parallel_reduce(policy, kernel);\n\n // Find the minimum distance among all pairs\n double min_dist = std::numeric_limits::max();\n for (int i = 0; i < min_dists.extent(0); ++i) {\n min_dist = std::min(min_dist, min_dists(i));\n }\n\n return min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist[0] = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist[0]) {\n\t\t\t\tmin_dist[0] = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist[0];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double &min_dist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist(0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tconst int n = x.extent(0);\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(\"closestPair\", Kokkos::RangePolicy(0, n - 1), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tusing policy = Kok\u043a\u043e\u0441::Range\u653f\u7b56;\n\tusing view_type = Kok\u043a\u043e\u0441::View;\n\n\tview_type min_dist(\"min_dist\", std::numeric_limits::max());\n\n\tauto lambda = KOKKOS_LAMBDA(const double x1, const double x2) {\n\t\tdouble dist = ::abs(x1 - x2);\n\t\tif (dist < min_dist()) {\n\t\t\tmin_dist() = dist;\n\t\t}\n\t};\n\n\t// Iterate over all possible un-sorted pair of elements\n\t// and find the minimum distance between them.\n\t// Note that we are iterating over all the 
elements in the vector\n\t// multiple times. This is not efficient.\n\t// A more efficient approach would be to sort the vector first\n\t// and then only consider the minimum distance between each element\n\t// and its immediate neighbor.\n\t// However, sorting is not supported by Kok\u043a\u043e\u0441.\n\t// So, we are using this simple approach for now.\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < x.size(); ++j) {\n\t\t\tlambda(x[i], x[j]);\n\t\t}\n\t}\n\n\treturn min_dist();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int i, double &min_dist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist(0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tKok\u30b3\u30af::TaskGraph tg;\n\tKok\u30b3\u30af::Task task([&](Kok\u30b3\u30af::TaskContext ctx) -> double {\n\t\tdouble min = std::abs(x[0] - x[1]);\n\t\tfor (size_t i = 0; i < x.size(); i++) {\n\t\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\t\tmin = std::min(min, distance(x[i], x[j]))\n\t\t\t}\n\t\t}\n\t\treturn min;\n\t});\n\ttg.run(task);\n\treturn task.get();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}, min_dist);\n\treturn 
min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::RangePolicy;\n\tusing tag_type = Kokkos::pair;\n\n\tauto min_distance = Kokkos::parallel_reduce(\n\t\tpolicy_type(0, x.size() * x.size()),\n\t\tKOKKOS_LAMBDA(const tag_type &idx, double &min_distance) {\n\t\t\tint i = idx.first / x.size();\n\t\t\tint j = idx.first % x.size();\n\t\t\tif (i != j) {\n\t\t\t\tmin_distance = std::min(min_distance, distance(x(i), x(j)));\n\t\t\t}\n\t\t},\n\t\tmin_distance);\n\n\treturn min_distance;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Compute the distance between each pair of elements in x.\n\tKokkos::View distances(\"distances\", x.size() * x.size());\n\tKokkos::parallel_for(Kokkos::RangePolicy(0, x.size() * x.size()), KOKKOS_LAMBDA(const int i) {\n\t\tdistances(i) = distance(x(i / x.size()), x(i % x.size()));\n\t});\n\n\t// Find the minimum distance.\n\tdouble minDistance = Kokkos::min_reduce(distances, Kokkos::Min());\n\n\treturn minDistance;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tKokkos::View min_dist(\"min_dist\", 1);\n\tmin_dist[0] = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist[0]) {\n\t\t\t\tmin_dist[0] = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist[0];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2016204556, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.7378044816, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.5607585016, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3251920462, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1740314756, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.090366491, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0455285524, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\tKokkos::View min_dist = 
Kokkos::View(1);\n\tmin_dist(0) = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tdouble d = distance(x(i), x(j));\n\t\t\tif (d < min_dist(0)) {\n\t\t\t\tmin_dist(0) = d;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist(0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int &i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n\tconst int n = x.extent(0);\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tusing namespace Kokkos;\n\n\tauto min_distance = Kokkos::parallel_reduce(\n\t\tx.extent(0), KOKKOS_LAMBDA(const int i, double min) {\n\t\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\t\tmin = std::min(min, distance(x(i), x(j)));\n\t\t\t}\n\t\t\treturn min;\n\t\t},\n\t\tstd::numeric_limits::max());\n\n\treturn min_distance;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist(0) = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tdouble dist = distance(x(i), x(j));\n\t\t\tif (dist < min_dist(0)) {\n\t\t\t\tmin_dist(0) = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist(0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, 
+ "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "omp", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use OpenMP to compute in parallel.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble min_distance = std::numeric_limits::max();\n\tint n = x.size();\n\t#pragma omp parallel for reduction(min: min_distance)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_distance = std::min(min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "&", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_distance = std::abs(x[0] - x[1]);\n#pragma openmp parallel for reduction(min:min_distance)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_distance = std::min(min_distance, distance(x[i], x[j]);\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp 
parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n#pragma omp parallel for reduction(min: min_dist)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n - 1; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "serial", + "prompt": "double distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (x.size() <= 1) {\n\t\treturn 
std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 1; i < x.size(); ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i - 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn -1;\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = distance(x[0], x[1]);\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tminDist = std::min(minDist, distance(x[i], x[i + 1]));\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = distance(x[0], x[1]);\n\tfor (size_t i = 1; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn -1;\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble minDist = distance(x[0], x[1]);\n\tfor (int i = 1; i < x.size() - 1; ++i) {\n\t\tminDist = std::min(minDist, distance(x[i], x[i + 1]));\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn 
std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 1; i < x.size(); ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i - 1]));\n\t}\n\treturn min_dist;\n}\n", + "(", + "\n\n double min_dist = std::numeric_limits::max();\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n min_dist = std::min(min_dist, distance(x[i], x[j]));\n }\n }\n return min_dist;\n}\n", + "\n\n double min_distance = std::numeric_limits::max();\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n min_distance = std::min(min_distance, distance(x[i], x[j]));\n }\n }\n return min_distance;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "hip", + "prompt": "__device__ double distanceBetweenPoints(double x1, double x2) {\n\treturn fabs(x1 - x2);\n}\n\n/* Compute the distance between the closest two elements in the vector x.\n Store the result in distance.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\n__global__ void closestPair(const double *x, size_t N, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tdouble minDist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tfor (int i = tid + 2; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\tdistance[tid] = minDist;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= N) {\n\t\treturn;\n\t}\n\n\tdouble minDist = distanceBetweenPoints(x[gid], x[gid + 1]);\n\tfor (int i = gid + 2; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[gid], x[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "losest", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tdouble minDistance = DBL_MAX;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDistance) {\n\t\t\t\tminDistance = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDistance;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint grid_size = gridDim.x;\n\tint i = bid * bid_size + tid;\n\tif (i >= N) {\n\t\treturn;\n\t}\n\tdouble min_dist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t}\n\tdistance[tid] = min_dist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\nfor (size_t i = 0; i < N; ++i) {\n\t\tdistance[i] = distance(x[i], x[i + 1]);\n\t}\n}\n", + "\n\tint tid = 
hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint bid_size = hipBlockDim_x;\n\n\tdouble min_dist = DBL_MAX;\n\n\tfor (int i = bid * bid_size + tid; i < N; i += bid_size) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tif (tid == 0 || dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble min_dist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = min_dist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "losest", + "\n\n\tint tid = threadIdx.x;\n\tint blockDim = blockDim.x;\n\tint blockIdx = blockIdx.x;\n\tint i = blockIdx * blockDim + tid;\n\n\tdouble min_dist = DBL_MAX;\n\twhile (i < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[i + 1]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t\ti += blockDim * gridDim.x;\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble min_dist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = min_dist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "mpi", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use MPI to compute in parallel. 
Assume MPI is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process finds the closest pair in its local part of the vector.\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j += size) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between its elements\n\tfor (int i = my_rank; i < n; i += num_procs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = n / size;\n\tint start = rank * chunk_size;\n\tint end = start + chunk_size;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn globalMinDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], 
x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = (n + size - 1) / size;\n\tint start = rank * chunk_size;\n\tint end = std::min(start + chunk_size, n);\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tstd::vector local_x(x.begin() + start, x.begin() + end);\n\tstd::sort(local_x.begin(), local_x.end());\n\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < local_n - 1; i++) {\n\t\tlocal_min_dist = std::min(local_min_dist, distance(local_x[i], local_x[i + 1]));\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tif (rank == size - 1) {\n\t\tend = x.size();\n\t}\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, 
MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\tint n = x.size();\n\tint local_n = n / num_procs;\n\tint start = my_rank * local_n;\n\tint end = (my_rank + 1) * local_n;\n\tif (my_rank == num_procs - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between its elements\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn globalMinDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the distance between its elements and the elements of the next process.\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = (rank + 1) * local_size;\n\tif (rank == size - 1) {\n\t\tend = x.size();\n\t}\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 
1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = n / size;\n\tint start = rank * chunk_size;\n\tint end = (rank + 1) * chunk_size;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the distance between its elements\n\tstd::vector local_distances;\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tfor (size_t j = i + 1; j < x.size(); j += size) {\n\t\t\tlocal_distances.push_back(distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Gather all distances to the root process\n\tstd::vector global_distances;\n\tMPI_Gather(local_distances.data(), local_distances.size(), MPI_DOUBLE,\n\t\tglobal_distances.data(), local_distances.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the minimum distance\n\tdouble min_distance = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double d : global_distances) {\n\t\t\tmin_distance = std::min(min_distance, d);\n\t\t}\n\t}\n\n\t// Broadcast the minimum distance to all processes\n\tMPI_Bcast(&min_distance, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn min_distance;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "cuda", + "prompt": "__device__ double distanceBetweenPoints(double x1, double x2) {\n\treturn fabs(x1 - x2);\n}\n\n/* Compute the distance between the closest two elements in the vector x.\n Store the result in distance.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\n__global__ void closestPair(const double *x, size_t N, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= N) return;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = bid * blockDim.x + tid;\n\n\tif (i >= N) {\n\t\treturn;\n\t}\n\n\tdouble min_dist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint blockId = blockIdx.x;\n\tint blockDim = blockDim.x;\n\tint gridDim = gridDim.x;\n\n\t// Each thread computes the distance between its element and the next element.\n\tif (tid < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tif (tid == 0 || dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n\n\t// Each block reduces the distances computed by its threads.\n\t__shared__ double blockDistance;\n\tif (tid == 0) {\n\t\tblockDistance = *distance;\n\t}\n\t__syncthreads();\n\tif (tid < blockDim - 1) {\n\t\tdouble dist = distanceBetweenPoints(blockDistance, x[blockId * blockDim + tid + 1]);\n\t\tif (dist < blockDistance) {\n\t\t\tblockDistance = dist;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*distance = blockDistance;\n\t}\n\n\t// The block with the smallest distance broadcasts it to all other blocks.\n\t__shared__ double gridDistance;\n\tif (blockId == 0) {\n\t\tgridDistance = *distance;\n\t}\n\t__syncthreads();\n\tif (blockId < gridDim - 1) {\n\t\tdouble dist = distanceBetweenPoints(gridDistance, x[(blockId + 1) * blockDim]);\n\t\tif (dist < gridDistance) {\n\t\t\tgridDistance = dist;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (blockId == 0) {\n\t\t*distance = gridDistance;\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\t__shared__ double minDistance;\n\tminDistance = DBL_MAX;\n\n\tfor (size_t i = threadIdx.x; i < N - 1; i += blockDim.x) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDistance) {\n\t\t\t\tminDistance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (threadIdx.x == 0) {\n\t\t*distance = minDistance;\n\t}\n}\n", + "\n\n\t// Each thread computes the distance between its element and all subsequent elements.\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\t// Compute the distance between the current element and all subsequent elements.\n\tfor (int i = tid + bid * stride; i < N - 1; i += stride) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[i + 1]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = 
DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\tdouble min_dist = DBL_MAX;\n\twhile (i < N) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t\ti += blockDim.x * gridDim.x;\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = min_dist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x * gridDim.x;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tdouble minDist = DBL_MAX;\n\twhile (i < N) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\ti += blockDim.x * gridDim.x;\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (i == tid + 1 || dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= N) {\n\t\treturn;\n\t}\n\tdouble minDist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) 
{\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\tdistance[tid] = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i < N - 1) {\n\t\tdistance[i] = distance(x[i], x[i + 1]);\n\t}\n}\n", + "\n\n\t// Each thread computes the distance between its element and all elements to its right.\n\tint tid = threadIdx.x;\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min : local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tint size;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = my_rank * local_n;\n\tint end = (my_rank + 1) * local_n;\n\tif (my_rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min : local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between any two elements in its local subvector.\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all processes to find the global minimum distance.\n\tMPI_Allreduce(&min_dist, 
&min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\t// Each process computes the distance between all pairs of elements in its local subvector.\n\tstd::vector local_min_distances(n / num_procs);\n\t#pragma omp parallel for\n\tfor (int i = my_rank * (n / num_procs); i < (my_rank + 1) * (n / num_procs); i++) {\n\t\tfor (int j = i + 1; j < (my_rank + 1) * (n / num_procs); j++) {\n\t\t\tlocal_min_distances[i - my_rank * (n / num_procs)] = std::min(local_min_distances[i - my_rank * (n / num_procs)], distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Each process sends its local minimum distance to the root process.\n\tstd::vector global_min_distances(num_procs);\n\tMPI_Gather(local_min_distances.data(), n / num_procs, MPI_DOUBLE, global_min_distances.data(), n / num_procs, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// The root process finds the minimum distance among all the local minimum distances.\n\tdouble global_min_distance = std::numeric_limits::max();\n\tif (my_rank == 0) {\n\t\tfor (int i = 0; i < num_procs; i++) {\n\t\t\tglobal_min_distance = std::min(global_min_distance, global_min_distances[i]);\n\t\t}\n\t}\n\n\t// Return the global minimum distance.\n\treturn global_min_distance;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tint n = x.size();\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint my_rank, nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the closest pair in its local subvector\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = my_rank; i < n; i += nprocs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, 
&min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min_dist)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\t// Get the number of processes and the rank of the current process\n\tint num_procs, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\t// Divide the vector into chunks and assign each chunk to a process\n\tint chunk_size = x.size() / num_procs;\n\tstd::vector local_x(x.begin() + rank * chunk_size, x.begin() + (rank + 1) * chunk_size);\n\n\t// Find the closest pair in the local chunk\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min_dist)\n\tfor (int i = 0; i < local_x.size(); i++) {\n\t\tfor (int j = i + 1; j < local_x.size(); j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(local_x[i], local_x[j]));\n\t\t}\n\t}\n\n\t// Gather the local minimum distances from all processes\n\tstd::vector global_min_dists(num_procs);\n\tMPI_Gather(&local_min_dist, 1, MPI_DOUBLE, global_min_dists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the smallest global minimum distance\n\tdouble global_min_dist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double dist : global_min_dists) {\n\t\t\tglobal_min_dist = std::min(global_min_dist, dist);\n\t\t}\n\t}\n\n\t// Broadcast the global minimum distance to all processes\n\tMPI_Bcast(&global_min_dist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tif (n <= 1) {\n\t\treturn -1;\n\t}\n\n\t// Sort the vector in ascending order\n\tstd::sort(x.begin(), x.end());\n\n\t// Find the closest pair in each half of the vector\n\tdouble minDist = distance(x[0], x[1]);\n\t#pragma omp parallel\n\t{\n\t\tint tid = omp_get_thread_num();\n\t\tint numThreads = omp_get_num_threads();\n\t\tint start = tid * n / numThreads;\n\t\tint end = (tid + 1) * n / numThreads;\n\t\tdouble localMinDist = minDist;\n\t\tfor (int i = start; i < end; i++) {\n\t\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\t\tif (dist < localMinDist) {\n\t\t\t\t\tlocalMinDist = dist;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t#pragma omp critical\n\t\t{\n\t\t\tif (localMinDist < minDist) {\n\t\t\t\tminDist = localMinDist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint my_rank, 
nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\t// Each process computes the closest pair in its local subvector.\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = my_rank; i < n; i += nprocs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum.\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the closest pair in its local subvector\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint n = x.size();\n\tint n_local = n / omp_get_num_threads();\n\tint tid = omp_get_thread_num();\n\tdouble min_local = std::numeric_limits::max();\n\t\n\t#pragma omp parallel reduction(min:min_local)\n\tfor (int i = tid * n_local; i < (tid + 1) * n_local; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_local = std::min(min_local, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble min_global;\n\tMPI_Allreduce(&min_local, &min_global, 1, MPI_DOUBLE, MPI_MIN);\n\treturn min_global;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Use OpenMP to parallelize the computation of distances between pairs of elements\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Reduce(&local_min_dist, 1, MPI_DOUBLE, &global_min_dist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance.\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t}\n\treturn -1;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n using namespace Kokkos;\n\n double min_dist = std::numeric_limits::max();\n\n // Compute the distance between each pair of points\n parallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int& i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n double dist = distance(points(i), points(j));\n if (dist < min_dist) {\n min_dist = dist;\n }\n }\n });\n\n return min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tKokkos::View distances(\"Distances\", points.size());\n\tKokkos::parallel_for(\"Compute distances\", points.size(), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tdistances[i] = std::min(distances[i], distance(points[i], points[j]));\n\t\t}\n\t});\n\treturn *Kokkos::min_element(distances);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(\"Closest Pair\", points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < 
points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tusing namespace Kokkos;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tparallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tparallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int j) {\n\t\t\tif (i != j) {\n\t\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\t\tif (dist < minDist) {\n\t\t\t\t\tminDist = dist;\n\t\t\t\t}\n\t\t\t}\n\t\t});\n\t});\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tminDist = std::min(minDist, dist);\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + 
"generated_output": "\n\n\tusing TeamPolicy = Kokkos::TeamPolicy>;\n\tusing TeamMember = Kokkos::TeamMember;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const TeamMember& team) {\n\t\tfor (int i = team.league_rank(); i < points.extent(0); i += team.league_size()) {\n\t\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\t\tminDist = std::min(minDist, distance(points(i), points(j)));\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tusing namespace Kokkos;\n\tusing namespace std;\n\n\t// Initialize the minimum distance to a large value\n\tdouble minDist = numeric_limits::max();\n\n\t// Create a team policy for parallel execution\n\tTeamPolicy policy(Kokkos::DefaultExecutionSpace(), points.extent(0));\n\n\t// Define the kernel function to compute the distance between each pair of points\n\tKokkos::parallel_for(policy, KOKKOS_LAMBDA(const int& i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::RangePolicy;\n\tusing view_type = Kokkos::View;\n\n\tconst int num_points = points.extent(0);\n\tview_type distances(\"Distances\", num_points * num_points);\n\n\tKokkos::parallel_for(policy_type(0, num_points * num_points), KOKKOS_LAMBDA(const int i) {\n\t\tconst int p1_idx = i / num_points;\n\t\tconst int p2_idx = i % num_points;\n\t\tif (p1_idx != p2_idx) {\n\t\t\tdistances(i) = distance(points(p1_idx), points(p2_idx));\n\t\t}\n\t\telse {\n\t\t\tdistances(i) = std::numeric_limits::max();\n\t\t}\n\t});\n\n\tdouble min_distance = std::numeric_limits::max();\n\tfor (int i = 0; i < num_points * num_points; i++) {\n\t\tmin_distance = std::min(min_distance, distances(i));\n\t}\n\n\treturn min_distance;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.7263997872, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 3.468016111, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.9683761732, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.219509713, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.8418931196, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.6628124044, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.5514493308, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\tKok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::make_span(points.begin(), points.end()), 0.0,\n\t\t[](double d, const Point* p) {\n\t\t\treturn d + distance(*p, {0, 0});\n\t\t},\n\t\t[](double a, double b) { return a + b; });\n}\n", + 
"source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tusing namespace Kokkos;\n\n\t// Initialize the minimum distance to a large value\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\t// Calculate the distance between the two points\n\t\t\tdouble dist = distance(points(i), points(j));\n\n\t\t\t// Update the minimum distance if necessary\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\t// Return the minimum distance\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all points\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\t// Iterate over all points after i\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\t// Calculate the distance between the two points\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\t// Check if the distance is less than the current minimum distance\n\t\t\tif (dist < minDist) {\n\t\t\t\t// Update the minimum distance\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Return the minimum distance\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.7358603502, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.8844742378, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.9427477368, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 2.2096342438, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.9434435704, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 2.1449145112, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 2.1192048724, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tusing namespace Kokkos;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + 
"is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tKokkos::View distances(\"Distances\", points.extent(0));\n\tKokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdistances(i) = std::min(distances(i), distance(points(i), points(j)));\n\t\t}\n\t});\n\treturn Kokkos::min_reduce(Kokkos::RangePolicy(0, points.extent(0)), distances, distances(0));\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use OpenMP to compute in parallel.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < 
points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = points.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) 
{\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_distance = std::numeric_limits::max();\n#pragma omp parallel for reduction(min:min_distance)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tmin_distance = std::min(min_distance, dist);\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "\n\n\t// Check if the vector is empty or has only one point\n\tif (points.empty() || points.size() == 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\t// Sort the points by x-coordinate\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Initialize the minimum distance to the maximum possible value\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) 
{\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn -1;\n\t}\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn 
std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn 
std::numeric_limits::max();\n\t}\n\n\tauto compareX = [](Point const& p1, Point const& p2) { return p1.x < p2.x; };\n\tauto compareY = [](Point const& p1, Point const& p2) { return p1.y < p2.y; };\n\n\tstd::sort(points.begin(), points.end(), compareX);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tauto compareX = [](Point const& p1, Point const& p2) { return p1.x < p2.x; };\n\tauto compareY = [](Point const& p1, Point const& p2) { return p1.y < p2.y; };\n\tstd::sort(points.begin(), points.end(), compareX);\n\tdouble minDist = distance(points[0], points[1]);\n\tstd::sort(points.begin(), points.end(), compareY);\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tif (points[j].y - points[i].y >= minDist) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn 0.0;\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = distance(points[0], points[1]);\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distanceBetweenPoints(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the distance between the closest two points in the vector points.\n Store the result in distance.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\n__global__ void closestPair(const Point *points, size_t numPoints, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Initialize the distance to a large value\n\t*distance = DBL_MAX;\n\n\t// Iterate over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid < numPoints) {\n\t\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < num) {\n\t\tdistance[i] = distance(points[i], points[(i + 1) % num]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < num) {\n\t\tdistance[idx] = distance(points[idx], points[(idx + 1) % num]);\n\t}\n}\n", + "\n\n\t// Initialize the distance to a large value\n\t*distance = DBL_MAX;\n\n\t// Loop over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) return;\n\tdouble minDist = distanceBetweenPoints(points[tid], points[(tid+1)%numPoints]);\n\tfor (int i = tid+1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\tatomicMin(distance, minDist);\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid < numPoints) {\n\t\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = gid; i < numPoints; i += hipBlockDim_x) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = 
distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint idx = hipGetGlobalIndex();\n\tif (idx < num) {\n\t distance[idx] = distance(points[idx], points[(idx + 1) % num]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\t// Initialize the distance to a large value\n\t*distance = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "(1.21,12.12.1222.12.12.", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = distanceBetweenPoints(points[gid], points[(gid + 1) % numPoints]);\n\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Compute the distance between each pair of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tdouble minDist = distanceBetweenPoints(points[tid], points[(tid + 1) % numPoints]);\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\t__shared__ double minDist;\n\t__shared__ Point p1, p2;\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t\tp1 = points[tid];\n\t\t\t\tp2 = points[i];\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, 
y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local copy of points.\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0.\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\t// Find the closest pair of points on this rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_distance = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); ++i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_distance) {\n\t\t\t\tmin_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_distance;\n\tMPI_Allreduce(&min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_distance;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t 
j = i + 1; j < points.size(); ++j) {\n\t\t\tmin_dist = std::min(min_dist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t}\n\n\treturn -1;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\tif (rank == 0) {\n\t\treturn min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the closest pair on this rank\n\tdouble minDist = 
std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_distance) {\n\t\t\t\tlocal_min_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_distance;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its subset of points\n\tstd::vector> closestPairs;\n\tfor (size_t i = rank; i < points.size(); i += size) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tclosestPairs.push_back({points[i], points[j]});\n\t\t}\n\t}\n\n\t// Each rank sends its closest pairs to rank 0\n\tstd::vector> allClosestPairs;\n\tMPI_Gather(&closestPairs, closestPairs.size(), MPI_DOUBLE, &allClosestPairs, allClosestPairs.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 finds the closest pair of points among all the closest pairs\n\tdouble minDist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (auto const& pair : allClosestPairs) {\n\t\t\tdouble dist = distance(pair.first, pair.second);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Rank 0 broadcasts the minimum distance to all other ranks\n\tMPI_Bcast(&minDist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// 
Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its subset of points.\n\tstd::vector local_points = points;\n\tstd::sort(local_points.begin(), local_points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (size_t i = 0; i < local_points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < local_points.size(); ++j) {\n\t\t\tdouble dist = distance(local_points[i], local_points[j]);\n\t\t\tif (dist < local_min_distance) {\n\t\t\t\tlocal_min_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Each rank sends its local minimum distance to rank 0.\n\tMPI_Gather(&local_min_distance, 1, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 computes the global minimum distance.\n\tdouble global_min_distance = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (int i = 0; i < size; ++i) {\n\t\t\tdouble recv_min_distance;\n\t\t\tMPI_Gather(&recv_min_distance, 1, MPI_DOUBLE, &recv_min_distance, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\t\t\tif (recv_min_distance < global_min_distance) {\n\t\t\t\tglobal_min_distance = recv_min_distance;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the closest pair on each rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Reduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn 
-1;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distanceBetweenPoints(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the distance between the closest two points in the vector points.\n Store the result in distance.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\n__global__ void closestPair(const Point *points, size_t numPoints, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) return;\n\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint num_blocks = gridDim.x;\n\n\tdouble min_dist = DBL_MAX;\n\tfor (int i = tid; i < numPoints; i += bid_size * num_blocks) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = min_dist;\n}\n", + "\n\n\tint idx = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (idx >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDistance = DBL_MAX;\n\tfor (int i = idx + 1; i < numPoints; i++) {\n\t\tdouble distance = distanceBetweenPoints(points[idx], points[i]);\n\t\tif (distance < minDistance) {\n\t\t\tminDistance = distance;\n\t\t}\n\t}\n\n\tif (idx == 0) {\n\t\t*distance = minDistance;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], 
points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tdouble minDist = DBL_MAX;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int j = i + 1; j < numPoints; j++) {\n\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tdouble minDist = DBL_MAX;\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\tdistance[tid] = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int j = i + 1; j < numPoints; j++) {\n\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble 
dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all ranks\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_distance = DBL_MAX;\n\t#pragma omp parallel for reduction(min: min_distance)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t for (size_t j = i + 1; j < points.size(); ++j) {\n\t min_distance = std::min(min_distance, distance(points[i], points[j]));\n\t }\n\t}\n\n\tif (rank == 0) {\n\t return min_distance;\n\t}\n\treturn 0.0;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Gather all the local minimum distances on rank 0\n\tstd::vector allMinDists(size);\n\tMPI_Gather(&minDist, 1, MPI_DOUBLE, allMinDists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the global minimum distance\n\tif (rank == 0) {\n\t\tfor 
(double dist : allMinDists) {\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocal_min_dist = std::min(local_min_dist, dist);\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all ranks\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local subvector\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (int i = 0; i < points.size(); ++i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tlocalMinDist = std::min(localMinDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t} else {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the closest pair on this rank\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist 
< min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN);\n\n\t// Return the global minimum distance\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma opm parallel for reduction(min:min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i+1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\treturn min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\telse {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t} else {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocalMinDist = std::min(localMinDist, dist);\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, 
size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min_dist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocal_min_dist = std::min(local_min_dist, dist);\n\t\t}\n\t}\n\n\t// Gather all local minima on rank 0\n\tstd::vector all_min_dists(size);\n\tMPI_Gather(&local_min_dist, 1, MPI_DOUBLE, all_min_dists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 finds the global minimum\n\tdouble global_min_dist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double dist : all_min_dists) {\n\t\t\tglobal_min_dist = std::min(global_min_dist, dist);\n\t\t}\n\t}\n\n\t// Return the global minimum on rank 0\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "(1.41421, 1.41421)", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tlocalMinDist = std::min(localMinDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn 
minDist;\n\t}\n\treturn -1;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(Kokkos::View &points, Kokkos::View &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n using namespace Kokkos;\n\n // Sort points by x-coordinate\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return points_sorted(i)->x < points_sorted(j)->x;\n });\n\n // Find the upper and lower hulls\n auto upper_hull = hull.slice(0, hull.extent(0) / 2);\n auto lower_hull = hull.slice(hull.extent(0) / 2, hull.extent(0));\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n while (upper_hull_size >= 2 &&\n (upper_hull(upper_hull_size - 1)->y - upper_hull(upper_hull_size - 2)->y) *\n (points_sorted(i)->x - upper_hull(upper_hull_size - 1)->x) <\n (upper_hull(upper_hull_size - 1)->x - upper_hull(upper_hull_size - 2)->x) *\n (points_sorted(i)->y - upper_hull(upper_hull_size - 1)->y)) {\n upper_hull_size--;\n }\n upper_hull(upper_hull_size++) = points_sorted(i);\n }\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = points_sorted.extent(0) - 1; i >= 0; i--) {\n while (lower_hull_size >= 2 &&\n (lower_hull(lower_hull_size - 1)->y - lower_hull(lower_hull_size - 2)->y) *\n (points_sorted(i)->x - lower_hull(lower_hull_size - 1)->x) <\n (lower_hull(lower_hull_size - 1)->x - lower_hull(lower_hull_size - 2)->x) *\n (points_sorted(i)->y - lower_hull(lower_hull_size - 1)->y)) {\n lower_hull_size--;\n }\n lower_hull(lower_hull_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_hull_size + lower_hull_size - 1;\n for (int i = 0; i < lower_hull_size - 1; i++) {\n hull(hull_size--) = lower_hull(i);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int &i) {\n points_sorted(i) = points(i);\n });\n\n // Find the upper and lower hulls\n std::vector upper_hull;\n std::vector lower_hull;\n\n for (int i = 0; i < points_sorted.extent(0); i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upper_hull.size() >= 2 &&\n (upper_hull[upper_hull.size() - 2]->y - upper_hull[upper_hull.size() - 1]->y) *\n (points_sorted(i)->x - upper_hull[upper_hull.size() - 1]->x) <=\n (upper_hull[upper_hull.size() - 1]->y - points_sorted(i)->y) *\n (upper_hull[upper_hull.size() - 2]->x - upper_hull[upper_hull.size() - 1]->x)) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n\n // Add the point to the lower 
hull if it is below the current lower hull\n while (lower_hull.size() >= 2 &&\n (lower_hull[lower_hull.size() - 2]->y - lower_hull[lower_hull.size() - 1]->y) *\n (points_sorted(i)->x - lower_hull[lower_hull.size() - 1]->x) >=\n (lower_hull[lower_hull.size() - 1]->y - points_sorted(i)->y) *\n (lower_hull[lower_hull.size() - 2]->x - lower_hull[lower_hull.size() - 1]->x)) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 2);\n int j = 0;\n for (int i = 0; i < upper_hull.size() - 1; i++) {\n hull(j++) = upper_hull[i];\n }\n for (int i = lower_hull.size() - 2; i >= 0; i--) {\n hull(j++) = lower_hull[i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_dimensions = points.extent(1);\n\n // Find the point with the minimum y-coordinate.\n int min_y_index = 0;\n for (int i = 1; i < num_points; ++i) {\n if (points(i, 1) < points(min_y_index, 1)) {\n min_y_index = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points(0, 0);\n points(0, 0) = points(min_y_index, 0);\n points(min_y_index, 0) = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double angle_a = atan2(a.y - points(0, 1), a.x - points(0, 0));\n double angle_b = atan2(b.y - points(0, 1), b.x - points(0, 0));\n return angle_a < angle_b;\n };\n parallel_sort(Kokkos::RangePolicy(1, num_points), points, compare_polar_angle);\n\n // Find the convex hull using Graham's scan.\n int hull_size = 0;\n for (int i = 0; i < num_points; ++i) {\n while (hull_size >= 2 && (points(hull_size - 1, 0) - points(hull_size - 2, 0)) * (points(i, 1) - points(hull_size - 2, 1)) - (points(hull_size - 1, 1) - points(hull_size - 2, 1)) * (points(i, 0) - points(hull_size - 2, 0)) < 0) {\n hull_size--;\n }\n hull(hull_size, 0) = points(i, 0);\n hull_size++;\n }\n\n // Resize the hull to its final size.\n hull.resize(hull_size, num_dimensions);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the upper and lower hulls\n Kokkos::View upperHull(\"Upper Hull\", points.extent(0));\n Kokkos::View lowerHull(\"Lower Hull\", points.extent(0));\n int upperHullSize = 0;\n int lowerHullSize = 0;\n for (int i = 0; i < points.extent(0); i++) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHullSize >= 2 &&\n (upperHull(upperHullSize - 1)->y - upperHull(upperHullSize - 2)->y) *\n (points(i)->x - upperHull(upperHullSize - 2)->x) <\n (upperHull(upperHullSize - 1)->x - upperHull(upperHullSize - 2)->x) *\n (points(i)->y - upperHull(upperHullSize - 2)->y)) {\n upperHullSize--;\n }\n upperHull(upperHullSize++) = points(i);\n\n // If the 
point is to the right of the current lower hull, add it to the lower hull\n while (lowerHullSize >= 2 &&\n (lowerHull(lowerHullSize - 1)->y - lowerHull(lowerHullSize - 2)->y) *\n (points(i)->x - lowerHull(lowerHullSize - 2)->x) >\n (lowerHull(lowerHullSize - 1)->x - lowerHull(lowerHullSize - 2)->x) *\n (points(i)->y - lowerHull(lowerHullSize - 2)->y)) {\n lowerHullSize--;\n }\n lowerHull(lowerHullSize++) = points(i);\n }\n\n // Merge the upper and lower hulls into the convex hull\n int hullSize = upperHullSize + lowerHullSize - 2;\n for (int i = 0; i < upperHullSize; i++) {\n hull(i) = upperHull(i);\n }\n for (int i = upperHullSize; i < hullSize; i++) {\n hull(i) = lowerHull(i - upperHullSize + 1);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the upper and lower hulls\n Kokkos::View upperHull(\"upperHull\", points.extent(0));\n Kokkos::View lowerHull(\"lowerHull\", points.extent(0));\n\n int upperHullSize = 0;\n int lowerHullSize = 0;\n\n for (int i = 0; i < points.extent(0); i++) {\n // Add the point to the upper hull if it is to the right of the current rightmost point\n while (upperHullSize >= 2 &&\n (upperHull(upperHullSize - 1)->y - upperHull(upperHullSize - 2)->y) *\n (points(i)->x - upperHull(upperHullSize - 2)->x) <=\n (upperHull(upperHullSize - 1)->x - upperHull(upperHullSize - 2)->x) *\n (points(i)->y - upperHull(upperHullSize - 2)->y)) {\n upperHullSize--;\n }\n upperHull(upperHullSize++) = points(i);\n\n // Add the point to the lower hull if it is to the left of the current leftmost point\n while (lowerHullSize >= 2 &&\n (lowerHull(lowerHullSize - 1)->y - lowerHull(lowerHullSize - 2)->y) *\n (points(i)->x - lowerHull(lowerHullSize - 2)->x) >=\n (lowerHull(lowerHullSize - 1)->x - lowerHull(lowerHullSize - 2)->x) *\n (points(i)->y - lowerHull(lowerHullSize - 2)->y)) {\n lowerHullSize--;\n }\n lowerHull(lowerHullSize++) = points(i);\n }\n\n // Merge the upper and lower hulls\n int hullSize = upperHullSize + lowerHullSize - 2;\n for (int i = 0; i < upperHullSize; i++) {\n hull(i) = upperHull(i);\n }\n for (int i = upperHullSize; i < hullSize; i++) {\n hull(i) = lowerHull(i - upperHullSize + 1);\n }\n\n // Return the hull\n hull.resize(hullSize);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points_sorted.extent(0)), points_sorted, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n while (upper_hull_size >= 2 && cross(upper_hull[upper_hull_size - 2], upper_hull[upper_hull_size - 1], points_sorted[i]) <= 0) {\n upper_hull_size--;\n }\n upper_hull[upper_hull_size++] = points_sorted[i];\n 
}\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = points_sorted.extent(0) - 1; i >= 0; i--) {\n while (lower_hull_size >= 2 && cross(lower_hull[lower_hull_size - 2], lower_hull[lower_hull_size - 1], points_sorted[i]) <= 0) {\n lower_hull_size--;\n }\n lower_hull[lower_hull_size++] = points_sorted[i];\n }\n\n // Merge the upper and lower hulls\n int hull_size = 0;\n for (int i = 0; i < upper_hull_size; i++) {\n hull[hull_size++] = upper_hull[i];\n }\n for (int i = lower_hull_size - 2; i >= 0; i--) {\n hull[hull_size++] = lower_hull[i];\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_size);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_hull_points = 4;\n\n // Find the leftmost point\n int leftmost_index = 0;\n for (int i = 1; i < num_points; ++i) {\n if (points(i).x < points(leftmost_index).x) {\n leftmost_index = i;\n }\n }\n\n // Sort the points by their polar angle with respect to the leftmost point\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double dx1 = a.x - points(leftmost_index).x;\n double dy1 = a.y - points(leftmost_index).y;\n double dx2 = b.x - points(leftmost_index).x;\n double dy2 = b.y - points(leftmost_index).y;\n return (atan2(dy1, dx1) < atan2(dy2, dx2));\n };\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, num_points), points_sorted, compare_polar_angle);\n\n // Find the convex hull using Graham's scan\n int hull_index = 0;\n for (int i = 0; i < num_points; ++i) {\n while (hull_index >= 2 && (points_sorted(i).x - hull(hull_index - 1).x) * (hull(hull_index - 2).y - hull(hull_index - 1).y) - (points_sorted(i).y - hull(hull_index - 1).y) * (hull(hull_index - 2).x - hull(hull_index - 1).x) < 0) {\n hull_index--;\n }\n hull(hull_index++) = points_sorted(i);\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_index);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, points_sorted.extent(0)),\n KOKKOS_LAMBDA(const int &i, const auto &upper_hull) {\n auto &hull = upper_hull.access();\n if (i == 0) {\n hull[0] = points_sorted(i);\n return hull;\n }\n while (hull.extent(0) >= 2 &&\n (hull(hull.extent(0) - 2).y - hull(hull.extent(0) - 1).y) *\n (points_sorted(i).x - hull(hull.extent(0) - 1).x) <\n (hull(hull.extent(0) - 1).y - points_sorted(i).y) *\n (hull(hull.extent(0) - 2).x - hull(hull.extent(0) - 1).x)) {\n hull.resize(hull.extent(0) - 1);\n }\n hull.resize(hull.extent(0) + 1);\n hull(hull.extent(0) - 1) = points_sorted(i);\n return hull;\n },\n Kokkos::View(nullptr, 0));\n\n auto lower_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1),\n KOKKOS_LAMBDA(const int &i, const auto &lower_hull) {\n auto &hull = lower_hull.access();\n if (i 
== points_sorted.extent(0) - 1) {\n hull[0] = points_sorted(i);\n return hull;\n }\n while (hull.extent(0) >= 2 &&\n (hull(hull.extent(0) - 2).y - hull(hull.extent(0) - 1).y) *\n (points_sorted(i).x - hull(hull.extent(0) - 1).x) <\n (hull(hull.extent(0) - 1).y - points_sorted(i).y) *\n (hull(hull.extent(0) - 2).x - hull(hull.extent(0) - 1).x)) {\n hull.resize(hull.extent(0) - 1);\n }\n hull.resize(hull.extent(0) + 1);\n hull(hull.extent(0) - 1) = points_sorted(i);\n return hull;\n },\n Kokkos::View(nullptr, 0));\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.extent(0) + lower_hull.extent(0) - 2);\n Kokkos::parallel_for(\n Kokkos::RangePolicy(0, upper_hull.extent(0)),\n KOKKOS_LAMBDA(const int &i) { hull(i) = upper_hull(i); });\n Kokkos::parallel_for(\n Kokkos::RangePolicy(0, lower_hull.extent(0) - 1),\n KOKKOS_LAMBDA(const int &i) { hull(upper_hull.extent(0) + i) = lower_hull(i); });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using Kok\u30b3\u30af::parallel_for;\n using Kok\u30b3\u30af::sort;\n\n // Sort the points by their x-coordinates.\n sort(Kok\u30b3\u30af::begin(points), Kok\u30b3\u30af::end(points), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper hull.\n int upperHullSize = 0;\n for (int i = 0; i < points.size(); ++i) {\n while (upperHullSize >= 2 && Kok\u30b3\u30af::cross(hull[upperHullSize - 2], hull[upperHullSize - 1], points[i]) <= 0) {\n --upperHullSize;\n }\n hull[upperHullSize++] = points[i];\n }\n\n // Find the lower hull.\n int lowerHullSize = 0;\n for (int i = points.size() - 1; i >= 0; --i) {\n while (lowerHullSize >= 2 && Kok\u30b3\u30af::cross(hull[lowerHullSize - 2], hull[lowerHullSize - 1], points[i]) <= 0) {\n --lowerHullSize;\n }\n hull[lowerHullSize++] = points[i];\n }\n\n // Remove the duplicate points from the lower hull.\n while (lowerHullSize > 0 && hull[lowerHullSize - 1] == hull[lowerHullSize - 2]) {\n --lowerHullSize;\n }\n\n // Combine the upper and lower hulls.\n hull.resize(upperHullSize + lowerHullSize - 1);\n std::copy(hull.begin(), hull.begin() + upperHullSize, hull.begin());\n std::copy(hull.begin() + upperHullSize, hull.begin() + upperHullSize + lowerHullSize - 1, hull.begin() + upperHullSize);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points_sorted.extent(0)), KOKKOS_LAMBDA(const int &i, auto &upper_hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n upper_hull.push_back(points_sorted(i));\n } else {\n while (upper_hull.size() >= 2 && cross(upper_hull[upper_hull.size() - 2], upper_hull[upper_hull.size() - 1], points_sorted(i)) <= 0) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n }\n });\n\n auto lower_hull = Kokkos::parallel_reduce(Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1), KOKKOS_LAMBDA(const int &i, auto &lower_hull) 
{\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n lower_hull.push_back(points_sorted(i));\n } else {\n while (lower_hull.size() >= 2 && cross(lower_hull[lower_hull.size() - 2], lower_hull[lower_hull.size() - 1], points_sorted(i)) <= 0) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n });\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 2);\n std::copy(upper_hull.begin(), upper_hull.end() - 1, hull.begin());\n std::copy(lower_hull.begin() + 1, lower_hull.end(), hull.begin() + upper_hull.size() - 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points_sorted(i)->x < points_sorted(j)->x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n int upper_hull_size = 0;\n int lower_hull_size = 0;\n\n // Add the first point to the upper hull\n upper_hull(upper_hull_size++) = points_sorted(0);\n\n // Add the last point to the lower hull\n lower_hull(lower_hull_size++) = points_sorted(points.extent(0) - 1);\n\n // Iterate over the remaining points\n for (int i = 1; i < points.extent(0) - 1; i++) {\n // Add the point to the upper hull\n while (upper_hull_size >= 2 &&\n (points_sorted(i)->y - upper_hull(upper_hull_size - 2)->y) *\n (upper_hull(upper_hull_size - 1)->x - upper_hull(upper_hull_size - 2)->x) <=\n (upper_hull(upper_hull_size - 1)->y - upper_hull(upper_hull_size - 2)->y) *\n (points_sorted(i)->x - upper_hull(upper_hull_size - 2)->x)) {\n upper_hull_size--;\n }\n upper_hull(upper_hull_size++) = points_sorted(i);\n\n // Add the point to the lower hull\n while (lower_hull_size >= 2 &&\n (points_sorted(i)->y - lower_hull(lower_hull_size - 2)->y) *\n (lower_hull(lower_hull_size - 1)->x - lower_hull(lower_hull_size - 2)->x) >=\n (lower_hull(lower_hull_size - 1)->y - lower_hull(lower_hull_size - 2)->y) *\n (points_sorted(i)->x - lower_hull(lower_hull_size - 2)->x)) {\n lower_hull_size--;\n }\n lower_hull(lower_hull_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_hull_size + lower_hull_size - 2;\n for (int i = 0; i < upper_hull_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_hull_size; i < hull_size; i++) {\n hull(i) = lower_hull(i - upper_hull_size + 1);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinate\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int &i) {\n points_sorted(i) = points(i);\n });\n\n // Find the upper and lower hulls\n std::vector upper_hull, lower_hull;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upper_hull.size() >= 2 && cross(upper_hull[upper_hull.size() - 2], upper_hull[upper_hull.size() - 1], points_sorted(i)) <= 0) 
{\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lower_hull.size() >= 2 && cross(lower_hull[lower_hull.size() - 2], lower_hull[lower_hull.size() - 1], points_sorted(i)) >= 0) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n\n // Merge the upper and lower hulls\n for (int i = lower_hull.size() - 2; i >= 0; i--) {\n upper_hull.push_back(lower_hull[i]);\n }\n\n // Copy the convex hull to the output view\n Kokkos::parallel_for(Kokkos::RangePolicy(0, upper_hull.size()), KOKKOS_LAMBDA(const int &i) {\n hull(i) = upper_hull[i];\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, points_sorted.extent(0)),\n KOKKOS_LAMBDA(const int &i, auto &hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n hull.push_back(points_sorted(i));\n } else {\n while (hull.size() >= 2 &&\n (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points_sorted(i).x - hull[hull.size() - 1].x) <\n (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points_sorted(i).y - hull[hull.size() - 1].y)) {\n hull.pop_back();\n }\n hull.push_back(points_sorted(i));\n }\n },\n hull);\n\n auto lower_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1),\n KOKKOS_LAMBDA(const int &i, auto &hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n hull.push_back(points_sorted(i));\n } else {\n while (hull.size() >= 2 &&\n (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points_sorted(i).x - hull[hull.size() - 1].x) <\n (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points_sorted(i).y - hull[hull.size() - 1].y)) {\n hull.pop_back();\n }\n hull.push_back(points_sorted(i));\n }\n },\n hull);\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 1);\n std::copy(upper_hull.begin(), upper_hull.end(), hull.begin());\n std::copy(lower_hull.begin() + 1, lower_hull.end(), hull.begin() + upper_hull.size());\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_dimensions = points.extent(1);\n const int num_hull_points = 0;\n\n // Find the point with the minimum y-coordinate.\n int min_y_index = 0;\n for (int i = 1; i < num_points; i++) {\n if (points(i, 1) < points(min_y_index, 1)) {\n min_y_index = i;\n }\n }\n\n // Swap the point with the minimum y-coordinate to the first position.\n Point temp = points(0, 0);\n points(0, 0) = points(min_y_index, 0);\n points(min_y_index, 0) = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double angle_a = atan2(a.y - points(0, 1), a.x - points(0, 0));\n double angle_b = 
atan2(b.y - points(0, 1), b.x - points(0, 0));\n return angle_a < angle_b;\n };\n Kokkos::sort(Kokkos::RangePolicy(1, num_points), points, compare_polar_angle);\n\n // Find the convex hull using Graham's scan.\n int hull_size = 0;\n for (int i = 0; i < num_points; i++) {\n while (hull_size >= 2 && (points(hull[hull_size - 2], 0) - points(hull[hull_size - 1], 0)) * (points(i, 1) - points(hull[hull_size - 1], 1)) - (points(i, 0) - points(hull[hull_size - 1], 0)) * (points(hull[hull_size - 2], 1) - points(hull[hull_size - 1], 1)) < 0) {\n hull_size--;\n }\n hull[hull_size++] = i;\n }\n\n // Resize the hull to the correct size.\n hull = Kokkos::View(hull, hull_size);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)), points_sorted, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n int upper_size = 0;\n int lower_size = 0;\n\n // Add the first point to the upper hull\n upper_hull(upper_size++) = points_sorted(0);\n\n // Add the last point to the lower hull\n lower_hull(lower_size++) = points_sorted(points_sorted.extent(0) - 1);\n\n // Iterate over the remaining points\n for (int i = 1; i < points_sorted.extent(0) - 1; i++) {\n // Add the point to the upper hull\n while (upper_size >= 2 && (upper_hull(upper_size - 2).y - upper_hull(upper_size - 1).y) * (points_sorted(i).x - upper_hull(upper_size - 1).x) < (upper_hull(upper_size - 1).y - points_sorted(i).y) * (upper_hull(upper_size - 2).x - upper_hull(upper_size - 1).x)) {\n upper_size--;\n }\n upper_hull(upper_size++) = points_sorted(i);\n\n // Add the point to the lower hull\n while (lower_size >= 2 && (lower_hull(lower_size - 2).y - lower_hull(lower_size - 1).y) * (points_sorted(i).x - lower_hull(lower_size - 1).x) < (lower_hull(lower_size - 1).y - points_sorted(i).y) * (lower_hull(lower_size - 2).x - lower_hull(lower_size - 1).x)) {\n lower_size--;\n }\n lower_hull(lower_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_size + lower_size - 2;\n for (int i = 0; i < upper_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_size; i < hull_size; i++) {\n hull(i) = lower_hull(lower_size - (i - upper_size + 1));\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_size);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the leftmost point\n auto leftmost = points.reduce(\n KOKKOS_LAMBDA(const Point &a, const Point &b) { return a.x < b.x ? 
a : b; },\n points.extent(0));\n\n // Sort the points by the angle they make with the leftmost point\n Kokkos::View sorted_points(\"sorted_points\", points.extent(0));\n Kokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i) {\n sorted_points[i] = &points[i];\n });\n\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n sorted_points,\n KOKKOS_LAMBDA(const Point *&a, const Point *&b) {\n return std::atan2(a->y - leftmost->y, a->x - leftmost->x) <\n std::atan2(b->y - leftmost->y, b->x - leftmost->x);\n });\n\n // Find the convex hull using Graham scan\n Kokkos::View stack(\"stack\", points.extent(0));\n int top = 0;\n for (int i = 0; i < points.extent(0); i++) {\n Point *p = sorted_points[i];\n while (top >= 2 &&\n (stack[top - 1]->y - stack[top - 2]->y) * (p->x - stack[top - 2]->x) <\n (stack[top - 1]->x - stack[top - 2]->x) * (p->y - stack[top - 2]->y))\n top--;\n stack[top++] = p;\n }\n\n // Copy the convex hull to the output view\n Kokkos::parallel_for(Kokkos::RangePolicy(0, top),\n KOKKOS_LAMBDA(const int &i) { hull[i] = stack[i]; });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int N = points.extent(0);\n const int HULL_SIZE = 2 * N;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n parallel_sort(Kokkos::Cuda(), points_sorted, [=](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the lower and upper hulls\n auto lower_hull = hull.slice(0, N);\n auto upper_hull = hull.slice(N, HULL_SIZE);\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = 0; i < N; i++) {\n while (lower_hull_size >= 2 && cross(lower_hull[lower_hull_size - 2], lower_hull[lower_hull_size - 1], points_sorted[i]) <= 0) {\n lower_hull_size--;\n }\n lower_hull[lower_hull_size++] = points_sorted[i];\n }\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = N - 1; i >= 0; i--) {\n while (upper_hull_size >= 2 && cross(upper_hull[upper_hull_size - 2], upper_hull[upper_hull_size - 1], points_sorted[i]) <= 0) {\n upper_hull_size--;\n }\n upper_hull[upper_hull_size++] = points_sorted[i];\n }\n\n // Merge the lower and upper hulls\n int hull_size = lower_hull_size + upper_hull_size - 1;\n for (int i = 0; i < hull_size; i++) {\n hull[i] = (i < lower_hull_size) ? 
lower_hull[i] : upper_hull[i - lower_hull_size];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n // Sort points by x-coordinate\n auto points_view = Kokkos::subview(points, Kokkos::ALL(), Kokkos::ALL(), Kokkos::Rank<2>{0, 1});\n auto sorted_points = points_view;\n Kokkos::sort(Kokkos::RangePolicy(0, points_view.extent(0)),\n KOKKOS_LAMBDA(const int i, const int j) {\n return sorted_points(i, 0).x < sorted_points(j, 0).x;\n });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::View(Kokkos::ALL(), \"Upper Hull\");\n auto lower_hull = Kokkos::View(Kokkos::ALL(), \"Lower Hull\");\n int upper_size = 0, lower_size = 0;\n\n // Find the upper hull\n for (int i = 0; i < points_view.extent(0); i++) {\n while (upper_size >= 2 &&\n (upper_hull(upper_size - 1).y - upper_hull(upper_size - 2).y) *\n (sorted_points(i, 0).x - upper_hull(upper_size - 2).x) <\n (upper_hull(upper_size - 1).x - upper_hull(upper_size - 2).x) *\n (sorted_points(i, 0).y - upper_hull(upper_size - 2).y)) {\n upper_size--;\n }\n upper_hull(upper_size++) = sorted_points(i, 0);\n }\n\n // Find the lower hull\n for (int i = points_view.extent(0) - 1; i >= 0; i--) {\n while (lower_size >= 2 &&\n (lower_hull(lower_size - 1).y - lower_hull(lower_size - 2).y) *\n (sorted_points(i, 0).x - lower_hull(lower_size - 2).x) <\n (lower_hull(lower_size - 1).x - lower_hull(lower_size - 2).x) *\n (sorted_points(i, 0).y - lower_hull(lower_size - 2).y)) {\n lower_size--;\n }\n lower_hull(lower_size++) = sorted_points(i, 0);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_size + lower_size - 2;\n hull = Kokkos::View(hull_size, \"Convex Hull\");\n for (int i = 0; i < upper_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_size; i < hull_size; i++) {\n hull(i) = lower_hull(i - upper_size + lower_size - 1);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n const int N = points.extent(0);\n const int M = hull.extent(0);\n\n auto points_d = create_mirror_view(points);\n auto hull_d = create_mirror_view(hull);\n\n parallel_for(N, KOKKOS_LAMBDA(const int i) {\n points_d(i) = points(i);\n });\n\n parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &min_index) {\n if (points_d(i).y < points_d(min_index).y || (points_d(i).y == points_d(min_index).y && points_d(i).x < points_d(min_index).x)) {\n min_index = i;\n }\n }, 0);\n\n int p = 0;\n hull_d(p) = points_d(min_index);\n\n int q;\n do {\n q = (p + 1) % N;\n for (int i = 0; i < N; i++) {\n if (orientation(points_d(p), points_d(i), points_d(q)) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != min_index);\n\n int count = 0;\n do {\n hull_d(count) = points_d(p);\n count++;\n q = (p + 1) % N;\n for (int i = 0; i < N; i++) {\n if (orientation(points_d(p), points_d(i), points_d(q)) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != min_index);\n\n parallel_for(count, KOKKOS_LAMBDA(const int i) {\n hull(i) = hull_d(i);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the leftmost and rightmost points\n const Point *leftmost = points(0);\n const Point *rightmost = points(points.extent(0) - 1);\n\n // Create a stack to store the points on the hull\n std::stack stack;\n\n // Add the leftmost and rightmost points to the stack\n stack.push(leftmost);\n stack.push(rightmost);\n\n // Iterate over the remaining points\n for (int i = 1; i < points.extent(0) - 1; i++) {\n const Point *p = points(i);\n\n // Check if the point is to the left of the line defined by the last two points on the stack\n if (orientation(stack.top(), stack.top()->next, p) == LEFT) {\n // If it is, pop the last point from the stack\n stack.pop();\n // And add the new point to the stack\n stack.push(p);\n }\n }\n\n // Copy the points from the stack to the hull\n int hull_size = stack.size();\n for (int i = 0; i < hull_size; i++) {\n hull(i) = stack.top();\n stack.pop();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. 
Store the result in `hull`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Find the point with the smallest y-coordinate.\n int min_index = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].y < points[min_index].y) {\n min_index = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n std::swap(points[0], points[min_index]);\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points.begin() + 1, points.end(), [&](const Point& a, const Point& b) {\n return (atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x));\n });\n\n // Initialize the convex hull with the first three points.\n hull.push_back(points[0]);\n hull.push_back(points[1]);\n hull.push_back(points[2]);\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < points.size(); i++) {\n while (hull.size() >= 2 && (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y) - (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) >= 0) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Find the first point in the upper hull that is to the right of the current point\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Find the first point in the lower hull that is to the left of the current point\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n Point leftmost = points[0];\n for (const Point& p : points) {\n if (p.x < leftmost.x) {\n leftmost = p;\n }\n }\n\n // Sort the points by their angle with the leftmost point\n std::sort(points.begin(), points.end(), [&leftmost](const Point& a, const Point& b) {\n return std::atan2(a.y - leftmost.y, a.x - leftmost.x) < std::atan2(b.y - leftmost.y, b.x - leftmost.x);\n });\n\n // Find the convex hull using Graham's scan\n std::stack convex_hull;\n convex_hull.push(leftmost);\n convex_hull.push(points[1]);\n for (size_t i = 2; i < points.size(); i++) {\n Point top = convex_hull.top();\n while 
(convex_hull.size() >= 2 && (top.x - convex_hull.at(convex_hull.size() - 2).x) * (points[i].y - top.y) - (top.y - convex_hull.at(convex_hull.size() - 2).y) * (points[i].x - top.x) < 0) {\n convex_hull.pop();\n top = convex_hull.top();\n }\n convex_hull.push(points[i]);\n }\n\n // Copy the convex hull to the output vector\n while (!convex_hull.empty()) {\n hull.push_back(convex_hull.top());\n convex_hull.pop();\n }\n}\n", + ".", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the leftmost and rightmost points\n Point leftmost = points[0];\n Point rightmost = points[points.size() - 1];\n\n // Find the points that are on the upper and lower hulls\n std::vector upperHull;\n std::vector lowerHull;\n\n #pragma omp parallel sections\n {\n #pragma omp section\n {\n upperHull.push_back(leftmost);\n for (int i = 1; i < points.size(); i++) {\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n }\n }\n\n #pragma omp section\n {\n lowerHull.push_back(rightmost);\n for (int i = points.size() - 2; i >= 0; i--) {\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) <= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n }\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size());\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.begin(), lowerHull.end());\n\n // Remove the duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the lower hull\n for (const Point& point : points) {\n while (hull.size() >= 2 && (hull.back() - hull[hull.size() - 2]).cross(point - hull.back()) <= 0) {\n hull.pop_back();\n }\n 
hull.push_back(point);\n }\n\n // Find the upper hull\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() >= 2 && (hull.back() - hull[hull.size() - 2]).cross(points[i] - hull.back()) <= 0) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n int n = points.size();\n if (n < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n int l = 0, r = 0;\n hull.push_back(points[l]);\n for (int i = 1; i < n; i++) {\n while (l >= 1 && (points[i].y - hull[l - 1].y) * (hull[l].x - hull[l - 1].x) <= (points[i].x - hull[l - 1].x) * (hull[l].y - hull[l - 1].y))\n l--;\n while (r >= 1 && (points[i].y - hull[r - 1].y) * (hull[r].x - hull[r - 1].x) <= (points[i].x - hull[r - 1].x) * (hull[r].y - hull[r - 1].y))\n r--;\n hull.push_back(points[i]);\n l++;\n r++;\n }\n\n hull.push_back(points[n - 1]);\n for (int i = n - 2; i >= 0; i--) {\n while (l >= 1 && (points[i].y - hull[l - 1].y) * (hull[l].x - hull[l - 1].x) <= (points[i].x - hull[l - 1].x) * (hull[l].y - hull[l - 1].y))\n l--;\n while (r >= 1 && (points[i].y - hull[r - 1].y) * (hull[r].x - hull[r - 1].x) <= (points[i].x - hull[r - 1].x) * (hull[r].y - hull[r - 1].y))\n r--;\n hull.push_back(points[i]);\n l++;\n r++;\n }\n\n hull.pop_back();\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n hull.pop_back();\n}\n", + "\n\n // Find the leftmost point\n int l = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[l].x) {\n l = i;\n }\n }\n\n int p = l, q;\n do {\n hull.push_back(points[p]);\n q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != l);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n 
(upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) *\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) *\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Combine the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n int l = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[l].x) {\n l = i;\n }\n }\n\n int p = l;\n int q;\n do {\n hull.push_back(points[p]);\n q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], 
points[q]) == -1) {\n q = i;\n }\n }\n p = q;\n } while (p != l);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n int leftMost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftMost].x) {\n leftMost = i;\n }\n }\n\n // Start from leftmost point, keep moving counterclockwise until we reach the start point again\n int p = leftMost;\n do {\n // Add current point to result\n hull.push_back(points[p]);\n\n // Search for a point 'q' such that the vector 'p-q' has the smallest positive orientation.\n int q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == 2) {\n q = i;\n }\n }\n\n // If 'q' is not 'p', then update 'p' and continue\n p = q;\n\n } while (p != leftMost);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (point.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - point.x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (point.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - point.x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n Point p = points[leftmost];\n 
hull.push_back(p);\n points.erase(points.begin() + leftmost);\n\n while (points.size() > 0) {\n int next = -1;\n for (int i = 0; i < points.size(); i++) {\n if (next == -1 || (orientation(p, points[i], points[next]) == 2)) {\n next = i;\n }\n }\n p = points[next];\n hull.push_back(p);\n points.erase(points.begin() + next);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) *\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) *\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "serial", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. 
Store the result in `hull`.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Find the leftmost point\n int leftMost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftMost].x) {\n leftMost = i;\n }\n }\n\n // Start from the leftmost point and keep moving counter-clockwise until we reach the leftmost point again\n Point p = points[leftMost];\n do {\n hull.push_back(p);\n int next = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i] != p && orientation(p, points[i], points[next]) == 2) {\n next = i;\n }\n }\n p = points[next];\n } while (p != points[leftMost]);\n}\n", + "\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n int n = points.size();\n std::vector up(n), down(n);\n int m = 0, k = 0;\n\n for (int i = 0; i < n; i++) {\n if ((i == 0) || ((i == 1) && (points[i].x == points[i - 1].x) && (points[i].y == points[i - 1].y)))\n continue;\n down[m++] = points[i];\n }\n\n for (int i = n - 1; i >= 0; i--) {\n if ((i == n - 1) || ((i == n - 2) && (points[i].x == points[i + 1].x) && (points[i].y == points[i + 1].y)))\n continue;\n up[k++] = points[i];\n }\n\n hull.clear();\n int t[m + k + 1];\n int q = 0;\n for (int i = 0; i < m; i++)\n t[q++] = i;\n for (int i = k - 1; i >= 0; i--)\n t[q++] = i + m;\n t[q] = t[0];\n t[q + 1] = t[1];\n\n for (int i = 0; i <= q; i++) {\n int a = t[i], b = t[i + 1];\n if (a == b)\n continue;\n int c = t[i + 2];\n if (a != c) {\n hull.push_back(down[a]);\n continue;\n }\n int x1 = down[a].x, y1 = down[a].y;\n int x2 = down[b].x, y2 = down[b].y;\n int x3 = down[c].x, y3 = down[c].y;\n int dx1 = x2 - x1, dy1 = y2 - y1;\n int dx2 = x3 - x2, dy2 = y3 - y2;\n int cross = dx1 * dy2 - dx2 * dy1;\n if (cross < 0)\n hull.push_back(down[a]);\n else if (cross > 0)\n hull.push_back(down[b]);\n else {\n hull.push_back(down[a]);\n hull.push_back(down[b]);\n }\n }\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n int n = points.size();\n std::vector upper(n);\n std::vector lower(n);\n\n int m = 0;\n for (int i = 0; i < n; i++) {\n while (m >= 2 && (upper[m - 1].y - upper[m - 2].y) * (points[i].x - upper[m - 2].x) <=\n (upper[m - 1].x - upper[m - 2].x) * (points[i].y - upper[m - 2].y)) {\n m--;\n }\n upper[m++] = points[i];\n }\n\n m = 0;\n for (int i = n - 1; i >= 0; i--) {\n while (m >= 2 && (lower[m - 1].y - lower[m - 2].y) * (points[i].x - lower[m - 2].x) <=\n (lower[m - 1].x - lower[m - 2].x) * (points[i].y - lower[m - 2].y)) {\n m--;\n }\n lower[m++] = points[i];\n }\n\n for (int i = 0; i < m; i++) {\n hull.push_back(upper[i]);\n }\n for (int i = m - 2; i >= 0; i--) {\n hull.push_back(lower[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x;\n });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && 
crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Find the first point in the upper hull that is to the right of the current point\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x) *\n (point.y - upperHull[upperHull.size() - 1].y) -\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Find the first point in the lower hull that is to the left of the current point\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x) *\n (point.y - lowerHull[lowerHull.size() - 1].y) -\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n hull.pop_back();\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n hull.push_back(points[0]);\n hull.push_back(points[1]);\n\n for (size_t i = 2; i < points.size(); i++) {\n while (hull.size() >= 2 &&\n (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <\n (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n int t = hull.size();\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() > t &&\n (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <\n (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n hull.pop_back();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower 
hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n std::vector upper;\n for (auto const& point : points) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper.back(), point) <= 0) {\n upper.pop_back();\n }\n upper.push_back(point);\n }\n upper.pop_back();\n\n std::vector lower;\n for (auto const& point : points) {\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower.back(), point) >= 0) {\n lower.pop_back();\n }\n lower.push_back(point);\n }\n lower.pop_back();\n\n hull.insert(hull.end(), upper.begin(), upper.end());\n hull.insert(hull.end(), lower.rbegin(), lower.rend());\n}\n", + "\n\n if (points.size() <= 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n std::vector upper;\n for (const Point& p : points) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], p) <= 0)\n upper.pop_back();\n upper.push_back(p);\n }\n upper.pop_back();\n\n std::vector lower;\n for (int i = points.size() - 1; i >= 0; i--) {\n const Point& p = points[i];\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], p) <= 0)\n lower.pop_back();\n lower.push_back(p);\n }\n lower.pop_back();\n\n hull.insert(hull.end(), upper.begin(), upper.end());\n hull.insert(hull.end(), lower.rbegin(), lower.rend());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) == -1) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) == 1) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it 
is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Combine the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n int n = points.size();\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n std::vector upper;\n for (int i = 0; i < n; i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], points[i]) <= 0)\n upper.pop_back();\n upper.push_back(points[i]);\n }\n\n std::vector lower;\n for (int i = n - 1; i >= 0; i--) {\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], points[i]) <= 0)\n lower.pop_back();\n lower.push_back(points[i]);\n }\n\n for (int i = 1; i < upper.size() - 1; i++)\n hull.push_back(upper[i]);\n for (int i = lower.size() - 2; i >= 1; i--)\n hull.push_back(lower[i]);\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 
0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull.back(), point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull.back(), point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper hull\n for (const Point& point : points) {\n while (hull.size() >= 2 && (hull.back().y - hull[hull.size() - 2].y) * (point.x - hull[hull.size() - 2].x) <=\n (hull.back().x - hull[hull.size() - 2].x) * (point.y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(point);\n }\n\n // Find the lower hull\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() >= 2 && (hull.back().y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) <=\n (hull.back().x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinate\n std::sort(points.begin(), points.end(), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper convex hull\n for (const Point &p : points) {\n while (hull.size() >= 2 && cross(hull[hull.size() - 2], hull[hull.size() - 1], p) <= 0) {\n hull.pop_back();\n }\n hull.push_back(p);\n }\n\n // Find the lower convex hull\n std::reverse(points.begin(), points.end());\n for (const Point &p : points) {\n while (hull.size() >= 2 && cross(hull[hull.size() - 2], hull[hull.size() - 1], p) <= 0) {\n hull.pop_back();\n }\n hull.push_back(p);\n }\n\n // Remove the last point from the lower convex hull\n hull.pop_back();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x;\n });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], 
upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point a, Point b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point p : points) {\n // Build the lower hull\n while (lowerHull.size() >= 2 &&\n orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], p) <= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(p);\n\n // Build the upper hull\n while (upperHull.size() >= 2 &&\n orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], p) >= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(p);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), lowerHull.begin(), lowerHull.end());\n for (int i = upperHull.size() - 2; i >= 0; i--) {\n hull.push_back(upperHull[i]);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "hip", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\n__global__ void convexHull(const Point *points, size_t numPoints, Point *hull, size_t hullSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int numThreads = hipGetNumThreads();\n int threadIdx = hipGetThreadIdx();\n int blockIdx = hipGetBlockIdx();\n int blockDim = hipGetBlockDim();\n int gridDim = hipGetGridDim();\n\n // Each thread computes the convex hull of a subset of points\n int start = threadIdx.x + blockIdx.x * blockDim.x;\n int end = fmin(start + numThreads, numPoints);\n\n // Compute the convex hull of the subset of points\n vector subset(end - start);\n for (int i = start; i < end; i++) {\n subset[i - start] = points[i];\n }\n vector hullSubset = convexHull(subset);\n\n // Merge the convex hull of the subset of points with the overall convex hull\n for (int i = 0; i < hullSubset.size(); i++) {\n hull[hullSize + i] = hullSubset[i];\n }\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize > 2 && crossProduct(hull[hullSize - 2], hull[hullSize - 1], points[i]) <= 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size() - 2;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[i] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[upperHull.size() + i - 1] = lowerHull[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by polar angle with respect to the 
leftmost point\n std::sort(points, points + numPoints, [leftmost](const Point &a, const Point &b) {\n return atan2(a.y - points[leftmost].y, a.x - points[leftmost].x) < atan2(b.y - points[leftmost].y, b.x - points[leftmost].x);\n });\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last two points in the hull, add it to the hull\n while (hullSize >= 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < numPoints) {\n hull[gid] = points[gid];\n }\n\n __syncthreads();\n\n if (tid == 0) {\n std::sort(hull, hull + numPoints, [](const Point &a, const Point &b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n\n hullSize = m;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < numPoints) {\n hull[gid] = points[gid];\n }\n __syncthreads();\n\n if (tid == 0) {\n sort(hull, hull + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n for (int i = numPoints - 2, t = m + 1; i >= 0; i--) {\n while (t >= m + 1 && (hull[t - 1].y - hull[t - 2].y) * (hull[i].x - hull[t - 1].x) <\n (hull[t - 1].x - hull[t - 2].x) * (hull[i].y - hull[t - 1].y)) {\n t--;\n }\n hull[t++] = hull[i];\n }\n hullSize = t - m;\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n thrust::convex_hull(thrust::device, points, points + numPoints,\n std::back_inserter(upperHull), std::back_inserter(lowerHull));\n\n // Merge the upper and lower hulls into the final hull\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n\n // Return the size of the hull\n *hullSize = upperHull.size() + lowerHull.size();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while 
(lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Combine the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size() - 2;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[i] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[i + upperHull.size() - 1] = lowerHull[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the minimum point with the first point.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the remaining points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [&](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize > 2 && crossProduct(hull[hullSize - 2], hull[hullSize - 1], points[i]) <= 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by their polar angle with respect to the leftmost point\n auto cmp = [&](const Point &a, const Point &b) {\n return (a.y - points[leftmost].y) * (b.x - points[leftmost].x) - (a.x - points[leftmost].x) * (b.y - points[leftmost].y);\n };\n std::sort(points + leftmost + 1, points + numPoints, cmp);\n\n // Build the convex hull using Graham's scan\n int hullIndex = 0;\n for (int i = leftmost + 1; i < numPoints; i++) {\n while (hullIndex >= 2 && cmp(hull[hullIndex - 1], hull[hullIndex - 2], points[i])) {\n hullIndex--;\n }\n hull[hullIndex++] = points[i];\n }\n\n // Copy the convex hull to the output buffer\n std::copy(hull, hull + hullIndex, hull);\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((hull[hullSize - 1].x - points[i].x) * (hull[hullSize - 1].y - points[leftmost].y) -\n (hull[hullSize - 1].y - points[i].y) * (hull[hullSize - 1].x - points[leftmost].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n\n // If the current point is to the right of the last point in the hull, remove the last point from the hull\n else if ((hull[hullSize - 1].x - points[i].x) * (hull[hullSize - 1].y - points[leftmost].y) -\n (hull[hullSize - 1].y - points[i].y) * (hull[hullSize - 1].x - points[leftmost].x) >\n 0) {\n hullSize--;\n }\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n 
}\n\n // Initialize the hull\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the point is to the left of the line defined by the last two points in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hull[hullSize++] = points[i];\n }\n }\n\n // If the last point is not the same as the first point, add it to the hull\n if (hull[hullSize - 1] != hull[0]) {\n hull[hullSize++] = hull[0];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n\n // Find the convex hull using Graham's scan\n int hullSize = 1;\n for (int i = 1; i < numPoints; i++) {\n // Find the next point that is not on the same line as the previous two points\n while (hullSize > 1 &&\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 2].x - hull[hullSize - 1].x) <=\n (points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 2].y - hull[hullSize - 1].y)) {\n hullSize--;\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n }\n\n // If the last point in the hull is not the leftmost point, add it to the hull\n if (hull[hullSize - 1] != points[leftmost]) {\n hull[hullSize] = points[leftmost];\n hullSize++;\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Find the convex hull using Graham's scan\n int p = leftmost;\n do {\n // Find the next point that is not on the same line as the previous two points\n int q = (p + 1) % numPoints;\n for (int i = 0; i < numPoints; i++) {\n if (i != p && i != q &&\n (points[i].y - points[p].y) * (points[q].x - points[p].x) >\n (points[i].x - points[p].x) * (points[q].y - points[p].y)) {\n q = i;\n }\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[q];\n p = q;\n\n } while (p != leftmost);\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the remaining points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [&](const Point &a, const Point &b) {\n double 
angleA = atan2(a.y - points[0].y, a.x - points[0].x);\n double angleB = atan2(b.y - points[0].y, b.x - points[0].x);\n return angleA < angleB;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // TODO: Implement the convex hull algorithm\n // The kernel is launched with at least as many threads as points.\n // Store the result in `hull`.\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hullSize = 0;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[hullSize++] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[hullSize++] = lowerHull[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the minimum point with the first point.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize > 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n 
points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of points. The final hull is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n 
convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Send the leftmost point to all other ranks\n MPI_Bcast(&leftmost, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the convex hull on each rank\n std::vector localHull;\n grahamScan(points, localHull, leftmost);\n\n // Gather all the local hulls on rank 0\n if (rank == 0) {\n hull = localHull;\n for (int i = 1; i < size; i++) {\n std::vector recvHull;\n MPI_Recv(recvHull.data(), recvHull.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n hull.insert(hull.end(), recvHull.begin(), recvHull.end());\n }\n } else {\n MPI_Send(localHull.data(), localHull.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n\n // Remove duplicate points from the hull\n std::sort(hull.begin(), hull.end());\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Find the leftmost point\n int 
leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull.push_back(points[leftmost]);\n\n // Find the convex hull using Graham's scan\n int p = leftmost;\n do {\n // Find the next point that is not on the same line as the previous two points\n int q = (p + 1) % points.size();\n while (orientation(points[p], points[q], points[(q + 1) % points.size()]) == 0) {\n q = (q + 1) % points.size();\n }\n\n // Add the next point to the hull\n hull.push_back(points[q]);\n\n // Update the current point\n p = q;\n } while (p != leftmost);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the points by x-coordinate\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n\n // Return the hull on rank 0\n if (rank == 0) {\n return hull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n if (rank == 0) {\n hull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, hull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(hull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Start from the leftmost point, keep moving counter-clockwise\n // until we reach the start point again\n int p = leftmost;\n do {\n hull.push_back(points[p]);\n int q = (p + 1) % points.size();\n\n // Find the most counter-clockwise point from p\n for (int i = 0; i < points.size(); i++) {\n // Skip if i is the same as p or q\n if (i == p || i == q) {\n continue;\n }\n\n // Calculate the cross product of (q - p) x (i - p)\n double cross = (points[q].x - points[p].x) * (points[i].y - points[p].y) -\n (points[q].y - points[p].y) * (points[i].x - points[p].x);\n\n // If the cross product is positive, then i is more counter-clockwise than q\n if (cross > 0) {\n q = i;\n }\n }\n\n // Update p to be q\n p = q;\n } while (p != leftmost);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n 
MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the leftmost point\n Point leftmost = points[0];\n for (size_t i = 1; i < points.size(); i++) {\n if (points[i].x < leftmost.x) {\n leftmost = points[i];\n }\n }\n\n // Broadcast the leftmost point to all ranks\n MPI_Bcast(&leftmost, sizeof(Point), MPI_BYTE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull on each rank\n std::vector localHull;\n convexHull(points, leftmost, localHull);\n\n // Gather the local convex hull from all ranks\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "cuda", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\n__global__ void convexHull(const Point *points, size_t numPoints, Point *hull, size_t hullSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n if (tid == 0) {\n qsort(hull, numPoints, sizeof(Point), [](const void *a, const void *b) {\n Point *pa = (Point *)a;\n Point *pb = (Point *)b;\n if (pa->x != pb->x) {\n return pa->x < pb->x ? -1 : 1;\n }\n return pa->y < pb->y ? 
-1 : 1;\n });\n int m = 0;\n for (int i = 0; i < numPoints; ++i) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m].y - hull[m - 1].y) * (hull[m - 1].x - hull[m - 2].x)) {\n --m;\n }\n hull[m++] = hull[i];\n }\n hullSize = m;\n }\n __syncthreads();\n if (tid < hullSize) {\n hull[tid] = hull[tid];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(b.y - points[0].y, b.x - points[0].x) < atan2(a.y - points[0].y, a.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHull(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.begin(), lowerHull.end(), hull + upperHull.size());\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the point with the 
minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n double angleA = atan2(a.y - points[0].y, a.x - points[0].x);\n double angleB = atan2(b.y - points[0].y, b.x - points[0].x);\n return angleA < angleB;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n\n if (numPoints <= 1) {\n return;\n }\n\n // Sort the points by their x-coordinates\n thrust::sort(hull, hull + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the leftmost and rightmost points\n Point leftmost = hull[0];\n Point rightmost = hull[numPoints - 1];\n\n // Find the convex hull of the points to the left of the leftmost point\n int leftHullSize = 0;\n convexHull(hull, numPoints - 1, hull, leftHullSize);\n\n // Find the convex hull of the points to the right of the rightmost point\n int rightHullSize = 0;\n convexHull(hull + 1, numPoints - 1, hull + leftHullSize, rightHullSize);\n\n // Merge the two convex hulls\n int hullIndex = 0;\n int leftIndex = 0;\n int rightIndex = 0;\n while (leftIndex < leftHullSize && rightIndex < rightHullSize) {\n if (hull[leftIndex].y < hull[rightIndex].y) {\n hull[hullIndex++] = hull[leftIndex++];\n } else {\n hull[hullIndex++] = hull[rightIndex++];\n }\n }\n while (leftIndex < leftHullSize) {\n hull[hullIndex++] = hull[leftIndex++];\n }\n while (rightIndex < rightHullSize) {\n hull[hullIndex++] = hull[rightIndex++];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device_ptr(points), thrust::device_ptr(points + numPoints),\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(thrust::device_ptr(points), numPoints, &upperHull, &lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), thrust::device_ptr(hull));\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), thrust::device_ptr(hull + upperHull.size()));\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first 
point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current convex hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n double angle1 = atan2(a.y - points[0].y, a.x - points[0].x);\n double angle2 = atan2(b.y - points[0].y, b.x - points[0].x);\n return angle1 < angle2;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Start from the leftmost point and keep moving counter-clockwise\n int hullSize = 0;\n int current = leftmost;\n do {\n // Add the current point to the hull\n hull[hullSize] = points[current];\n hullSize++;\n\n // Find the next point that is most counter-clockwise from the current point\n int next = (current + 1) % numPoints;\n for (int i = 0; i < numPoints; i++) {\n if (orientation(points[current], points[next], points[i]) == 2) {\n next = i;\n }\n }\n\n // Set the current point to the next point\n current = next;\n } while (current != leftmost);\n\n // Return the size of the hull\n return hullSize;\n}\n", + "\n\n // Sort the points by their x-coordinate\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, 
points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), hull);\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= numPoints) {\n return;\n }\n\n // Find the point with the smallest y-coordinate\n int minIdx = idx;\n for (int i = idx + 1; i < numPoints; i++) {\n if (points[i].y < points[minIdx].y) {\n minIdx = i;\n }\n }\n\n // Swap the point with the smallest y-coordinate to the front of the array\n Point temp = points[idx];\n points[idx] = points[minIdx];\n points[minIdx] = temp;\n\n // Sort the points by their angle with respect to the point with the smallest y-coordinate\n std::sort(points + idx + 1, points + numPoints, [points, idx](const Point &a, const Point &b) {\n return (a.x - points[idx].x) * (b.y - points[idx].y) > (b.x - points[idx].x) * (a.y - points[idx].y);\n });\n\n // Initialize the hull with the first three points\n hull[0] = points[idx];\n hull[1] = points[idx + 1];\n hull[2] = points[idx + 2];\n int hullSize = 3;\n\n // Iterate over the remaining points\n for (int i = idx + 3; i < numPoints; i++) {\n // Check if the current point is inside the hull\n while (hullSize > 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) <=\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x)) {\n hullSize--;\n }\n\n // Add the current point to the hull\n hull[hullSize] = points[i];\n hullSize++;\n }\n\n // Return the hull\n hullSize = hullSize;\n}\n\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device_ptr(points), thrust::device_ptr(points + numPoints),\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), thrust::device_ptr(hull));\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), thrust::device_ptr(hull + upperHull.size()));\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n\n // Find the convex hull using Graham's scan\n int hullSize = 1;\n for (int i = 1; i < numPoints; i++) {\n // Find the next point that is not on the same line as the previous two points\n while (hullSize >= 2 &&\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 2].x - hull[hullSize - 1].x) 
<=\n (points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 2].y - hull[hullSize - 1].y)) {\n hullSize--;\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates.\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the leftmost point.\n Point leftmostPoint = points[0];\n\n // Find the rightmost point.\n Point rightmostPoint = points[numPoints - 1];\n\n // Find the points that are above the line defined by the leftmost and rightmost points.\n thrust::device_vector pointsAboveLine;\n thrust::copy_if(thrust::device, points, points + numPoints,\n thrust::make_device_vector(pointsAboveLine),\n [leftmostPoint, rightmostPoint](const Point &point) {\n return point.y > leftmostPoint.y + (rightmostPoint.y - leftmostPoint.y) *\n (point.x - leftmostPoint.x) /\n (rightmostPoint.x - leftmostPoint.x);\n });\n\n // Find the points that are below the line defined by the leftmost and rightmost points.\n thrust::device_vector pointsBelowLine;\n thrust::copy_if(thrust::device, points, points + numPoints,\n thrust::make_device_vector(pointsBelowLine),\n [leftmostPoint, rightmostPoint](const Point &point) {\n return point.y < leftmostPoint.y + (rightmostPoint.y - leftmostPoint.y) *\n (point.x - leftmostPoint.x) /\n (rightmostPoint.x - leftmostPoint.x);\n });\n\n // Find the convex hull of the points above the line.\n thrust::device_vector hullAboveLine;\n convexHull(thrust::device_ptr(pointsAboveLine.data()), pointsAboveLine.size(),\n thrust::device_ptr(hullAboveLine.data()), hullAboveLine.size());\n\n // Find the convex hull of the points below the line.\n thrust::device_vector hullBelowLine;\n convexHull(thrust::device_ptr(pointsBelowLine.data()), pointsBelowLine.size(),\n thrust::device_ptr(hullBelowLine.data()), hullBelowLine.size());\n\n // Merge the two convex hulls.\n thrust::copy(thrust::device, hullAboveLine.begin(), hullAboveLine.end(), hull);\n thrust::copy(thrust::device, hullBelowLine.begin(), hullBelowLine.end(), hull + hullAboveLine.size());\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(hull, hull + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <= (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n hullSize = m;\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < 
numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n }\n\n // If the hull has more than 2 points, remove the last point\n if (hullSize > 2) {\n hullSize--;\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of points. The final hull is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n\n // Return the hull on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return hull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = points.size();\n std::vector localHull;\n // Find the convex hull of the local points\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n std::vector upper, lower;\n for (int i = 0; i < n; i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], points[i]) <= 0)\n upper.pop_back();\n upper.push_back(points[i]);\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], points[i]) >= 0)\n lower.pop_back();\n lower.push_back(points[i]);\n }\n localHull.insert(localHull.end(), upper.begin(), upper.end());\n for (int i = lower.size() - 2; i >= 0; i--)\n localHull.push_back(lower[i]);\n }\n }\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n // Find the convex hull of the global points\n if (rank == 0) {\n std::sort(globalHull.begin(), globalHull.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n std::vector upper, lower;\n for (int i = 0; i < globalHull.size(); i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], globalHull[i]) <= 0)\n upper.pop_back();\n upper.push_back(globalHull[i]);\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], globalHull[i]) >= 0)\n lower.pop_back();\n lower.push_back(globalHull[i]);\n }\n hull.insert(hull.end(), upper.begin(), upper.end());\n for (int i = lower.size() - 2; i >= 0; i--)\n hull.push_back(lower[i]);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of 
the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by polar angle with respect to the leftmost point\n std::sort(points.begin(), points.end(), [leftmost](Point a, Point b) {\n double dx1 = a.x - points[leftmost].x;\n double dy1 = a.y - points[leftmost].y;\n double dx2 = b.x - points[leftmost].x;\n double dy2 = b.y - points[leftmost].y;\n return (atan2(dy1, dx1) < atan2(dy2, dx2));\n });\n\n // Use Graham's scan to find the convex hull\n hull.push_back(points[leftmost]);\n int m = 1;\n for (int i = 1; i < points.size(); i++) {\n while (m >= 2 && (points[i].x - hull[m - 1].x) * (hull[m].y - hull[m - 1].y) - (points[i].y - hull[m - 1].y) * (hull[m].x - hull[m - 1].x) <= 0) {\n hull.pop_back();\n m--;\n }\n hull.push_back(points[i]);\n m++;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local 
hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Reduce the local hulls to a single global hull\n std::vector globalHull;\n MPI_Reduce(&localHull[0], &globalHull[0], localHull.size() * sizeof(Point), MPI_BYTE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // If this is rank 0, copy the global hull to the output hull\n if (rank == 0) {\n hull = globalHull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(points.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Find the convex hull of the points on each rank\n std::vector local_hull;\n convexHull(points, local_hull);\n\n // Gather all the local hulls on rank 0\n std::vector global_hull;\n if (my_rank == 0) {\n global_hull.resize(local_hull.size() * num_procs);\n }\n MPI_Gather(local_hull.data(), local_hull.size(), MPI_DOUBLE, global_hull.data(), local_hull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (my_rank == 0) {\n convexHull(global_hull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n if (rank == 
0) {\n hull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, hull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(hull, hull);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(*convexHull[i], *convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Sort the points by their x-coordinates\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n\t\tKOKKOS_LAMBDA(const int& i, const int& j) {\n\t\t\treturn points(i)->x < points(j)->x;\n\t\t});\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\thull.push_back(points(0));\n\thull.push_back(points(1));\n\tfor (int i = 2; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(hull[hull.size() - 2]->x - hull[hull.size() - 1]->x) * (points(i)->y - hull[hull.size() - 1]->y) -\n\t\t\t(hull[hull.size() - 2]->y - hull[hull.size() - 1]->y) * (points(i)->x - hull[hull.size() - 1]->x) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + 
"generated_output": "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::TeamPolicy>;\n\tusing team_member_type = typename policy_type::member_type;\n\n\tconst size_t num_points = points.extent(0);\n\tconst size_t num_teams = (num_points + policy_type::team_size - 1) / policy_type::team_size;\n\n\t// Find the leftmost point\n\tteam_member_type team_member;\n\tPoint leftmost_point = points(0);\n\tfor (size_t i = 1; i < num_points; ++i) {\n\t\tif (points(i).x < leftmost_point.x) {\n\t\t\tleftmost_point = points(i);\n\t\t}\n\t}\n\n\t// Sort the points by polar angle with respect to the leftmost point\n\tstd::vector sorted_points(num_points);\n\tfor (size_t i = 0; i < num_points; ++i) {\n\t\tsorted_points[i] = points(i);\n\t}\n\tstd::sort(sorted_points.begin(), sorted_points.end(), [leftmost_point](Point const& p1, Point const& p2) {\n\t\tdouble angle1 = std::atan2(p1.y - leftmost_point.y, p1.x - leftmost_point.x);\n\t\tdouble angle2 = std::atan2(p2.y - leftmost_point.y, p2.x - leftmost_point.x);\n\t\treturn angle1 < angle2;\n\t});\n\n\t// Compute the perimeter of the convex hull using Graham's scan\n\tstd::vector convex_hull;\n\tconvex_hull.push_back(sorted_points[0]);\n\tconvex_hull.push_back(sorted_points[1]);\n\tfor (size_t i = 2; i < num_points; ++i) {\n\t\twhile (convex_hull.size() >= 2 &&\n\t\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t (sorted_points[i].y - convex_hull[convex_hull.size() - 1].y) -\n\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) *\n\t\t\t\t (sorted_points[i].x - convex_hull[convex_hull.size() - 1].x) < 0) {\n\t\t\tconvex_hull.pop_back();\n\t\t}\n\t\tconvex_hull.push_back(sorted_points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convex_hull.size(); ++i) {\n\t\tperimeter += distance(convex_hull[i], convex_hull[(i + 1) % convex_hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.data(), points.data() + points.extent(0), [](const Point* p1, const Point* p2) { return p1->x < p2->x; });\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector hull;\n\tfor (int i = 0; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tconst Point* p1 = hull[i];\n\t\t\tconst Point* p2 = hull[i + 1];\n\t\t\tconst Point* p3 = points[i];\n\t\t\tdouble crossProduct = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\tsum += crossProduct;\n\t\t}, Kokkos::Sum(0.0)) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tfor (int i = points.extent(0) - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tconst Point* p1 = hull[i];\n\t\t\tconst Point* p2 = hull[i + 1];\n\t\t\tconst Point* p3 = points[i];\n\t\t\tdouble crossProduct = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\tsum += crossProduct;\n\t\t}, Kokkos::Sum(0.0)) < 0) 
{\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points\n\tstd::vector hull;\n\tconvex_hull(points.data(), points.size(), std::back_inserter(hull));\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Sort the points by their x-coordinates.\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n\t\tKOKKOS_LAMBDA(const int& i, const int& j) {\n\t\t\treturn points(i)->x < points(j)->x;\n\t\t});\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points(0));\n\tconvexHull.push_back(points(points.extent(0) - 1));\n\tfor (int i = 1; i < points.extent(0) - 1; i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2]->y - convexHull[convexHull.size() - 1]->y) *\n\t\t\t(convexHull[convexHull.size() - 1]->x - points(i)->x) <\n\t\t\t(convexHull[convexHull.size() - 1]->y - points(i)->y) *\n\t\t\t(convexHull[convexHull.size() - 2]->x - convexHull[convexHull.size() - 1]->x)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(*convexHull[i], *convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tgrahamScan(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + 
"source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tusing namespace Kokkos;\n\n\t// Find the convex hull of the points\n\tstd::vector hull_points;\n\tconvex_hull(points, hull_points);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull_points.size(); ++i) {\n\t\tsize_t j = (i + 1) % hull_points.size();\n\t\tperimeter += distance(hull_points[i], hull_points[j]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points\n\tstd::vector hull;\n\tconvex_hull(points.data(), points.extent(0), std::back_inserter(hull));\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.extent(0));\n\tif (points.extent(0) < 3) {\n\t\tfor (int i = 0; i < points.extent(0); ++i) {\n\t\t\tconvexHull.push_back(points[i]);\n\t\t}\n\t} else {\n\t\t// Find the leftmost point.\n\t\tint leftmost = 0;\n\t\tfor (int i = 1; i < points.extent(0); ++i) {\n\t\t\tif (points[i].x < points[leftmost].x) {\n\t\t\t\tleftmost = i;\n\t\t\t}\n\t\t}\n\n\t\t// Start from the leftmost point and keep moving clockwise until we reach the leftmost point again.\n\t\tint current = leftmost;\n\t\tdo {\n\t\t\tconvexHull.push_back(points[current]);\n\t\t\tint next = (current + 1) % points.extent(0);\n\t\t\tfor (int i = 0; i < points.extent(0); ++i) {\n\t\t\t\tif (orientation(points[current], points[next], points[i]) == 2) {\n\t\t\t\t\tnext = i;\n\t\t\t\t}\n\t\t\t}\n\t\t\tcurrent = next;\n\t\t} while (current != leftmost);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + 
"did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % 
hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Sort the points by their x-coordinates\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int& i, const int& j) {\n\t\treturn points(i)->x < points(j)->x;\n\t});\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\tfor (int i = 0; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tPoint* p1 = hull[i];\n\t\t\tPoint* p2 = hull[i + 1];\n\t\t\tPoint* p3 = points(i);\n\t\t\tdouble cross = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\treturn sum + cross;\n\t\t}, 0.0) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points.\n\tstd::vector 
convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[points.size() - 1]);\n\n\tfor (int i = 1; i < points.size() - 1; i++) {\n\t\tPoint const& p = points[i];\n\t\tif (convexHull.size() < 2 || (convexHull.back().x - convexHull[convexHull.size() - 2].x) * (p.y - convexHull.back().y) - (convexHull.back().y - convexHull[convexHull.size() - 2].y) * (p.x - convexHull.back().x) > 0) {\n\t\t\tconvexHull.push_back(p);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && crossProduct(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && (upperHull.back().y - upperHull[upperHull.size() - 2].y) * (point.x - upperHull.back().x) < (upperHull.back().x - upperHull[upperHull.size() - 2].x) * (point.y - upperHull.back().y)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\t\twhile (lowerHull.size() >= 2 && (lowerHull.back().y - lowerHull[lowerHull.size() - 2].y) * (point.x - lowerHull.back().x) > (lowerHull.back().x - lowerHull[lowerHull.size() - 2].x) * (point.y - lowerHull.back().y)) 
{\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 2; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\tfor (int i = 2; i < n; i++) {\n\t\twhile (hull.size() >= 2 && crossProduct(hull[hull.size() - 2], hull[hull.size() - 1], points[i]) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t(points[i].y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t(points[i].x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tstd::vector hull;\n\tstd::vector visited(points.size(), false);\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor 
(int i = 1; i < points.size(); i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\tint current = leftmost;\n\tdo {\n\t\thull.push_back(points[current]);\n\t\tvisited[current] = true;\n\n\t\t// Find the next point on the hull\n\t\tint next = -1;\n\t\tfor (int i = 0; i < points.size(); i++) {\n\t\t\tif (!visited[i] && (next == -1 || points[i].x * hull.back().y - points[i].y * hull.back().x > points[next].x * hull.back().y - points[next].y * hull.back().x)) {\n\t\t\t\tnext = i;\n\t\t\t}\n\t\t}\n\t\tcurrent = next;\n\t} while (current != leftmost);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[i + 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (point.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) >\n\t\t\t\t (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - point.x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (point.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) <\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - point.x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n\t\t\t\t (point.x - 
upperHull[upperHull.size() - 1].x) <=\n\t\t\t\t (upperHull[upperHull.size() - 1].y - point.y) *\n\t\t\t\t\t (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n\t\t\t\t (point.x - lowerHull[lowerHull.size() - 1].x) >=\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - point.y) *\n\t\t\t\t\t (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n < 3) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::vector visited(n, false);\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\tint p = leftmost;\n\tdo {\n\t\thull.push_back(points[p]);\n\t\tvisited[p] = true;\n\t\tint q = (p + 1) % n;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\tif (!visited[i] && (orientation(points[p], points[i], points[q]) == 2)) {\n\t\t\t\tq = i;\n\t\t\t}\n\t\t}\n\t\tp = q;\n\t} while (p != leftmost);\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (Point const& p : points) {\n\t\twhile (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], p) <= 0)\n\t\t\tupperHull.pop_back();\n\t\tupperHull.push_back(p);\n\n\t\twhile (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], p) >= 0)\n\t\t\tlowerHull.pop_back();\n\t\tlowerHull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::vector visited(n, false);\n\tint p = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x < points[p].x) {\n\t\t\tp = i;\n\t\t}\n\t}\n\tint q = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x > points[q].x) {\n\t\t\tq = i;\n\t\t}\n\t}\n\thull.push_back(points[p]);\n\thull.push_back(points[q]);\n\tvisited[p] = true;\n\tvisited[q] = true;\n\twhile (p != q) {\n\t\tint next = -1;\n\t\tdouble max_dist = -1;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\tif (!visited[i]) {\n\t\t\t\tdouble dist = distance(points[i], hull.back());\n\t\t\t\tif (dist > max_dist) {\n\t\t\t\t\tmax_dist = dist;\n\t\t\t\t\tnext = 
i;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\thull.push_back(points[next]);\n\t\tvisited[next] = true;\n\t\tp = q;\n\t\tq = next;\n\t}\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && crossProduct(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinate\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tupperHull.push_back(points[0]);\n\tupperHull.push_back(points[1]);\n\tlowerHull.push_back(points[0]);\n\tlowerHull.push_back(points[n - 1]);\n\n\tfor (int i = 2; i < n; i++) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - upperHull[upperHull.size() - 1].x) <=\n\t\t\t\t (upperHull[upperHull.size() - 1].y - points[i].y) *\n\t\t\t\t\t (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - lowerHull[lowerHull.size() - 1].x) >=\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - points[i].y) *\n\t\t\t\t\t (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) 
{\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (int i = 0; i < lowerHull.size() - 1; i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tperimeter += distance(upperHull[upperHull.size() - 1], lowerHull[lowerHull.size() - 1]);\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(points[hull[i]], points[hull[(i + 1) % hull.size()]]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the lower and upper hulls\n\tstd::vector lowerHull, upperHull;\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) == -1) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) == 1) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < lowerHull.size() - 1; ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tfor (size_t i = upperHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (auto& point : points) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t (point.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t\t (hull.back().y - hull[hull.size() - 2].y) * (point.x - hull.back().x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(point);\n\t}\n\n\tint t = hull.size() - 1;\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() > t &&\n\t\t\t (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t\t (hull.back().y - hull[hull.size() - 2].y) * (points[i].x - hull.back().x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif 
(points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <= (points[i].x - hull.back().x) * (hull[hull.size() - 2].y - hull.back().y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (p.x - hull[hull.size() - 2].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan algorithm\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && orientation(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.empty()) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t(points[i].y - hull.back().y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), 
points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() < 3) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n\t\treturn a.x < b.x;\n\t});\n\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n\t\treturn a.y < b.y;\n\t});\n\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (p.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (p.y - hull[hull.size() - 2].y) * (hull.back().x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (p.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (p.y - hull[hull.size() - 2].y) * (hull.back().x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < 
(points[i].x - hull.back().x) * (hull[hull.size() - 2].y - hull.back().y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the lower convex hull\n\tfor (Point const& point : points) {\n\t\twhile (hull.size() >= 2 && (point.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) > (point.y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - point.x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(point);\n\t}\n\n\t// Find the upper convex hull\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) > (points[i].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\n\tfor (Point const& p : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t(p.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) >\n\t\t\t(upperHull[upperHull.size() - 1].y - upperHull[upperHull.size() - 2].y) * (p.x - upperHull[upperHull.size() - 2].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(p);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t(p.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - 
lowerHull[lowerHull.size() - 2].x) <\n\t\t\t(lowerHull[lowerHull.size() - 1].y - lowerHull[lowerHull.size() - 2].y) * (p.x - lowerHull[lowerHull.size() - 2].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(p);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; i--) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y) - (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the leftmost and rightmost points\n\tPoint leftmost = points[0];\n\tPoint rightmost = points[points.size() - 1];\n\n\t// Add the leftmost and rightmost points to the hull\n\thull.push_back(leftmost);\n\thull.push_back(rightmost);\n\n\t// Find the upper and lower tangents of the convex hull\n\tstd::vector upperTangent;\n\tstd::vector lowerTangent;\n\n\t// Find the upper tangent\n\tfor (int i = 1; i < points.size(); i++) {\n\t\twhile (upperTangent.size() >= 2 && crossProduct(upperTangent[upperTangent.size() - 2], upperTangent[upperTangent.size() - 1], points[i]) <= 0) {\n\t\t\tupperTangent.pop_back();\n\t\t}\n\t\tupperTangent.push_back(points[i]);\n\t}\n\n\t// Find the lower tangent\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (lowerTangent.size() >= 2 && crossProduct(lowerTangent[lowerTangent.size() - 2], lowerTangent[lowerTangent.size() - 1], points[i]) <= 0) {\n\t\t\tlowerTangent.pop_back();\n\t\t}\n\t\tlowerTangent.push_back(points[i]);\n\t}\n\n\t// Merge the upper and lower tangents into the hull\n\tfor (int i = 0; i < lowerTangent.size(); i++) {\n\t\thull.push_back(lowerTangent[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y) - (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) <= 0) 
{\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(points[i].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the lower hull\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(p.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(hull[hull.size() - 1].y - p.y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Find the upper hull\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(p.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(hull[hull.size() - 1].y - p.y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) 
{\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) -\n\t\t\t(points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distance(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Store the result in perimeter.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\n__global__ void convexHullPerimeter(const Point *points, size_t numPoints, double *perimeter) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points.\n\tsize_t numVertices;\n\tPoint *vertices;\n\tconvexHull(points, numPoints, vertices, numVertices);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0.0;\n\tfor (size_t i = 0; i < numVertices; i++) {\n\t\t*perimeter += distance(vertices[i], vertices[i + 1]);\n\t}\n}\n", + "\n\n\tPoint p0 = points[0];\n\tdouble minX = p0.x;\n\tfor (size_t i = 1; i < numPoints; i++) {\n\t\tif (points[i].x < minX) {\n\t\t\tminX = points[i].x;\n\t\t\tp0 = points[i];\n\t\t}\n\t}\n\n\tstd::vector hull;\n\thull.push_back(p0);\n\n\tPoint next = p0;\n\tdo {\n\t\tPoint bestPoint = points[0];\n\t\tdouble maxAngle = -1;\n\t\tfor (size_t i = 1; i < numPoints; i++) {\n\t\t\tif (points[i] == next) continue;\n\t\t\tdouble angle = atan2(points[i].y - next.y, points[i].x - next.x);\n\t\t\tif (angle > maxAngle) {\n\t\t\t\tmaxAngle = angle;\n\t\t\t\tbestPoint = points[i];\n\t\t\t}\n\t\t}\n\t\thull.push_back(bestPoint);\n\t\tnext = bestPoint;\n\t} while (next != p0);\n\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tstd::vector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tPoint p0 = points[0];\n\tdouble minX = p0.x, maxX = p0.x;\n\tdouble minY = p0.y, maxY = p0.y;\n\tfor (size_t i = 1; i < numPoints; i++) {\n\t\tPoint p = points[i];\n\t\tminX = min(minX, p.x);\n\t\tmaxX = max(maxX, p.x);\n\t\tminY = min(minY, p.y);\n\t\tmaxY = max(maxY, p.y);\n\t}\n\n\tvector upper, lower;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tPoint p = points[i];\n\t\tif (p.y == minY) {\n\t\t\tlower.push_back(p);\n\t\t}\n\t\tif (p.y == maxY) {\n\t\t\tupper.push_back(p);\n\t\t}\n\t}\n\n\tsort(lower.begin(), lower.end(), [](Point const& p1, Point const& p2) { return p1.x < 
p2.x; });\n\tsort(upper.begin(), upper.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\tvector ch;\n\tch.push_back(lower[0]);\n\tfor (size_t i = 1; i < lower.size(); i++) {\n\t\twhile (ch.size() >= 2 && (ch.back().y - ch[ch.size() - 2].y) * (lower[i].x - ch.back().x) < (ch.back().x - ch[ch.size() - 2].x) * (lower[i].y - ch.back().y)) {\n\t\t\tch.pop_back();\n\t\t}\n\t\tch.push_back(lower[i]);\n\t}\n\n\tfor (size_t i = upper.size() - 1; i >= 0; i--) {\n\t\twhile (ch.size() >= 2 && (ch.back().y - ch[ch.size() - 2].y) * (upper[i].x - ch.back().x) < (ch.back().x - ch[ch.size() - 2].x) * (upper[i].y - ch.back().y)) {\n\t\t\tch.pop_back();\n\t\t}\n\t\tch.push_back(upper[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < ch.size(); i++) {\n\t\tperimeter += distance(ch[i], ch[(i + 1) % ch.size()]);\n\t}\n\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint numBlocks = hipGridDim_x;\n\tint numThreads = hipBlockDim_x;\n\n\tPoint p1 = points[tid];\n\tdouble minDist = DBL_MAX;\n\tPoint p2;\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tif (i == tid) continue;\n\t\tPoint p3 = points[i];\n\t\tdouble dist = distance(p1, p3);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tp2 = p3;\n\t\t}\n\t}\n\n\t__shared__ double sharedMinDist[numBlocks];\n\t__shared__ Point sharedP2[numBlocks];\n\n\tsharedMinDist[bid] = minDist;\n\tsharedP2[bid] = p2;\n\n\t__syncthreads();\n\n\tdouble globalMinDist = DBL_MAX;\n\tPoint globalP2;\n\tfor (int i = 0; i < numBlocks; i++) {\n\t\tif (sharedMinDist[i] < globalMinDist) {\n\t\t\tglobalMinDist = sharedMinDist[i];\n\t\t\tglobalP2 = sharedP2[i];\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*perimeter = 2 * globalMinDist + distance(p1, globalP2);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < numPoints; ++i) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\t// Initialize the convex hull with the leftmost point\n\tstd::vector hull;\n\thull.push_back(points[leftmost]);\n\n\t// Find the convex hull using Graham's scan\n\tint p = leftmost;\n\tdo {\n\t\tint q = (p + 1) % numPoints;\n\t\tfor (int i = 0; i < numPoints; ++i) {\n\t\t\tif (i == p || i == q) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tif (orientation(points[p], points[q], points[i]) == -1) {\n\t\t\t\tq = i;\n\t\t\t}\n\t\t}\n\t\thull.push_back(points[q]);\n\t\tp = q;\n\t} while (p != leftmost);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (int i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < numPoints; i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\t// Initialize the 
convex hull with the leftmost point\n\tvector hull = { points[leftmost] };\n\n\t// Find the next point to add to the convex hull\n\twhile (true) {\n\t\t// Find the point that is farthest from the line connecting the last two points in the convex hull\n\t\tint farthest = -1;\n\t\tdouble maxDistance = 0;\n\t\tfor (int i = 0; i < numPoints; i++) {\n\t\t\t// Skip points that are already in the convex hull\n\t\t\tif (find(hull.begin(), hull.end(), points[i]) != hull.end()) {\n\t\t\t\tcontinue;\n\t\t\t}\n\n\t\t\t// Compute the distance from the point to the line connecting the last two points in the convex hull\n\t\t\tPoint p1 = hull[hull.size() - 2];\n\t\t\tPoint p2 = hull[hull.size() - 1];\n\t\t\tdouble distance = fabs((p2.y - p1.y) * points[i].x - (p2.x - p1.x) * points[i].y + p2.x * p1.y - p2.y * p1.x) / sqrt(pow(p2.y - p1.y, 2) + pow(p2.x - p1.x, 2));\n\n\t\t\t// If the distance is greater than the maximum distance so far, update the farthest point and the maximum distance\n\t\t\tif (distance > maxDistance) {\n\t\t\t\tfarthest = i;\n\t\t\t\tmaxDistance = distance;\n\t\t\t}\n\t\t}\n\n\t\t// If no point is farther from the line connecting the last two points in the convex hull, the convex hull is complete\n\t\tif (farthest == -1) {\n\t\t\tbreak;\n\t\t}\n\n\t\t// Add the farthest point to the convex hull\n\t\thull.push_back(points[farthest]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[i + 1]);\n\t}\n\n\t// Store the perimeter in the output variable\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points, points + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && (upperHull[upperHull.size() - 1].y - upperHull[upperHull.size() - 2].y) * (points[i].x - upperHull[upperHull.size() - 2].x) < (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) * (points[i].y - upperHull[upperHull.size() - 2].y)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && (lowerHull[lowerHull.size() - 1].y - lowerHull[lowerHull.size() - 2].y) * (points[i].x - lowerHull[lowerHull.size() - 2].x) > (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) * (points[i].y - lowerHull[lowerHull.size() - 2].y)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\t*perimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\t*perimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the 
points using the Graham scan algorithm.\n\tvector hull;\n\tfor (size_t i = 0; i < numPoints; ++i) {\n\t Point p = points[i];\n\t while (hull.size() >= 2 && crossProduct(hull[hull.size() - 2], hull.back(), p) <= 0) {\n\t hull.pop_back();\n\t }\n\t hull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinate\n\tstd::sort(points, points + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\t*perimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\t*perimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points\n\tPoint *hull = new Point[numPoints];\n\tint hullSize = convexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hullSize; i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hullSize]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 
2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[points.size() - 1]);\n\tfor (int i = 1; i < points.size() - 1; i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::vector hullIndices;\n\thullIndices.reserve(points.size());\n\tgrahamScan(points, convexHull, hullIndices);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < 
convexHull.size(); i++) {\n\t\tint nextIndex = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[nextIndex]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2) {\n\t\t\tPoint p1 = convexHull[convexHull.size() - 2];\n\t\t\tPoint p2 = convexHull[convexHull.size() - 1];\n\t\t\tPoint p3 = points[i];\n\t\t\tdouble cross = (p2.x - p1.x) * (p3.y - p2.y) - (p2.y - p1.y) * (p3.x - p2.x);\n\t\t\tif (cross > 0) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint left = 0, right = points.size() - 1;\n\twhile (left < right) {\n\t\tconvexHull.push_back(points[left++]);\n\t\twhile (left < right && (convexHull.back().y - points[left].y) * (convexHull.back().x - points[right].x) <= (convexHull.back().y - points[right].y) * (convexHull.back().x - points[left].x)) {\n\t\t\tright--;\n\t\t}\n\t\tconvexHull.push_back(points[right--]);\n\t\twhile (left < right && (convexHull.back().y - points[left].y) * (convexHull.back().x - points[right].x) <= (convexHull.back().y - points[right].y) * (convexHull.back().x - points[left].x)) {\n\t\t\tleft++;\n\t\t}\n\t}\n\tconvexHull.push_back(points[left]);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y 
!= convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Send the convex hull to rank 0\n\tstd::vector globalConvexHull;\n\tMPI_Gather(&convexHull[0], convexHull.size(), MPI_DOUBLE, &globalConvexHull[0], globalConvexHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Compute the perimeter of the convex hull on rank 0\n\tif (rank == 0) {\n\t\tdouble perimeter = 0;\n\t\tfor (size_t i = 0; i < globalConvexHull.size(); i++) {\n\t\t\tperimeter += distance(globalConvexHull[i], globalConvexHull[(i + 1) % globalConvexHull.size()]);\n\t\t}\n\t\treturn perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint const rank = MPI::COMM_WORLD.Get_rank();\n\tint const size = MPI::COMM_WORLD.Get_size();\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tconvexHull.push_back(points[0]);\n\tfor (size_t i = 1; i < points.size(); ++i) {\n\t\t// Find the first point in the convex hull that is to the left of the line through the last two points\n\t\tsize_t j = convexHull.size() - 1;\n\t\twhile (j > 0 && (convexHull[j].x - convexHull[j-1].x) * (points[i].y - convexHull[j].y) -\n\t\t\t(convexHull[j].y - convexHull[j-1].y) * (points[i].x - convexHull[j].x) < 0) {\n\t\t\t--j;\n\t\t}\n\t\t// Add the point to the convex hull\n\t\tconvexHull.insert(convexHull.begin() + j + 1, points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI::COMM_WORLD.Reduce(&perimeter, &globalPerimeter, 1, MPI::DOUBLE, MPI::SUM, 0);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeters from all ranks to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the global perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the total perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, 
size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (auto const& point : points) {\n\t\tif (convexHull.empty()) {\n\t\t\tconvexHull.push_back(point);\n\t\t} else {\n\t\t\twhile (convexHull.size() >= 2 &&\n\t\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t\t(point.y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t(point.x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\t\tconvexHull.pop_back();\n\t\t\t}\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2 && (points[next] - convexHull.back()).cross(points[i] - convexHull.back()) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || convexHull.back() != point) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t\twhile (convexHull.size() >= 3 &&\n\t\t\t (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 3].x) *\n\t\t\t\t\t (point.y - convexHull[convexHull.size() - 2].y) -\n\t\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 3].y) *\n\t\t\t\t\t (point.x - convexHull[convexHull.size() - 2].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) 
{\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Find the total perimeter of the convex hulls on all ranks\n\tdouble totalPerimeter = 0;\n\tfor (int i = 0; i < size; i++) {\n\t\tif (i == rank) {\n\t\t\tfor (int j = 0; j < convexHull.size(); j++) {\n\t\t\t\ttotalPerimeter += distance(convexHull[j], convexHull[(j + 1) % convexHull.size()]);\n\t\t\t}\n\t\t}\n\t\tMPI_Reduce(&totalPerimeter, &totalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\t}\n\n\t// Return the total perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn totalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (auto const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Reduce the convex hulls from all ranks to rank 0\n\tstd::vector globalConvexHull;\n\tMPI_Reduce(&convexHull[0], &globalConvexHull[0], convexHull.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Compute the perimeter of the global convex hull on rank 0\n\tif (rank == 0) {\n\t\tdouble perimeter = 0;\n\t\tfor (size_t i = 0; i < globalConvexHull.size(); i++) {\n\t\t\tperimeter += distance(globalConvexHull[i], globalConvexHull[(i + 1) % globalConvexHull.size()]);\n\t\t}\n\t\treturn perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (auto it = points.begin(); it != points.end(); ++it) {\n\t\twhile (convexHull.size() >= 2 && (it->y - convexHull[convexHull.size() - 2].y) * (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) < (it->x - convexHull[convexHull.size() - 2].x) * (convexHull[convexHull.size() - 1].y - convexHull[convexHull.size() - 2].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(*it);\n\t}\n\tconvexHull.pop_back();\n\tfor (auto it = points.rbegin(); it != points.rend(); ++it) {\n\t\twhile (convexHull.size() >= 2 && (it->y - convexHull[convexHull.size() - 2].y) * (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) < (it->x - convexHull[convexHull.size() - 2].x) * (convexHull[convexHull.size() - 1].y - convexHull[convexHull.size() - 2].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(*it);\n\t}\n\tconvexHull.pop_back();\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, 
&globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2 && (points[next] - convexHull.back()).cross(points[i] - convexHull.back()) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& p : points) {\n\t}\n\n\t// Compute the total perimeter\n\tdouble totalPerimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t}\n\n\t// Reduce the total perimeter to rank 0\n\tdouble finalPerimeter;\n\tMPI_Reduce(&totalPerimeter, &finalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the final perimeter on rank 0\n\tif (rank == 0) {\n\t}\n}\n", + "Hull: 1. 
Synag", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || convexHull.back() != point) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t\twhile (convexHull.size() >= 3 &&\n\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 3].x) *\n\t\t\t\t(point.y - convexHull[convexHull.size() - 2].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 3].y) *\n\t\t\t\t(point.x - convexHull[convexHull.size() - 2].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distance(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Store the result in perimeter.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\n__global__ void convexHullPerimeter(const Point *points, size_t numPoints, double *perimeter) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint numBlocks = gridDim.x;\n\tint numThreads = blockDim.x;\n\tint start = tid + bid * numThreads;\n\tdouble localPerimeter = 0;\n\tfor (int i = start; i < numPoints - 1; i++) {\n\t\tlocalPerimeter += distance(points[i], points[i + 1]);\n\t}\n\tlocalPerimeter += distance(points[numPoints - 1], points[0]);\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tdouble globalPerimeter = 0;\n\t\tfor (int i = 0; i < numBlocks; i++) {\n\t\t\tdouble blockPerimeter;\n\t\t\tcudaMemcpy(&blockPerimeter, &localPerimeter, sizeof(double), cudaMemcpyDeviceToHost);\n\t\t\tglobalPerimeter += blockPerimeter;\n\t\t}\n\t\t*perimeter = globalPerimeter;\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\tPoint *convexHull;\n\tsize_t convexHullSize;\n\tconvexHull2(points, numPoints, &convexHull, &convexHullSize);\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < convexHullSize; i++) {\n\t\t*perimeter += distance(convexHull[i], convexHull[(i + 1) % convexHullSize]);\n\t}\n\tfree(convexHull);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tPoint p = points[tid];\n\tdouble minDist = DBL_MAX;\n\tPoint closestPoint;\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tif (i == tid) {\n\t\t\tcontinue;\n\t\t}\n\t\tPoint q = points[i];\n\t\tdouble dist = distance(p, q);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tclosestPoint = q;\n\t\t}\n\t}\n\t*perimeter += minDist;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % 
hull.size()]);\n\t}\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tPoint *sortedPoints = new Point[numPoints];\n\tthrust::copy(points, points + numPoints, sortedPoints);\n\tthrust::sort(sortedPoints, sortedPoints + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\twhile (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], sortedPoints[i]) <= 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(sortedPoints[i]);\n\t\twhile (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], sortedPoints[i]) >= 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(sortedPoints[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= numPoints) return;\n\n\tPoint p = points[i];\n\tdouble minDist = DBL_MAX;\n\tPoint minPoint;\n\tfor (int j = 0; j < numPoints; j++) {\n\t\tif (i == j) continue;\n\t\tPoint q = points[j];\n\t\tdouble dist = distance(p, q);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tminPoint = q;\n\t\t}\n\t}\n\t*perimeter += minDist;\n}\n", + "Hull.2)", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tPoint *sortedPoints = new Point[numPoints];\n\tmemcpy(sortedPoints, points, numPoints * sizeof(Point));\n\tstd::sort(sortedPoints, sortedPoints + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], sortedPoints[i]) <= 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(sortedPoints[i]);\n\t\twhile (lowerHull.size() >= 2 && 
crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], sortedPoints[i]) >= 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(sortedPoints[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size() - 1; i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tperimeter += distance(upperHull[upperHull.size() - 1], lowerHull[lowerHull.size() - 1]);\n\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tPoint p = points[tid];\n\t\tdouble minDist = DBL_MAX;\n\t\tPoint closestPoint;\n\t\tfor (int i = 0; i < numPoints; i++) {\n\t\t\tif (i == tid) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tPoint q = points[i];\n\t\t\tdouble dist = distance(p, q);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t\tclosestPoint = q;\n\t\t\t}\n\t\t}\n\t\tatomicAdd(perimeter, minDist);\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. 
The final result is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (int i = 0; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\tint next = i + 1;\n\t\tif (next == points.size()) {\n\t\t\tnext = 0;\n\t\t}\n\t\tint prev = i - 1;\n\t\tif (prev == -1) {\n\t\t\tprev = points.size() - 1;\n\t\t}\n\t\tdouble area = (points[next].x - points[prev].x) * (points[i].y - points[prev].y) - (points[next].x - points[prev].x) * (points[i].y - points[next].y);\n\t\tif (area >= 0) {\n\t\t\tconvexHull.push_back(points[i]);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = i + 1;\n\t\tif (next == convexHull.size()) {\n\t\t\tnext = 0;\n\t\t}\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < 
p2.x; });\n\tfor (int i = 0; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint n = points.size();\n\tif (n <= 2) {\n\t\tconvexHull = points;\n\t} else {\n\t\tint m = 0;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\twhile (m >= 2 && (points[i].x - convexHull[m-1].x) * (convexHull[m-2].y - convexHull[m-1].y) - (points[i].y - convexHull[m-1].y) * (convexHull[m-2].x - convexHull[m-1].x) < 0) {\n\t\t\t\tm--;\n\t\t\t}\n\t\t\tconvexHull[m++] = points[i];\n\t\t}\n\t\tfor (int i = n-2, t = m+1; i >= 0; i--) {\n\t\t\twhile (m >= t && (points[i].x - convexHull[m-1].x) * (convexHull[m-2].y - convexHull[m-1].y) - (points[i].y - convexHull[m-1].y) * (convexHull[m-2].x - convexHull[m-1].x) < 0) {\n\t\t\t\tm--;\n\t\t\t}\n\t\t\tconvexHull[m++] = points[i];\n\t\t}\n\t\tconvexHull.resize(m);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham scan algorithm\n\tstd::vector convexHull = grahamScan(points);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else 
{\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t\t (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) *\n\t\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t(convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham scan\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y); });\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size()-2].y - hull[hull.size()-1].y) * (p.x - hull[hull.size()-1].x) < (hull[hull.size()-1].x - hull[hull.size()-2].x) * (p.y - hull[hull.size()-1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\tint n = hull.size();\n\tfor (int i = n-2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size()-2].y - hull[hull.size()-1].y) * (p.x - hull[hull.size()-1].x) < (hull[hull.size()-1].x - hull[hull.size()-2].x) * (p.y - hull[hull.size()-1].y)) 
{\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < n; i++) {\n\t\tperimeter += distance(hull[i], hull[(i+1)%n]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the hull in parallel\n\tdouble perimeter = 0.0;\n\t#pragma omp parallel for reduction(+:perimeter)\n\tfor (int i = 0; i < hull.size(); ++i) {\n\t\tint next = (i + 1) % hull.size();\n\t\tperimeter += distance(hull[i], hull[next]);\n\t}\n\n\t// Return the perimeter\n\treturn perimeter;\n}\n\n", + "\n\n\tint const rank = 0;\n\tint const size = MPI::COMM_WORLD.Get_size();\n\tint const points_per_rank = points.size() / size;\n\tint const remainder = points.size() % size;\n\n\tstd::vector local_points = points;\n\tlocal_points.resize(points_per_rank + (rank < remainder));\n\n\tstd::vector convex_hull;\n\tconvexHull(local_points, convex_hull);\n\n\tdouble local_perimeter = 0;\n\tfor (size_t i = 0; i < convex_hull.size(); ++i) {\n\t\tlocal_perimeter += distance(convex_hull[i], convex_hull[(i+1)%convex_hull.size()]);\n\t}\n\n\tdouble global_perimeter;\n\tMPI::COMM_WORLD.Reduce(&local_perimeter, &global_perimeter, 1, MPI_DOUBLE, MPI_SUM, rank);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint n = points.size();\n\tfor (int i = 0; i < n; i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <= (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = n - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <= (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of 
the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint left = 0, right = 0;\n\twhile (left < points.size()) {\n\t\twhile (right < points.size() && points[right].x == points[left].x) {\n\t\t\tright++;\n\t\t}\n\t\tint n = right - left;\n\t\tif (n > 1) {\n\t\t\tstd::sort(points.begin() + left, points.begin() + right, [](Point const& p1, Point const& p2) { return p1.y < p2.y; });\n\t\t\tfor (int i = 0; i < n; i++) {\n\t\t\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[left + i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) * (points[left + i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\t\t\tconvexHull.pop_back();\n\t\t\t\t}\n\t\t\t\tconvexHull.push_back(points[left + i]);\n\t\t\t}\n\t\t}\n\t\tleft = right;\n\t}\n\tconvexHull.push_back(convexHull[0]);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size() - 1; i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[i + 1]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = points.size();\n\tstd::vector hull;\n\n\t// Find the convex hull of the points using Graham's scan.\n\tfor (int i = 0; i < n; i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points[i].x - hull[hull.size() - 1].x) <\n\t\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tint m = hull.size();\n\tfor (int i = n - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= m &&\n\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points[i].x - hull[hull.size() - 1].x) <\n\t\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\thull.pop_back();\n\n\t// Compute the perimeter of the convex hull in parallel.\n\tdouble perimeter = 0;\n\t#pragma omp parallel for reduction(+ : perimeter)\n\tfor (int i = 0; i < m; i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % m]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble global_perimeter;\n\tMPI_Reduce(&perimeter, &global_perimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 
-1;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (Point const& p : points) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size()-2].y - convexHull[convexHull.size()-1].y) * (p.x - convexHull[convexHull.size()-1].x) < (convexHull[convexHull.size()-1].x - convexHull[convexHull.size()-2].x) * (p.y - convexHull[convexHull.size()-1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(p);\n\t}\n\tconvexHull.pop_back();\n\tfor (int i = points.size()-2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size()-2].y - convexHull[convexHull.size()-1].y) * (points[i].x - convexHull[convexHull.size()-1].x) < (convexHull[convexHull.size()-1].x - convexHull[convexHull.size()-2].x) * (points[i].y - convexHull[convexHull.size()-1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tconvexHull.pop_back();\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t\t (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) *\n\t\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 
0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint const rank = MPI::COMM_WORLD.Get_rank();\n\tint const size = MPI::COMM_WORLD.Get_size();\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convex_hull;\n\tconvex_hull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tconvex_hull.push_back(points[0]);\n\tconvex_hull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (convex_hull.size() >= 2 &&\n\t\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t\t (convex_hull[convex_hull.size() - 1].y - points[i].y) -\n\t\t\t\t\t (convex_hull[convex_hull.size() - 1].x - points[i].x) *\n\t\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) <\n\t\t\t\t 0) {\n\t\t\tconvex_hull.pop_back();\n\t\t}\n\t\tconvex_hull.push_back(points[i]);\n\t}\n\tint const convex_hull_size = convex_hull.size();\n\twhile (convex_hull.size() >= 2 &&\n\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t (convex_hull[convex_hull.size() - 1].y - convex_hull[0].y) -\n\t\t\t\t (convex_hull[convex_hull.size() - 1].x - convex_hull[0].x) *\n\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) <\n\t\t\t 0) {\n\t\tconvex_hull.pop_back();\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convex_hull_size; ++i) {\n\t\tperimeter += distance(convex_hull[i], convex_hull[(i + 1) % convex_hull_size]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble global_perimeter;\n\tMPI::COMM_WORLD.Allreduce(&perimeter, &global_perimeter, 1, MPI::DOUBLE, MPI::SUM);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. 
A is an undirected graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &size) {\n if (visited(i) == 0) {\n size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n });\n }\n }, size);\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize the visited array to false for all vertices.\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices.\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0.\n int largestComponentSize = 0;\n\n // Iterate over all vertices.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a depth-first search to find the size of its component.\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n\n // Update the largest component size if necessary.\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n // Return the largest component size.\n return largestComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int componentSize = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &componentSize) {\n if (visited[i] == 0) {\n componentSize++;\n visited[i] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[i][j] == 1 && visited[j] == 0) {\n visited[j] = 1;\n }\n }\n }\n });\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n return maxComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize the visited array to false for all vertices\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a 
depth-first search to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n depthFirstSearch(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n return largestComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize the visited array to false for all vertices\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n\n // Update the largest component size if necessary\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited(i) == 0) {\n component_size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n });\n }\n }, component_size);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, false);\n int largest = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int size = 0;\n std::stack s;\n s.push(i);\n while (!s.empty()) {\n size_t v = s.top();\n s.pop();\n if (!visited[v]) {\n visited[v] = true;\n size++;\n for (size_t j = 0; j < N; j++) {\n if (A(i, j) && !visited[j]) {\n s.push(j);\n }\n }\n }\n }\n largest = std::max(largest, size);\n }\n }\n return largest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, 
KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited[i] == 0) {\n component_size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n visited[j] = 1;\n }\n });\n }\n }, component_size);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n // Initialize the visited array to false for all vertices\n View visited(\"visited\", N);\n visited() = false;\n\n // Initialize the component size array to 0 for all vertices\n View componentSize(\"componentSize\", N);\n componentSize() = 0;\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component containing it\n if (!visited(i)) {\n int componentSize = 0;\n DFS(A, i, visited, componentSize);\n\n // Update the largest component size if necessary\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n return largestComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n int max_component = 0;\n Kokkos::parallel_reduce(\n \"largestComponent\",\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_component) {\n // Perform a depth-first search to count the number of vertices in the\n // connected component containing vertex i.\n std::vector visited(N, false);\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n int v = stack.top();\n stack.pop();\n if (!visited[v]) {\n visited[v] = true;\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A(i, j) && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n },\n max_component);\n return max_component;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize the visited array to false\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n\n // Initialize the component size array to 0\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize() = 0;\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited(i)) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n\n // Update the largest component size\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n return largestComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + 
"did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "Kokok", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a vector to store the size of each component.\n Kokkos::View component_sizes(\"component_sizes\", N);\n\n // Initialize the component sizes to 1.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n component_sizes[i] = 1;\n });\n\n // Iterate over the adjacency matrix.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) {\n // If there is an edge between i and j, then they are in the same component.\n if (A(i, j) == 1) {\n // Find the smaller component size.\n int smaller_component_size = std::min(component_sizes[i], component_sizes[j]);\n\n // Update the component sizes.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int k) {\n if (component_sizes[k] == smaller_component_size) {\n component_sizes[k] = std::max(component_sizes[i], component_sizes[j]);\n }\n });\n }\n }\n });\n\n // Find the largest component size.\n int largest_component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &largest_component_size) {\n largest_component_size = std::max(largest_component_size, component_sizes[i]);\n });\n\n return largest_component_size;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component_size = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int &j, int &local_component_size) {\n if (A(i, j) == 1) {\n local_component_size += 1;\n }\n }, component_size);\n max_component_size = std::max(max_component_size, component_size);\n visited(i) = 1;\n }\n });\n return max_component_size;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t j, int &component_size) {\n if (visited(j) == 0 && A(i, j) == 1) {\n component_size++;\n visited(j) = 1;\n }\n });\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n int max_component = 0;\n Kokkos::parallel_reduce(\n \"Largest Component\",\n RangePolicy<>(0, N),\n 
KOKKOS_LAMBDA(const int i, int& max_component) {\n int component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n int current = q.front();\n q.pop();\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A(current, j) && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n },\n max_component);\n\n return max_component;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited(i) == 0) {\n component_size += 1;\n visited(i) = 1;\n for (size_t j = 0; j < N; ++j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n }\n }\n });\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using Kokkos::parallel_for;\n using Kokkos::TeamPolicy;\n using Kokkos::Work;\n\n // Initialize the visited array\n Kokkos::View visited(\"visited\", N, N);\n parallel_for(TeamPolicy(Kokkos::DefaultExecutionSpace(), N), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) { visited(i, j) = false; }\n });\n\n // Perform DFS on each unvisited node\n int max_component = 0;\n parallel_for(TeamPolicy(Kokkos::DefaultExecutionSpace(), N), KOKKOS_LAMBDA(const int i) {\n if (!visited(i, i)) {\n int component_size = 0;\n dfs(i, i, visited, A, component_size);\n max_component = std::max(max_component, component_size);\n }\n });\n\n return max_component;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::View componentSize(\"componentSize\", N);\n Kokkos::View componentId(\"componentId\", N);\n Kokkos::View queue(\"queue\", N);\n\n int maxComponentSize = 0;\n int componentCount = 0;\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (visited(i) == 0) {\n int start = 0;\n int end = 0;\n queue(end++) = i;\n visited(i) = 1;\n componentId(i) = componentCount;\n while (start != end) {\n int current = queue(start++);\n componentSize(componentId(current))++;\n for (int j = 0; j < N; j++) {\n if (A(current, j) == 1 && visited(j) == 0) {\n queue(end++) = j;\n visited(j) = 1;\n componentId(j) = componentCount;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize(componentCount));\n componentCount++;\n }\n });\n\n return maxComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n 
Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n int max_component_size = 0;\n\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(Kokkos::View &A, size_t i, Kokkos::View &visited, int &component_size) {\n visited[i] = true;\n component_size++;\n\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(i, j) && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n\n // If the vertex has not been visited, perform a depth-first search to find the size of its component\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit in the stack\n while (!stack.empty()) {\n // Pop the next vertex from the stack\n size_t vertex = stack.top();\n stack.pop();\n\n // If the vertex has not been visited, mark it as visited and add it to the component\n if (!visited[vertex]) {\n visited[vertex] = true;\n componentSize++;\n\n // Push all of the vertex's neighbors onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the largest component size if the current component is larger\n if (componentSize > largestComponentSize) {\n largestComponentSize = componentSize;\n }\n\n // Update the component size array with the size of the current component\n componentSize[i] = componentSize;\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n int max_component_size = 0;\n int component_size = 0;\n 
int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n #pragma omp parallel for reduction(max:max_component_size)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n delete[] visited;\n return max_component_size;\n}\n", + "\n\n // Create a vector to store the visited nodes.\n std::vector visited(N, false);\n\n // Create a vector to store the component sizes.\n std::vector componentSizes(N, 0);\n\n // Create a parallel region.\n #pragma omp parallel\n {\n // Loop over the nodes.\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, perform a depth-first search to find all the nodes in its component.\n if (! visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n while (! stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (! visited[node]) {\n componentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[node * N + j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n componentSizes[i] = componentSize;\n }\n }\n }\n\n // Find the largest component size.\n int largestComponentSize = *std::max_element(componentSizes.begin(), componentSizes.end());\n\n // Return the largest component size.\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, 0);\n int largest_component_size = 0;\n #pragma omp parallel\n {\n int local_largest_component_size = 0;\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n ++component_size;\n visited[vertex] = 1;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n local_largest_component_size = std::max(local_largest_component_size, component_size);\n }\n }\n #pragma omp critical\n largest_component_size = std::max(largest_component_size, local_largest_component_size);\n }\n return largest_component_size;\n}\n", + "\n\n int* visited = new int[N];\n int* componentSize = new int[N];\n int maxComponentSize = 0;\n int componentCount = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = 0;\n dfs(A, i, visited, componentSize, size);\n componentCount++;\n maxComponentSize = std::max(maxComponentSize, size);\n }\n }\n delete[] visited;\n delete[] componentSize;\n return maxComponentSize;\n}\n", + "\n\n int max_size = 0;\n int max_component = -1;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = dfs(A, i, visited);\n if (size > max_size) {\n max_size = size;\n max_component = 
i;\n }\n }\n }\n delete[] visited;\n return max_size;\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n", + "\n\n int max_component = 0;\n int visited[N];\n memset(visited, 0, sizeof(visited));\n\n #pragma omp parallel for reduction(max:max_component)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n\n return max_component;\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n", + "\n\n int N = A.size();\n std::vector component(N, -1);\n int maxComponent = 0;\n int currentComponent = 0;\n #pragma omp parallel for reduction(max:maxComponent) reduction(+:currentComponent)\n for (int i = 0; i < N; i++) {\n if (component[i] == -1) {\n currentComponent = 0;\n dfs(A, i, component, currentComponent);\n maxComponent = std::max(maxComponent, currentComponent);\n }\n }\n return maxComponent;\n}\n\n", + "\n\n int max_component_size = 0;\n int component_size = 0;\n int visited[N];\n memset(visited, 0, sizeof(visited));\n\n #pragma omp parallel for reduction(max: max_component_size)\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0) {\n component_size = 0;\n dfs(i, A, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(int i, std::vector const& A, int visited[], int& component_size) {\n visited[i] = 1;\n component_size++;\n for (int j = 0; j < A.size(); j++) {\n if (A[i][j] == 1 && visited[j] == 0) {\n dfs(j, A, visited, component_size);\n }\n }\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n int max_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n visited[i] = true;\n component_size++;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n // Initialize the visited array to false\n 
std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component containing the vertex\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // While the stack is not empty, pop a vertex from the stack and visit all its unvisited neighbors\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n componentSize++;\n\n // Iterate over all neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is unvisited and there is an edge between the vertex and the neighbor, push the neighbor onto the stack and mark it as visited\n if (!visited[j] && A[vertex][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Update the largest component size if the current component size is larger\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Create a vector to store the size of each component.\n std::vector componentSize(N, 0);\n\n // Create a vector to store the component ID for each vertex.\n std::vector componentID(N, -1);\n\n // Initialize the component ID for each vertex.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n componentID[i] = i;\n }\n\n // Iterate over the adjacency matrix.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n // If there is an edge between vertex i and vertex j, then they are in the same component.\n if (A[i][j] == 1) {\n // Find the component ID for vertex i.\n int iComponentID = componentID[i];\n\n // Find the component ID for vertex j.\n int jComponentID = componentID[j];\n\n // If the component IDs are different, then merge the two components.\n if (iComponentID != jComponentID) {\n // Find the smaller component ID.\n int smallerComponentID = std::min(iComponentID, jComponentID);\n\n // Find the larger component ID.\n int largerComponentID = std::max(iComponentID, jComponentID);\n\n // Update the component ID for each vertex in the smaller component.\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n if (componentID[k] == smallerComponentID) {\n componentID[k] = largerComponentID;\n }\n }\n\n // Update the size of the larger component.\n componentSize[largerComponentID] += componentSize[smallerComponentID];\n }\n }\n }\n }\n\n // Find the largest component.\n int largestComponentSize = 0;\n for (size_t i = 0; i < N; i++) {\n largestComponentSize = std::max(largestComponentSize, componentSize[i]);\n }\n\n // Return the size of the largest 
component.\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component containing the vertex\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // While there are vertices in the stack, pop a vertex and visit all its unvisited neighbors\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n componentSize++;\n\n // Iterate over all neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is unvisited and there is an edge between the vertex and the neighbor, push the neighbor onto the stack and mark it as visited\n if (!visited[j] && A[vertex][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Update the largest component size if the current component size is larger\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n int nComponents = 0;\n std::vector visited(N, 0);\n std::vector componentSize(N, 0);\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int size = dfs(A, i, visited, componentSize);\n nComponents++;\n if (size > componentSize[0]) {\n componentSize[0] = size;\n }\n }\n }\n return componentSize[0];\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "serial", + "prompt": "/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector parent(N);\n std::vector size(N);\n for (size_t i = 0; i < N; ++i) {\n parent[i] = i;\n size[i] = 1;\n }\n\n auto find = [&](int x) {\n if (parent[x] != x) {\n parent[x] = find(parent[x]);\n }\n return parent[x];\n };\n\n auto union_ = [&](int x, int y) {\n int rootX = find(x);\n int rootY = find(y);\n if (rootX != rootY) {\n if (size[rootX] < size[rootY]) {\n parent[rootX] = rootY;\n size[rootY] += size[rootX];\n } else {\n parent[rootY] = rootX;\n size[rootX] += size[rootY];\n }\n }\n };\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n if (A[i][j] == 1) {\n union_(i, j);\n }\n }\n }\n\n int maxSize = 0;\n for (size_t i = 0; i < N; ++i) {\n maxSize = std::max(maxSize, size[find(i)]);\n }\n\n return maxSize;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n std::vector parent(N);\n std::vector size(N);\n for (size_t i = 0; i < N; i++) {\n parent[i] = i;\n size[i] = 1;\n }\n\n auto find = [&](int x) {\n if (parent[x] != x) {\n parent[x] = find(parent[x]);\n }\n return parent[x];\n };\n\n auto union_ = [&](int a, int b) {\n int rootA = find(a);\n int rootB = find(b);\n if (rootA != rootB) {\n if (size[rootA] > size[rootB]) {\n parent[rootB] = rootA;\n size[rootA] += size[rootB];\n } else {\n parent[rootA] = rootB;\n size[rootB] += size[rootA];\n }\n }\n };\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i][j] == 1) {\n union_(i, j);\n }\n }\n }\n\n int maxSize = 0;\n for (size_t i = 0; i < N; i++) {\n maxSize = std::max(maxSize, size[find(i)]);\n }\n\n return maxSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int max_component_size = 0;\n\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n\n visited[i] = true;\n component_size++;\n\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; i++) {\n int size = 0;\n std::queue q;\n if (visited[i] == 0) {\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 0) {\n visited[v] = 1;\n size++;\n for (size_t j = 0; j < N; j++) {\n if (A[v][j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int 
component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int max_component_size = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component containing it\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\n// Perform a DFS to find the size of the component containing vertex i\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n // Mark the vertex as visited\n visited[i] = true;\n component_size++;\n\n // Iterate over all adjacent vertices\n for (size_t j = 0; j < A.size(); j++) {\n // If there is an edge between vertex i and vertex j and vertex j has not been visited, perform a DFS on vertex j\n if (A[i][j] == 1 && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = dfs(i, A, visited);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n\nint dfs(size_t i, std::vector const& A, std::vector& visited) {\n if (visited[i] != 0) {\n return 0;\n }\n visited[i] = 1;\n int component_size = 1;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n component_size += dfs(j, A, visited);\n }\n }\n return component_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int largest = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int size = dfs(A, i, visited);\n largest = std::max(largest, size);\n }\n }\n\n return largest;\n}\n", + "\n\n // Create a vector to store the size of each component.\n std::vector componentSize(N, 0);\n\n // Create a vector to store the component ID for each vertex.\n std::vector componentID(N, -1);\n\n // Initialize the component ID for each vertex.\n for (size_t i = 0; i < N; ++i) {\n componentID[i] = i;\n }\n\n // Iterate over each vertex.\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, perform a DFS to find all vertices in the same component.\n if (componentID[i] == i) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit in the stack, perform a DFS.\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n\n // If the vertex has not been visited, mark it as visited and add it to the stack.\n if (componentID[vertex] == i) {\n componentSize++;\n componentID[vertex] = i;\n\n // Iterate over the neighbors of the vertex.\n for (size_t j = 0; j < N; ++j) {\n // If the neighbor is connected to the vertex and has not been visited, add it to the stack.\n if (A[i][j] == 1 && componentID[j] == i) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the size of the component.\n componentSize[i] 
= componentSize;\n }\n }\n\n // Return the size of the largest component.\n return *std::max_element(componentSize.begin(), componentSize.end());\n}\n", + "\n\n std::vector visited(N, 0);\n int largest = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = 0;\n dfs(A, visited, i, size);\n largest = std::max(largest, size);\n }\n }\n return largest;\n}\n\nvoid dfs(std::vector const& A, std::vector& visited, size_t i, int& size) {\n if (visited[i] == 1) {\n return;\n }\n visited[i] = 1;\n size++;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] == 1) {\n dfs(A, visited, j, size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n component_size++;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n component_size++;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n dfs(A, i, visited, size);\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited,\n int& size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i * A.size() + j] == 1) {\n dfs(A, j, visited, size);\n }\n }\n}\n", + "\n\n // Perform DFS on each vertex to find all connected vertices\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++component_size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) {\n continue;\n }\n visited[u] = 1;\n size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (int i = 0; i < N; i++) {\n int size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 1) continue;\n visited[v] = 1;\n size++;\n for (int j = 0; j < N; j++) {\n if (A[v][j] == 1 && visited[j] == 0) q.push(j);\n }\n }\n max_size = std::max(max_size, size);\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n if (visited[current] == 0) {\n visited[current] = 1;\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1) {\n q.push(j);\n }\n }\n }\n 
}\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n ++size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++component_size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "hip", + "prompt": "/* Compute the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n Store the result in largestComponentSize.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void largestComponent(const int *A, size_t N, int *largestComponentSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n int count = 0;\n if (tid == 0) {\n count = 1;\n for (int i = 1; i < N; i++) {\n if (A[bid * N + i] == 1) {\n count++;\n }\n }\n }\n __syncthreads();\n if (tid == 0) {\n *largestComponentSize = count;\n }\n}\n", + "\n\n\tint tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint local_max = 0;\n\tint global_max = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[tid_y] == 0) {\n\t\tint count = 0;\n\t\tvisited[tid_y] = 1;\n\t\tcount++;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] != 0 && visited[i] == 0) {\n\t\t\t\tcount += dfs(A, N, i, visited);\n\t\t\t}\n\t\t}\n\t\tlocal_max = count;\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (local_max > global_max) {\n\t\t\t\tglobal_max = local_max;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = global_max;\n\t}\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tidy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tif (tidx >= N || tidy >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[tidx] == 0) {\n\t\tcount++;\n\t\tvisited[tidx] = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tidx * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcount += bfs(A, N, i, visited);\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tidx == 0 && tidy == 0) {\n\t\t*largestComponentSize = count;\n\t}\n}\n", + "\n\n\tint tidx = blockIdx.x + blockIdx.y * blockDim.x;\n\tint tidx_x = tidx % N;\n\tint tidx_y = tidx / N;\n\tint componentSize = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tidx * N + i] == 1) {\n\t\t\tcomponentSize++;\n\t\t}\n\t}\n\tatomicAdd(largestComponentSize, componentSize);\n}\n", + "\n\n int localComponentSize = 0;\n int globalComponentSize = 0;\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int start = tid;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (int i = start; i < N; i += stride) {\n if (A[i * N + i] == 0) {\n continue;\n }\n localComponentSize++;\n for (int j = i + 1; j < N; j++) {\n if (A[i * N + j] == 1) {\n localComponentSize++;\n }\n }\n }\n hipAtomicMax(&globalComponentSize, &localComponentSize);\n *largestComponentSize = globalComponentSize;\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tidy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tint size = 0;\n\tif (tidx < N && tidy < N) {\n\t\tif (A[tidx * N + tidy] == 1) {\n\t\t\tsize = 1;\n\t\t\tint stack[N];\n\t\t\tint top = 0;\n\t\t\tstack[top++] = tidx;\n\t\t\twhile (top > 0) {\n\t\t\t\tint v = stack[--top];\n\t\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\t\tif (A[v * N + i] == 1 && i != v) {\n\t\t\t\t\t\tif (i == tidy) {\n\t\t\t\t\t\t\tsize++;\n\t\t\t\t\t\t} else {\n\t\t\t\t\t\t\tstack[top++] = i;\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tint max = 0;\n\thipReduceBlockMax(&max, &size, 1);\n\tif (tidx == 0 && tidy == 0) 
{\n\t\t*largestComponentSize = max;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint blockId = hipBlockIdx_x;\n\tint numBlocks = hipGridDim_x;\n\tint numThreads = hipBlockDim_x * hipGridDim_x;\n\tint visited[N];\n\tint componentSize[N];\n\tint maxComponentSize = 0;\n\tint maxComponentId = -1;\n\tint componentId = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t\tcomponentSize[i] = 0;\n\t}\n\twhile (componentId < N) {\n\t\tint startVertex = -1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (!visited[i]) {\n\t\t\t\tstartVertex = i;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tif (startVertex == -1) {\n\t\t\tbreak;\n\t\t}\n\t\tint queue[N];\n\t\tint head = 0;\n\t\tint tail = 0;\n\t\tqueue[tail++] = startVertex;\n\t\tvisited[startVertex] = 1;\n\t\tcomponentSize[componentId] = 1;\n\t\twhile (head != tail) {\n\t\t\tint vertex = queue[head++];\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tif (A[vertex * N + i] && !visited[i]) {\n\t\t\t\t\tqueue[tail++] = i;\n\t\t\t\t\tvisited[i] = 1;\n\t\t\t\t\tcomponentSize[componentId]++;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tif (componentSize[componentId] > maxComponentSize) {\n\t\t\tmaxComponentSize = componentSize[componentId];\n\t\t\tmaxComponentId = componentId;\n\t\t}\n\t\tcomponentId++;\n\t}\n\t*largestComponentSize = maxComponentSize;\n}\n", + "\n\n int local_max = 0;\n int global_max = 0;\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (A[i * N + j] == 1) {\n local_max = 1;\n for (int k = 0; k < N; k++) {\n if (A[i * N + k] == 1) {\n local_max++;\n }\n if (A[k * N + j] == 1) {\n local_max++;\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx.x == 0 && threadIdx.y == 0) {\n for (int k = 0; k < N * N; k++) {\n if (local_max > global_max) {\n global_max = local_max;\n }\n local_max = 0;\n }\n *largestComponentSize = global_max;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint numThreads = hipGridDim_x * hipBlockDim_x;\n\tint visited[N];\n\tint componentSize = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[tid] == 0) {\n\t\tvisited[tid] = 1;\n\t\tcomponentSize = 1;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tif (A[tid * N + j] == 1 && visited[j] == 0) {\n\t\t\t\tcomponentSize += largestComponent(A, N, &visited[j]);\n\t\t\t}\n\t\t}\n\t}\n\tint maxComponentSize = 0;\n\thipMemcpy(&maxComponentSize, largestComponentSize, sizeof(int), hipMemcpyDeviceToHost);\n\tif (componentSize > maxComponentSize) {\n\t\t*largestComponentSize = componentSize;\n\t}\n\treturn;\n}\n", + "\n\n int *visited = (int *)malloc(N * N * sizeof(int));\n int *componentSize = (int *)malloc(N * N * sizeof(int));\n int maxComponentSize = 0;\n int componentId = 0;\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n visited[i * N + j] = 0;\n componentSize[i * N + j] = 0;\n }\n }\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (visited[i * N + j] == 0 && A[i * N + j] == 1) {\n componentId++;\n int currentComponentSize = 0;\n __syncthreads();\n dfs(A, i, j, N, visited, componentSize, ¤tComponentSize, componentId);\n __syncthreads();\n maxComponentSize = max(maxComponentSize, currentComponentSize);\n }\n }\n }\n\n *largestComponentSize = maxComponentSize;\n free(visited);\n free(componentSize);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) 
{\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = max(*largestComponentSize, count);\n\t}\n}\n", + "\n\n int myRow = blockIdx.x * blockDim.x + threadIdx.x;\n int myCol = blockIdx.y * blockDim.y + threadIdx.y;\n int myComponentSize = 0;\n if (myRow < N && myCol < N) {\n if (A[myRow * N + myCol] == 1) {\n myComponentSize = 1;\n for (int i = 0; i < N; i++) {\n if (A[myRow * N + i] == 1) {\n myComponentSize++;\n }\n if (A[i * N + myCol] == 1) {\n myComponentSize++;\n }\n }\n }\n }\n atomicMax(largestComponentSize, myComponentSize);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == 0) {\n *largestComponentSize = 0;\n }\n __shared__ int shared_component_size;\n shared_component_size = 0;\n bool visited = false;\n int component_size = 0;\n while (!visited) {\n visited = true;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (A[bid * N + i] == 1) {\n visited = false;\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n A[i * N + j] = 0;\n A[j * N + i] = 0;\n }\n }\n }\n }\n __syncthreads();\n }\n atomicAdd(largestComponentSize, component_size);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i, j, k, count = 0;\n\tif (tid < N) {\n\t\tbool visited[N] = {false};\n\t\tvisited[tid] = true;\n\t\tfor (i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1 && !visited[i]) {\n\t\t\t\tcount++;\n\t\t\t\tfor (j = 0; j < N; j++) {\n\t\t\t\t\tif (A[i * N + j] == 1) {\n\t\t\t\t\t\tfor (k = 0; k < N; k++) {\n\t\t\t\t\t\t\tif (A[j * N + k] == 1 && !visited[k]) {\n\t\t\t\t\t\t\t\tcount++;\n\t\t\t\t\t\t\t\tvisited[k] = true;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == 0) {\n *largestComponentSize = 0;\n }\n __syncthreads();\n int localComponentSize = 0;\n for (int i = 0; i < N; i++) {\n if (A[tid + i * N] == 1) {\n localComponentSize++;\n }\n }\n __syncthreads();\n if (tid == 0) {\n if (localComponentSize > *largestComponentSize) {\n *largestComponentSize = localComponentSize;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int size = 0;\n if (tid < N && tid_y < N) {\n if (A[tid * N + tid_y] == 1) {\n size = 1;\n int *visited = new int[N];\n for (int i = 0; i < N; i++) {\n visited[i] = 0;\n }\n visited[tid] = 1;\n int *stack = new int[N];\n stack[0] = tid;\n while (stack[0] != -1) {\n int v = stack[0];\n stack[0] = -1;\n for (int i = 0; i < N; i++) {\n if (A[v * N + i] == 1 && visited[i] == 0) {\n visited[i] = 1;\n size++;\n stack[0] = i;\n }\n }\n }\n delete[] visited;\n delete[] stack;\n }\n }\n atomicAdd(largestComponentSize, size);\n}\n", + "\n\n\tint myComponentSize = 0;\n\tint myId = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (A[N * myId + myId] == 1) {\n\t\tmyComponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) 
{\n\t\t\tif (A[N * myId + i] == 1 && i != myId) {\n\t\t\t\tmyComponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(&*largestComponentSize, myComponentSize);\n}\n", + "(", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_x;\n\tint componentSize = 0;\n\tif (tid == 0) {\n\t\tcomponentSize = 1;\n\t\tlargestComponentSize[bid] = 0;\n\t}\n\t__syncthreads();\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tif (A[bid * N + i] == 1) {\n\t\t\tatomicAdd(&componentSize, 1);\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tif (componentSize > largestComponentSize[bid]) {\n\t\t\tlargestComponentSize[bid] = componentSize;\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Perform a breadth-first search (BFS) from each vertex to find the connected components.\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the max_component_size across all ranks to find the global maximum.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "_t.", + "\n\n // Perform a breadth-first search from each vertex to find the size of each component.\n std::vector visited(N, false);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++componentSize;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Allreduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n std::queue q;\n int componentSize = 0;\n for (size_t i = 0; i < N; i++) {\n 
if (visited[i] == 0) {\n q.push(i);\n visited[i] = 1;\n componentSize = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize++;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest one\n int globalComponentSize;\n MPI_Reduce(&componentSize, &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(my_rank);\n visited[my_rank] = true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n local_component_size++;\n for (size_t v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return global_component_size;\n }\n return -1;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector component_size(num_ranks, 0);\n\n // Perform a DFS from each rank\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int local_component_size = 0;\n std::stack s;\n s.push(i);\n visited[i] = true;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n local_component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n visited[j] = true;\n }\n }\n }\n component_size[my_rank] = std::max(component_size[my_rank], local_component_size);\n }\n }\n\n // Reduce the component size array to find the largest component\n int global_component_size = 0;\n MPI_Reduce(&component_size[0], &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_max = 0;\n int global_max = 0;\n\n // Perform a DFS on the graph to find the 
size of the largest component\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n local_max = std::max(local_max, component_size);\n }\n }\n\n // Reduce the local maximum values to find the global maximum\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = true;\n my_component_size = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n ++my_component_size;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&my_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int max_component = 0;\n int my_component = 0;\n int* visited = new int[N];\n for (size_t i = 0; i < N; i++) {\n visited[i] = 0;\n }\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n my_component = 0;\n dfs(i, A, N, visited, my_component);\n max_component = std::max(max_component, my_component);\n }\n }\n delete[] visited;\n int global_max;\n MPI_Allreduce(&max_component, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n // Perform a breadth-first search from each vertex to find the connected components.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 0) {\n visited[v] = 1;\n componentSize++;\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Allreduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(rank);\n visited[rank] = true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n local_component_size++;\n for (int v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 
MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of vertices in its component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n local_component_size = std::max(local_component_size, dfs(A, i, visited));\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n\n// Depth-first search to compute the number of vertices in a component\nint dfs(std::vector const& A, size_t i, std::vector& visited) {\n if (visited[i]) {\n return 0;\n }\n\n visited[i] = true;\n int component_size = 1;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n component_size += dfs(A, j, visited);\n }\n }\n\n return component_size;\n}\n", + "_t.", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n int local_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_component_size = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n ++local_component_size;\n }\n }\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n int local_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_component_size = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n local_component_size++;\n }\n }\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its component\n std::vector visited(N, 0);\n std::queue q;\n int componentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n q.push(i);\n visited[i] = 1;\n componentSize = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize++;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest one\n int globalComponentSize;\n MPI_Reduce(&componentSize, 
&globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, false);\n int largest_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t v = stack.top();\n stack.pop();\n if (!visited[v]) {\n ++component_size;\n visited[v] = true;\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n largest_component_size = std::max(largest_component_size, component_size);\n }\n }\n\n // Reduce the largest component size across all ranks.\n int global_largest_component_size;\n MPI_Allreduce(&largest_component_size, &global_largest_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the largest component size\n int largestComponentSize = 0;\n\n // Loop over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "cuda", + "prompt": "/* Compute the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n Store the result in largestComponentSize.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void largestComponent(const int *A, size_t N, int *largestComponentSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myComponentSize = 0;\n int myId = blockIdx.x * blockDim.x + threadIdx.x;\n if (A[myId * N + myId] == 0) {\n *largestComponentSize = 0;\n return;\n }\n int *visited = new int[N];\n for (int i = 0; i < N; i++) { visited[i] = 0; }\n visited[myId] = 1;\n myComponentSize++;\n for (int i = 0; i < N; i++) {\n if (A[myId * N + i] == 1 && visited[i] == 0) {\n myComponentSize += dfs(A, N, visited, i);\n }\n }\n *largestComponentSize = max(*largestComponentSize, myComponentSize);\n}\n", + "\n\n int localComponentSize = 0;\n int globalComponentSize = 0;\n int threadIdx = threadIdx.x + blockIdx.x * blockDim.x;\n int threadIdy = threadIdx.y + blockIdx.y * blockDim.y;\n if (threadIdx < N && threadIdy < N) {\n if (A[threadIdx * N + threadIdy] == 1) {\n localComponentSize++;\n for (int i = 0; i < N; i++) {\n if (A[threadIdx * N + i] == 1) {\n localComponentSize++;\n }\n if (A[i * N + threadIdy] == 1) {\n localComponentSize++;\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx == 0 && threadIdy == 0) {\n globalComponentSize = localComponentSize;\n }\n for (int i = 1; i < N; i++) {\n if (localComponentSize > globalComponentSize) {\n globalComponentSize = localComponentSize;\n }\n localComponentSize = 0;\n if (threadIdx < N && threadIdy + i < N) {\n if (A[threadIdx * N + threadIdy + i] == 1) {\n localComponentSize++;\n for (int j = 0; j < N; j++) {\n if (A[threadIdx * N + j] == 1) {\n localComponentSize++;\n }\n if (A[(threadIdy + i) * N + j] == 1) {\n localComponentSize++;\n }\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx == 0 && threadIdy == 0) {\n if (localComponentSize > globalComponentSize) {\n globalComponentSize = localComponentSize;\n }\n *largestComponentSize = globalComponentSize;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int tid_row = tid / N;\n int tid_col = tid % N;\n int local_max = 0;\n int global_max = 0;\n\n // Count the number of adjacent vertices\n for (int i = 0; i < N; i++) {\n if (A[tid_row * N + i] == 1) {\n local_max++;\n }\n }\n\n // Find the maximum number of adjacent vertices across all threads\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (local_max > global_max) {\n global_max = local_max;\n }\n }\n\n *largestComponentSize = global_max;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tint count = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n // Each thread computes the size of its component\n int myComponentSize = 0;\n for (int i = 0; i < N; i++) {\n if (A[i * N + blockIdx.x] == 1) {\n myComponentSize++;\n }\n }\n\n // Reduce the component sizes across all threads\n __shared__ int sharedComponentSize[32];\n sharedComponentSize[threadIdx.x] = myComponentSize;\n __syncthreads();\n for (int i = 1; i < 32; i *= 2) {\n if (threadIdx.x % (2 * i) == 0) {\n sharedComponentSize[threadIdx.x] += sharedComponentSize[threadIdx.x + i];\n }\n __syncthreads();\n }\n\n // Store the largest component size in the output variable\n if (threadIdx.x == 0) {\n *largestComponentSize = sharedComponentSize[0];\n 
}\n}\n", + "\n\n\t// Initialize the largest component size to 0\n\t*largestComponentSize = 0;\n\n\t// Iterate over each vertex\n\tfor (int i = 0; i < N; i++) {\n\t\t// If the vertex is not visited, perform a depth-first search to find the size of the component\n\t\tif (A[i * N + i] == 0) {\n\t\t\tint componentSize = 0;\n\t\t\tdepthFirstSearch(A, N, i, &componentSize);\n\n\t\t\t// Update the largest component size if necessary\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\n\tint count = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcount = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*largestComponentSize = max(*largestComponentSize, count);\n\t}\n}\n", + "\n\n // Each thread checks if it is in the largest component.\n int myComponentSize = 0;\n for (int i = blockIdx.x; i < N; i += blockDim.x) {\n for (int j = blockIdx.y; j < N; j += blockDim.y) {\n if (A[i * N + j] == 1) {\n myComponentSize++;\n }\n }\n }\n\n // Each thread writes its component size to shared memory.\n __shared__ int sharedComponentSize[blockDim.x * blockDim.y];\n sharedComponentSize[threadIdx.x * blockDim.y + threadIdx.y] = myComponentSize;\n\n // Each thread block finds the maximum component size in shared memory.\n __syncthreads();\n int maxComponentSize = 0;\n for (int i = 0; i < blockDim.x * blockDim.y; i++) {\n maxComponentSize = max(maxComponentSize, sharedComponentSize[i]);\n }\n\n // The thread with the maximum component size writes it to global memory.\n if (threadIdx.x == 0 && threadIdx.y == 0) {\n *largestComponentSize = maxComponentSize;\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 0) {\n\t\treturn;\n\t}\n\tint componentSize = 0;\n\tint visited[N];\n\tfor (int k = 0; k < N; k++) {\n\t\tvisited[k] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[i] == 0) {\n\t\tcomponentSize = dfs(A, N, i, visited);\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tif (componentSize > *largestComponentSize) {\n\t\t\t*largestComponentSize = componentSize;\n\t\t}\n\t}\n}\n", + "\n\n int myComponentSize = 0;\n int myComponentId = threadIdx.x + blockIdx.x * blockDim.x;\n if (myComponentId < N) {\n bool visited[N];\n for (int i = 0; i < N; i++) { visited[i] = false; }\n visited[myComponentId] = true;\n for (int i = 0; i < N; i++) {\n if (A[myComponentId * N + i] && !visited[i]) {\n myComponentSize += 1;\n dfs(A, N, visited, i);\n }\n }\n }\n __syncthreads();\n int maxComponentSize = 0;\n for (int i = 0; i < N; i++) {\n maxComponentSize = max(maxComponentSize, myComponentSize);\n }\n *largestComponentSize = maxComponentSize;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\n\tint row = tid / N;\n\tint col = tid % N;\n\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\n\tint componentSize = 1;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\n\tvisited[row] = 1;\n\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] == 1 && visited[i] == 0) {\n\t\t\tcomponentSize += dfs(A, N, i, visited);\n\t\t}\n\t}\n\n\tif (componentSize > *largestComponentSize) {\n\t\t*largestComponentSize 
= componentSize;\n\t}\n}\n", + "\n\n\tint myRow = blockIdx.x * blockDim.x + threadIdx.x;\n\tint myCol = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (myRow < N && myCol < N) {\n\t\tif (A[myRow * N + myCol] == 1) {\n\t\t\tint componentSize = 0;\n\t\t\tint visited[N];\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tvisited[i] = 0;\n\t\t\t}\n\t\t\tdfs(myRow, myCol, A, N, visited, &componentSize);\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint componentSize = 0;\n\tint maxComponentSize = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tfor (int i = 0; i < N; i++) {\n\t\tif (visited[i] == 0) {\n\t\t\tcomponentSize = 0;\n\t\t\tdfs(A, i, visited, &componentSize);\n\t\t\tmaxComponentSize = max(maxComponentSize, componentSize);\n\t\t}\n\t}\n\t*largestComponentSize = maxComponentSize;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid_row == tid_col) {\n\t\tint componentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_row * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = max(*largestComponentSize, componentSize);\n\t}\n}\n", + "\n\n\tint myComponentSize = 0;\n\tint myId = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (A[N * myId + myId] == 1) {\n\t\tmyComponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[N * myId + i] == 1 && i != myId) {\n\t\t\t\tmyComponentSize += 1;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, myComponentSize);\n}\n", + "\n\n\tint threadIdx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint threadIdy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (threadIdx < N && threadIdy < N) {\n\t\tif (A[threadIdx * N + threadIdy] == 1) {\n\t\t\t// Found a 1 in the adjacency matrix.\n\t\t\t// Perform a breadth-first search to find the size of the component.\n\t\t\tint componentSize = 0;\n\t\t\tstd::queue q;\n\t\t\tq.push(threadIdx);\n\t\t\twhile (!q.empty()) {\n\t\t\t\tint v = q.front();\n\t\t\t\tq.pop();\n\t\t\t\tcomponentSize++;\n\t\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\t\tif (A[v * N + i] == 1 && i != v) {\n\t\t\t\t\t\tq.push(i);\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t\t// Update the largest component size if necessary.\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, componentSize);\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, componentSize);\n}\n", + "\n\n\tint localLargestComponentSize = 0;\n\tint globalLargestComponentSize = 0;\n\tint myRow = blockIdx.y * blockDim.y + threadIdx.y;\n\tint myCol = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (myRow < N && 
myCol < N) {\n\t\tif (A[myRow * N + myCol] == 1) {\n\t\t\tlocalLargestComponentSize++;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (threadIdx.x == 0 && threadIdx.y == 0) {\n\t\tfor (int i = 1; i < gridDim.x * gridDim.y; i++) {\n\t\t\tif (localLargestComponentSize < globalLargestComponentSize) {\n\t\t\t\tlocalLargestComponentSize = globalLargestComponentSize;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = localLargestComponentSize;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\t*largestComponentSize = max(*largestComponentSize, componentSize);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce to find the largest component size\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform DFS on each rank\n std::vector visited(N, false);\n int max_component_size = 0;\n #pragma omp parallel\n {\n int local_max_component_size = 0;\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n std::stack s;\n s.push(i);\n int component_size = 0;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n if (!visited[u]) {\n visited[u] = true;\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n }\n }\n }\n }\n local_max_component_size = std::max(local_max_component_size, component_size);\n }\n }\n #pragma omp critical\n {\n max_component_size = std::max(max_component_size, local_max_component_size);\n }\n }\n\n // Gather the maximum component size from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int recv_max_component_size;\n 
MPI_Recv(&recv_max_component_size, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n max_component_size = std::max(max_component_size, recv_max_component_size);\n }\n } else {\n MPI_Send(&max_component_size, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n\n return max_component_size;\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int largest_component_size;\n MPI_Reduce(&my_component_size, &largest_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find the connected components.\n std::vector visited(N, 0);\n std::vector component_size(N, 0);\n int max_component_size = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n int component_size = 0;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n component_size[i] = component_size;\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n MPI_Allreduce(&max_component_size, &max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n 
std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int largest_component_size;\n MPI_Reduce(&my_component_size, &largest_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the size of its own component\n int local_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_size = std::max(local_size, size);\n }\n }\n\n // Reduce the local sizes to find the global maximum\n int global_size;\n MPI_Reduce(&local_size, &global_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n // Perform a breadth-first search from each vertex to find all the vertices in the same component.\n int max_component_size = 0;\n #pragma omp parallel for reduction(max : max_component_size)\n for (size_t i = 0; i < N; ++i) {\n if (A[i][i] == 0) {\n continue;\n }\n\n // Initialize the queue with the current vertex.\n std::queue queue;\n queue.push(i);\n\n // Mark the current vertex as visited.\n A[i][i] = 0;\n\n // Perform a breadth-first search to find all the vertices in the same component.\n int component_size = 0;\n while (!queue.empty()) {\n size_t current_vertex = queue.front();\n queue.pop();\n component_size++;\n\n // Add all the unvisited neighbors of the current vertex to the queue.\n for (size_t j = 0; j < N; ++j) {\n if (A[current_vertex][j] == 1 && A[j][j] != 0) {\n queue.push(j);\n A[j][j] = 0;\n }\n }\n }\n\n max_component_size = std::max(max_component_size, component_size);\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n componentSize++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across 
all ranks.\n int globalMaxComponentSize;\n MPI_Reduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (rank == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(size, 0);\n\n // Perform a DFS on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int localComponentSize = 0;\n std::stack s;\n s.push(i);\n visited[i] = true;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n localComponentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n visited[j] = true;\n }\n }\n }\n componentSize[rank] = std::max(componentSize[rank], localComponentSize);\n }\n }\n\n // Reduce the component size array to find the largest component\n int globalComponentSize;\n MPI_Reduce(&componentSize[0], &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform a breadth-first search (BFS) from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n componentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Reduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (rank == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n // Perform a breadth-first search to find the largest component.\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the max_component_size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the result on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the number of vertices in its largest component\n int local_max = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int count = 
0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n count++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_max = std::max(local_max, count);\n }\n }\n\n // Reduce the local maxima to find the global maximum\n int global_max;\n MPI_Reduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the global maximum on rank 0\n if (my_rank == 0) {\n return global_max;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size\n int largestComponentSize = 0;\n\n // Loop over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n\n // Update the largest component size\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Reduce the largest component size to rank 0\n int globalLargestComponentSize;\n MPI_Reduce(&largestComponentSize, &globalLargestComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size\n return globalLargestComponentSize;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(N, 0);\n\n // Initialize the maximum component size\n int maxComponentSize = 0;\n\n // Loop over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited\n if (!visited[i]) {\n // Perform a depth-first search to find the size of the component containing vertex i\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n componentSize++;\n // Push all adjacent vertices onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the maximum component size\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Return the maximum component size\n return maxComponentSize;\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int max_component_size = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int 
my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int global_component_size;\n MPI_Reduce(&my_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n std::vector componentSize(N, 0);\n int largest = 0;\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n componentSize[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize[i]++;\n }\n }\n }\n largest = std::max(largest, componentSize[i]);\n }\n }\n\n int globalLargest = 0;\n MPI_Reduce(&largest, &globalLargest, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalLargest;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector component_size(num_ranks, 0);\n\n // Perform a parallel depth-first search to find the largest component\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int local_component_size = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n ++local_component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex * N + j] && !visited[j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n component_size[my_rank] = std::max(component_size[my_rank], local_component_size);\n }\n }\n\n // Reduce the component size array to find the largest 
component\n int global_component_size = 0;\n MPI_Reduce(component_size.data(), &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n\t Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n if (visited[i] == 0) {\n visited[i] = 1;\n Kokkos::View stack(\"stack\", N);\n size_t stack_size = 0;\n stack[stack_size++] = i;\n while (stack_size > 0) {\n size_t j = stack[--stack_size];\n for (size_t k = 0; k < N; k++) {\n if (A(j, k) == 1 && visited[k] == 0) {\n visited[k] = 1;\n stack[stack_size++] = k;\n }\n }\n }\n }\n });\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n count += (visited[i] == 1);\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n componentCount(A, N);\n }\n });\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n dfs(A, visited, j, N);\n }\n });\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + 
"did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::atomic_fetch_add(&visited(j), 1);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t k) {\n if (A(j, k) == 1 && visited(k) == 0) {\n Kokkos::atomic_fetch_add(&visited(k), 1);\n }\n });\n }\n });\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n KokConfigService::getSingleton().setNumThreads(N);\n KokConfigService::getSingleton().setParallelMode(KokConfigService::ParallelMode::OMP);\n KokConfigService::getSingleton().setOmpNumThreads(N);\n\n Kok::View counts(N);\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) { counts[i] = i; });\n\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) {\n for (size_t j = 0; j < N; j++) {\n if (A(i, j) == 1 && i != j) { counts[j] = counts[i]; }\n }\n });\n\n Kok::View unique_counts(N);\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) { unique_counts[i] = counts[i]; });\n Kok::sort(unique_counts);\n int num_components = 0;\n for (size_t i = 0; i < N; i++) {\n if (i == 0 || unique_counts[i] != unique_counts[i - 1]) { num_components++; }\n }\n return num_components;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n count++;\n dfs(A, i, visited);\n }\n }\n\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n visit(A, i, visited);\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + 
{ + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited[k] == 0) { visited[k] = 1; }\n });\n visited[j] = 1;\n }\n });\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n ++count;\n dfs(A, visited, i, count);\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View labels(\"labels\", N);\n int num_components = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (labels(i) == 0) {\n int new_label = ++num_components;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &j) {\n if (A(i, j) == 1 && labels(j) == 0) {\n labels(j) = new_label;\n }\n });\n }\n });\n return num_components;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n dfs(A, visited, j, N);\n }\n });\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n dfs(A, visited, i, N);\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited[k] == 0) { visited[k] = 1; }\n });\n visited[j] = 1;\n }\n });\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::atomic_fetch_add(&visited(j), 1);\n componentCount(A, j);\n }\n });\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n componentCount(A, N, j);\n }\n });\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1) visited(k) = 1;\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n\t Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int nComponents = 0;\n std::vector visited(N, 0);\n\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n nComponents++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t curr = q.front();\n q.pop();\n if (visited[curr] == 0) {\n visited[curr] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[curr][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return nComponents;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) continue;\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n count++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n bool visited = false;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n if (!visited) {\n count++;\n visited = true;\n }\n break;\n }\n }\n }\n return count;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t curr = q.front();\n q.pop();\n if (visited[curr] == 0) {\n visited[curr] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[curr][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false for all nodes\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all nodes\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all nodes in the component\n dfs(A, i, visited);\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int* visited = new int[N];\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n delete[] visited;\n return count;\n}\n", + "\n\n int nComponents = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for 
reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n nComponents++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n delete[] visited;\n return nComponents;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n count++;\n }\n }\n return count;\n}\n", + "\n\n int num_components = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n visited[current] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[current * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n delete[] visited;\n return num_components;\n}\n", + "\n\n int num_components = 0;\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n bool is_new_component = true;\n for (size_t j = 0; j < i; j++) {\n if (A[i * N + j] == 1) {\n is_new_component = false;\n break;\n }\n }\n if (is_new_component) { num_components++; }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) { q.push(j); }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t v = stack.top();\n stack.pop();\n if (!visited[v]) {\n visited[v] = true;\n // Push all the unvisited neighbors of v onto the stack\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n num_components++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n std::vector 
visited(N, 0);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n num_components++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false for all nodes\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all nodes\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over all neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the number of connected components\n return componentCount;\n}\n", + "_", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n#pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t cur = q.front();\n q.pop();\n if (visited[cur] == 0) {\n visited[cur] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[cur][j] == 1) { q.push(j); }\n }\n }\n }\n }\n }\n delete[] visited;\n return num_components;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, false);\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n dfs(A, visited, i);\n count++;\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "serial", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(i, A, visited);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(size_t i, std::vector const& A, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(j, A, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, visited, i, N);\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, visited, i);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, std::vector& visited, size_t i) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, visited, j);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + ":", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(i, A, visited);\n ++count;\n }\n }\n return 
count;\n}\n\nvoid dfs(size_t i, std::vector const& A, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(j, A, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, visited, i);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n dfs(A, i, visited);\n count++;\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "hip", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void componentCount(const int *A, size_t N, int *numComponents) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tif (count == 0) {\n\t\tcount = 1;\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n\tint tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\tif (tid >= N || tid_y >= N) {\n\t\treturn;\n\t}\n\tint component_id = tid;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tcomponent_id = min(component_id, i);\n\t\t}\n\t}\n\tatomicAdd(&numComponents[component_id], 1);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tint root = tid;\n\twhile (A[root * N + bid] != root) {\n\t\troot = A[root * N + bid];\n\t}\n\tif (tid == 0) {\n\t\tA[bid * N + bid] = root;\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint numThreads = hipGridDim_x * hipBlockDim_x;\n\tint label = tid;\n\tint numLabels = numThreads;\n\tbool changed = true;\n\twhile (changed) {\n\t\tchanged = false;\n\t\tfor (int i = tid; i < N; i += numThreads) {\n\t\t\tfor (int j = 0; j < N; j++) {\n\t\t\t\tif (A[i * N + j] && label > j) 
{\n\t\t\t\t\tlabel = j;\n\t\t\t\t\tchanged = true;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t\tif (tid == 0) {\n\t\t\tif (label < numLabels) {\n\t\t\t\tnumLabels = label;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n\t*numComponents = numLabels;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] != 0) {\n count++;\n for (int j = i + 1; j < N; j++) {\n if (A[tid * N + j] != 0) {\n count++;\n }\n }\n break;\n }\n }\n atomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint component = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcomponent = 1;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tatomicAdd(numComponents, component);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\tif (A[i * N + j] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N || tid_y >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tbool visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = false;\n\t}\n\tvisited[tid] = true;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1 && !visited[i]) {\n\t\t\tcount++;\n\t\t\t__syncblocks();\n\t\t\tDFS(A, i, visited, N);\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx.x + hipBlockIdx.x * hipBlockDim.x;\n\tint bid = hipBlockIdx.y + hipBlockIdx.z * hipGridDim.z;\n\tint i = tid + bid * hipBlockDim.x * hipGridDim.x;\n\tint count = 0;\n\twhile (i < N * N) {\n\t\tif (A[i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t\ti += N * N;\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint id = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (id >= N) return;\n\tint count = 0;\n\tbool visited[N];\n\tmemset(visited, 0, sizeof(visited));\n\tfor (int i = 0; i < N; i++) {\n\t if (visited[i]) continue;\n\t visited[i] = true;\n\t count++;\n\t for (int j = 0; j < N; j++) {\n\t if (A[i * N + j] == 0) continue;\n\t visited[j] = true;\n\t }\n\t}\n\tnumComponents[id] = count;\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tint count = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = 
tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tint n = hipBlockDim_x * hipGridDim_x;\n\tint count = 0;\n\tif (tid == 0) {\n\t\tcount = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*numComponents = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The component count is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n int global_count = 0;\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n local_count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM);\n if (rank == 0) {\n return global_count;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its own subset of rows.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N];\n std::fill(visited, visited + N, false);\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && !visited[j]) {\n ++localCount;\n std::queue q;\n q.push(j);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n for (size_t v = 0; v < N; ++v) {\n if (A[u * N + v] && !visited[v]) {\n q.push(v);\n }\n }\n }\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int component_count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n component_count++;\n }\n }\n\n int global_component_count;\n MPI_Reduce(&component_count, &global_component_count, 1, MPI_INT, MPI_SUM, 0, 
MPI_COMM_WORLD);\n if (rank == 0) {\n return global_component_count;\n }\n return 0;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit\n while (!stack.empty()) {\n // Pop the top vertex from the stack\n size_t vertex = stack.top();\n stack.pop();\n\n // Mark the vertex as visited\n visited[vertex] = true;\n\n // Iterate over all adjacent vertices\n for (size_t j = 0; j < N; j++) {\n // If the adjacent vertex is connected and has not been visited, push it onto the stack\n if (A[vertex][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n count++;\n std::queue q;\n q.push(i);\n\n // Perform a breadth-first search to visit all nodes in the component\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n\n // Iterate over the neighbors of u\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all the unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of connected components in its portion of the graph.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n localCount++;\n break;\n }\n }\n }\n\n // 
Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n count++;\n while (!q.empty()) {\n int cur = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[cur * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) return global_count;\n return -1;\n}\n", + "\n\n int n_components = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n n_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n\n int global_n_components;\n MPI_Allreduce(&n_components, &global_n_components, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_n_components;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n\n int globalCount;\n MPI_Reduce(&count, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; 
i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then push the neighbor onto the stack\n if (!visited[j] && A[i][j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Count the number of connected components\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) continue;\n visited[u] = true;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its local rows\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited = false;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] && !visited) {\n localCount++;\n visited = true;\n }\n }\n }\n\n // Reduce the local counts to get the global count\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n 
// Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t u = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[u] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then push the neighbor onto the stack\n if (!visited[j] && A[i][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n // Iterate over the neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its local part of the graph\n int local_count = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N] = {false};\n for (size_t j = 0; j < N; j++) {\n if (!visited[j] && A[i][j] == 1) {\n // Found a new component\n local_count++;\n std::queue q;\n q.push(j);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n for (size_t v = 0; v < N; v++) {\n if (!visited[v] && A[u][v] == 1) {\n q.push(v);\n }\n }\n }\n }\n }\n }\n\n // Reduce the local counts to get the global count\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first 
search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is connected and has not been visited, then add it to the stack\n if (A[node][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of connected components in its own subset of rows.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N] = {false};\n for (size_t j = 0; j < N; j++) {\n if (!visited[j] && A[i][j] == 1) {\n localCount++;\n dfs(A, visited, j, N);\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "cuda", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void componentCount(const int *A, size_t N, int *numComponents) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tint row = tid / N;\n\tint col = tid % N;\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\tatomicAdd(numComponents, 1);\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] != 0) {\n\t\t\tA[row * N + i] = 0;\n\t\t\tA[i * N + row] = 0;\n\t\t\tcomponentCount<<<1, N>>>(A, N, numComponents);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = -1;\n\t\t} else {\n\t\t\tnumComponents[tid] = tid;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && numComponents[tid] != -1) {\n\t\t\tnumComponents[k] = numComponents[tid];\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid < N && tid_y < N) {\n\t if (A[tid * N + tid_y] == 1) {\n\t atomicAdd(numComponents, 1);\n\t }\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tint row = tid / N;\n\tint col = tid % N;\n\tif (A[tid] == 0) {\n\t\tnumComponents[tid] = 0;\n\t\treturn;\n\t}\n\tint component = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] == 1) {\n\t\t\tcomponent = i;\n\t\t\tbreak;\n\t\t}\n\t}\n\tnumComponents[tid] = component;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tbool visited = false;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tvisited = true;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tif (visited) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\t// Each thread checks if the current element is non-zero.\n\t// If so, it increments the component count.\n\tint row = blockIdx.x;\n\tint col = blockIdx.y;\n\tif (A[row * N + col] != 0) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint bid = blockIdx.y * blockDim.y + blockIdx.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[bid] == 0) {\n\t\tvisited[bid] = 1;\n\t\t(*numComponents)++;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcomponentCount(A, N, numComponents);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\tif (A[i * N + 
j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = 0;\n\t\t} else {\n\t\t\tnumComponents[tid] = 1;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tint numComponents_local = numComponents[tid];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && numComponents_local != -1) {\n\t\t\tnumComponents[k] = numComponents_local;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (i == 0) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (numComponents[k] != -1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t*numComponents = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint bid = blockIdx.y * blockDim.y + blockIdx.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[bid] == 0) {\n\t\tvisited[bid] = 1;\n\t\t*numComponents += 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcomponentCount(A, N, numComponents);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tidx = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tidy = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tidx >= N || tidy >= N) {\n\t\treturn;\n\t}\n\tif (A[tidx * N + tidy] == 0) {\n\t\treturn;\n\t}\n\tatomicAdd(numComponents, 1);\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[tid] == 0) {\n\t\tvisited[tid] = 1;\n\t\t__syncthreads();\n\t\tint count = 1;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tif (A[tid * N + j] == 1 && visited[j] == 0) {\n\t\t\t\tvisited[j] = 1;\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t\tatomicAdd(numComponents, count);\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\tint component = tid;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tcomponent = min(component, i);\n\t\t}\n\t}\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tA[tid_y * N + i] = component;\n\t\t}\n\t}\n\tif (tid == component) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = 0;\n\t\t} else {\n\t\t\tnumComponents[tid] = 1;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tint numComponents_local = numComponents[tid];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1) {\n\t\t\tnumComponents_local = min(numComponents_local, numComponents[k * N + j]);\n\t\t}\n\t}\n\tnumComponents[tid] = numComponents_local;\n\t__syncthreads();\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tcount += (numComponents[k * N + j] == numComponents_local);\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x 
* blockDim.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid_row == tid_col) {\n\t\tif (A[tid] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j && A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tidx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tidy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (tidx >= N || tidx >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tidx * N + i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The component count is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n local_count++;\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+: count)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top vertex from the stack\n size_t v = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and is connected to the vertex, add it to the stack\n if (!visited[j] && A[i][j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n count++;\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a visited array to keep track of which nodes 
have been visited\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over all the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then add the neighbor to the stack\n if (!visited[j] && A[node * N + j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int count = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (visited[u] == 0) {\n count++;\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) { q.push(j); }\n }\n }\n }\n local_count += count > 0;\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) { return global_count; }\n return -1;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; 
++j) {\n // If the neighbor has not been visited and is connected to the node, then visit the neighbor\n if (!visited[j] && A[i][j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n count++;\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Visit all the unvisited neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && !visited[j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n MPI_Reduce(&componentCount, &componentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push 
all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a DFS to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // Perform a DFS to visit all connected nodes\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack and mark it as visited\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n while (!stack.empty()) {\n // Get the current node\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the current node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and is connected to the current node, then add it to the stack\n if (!visited[j] && A[i][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n MPI_Reduce(&componentCount, nullptr, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return componentCount;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n int global_count = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n int my_start = rank * (N / size);\n int my_end = (rank + 1) * (N / size);\n if (rank == size - 1) {\n my_end = N;\n }\n #pragma omp parallel for reduction(+ : local_count)\n for (int i = my_start; i < my_end; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_count++;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n delete[] visited;\n if (rank == 0) {\n return global_count;\n }\n return 
-1;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[j] = true;\n\n // Push all the unvisited neighbors of the node onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[i * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all the neighbors of the node onto the stack\n for (size_t j = 0; j < N; ++j) {\n if (A[node * N + j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) 
{\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n // Push all the unvisited neighbors of the vertex onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n count++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; k++) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count across all ranks\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the connected components for a subset of the rows\n int start_row = rank * (N / size);\n int end_row = (rank + 1) * (N / size);\n if (rank == size - 1) {\n end_row = N;\n }\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Count the number of connected components\n int count = 0;\n for (int i = start_row; i < end_row; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n\n // Reduce the count across all ranks\n int global_count;\n 
MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "&" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A represents a directed graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return 
count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) { count += A(i, j); });\n });\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int edge_count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { edge_count += A(i, j); }\n });\n return edge_count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int count = 0;\n Kokkos::parallel_for(\"edgeCount\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) {\n count += A(i, j);\n }\n });\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View edgeCount(\"edgeCount\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n edgeCount(i) = 0;\n for (size_t j = 0; j < N; ++j) { edgeCount(i) += A(i, j); }\n });\n int totalEdgeCount = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &total) { total += edgeCount(i); }, totalEdgeCount);\n return totalEdgeCount;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, int &edgeCount) {\n for (size_t j = 0; j < N; ++j) {\n edgeCount += A(i, j);\n }\n },\n edgeCount);\n return edgeCount;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int edge_count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n if (A(i, j) > 0) {\n ++edge_count;\n }\n }\n });\n return edge_count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) { count += A(i, j); });\n });\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n count += A(i, j);\n }\n });\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA (const int i) {\n for (int j = 0; j < N; j++) {\n if (A(i, j) != 0) {\n count++;\n }\n }\n });\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A represents a directed graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) { count += A[i * N + j] > 0; }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int edge_count = 0;\n#pragma omp parallel for reduction(+ : edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n edge_count += A[i * N + j];\n }\n }\n return edge_count;\n}\n", + "\n\n int count = 0;\n#pragma opm parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i][j] > 0) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : 
count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "serial", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 
0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "hip", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n Store the result in numEdges. A represents a directed graph.\n A is an NxN adjacency matrix stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void edgeCount(const int *A, size_t N, int *numEdges) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int count = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n hipAtomicAdd(numEdges, count);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[i * N + j];\n }\n }\n __syncthreads();\n atomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint numEdgesLocal = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tnumEdgesLocal += A[tid * N + i];\n\t}\n\thipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[i * N + j] > 0;\n }\n }\n atomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n int tid = threadIdx.x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = 
hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += (A[tid * N + i] != 0);\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n for (int j = 0; j < N; j++) {\n if (A[tid * N + j] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[tid * N + j];\n }\n hipAtomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int row = tid;\n int col = tid;\n int count = 0;\n while (row < N) {\n count += A[row * N + col];\n row += hipBlockDim_x;\n col += hipBlockDim_x;\n }\n hipAtomicAdd(numEdges, count);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < 
N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + ", \u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2 Saxe\u51a2 Saxe\u51a2\u51a2\u51a2 Saxe\u51a2\u51a2\u51a2\u51a2", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] != 0) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "cuda", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n Store the result in numEdges. A represents a directed graph.\n A is an NxN adjacency matrix stored in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void edgeCount(const int *A, size_t N, int *numEdges) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x;\n int count = 0;\n for (int col = 0; col < N; col++) {\n count += A[row * N + col];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int rowStart = blockIdx.x * blockDim.x + threadIdx.x;\n if (rowStart >= N) {\n return;\n }\n int rowEnd = rowStart + blockDim.x;\n if (rowEnd > N) {\n rowEnd = N;\n }\n int count = 0;\n for (int i = rowStart; i < rowEnd; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] != 0) {\n count++;\n }\n }\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int j = 0; j < N; j++) {\n if (A[tid * N + j] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\nint rowStart = blockIdx.x * blockDim.x + threadIdx.x;\nif (rowStart >= N) {\n return;\n}\nint rowEnd = rowStart + blockDim.x;\nif (rowEnd > N) {\n rowEnd = N;\n}\nint count = 0;\nfor (int i = rowStart; i < rowEnd; ++i) {\n for (int j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n}\natomicAdd(numEdges, count);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; ++col) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row < N && col < N) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] == 1) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row >= N) {\n return;\n }\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; ++col) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n 
}\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma parallel for reduction (+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Allgather(&local_count, 1, MPI_INT, &global_count, 1, MPI_INT, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n int nthreads;\n omp_set_num_threads(nthreads);\n int local_count = 0;\n int global_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (myrank == 0) {\n return global_count;\n } else {\n return 0;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 
0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n int global_count;\n\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "\n\n int local_edge_count = 0;\n int global_edge_count;\n #pragma omp parallel for reduction(+: local_edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_edge_count += A[i * N + j];\n }\n }\n MPI_Reduce(&local_edge_count, &global_edge_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_edge_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_edge_count = 0;\n #pragma omp parallel for reduction(+: local_edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_edge_count += A[i * N + j];\n }\n }\n int global_edge_count;\n MPI_Reduce(&local_edge_count, &global_edge_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_edge_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n 
}\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n int global_count = 0;\n #pragma omp parallel for reduction(+: local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max = 0;\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; ++i) {\n // Iterate over the columns of the adjacency matrix\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between node i and node j, increment the degree of node i\n if (A(i, j) == 1) {\n ++max;\n }\n }\n }\n // Return the maximum degree\n return max;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View maxDegree(\"maxDegree\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n maxDegree(i) = degree;\n });\n return *Kokkos::max_element(maxDegree);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": 
false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &max_degree) {\n int degree = 0;\n for (int j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A(i, j);\n }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int maxDegree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &maxDegree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n maxDegree = Kokkos::max(maxDegree, degree);\n });\n return maxDegree;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": 
false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::View;\n\n int max_degree = 0;\n Kokkos::parallel_reduce(\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_degree) {\n int degree = 0;\n for (int j = 0; j < N; j++) {\n degree += A(i, j);\n }\n max_degree = std::max(max_degree, degree);\n },\n max_degree);\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0386020126, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0919273778, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n if (degree > max_degree) { max_degree = degree; }\n });\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n 
Kokkos::View maxDegree(\"maxDegree\", N);\n Kokkos::parallel_for(\"maxDegree\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A(i, j);\n }\n maxDegree(i) = degree;\n });\n int max = 0;\n for (size_t i = 0; i < N; i++) {\n if (maxDegree(i) > max) {\n max = maxDegree(i);\n }\n }\n return max;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.037135046, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.086959529, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.043577482, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.021994718, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010904635, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0054614604, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030558982, + "num_threads": 32 + } + ] + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "_t>", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for 
(size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": 
"17_graph_highest_degree", + "parallelism_model": "serial", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_deg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n max_deg = std::max(max_deg, deg);\n }\n return max_deg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) { deg += A[i * N + j]; }\n maxDeg = std::max(maxDeg, deg);\n }\n return maxDeg;\n}\n", + "\n\n int max_deg = 0;\n for (int i = 0; i < N; i++) {\n int deg = 0;\n for (int j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n max_deg = std::max(max_deg, deg);\n }\n return max_deg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i][j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + 
"\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) { deg += A[i * N + j]; }\n maxDeg = std::max(maxDeg, deg);\n }\n return maxDeg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "hip", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Store the result in maxDegree.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void maxDegree(const int *A, size_t N, int *maxDegree) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n localMax = max(localMax, *maxDegree);\n if (threadIdx.x == 0) {\n *maxDegree = localMax;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n atomicMax(&(*maxDegree), max);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n hipAtomicMax(&(*maxDegree), degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = 
degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n hipAtomicMax(maxDegree, degree);\n}\n", + "\n\n int myDegree = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; ++i) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint row = bid * hipBlockDim_x + tid;\n\tint degree = 0;\n\tif (row < N) {\n\t\tfor (int col = 0; col < N; col++) {\n\t\t\tdegree += A[row * N + col];\n\t\t}\n\t}\n\thipAtomicMax(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n *maxDegree = hipAtomicMax(*maxDegree, degree);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n }\n int max_degree;\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "\n\n int my_max = 0;\n for (size_t i = 0; i < N; i++) {\n int row_sum = 0;\n for (size_t j = 0; j < N; j++) {\n row_sum += A[i * N + j];\n }\n my_max = std::max(my_max, row_sum);\n }\n\n int global_max;\n MPI_Allreduce(&my_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int rowSum = 0;\n for (size_t j = 0; j < N; j++) { rowSum += A[i * N + j]; }\n myDegree = std::max(myDegree, rowSum);\n }\n\n int globalDegree;\n MPI_Allreduce(&myDegree, &globalDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] != 0) {\n myDegree++;\n }\n }\n }\n int maxDegree = 0;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n maxDeg = std::max(maxDeg, deg);\n }\n int globalMaxDeg;\n MPI_Allreduce(&maxDeg, &globalMaxDeg, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDeg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n myDegree++;\n }\n }\n }\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int local_max = 0;\n for (size_t i = 0; i < N; i++) {\n int local_degree = 0;\n for (size_t j = 0; j < N; j++) {\n local_degree += A[i * N + j];\n }\n local_max = std::max(local_max, local_degree);\n }\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n if (degree > maxDegree) {\n maxDegree = degree;\n 
}\n }\n return maxDegree;\n}\n", + "\n\n int local_max = 0;\n for (size_t i = 0; i < N; i++) {\n int row_sum = 0;\n for (size_t j = 0; j < N; j++) {\n row_sum += A[i * N + j];\n }\n local_max = std::max(local_max, row_sum);\n }\n\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int my_max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n my_max_degree = std::max(my_max_degree, degree);\n }\n\n int max_degree;\n MPI_Allreduce(&my_max_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "\n\n int my_degree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n\n int max_degree;\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "Degree)", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n myDegree += A[i * N + j];\n }\n }\n\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n\n int globalMaxDegree;\n MPI_Allreduce(&myDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n myDegree += A[i * N + j];\n }\n }\n\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n degree++;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, °ree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n }\n return maxDegree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "cuda", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph. \n Store the result in maxDegree.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void maxDegree(const int *A, size_t N, int *maxDegree) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n if (localMax > *maxDegree) {\n *maxDegree = localMax;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n if (tid == 0) {\n *maxDegree = max;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, myDegree);\n}\n", + "\n\n int tid = threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n maxDegree[tid] = max;\n}\n", + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = max(*maxDegree, localMax);\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint max = 0;\n\tif (tid < N) {\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tmax += A[tid * N + i];\n\t\t}\n\t}\n\tmaxDegree[tid] = max;\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tint max_degree = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tmax_degree = max(max_degree, maxDegree[i]);\n\t\t}\n\t\t*maxDegree = max_degree;\n\t}\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, 
myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = myDegree;\n for (size_t i = 1; i < gridDim.x; i++) {\n int otherDegree;\n cudaMemcpy(&otherDegree, &A[i * blockDim.x * N], sizeof(int), cudaMemcpyDeviceToHost);\n *maxDegree = max(*maxDegree, otherDegree);\n }\n }\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = max(*maxDegree, myDegree);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "Degree,", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(&*maxDegree, myDegree);\n}\n", + "\n\n\tint myMaxDegree = 0;\n\tint rowStart = blockIdx.x * blockDim.x + threadIdx.x;\n\tfor (int i = rowStart; i < N; i += blockDim.x * gridDim.x) {\n\t\tint degree = 0;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tdegree += A[i * N + j];\n\t\t}\n\t\tmyMaxDegree = max(myMaxDegree, degree);\n\t}\n\tatomicAdd(maxDegree, myMaxDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n myDegree++;\n }\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n int my_degree = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk_size = N / nprocs;\n int start_row = rank * chunk_size;\n int end_row = start_row + chunk_size;\n if (rank == nprocs - 1) {\n end_row = N;\n }\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_degree;\n }\n return -1;\n}\n", + "\n\n int max_degree = 0;\n int my_max_degree = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel for reduction(max : my_max_degree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_max_degree += A[i * N + j];\n }\n }\n\n MPI_Allreduce(&my_max_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return max_degree;\n } else {\n return -1;\n }\n}\n", + "\n\n int maxDegree = 0;\n int localMaxDegree = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel for reduction(max : localMaxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n localMaxDegree += A[i * N + j];\n }\n }\n\n MPI_Allreduce(&localMaxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int local_max_degree = 0;\n int global_max_degree = 0;\n\n // Compute the local maximum degree\n #pragma omp parallel for reduction(max : local_max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n local_max_degree = std::max(local_max_degree, degree);\n }\n\n // Reduce the local maximum degrees to find the global maximum degree\n MPI_Allreduce(&local_max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max_degree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n\n #pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, my_degree);\n my_degree = 0;\n }\n\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int localMaxDegree = 0;\n #pragma omp parallel for reduction(max:localMaxDegree)\n for (size_t i = 0; i < N; i++) {\n 
int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n localMaxDegree = std::max(localMaxDegree, degree);\n }\n MPI_Reduce(&localMaxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int myDegree = 0;\n #pragma omp parallel for reduction(max:myDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n myDegree = std::max(myDegree, degree);\n }\n\n int globalDegree;\n MPI_Reduce(&myDegree, &globalDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return globalDegree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n #pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int local_max = 0;\n for (int i = 0; i < N; i++) {\n int local_count = 0;\n for (int j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n local_max = std::max(local_max, local_count);\n }\n\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n #pragma omp parallel for reduction(max: maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n #pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, my_degree);\n my_degree = 0;\n }\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n MPI_Reduce(&maxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n int max_degree = 
0;\n #pragma opm parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int num_threads;\n omp_set_num_threads(4);\n #pragma omp parallel\n {\n #pragma omp for reduction(+:my_degree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n }\n MPI_Reduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(Kokkos::View &A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n visited[i] = 0;\n });\n\n dist[source] = 0;\n visited[source] = 1;\n\n int minDist = INT_MAX;\n\n while (visited[dest] == 0) {\n int minDistNode = -1;\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0 && (minDistNode == -1 || dist[i] < dist[minDistNode])) {\n minDistNode = i;\n }\n }\n\n if (minDistNode == -1) {\n break;\n }\n\n visited[minDistNode] = 1;\n\n for (int i = 0; i < N; i++) {\n if (A(minDistNode, i) == 1 && visited[i] == 0) {\n if (dist[i] == -1 || dist[i] > dist[minDistNode] + 1) {\n dist[i] = dist[minDistNode] + 1;\n }\n }\n }\n }\n\n if (dist[dest] == -1) {\n return -1;\n }\n\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n Kokkos::View visited(\"visited\", N);\n Kokkos::parallel_for(\"bfs\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n visited[i] = false;\n });\n visited[source] = true;\n Kokkos::View queue(\"queue\", N);\n int head = 0;\n int tail = 0;\n queue[tail++] = source;\n while (head < tail) {\n int u = queue[head++];\n for (int v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n queue[tail++] = v;\n }\n }\n }\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0840105598, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0395352846, + "num_threads": 1 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.0394001584, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0395352864, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0397045628, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0392511548, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0389543668, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { distances[i] = std::numeric_limits::max(); });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int current = q.front();\n q.pop();\n\n // Iterate over all neighbors of the current node\n for (size_t i = 0; i < N; i++) {\n if (A(current, i) == 1 && distances[i] == std::numeric_limits::max()) {\n distances[i] = distances[current] + 1;\n q.push(i);\n }\n }\n }\n\n // Return the distance to the destination\n return distances[dest];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0842080628, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0440105438, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0443476394, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0441886626, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0438635552, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0440124012, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0435127546, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0841328296, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0452058898, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.044933462, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.042714697, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0432330592, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0436852668, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0444367346, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::View dist(\"dist\", N);\n dist() = -1;\n dist(source) = 0;\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n visited(source) = true;\n Kokkos::View queue(\"queue\", N);\n queue() = -1;\n int head = 0;\n int tail = 0;\n queue(tail++) = source;\n while (head != tail) {\n int u = queue(head++);\n for (size_t v 
= 0; v < N; v++) {\n if (A(u, v) == 1 && !visited(v)) {\n dist(v) = dist(u) + 1;\n visited(v) = true;\n queue(tail++) = v;\n }\n }\n }\n return dist(dest);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) { distances[i] = INT_MAX; });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Queue of nodes to visit\n std::queue q;\n q.push(source);\n\n // While there are nodes to visit\n while (!q.empty()) {\n // Get the current node\n int current = q.front();\n q.pop();\n\n // If the current node is the destination, return the distance\n if (current == dest) {\n return distances[current];\n }\n\n // Visit all neighbors of the current node\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A(current, i) == 1) {\n // If the distance to the neighbor is greater than the distance to the current node plus 1\n if (distances[i] > distances[current] + 1) {\n // Update the distance to the neighbor\n distances[i] = distances[current] + 1;\n // Add the neighbor to the queue\n q.push(i);\n }\n }\n }\n }\n\n // If there is no path from the source to the destination, return -1\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.084266791, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01946581, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0195119806, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.019623919, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0193766124, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0194564984, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0192351948, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = Kokkos::TeamMember;\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::View distance(\"distance\", N);\n Kokkos::View queue(\"queue\", N);\n\n visited[source] = 1;\n distance[source] = 0;\n queue[0] = source;\n\n int head = 0;\n int tail = 1;\n\n while (head < tail) {\n int u = queue[head++];\n for (int v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n visited[v] = 1;\n distance[v] = distance[u] + 1;\n queue[tail++] = v;\n }\n }\n }\n\n return distance[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) { distances[i] = std::numeric_limits::max(); });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Initialize the queue with the source\n std::queue q;\n q.push(source);\n\n // While the queue is not empty\n while 
(!q.empty()) {\n // Get the current vertex\n int current = q.front();\n q.pop();\n\n // If the current vertex is the destination, return the distance\n if (current == dest) {\n return distances[current];\n }\n\n // For each neighbor of the current vertex\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current vertex and the neighbor\n if (A(current, i) == 1) {\n // If the distance to the neighbor is greater than the distance to the current vertex plus 1\n if (distances[i] > distances[current] + 1) {\n // Update the distance to the neighbor\n distances[i] = distances[current] + 1;\n\n // Add the neighbor to the queue\n q.push(i);\n }\n }\n }\n }\n\n // If the destination is not reachable, return -1\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.084305664, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0195002066, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0195192506, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0196394644, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0197399512, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0195231276, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01921339, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n size_t head = 0;\n size_t tail = 0;\n queue[tail++] = source;\n\n while (head != tail) {\n int u = queue[head++];\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n queue[tail++] = v;\n }\n }\n }\n\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0842841156, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0386721044, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0387407236, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0392473174, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0390489824, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.038924515, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0383880146, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist(i) = -1;\n visited(i) = false;\n queue(i) = false;\n });\n\n dist(source) = 0;\n queue(source) = true;\n\n while (true) {\n int min_dist = -1;\n int min_idx = -1;\n for (int i = 0; i < N; i++) {\n if (queue(i) && (min_dist == -1 || dist(i) < min_dist)) {\n min_dist = dist(i);\n min_idx = i;\n }\n }\n\n if (min_idx == -1) {\n break;\n }\n\n queue(min_idx) = false;\n visited(min_idx) = true;\n\n for (int 
i = 0; i < N; i++) {\n if (A(min_idx, i) > 0 && !visited(i)) {\n if (dist(i) == -1 || dist(i) > dist(min_idx) + 1) {\n dist(i) = dist(min_idx) + 1;\n queue(i) = true;\n }\n }\n }\n }\n\n return dist(dest);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0840364644, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1064914454, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10626428, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.106463088, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1082504406, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1064739786, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1049337384, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t i = 0; i < N; i++) {\n if (A(u, i) && dist[i] == -1) {\n dist[i] = dist[u] + 1;\n q.push(i);\n }\n }\n }\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.084139592, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0396929628, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0397784716, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0401840836, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0402758962, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0395988098, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0393928788, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::View distance = Kokkos::View(\"distance\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) { distance[i] = -1; });\n distance[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && distance[v] == -1) {\n distance[v] = distance[u] + 1;\n q.push(v);\n }\n }\n }\n return distance[dest];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0842222094, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0420163726, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0450162696, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0403346106, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0454525296, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0422991422, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0396364166, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::View 
dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = INT_MAX;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n q.push(v);\n }\n }\n }\n\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0840018432, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0390362426, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0390197998, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0393732478, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0393539076, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.038660168, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0385577858, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0841014884, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.039895393, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0399509234, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04027896, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0399781966, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0395951752, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0414146122, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0841875262, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0397722878, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.039887452, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0401691998, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0401659056, + 
"num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0396399872, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0393798562, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n Kokkos::View queue_head(\"queue_head\", 1);\n Kokkos::View queue_tail(\"queue_tail\", 1);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n queue_head[0] = 0;\n queue_tail[0] = 1;\n queue[0] = source;\n\n while (queue_head[0] < queue_tail[0]) {\n int u = queue[queue_head[0]];\n queue_head[0]++;\n for (int v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n visited[v] = true;\n dist[v] = dist[u] + 1;\n queue[queue_tail[0]] = v;\n queue_tail[0]++;\n }\n }\n }\n\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.084389185, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0398080972, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0398555342, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0403204538, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0399817894, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.039581531, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0393867112, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::View dist(\"dist\", N);\n dist() = -1;\n dist(source) = 0;\n\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n visited(source) = true;\n\n Kokkos::View queue(\"queue\", N);\n queue() = -1;\n int head = 0;\n int tail = 0;\n queue(tail++) = source;\n\n while (head != tail) {\n int u = queue(head++);\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && !visited(v)) {\n dist(v) = dist(u) + 1;\n visited(v) = true;\n queue(tail++) = v;\n }\n }\n }\n\n return dist(dest);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0842541572, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0449543174, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0448667448, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0453693208, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.040189556, + "num_threads": 8 + }, + { + "did_run": true, 
+ "is_valid": true, + "runtime": 0.044018962, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.044486552, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n distances() = std::numeric_limits::max();\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n distances(source) = 0;\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t i = 0; i < N; i++) {\n if (A(u, i) == 1 && distances(i) == std::numeric_limits::max()) {\n q.push(i);\n distances(i) = distances(u) + 1;\n }\n }\n }\n\n // Return distance to destination\n return distances(dest);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n distances() = std::numeric_limits::max();\n\n // Initialize queue with source node\n std::queue queue;\n queue.push(source);\n distances(source) = 0;\n\n // Perform BFS\n while (!queue.empty()) {\n int current = queue.front();\n queue.pop();\n\n // Iterate over neighbors of current node\n for (size_t i = 0; i < N; i++) {\n if (A(current, i) == 1 && distances(i) == std::numeric_limits::max()) {\n distances(i) = distances(current) + 1;\n queue.push(i);\n }\n }\n }\n\n // Return distance to destination\n return distances(dest);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector dist(N, -1);\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[u] + 1 < dist[v]) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n 
// Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] > dist[u] + 1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize the distance to all nodes to infinity\n std::vector distance(N, INT_MAX);\n\n // Set the distance to the source node to 0\n distance[source] = 0;\n\n // Create a queue to store the nodes to be visited\n std::queue queue;\n\n // Enqueue the source node\n queue.push(source);\n\n // While there are still nodes to be visited\n while (!queue.empty()) {\n // Dequeue the next node\n int current = queue.front();\n queue.pop();\n\n // Visit all the neighbors of the current node\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A[current][i] > 0) {\n // If the distance to the neighbor is greater than the distance to the current node plus the weight of the edge\n if (distance[i] > distance[current] + A[current][i]) {\n // Update the distance to the neighbor\n distance[i] = distance[current] + A[current][i];\n\n // Enqueue the neighbor\n queue.push(i);\n }\n }\n }\n }\n\n // Return the distance to the destination node\n return distance[dest];\n}\n", + "\n\n int dist[N];\n std::fill(dist, 
dist + N, -1);\n dist[source] = 0;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n if (u == dest) {\n return dist[u];\n }\n\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int min_dist = INT_MAX;\n std::vector dist(N, INT_MAX);\n std::vector visited(N, false);\n\n dist[source] = 0;\n visited[source] = true;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n if (A[source][i] == 1) {\n dist[i] = 1;\n visited[i] = true;\n }\n }\n\n #pragma omp for\n for (size_t k = 0; k < N; k++) {\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i][k] == 1 && A[k][j] == 1 && dist[i] != INT_MAX && dist[j] > dist[i] + 1) {\n dist[j] = dist[i] + 1;\n visited[j] = true;\n }\n }\n }\n }\n }\n\n min_dist = dist[dest];\n return min_dist == INT_MAX ? -1 : min_dist;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "serial", + "prompt": "/* 
Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, 0);\n std::queue> q;\n q.push({source, 0});\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n if (visited[node] == 1) {\n continue;\n }\n visited[node] = 1;\n for (size_t i = 0; i < N; i++) {\n if (A[node][i] == 1) {\n q.push({i, dist + 1});\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n visited[source] = true;\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n for (int i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n visited[i] = true;\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n visited[source] = 
true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n dist[v] = dist[u] + 1;\n visited[v] = true;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) return dist;\n if (visited[node]) continue;\n visited[node] = true;\n for (size_t i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n if (v == dest) {\n return dist[v];\n }\n }\n }\n }\n return -1;\n}\n", + "=", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n q.push(v);\n dist[v] = dist[u] + 1;\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == -1) {\n q.push(v);\n dist[v] = dist[u] + 1;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push(std::make_pair(source, 0));\n while (!q.empty()) {\n std::pair p = q.front();\n q.pop();\n if (p.first == dest) {\n return p.second;\n }\n if (visited[p.first]) {\n continue;\n }\n visited[p.first] = true;\n for (size_t i = 0; i < N; ++i) {\n if (A[p.first][i] == 1) {\n q.push(std::make_pair(i, p.second + 1));\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n visited[source] = true;\n\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n for (int i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n visited[i] = true;\n }\n }\n }\n\n return -1;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "hip", + "prompt": "/* Compute the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Store the result in pathLength.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\n__global__ void shortestPathLength(const int *A, size_t N, int source, int dest, int *pathLength) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\n if (tid == source) {\n pathLength[bid] = 0;\n } else {\n pathLength[bid] = -1;\n }\n\n __syncthreads();\n\n for (int i = 0; i < N; i++) {\n if (pathLength[bid] >= 0 && A[tid * N + i] == 1) {\n if (i == dest) {\n pathLength[bid] = pathLength[bid] + 1;\n break;\n } else if (pathLength[i * N + dest] == -1 ||\n pathLength[i * N + dest] > pathLength[bid] + 1) {\n pathLength[i * N + dest] = pathLength[bid] + 1;\n }\n }\n }\n}\n", + "\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint distance = INT_MAX;\n\tif (tid == source) {\n\t\tdistance = 0;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[tid * N + k] == 1) {\n\t\t\tif (distance + 1 < pathLength[k]) {\n\t\t\t\tpathLength[k] = distance + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n\tif (tid == dest) {\n\t\t*pathLength = distance;\n\t}\n}\n", + "\n\n int n = hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x;\n int m = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x;\n if (n == source) {\n pathLength[n] = 0;\n __syncthreads();\n for (int i = 0; i < N; ++i) {\n if (A[n * N + i] == 1) {\n pathLength[i] = 1;\n }\n }\n } else {\n __syncthreads();\n for (int i = 0; i < N; ++i) {\n if (A[n * N + i] == 1) {\n if (pathLength[i] != -1) {\n pathLength[n] = pathLength[i] + 1;\n }\n }\n }\n }\n __syncthreads();\n if (n == dest) {\n *pathLength = pathLength[n];\n }\n}\n", + "\n\n int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int k = 0; k < N; k++) {\n if (A[tid * N + k] == 1 && tid != k) {\n if (pathLength[tid] == -1 || pathLength[k] + 1 < pathLength[tid]) {\n pathLength[tid] = pathLength[k] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n if (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n pathLength[i] = pathLength[tid] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n if (pathLength[i] == -1 || pathLength[i] > 
pathLength[tid] + 1) {\n pathLength[i] = pathLength[tid] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == source) {\n pathLength[bid] = 0;\n } else {\n pathLength[bid] = -1;\n }\n __syncthreads();\n int minPath = pathLength[bid];\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] && pathLength[i] != -1) {\n minPath = min(minPath, pathLength[i] + 1);\n }\n }\n pathLength[bid] = minPath;\n __syncthreads();\n if (tid == dest) {\n *pathLength = minPath;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tif (tid == source && bid == 0) {\n\t\t*pathLength = 0;\n\t} else if (tid == dest && bid == N - 1) {\n\t\t*pathLength = -1;\n\t} else {\n\t\t*pathLength = INT_MAX;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tif (bid + 1 < N) {\n\t\t\t\tint next = i + (bid + 1) * N;\n\t\t\t\tatomicMin(&pathLength[next], *pathLength + 1);\n\t\t\t}\n\t\t\tif (bid - 1 >= 0) {\n\t\t\t\tint next = i + (bid - 1) * N;\n\t\t\t\tatomicMin(&pathLength[next], *pathLength + 1);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] == 1 && tid != i) {\n if (pathLength[tid] == -1) {\n pathLength[tid] = pathLength[i] + 1;\n } else {\n pathLength[tid] = min(pathLength[tid], pathLength[i] + 1);\n }\n }\n }\n __syncthreads();\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int i = tid / N;\n int j = tid % N;\n if (i == source) {\n pathLength[j] = 1;\n } else {\n pathLength[j] = -1;\n }\n __syncthreads();\n for (int k = 0; k < N; ++k) {\n if (pathLength[k] == -1) {\n continue;\n }\n if (A[i * N + k] == 1) {\n if (pathLength[j] == -1 || pathLength[j] > pathLength[k] + 1) {\n pathLength[j] = pathLength[k] + 1;\n }\n }\n }\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tridy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tint i = tidx;\n\tint j = tridy;\n\tif (i == source) {\n\t\tpathLength[i] = 0;\n\t} else {\n\t\tpathLength[i] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (tid == 
source && bid == dest) {\n *pathLength = 0;\n } else if (tid == dest && bid == source) {\n *pathLength = 0;\n } else if (A[bid * N + tid] == 1) {\n *pathLength = 1;\n } else {\n *pathLength = -1;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int distance = INT_MAX;\n if (tid == source) {\n distance = 0;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] && distance + 1 < pathLength[i]) {\n pathLength[i] = distance + 1;\n }\n }\n __syncthreads();\n if (tid == dest) {\n *pathLength = distance;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int distance = INT_MAX;\n if (tid == source) {\n distance = 0;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] == 1 && tid != i) {\n if (distance + 1 < distance) {\n distance = distance + 1;\n }\n }\n __syncthreads();\n }\n if (tid == dest) {\n *pathLength = distance;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "= Synagoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesogues Synag Synag Synag Synagoguesoguesoguesoguesogues", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The shortest path length is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform relaxation\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector gathered_dist(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, gathered_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, gathered_dist[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return shortest path length on rank 0\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 
1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length from source to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distance to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // If we have not reached the destination, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // If u is the destination, return the distance\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // If no path exists, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int dist[N];\n for (int i = 0; i < N; i++) {\n dist[i] = INT_MAX;\n }\n dist[source] = 0;\n\n int next[N];\n for (int i = 0; i < N; i++) {\n next[i] = -1;\n }\n\n int visited[N];\n for (int i = 0; i < N; i++) {\n visited[i] = false;\n }\n\n int u;\n while (true) {\n u = -1;\n for (int i = 0; i < N; i++) {\n if (!visited[i] && (u == -1 || dist[i] < dist[u])) {\n u = i;\n }\n }\n if (u == -1 || dist[u] == INT_MAX) {\n break;\n }\n visited[u] = true;\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] != 0 && dist[u] + 1 < dist[v]) {\n dist[v] = dist[u] + 1;\n next[v] = u;\n }\n }\n }\n\n if (rank == 0) {\n return dist[dest];\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n 
while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == std::numeric_limits::max()) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * num_ranks);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (my_rank == 0) {\n for (int i = 0; i < N * num_ranks; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector dist(N, -1);\n dist[source] = 0;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int curr = q.front();\n q.pop();\n\n if (curr == dest) {\n return dist[curr];\n }\n\n for (int i = 0; i < N; i++) {\n if (A[curr][i] == 1 && dist[i] == -1) {\n dist[i] = dist[curr] + 1;\n q.push(i);\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) 
{\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return shortest path length if destination is reachable\n if (dist[dest] != INT_MAX) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Iterate over all vertices\n for (int i = 0; i < N; ++i) {\n // Each rank computes the shortest path from source to all other vertices\n for (int j = 0; j < N; ++j) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n\n // Gather the distances from all ranks\n std::vector all_dist(N * num_ranks);\n MPI_Allgather(dist.data(), N, MPI_INT, all_dist.data(), N, MPI_INT, MPI_COMM_WORLD);\n\n // Update the distances on each rank\n for (int j = 0; j < N; ++j) {\n for (int k = 0; k < num_ranks; ++k) {\n if (all_dist[k * N + j] < dist[j]) {\n dist[j] = all_dist[k * N + j];\n }\n }\n }\n }\n\n // Return the shortest path length on rank 0\n if (my_rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == std::numeric_limits::max()) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "cuda", + "prompt": "/* Compute the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Store the result in pathLength.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\n__global__ void shortestPathLength(const int *A, size_t N, int source, int dest, int *pathLength) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\n\tint count = 0;\n\twhile (pathLength[dest] == -1) {\n\t\t__syncthreads();\n\t\tif (pathLength[tid] == count) {\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tif (A[tid * N + i] && pathLength[i] == -1) {\n\t\t\t\t\tpathLength[i] = count + 1;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tcount++;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tif (pathLength[j] == -1) {\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] && pathLength[k] != -1) {\n\t\t\t\tpathLength[j] = pathLength[k] + 1;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "_t = 0, 1, 2, 3", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\n\tif (i == source && j == dest) {\n\t\t*pathLength = 0;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 1;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = (A[i * N + j] == 1) ? 
1 : INT_MAX;\n\t}\n\telse {\n\t\tpathLength[j] = INT_MAX;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t}\n\telse if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t}\n\telse {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] != 0) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid_row == source) {\n\t\tpathLength[tid_col] = 1;\n\t} else {\n\t\tpathLength[tid_col] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid_row] > 0 && A[tid_row * N + i] > 0) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid_row] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_row * N + i] == 1 && tid_row != tid_col) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid_row] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1) {\n\t\t\tif (A[tid * N + i] == 1 && pathLength[i] == -1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = (A[i * N + j] == 1) ? 
1 : INT_MAX;\n\t} else {\n\t\tpathLength[j] = INT_MAX;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] != INT_MAX && A[k * N + j] == 1) {\n\t\t\tpathLength[j] = min(pathLength[j], pathLength[k] + 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\n\t__syncthreads();\n\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_row * N + i] == 1 && pathLength[tid_row] != -1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid_row] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\n\t__syncthreads();\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint distance = 0;\n\tif (tid == source) {\n\t\tdistance = 0;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tif (distance + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = distance + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == dest) {\n\t\t*pathLength = distance;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] == -1) {\n\t\t\tcontinue;\n\t\t}\n\t\tif (A[k * N + j] == 1) {\n\t\t\tif (pathLength[j] == -1 || pathLength[j] > pathLength[k] + 1) {\n\t\t\t\tpathLength[j] = pathLength[k] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid_row == source) {\n\t\tpathLength[tid_col] = 1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[tid_row * N + k] && pathLength[tid_row] > 0) {\n\t\t\tpathLength[tid_col] = min(pathLength[tid_col], pathLength[tid_row] + 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tidx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tidy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (tidx == source && tidx == tidy) {\n\t\tpathLength[tidx] = 0;\n\t} else if (tidx == tidy) {\n\t\tpathLength[tidx] = -1;\n\t} else if (A[tidx * N + tidy] == 1) {\n\t\tpathLength[tidx] = 1;\n\t} else {\n\t\tpathLength[tidx] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[tidx] == -1 || pathLength[tidy] == -1) {\n\t\t\tcontinue;\n\t\t}\n\t\tif (pathLength[tidx] + pathLength[tidy] + 1 < pathLength[k]) {\n\t\t\tpathLength[k] = pathLength[tidx] + pathLength[tidy] + 1;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = A[i * N + j];\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tint minPath = pathLength[j];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] >= 0 && A[j * N + k] > 0) {\n\t\t\tint newPath = pathLength[k] + A[j * N + k];\n\t\t\tif (newPath < minPath) {\n\t\t\t\tminPath = newPath;\n\t\t\t}\n\t\t}\n\t}\n\tpathLength[j] = minPath;\n\t__syncthreads();\n\tif (j == dest) {\n\t\t*pathLength = pathLength[j];\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in 
row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The shortest path length is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the distance array to all infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize the queue with the source node\n std::queue q;\n q.push(source);\n\n // Perform a breadth-first search to find the shortest path\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to the destination\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Use OpenMP for parallel BFS\n #pragma omp parallel\n {\n while (true) {\n bool updated = false;\n\n // Relax all edges in parallel\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n updated = true;\n }\n }\n }\n\n // Check if any rank has updated distances\n int global_updated;\n MPI_Allreduce(&updated, &global_updated, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n // If no rank has updated distances, break\n if (!global_updated) {\n break;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n 
#pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length on rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector recv_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &recv_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return recv_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Return the shortest path length\n return shortest_path_length;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n for (int i = tid; i < N; i += num_threads) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Use OpenMP for parallelization within each rank\n #pragma omp parallel\n {\n // Perform BFS using a queue\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // For each neighbor of u\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n }\n\n // Gather the distances from all ranks to rank 0\n std::vector all_dist(N * MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Allgather(&dist[0], N, MPI::INT, &all_dist[0], N, MPI::INT);\n\n // Find the minimum distance to dest\n int min_dist = INT_MAX;\n for (int i = 0; i < N * MPI::COMM_WORLD.Get_size(); ++i) {\n if (all_dist[i] != INT_MAX && i / N == dest) {\n min_dist = std::min(min_dist, all_dist[i]);\n }\n }\n\n // Return the minimum distance\n return min_dist;\n}\n", + "\n\n // Initialize the distance to the source to 0 and all other distances to infinity\n std::vector distance(N, std::numeric_limits::max());\n distance[source] = 0;\n\n // Initialize the queue with the source\n std::queue queue;\n queue.push(source);\n\n // While the queue is not empty\n while (!queue.empty()) {\n // Dequeue the next node\n int current = queue.front();\n queue.pop();\n\n // If the current node is the destination, return the distance\n if (current == dest) {\n return distance[current];\n }\n\n // For each neighbor of the current node\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A[current * N + i] == 1) {\n // If the distance to the neighbor is greater than the distance to the current node plus 1\n if (distance[i] > distance[current] + 1) {\n // Set the distance to the neighbor to the distance to the current node plus 1\n distance[i] = distance[current] + 1;\n // Enqueue the neighbor\n queue.push(i);\n }\n }\n }\n }\n\n // If the destination is not reachable, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n 
std::vector global_dist(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i++) {\n if (i % N == dest) {\n shortest_path_length = std::min(shortest_path_length, global_dist[i]);\n }\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n for (int i = tid; i < N; i += num_threads) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances from all ranks to rank 0\n std::vector global_dist(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length on rank 0\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, global_dist[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int curr = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (curr == dest) {\n return dist[curr];\n }\n\n // Relax neighbors\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n if (A[curr * N + i] && dist[i] > dist[curr] + 1) {\n dist[i] = dist[curr] + 1;\n q.push(i);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n // Initialize the distance array.\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation.\n for (int i = 0; i < N - 1; i++) {\n#pragma omp parallel for\n for (int j = 0; j < N; j++) {\n for (int k = 0; k < N; k++) {\n if (A[j * N + k] != 0 && dist[j] + A[j * N + 
k] < dist[k]) {\n dist[k] = dist[j] + A[j * N + k];\n }\n }\n }\n }\n\n // Check for negative-weight cycles.\n for (int i = 0; i < N - 1; i++) {\n#pragma omp parallel for\n for (int j = 0; j < N; j++) {\n for (int k = 0; k < N; k++) {\n if (A[j * N + k] != 0 && dist[j] + A[j * N + k] < dist[k]) {\n return -1;\n }\n }\n }\n }\n\n // Return the shortest path length.\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n if (A[source][i] == 1) {\n dist[i] = 1;\n }\n }\n\n #pragma omp barrier\n\n for (int j = 1; j < N; j++) {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n if (dist[i] == j) {\n #pragma omp for\n for (int k = 0; k < N; k++) {\n if (A[i][k] == 1 && dist[k] > j + 1) {\n dist[k] = j + 1;\n }\n }\n }\n }\n\n #pragma omp barrier\n }\n }\n\n // Gather distances on rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length on rank 0\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n int min_odd = INT_MAX;\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int min_odd = INT_MAX;\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "&", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n },\n min_odd);\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int result = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& result) {\n if (x(i) % 2 == 1 && x(i) < result) {\n result = x(i);\n }\n });\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n 
min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View y(\"y\", x.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n y(i) = (x(i) % 2 == 1) ? x(i) : std::numeric_limits::max();\n });\n int min = std::numeric_limits::max();\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, int& min) {\n min = std::min(min, y(i));\n });\n return min;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int n = x.extent(0);\n View> y(\"y\", n);\n\n parallel_for(\"smallestOdd\", n, KOKKOS_LAMBDA(const int i) {\n if (i % 2 == 1) {\n y(i) = x(i);\n } else {\n y(i) = std::numeric_limits::max();\n }\n });\n\n int min_odd = std::numeric_limits::max();\n for (int i = 0; i < n; ++i) {\n min_odd = std::min(min_odd, y(i));\n }\n\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n int min_odd = INT_MAX;\n\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int min_odd = Kokkos::parallel_reduce(Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1) {\n min_odd = std::min(min_odd, x(i));\n }\n }, min_odd);\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto min_odd = INT_MAX;\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int smallest = Kokkos::Max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& smallest) {\n if (x(i) % 2 == 1 && x(i) < smallest) {\n smallest = x(i);\n }\n });\n return smallest;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Policy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) { min_odd = x(i); }\n }, min_odd);\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Policy = Kokkos::RangePolicy;\n\n int smallest = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(Policy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& smallest) {\n if (x(i) % 2 == 1 && x(i) < smallest) {\n smallest = x(i);\n }\n });\n\n return smallest;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n int min_odd = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(\n policy_t(0, x.extent(0)),\n 
KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n },\n min_odd);\n\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min:min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n min_odd = std::min(min_odd, x[i]);\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n#pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int minOdd = INT_MAX;\n #pragma omp parallel for reduction(min:minOdd)\n for (int i = 0; i < static_cast(x.size()); ++i) {\n if (x[i] % 2 != 0 && x[i] < minOdd) {\n minOdd = x[i];\n }\n }\n return minOdd;\n}\n", + ": 1", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n min_odd = std::min(min_odd, x[i]);\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n 
}\n\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n#pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel for reduction(min:smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min:min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "serial", + "prompt": "/* Return the value of the smallest odd number in the vector x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 != 0) {\n return x[i];\n }\n }\n return -1;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) 
{\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest_odd) {\n smallest_odd = x[i];\n }\n }\n return smallest_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "smallest&", + "\n\n int smallest = -1;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && (smallest == -1 || x[i] < smallest)) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "hip", + "prompt": "/* Find the value of the smallest odd number in the vector x. Store it in smallest.\n Use AMD HIP to compute in parallel. 
The kernel is launched with the same number of threads as elements in x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\n__global__ void smallestOdd(const int *x, size_t N, int *smallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n 
smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "(", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n __syncthreads();\n if (smallest_local % 2 == 0) {\n return;\n }\n if (smallest_local < *smallest) {\n atomicMin(smallest, smallest_local);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n\n // Find the smallest odd number in the local workgroup\n __shared__ int smallest_shared;\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n // Reduce the local smallest values to a single global smallest value\n __syncthreads();\n if (tid == 0) {\n smallest_shared = smallest_local;\n for (int i = 1; i < hipBlockDim_x; i++) {\n if (smallest_shared > smallest_local) {\n smallest_shared = smallest_local;\n }\n }\n *smallest = smallest_shared;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on all ranks.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = std::numeric_limits::max();\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < my_smallest) {\n my_smallest = x[i];\n }\n }\n\n int smallest;\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return 
global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (auto v : x) {\n if (v % 2 == 1) {\n my_smallest = std::min(my_smallest, v);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int min_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int n = x.size();\n int smallest = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "cuda", + "prompt": "/* Find the value of the smallest odd number in the vector x. Store it in smallest.\n Use CUDA to compute in parallel. 
The kernel is launched with the same number of threads as elements in x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\n__global__ void smallestOdd(const int *x, size_t N, int *smallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (threadIdx.x == 0) {\n *smallest = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "(", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (threadIdx.x == 0) {\n *smallest = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n atomicMin(smallest, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = start + block_size;\n int min = INT_MAX;\n for (int i = start; i < end && i < N; i++) {\n if (x[i] % 2 == 1 && x[i] < min) {\n min = x[i];\n }\n }\n smallest[tid] = min;\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < block_size; i++) {\n if (smallest[i] < smallest[0]) {\n smallest[0] = smallest[i];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] < smallest_local && x[i] % 2) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n if (tid == 0 || x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = block.thread.x;\n if (tid >= N) {\n return;\n }\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 
2 == 1) {\n atomicMin(smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1 && (x[tid] < *smallest || *smallest == -1)) {\n *smallest = x[tid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n #pragma omp parallel for reduction(min: my_smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n\n int smallest;\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on this rank\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int result = INT_MAX;\n #pragma omp parallel for reduction(min:result)\n 
for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < result) {\n result = x[i];\n }\n }\n return result;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel\n {\n int local_smallest = INT_MAX;\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_smallest) {\n local_smallest = x[i];\n }\n }\n #pragma omp critical\n if (local_smallest < smallest) {\n smallest = local_smallest;\n }\n }\n return smallest;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on this rank\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Reduce to find the global minimum\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n // Find the smallest odd number on each rank\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int smallest_odd = INT_MAX;\n #pragma omp parallel for reduction(min: smallest_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest_odd) {\n smallest_odd = x[i];\n }\n }\n return smallest_odd;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on each rank\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n #pragma omp parallel for reduction(min:my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = std::numeric_limits::max();\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 != 0) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // 
Find the smallest odd number on this rank\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min : local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n #pragma omp parallel for reduction(min: my_smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n int my_smallest = INT_MAX;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n #pragma omp parallel for reduction(min:my_smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return smallest;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use Kokkos to compute product in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n double product = 1.0;\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (i % 2 == 0) {\n product *= x(i);\n } else {\n product /= x(i);\n }\n });\n return product;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n\n double product = 1.0;\n\n parallel_reduce(\n RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Create a Kokkos view for the output\n Kokkos::View product = Kokkos::View(1);\n\n // Initialize the product to 1\n Kokkos::parallel_for(\"initialize_product\", Kokkos::RangePolicy(0, 1), KOKKOS_LAMBDA(const int& i) {\n product[i] = 1.0;\n });\n\n // Compute the product in parallel\n Kokkos::parallel_for(\"compute_product\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n if (i % 2 == 1) {\n product[0] *= 1.0 / x[i];\n } else {\n product[0] *= x[i];\n }\n });\n\n // Return the product\n return product[0];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n double product = 1.0;\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), 
KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n }, product);\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double product = 1.0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n }, product);\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& result) {\n if (i % 2 == 1) {\n result *= 1.0 / x[i];\n } else {\n result *= x[i];\n }\n });\n\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n 
KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n },\n product);\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n double result = 1.0;\n for (int i = 0; i < x.extent(0); i += 2) {\n result *= x(i);\n }\n for (int i = 1; i < x.extent(0); i += 2) {\n result /= x(i);\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x(i);\n } else {\n product *= x(i);\n }\n }, product);\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "_", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n },\n product);\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x(i);\n } else {\n product *= x(i);\n }\n }, product);\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use OpenMP to compute product in parallel.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n int n = x.size();\n #pragma omp parallel for reduction(*: product)\n for (int i = 0; i < n; i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*: product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n #pragma omp parallel for 
reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n int n = x.size();\n #pragma omp parallel for reduction(*:product)\n for (int i = 0; i < n; i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1.0 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i += 2) {\n product *= x[i] / x[i + 1];\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "serial", + "prompt": "/* Return the product of the vector x with every odd indexed element inverted.\n i.e. 
x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i += 2) {\n product = product * x[i] / (x[i + 1] + 0.000000001);\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n for (size_t i = 1; i < x.size(); i += 2) {\n product /= x[i];\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "&", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return 
product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "hip", + "prompt": "/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Store the result in product.\n Use AMD HIP to compute product in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\n__global__ void productWithInverses(const double *x, size_t N, double *product) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 1) {\n product[0] *= 1.0 / x[tid];\n } else {\n product[0] *= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (idx % 2 == 0) {\n product[0] *= x[idx];\n } else {\n product[0] /= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n atomicAdd(product, x[gid]);\n } else {\n atomicAdd(product, 1.0 / x[gid]);\n }\n }\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n product[tid] = x[tid];\n } else {\n product[tid] = 1 / x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 0) {\n product[0] *= x[tid];\n } else {\n product[0] /= x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double prod = 1.0;\n if (gid % 2 == 0) {\n prod *= x[gid];\n } else {\n prod /= x[gid];\n }\n\n hipAtomicAdd(product, prod);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n 
atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 1) {\n product[0] *= 1.0 / x[tid];\n } else {\n product[0] *= x[tid];\n }\n }\n}\n", + ".", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 0) {\n atomicAdd(product, x[tid]);\n } else {\n atomicAdd(product, 1.0 / x[tid]);\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use MPI to compute product in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the product on all ranks.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a new vector to store the product\n std::vector product(n);\n\n // Initialize the product to 1\n product[0] = 1;\n\n // Compute the product in parallel\n for (int i = 1; i < n; i++) {\n if (i % 2 == 1) {\n product[i] = product[i - 1] * (1 / x[i]);\n } else {\n product[i] = product[i - 1] * x[i];\n }\n }\n\n // Return the product\n return product[n - 1];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD,\n MPI_COMM_WORLD);\n 
return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n 
for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Reduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 0, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "cuda", + "prompt": "/* Compute the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Store the result in product.\n Use CUDA to compute product in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\n__global__ void productWithInverses(const double *x, size_t N, double *product) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n *product *= x[gid];\n } else {\n *product /= x[gid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n product[0] *= x[gid];\n } else {\n product[0] /= x[gid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n product[0] *= x[gid];\n } else {\n product[0] /= x[gid];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n product[idx] = 1.0;\n for (int i = 0; i < N; i += 2) {\n product[idx] *= x[i] / x[i + 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n product[0] *= x[idx];\n } else {\n product[0] /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n if (i % 2 == 1) {\n product[0] *= 1.0 / x[i];\n } else {\n product[0] *= x[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int 
tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n *product *= x[gid];\n } else {\n *product /= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use MPI and OpenMP to compute product in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the product on all ranks.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*: local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n#pragma omp parallel for reduction(* : local_product)\n for (int i = start_index; i < end_index; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Reduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 0, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 
MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*: local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size();\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < local_size; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n #pragma omp parallel for reduction(* : local_product)\n for (int i = start_index; i < end_index; i++) {\n if ((i + 1) % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n double product = 1.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the product of the even indexed elements\n #pragma omp parallel for reduction(*: product)\n for (int i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n\n // Compute the product of the odd indexed elements\n double oddProduct = 1.0;\n #pragma omp parallel for reduction(*: oddProduct)\n for (int i = 1; i < x.size(); i += 2) {\n oddProduct *= 1.0 / x[i];\n }\n\n // Reduce the odd product across all ranks\n double globalOddProduct;\n MPI_Allreduce(&oddProduct, &globalOddProduct, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n // Multiply the even and odd products to get the final result\n product *= globalOddProduct;\n\n return product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for reduction(*:local_product)\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size();\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < local_size; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n double product = 1.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the product of the even indexed elements\n #pragma omp parallel for 
reduction(*:product)\n for (int i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n\n // Compute the product of the odd indexed elements\n double oddProduct = 1.0;\n #pragma omp parallel for reduction(*:oddProduct)\n for (int i = 1; i < x.size(); i += 2) {\n oddProduct *= 1.0 / x[i];\n }\n\n // Reduce the odd product across all ranks\n double globalOddProduct;\n MPI_Allreduce(&oddProduct, &globalOddProduct, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n // Multiply the even and odd products\n product *= globalOddProduct;\n\n return product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for reduction(*:local_product)\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use Kokkos to sum in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(Kokkos::View const& x, Kokkos::View const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Create a Kokkos view to store the minimum values at each index\n Kokkos::View min_values(\"min_values\", x.extent(0));\n\n // Compute the minimum values at each index in parallel\n Kokkos::parallel_for(\"min_values_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n min_values[i] = std::min(x[i], y[i]);\n });\n\n // Reduce the minimum values in parallel to get the sum\n double sum = Kokkos::parallel_reduce(\"sum_min_values_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += min_values[i];\n }, 0.0);\n\n return sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Min;\n\n auto min_values = Kokkos::parallel_reduce(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, Reducer& red) {\n red.init(std::min(x(i), y(i)));\n });\n\n return min_values.get_value();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMemberFunction = Kokkos::TeamMemberFunction;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::RangePolicy>(0, 
x.extent(0))),\n TeamMemberFunction([&](const int& i) -> double { return std::min(x(i), y(i)); }),\n 0.0);\n return sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using execution_space = Kokkos::DefaultExecutionSpace;\n using l_double = Kokkos::LayoutRight;\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n\n Kokkos::View min_values(\"min_values\", x.extent(0));\n\n Kokkos::parallel_scan(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update = std::min(x(i), y(i));\n }, min_values);\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += min_values(i);\n }, sum);\n\n return sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_left;\n using Reducer = Kokkos::Min;\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, Reducer& sum) {\n sum += std::min(x(i), y(i));\n });\n\n return sum.template get<0>();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_left;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = typename TeamPolicy::member_type;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::DefaultExecutionSpace(), x.extent(0)), KOKKOS_LAMBDA(const TeamMember& team, double& sum) {\n const auto i = team.league_rank();\n sum += std::min(x(i), y(i));\n });\n\n return sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = Kokkos::TeamMember;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::RangePolicy>(0, x.extent(0))),\n KOKKOS_LAMBDA(TeamMember, sum, x, y) {\n sum += std::min(x(TeamMember::team_rank()), y(TeamMember::team_rank()));\n });\n return sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n const auto min_xy = parallel_reduce(\n RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& sum) {\n sum += std::min(x(i), y(i));\n },\n 0.0);\n\n return min_xy;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + 
"did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::make_span(x.data(), x.size()), 0.0,\n [](double a, double b) { return std::min(a, b); },\n [](double a, double b) { return a + b; });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n double sum = 0;\n auto min_lambda = KOKKOS_LAMBDA(const double& a, const double& b) { return std::min(a, b); };\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += min_lambda(x(i), y(i));\n });\n return sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a Kokkos view of the sum\n Kokkos::View sum(\"sum\", x.extent(0));\n\n // Parallel loop to compute the sum of the minimum values\n Kokkos::parallel_for(\"sum_of_minimum_elements\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n sum(i) = std::min(x(i), y(i));\n });\n\n // Reduce the sum in parallel\n double sum_value = Kokkos::parallel_reduce(\n \"sum_of_minimum_elements_reduce\", Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& sum) { sum += sum(i); }, 0.0);\n\n return sum_value;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "Kokkoss", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int numElements = x.extent(0);\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::Range(0, numElements), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n });\n return sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n });\n return sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n auto min_xy = Kokkos::min(x, y);\n auto sum = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, min_xy.extent(0)),\n KOKKOS_LAMBDA(const int& i) -> double { return min_xy(i); },\n (double)0);\n return sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2592341572, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.146535099, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.072689925, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0365790512, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0187836222, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0095537202, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0054693472, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n auto min_lambda = KOKKOS_LAMBDA(const double& x, const double& y) { return std::min(x, y); };\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += min_lambda(x(i), y(i));\n }, sum);\n\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2670649094, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1546073114, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.079975878, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0380247722, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.019146442, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0094425492, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005289241, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n const int N = x.extent(0);\n double sum = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2693290524, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.150869808, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0752669856, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03716054, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0178958398, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008879622, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005590106, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + 
"did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.267796344, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1550220088, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0767859358, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0382728824, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0193932134, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0098340372, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005167801, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n auto min_values = min(x, y);\n double sum = 0;\n Kokkos::parallel_reduce(policy_type(0, min_values.extent(0)), KOKKOS_LAMBDA(int i, double& sum) {\n sum += min_values(i);\n }, sum);\n return sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use OpenMP to sum in parallel.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n double sum = 0;\n\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n 
double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + ".", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma openmp parallel for reduction (+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "serial", + "prompt": "/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < 
x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "hip", + "prompt": "/* Compute the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Store the result in sum.\n Use AMD HIP to sum in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\n__global__ void sumOfMinimumElements(const double *x, const double *y, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipGetGlobalIndex();\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n sum[tid] = min(x[gid], y[gid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n sum[gid] = min(x[gid], y[gid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n sum[tid] = min(x[tid], y[tid]);\n}\n", + "\n\n int idx 
= hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use MPI to sum in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Return the sum on all ranks.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double sum = 0;\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 
0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "cuda", + "prompt": "/* Compute the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Store the result in sum.\n Use CUDA to sum in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\n__global__ void sumOfMinimumElements(const double *x, const double *y, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ", Synagogues_", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[0] += fmin(x[tid], y[tid]);\n }\n}\n", + "{", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n sum[tid] = fmin(x[tid], y[tid]);\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n sum[tid] = fmin(x[i], y[i]);\n }\n}\n", + "\n\n int tid 
= blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use MPI and OpenMP to sum in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Return the sum on all ranks.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+ : local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n double sum = 0.0;\n int n = x.size();\n #pragma opm parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n double sum = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int chunk = n / nprocs;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == nprocs - 1) {\n end = n;\n }\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+ : local_sum)\n for (int i = start; i < end; i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n int n = x.size();\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < n; i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n double sum = 0.0;\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int num_elements = x.size();\n int elements_per_rank = num_elements / num_ranks;\n int start_index = my_rank * elements_per_rank;\n int end_index = start_index + elements_per_rank;\n\n for (int i = start_index; i < end_index; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n double sum = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double sum;\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return 
global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::sum;\n\n const int N = x.extent(0);\n\n double sum_x = 0.0;\n parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) { sum += x(i); }, sum_x);\n\n return sum_x / N;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n double sum = 0;\n parallel_reduce(RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); }, sum);\n\n double count = x.extent(0);\n return sum / count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0666012276, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0562893506, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0283013806, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.014088526, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0070823384, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037585122, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022337556, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + 
"source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n double sum = 0;\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n auto sum = Kokkos::parallel_reduce(Policy(0, x.extent(0)), Reducer(0.0),\n KOKKOS_LAMBDA(const int& i, Reducer& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n double sum = 0;\n auto policy = Kokkos::RangePolicy<>(0, x.size());\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n using l_double = Kokkos::LDDouble;\n\n auto sum = Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, l_double& sum) {\n sum += x(i);\n },\n l_double(0.0));\n\n return static_cast(sum) / x.extent(0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using policy_type = Kokkos::RangePolicy;\n policy_type policy(x.extent(0));\n\n double sum = 0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n const auto sum = Kokkos::parallel_reduce(Policy(0, x.size()), Reducer(0.0),\n 
KOKKOS_LAMBDA(const int& i, const Reducer& sum) {\n return sum + x(i);\n });\n\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n typedef Kokkos::RangePolicy policy_type;\n policy_type policy(x.extent(0));\n\n double sum = 0.0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n return Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::Kok\u30b3\u30af_EXEC_SPACE, x, 0.0, Kok\u30b3\u30af::plus(), Kok\u30b3\u30af::divide(x.size()));\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n auto sum = parallel_reduce(\n RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); }, 0.0);\n\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using execution_space = typename Kokkos::View::execution_space;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto sum = Kokkos::parallel_reduce(\n policy_type(x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n },\n (double)0.0, tag_type());\n\n return sum / x.extent(0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, 
+ "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n double sum = 0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(int i, double& sum) { sum += x(i); }, sum);\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0665833882, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0536171878, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0267937118, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0134072376, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0066892638, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038154556, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002561497, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n using policy_type = Kokkos::RangePolicy;\n using reduction_type = Kokkos::Sum;\n\n auto sum = Kokkos::parallel_reduce(policy_type(0, x.size()), reduction_type(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); });\n\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use OpenMP to compute in parallel.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t 
i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "serial", + "prompt": "/* Return the average of the vector x.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (double n : x) {\n sum += n;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double 
sum = 0;\n for (double num : x) {\n sum += num;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "hip", + "prompt": "/* Compute the average of the vector x. Store the result in average.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Examples:\n \n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\n__global__ void average(const double *x, size_t N, double *average) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (size_t i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double shared_sum;\n shared_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = shared_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (int i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double block_sum;\n block_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = block_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n atomicAdd(average, x[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + 
"\n\n double sum = 0;\n size_t idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum += x[idx];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (int i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double sharedSum;\n sharedSum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = sharedSum / N;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n average[0] += x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = block.x;\n if (tid == 0) {\n *x = 0;\n for (int i = 0; i < N; i++) {\n *x += x[i];\n }\n *x = *x / N;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n double sum = 0;\n for (int i = tid; i < N; i += blockDim.x) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the average on all ranks.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg = sum / x.size();\n\n double global_avg;\n MPI_Allreduce(&avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n\n return global_avg;\n}\n", + "\n\n double sum = 0.0;\n for (double xi : x) {\n sum += xi;\n }\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0.0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / size;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (auto xi : x) {\n sum += xi;\n }\n double avg = sum / x.size();\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (auto v : x) {\n sum += v;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg = sum / x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / x.size();\n double global_avg;\n MPI_Allreduce(&avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n return global_avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double local_sum = 0;\n for (double xi : x) {\n local_sum += xi;\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "cuda", + "prompt": "/* Compute the average of the vector x. Store the result in average.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\n__global__ void average(const double *x, size_t N, double *average) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n average[0] += x[idx];\n }\n}\n", + "\n\n double sum = 0;\n for (size_t i = block_idx.x * block_dim.x + thread_idx.x; i < N; i += block_dim.x * grid_dim.x) {\n sum += x[i];\n }\n * average = sum / N;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n __shared__ double sum;\n sum = 0;\n if (i < N) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n\n double sum = 0;\n for (int i = tid; i < N; i += bid_size * grid_size) {\n sum += x[i];\n }\n\n __shared__ double shared_sum;\n shared_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = shared_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n double sum = 0;\n for (int i = tid + bid * bid_size; i < N; i += grid_size * bid_size) {\n sum += x[i];\n }\n double avg = sum / N;\n *average = avg;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n __shared__ double sum;\n if (i < N) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n double sum = 0;\n if (tid < N) {\n sum = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n double sum_all = 0;\n for (int i = 0; i < gridDim.x; i++) {\n sum_all += sum;\n }\n *average = sum_all / N;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + 
"parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the average of the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the average on all ranks.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / size;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_avg = 0.0;\n #pragma omp parallel for reduction(+:local_avg)\n for (int i = 0; i < x.size(); ++i) {\n local_avg += x[i];\n }\n local_avg /= x.size();\n\n double global_avg;\n MPI_Allreduce(&local_avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n\n return global_avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a sum variable to store the sum of the elements in the vector\n double sum = 0.0;\n\n // Use OpenMP to parallelize the loop over the elements in the vector\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n // Add the current element to 
the sum\n sum += x[i];\n }\n\n // Get the average by dividing the sum by the number of elements\n double avg = sum / n;\n\n // Return the average\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n double local_avg = 0;\n int local_size = x.size();\n double sum = 0;\n\n#pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < local_size; i++) {\n sum += x[i];\n }\n\n local_avg = sum / local_size;\n\n double global_avg = 0;\n MPI_Allreduce(&local_avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= MPI_Comm_size(MPI_COMM_WORLD);\n\n return global_avg;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n", + "\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / x.size();\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n double global_sum = 0;\n\n #pragma omp parallel for reduction(+: local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n return sum / n;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n" + ] + }, + 
{ + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use Kokkos to reduce in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n using namespace Kokkos;\n\n auto xor_reduce = reduce(x, (bool)false, [](bool a, bool b) { return a ^ b; });\n return xor_reduce;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a Kokkos::TeamPolicy for the reduction.\n Kokkos::TeamPolicy policy(x.extent(0));\n\n // Define the reduction functor.\n struct XORFunctor {\n bool result = false;\n\n KOKKOS_INLINE_FUNCTION\n void operator()(const bool& x) const { result ^= x; }\n };\n\n // Create a Kokkos::ParallelReduce object to perform the reduction.\n Kokkos::ParallelReduce reduce(result);\n\n // Invoke the reduction.\n Kokkos::parallel_for(policy, reduce);\n\n // Return the result.\n return reduce.result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool result = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& result) { result ^= x(i); }, result);\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n bool result = false;\n auto policy = Kokkos::RangePolicy(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n }, result);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1849005292, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0587128008, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0293320796, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0149144192, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0074848026, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038871902, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020395926, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n using Policy = Kokkos::TeamPolicy>;\n using TeamMemberType = typename Policy::member_type;\n\n bool result = false;\n\n Kokkos::parallel_reduce(Policy(1), KOKKOS_LAMBDA(const TeamMemberType& team) {\n bool local_result = false;\n const bool* x_data = x.data();\n for (int i = team.league_rank(); i < x.size(); i += 
team.league_size()) {\n local_result ^= x_data[i];\n }\n team.team_reduce(KOKKOS_LAMBDA(const bool& a, const bool& b) { return a ^ b; }, local_result);\n }, result);\n\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x[i];\n });\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x[i];\n });\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using Policy = Kokkos::TeamPolicy;\n using TeamMemberType = typename Policy::member_type;\n\n auto reduce = KOKKOS_LAMBDA(TeamMemberType team) {\n bool local_result = false;\n for (int i = team.league_rank(); i < x.extent(0); i += team.league_size()) {\n local_result ^= x(i);\n }\n return local_result;\n };\n\n bool global_result = Kokkos::parallel_reduce(Policy(x.extent(0)), reduce);\n return global_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool result = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(x), Kokkos::Scan);\n}\n", + "source_write_success": true, + 
"did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool result = false;\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i) {\n result ^= x(i);\n });\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(false), x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n bool result = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& result) {\n if (x(i)) {\n result = !result;\n }\n });\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy;\n using Reducer = Kokkos::LogicalXor;\n\n return parallel_reduce(Policy(x.extent(0)), Reducer(false), x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(false),\n KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n auto reduce_functor = KOKKOS_LAMBDA(const bool& a, const bool& b) {\n return a ^ b;\n };\n\n auto reduce_result = reduce(x, reduce_functor, false);\n\n return reduce_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool result = x(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(1, 
x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x(i);\n });\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use OpenMP to reduce in parallel.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n {\n for (auto const& xi : x) {\n result ^= xi;\n }\n }\n return result;\n}\n", + ")", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(+:result)\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel for reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n 
return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "serial", + "prompt": "/* Return the logical XOR reduction of the vector of bools x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "(", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + ")", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "hip", + "prompt": "/* Compute the logical XOR reduction of the vector of bools x. Store the result in output.\n Use AMD HIP to reduce in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\n__global__ void reduceLogicalXOR(const bool *x, size_t N, bool *output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n bool local_result = x[gid];\n __syncthreads();\n while (blockDim.x <= N) {\n if (tid == 0) {\n local_result ^= x[gid + blockDim.x];\n }\n __syncthreads();\n blockDim.x *= 2;\n __syncthreads();\n }\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int size = blockDim.x;\n bool local_result = x[bid * size + tid];\n __syncthreads();\n for (int i = size / 2; i > 0; i /= 2) {\n if (tid < i) {\n local_result ^= x[bid * size + tid + i];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int warp_size = warpSize;\n int num_warps = bid_size / warp_size;\n int warp_id = tid / warp_size;\n int lane_id = tid % warp_size;\n bool warp_result = x[bid * bid_size + tid];\n for (int i = 1; i < warp_size; i++) {\n warp_result ^= x[bid * bid_size + tid + i];\n }\n __shared__ bool shared_result[num_warps];\n shared_result[warp_id] = warp_result;\n __syncthreads();\n if (warp_id == 0) {\n bool result = shared_result[0];\n for (int i = 1; i < num_warps; i++) {\n result ^= shared_result[i];\n }\n *output = result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n bool local_result = x[tid];\n __syncthreads();\n while (blockDim.x <= N) {\n if (tid < N) { local_result ^= x[tid + blockDim.x]; }\n __syncthreads();\n blockDim.x *= 2;\n __syncthreads();\n }\n if (tid == 0) { *output = local_result; }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n bool local_result = x[gid];\n\n __syncthreads();\n\n while (bid > 0) {\n if (tid == 0) {\n local_result ^= x[gid + hipBlockDim_x];\n }\n bid >>= 1;\n gid >>= 1;\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n bool local_result = x[gid];\n\n while (gid < N) {\n local_result ^= x[gid];\n gid += hipBlockDim_x;\n }\n\n output[tid] = local_result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n bool result = false;\n while (gid < N) {\n result ^= x[gid];\n gid += hipBlockDim_x * hipGridDim_x;\n }\n 
output[tid] = result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n output[0] ^= output[i];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use MPI to reduce in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = false;\n for (bool b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n 
int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool result = x[rank];\n MPI_Allreduce(&result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n MPI_Allxor(&result, &result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = x[0];\n for (size_t i = 1; i < x.size(); i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "cuda", + "prompt": "/* Compute the logical XOR reduction of the vector of bools x. Store the result in output.\n Use CUDA to reduce in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\n__global__ void reduceLogicalXOR(const bool *x, size_t N, bool *output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Each thread reduces a single element of the output.\n int tid = threadIdx.x;\n output[tid] = x[tid];\n __syncthreads();\n\n // Reduce the output in place.\n while (tid < N) {\n output[tid] ^= output[tid + blockDim.x];\n tid += blockDim.x * 2;\n __syncthreads();\n }\n}\n", + "\n\n // Each thread reduces a single element of the output\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the output in place\n for (int stride = 1; stride < N; stride *= 2) {\n for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < N; tid += 2 * stride) {\n output[tid] ^= output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n // Each thread reduces a subset of the input.\n int tid = threadIdx.x;\n int stride = blockDim.x;\n bool local_result = false;\n for (int i = tid; i < N; i += stride) {\n local_result ^= x[i];\n }\n\n // Reduce the local results using a shared memory reduction.\n __shared__ bool shared_result[blockDim.x];\n shared_result[tid] = local_result;\n __syncthreads();\n\n // The first thread writes the final result to output.\n if (tid == 0) {\n bool final_result = false;\n for (int i = 0; i < blockDim.x; ++i) {\n final_result ^= shared_result[i];\n }\n *output = final_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n // Each thread reduces a single element of the output\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the output in place\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid] ^= output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncbarriers();\n while (N > 1) {\n if (tid < N / 2) {\n output[tid] ^= output[tid + N / 2];\n }\n __syncbarriers();\n N /= 2;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n bool result = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n result ^= x[i];\n }\n output[bid] = result;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n bool result = x[bid * stride + tid];\n __syncthreads();\n while (stride > 1) {\n if (tid < stride) {\n result ^= x[bid * stride + tid + stride];\n }\n stride /= 2;\n __syncthreads();\n }\n if (tid == 0) {\n *output = result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int num_elements = N;\n int num_blocks = (num_elements + bid_size - 1) / bid_size;\n\n bool local_result = x[tid];\n __syncthreads();\n while (bid_size > 1) {\n if (tid < bid_size / 2) {\n local_result ^= x[tid + bid_size / 2];\n }\n bid_size /= 2;\n __syncthreads();\n }\n if (bid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n 
__syncthreads();\n}\n", + "\n\n // Each thread reduces a single element of the vector\n int tid = threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the vector in parallel\n while (N > 1) {\n if (tid < N / 2) {\n output[tid] ^= output[tid + N / 2];\n }\n N /= 2;\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = blockDim.x; i < N; i += blockDim.x) {\n output[0] ^= output[i];\n }\n }\n}\n", + "\n\n // Each thread reduces a single element of the output.\n int idx = threadIdx.x;\n output[idx] = x[idx];\n\n // Each thread reduces a pair of elements of the output.\n while (idx < N) {\n output[idx] ^= x[idx + 1];\n idx += 2;\n }\n\n // Reduce the remaining elements of the output.\n for (int i = 1; i < N; i *= 2) {\n for (int j = 0; j < N; j += 2 * i) {\n output[j] ^= output[j + i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n bool local_result = x[tid];\n\n for (int i = tid + stride; i < N; i += stride) {\n local_result ^= x[i];\n }\n\n output[bid] = local_result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int num_blocks = gridDim.x;\n\n bool local_result = x[tid];\n\n __syncthreads();\n\n while (bid_size > 1) {\n if (tid < bid_size / 2) {\n local_result = local_result ^ x[tid + bid_size / 2];\n }\n bid_size /= 2;\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n bool result = x[bid * stride + tid];\n\n __syncthreads();\n\n for (int offset = stride; offset < N; offset *= 2) {\n if (tid < offset) {\n result ^= x[bid * stride + tid + offset];\n }\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x * gridDim.x;\n\n bool local_result = x[tid + bid * stride];\n\n for (int i = 2 * stride; i <= N; i *= 2) {\n if (tid < N / i) {\n local_result ^= x[tid + bid * i];\n }\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use MPI and OpenMP to reduce in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on all ranks.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(^: local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int nranks;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n\n bool local_result = false;\n for (auto b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Reduce on each rank\n bool local_result = false;\n #pragma omp parallel for reduction(^:local_result)\n for (size_t i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n // Reduce across all ranks\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n\n int nranks;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n\n bool local_result = false;\n for (bool b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR,\n MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i 
< x.size(); ++i) {\n local_result ^= x[i];\n }\n }\n\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int nranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(+:local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n #pragma omp parallel for reduction(^:result)\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool local_result = x[my_rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + ".", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool local_result = x[my_rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n\n bool result;\n MPI_Reduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, 0, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n bool result = x[0];\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (int i = 1; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = false;\n #pragma omp parallel for reduction(^:local_result)\n for (size_t i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n // Reduce the vector of bools using MPI_Reduce.\n bool result;\n MPI_Allreduce(MPI_IN_PLACE, &x[0], x.size(), MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n // Return the result.\n return result;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the array x into output.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(Kokkos::View const& x, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA (const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.extent(0);\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int i) {\n output(i) = 0;\n for (int j = i; j >= 0; j--) {\n output(i) += x(j);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n output(i) = 0;\n for (int j = i; j < n; ++j) {\n output(i) += x(j);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kok\u30b3\u30af::View temp = output.slice(0, output.size() - 1);\n Kok\u30b3\u30af::View temp2 = output.slice(1, output.size());\n Kok\u30b3\u30af::View x_rev = x.reversed();\n Kok\u30b3\u30af::View temp3 = x_rev.slice(1, x_rev.size());\n Kok\u30b3\u30af::View temp4 = x_rev.slice(0, x_rev.size() - 1);\n Kok\u30b3\u30af::View temp5 = temp4 + temp3;\n temp = temp5;\n temp2 = temp5 + temp;\n\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n output(i) = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, i),\n KOKKOS_LAMBDA(const int& j, const int& val) { return val + x(i - j - 1); },\n 0);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": 
true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(\"reversePrefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i, int& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = x(i) + output(i - 1);\n }\n },\n output(n - 1));\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i) { output(i) = output(n - 1) - output(i); });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i, int& sum) {\n sum = (i == 0) ? x(i) : sum + x(i);\n output(i) = sum;\n }, sum);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Compute the reverse prefix sum\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n Kokkos::ScanSum(Kokkos::Reverse));\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n output(N - 1));\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n output(i) = x(x.extent(0) - 1 - i);\n if (i > 0) output(i) += output(i - 1);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + 
"did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1182230422, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1058341424, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA (const int i) {\n output(i) = 0;\n for (int j = n - 1; j >= i; j--) {\n output(i) += x(j);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n output(i) = (i == 0) ? x(i) : x(i) + output(i - 1);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Tag = Kokkos::TagType;\n\n Kokkos::parallel_for(Policy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n output(i) = x(x.extent(0) - 1 - i);\n if (i > 0) output(i) += output(i - 1);\n }, Tag());\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n 0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use OpenMP to compute in parallel.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: 
[3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n output[i] = x[i];\n } else {\n output[i] = output[i + 1] + x[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n output[i] = 0;\n for (int j = n - 1; j >= i; --j) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n output[i] = 0;\n for (int j = i; j < n; 
j++) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "serial", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output.back() = x.back();\n for (int i = x.size() - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n for (int i = x.size() - 1; i >= 0; --i) {\n if (i + 1 < x.size()) {\n output[i] = x[i] + output[i + 1];\n } else {\n output[i] = x[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; 
i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output.back() = x.back();\n for (int i = x.size() - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\n__global__ void reversePrefixSum(const int *x, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n if (tid > 0) {\n output[tid] += output[tid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n output[gid] = x[N - 1 - gid];\n if (gid > 0) { output[gid] += output[gid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[N - 1 - gid] = x[gid];\n }\n __syncthreads();\n for (int i = N - 2; i >= 0; i--) {\n if (gid < N) {\n output[i] = output[i + 1] + x[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n int i = tid + bid * blockSize;\n if (i < N) {\n output[i] = 0;\n for (int j = N - 1; j >= i; j--) { output[i] += x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = blockDim.x / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int offset = bid * hipBlockDim_x + tid;\n if (offset < N) {\n output[offset] = x[N - 1 - offset];\n if (offset > 0) {\n 
output[offset] += output[offset - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum in reverse order\n std::vector prefix_sum(x.size());\n prefix_sum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n prefix_sum[i] = prefix_sum[i + 1] + x[i];\n }\n\n // Gather the prefix sums from all ranks\n std::vector gathered_prefix_sums(x.size() * size);\n MPI_Gather(prefix_sum.data(), x.size(), MPI_INT, gathered_prefix_sums.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gathered_prefix_sums[i * size + size - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_sum = 0;\n for (int i = n - 1; i >= 0; i--) {\n local_sum += x[i];\n if (rank == 0) {\n output[i] = local_sum;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n, 0);\n std::vector global_sums(n, 0);\n\n // Compute the partial sums on each rank\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n if (i > 0) {\n partial_sums[i] += partial_sums[i - 1];\n }\n }\n\n // Gather the partial sums from all ranks\n MPI_Allgather(partial_sums.data(), n, MPI_INT, 
global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n global_sums[i] += global_sums[i + 1];\n }\n output[i] = global_sums[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.reserve(n);\n std::iota(std::execution::par, output.begin(), output.end(), 0);\n std::transform(std::execution::par, x.rbegin(), x.rend(), output.rbegin(),\n [](int a, int b) { return a + b; });\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n std::vector prefix_sum(n);\n MPI_Allreduce(x.data(), prefix_sum.data(), n, MPI_INT, MPI_SUM, comm);\n\n if (rank == 0) {\n output.resize(n);\n for (int i = 0; i < n; i++) {\n output[i] = prefix_sum[n - 1 - i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n local_output[i] = x[i];\n } else {\n local_output[i] = x[i] + local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n for (int i = 0; i < n; i++) {\n if (i == 0) {\n output[i] = global_output[i];\n } else {\n output[i] = global_output[i] - global_output[i - 1];\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n for (int i = n - 1; i >= 0; i--) {\n if (rank == 0) {\n output[i] = local_x[i];\n }\n MPI_Allreduce(MPI_IN_PLACE, &local_x[i], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n local_output[i] = output[i];\n }\n }\n if (rank == 0) {\n output = local_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_sum = 0;\n for (int i = x.size() - 1; i >= 0; i--) {\n local_sum += x[i];\n if (rank == 0) {\n output[i] = local_sum;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector global_sums(n);\n\n // Compute the partial sums on each rank\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n if (i > 0) {\n partial_sums[i] += partial_sums[i - 1];\n }\n }\n\n // Gather the partial sums from all ranks\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n global_sums[i] += global_sums[i + 1];\n }\n output[i] = global_sums[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector partial_sums(n);\n std::partial_sum(x.rbegin(), x.rend(), partial_sums.rbegin());\n\n std::vector global_sums(n);\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n local_output[i] = x[i];\n } else {\n local_output[i] = x[i] + local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Reverse the global prefix sum\n for (int i = 0; i < n / 2; i++) {\n std::swap(global_output[i], global_output[n - i - 1]);\n }\n\n // Store the result in output on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the reverse prefix sum\n std::vector reverse_prefix_sum(x.size());\n reverse_prefix_sum[x.size() - 1] = prefix_sum[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] - x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n output = reverse_prefix_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefix_sum(n);\n std::vector reverse_prefix_sum(n);\n\n // Compute the prefix sum\n prefix_sum[0] = x[0];\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the reverse prefix sum\n reverse_prefix_sum[n - 1] = prefix_sum[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] + prefix_sum[i];\n }\n\n // Gather the reverse prefix sum on rank 0\n MPI_Gather(&reverse_prefix_sum[0], n, MPI_INT, &output[0], n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n local_output[i] = x[i];\n if (i + 1 < n) {\n local_output[i] += local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i + 1 < n) {\n global_output[i] -= global_output[i + 1];\n }\n }\n\n // Store the result on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector global_sums(n);\n\n // Compute the partial sums on each rank\n partial_sums[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n partial_sums[i] = partial_sums[i + 1] + x[i];\n }\n\n // Gather the partial sums into the 
global_sums vector on rank 0\n MPI_Gather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n output[0] = global_sums[0];\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + global_sums[n - i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x in reverse order\n std::vector reversePrefixSum(x.size());\n reversePrefixSum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reversePrefixSum[i] = reversePrefixSum[i + 1] + x[i];\n }\n\n // Gather the reverse prefix sums from all ranks\n std::vector gatheredReversePrefixSum(x.size() * size);\n MPI_Gather(reversePrefixSum.data(), x.size(), MPI_INT, gatheredReversePrefixSum.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the final reverse prefix sum\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gatheredReversePrefixSum[i * size];\n }\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Compute the prefix sum of the vector x in reverse order.\n std::vector reverse_prefix_sum(x.size());\n reverse_prefix_sum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] + x[i];\n }\n\n // Gather the reverse prefix sums from all ranks.\n std::vector gathered_reverse_prefix_sums(x.size() * num_ranks);\n MPI_Gather(reverse_prefix_sum.data(), x.size(), MPI_INT, gathered_reverse_prefix_sums.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the output vector.\n if (my_rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gathered_reverse_prefix_sums[i * num_ranks + my_rank];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n for (int i = 0; i < n; i++) {\n local_output[i] = x[n - i - 1];\n }\n\n std::vector global_output(n);\n MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector recvbuf(n);\n std::vector sendbuf(n);\n for (int i = 0; i < n; i++) {\n sendbuf[i] = x[n - i - 1];\n }\n MPI_Allgather(sendbuf.data(), n, MPI_INT, recvbuf.data(), n, MPI_INT, MPI_COMM_WORLD);\n int sum = 0;\n for (int i = 0; i < n; i++) {\n sum += recvbuf[i];\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\n__global__ void reversePrefixSum(const int *x, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (int j = block_size / 2; j > 0; j /= 2) {\n if (tid < j) {\n output[tid] += output[tid + j];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n for (int i = tid + 1; i < N; i++) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n int offset = bid * stride;\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i + offset];\n output[i + offset] = sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N - 1; i >= 0; i--) {\n if (tid == i) {\n output[i] += output[i + 1];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid == 0) {\n output[0] = x[0];\n } else {\n output[tid] = output[tid - 1] + x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int i = blockIdx.x * blockDim.x + tid;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (int j = i + blockDim.x; j < N; j += blockDim.x) {\n output[j] = output[j - blockDim.x] + x[j];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int sum = 0;\n for (int i = tid; i < N; i += stride) {\n sum += x[i];\n output[N - 1 - i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n output[i] = 0;\n for (int j = i + 1; j < N; j++) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n int offset = bid * stride;\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i + offset];\n output[i + offset] = sum;\n }\n}\n", + "\n\n 
int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n if (i > 0) { output[i] += output[i - 1]; }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n int offset = block * stride + tid;\n\n if (offset < N) {\n output[offset] = x[offset];\n }\n\n __syncthreads();\n\n for (int i = offset + stride; i < N; i += stride) {\n output[i] = output[i - stride] + x[i];\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n if (tid == 0) {\n output[0] = x[0];\n } else {\n output[tid] = output[tid - 1] + x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_size * block_id + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int stride = blockDim * gridDim;\n int start = tid + blockId * blockDim;\n int end = min(start + stride, N);\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector reverse_partial_sums(n);\n\n // Compute the partial sums in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n partial_sums[i] = x[i];\n for (int j = i + 1; j < n; ++j) {\n partial_sums[j] += partial_sums[j - 1];\n }\n }\n\n // Compute the reverse partial sums in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; --i) {\n reverse_partial_sums[i] = partial_sums[i];\n if (i > 0) {\n reverse_partial_sums[i] += reverse_partial_sums[i - 1];\n }\n }\n\n // Gather the reverse partial sums on rank 0\n MPI_Gather(&reverse_partial_sums[0], n, MPI_INT, &output[0], n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local prefix sum\n std::vector local_prefix_sum(n);\n local_prefix_sum[0] = x[0];\n for (int i = 1; i < n; i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n\n // Compute the global prefix sum\n std::vector global_prefix_sum(n);\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n if (rank == 0) {\n output.resize(n);\n output[n - 1] = global_prefix_sum[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = global_prefix_sum[i] - global_prefix_sum[i + 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n 
std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_output[i] = local_x[n - 1 - i];\n }\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n std::vector local_prefix_sum(x.size());\n local_prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n\n // Compute the global prefix sum\n std::vector global_prefix_sum(x.size());\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), x.size(), MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = global_prefix_sum[x.size() - 1] - global_prefix_sum[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int n = x.size();\n std::vector partial_sums(n);\n\n // Compute the partial sums in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n partial_sums[j] += partial_sums[j - 1];\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sums(n);\n MPI_Reduce(partial_sums.data(), global_sums.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum on rank 0\n if (my_rank == 0) {\n output.resize(n);\n output[n - 1] = global_sums[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = global_sums[i] - global_sums[i + 1];\n }\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local reverse prefix sum\n std::vector local_output(n);\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (my_rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n std::vector global_output(n);\n MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the prefix sum in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_output[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n local_output[i] += x[j];\n }\n }\n\n // Reduce the local prefix sums to the global prefix sum on rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Reverse the prefix sum on rank 0\n if (rank == 0) {\n for (int i = 0; i < n / 2; i++) {\n std::swap(output[i], output[n - i - 1]);\n }\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n prefix_sum[i] += prefix_sum[i + 1];\n }\n }\n\n // Gather the results on rank 0\n if (my_rank == 0) {\n output = prefix_sum;\n }\n}\n", + "\n\n int n = x.size();\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &n_procs);\n int local_n = n / n_procs;\n int remainder = n % n_procs;\n if (my_rank == 0) {\n output.resize(n);\n }\n std::vector local_x(local_n + (my_rank < remainder ? 1 : 0));\n MPI_Scatter(x.data(), local_n, MPI_INT, local_x.data(), local_n, MPI_INT, 0,\n MPI_COMM_WORLD);\n std::vector local_output(local_n + (my_rank < remainder ? 1 : 0));\n local_output[local_n - 1] = local_x[local_n - 1];\n #pragma omp parallel for\n for (int i = local_n - 2; i >= 0; --i) {\n local_output[i] = local_x[i] + local_output[i + 1];\n }\n MPI_Gather(local_output.data(), local_n, MPI_INT, output.data(), local_n, MPI_INT,\n 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Reverse the prefix sum in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n output[i] = prefix_sum[n - 1 - i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nranks; i++) {\n std::vector recv_buf(n);\n MPI_Recv(recv_buf.data(), n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n output[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; 
i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n prefix_sum[i] -= prefix_sum[i + 1];\n }\n output[i] = prefix_sum[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = x.size() - 1; i >= 0; i--) {\n if (i < x.size() - 1) {\n prefix_sum[i] -= prefix_sum[i + 1];\n }\n output[i] = prefix_sum[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n local_output[i] = local_x[i];\n if (i + 1 < n) {\n local_output[i] += local_output[i + 1];\n }\n }\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of the vector x\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum of the vector x\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n output[i] = prefix_sum[x.size() - 1] - prefix_sum[i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(&output[0], x.size(), MPI_INT, &output[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + 
"parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n using namespace Kokkos;\n\n const size_t n = x.extent(0);\n View y(\"y\", n);\n\n parallel_for(n, KOKKOS_LAMBDA(const size_t i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return sum(y);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n using l_d_type = Kokkos::LayoutLeft;\n using l_d_view_type = Kokkos::View;\n\n const size_t n = x.extent(0);\n l_d_view_type x_copy(\"x_copy\", n);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::parallel_scan(policy_type(0, n), KOKKOS_LAMBDA(const size_t& i, l_d_view_type& x_copy) {\n if (i > 0) { x_copy(i) += x_copy(i - 1); }\n }, tag_type());\n double sum = 0;\n for (size_t i = 0; i < n; ++i) { sum += x_copy(i); }\n return sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n using namespace Kokkos;\n\n const int N = x.extent(0);\n View y(\"y\", N);\n\n parallel_for(N, KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return reduce(N, KOKKOS_LAMBDA(int i, double sum) { return sum + y(i); }, 0.0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_type;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = typename TeamPolicy::member_type;\n\n double sum = 0.0;\n Kokkos::parallel_reduce(TeamPolicy(Kokkos::Experimental::partition_by_coordinates(x)),\n KOKKOS_LAMBDA(TeamMember& team) {\n double local_sum = 0.0;\n for (auto i = team.league_rank(); i < x.extent(0);\n i += team.league_size()) {\n local_sum += x(i);\n }\n team.team_reduce(KOKKOS_LAMBDA(double& tsum,\n double& lsum) { tsum += lsum; });\n if (team.team_rank() == 0) { sum = team.team_reduce(); }\n });\n return sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n return prefix_sum(x.size() - 1);\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n using namespace Kokkos;\n using execution_space = Kokkos::DefaultExecutionSpace;\n using policy_type = Kokkos::RangePolicy;\n\n // Compute the prefix sum array\n View prefix_sum(\"prefix_sum\", x.size());\n parallel_for(policy_type(0, x.size()), KOKKOS_LAMBDA(const int i) {\n prefix_sum(i) = (i == 0) ? x(i) : x(i) + prefix_sum(i - 1);\n });\n\n // Compute the sum of the prefix sum array\n double sum = 0;\n parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int i, double& sum) {\n sum += prefix_sum(i);\n }, sum);\n\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0565233264, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0502212904, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n using execution_space = Kokkos::DefaultExecutionSpace;\n\n const int n = x.extent(0);\n auto y = Kokkos::View(\"y\", n);\n\n Kokkos::parallel_for(\n \"prefix_sum\", policy_t(0, n), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return Kokkos::parallel_reduce(\n \"sum_prefix_sum\", policy_t(0, n), KOKKOS_LAMBDA(const double& a,\n const double& b) {\n return a + b;\n },\n y);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n using namespace Kokkos;\n\n using policy_type = Kokkos::TeamPolicy;\n policy_type policy(x.extent(0));\n\n double sum = 0.0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n double prefix_sum = 0.0;\n for (int j = 0; j <= i; ++j) { prefix_sum += x(j); }\n sum += prefix_sum;\n });\n\n return sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n auto prefix_sum = Kokkos::scan(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n },\n Reducer(0));\n\n return Kokkos::parallel_reduce(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += prefix_sum(i);\n },\n Reducer(0));\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::TeamPolicy;\n\n double sum = 0;\n Kokkos::parallel_reduce(\n TeamPolicy<>(x.extent(0), Kokkos::AUTO),\n KOKKOS_LAMBDA(const int& i, double& 
update) {\n update += x(i);\n sum += update;\n },\n sum);\n return sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n // Create a prefix sum view\n View prefix_sum = create_mirror_view(x);\n\n // Compute the prefix sum\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = prefix_sum(i - 1) + x(i);\n }\n prefix_sum(i) = sum;\n });\n\n // Compute the sum of the prefix sum array\n double sum_of_prefix_sum = 0.0;\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += prefix_sum(i);\n }, sum_of_prefix_sum);\n\n return sum_of_prefix_sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using ViewType = Kokkos::View;\n\n const size_t n = x.extent(0);\n ViewType prefix_sum(\"prefix_sum\", n);\n\n parallel_for(\"prefix_sum\", Policy(0, n), KOKKOS_LAMBDA(const size_t& i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n\n return prefix_sum(n - 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n Kokkos::View prefixSum = Kokkos::View(x.extent(0));\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = prefixSum(i - 1) + x(i);\n }\n }, prefixSum);\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n update += prefixSum(i);\n }, sum);\n return sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.extent(0));\n Kokkos::parallel_for(\"prefix_sum_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n return Kokkos::parallel_reduce(\"sum_prefix_sum\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double& sum) {\n sum += prefix_sum(i);\n }, 0.0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto prefix_sum = Kokkos::scan(policy_type(0, x.size()), tag_type(), x,\n [=](const double& a, const double& b) { return a + b; });\n\n return Kokkos::parallel_reduce(policy_type(0, x.size()),\n 
KOKKOS_LAMBDA(const int& i, double& sum) { sum += prefix_sum(i); },\n 0.0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n prefix_sum(i) = (i == 0) ? x(i) : x(i) + prefix_sum(i - 1);\n });\n return prefix_sum(x.size() - 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n double sum = 0;\n Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n sum += update;\n },\n tag_type());\n return sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = prefix_sum(i - 1) + x(i);\n }\n prefix_sum(i) = sum;\n });\n return prefix_sum(x.size() - 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reduction = Kokkos::Sum;\n\n // Compute the prefix sum array\n View y = Kokkos::scan(Policy(0, x.size()), Reduction(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n // Compute the sum of the prefix sum array\n double sum = Kokkos::reduce(Policy(0, y.size()), Reduction(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += y(i);\n });\n\n return sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n using Kokkos::parallel_reduce;\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n }, sum);\n\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use OpenMP to 
compute in parallel.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n std::vector prefixSum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefixSum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = (i > 0 ? prefix_sum[i - 1] : 0) + x[i];\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n, 0);\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n sum += x[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) prefix_sum[i] += prefix_sum[i - 1];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n }\n for (double v : x) {\n sum += v;\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < 
x.size(); ++i) {\n x[i] += x[i - 1];\n }\n for (double v : x) {\n sum += v;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n #pragma omp parallel for reduction(+: sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n std::vector prefixSum(x.size(), 0.0);\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) { prefixSum[i] += prefixSum[i - 1]; }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n // Compute the prefix sum array\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sum array\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n return sum;\n}\n", + "\n\n // Create a vector to store the prefix sum\n std::vector prefix_sum(x.size());\n\n // Compute the prefix sum in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sum\n double sum = 0;\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n return sum;\n}\n", + "\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = (i > 0 ? 
prefix_sum[i - 1] : 0) + x[i];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "serial", + "prompt": "/* Compute the prefix sum array of the vector x and return its sum.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (auto xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "(", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "hip", + "prompt": "/* Compute the prefix sum array of the vector x and compute its sum. Store the result in sum.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\n__global__ void sumOfPrefixSum(const double *x, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n sum[tid + stride] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n\n double sum_local = 0;\n for (int i = gid; i < N; i += bid_size) {\n sum_local += x[i];\n }\n\n double sum_global;\n hipReduce(&sum_global, &sum_local, 1, hipReduceSum, 0);\n *sum = sum_global;\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid < N) {\n sum[gid] = x[gid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < hipBlockDim_x; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n double prefix_sum = 0;\n for (int i = 0; i <= gid; i++) {\n prefix_sum += x[i];\n }\n sum[gid] = prefix_sum;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n sum[tid] += sum[tid + i];\n }\n __syncthreads();\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum_local = x[tid];\n __syncthreads();\n for (int i = 1; i < blockDim.x; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n sum_local += x[tid + i];\n }\n __syncthreads();\n }\n sum[tid] = sum_local;\n }\n __syncthreads();\n if (tid == 0) {\n double sum_global = 0;\n for (int i = 0; i < N; i++) {\n sum_global += sum[i];\n }\n *sum = sum_global;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; 
i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n sum[bid] += x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n size_t stride = blockDim.x * gridDim.x;\n double localSum = 0;\n for (size_t i = tid; i < N; i += stride) {\n localSum += x[i];\n }\n __syncthreads();\n double globalSum = localSum;\n for (size_t i = 1; i < blockDim.x; i++) {\n globalSum += localSum;\n }\n __syncthreads();\n *sum = globalSum;\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n sum[tid + stride] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n if (tid > 0) { sum[tid] += sum[tid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n __syncthreads();\n if (tid > 0) { sum[tid] += sum[tid - 1]; }\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 0; i < N; i++) { sum[0] += sum[i]; }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "1\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sum\n double sum = 0;\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), 
prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = std::accumulate(prefix_sum.begin(), prefix_sum.end(), 0.0);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n\n double prefix_sum = 0;\n MPI_Allreduce(&sum, &prefix_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return prefix_sum;\n } else {\n return 0;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefixSum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum array\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sum array\n double sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n MPI_Allreduce(&sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n }\n return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "cuda", + "prompt": "/* Compute the prefix sum array of the vector x and compute its sum. Store the result in sum.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\n__global__ void sumOfPrefixSum(const double *x, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n double sum_local = 0;\n for (int i = tid; i < N; i += stride) {\n sum_local += x[i];\n }\n __shared__ double sum_shared[blockDim.x];\n sum_shared[tid] = sum_local;\n __syncthreads();\n if (tid == 0) {\n double sum_block = 0;\n for (int i = 0; i < blockDim.x; i++) {\n sum_block += sum_shared[i];\n }\n atomicAdd(sum, sum_block);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n const int bid = blockIdx.x;\n const int bid_size = blockDim.x;\n const int gid = bid * bid_size + tid;\n\n if (gid < N) {\n double partial_sum = x[gid];\n if (gid > 0) {\n partial_sum += x[gid - 1];\n }\n sum[gid] = partial_sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = tid + bid * bid_size;\n if (i < N) {\n sum[i] = x[i];\n }\n __syncthreads();\n for (int s = bid_size / 2; s > 0; s >>= 1) {\n if (i < N && tid < s) {\n sum[i] += sum[i + s];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum_prefix = 0;\n for (int i = 0; i <= tid; i++) {\n sum_prefix += x[i];\n }\n double sum_suffix = 0;\n for (int i = tid + 1; i < N; i++) {\n sum_suffix += x[i];\n }\n sum[tid] = sum_prefix + sum_suffix;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * 
blockDim.x + threadIdx.x;\n if (tid < N) {\n double partial_sum = 0;\n for (int i = 0; i <= tid; i++) { partial_sum += x[i]; }\n sum[tid] = partial_sum;\n }\n __syncthreads();\n if (tid == 0) {\n double total_sum = 0;\n for (int i = 0; i < N; i++) { total_sum += sum[i]; }\n *sum = total_sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = 1; i < N; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double prefixSum = 0;\n for (int i = 0; i <= tid; i++) {\n prefixSum += x[i];\n }\n sum[tid] = prefixSum;\n }\n __syncthreads();\n if (tid == 0) {\n double totalSum = 0;\n for (int i = 0; i < N; i++) {\n totalSum += sum[i];\n }\n *sum = totalSum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = bid * bid_size + tid;\n\n double prefix_sum = 0;\n while (i < N) {\n prefix_sum += x[i];\n x[i] = prefix_sum;\n i += grid_size * bid_size;\n }\n\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) { *sum += x[i]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n double s = 0;\n for (int i = 0; i <= gid; i++) {\n s += x[i];\n }\n sum[gid] = s;\n if (gid > 0) {\n sum[gid] += sum[gid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(sum, x[tid]);\n }\n __syncthreads();\n if (tid == 0) {\n double sum_temp = *sum;\n atomicAdd(sum, sum_temp);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(prefixSum.data(), &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) return sum;\n else return 0.0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[0], &sum, x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefix_sum[x.size() - 1], 1, MPI_DOUBLE, &sum, 1, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n double global_sum = 0.0;\n\n // Compute the prefix sum of the local vector\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 1; i < x.size(); i++) {\n x[i] += x[i - 1];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n 
MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum of the global prefix sum on rank 0\n if (rank == 0) {\n return global_sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n double sum = 0.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); i++) {\n double local_sum = 0.0;\n for (int j = 0; j <= i; j++) {\n local_sum += x[j];\n }\n prefix_sum[i] = local_sum;\n }\n\n MPI_Reduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < prefixSum.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) prefix_sum[i] += prefix_sum[i - 1];\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) return sum;\n else return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefixSum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < n; ++i) {\n sum += prefixSum[i];\n }\n\n if (rank == 0) return sum;\n return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sumOfPrefixSum = 0.0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sumOfPrefixSum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sumOfPrefixSum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(prefix_sum.data(), &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < n; ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of each rank's portion of x\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Reduce the prefix sums across all ranks\n std::vector global_prefix_sum(x.size());\n MPI_Allreduce(prefix_sum.data(), global_prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the sum of the prefix sum array\n double sum = 0.0;\n for (int i = 0; i < x.size(); i++) {\n sum += global_prefix_sum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefixSum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n double sum = 0;\n MPI_Reduce(&prefixSum[n - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n 
std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < prefixSum.size(); i++) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector prefix_sum(n);\n std::vector global_prefix_sum(n);\n\n // Compute the prefix sum on each rank\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Reduce the prefix sums from all ranks\n MPI_Allreduce(prefix_sum.data(), global_prefix_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum of the prefix sum on rank 0\n if (rank == 0) {\n return global_prefix_sum[n - 1];\n } else {\n return 0.0;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace the i-th element of the array x with the minimum value from indices 0 through i.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 0; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", 
+ "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n\n Kokkos::parallel_for(\n Kokkos::RangePolicy>(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n x(i) = x(i);\n } else {\n x(i) = Kokkos::min(x(i), x(i - 1));\n }\n },\n Layout());\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + 
"num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n 
});\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"partialMinimums\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = Kokkos::min(x(i), Kokkos::min(x, i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"partialMinimums\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n float min = x[0];\n for (int j = 1; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "", + "\n\n int n = x.size();\n float min = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for shared(min)\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n 
int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n std::vector mins(n);\n mins[0] = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n mins[i] = std::min(mins[i - 1], x[i]);\n }\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = mins[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n std::vector min(n);\n min[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min[i] = std::min(min[i - 1], x[i]);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = min[i];\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "serial", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], *std::min_element(x.begin(), x.begin() + i));\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "<, <, 
\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:>\u51a2\u51a2\u51a28\u51a2\u51a2:>\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:>:>", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "s\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2cernecernecerne,\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "hip", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\n__global__ void partialMinimums(float *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n float min = x[0];\n for (int j = 1; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n if (x[i] < min) { min = x[i]; }\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int 
bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n x[i] = min;\n }\n if (rank == 0) {\n MPI_Reduce(&x[0], &x[0], x.size(), MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(&x[0], nullptr, x.size(), MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n } else {\n MPI_Send(&x[rank], 1, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = x[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < size; i++) {\n MPI_Send(&x[i - 1], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n float min;\n MPI_Recv(&min, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x[i - 1] = std::min(x[i - 1], min);\n }\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Bcast(&x[i - 1], 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks onto rank 0\n std::vector min_values(x.size());\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &min_values[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Update the values on rank 0\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], min_values[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = 
x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = x[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < size; i++) {\n MPI_Send(&x[i - 1], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n float min;\n MPI_Recv(&min, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x[i - 1] = std::min(x[i - 1], min);\n }\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Bcast(&x[i - 1], 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "cuda", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\n__global__ void partialMinimums(float *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n float min = x[0];\n for (size_t j = 1; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n __shared__ float min;\n\n if (tid == 0) {\n min = x[bid * bid_size];\n }\n\n __syncthreads();\n\n for (int i = bid * bid_size + tid; i < N; i += bid_size) {\n min = fminf(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < size) {\n float min = x[i];\n for (int j = i + 1; j < size; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, 
x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n if (tid == 0) {\n x[bid * bid_size] = x[bid * bid_size];\n } else {\n x[bid * bid_size + tid] = fminf(x[bid * bid_size + tid], x[bid * bid_size + tid - 1]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank], 1, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks to rank 0\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &x[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct partial minimums\n if (rank != 0) {\n x.clear();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks to rank 0\n MPI_Gather(&x[0], n, MPI_FLOAT, &x[0], n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete list of partial minimums\n if (rank == 0) {\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float *x_arr = &x[0];\n float *min_arr = new float[n];\n\n // Compute the minimums for each rank\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n float min = 
x_arr[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x_arr[j]);\n }\n min_arr[i] = min;\n }\n\n // Gather the minimums from all ranks\n MPI_Gather(min_arr, n, MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n x_arr[i] = min_arr[i];\n }\n }\n\n delete[] min_arr;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector min_values(n);\n#pragma omp parallel for\n for (int i = 0; i < n; i++) {\n min_values[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n min_values[i] = std::min(min_values[i], x[j]);\n }\n }\n MPI_Allgather(&min_values[0], n, MPI_FLOAT, &x[0], n, MPI_FLOAT, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min = x[0];\n\n #pragma omp parallel for schedule(static)\n for (int i = 1; i < n; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_FLOAT, x.data(), n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(x.data(), n, MPI_FLOAT, NULL, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min_val = x[0];\n\n #pragma omp parallel for reduction(min : min_val)\n for (int i = 1; i < n; i++) {\n min_val = std::min(min_val, x[i]);\n }\n\n MPI_Bcast(&min_val, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::min(x[i], min_val);\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_FLOAT, x.data(), n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(x.data(), n, MPI_FLOAT, NULL, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n std::vector globalMinimums(x.size());\n MPI_Allgather(x.data(), x.size(), MPI_FLOAT, globalMinimums.data(), x.size(), MPI_FLOAT, MPI_COMM_WORLD);\n\n // Update the minimums on rank 0\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], globalMinimums[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n std::vector global_min(n);\n\n // Compute local minimums\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n // Reduce local minimums to global minimums\n MPI_Allreduce(local_min.data(), global_min.data(), n, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n // Store global minimums on rank 0\n if (rank == 0) {\n x = global_min;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n MPI_Reduce(local_min.data(), x.data(), n, MPI_FLOAT, MPI_MIN, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector localMin(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n localMin[i] = x[i];\n for (int j = 0; j < i; j++) {\n localMin[i] = std::min(localMin[i], x[j]);\n }\n }\n\n MPI_Allgather(localMin.data(), n, MPI_FLOAT, x.data(), n, MPI_FLOAT, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n MPI_Reduce(local_min.data(), x.data(), n, MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &x[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank != 0) {\n x.clear();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float localMin = x[0];\n float globalMin;\n\n #pragma omp parallel for reduction(min : localMin)\n for (int i = 1; i < n; i++) {\n localMin = std::min(localMin, x[i]);\n }\n\n MPI_Allreduce(&localMin, &globalMin, 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n for (int i = 1; i < n; i++) {\n x[i] = globalMin;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of ranks\n int rank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the partial minimums in parallel\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n x[i] = min;\n }\n\n // Gather the partial minimums from all ranks\n std::vector partialMins(x.size() * numRanks);\n MPI_Gather(x.data(), x.size(), MPI_FLOAT, partialMins.data(), x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // If rank 0, store the result in x\n if (rank == 0) {\n x.clear();\n for (int i = 0; i < partialMins.size(); i += numRanks) {\n x.push_back(partialMins[i]);\n }\n }\n}\n", + "<, Synag", + "\n\n int n = x.size();\n int nthreads = omp_get_num_threads();\n int tid = omp_get_thread_num();\n int chunk = n / nthreads;\n int start = tid * chunk;\n int end = start + chunk;\n if (tid == nthreads - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n x[i] = std::min(x[i], *std::min_element(x.begin(), x.begin() + i));\n }\n omp_barrier();\n if (tid == 0) {\n for (int i = chunk; i < n; i += chunk) {\n for (int j = 0; j < chunk; j++) {\n x[i + j] = 
std::min(x[i + j], x[i + j - chunk]);\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector localMin(n);\n std::vector globalMin(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n localMin[i] = x[i];\n for (int j = 0; j <= i; j++) {\n localMin[i] = std::min(localMin[i], x[j]);\n }\n }\n\n MPI_Allreduce(localMin.data(), globalMin.data(), n, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n x = globalMin;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n std::vector min_values(x.size(), 0.0);\n MPI_Allgather(&x[0], x.size(), MPI_FLOAT, &min_values[0], x.size(), MPI_FLOAT, MPI_COMM_WORLD);\n\n // Update the values on rank 0\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::min(x[i], min_values[i]);\n }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n using namespace Kokkos;\n\n const int n = x.extent(0);\n View max_so_far = \"max_so_far\";\n View max_ending_here = \"max_ending_here\";\n Kokkos::parallel_for(\"max_subarray\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i) {\n if (i == 0) {\n max_ending_here[i] = x[i];\n max_so_far[i] = x[i];\n } else {\n max_ending_here[i] = std::max(x[i], max_ending_here[i - 1] + x[i]);\n max_so_far[i] = std::max(max_so_far[i - 1], max_ending_here[i]);\n }\n });\n return max_so_far[n - 1];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.extent(0);\n Kokkos::View dp = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n dp(i) = x(i);\n } else {\n dp(i) = std::max(dp(i - 1) + x(i), x(i));\n }\n });\n return *Kokkos::max_element(dp);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n for (int i = 1; i < x.extent(0); ++i) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int max_sum = 0;\n int current_sum = 0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, int& max_sum) {\n current_sum = std::max(x[i], current_sum + x[i]);\n max_sum = std::max(max_sum, current_sum);\n }, max_sum, tag_type());\n return max_sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n Kokkos::parallel_scan(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far, int& max_ending_here) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n });\n\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize the maximum sum to the first element of the array.\n int max_sum = x(0);\n\n // Iterate over the remaining elements of the array.\n for (int i = 1; i < x.extent(0); i++) {\n // If the current element is greater than the sum of the previous element and\n // the maximum sum, then set the maximum sum to the current element.\n if (x(i) > max_sum + x(i - 1)) {\n max_sum = x(i);\n } else {\n // Otherwise, add the current element to the maximum sum.\n max_sum += x(i);\n }\n }\n\n // Return the maximum sum.\n return max_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_for(\n \"maximumSubarray\", Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n });\n\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_so_far);\n\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": 
"\n\n Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::Kok\u30b3\u30af_VectorRange(x.extent(0)),\n Kok\u30b3\u30af::plus(),\n Kok\u30b3\u30af::make_iota_view(x.extent(0)),\n [&](int i) {\n return Kok\u30b3\u30af::parallel_reduce(\n Kok\u30b3\u30af::Kok\u30b3\u30af_VectorRange(i, x.extent(0)),\n Kok\u30b3\u30af::plus(),\n x[i],\n [&](int j) { return x[j]; });\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Max;\n using reduction_functor_type = Kokkos::Max;\n using reduction_policy_type = Kokkos::Scan;\n\n int max_so_far = 0;\n int max_ending_here = 0;\n\n Kokkos::parallel_reduce(\n reduction_policy_type(x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_so_far) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_so_far);\n\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + 
"are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_ending_here, int& max_so_far) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_ending_here, max_so_far);\n\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(int i, int& max_so_far, int& curr_max) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n },\n max_so_far, curr_max);\n\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize the maximum sum to the first element of the array\n int max_sum = x(0);\n\n // Initialize the current sum to the first element of the array\n int current_sum = x(0);\n\n // Loop through the remaining elements of the array\n for (int i = 1; i < x.extent(0); i++) {\n // Update the current sum by adding the current element\n current_sum = std::max(x(i), 
current_sum + x(i));\n\n // Update the maximum sum if the current sum is greater\n max_sum = std::max(max_sum, current_sum);\n }\n\n // Return the maximum sum\n return max_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_sum = INT_MIN;\n int sum = 0;\n #pragma omp parallel for reduction(max:max_sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n max_sum = std::max(max_sum, sum);\n if (sum < 0) {\n sum = 0;\n }\n }\n return max_sum;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n max_ending_here = std::max(x[i], max_ending_here + x[i]);\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n int n = x.size();\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n max_ending_here = std::max(x[i], max_ending_here + x[i]);\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n 
max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (size_t i = 0; i < x.size(); ++i) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0, max_ending_here = 0;\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (size_t i = 0; i < x.size(); ++i) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_sum = INT_MIN;\n int current_sum = 0;\n #pragma omp parallel for reduction(max:max_sum)\n for (int i = 0; i < x.size(); ++i) {\n current_sum = std::max(x[i], current_sum + x[i]);\n max_sum = std::max(max_sum, current_sum);\n }\n return max_sum;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 
0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n int n = x.size();\n #pragma parallel for reduction(+:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "serial", + "prompt": "/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = 
std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN;\n int max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "ontiguous, Synag", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Store the result in sum.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\n__global__ void maximumSubarray(const int *x, size_t N, int *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = hipMax(x[i], max_ending_here + x[i]);\n max_so_far = hipMax(max_so_far, max_ending_here);\n }\n hipAtomicMax(sum, &max_so_far);\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid >= N) return;\n\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = block.thread.x;\n int stride = block.thread.x + block.thread.y * block.dim.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int stride = hipBlockDim_x * hipGridDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int s = 0;\n int max = 0;\n for (size_t i = tid; i < N; i += blockDim.x * gridDim.x) {\n s = max(x[i], s + x[i]);\n max = max(max, s);\n }\n atomicAdd(sum, max);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int stride = bid_size * hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = start; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int tid_offset = tid + bid * bid_size;\n int sum_local = 0;\n int max_sum = 0;\n if (tid_offset < N) {\n sum_local = x[tid_offset];\n }\n __syncthreads();\n for (int i = tid_offset + bid_size; i < N; i += bid_size) {\n sum_local = max(sum_local + x[i], x[i]);\n max_sum = max(max_sum, 
sum_local);\n }\n if (tid == 0) {\n *sum = max_sum;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n __syncthreads();\n if (tid == 0) *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (i < N) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n i += hipBlockDim_x * hipGridDim_x;\n }\n *sum = max_so_far;\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int stride = blockDim.x * gridDim.x;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = tid; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bsize = hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int stride = bsize * hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N && i < start + stride; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n if (tid == 0) *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipBlockDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n hipAtomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int 
max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n hipAtomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int stride = bid_size * hipGridDim_x;\n\n int local_max = 0;\n int global_max = 0;\n\n for (int i = tid; i < N; i += stride) {\n local_max = max(x[i], local_max + x[i]);\n global_max = max(global_max, local_max);\n }\n\n if (tid == 0) {\n *sum = global_max;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n int max_so_far = 0, max_ending_here = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_max;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_sum = x[0];\n int sum = x[0];\n for (int i = 1; i < n; i++) {\n sum = std::max(x[i], sum + x[i]);\n max_sum = std::max(max_sum, sum);\n }\n return max_sum;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n int global_max;\n MPI_Allreduce(&max_so_far, &global_max, 
1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "(4, -1, 2, 1) = 6.", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n int maxSum = INT_MIN;\n int sum = 0;\n for (int i = myRank; i < x.size(); i += numRanks) {\n sum += x[i];\n maxSum = std::max(maxSum, sum);\n if (sum < 0) {\n sum = 0;\n }\n }\n\n MPI_Allreduce(&maxSum, &maxSum, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxSum;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", 
+ "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n\n return max_so_far;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Store the result in sum.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\n__global__ void maximumSubarray(const int *x, size_t N, int *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Each thread computes the sum of its subarray\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int partialSum = 0;\n for (int i = tid; i < N; i += blockDim.x * gridDim.x) {\n partialSum += x[i];\n }\n\n // Each thread writes its partial sum to global memory\n __shared__ int partialSums[blockDim.x];\n partialSums[threadIdx.x] = partialSum;\n __syncthreads();\n\n // Each thread computes the maximum of its partial sum and the partial sums of the previous threads\n for (int i = blockDim.x / 2; i > 0; i >>= 1) {\n if (threadIdx.x < i) {\n partialSums[threadIdx.x] = max(partialSums[threadIdx.x], partialSums[threadIdx.x + i]);\n }\n __syncthreads();\n }\n\n // The first thread writes the maximum sum to global memory\n if (threadIdx.x == 0) {\n *sum = partialSums[0];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n\n int maxSoFar = x[tid];\n int maxEndingHere = x[tid];\n\n for (int i = tid + blockDim; i < N; i += blockDim * gridDim) {\n maxEndingHere = maxEndingHere + x[i];\n if (maxSoFar < maxEndingHere) {\n maxSoFar = maxEndingHere;\n }\n if (maxEndingHere < 0) {\n maxEndingHere = 0;\n }\n }\n\n int maxOverall = maxSoFar;\n\n if (blockId == 0) {\n for (int i = 1; i < gridDim; i++) {\n int temp = maxOverall;\n maxOverall = max(maxOverall, maxSoFar);\n maxSoFar = max(temp, maxSoFar);\n }\n }\n\n if (blockId == 0) {\n *sum = maxOverall;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n int max_sum = INT_MIN;\n int current_sum = 0;\n for (int i = start; i < end; i++) {\n current_sum = max(x[i], current_sum + x[i]);\n max_sum = max(max_sum, current_sum);\n }\n sum[tid] = max_sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int block_start = block_id * block_size;\n int block_end = min(block_start + block_size, N);\n int max_so_far = x[block_start];\n int max_ending_here = x[block_start];\n for (int i = block_start + 1; i < block_end; i++) {\n max_ending_here = max(x[i], 
max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n __shared__ int shared_max[block_size];\n shared_max[tid] = max_so_far;\n __syncthreads();\n if (tid == 0) {\n int max_overall = shared_max[0];\n for (int i = 1; i < block_size; i++) {\n max_overall = max(max_overall, shared_max[i]);\n }\n *sum = max_overall;\n }\n}\n", + "\n\n int max_ending_here = x[0];\n int max_so_far = x[0];\n for (size_t i = 1; i < N; ++i) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n int start = tid + block * stride;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (start < N) {\n max_ending_here = max_ending_here + x[start];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n start += stride;\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int sum = 0;\n int maxSum = -2147483648;\n for (int i = tid; i < N; i += blockIdx.x * blockDim.x) {\n sum += x[i];\n maxSum = max(maxSum, sum);\n if (sum < 0) sum = 0;\n }\n sum = maxSum;\n *sum = maxSum;\n}\n", + "\n\n int my_sum = 0;\n int max_sum = 0;\n int max_sum_so_far = 0;\n int start = 0;\n int end = 0;\n int s = threadIdx.x;\n int e = threadIdx.x + blockDim.x - 1;\n if (e >= N) e = N - 1;\n for (int i = s; i <= e; i++) {\n my_sum += x[i];\n if (my_sum > max_sum_so_far) {\n max_sum_so_far = my_sum;\n start = s;\n end = i;\n }\n if (my_sum < 0) {\n my_sum = 0;\n }\n }\n if (max_sum_so_far > max_sum) {\n max_sum = max_sum_so_far;\n *sum = max_sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = tid; i < N; i += blockDim.x) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; ++i) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int stride = blockDim * gridDim;\n int start = tid + blockId * blockDim;\n int max_so_far = 0;\n int max_ending_here = 0;\n while 
(start < N) {\n max_ending_here = max_ending_here + x[start];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n start += stride;\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n }\n *sum = max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n for (size_t i = 1; i < N; ++i) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int sum_so_far = 0;\n int max_sum = 0;\n if (tid < N) {\n sum_so_far = x[tid];\n max_sum = x[tid];\n }\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n sum_so_far = fmax(x[i], sum_so_far + x[i]);\n max_sum = fmax(max_sum, sum_so_far);\n __syncthreads();\n }\n if (tid == 0) { *sum = max_sum; }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int max_so_far = 0;\n int max_ending_here = 0;\n\n for (int i = tid; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n sum[bid] = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n int chunk_size = n / num_ranks;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == num_ranks - 1) {\n end = n;\n }\n#pragma omp parallel for reduction(max : max_ending_here)\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n MPI_Allreduce(&max_so_far, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_so_far;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk = n / nprocs;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == nprocs - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int max_global;\n MPI_Reduce(&max_so_far, &max_global, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_global;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk_size = n / nprocs;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == nprocs - 1) {\n end = n;\n }\n#pragma omp parallel\n {\n int max_so_far_local = 0, max_ending_here_local = 0;\n#pragma omp for\n for (int i = start; i < end; i++) {\n max_ending_here_local = max_ending_here_local + x[i];\n if (max_so_far_local < max_ending_here_local) {\n max_so_far_local = max_ending_here_local;\n }\n if (max_ending_here_local < 0) {\n max_ending_here_local = 0;\n }\n }\n#pragma omp critical\n {\n if (max_so_far < max_so_far_local) {\n max_so_far = max_so_far_local;\n }\n }\n }\n MPI_Reduce(&max_so_far, &max_ending_here, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_ending_here;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallel for\n#pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // MPI reduction\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel\n {\n int local_max_so_far = INT_MIN, local_max_ending_here = 0;\n #pragma omp for\n for (int i = rank; i < n; i += nprocs) {\n local_max_ending_here = local_max_ending_here + x[i];\n if (local_max_so_far < 
local_max_ending_here)\n local_max_so_far = local_max_ending_here;\n if (local_max_ending_here < 0)\n local_max_ending_here = 0;\n }\n #pragma omp critical\n {\n if (max_so_far < local_max_so_far)\n max_so_far = local_max_so_far;\n }\n }\n return max_so_far;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int max_local = 0;\n int max_global = 0;\n\n #pragma omp parallel for reduction(max : max_local)\n for (int i = 0; i < n; i++) {\n max_local = std::max(x[i], max_local + x[i]);\n }\n\n MPI_Allreduce(&max_local, &max_global, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return max_global;\n } else {\n return 0;\n }\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel num_threads(4)\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n\n int global_max = max_so_far;\n MPI_Allreduce(&global_max, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallelization\n #pragma omp parallel\n {\n // Each thread computes the maximum subarray sum for its portion of the array\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n\n // MPI reduction to find the maximum subarray sum across all ranks\n int global_max;\n MPI_Allreduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n int chunk_size = n / num_threads;\n int start = tid * chunk_size;\n int end = start + chunk_size;\n if (tid == num_threads - 1) {\n end = n;\n }\n\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n int chunk_size = n / num_ranks;\n int start = my_rank * chunk_size;\n int end = start + chunk_size;\n if (my_rank == num_ranks - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if 
(max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // Reduce the maximum subarray sum across all ranks\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallel for loop\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // MPI reduce to get the maximum subarray sum\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_max;\n } else {\n return 0;\n }\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = std::min(start + chunk_size, n);\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n 
}\n\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int nthreads = omp_get_num_threads();\n int chunk = n / nthreads;\n int start = tid * chunk;\n int end = (tid + 1) * chunk;\n if (tid == nthreads - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_max;\n }\n return 0;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(Kokkos::View const& x, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n using namespace Kokkos;\n const int N = x.extent(0);\n parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = sum + x(i);\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1528535276, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1180835378, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n 
update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = output(i - 1) + x(i);\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i, double& update) {\n update += x(i);\n output(i) = update;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = sum + x(i);\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output(0));\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"prefix_sum\", Kokkos::RangePolicy(0, output.extent(0)), 
KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = x(i);\n } else {\n output(i) = output(i - 1) + x(i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1475962306, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1405533284, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output(0));\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n const size_t n = x.extent(0);\n output(0) = x(0);\n parallel_for(policy_type(1, n), KOKKOS_LAMBDA(const size_t i) {\n output(i) = output(i - 1) + x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.152858414, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1356680432, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"prefixSum\", output.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1460123214, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1181506576, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "Kok", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefix_sum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "source_write_success": true, 
+ "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1530268002, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1183689378, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1529249128, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1182607386, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n const size_t size = x.extent(0);\n output(0) = x(0);\n parallel_for(policy_type(1, size), KOKKOS_LAMBDA(const size_t i) {\n output(i) = output(i - 1) + x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1528373104, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.135460457, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::Range(0, N), KOKKOS_LAMBDA (const int i) {\n if (i == 0) {\n output(i) = x(i);\n } else {\n output(i) = output(i - 1) + x(i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use OpenMP to compute in parallel.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", 
+ "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "<", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n double sum = 0;\n for (int j = 0; j <= i; j++) {\n sum += x[j];\n }\n output[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "serial", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (std::size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "<", + "<", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n 
output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\n__global__ void prefixSum(const double* x, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n if (tid > 0) { output[tid] += output[tid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n \n if (gid < N) {\n output[gid] = x[gid];\n }\n \n __syncthreads();\n \n for (int stride = 1; stride < N; stride *= 2) {\n if (gid < N && gid + stride < N) {\n output[gid + stride] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid 
< N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bsize = blockDim.x;\n int stride = bsize * bid;\n int i = stride + tid;\n if (i < N) {\n output[i] = x[i];\n if (i > 0) { output[i] += output[i - 1]; }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (gid % (2 * i) == 0 && gid + i < N) {\n output[gid + i] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n output[idx] = x[idx];\n if (idx > 0) {\n output[idx] += output[idx - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use MPI to compute in parallel. 
Assume MPI is already initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n double global_sum = 0;\n\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n output.push_back(global_sum);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n MPI_Allreduce(&local_sum, &output[i], 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the 
partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partialSum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n partialSum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partialSums(size);\n MPI_Gather(&partialSum, 1, MPI_DOUBLE, partialSums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partialSums[0];\n for (size_t i = 1; i < size; ++i) {\n output[i] = output[i - 1] + partialSums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum on each rank\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local sums to rank 0\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n for (int i = 1; i < size; ++i) {\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allgather(&local_sum, 1, MPI_DOUBLE, &global_sum, 1, MPI_DOUBLE, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector partial_sum(n);\n MPI_Allreduce(x.data(), partial_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n output = partial_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] - x[i - 1];\n }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\n__global__ void prefixSum(const double *x, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (gid % (2 * i) == 0 && gid + i < N) {\n output[gid] += output[gid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n output[idx] = 0;\n for (int i = 0; i <= idx; i++) {\n output[idx] += x[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 2; stride <= N; stride *= 2) {\n if (tid % stride == 0 && tid + stride / 2 < N) {\n output[tid] += output[tid + stride / 2];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n // Compute the prefix sum of the first bid_size elements in x\n double sum = 0;\n for (int i = tid; i < bid_size; i++) {\n sum += x[i];\n }\n\n // Store the prefix sum in the output array\n output[tid] = sum;\n\n // Synchronize the threads in the block\n __syncthreads();\n\n // Compute the prefix sum of the remaining elements in x\n for (int i = bid_size; i < N; i += bid_size) {\n // Load the prefix sum of the previous block\n sum = output[i - bid_size];\n\n // Compute the prefix sum of the current block\n for (int j = tid; j < bid_size; j++) {\n sum += x[i + j];\n }\n\n // Store the prefix sum in the output array\n output[i + tid] = sum;\n\n // Synchronize the threads in the block\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n // Compute the prefix sum\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n if (gid < N) {\n output[gid] = x[gid];\n }\n\n __syncthreads();\n\n for (int offset = 1; offset < N; offset *= 2) {\n if (gid + offset < N) {\n output[gid + offset] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n 
__syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n // Each thread computes the prefix sum of a subset of the elements in x.\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n // Compute the prefix sum of the current block\n double block_sum = 0;\n for (int i = tid; i < N; i += bid_size) {\n block_sum += x[i];\n output[i] = block_sum;\n }\n\n // Synchronize the threads in the block\n __syncthreads();\n\n // Compute the prefix sum of the previous blocks\n if (tid == 0) {\n for (int i = bid + 1; i < N; i += bid_size) {\n output[i] += output[i - 1];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "(", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int size = blockDim.x;\n int stride = size * gridDim.x;\n int start = tid + bid * stride;\n double sum = 0;\n for (int i = start; i < N; i += stride) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "_t", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the prefix sum of the vector x into output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Store the result in output on rank 0.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+: local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Gather the local prefix sums\n std::vector global_sum(x.size(), 0);\n MPI_Allreduce(output.data(), global_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM);\n\n // Store the global prefix sum on rank 0\n if (rank == 0) {\n output = global_sum;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n // Compute the local prefix sum\n std::vector local_prefix_sum(n);\n local_prefix_sum[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n // Compute the global prefix sum\n std::vector global_prefix_sum(n);\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n // Store the result on rank 0\n if (rank == 0) {\n output = global_prefix_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Reduce(partial_sum.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Bcast(output.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n #pragma omp parallel for\n for (int j = 1; j < n; j++) {\n output[j] += output[j - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes the sum of its local data\n double local_sum = 0;\n #pragma omp parallel for reduction(+: local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n // Gather the local sums into the output vector on rank 0\n MPI_Gather(&local_sum, 1, MPI_DOUBLE, output.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n \n // Rank 0 computes the prefix sum\n if (rank == 0) {\n for (size_t i = 1; i < size; ++i) {\n output[i] += output[i - 1];\n }\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the partial sum on each rank\n double partialSum = 0;\n #pragma omp parallel for reduction(+:partialSum)\n for (int i = 0; i < x.size(); i++) {\n partialSum += x[i];\n }\n\n // Gather the partial sums from all ranks\n std::vector partialSums(numRanks);\n MPI_Gather(&partialSum, 1, MPI_DOUBLE, partialSums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (myRank == 0) {\n output[0] = 0;\n for (int i = 1; i < numRanks; i++) {\n output[i] = output[i - 1] + partialSums[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce local prefix sums to get global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce local prefix sums to global prefix sum\n MPI_Reduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n, 0);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n if (i > 0) {\n partial_sum[i] += partial_sum[i - 1];\n }\n }\n\n std::vector global_sum(n, 0);\n MPI_Allreduce(partial_sum.data(), 
global_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0.0;\n #pragma omp parallel for reduction(+:partial_sum)\n for (int i = 0; i < x.size(); ++i) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; ++i) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n if (i > 0) {\n partial_sum[i] += partial_sum[i - 1];\n }\n }\n\n MPI_Reduce(partial_sum.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": 
"transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0319042578, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.036515544, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0180982732, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091288664, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004625072, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023156362, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011586392, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0319615354, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0365848942, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0181088108, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091815618, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004594314, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002313973, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00115678, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) *= -1;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0309750274, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0309062398, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.015624582, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0079604606, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038938918, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019143516, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00097285, + "num_threads": 32 + } + ] + }, + { + 
"generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0305105946, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0360616366, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0180108508, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091708592, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045629416, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022910832, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011426934, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0336048918, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0361378138, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0179187536, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091230578, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045510712, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022573186, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011351314, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n int &xi = x(i);\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0369538452, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0235577736, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0118660064, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005904765, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029759844, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014783, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000749915, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0312150904, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0350958596, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01841155, + "num_threads": 
2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0094894418, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0047579856, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002276683, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001108257, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n int &xi = x(i);\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0367860824, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0234494102, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0118616064, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0059193064, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029963936, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014612622, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000746024, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.032984458, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0360140172, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0180673372, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0090483514, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045330032, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022579756, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011223516, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0328237438, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.036041533, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0180247552, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.009051621, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045500432, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022654014, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011355702, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0318930044, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0362383082, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01805298, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091937424, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0046103022, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023128468, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011304846, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n int val = x(i);\n if (val % 2 == 0) {\n x(i) = val / 2;\n } else {\n x(i) = -val;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0311331662, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0301622978, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0151302692, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0076581924, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037996648, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019401004, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009488374, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0318872736, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0363265346, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0183484092, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091409112, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0046520506, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023295462, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011342478, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0323432046, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.037694768, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.018902388, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0095430184, + "num_threads": 4 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.0047523436, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023936888, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011696842, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) = x(i) / 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0311401376, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.040319986, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0202105768, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0102382796, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0053841866, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026734846, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012569524, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.031908251, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.036297652, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0181744744, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0101211694, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045952038, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023096006, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001125315, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0306384966, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0361049004, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0180496078, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091132852, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045187922, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022975958, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001133717, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + 
"did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03236748, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03770531, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0189053042, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0094550316, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004802556, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002414782, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001192339, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kok koste::parallel_for(0, x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.030555513, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.036128274, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0180202966, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.009290196, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045790516, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022819124, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011495926, + "num_threads": 32 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "omp", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + ":", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; 
i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma\u00a0omp\u00a0parallel\u00a0for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] *= -1;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "serial", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "negate, Synag", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for 
(int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (auto &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "negate, Synag", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "hip", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\n__global__ void negateOddsAndHalveEvens(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipGetGlobalThreadIdx();\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0)\n x[tid] /= 2;\n else\n x[tid] = -x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n int tid = 
hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "E", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0,\n MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0,\n MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[0], n / size, MPI_INT, &x[0], n / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Negate odd values\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n }\n }\n\n // Halve even values\n 
for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n }\n }\n\n // Gather results on rank 0\n if (rank == 0) {\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_INT, result.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n x = result;\n } else {\n MPI_Gather(nullptr, 0, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its portion of x.\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(x.data(), x.size(), MPI_INT, x.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data(), x.size(), MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its portion of the vector.\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "cuda", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\n__global__ void negateOddsAndHalveEvens(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0)\n x[tid] /= 2;\n else\n x[tid] = -x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int n_local = n / nprocs;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], n_local, MPI_INT, &x[0], n_local, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its part of the vector\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], end - start, MPI_INT, &x[0], end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int local_size = x.size() / size;\n int remain = x.size() % size;\n int local_start = rank * local_size;\n int local_end = local_start + local_size;\n if (rank < remain) {\n local_end++;\n }\n\n#pragma omp parallel for\n for (int i = local_start; i < local_end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(MPI_IN_PLACE, local_end - local_start, MPI_INT, x.data(), local_end - local_start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int myRank, nProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &nProcs);\n\n int n = x.size();\n int chunkSize = (n + nProcs - 1) / nProcs;\n int begin = myRank * chunkSize;\n int end = std::min(begin + chunkSize, n);\n\n #pragma omp parallel for\n for (int i = begin; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[begin], end - begin, MPI_INT, &x[0], end - begin, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] = x[i] / 2;\n }\n }\n MPI_Gather(&x[start], chunk, MPI_INT, &x[0], chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its part of the vector\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int local_n = n / nprocs;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if 
(x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, n_local, MPI_INT, x.data(), n_local, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], chunk, MPI_INT, &x[0], chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "kokkos", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(Kokkos::View const& x, Kokkos::View &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0738573346, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0270651208, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0130885158, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006606957, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033510278, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001664602, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008445032, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0739541566, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0263688504, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0131227504, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0066854682, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003338045, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016475326, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008351236, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0739540002, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0219776642, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01093985, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0055528196, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027922192, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013716618, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006903956, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + 
"are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0738056614, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.026320201, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0131924512, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006694884, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033089508, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016710364, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000857055, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0737637488, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0264127872, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0132584552, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0066788804, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033413576, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016504236, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00085235, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0738874808, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0264376136, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.013218334, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0066445074, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033031816, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016705332, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008642682, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\tKok\u96f9::ParallelFor(Kok\u96f9::Range(x.size()), [&](int i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t});\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0737782938, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0264487384, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0133014922, + "num_threads": 2 + }, + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.006706049, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033120226, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016931118, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008677632, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.073832037, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0263912404, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0132978032, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0067153626, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033677714, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001727693, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008314988, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const int& i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t});\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0739880522, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0219624772, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0109786848, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005668924, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028314072, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014071508, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000705033, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.07399318, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0219447946, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0110183814, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0055384344, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027762856, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013951458, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007018992, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + 
"are_all_valid": true, + "best_sequential_runtime": 0.0740121514, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0219364244, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0110182156, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0055430154, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027744942, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001377691, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000687063, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0738591864, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0264213054, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0132374358, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0066500196, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033693104, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017271958, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008633826, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", mask.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0739778852, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0258089484, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0128824808, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0065497694, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003261776, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016228164, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008547124, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0738487064, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0257574596, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0129750146, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0064762172, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032465694, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001661715, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008176488, + "num_threads": 32 + } + ] + }, + { + 
"generated_output": "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.073880474, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0264546676, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0132153422, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0066499156, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033490584, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016827966, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008461362, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0738701546, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.026426689, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0132920164, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006655316, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033497874, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016597734, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008656968, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0738446586, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0264710086, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0132826312, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0067000374, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003332523, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016755228, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008577482, + "num_threads": 32 + } + ] + }, + { + "generated_output": ",", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.073929909, + "runs": [ + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.0265285988, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0132126216, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0066277876, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033020458, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001673329, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008488434, + "num_threads": 32 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "omp", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n#pragma omp 
parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "serial", + "prompt": "bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t mask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tmask.reserve(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.resize(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tmask.reserve(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t mask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "hip", + "prompt": "__device__ bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\n__global__ void mapPowersOfTwo(const int *x, bool *mask, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = get_global_id(0);\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + ", 0, 9, 7, 15, 64, 3]", + ", 0, 9, 7, 15, 64, 3]", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "mpi", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored in mask on rank 0.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint chunk_size = (n + size - 1) / size;\n\tint start = rank * chunk_size;\n\tint end = std::min(start + chunk_size, n);\n\n\tstd::vector local_mask(end - start);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), end - start, MPI_BOOL, mask.data(), end - start, MPI_BOOL, 0);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tmask.resize(n);\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), n / size, MPI_BOOL, mask.data(), n / size, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_C_BOOL, mask.data(), local_n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tmask.resize(x.size());\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), mask.size(), MPI_C_BOOL, nullptr, 0, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_C_BOOL, mask.data(), local_n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tstd::vector localMask(x.size());\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tif (rank == 0) {\n\t\tmask = localMask;\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tstd::vector recvMask(x.size());\n\t\t\tMPI_Recv(&recvMask[0], x.size(), MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t\tfor (size_t j = 0; j < x.size(); j++) {\n\t\t\t\tmask[j] = mask[j] || recvMask[j];\n\t\t\t}\n\t\t}\n\t} else {\n\t\tMPI_Send(&localMask[0], x.size(), MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the result\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), mask.size(), MPI_BOOL, mask.data(), mask.size(), MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the mask\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t}\n\telse {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the mask\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint 
rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_BOOL, mask.data(), local_n, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "cuda", + "prompt": "__device__ bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\n__global__ void mapPowersOfTwo(const int *x, bool *mask, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\twhile (i < N) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t\ti += blockDim.x * gridDim.x;\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored in mask on rank 0.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own mask\n\tstd::vector local_mask(x.size());\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the local masks into the final mask on rank 0\n\tif (rank == 0) {\n\t\tmask.resize(x.size());\n\t}\n\tMPI_Gather(local_mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < static_cast(x.size()); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) 
{\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own mask\n\tstd::vector localMask(x.size());\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Rank 0 gathers all the local masks\n\tif (rank == 0) {\n\t\tmask.resize(x.size());\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tstd::vector recvMask(x.size());\n\t\t\tMPI_Recv(recvMask.data(), x.size(), MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t\tfor (int j = 0; j < x.size(); j++) {\n\t\t\t\tmask[j] |= recvMask[j];\n\t\t\t}\n\t\t}\n\t} else {\n\t\tMPI_Send(localMask.data(), x.size(), MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint n = x.size();\n\tmask.resize(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), n, MPI_BOOL, mask.data(), n, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its local part of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < 
n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the result\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the result\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * mask.size() / size], mask.size() / size, MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * mask.size() / size], mask.size() / size, MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0233790982, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02497432, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.012312476, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0062710236, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031320138, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015418018, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000776768, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0233387142, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0248241254, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.012356867, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0063208994, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031453414, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016018664, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000775349, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0223184604, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0247338802, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.012338216, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0062280606, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031504404, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001546216, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000779593, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0223285212, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.024764283, 
+ "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0123516332, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0062672766, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032228712, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015703914, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007791164, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0222816552, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0247304214, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.012418016, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0062505214, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031176454, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016030204, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000778778, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0235536698, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0245538534, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0123471566, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0062298598, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003127213, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015456148, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007777716, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0223024604, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0245804354, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0123205246, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0062453088, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031903236, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001534077, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007750164, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0235238436, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0245613334, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0123566242, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0063094316, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031542438, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015471858, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007824488, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.021476488, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0264482056, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0141284684, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0071311016, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0035754956, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017882386, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088332, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0233388198, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0246721818, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0123976516, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0062024068, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031381214, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015949916, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007761348, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.022290059, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0246304034, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0122983588, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0065167332, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031220136, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015727196, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000782288, + "num_threads": 
32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0223451608, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0246220656, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0124769106, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006180187, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031039076, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015813518, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000774147, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0233892348, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0245808144, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.012318168, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0063947862, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031406164, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015482436, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007753532, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0221790946, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0246814352, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0122368216, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0063162284, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032563396, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015572384, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007754816, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0233591028, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.024894506, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0124153082, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0062462774, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.0031272906, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015568694, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008060564, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0214566714, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.027919849, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0137386288, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0071036306, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0035732326, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017899554, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008842596, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"oneMinusInverse\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0233281864, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0251001202, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0125777792, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0064165874, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031814102, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015781036, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007942086, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.022252413, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0248145336, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0125397696, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0062971812, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031339512, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015353348, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000786344, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0223106186, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0246560096, + 
"num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.012387331, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0062965698, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031537404, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001565157, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007745874, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.022313117, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0246984216, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.012437668, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0062316676, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031427536, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001545007, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007731892, + "num_threads": 32 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel 
for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "serial", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &element : x) {\n element = 1 - 1 / element;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &i : x) {\n i = 1 - 1 / i;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &val : x) {\n val = 1 - 1 / val;\n }\n}\n", + "\n\n for (double &i : x) {\n i = 1 - 1 / i;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "hip", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\n__global__ void oneMinusInverse(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n x[gid] = 1 - 1 / x[gid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, gathered_x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], x.size() / size, MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i 
< n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = 1 - 1 / x[i];\n }\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE, NULL, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 has the final result\n if (rank == 0) {\n x.resize(x.size() * size);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], 
x.size() / size,\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the inverse of its elements\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], x.size() / size,\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its part of the result\n for (size_t i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "cuda", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\n__global__ void oneMinusInverse(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) { x[tid] = 1.0 - 1.0 / x[tid]; }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int local_size = x.size() / num_procs;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n if (rank == num_procs - 1) {\n end = x.size();\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n MPI_Reduce(x.data(), x.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank == size - 1) ? 
n : start + chunk_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * chunk_size;\n int end = (i == size - 1) ? n : start + chunk_size;\n MPI_Recv(&x[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = global_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n const int n = x.size();\n const int chunk = (n + size - 1) / size;\n const int start = rank * chunk;\n const int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; ++i) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0405082662, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0290277908, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0145474784, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0074892694, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003887265, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019215868, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009656024, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0389566806, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0292826896, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.014699526, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0074656828, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037509626, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001866341, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000936436, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0388937922, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0292312872, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0146285206, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007433224, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037034152, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018502288, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009370268, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0389248044, + "runs": [ + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.029221715, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0147211344, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0074265492, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037005198, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018539076, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009287614, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0389512846, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0291839062, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0148923464, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0073945112, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037079596, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018680326, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000933777, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.040498676, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.030014389, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0153247894, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077837088, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038889238, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019336976, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009736614, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0389557446, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0292645418, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0146683418, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0074759906, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045306134, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018512712, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009287234, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "source_write_success": true, + 
"did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0389450366, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0292048556, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.014611515, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007372051, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0036994678, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018570014, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009283804, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0406906168, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.028832165, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0147287488, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077786772, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0039120674, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001944999, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009656004, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0405121272, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.028815664, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0146105628, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077787272, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0039410754, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030720864, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009667286, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0404833472, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0296944514, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0153511026, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007671147, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0039371242, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019247788, + "num_threads": 16 + }, + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.0009671916, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0405276506, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.028963039, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0154032928, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0075892232, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038484134, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019330504, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009780682, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0389289604, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0293319866, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0146879342, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0074047326, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0036768552, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018829804, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009311178, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0405079034, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.029503102, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0154634114, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007851502, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0039537716, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019441128, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009645762, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.040503806, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0289382284, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.015385922, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": 
true, + "runtime": 0.0077901648, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003877005, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019723866, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009752706, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0405465158, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0292192022, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0144924852, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0075958414, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0039121116, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001933868, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009704156, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0404962134, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0288983794, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01536359, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077673818, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0039276924, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019485236, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009630156, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0405441692, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0289581978, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0148965104, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007622147, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038615562, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019484976, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009779838, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + 
"best_sequential_runtime": 0.040529554, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0294719492, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.014571123, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0076931742, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0039930974, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019646158, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009651116, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.038915006, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.029259863, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0146156444, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0073703656, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038647302, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001870429, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009174802, + "num_threads": 32 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] 
= 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "serial", + "prompt": "/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "hip", + "prompt": "/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\n__global__ void relu(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? 
x[tid] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = fmax(0.0, x[idx]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + ": [-1.8, 24.0, 1.2, 0.0, 5.1, 0.2, 4.5]", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n std::vector recv_buf(x.size());\n for (int i = 1; i < MPI_COMM_WORLD.size(); i++) {\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector temp(x.size());\n MPI_Recv(temp.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += temp[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector 
recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks to rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n 
// Compute the ReLU function on every element of x.\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), 
x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "cuda", + "prompt": "/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\n__global__ void relu(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = fmax(x[tid], 0.0);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = fmax(0, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = fmax(0, x[idx]);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = fmax(0.0, x[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] < 0 ? 0 : x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = fmax(0, x[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int index = blockIdx.x * blockDim.x + threadIdx.x;\n if (index < N) {\n x[index] = fmax(0, x[index]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int tid = threadId.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = fmax(0.0, x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma opm parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Reduce(&x[0], &x[0], x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int chunk_size = n / nprocs;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == nprocs - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, local_size, MPI_DOUBLE, x.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n #pragma omp parallel for\n for (int i = start_index; i < end_index; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start_index], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n if (rank == size - 1) {\n end = x.size();\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < 
x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n MPI_Gather(&x[start], chunk_size, MPI_DOUBLE, &x[0], chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the ReLU function on its portion of x\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each 
rank computes the ReLU function on its own copy of x\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks onto rank 0\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 has the final result\n if (rank == 0) {\n x.resize(x.size() * size);\n MPI_Gatherv(nullptr, 0, MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf;\n MPI_Recv(&recv_buf, x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv_buf.begin(), recv_buf.end());\n }\n } else {\n MPI_Send(&x, x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the ReLU function on every element of x\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < MPI_COMM_WORLD; i++) {\n std::vector recv_buf(n);\n MPI_Recv(recv_buf.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0113945578, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0242696236, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0121403394, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0060556932, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030616172, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015438692, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007567078, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0121806974, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0242864388, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0121465888, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0060833496, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003041437, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015134978, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007564612, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"squareEach\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0133339806, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0345955438, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0172211826, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0086682148, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0043226506, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021712156, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010720044, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0120483744, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0242003954, + "num_threads": 1 + }, + 
{ + "did_run": true, + "is_valid": true, + "runtime": 0.0121222114, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061372104, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030291164, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015149084, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007574874, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0132349904, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0338876714, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0169497586, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008545322, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004296772, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021168562, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010523214, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0121624248, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0242729196, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.012129671, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0060690888, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032340014, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015147128, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007586438, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0133255186, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0339218192, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01695567, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085346358, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004258472, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021177194, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010538242, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + 
"are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0113966318, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0243400118, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0121525, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061117228, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030862614, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015132878, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007566278, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0113578226, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0345067726, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0173410702, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0086785664, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0043638864, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002142498, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010728022, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0132444488, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.034567705, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0171921784, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008670566, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0043226524, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021484784, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010728324, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0133316406, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0337943542, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0170265712, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008592722, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042774674, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021326002, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010540452, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n 
Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0132671354, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0338689524, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0169726658, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008504963, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042723042, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002114236, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012638994, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0113950528, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0242220776, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.012170428, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061236434, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030514832, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015219626, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007584456, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0121137036, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0248565734, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0124913272, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0062802074, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003171539, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015632302, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000780791, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0121046446, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.024963593, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0124343564, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006329829, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031526532, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.0015767894, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000778545, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0132306868, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0337944402, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0169032396, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085481322, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042589206, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020980454, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010533118, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"squareEach\", x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0119826872, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0339156422, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0169488452, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085765836, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004291109, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002153327, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010557542, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0134075032, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0338464662, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0169522692, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008520835, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00434148, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021217378, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006570401, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, x.extent(0)), KOK_LAMBDA(int i) {\n x[i] = x[i] * x[i];\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0121434874, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0249059506, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0124873598, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0063206112, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031620668, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015759144, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007785796, + "num_threads": 32 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n #pragma simd\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "serial", + "prompt": "/* Replace every element of x with the square of its value.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + 
"temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + ":", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &xi : x) {\n xi = xi * xi;\n }\n}\n", + "\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "hip", + "prompt": "/* Replace every element of x with the square of its value.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\n__global__ void squareEach(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int idx = 
hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data(), n / size, MPI_INT, x.data(), n / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_Comm_world, &rank);\n\n // Get the number of processes\n int size;\n MPI_Comm_size(MPI_Comm_world, &size);\n\n // Each process squares its own elements\n for (int i = rank; i < x.size(); i += size) {\n x[i] = x[i] * x[i];\n }\n\n // Gather all the squared elements on rank 0\n MPI_Allgather(MPI_IN_PLACE, 0, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_Comm_world);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank squares its elements\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the squared elements\n std::vector squared;\n if (rank == 0) {\n squared.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, squared.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Rank 0 stores the final result\n if (rank == 0) {\n x = squared;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Status status;\n MPI_Request request;\n MPI_Irecv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int 
rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (int i = rank; i < x.size(); i += size) {\n x[i] *= x[i];\n }\n MPI_Gather(x.data(), x.size() / size, MPI_INT, x.data(), x.size() / size,\n MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Square each element of x\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Gather all the squared elements on rank 0\n std::vector squared_x;\n if (rank == 0) {\n squared_x.resize(x.size() * MPI::COMM_WORLD.Get_size());\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, squared_x.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Copy the squared elements back to x on rank 0\n if (rank == 0) {\n x = squared_x;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector temp(x.size());\n MPI_Recv(&temp[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += temp[j];\n }\n }\n }\n}\n", + "(", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n MPI_Gather(x.data(), n / size, MPI_INT, x.data(), n / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = 
x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector tmp(n);\n MPI_Recv(tmp.data(), n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[0], n / size, MPI_INT, &x[0], n / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "cuda", + "prompt": "/* Replace every element of x with the square of its value.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\n__global__ void squareEach(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = thread_idx_1;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace every element of x with the square of its value.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank squares its own elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(n);\n MPI_Recv(&recv[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, chunk, MPI_INT, x.data(), chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 
1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n int recv_buf[n];\n MPI_Recv(&recv_buf[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv_buf[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank squares its own elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector temp(n);\n MPI_Recv(&temp[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += temp[j];\n }\n }\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Each rank squares its elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < MPI_COMM_WORLD; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(n);\n MPI_Recv(&recv[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Square each element of x in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Gather the squared elements 
from all ranks to rank 0\n std::vector squared_x(x.size());\n MPI_Gather(x.data(), x.size(), MPI_INT, squared_x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = squared_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_size, MPI_INT, &x[0], local_size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_size = x.size() / size;\n int start = my_rank * local_size;\n int end = start + local_size;\n if (my_rank == size - 1) {\n end = x.size();\n }\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n MPI_Gather(&x[start], local_size, MPI_INT, &x[0], local_size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use Kokkos to parallelize. 
Assume Kokkos has already been initialized.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, Kokkos::View const& A, Kokkos::View const& x, double beta, Kokkos::View &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0038959866, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 7.4866958708, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 4.2975084926, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 2.2123397968, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.1694215944, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.589990916, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3501974728, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0039345612, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 7.5896603974, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 4.6677253454, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 2.323545042, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.2040655612, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.641268217, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2949853218, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0040592052, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 3.9416433238, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.9891297138, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.994648863, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.4938319982, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2609403632, + "num_threads": 
16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.154978809, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(i); j < A.extent(i+1); ++j) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(i); j > 0; --j) {\n y(i) += alpha * A(i, j - 1).value * x(A(i, j - 1).column);\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0038992986, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 8.3020197018, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 4.2928721064, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 2.58327724, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.1847917664, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.6638389344, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3631785982, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i, double &y_i) {\n y_i = beta * y_i;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n y_i += alpha * A(j).value * x(A(j).column);\n }\n }\n }, y);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0038917894, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 7.4565759502, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 4.0419914542, + "num_threads": 2 + 
}, + { + "did_run": true, + "is_valid": true, + "runtime": 2.2978513428, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.2103776582, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.5990174148, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2949760648, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0040726192, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 3.9447862878, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 2.0811189922, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.9933474738, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.4934066124, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3088617548, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.143558518, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0040612298, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 3.9735404548, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.9954377836, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.007569915, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.6279599876, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.245568652, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1387699354, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0039391824, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 3.949233728, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.9731701322, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.9966298922, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.4949619374, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2475681032, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": 
true, + "runtime": 0.1486255518, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n auto spvmv_lambda = KOKKOS_LAMBDA (const COOElement *a, double *y) {\n y[a->row] += a->value * alpha * x[a->column];\n };\n\n KOKKOS_PARALLEL_EXEC(KOKKOS_RANGE_POLICY(0, A.size), spvmv_lambda, A.data, y);\n\n KOKKOS_PARALLEL_FOR(KOKKOS_RANGE_POLICY(0, M), KOKKOS_LAMBDA(size_t i) {\n y[i] = y[i] * alpha + x[i] * alpha;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmv\", M, KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004312047, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 8.6028467924, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 5.1819099312, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 2.29289469, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.2946961322, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.6224534378, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2891125322, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0039019648, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 8.2910398248, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 4.5199362006, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 2.3011595036, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.1609607218, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.6147281804, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2898375162, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004062278, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 3.9470845244, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.9831268044, + "num_threads": 2 + 
}, + { + "did_run": true, + "is_valid": true, + "runtime": 0.9948026406, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.495492218, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3035200004, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1522318048, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004062601, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 3.9414863314, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.9799446246, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.139486586, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.4937401382, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2457827638, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1382821752, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0039036302, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 8.37963281, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 4.198184767, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 2.4076867484, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.2097344928, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.6403494992, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2906725756, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i, double &y_i) {\n y_i = beta * y_i;\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n y_i += alpha * A(j).value * x(A(j).column);\n }\n }\n }, y);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.row_map(i); j < A.row_map(i + 1); ++j) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, M), [&](size_t i) {\n y[i] = alpha * Kok\u30b3\u30af::reduce(Kok\u30b3\u30af::Range(0, A.size()), 0.0, [&](size_t j, double sum) {\n if (A[j].row == i) {\n return sum + A[j].value * x[A[j].column];\n }\n return sum;\n }) + uyi;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use OpenMP to parallelize.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for schedule(static)\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n if (e.row < M && e.column < N) {\n y[e.row] += alpha 
* e.value * x[e.column];\n }\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] = y[i] * alpha + y[i] *beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n if (e.row < M && e.column < N) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format with sizeA elements.\n A has dimensions MxN, x has N values, and y has M values.\n Use AMD HIP to parallelize. 
The kernel will be launched with at least sizeA threads.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\n__global__ void spmv(double alpha, const COOElement *A, size_t sizeA, const double *x, double beta, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n size_t tid = hipThreadIdx_x;\n if (tid < M) {\n atomicMul(&y[tid], beta);\n }\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < M) {\n double dot = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == tid) {\n dot += A[i].value * x[A[i].column];\n }\n }\n y[tid] = alpha * dot + \u3079\u305f * y[tid];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[blockIdx.x], beta);\n}\n", + "\n\n size_t i = hipThreadIdx_x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n atomicAdd(&y[row], alpha * val * x[col]);\n }\n atomicMul(&y[hipThreadIdx_x], beta);\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = beta * y[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += alpha * A[i].value * x[A[i].column];\n }\n }\n y[row] = sum;\n }\n}\n", + "\nfor (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {\n double dot = 0.0;\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == i) {\n dot += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * dot + \u3079\u305f * y[i];\n }\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + 
"\n\n int i = hipThreadIdx_x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[i], beta);\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double dot = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n dot += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * dot + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t block_size = hipBlockDim_x;\n size_t block_id = hipBlockIdx_x;\n size_t start = block_id * block_size + tid;\n size_t end = min(start + block_size, sizeA);\n\n for (size_t i = start; i < end; i++) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n\n y[tid] *= beta;\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use MPI to parallelize. Assume MPI has already been initialized.\n Every rank has a complete copy of A, x, and y. 
Store the result in y on rank 0.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the result\n for (size_t i = rank; i < M; i += size) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[i * M], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[rank * M], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per rank\n size_t elements_per_rank = A.size() / size;\n\n // Compute the starting index for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = beta * y[i];\n }\n\n // Compute the local portion of A*x\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local contribution to y\n for (size_t i = 0; i < A.size(); ++i) {\n if (A[i].row == rank) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(MPI_IN_PLACE, y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n for (size_t i = rank; i < M; i += size) {\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to the result on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; i++) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is 
responsible for\n size_t rows_per_rank = (M + size - 1) / size;\n\n // Compute the start and end rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, M);\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (auto const& element : A) {\n if (element.row >= start_row && element.row < end_row) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + ":", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the size of the local part of y\n size_t local_M = M / MPI::COMM_WORLD.Get_size();\n size_t start_row = local_M * MPI::COMM_WORLD.Get_rank();\n size_t end_row = start_row + local_M;\n\n // Compute the local part of y\n for (size_t i = start_row; i < end_row; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Gather the local parts of y to rank 0\n MPI::COMM_WORLD.Gather(&y[start_row], local_M, MPI::DOUBLE, &y[0], local_M, MPI::DOUBLE, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own partial sum\n std::vector partial_y(M, 0.0);\n for (auto const& element : A) {\n if (element.row < M / size * rank || element.row >= M / size * (rank + 1)) {\n continue;\n }\n partial_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce partial sums to rank 0\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 scales the result by beta\n if (rank == 0) {\n for (auto& value : y) {\n value *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (COOElement const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n}\n", + "\n\n // Compute the local portion of the result\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the results from all ranks to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Scale the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (auto& value : y) {\n value *= beta;\n }\n }\n}\n", + "\n\n // COO format:\n // A[i] = {row, column, value}\n // where A[i] is the i-th non-zero element in the matrix\n\n // y = alpha*A*x + y\n std::iota(y.begin(), y.end(), 0);\n std::transform(y.begin(), y.end(), x.begin(), y.begin(),\n [alpha, &A](double y, double x) {\n double sum = 0;\n for (auto const& a : A) {\n sum += a.value * (alpha * x);\n }\n return y + sum;\n });\n\n // y = y + (beta*y)\n std::transform(y.begin(), y.end(), y.begin(),\n [beta](double y) {\n return y + (beta * y);\n });\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local contribution to y\n std::vector local_y(M, 0.0);\n for (auto const& element : A) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Add beta*y to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] += ampi * y[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements each rank is responsible for\n size_t elements_per_rank = A.size() / size;\n\n // Compute the starting index of the elements for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index of the elements for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n for (size_t i = rank; i < M; i += size) {\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to the result on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Compute the number of elements per rank\n size_t num_elements_per_rank = A.size() / MPI::COMM_WORLD.Get_size();\n\n // Compute the starting index for this rank\n size_t start_index = num_elements_per_rank * MPI::COMM_WORLD.Get_rank();\n\n // Compute the ending index for this rank\n size_t end_index = start_index + num_elements_per_rank;\n\n // Initialize the local portion of y to beta*y\n std::vector local_y(M, beta * y[MPI::COMM_WORLD.Get_rank()]);\n\n // Perform the sparse matrix-vector multiplication for this rank\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI::COMM_WORLD.Reduce(local_y.data(), y.data(), M, MPI::DOUBLE, MPI::SUM, 0);\n}\n", + "\n\n // Broadcast alpha, beta, M, and N to all ranks\n MPI_Bcast(&alpha, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&beta, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&M, 1, MPI_INT, 0, MPI_COMM_WORLD);\n MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the number of elements per rank\n int num_elements_per_rank = A.size() / MPI_COMM_SIZE;\n\n // Scatter A to all ranks\n std::vector local_A(num_elements_per_rank);\n MPI_Scatter(A.data(), num_elements_per_rank, MPI_BYTE, local_A.data(), num_elements_per_rank, MPI_BYTE, 0, MPI_COMM_WORLD);\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (const auto& element : local_A) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather the local 
contributions to y\n std::vector global_y(M);\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, global_y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Apply beta to y\n if (beta != 0.0) {\n for (auto& element : global_y) {\n element += beta * element;\n }\n }\n\n // Replace y with the global result\n y = std::move(global_y);\n}\n", + "\n\n // Initialize y on all ranks to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute the local contribution of each rank to the result\n for (auto const& element : A) {\n if (element.row < M) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n }\n\n // Reduce the local contributions to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format with sizeA elements.\n A has dimensions MxN, x has N values, and y has M values.\n Use CUDA to parallelize. The kernel will be launched with at least sizeA threads.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\n__global__ void spmv(double alpha, const COOElement *A, size_t sizeA, const double *x, double beta, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n y[blockIdx.x] *= beta;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n y[tid] *= beta;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (gid < sizeA) {\n size_t row = A[gid].row;\n size_t column = A[gid].column;\n double value = A[gid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n\n if (gid < M) {\n atomicAdd(&y[gid], beta * y[gid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n const COOElement &e = A[tid];\n y[e.row] += alpha * e.value * x[e.column];\n }\n y[threadIdx.x] *= beta;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + 
threadIdx.x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n atomicMul(&y[i], beta);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[threadIdx.x], beta);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[blockIdx.x], beta);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (gid < sizeA) {\n const COOElement &a = A[gid];\n y[a.row] += alpha * a.value * x[a.column];\n }\n\n if (gid < M) {\n y[gid] *= beta;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n y[row] += alpha * value * x[column];\n }\n y[threadIdx.x] *= beta;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n y[row] += alpha * value * x[column];\n }\n y[blockIdx.x] *= beta;\n}\n", + "{" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x 
has N values, and y has M values.\n Use MPI and OpenMP to parallelize. Assume MPI has already been initialized.\n Every rank has a complete copy of A, x, and y. Store the result in y on rank 0.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its own partial sum\n std::vector partial_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n partial_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the partial sums to rank 0\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 scales the result by beta\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the result\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (auto const& element : A) {\n if (element.row == i) {\n sum += element.value * x[element.column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&y[0], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[0], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the sparse matrix that each rank is responsible for\n size_t num_elements_per_rank = A.size() / size;\n\n // Compute the starting index of the elements in the sparse matrix that each rank is responsible for\n size_t start_index = rank * num_elements_per_rank;\n\n // Compute the ending index of the elements in the sparse matrix that each rank is responsible for\n size_t end_index = (rank + 1) * num_elements_per_rank;\n\n // Compute the local portion of y that each rank is responsible for\n std::vector local_y(M);\n\n // Compute the local portion of y that each rank is responsible for\n #pragma omp parallel for\n for (size_t i = start_index; i < end_index; i++) {\n const COOElement& element = A[i];\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the local portions of y into the global y vector on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n y[e.row] += 
alpha * e.value * x[e.column];\n }\n}\n", + "\n\n // Initialize y on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::fill(y.begin(), y.end(), 0.0);\n }\n\n // Parallelize over rows of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n COOElement const& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n\n // Parallelize over elements of y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n\n // Gather y from all ranks to rank 0\n MPI::COMM_WORLD.Gather(y.data(), M, MPI::DOUBLE, y.data(), M, MPI::DOUBLE, 0);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of elements each rank is responsible for\n size_t elements_per_rank = A.size() / nprocs;\n\n // Compute the starting index for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local portion of y\n std::vector local_y(M, 0.0);\n\n // Perform the sparse matrix-vector multiplication in parallel\n #pragma omp parallel for\n for (size_t i = start_index; i < end_index; ++i) {\n COOElement element = A[i];\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Compute the local portion of the result\n std::vector local_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Calculate the number of rows per rank\n size_t num_rows_per_rank = (M + size - 1) / size;\n\n // Calculate the starting and ending rows for the current rank\n size_t start_row = rank * num_rows_per_rank;\n size_t end_row = std::min((rank + 1) * num_rows_per_rank, M);\n\n // Initialize the local portion of y to beta*y\n std::vector local_y(num_rows_per_rank, beta * y[rank]);\n\n // OpenMP parallelization\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n local_y[i - start_row] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n\n // Gather the local portions of y into the global y on rank 0\n MPI_Gather(local_y.data(), 
num_rows_per_rank, MPI_DOUBLE, y.data(), num_rows_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of rows per rank\n size_t num_rows_per_rank = (M + MPI_Comm_size(MPI_COMM_WORLD) - 1) / MPI_Comm_size(MPI_COMM_WORLD);\n\n // Compute the starting row for this rank\n size_t start_row = num_rows_per_rank * MPI_Comm_rank(MPI_COMM_WORLD);\n\n // Compute the ending row for this rank\n size_t end_row = std::min(num_rows_per_rank * (MPI_Comm_rank(MPI_COMM_WORLD) + 1), M);\n\n // Initialize the local portion of y to beta*y\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n y[i] = beta * y[i];\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n\n if (row >= start_row && row < end_row) {\n y[row] += alpha * value * x[column];\n }\n }\n\n // Reduce the results from all ranks to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(MPI_IN_PLACE, y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / nprocs; i < (rank + 1) * M / nprocs; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local contribution to y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double local_y = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n local_y += alpha * element.value * x[element.column];\n }\n }\n y[i] = local_y;\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&y[0], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[0], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const 
COOElement& element = A[i];\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather y from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_M = M / size;\n size_t start_row = rank * local_M;\n size_t end_row = start_row + local_M;\n\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[i * local_M], local_M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[start_row], local_M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(Kokkos::View &A, Kokkos::View &X, Kokkos::View &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; k++) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::Range(0, M), KOKKOS_LAMBDA (const size_t 
i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.extent(0); ++l) {\n if (A(l).row == i && A(l).column == k) {\n for (size_t m = 0; m < X.extent(0); ++m) {\n if (X(m).row == k && X(m).column == j) {\n sum += A(l).value * X(m).value;\n }\n }\n }\n }\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a parallel reduction to sum up the sparse matrix multiplication.\n auto policy = KOKKOS_RANGE_POLICY(0, M);\n auto sum_lambda = KOKKOS_LAMBDA(const size_t i, double** sum) {\n for (size_t j = 0; j < N; ++j) {\n sum[i][j] = 0.0;\n }\n };\n auto reduce_lambda = KOKKOS_LAMBDA(const size_t i, const size_t j, const COOElement a, const COOElement b, double** sum) {\n sum[a.row][j] += a.value * b.value;\n };\n auto view = KokKOS::parallel_reduce(\n \"Sparse matrix multiplication\", policy, sum_lambda, reduce_lambda, A, X);\n view.wait();\n\n // Copy the result to the output matrix.\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = view(i, j);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int &i) {\n for (size_t j = 0; j < N; j++) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; k++) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": ".", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, M), [&](size_t i) {\n for (size_t j = 0; 
j < N; ++j) {\n Y[i][j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i][j] += A[i][k].value * X[k][j].value;\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * 
X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; k++) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + ",", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n sum += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n sum += A[l].value * X[m].value;\n break;\n }\n }\n break;\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (auto const& a : A) {\n if (a.row == i && a.column == k) {\n for (auto const& x : X) {\n if (x.row == k && x.column == j) {\n sum += a.value * x.value;\n }\n }\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; k++) {\n for (size_t l = 0; l < A.size(); l++) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); m++) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Y.resize(M * N, 0);\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n for (auto const& e : X) {\n if (A_map.count(e.row) > 0) {\n for (auto const& [column, value] : A_map[e.row]) {\n Y[e.column * M + column] += e.value * value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> row_map;\n for (auto const& element : A) {\n row_map[element.row].push_back(element.value);\n }\n for (auto const& element : X) {\n if (row_map.count(element.column)) {\n for (size_t i = 0; i < row_map[element.column].size(); ++i) {\n Y[element.row * N + i] += row_map[element.column][i] * element.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto& e : A) {\n for (auto& f : X) {\n if (e.column == f.row) {\n Y[e.row * N + f.column] += e.value * f.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M*N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n Y.resize(M * N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot = 0;\n for (size_t k = 0; k < K; ++k) {\n dot += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = dot;\n }\n }\n}\n", + "\n\n Y.resize(M*N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.assign(M * N, 0);\n for (COOElement e : A) {\n for (COOElement f : X) {\n if (e.column == f.row) {\n Y[e.row * N + f.column] += e.value * f.value;\n 
}\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n std::unordered_map> A_row_major;\n for (auto const& e : A) {\n A_row_major[e.row][e.column] = e.value;\n }\n\n std::unordered_map> X_col_major;\n for (auto const& e : X) {\n X_col_major[e.column][e.row] = e.value;\n }\n\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot_product = 0;\n for (size_t k = 0; k < K; ++k) {\n dot_product += A_row_major[i][k] * X_col_major[j][k];\n }\n Y[i * N + j] = dot_product;\n }\n }\n}\n", + "\n\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format with sizeA elements.\n X is a sparse KxN matrix in COO format with sizeX elements. Y is a dense MxN matrix in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\n__global__ void spmm(const COOElement *A, size_t sizeA, const COOElement *X, size_t sizeX, double *Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t k = 0; k < K; k++) {\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == k) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n int i = tid + bid * bdim;\n if (i < sizeA) {\n size_t rowA = A[i].row;\n size_t colA = A[i].column;\n double valA = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == colA) {\n Y[rowA * N + X[j].row] += valA * X[j].value;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n int i = tid + bid * bdim;\n if (i < sizeA) {\n size_t rowA = A[i].row;\n size_t colA = A[i].column;\n double valA = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == colA) {\n Y[rowA * N + X[j].row] += valA * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipGridDim_x;\n int tid_per_block = hipBlockDim_x;\n\n for (int i = tid; i < sizeA; i += tid_per_block * bid_size) {\n int row = A[i].row;\n int col = A[i].column;\n double val = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column < K) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x * bid_size;\n\n for (size_t i = tid; i < sizeA; i += num_threads) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = 
blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n Y[row * N + col] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t a = 0; a < sizeA; ++a) {\n if (A[a].row == row && A[a].column == k) {\n for (size_t x = 0; x < sizeX; ++x) {\n if (X[x].row == k && X[x].column == col) {\n Y[row * N + col] += A[a].value * X[x].value;\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].column) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n for (int i = tid + bid * bdim; i < sizeA; i += gdim * bdim) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n int rowA = A[i].row;\n int colA = A[i].column;\n double valA = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n int rowX = X[j].row;\n int colX = X[j].column;\n double valX = X[j].value;\n if (colA == rowX) {\n atomicAdd(&Y[rowA * N + colX], valA * valX);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x;\n\n for (size_t i = tid; i < sizeA; i += num_threads * bid_size) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == column) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; 
j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and X. Store the result in Y on rank 0.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recv;\n MPI::COMM_WORLD.Recv(recv, M * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < M * N; ++j) {\n Y[j] += recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(Y.data(), M * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0.\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its local contribution to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the local contributions to Y on rank 0.\n if (rank == 0) {\n Y.resize(M * N);\n MPI_Gather(localY.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(localY.data(), M * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions 
to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ",", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the local portion of Y\n std::vector localY(M*N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n \n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M*N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double dot_product = 0;\n for (size_t k = 0; k < K; k++) {\n dot_product += A[i * K + k].value * X[k * N + j].value;\n }\n Y[i * N + j] = dot_product;\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buffer(M * N);\n MPI_Recv(recv_buffer.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n Y[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(Y.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in each row of A and X\n std::vector A_row_counts(M, 0);\n std::vector X_row_counts(K, 0);\n for 
(auto const& element : A) {\n A_row_counts[element.row]++;\n }\n for (auto const& element : X) {\n X_row_counts[element.row]++;\n }\n\n // Compute the offsets for each row of A and X\n std::vector A_offsets(M, 0);\n std::vector X_offsets(K, 0);\n for (size_t i = 1; i < M; i++) {\n A_offsets[i] = A_offsets[i - 1] + A_row_counts[i - 1];\n }\n for (size_t i = 1; i < K; i++) {\n X_offsets[i] = X_offsets[i - 1] + X_row_counts[i - 1];\n }\n\n // Create a buffer to store the local results\n std::vector local_Y(M * N, 0.0);\n\n // Compute the local results\n for (auto const& element : A) {\n for (size_t j = X_offsets[element.column]; j < X_offsets[element.column] + X_row_counts[element.column]; j++) {\n local_Y[element.row * N + X[j].column] += element.value * X[j].value;\n }\n }\n\n // Reduce the local results to the global result\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of elements in the resulting matrix\n size_t num_elements = M * N;\n\n // Initialize the result matrix to zero\n Y.resize(num_elements, 0.0);\n\n // Scatter the result matrix to all ranks\n MPI_Scatter(Y.data(), num_elements, MPI_DOUBLE, Y.data(), num_elements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local part of the result matrix\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local results to the result matrix on rank 0\n MPI_Reduce(Y.data(), num_elements, MPI_DOUBLE, Y.data(), num_elements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double dot_product = 0;\n for (size_t k = 0; k < K; k++) {\n dot_product += A[i * K + k].value * X[k * N + j].value;\n }\n Y[i * N + j] = dot_product;\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buffer(M * N);\n MPI_Recv(recv_buffer.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n Y[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(Y.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.assign(M * N, 0);\n\n // Compute the partial sum on each rank\n std::vector partial_sum(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n partial_sum[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sum(M * N, 0);\n MPI_Allreduce(partial_sum.data(), global_sum.data(), M * N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n Y = global_sum;\n }\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n 
// Compute the matrix multiplication in parallel\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement& a = A[i];\n for (size_t j = 0; j < X.size(); j++) {\n const COOElement& x = X[j];\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0.\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format with sizeA elements.\n X is a sparse KxN matrix in COO format with sizeX elements. Y is a dense MxN matrix in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\n__global__ void spmm(const COOElement *A, size_t sizeA, const COOElement *X, size_t sizeX, double *Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n atomicAdd(&Y[row * N + X[j].column], A[i].value * X[j].value);\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= M) return;\n for (int i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (int j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < K; k++) {\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == row && A[j].column == k) {\n for (size_t l = 0; l < sizeX; l++) {\n if (X[l].row == k && X[l].column == i) {\n Y[row * N + i] += A[j].value * X[l].value;\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int tx = threadIdx.x;\n int bx = blockIdx.x;\n int tid = tx + bx * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t i = 0; i < sizeX; i++) {\n if (X[i].column == column) {\n Y[row * N + X[i].row] += value * X[i].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == A[k].column) {\n Y[row * N + X[j].row] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) 
{\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t k = 0; k < K; ++k) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (X[j].column == k) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t k = 0; k < K; ++k) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row && A[i].column == k) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (X[j].row == k && X[j].column == j) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + k] = sum;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n size_t rowA = A[blockIdx.x].row;\n size_t columnA = A[blockIdx.x].column;\n double valueA = A[blockIdx.x].value;\n\n for (size_t i = 0; i < N; i++) {\n if (columnA == X[threadIdx.x].row) {\n Y[rowA * N + i] += valueA * X[threadIdx.x].value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n atomicAdd(&Y[row * N + X[j].column], A[i].value * X[j].value);\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and X. Store the result in Y on rank 0.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n for (size_t j = 0; j < X.size(); ++j) {\n if (X[j].row == column) {\n localY[row * N + X[j].column] += value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n if (rank == 0) {\n Y.resize(M * N);\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(localY.data(), nullptr, M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n}\n", + ",", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localY[i * N + j] += A[i * K + k].value * X[k * N + j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "_t,", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 
0.0);\n\n // Compute the matrix multiplication in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local row range for this rank\n size_t local_m = M / nprocs;\n size_t start_row = rank * local_m;\n size_t end_row = (rank == nprocs - 1) ? M : start_row + local_m;\n\n // Initialize the local part of Y to zero\n std::vector local_Y(local_m * N, 0.0);\n\n // Perform the matrix multiplication in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_Y[i * N + j] += A[i * K + k].value * X[k * N + j].value;\n }\n }\n }\n\n // Gather the local results into Y on rank 0\n MPI_Gather(local_Y.data(), local_m * N, MPI_DOUBLE, Y.data(), local_m * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector local_Y(M * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n local_Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of rows of A and X owned by each rank\n size_t nrows_A = A.size() / nprocs;\n size_t nrows_X = X.size() / nprocs;\n\n // Compute the starting row index of A and X owned by each rank\n size_t start_row_A = rank * nrows_A;\n size_t start_row_X = rank * nrows_X;\n\n // Compute the local portion of Y owned by each rank\n std::vector local_Y(M * N, 0.0);\n\n // Compute the local portion of Y owned by each rank\n #pragma omp parallel for\n for (size_t i = 0; i < nrows_A; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n if (A[start_row_A + 
i].column == X[start_row_X + k].row) {\n local_Y[A[start_row_A + i].row * N + j] += A[start_row_A + i].value * X[start_row_X + k].value;\n }\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix-matrix product in parallel\n #pragma mpi_for_schedule(static)\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& a = A[i];\n for (size_t j = 0; j < X.size(); ++j) {\n const COOElement& x = X[j];\n if (a.column == x.row) {\n #pragma openmp atomic\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(MPI_IN_PLACE, Y.size(), MPI_DOUBLE, Y.data(), Y.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the partial sum for each row of Y on each rank\n std::vector partial_Y(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n partial_Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_Y(M * N, 0.0);\n MPI_Reduce(partial_Y.data(), global_Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Store the result in Y on rank 0\n if (rank == 0) {\n Y = global_Y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t k = 0; k < K; ++k) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * 
X[m].value;\n }\n }\n }\n }\n }\n }\n }\n\n // Reduce the result to rank 0\n MPI_Reduce(MPI_IN_PLACE, Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n for (size_t j = 0; j < X.size(); j++) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(Kokkos::View &A, Kokkos::View &b, Kokkos::View &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::View r(\"r\", N);\n Kokkos::View z(\"z\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n r[i] = b[i];\n z[i] = 0.0;\n });\n\n for (size_t iter = 0; iter < N; ++iter) {\n for (size_t i = 0; i < A.extent(0); ++i) {\n const auto &e = A[i];\n r[e.row] -= e.value * z[e.column];\n }\n for (size_t i = 0; i < A.extent(0); ++i) {\n const auto &e = A[i];\n z[e.column] += e.value * r[e.row];\n }\n }\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) { x[i] = r[i]; });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2986717806, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.4179890726, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.4260370136, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.4191548918, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.4346775904, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.4159930116, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.4126278872, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"solve_linear_system\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::View r(\"r\", N);\n Kokkos::View z(\"z\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n r[i] = 
b[i];\n z[i] = 0.0;\n });\n for (size_t k = 0; k < A.extent(0); ++k) {\n const COOElement &e = A[k];\n r[e.column] -= e.value * z[e.row];\n }\n for (size_t k = 0; k < A.extent(0); ++k) {\n const COOElement &e = A[k];\n z[e.row] += e.value * r[e.column];\n }\n Kokkos::parallel_for(\"copy\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = z[i];\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kok\uf09e::View A_h(A.data(), A.extent(0));\n Kok\uf09e::View b_h(b.data(), b.extent(0));\n Kok\uf09e::View x_h(x.data(), x.extent(0));\n\n Kok\uf09e::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n x_h[i] = b_h[i];\n for (size_t j = A_h(i, 0); j < A_h(i, 1); ++j) {\n x_h[i] -= A_h(i, 2 + j) * x_h[A_h(i, 2 + j)];\n }\n x_h[i] /= A_h(i, 1);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_coo\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::View x_host(\"x_host\", N);\n Kokkos::parallel_for(\"solve\", N, KOKKOS_LAMBDA(size_t i) {\n x_host(i) = 0.0;\n });\n Kokkos::deep_copy(x, x_host);\n Kokkos::sparse_gmres(A_sparse, x, b, 1e-6, 100);\n\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a sparse matrix from the COO format\n Kokkos::View A_sparse(\"A_sparse\", N, N);\n Kokkos::parallel_for(\"init_A\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n A_sparse(i, j) = 0.0;\n }\n });\n Kokkos::parallel_for(\"fill_A\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n A_sparse(A(i).row, A(i).column) = A(i).value;\n });\n\n // Solve the linear system\n Kokkos::View x_temp(\"x_temp\", N);\n Kokkos::sparse::cg(A_sparse, b, x_temp, 1e-6, 100);\n\n // Copy the solution to x\n Kokkos::parallel_for(\"copy_x\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x(i) = x_temp(i);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * b(A(j).column);\n }\n }\n x(i) = sum;\n 
});\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_COO\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::View A_sparse(\"A_sparse\", N, N);\n Kokkos::parallel_for(\"initialize_A\", N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; j++) {\n A_sparse(i, j) = 0.0;\n }\n });\n Kokkos::parallel_for(\"populate_A\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n A_sparse(row_indices(i), column_indices(i)) = values(i);\n });\n\n // Solve the linear system\n Kokkos::View x_temp(\"x_temp\", N);\n Kokkos::sparse::CrsMatrix A_crs(N, N, A_sparse);\n Kokkos::sparse::solve(\"CrsMatrix\", A_crs, b, x_temp);\n\n // Copy the result to x\n Kokkos::parallel_for(\"copy_result\", N, KOKKOS_LAMBDA(size_t i) {\n x(i) = x_temp(i);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values = Kokkos::View(A.data(), A.extent(0));\n Kokkos::View row_indices = Kokkos::View(A.data() + 1, A.extent(0));\n Kokkos::View column_indices = Kokkos::View(A.data() + 2, A.extent(0));\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::SparseSolveCG(A_sparse, x, b, 1e-6, 1000);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x(i) = b(i);\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x(A(j).column) -= A(j).value * x(i);\n }\n }\n x(i) /= A(i).value;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + 
"generated_output": "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View col_indices(\"col_indices\", A.extent(0));\n Kokkos::parallel_for(A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values[i] = A(i).value;\n row_indices[i] = A(i).row;\n col_indices[i] = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, col_indices);\n\n // Solve the linear system\n Kokkos::View x_copy(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::SparseSolveCG>(A_sparse, x_copy, b, 1e-6, 1000);\n Kokkos::deep_copy(x, x_copy);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(j).row == i) {\n x[A(j).column] -= A(j).value * x[i];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n using team_member = KokKOS::TeamMember;\n using team_view = KokKOS::TeamView;\n using team_scheduler = KokKOS::TeamScheduler;\n\n team_scheduler team(N);\n\n team.parallel_for(\n \"solve_linear_system\",\n 
KOKKOS_LAMBDA(const team_member member) {\n const size_t row = member.league_rank();\n double sum = 0;\n for (size_t i = 0; i < A.size(); ++i) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_coo\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::View x_copy(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::sparse_solve(A_sparse, x_copy, b);\n\n // Copy the solution back to x\n Kokkos::deep_copy(x, x_copy);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. 
x and b are dense vectors with N elements.\n Use OpenMP to compute in parallel.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + ".", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n COOElement const& e = A[i];\n x[e.column] -= e.value * x[e.row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t k = 0; k < A.size(); ++k) {\n const COOElement& e = A[k];\n x[e.column] -= e.value * x[e.row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + 
"\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] -= element.value * x[element.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] -= element.value * x[element.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n // Initialize x to 0\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = 0;\n }\n\n // Iterate over the elements of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n // Add A[i].value * b[A[i].column] to x[A[i].row]\n x[A[i].row] += A[i].value * b[A[i].column];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. 
x and b are dense vectors with N elements.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector y(N, 0);\n for (auto const& e : A) {\n y[e.column] += e.value * b[e.row] / row_sums[e.row];\n }\n x = y;\n}\n", + "\n\n std::vector r(N, 0);\n std::vector row_start(N + 1, 0);\n for (auto const& e : A) {\n row_start[e.row + 1]++;\n }\n for (size_t i = 1; i <= N; i++) {\n row_start[i] += row_start[i - 1];\n }\n for (auto const& e : A) {\n r[row_start[e.row]++] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = row_start[i]; j < row_start[i + 1]; j++) {\n sum += r[j] * x[j];\n }\n x[i] = (b[i] - sum) / r[row_start[i]];\n }\n}\n", + "\n\n std::vector r(N, 0);\n for (auto const& e : A) {\n r[e.row] += e.value * x[e.column];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = (b[i] - r[i]) / A[i].value;\n }\n}\n", + "\n\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / rowSum[i];\n }\n}\n", + "\n\n std::vector r(N, 0);\n for (auto const& e : A) {\n r[e.row] += e.value * b[e.column];\n }\n x.resize(N);\n for (size_t i = 0; i < N; ++i) {\n x[i] = r[i] / A[i].value;\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (const COOElement& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector y(N, 0);\n for (const COOElement& e : A) {\n y[e.row] += e.value * b[e.column] / row_sum[e.row];\n }\n x = y;\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector diag(N, 0);\n std::vector offdiag(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] += e.value;\n } else {\n offdiag[e.row] += e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / (diag[i] - offdiag[i]);\n }\n for (auto const& e : A) {\n if (e.row != e.column) {\n x[e.column] -= e.value * x[e.row] / (diag[e.column] - offdiag[e.column]);\n }\n }\n}\n", + "\n\n std::vector> A_dense(N, std::vector(N, 0));\n for (auto const& e : A) {\n A_dense[e.row][e.column] = e.value;\n }\n std::vector L(N, 0), U(N, 0);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n L[i] += A_dense[i][j] * U[j];\n }\n U[i] = b[i] - L[i];\n }\n x[N - 1] = U[N - 1] / A_dense[N - 1][N - 1];\n for (size_t i = N - 2; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_dense[i][j] * x[j];\n }\n x[i] = (U[i] - sum) / A_dense[i][i];\n }\n}\n", + "\n\n std::vector diag(N, 0);\n std::vector lower(N, 0);\n std::vector upper(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n } else if (e.row < e.column) {\n lower[e.row] += e.value;\n } else {\n upper[e.row] += e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n double d = diag[i];\n if (d == 0) {\n throw std::runtime_error(\"Matrix is singular\");\n }\n x[i] = (b[i] - lower[i] - upper[i]) / d;\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sum[i];\n }\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector diag(N, 0);\n for 
(auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / diag[i];\n for (auto const& e : A) {\n if (e.row == i && e.column != i) {\n x[e.column] -= e.value * x[i] / row_sums[e.column];\n }\n }\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& element : A) {\n row_sum[element.row] += element.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sum[i];\n }\n}\n", + "\n\n std::vector diag(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n }\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n for (size_t j = e.column + 1; j < N; j++) {\n x[j] -= e.value * x[e.column];\n }\n }\n }\n x[i] /= diag[i];\n }\n for (size_t i = N - 1; i > 0; i--) {\n for (auto const& e : A) {\n if (e.column == i) {\n for (size_t j = 0; j < e.row; j++) {\n x[j] -= e.value * x[e.row];\n }\n }\n }\n }\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sums[i];\n }\n}\n", + "\n\n std::vector rowSum(N, 0.0);\n for (COOElement const& e : A) {\n rowSum[e.row] += e.value;\n }\n std::vector y(N, 0.0);\n for (COOElement const& e : A) {\n y[e.row] += e.value * b[e.column] / rowSum[e.row];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] - y[i];\n }\n}\n", + "\n\n std::vector> A_dense(N, std::vector(N, 0));\n for (auto const& e : A) {\n A_dense[e.row][e.column] = e.value;\n }\n std::vector x_dense(N, 0);\n solveLinearSystemDense(A_dense, b, x_dense);\n x = x_dense;\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n\n std::vector y(N, 0);\n for (auto const& e : A) {\n y[e.column] += e.value * b[e.row] / row_sums[e.row];\n }\n\n x = y;\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector diag_inv(N, 0);\n for (size_t i = 0; i < N; ++i) {\n diag_inv[i] = 1.0 / row_sums[i];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] * diag_inv[i];\n for (auto const& e : A) {\n if (e.row == i) continue;\n x[e.row] -= e.value * x[i] * diag_inv[e.row];\n }\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector x_tmp(N, 0);\n for (auto const& e : A) {\n x_tmp[e.column] += e.value * b[e.row] / row_sum[e.row];\n }\n x = x_tmp;\n}\n", + "\n\n std::vector diag(N, 0);\n std::vector offdiag(N, 0);\n std::vector rowptr(N + 1, 0);\n for (auto const& e : A) {\n rowptr[e.row + 1]++;\n if (e.row == e.column)\n diag[e.row] += e.value;\n else\n offdiag[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n rowptr[i + 1] += rowptr[i];\n }\n std::vector L(rowptr[N]);\n std::vector U(rowptr[N]);\n for (auto const& e : A) {\n if (e.row == e.column)\n continue;\n if (e.row < e.column)\n L[rowptr[e.row]++] = {e.row, e.column, e.value};\n else\n U[rowptr[e.column]++] = {e.row, e.column, e.value};\n }\n std::vector y(N, 0);\n for (size_t i = 0; i < N; i++) {\n y[i] = b[i];\n for (size_t j = rowptr[i]; j < rowptr[i + 1]; j++) {\n y[i] -= L[j].value * x[L[j].column];\n }\n y[i] /= diag[i];\n }\n for (size_t i = N - 1; i < N; i--) {\n x[i] = y[i];\n for (size_t j = rowptr[i + 1]; j < rowptr[i + 2]; j++) {\n x[i] -= U[j].value * x[U[j].column];\n }\n x[i] /= diag[i];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", 
+ "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format with sizeA elements. x and b are dense vectors with N elements.\n Use AMD HIP to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\n__global__ void solveLinearSystem(const COOElement *A, size_t sizeA, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t col = A[tid].column;\n double val = A[tid].value;\n atomicAdd(&x[col], val * b[row]);\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n x[i] = b[i];\n }\n __syncthreads();\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].column == i) {\n x[A[j].row] -= A[j].value * x[i];\n }\n }\n __syncthreads();\n if (i < N) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = b[i];\n }\n __syncthreads();\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == i) {\n x[A[k].column] -= A[k].value * x[i];\n }\n }\n __syncthreads();\n if (i < N) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int tid = 
threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&x[column], value * b[row]);\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + ".", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = 0;\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == i) {\n x[i] += A[j].value * b[A[j].column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n \n // Create a CSR matrix from the COO format\n std::vector row_offsets(N + 1, 0);\n std::vector column_indices;\n std::vector values;\n for (auto const& element : A) {\n row_offsets[element.row + 1]++;\n column_indices.push_back(element.column);\n values.push_back(element.value);\n }\n for (size_t i = 1; i < N + 1; i++) {\n row_offsets[i] += row_offsets[i - 1];\n }\n \n // Create a dense vector for the solution\n x.resize(N);\n \n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(values[row_offsets[j] + i]) > std::abs(values[row_offsets[pivot_row] + i])) {\n pivot_row = j;\n }\n }\n \n // Swap the pivot row with the current row\n if (pivot_row != i) {\n std::swap(row_offsets[pivot_row], row_offsets[i]);\n std::swap(column_indices[row_offsets[pivot_row] - 1], column_indices[row_offsets[i] - 1]);\n std::swap(values[row_offsets[pivot_row] - 1], values[row_offsets[i] - 1]);\n }\n \n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = values[row_offsets[j] + i] / values[row_offsets[i] + i];\n for (size_t k = i; k < N; k++) {\n values[row_offsets[j] + k] -= factor * values[row_offsets[i] + k];\n }\n b[j] -= factor * b[i];\n }\n }\n \n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= values[row_offsets[i] + j] * x[j];\n }\n x[i] /= values[row_offsets[i] + i];\n }\n \n // Gather the solution from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, comm);\n }\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a vector object from the dense vector\n Eigen::VectorXd vector_b(N);\n for (size_t i = 0; i < N; ++i) {\n vector_b(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd vector_x = matrix.ldlt().solve(vector_b);\n\n // Copy the result to the output vector\n for (size_t i = 0; i < N; ++i) {\n x[i] = vector_x(i);\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a dense vector from b\n Eigen::VectorXd b_vector(N);\n for (size_t i = 0; i < N; ++i) {\n b_vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_vector = matrix.ldlt().solve(b_vector);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_vector(i);\n }\n }\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n\n // Create a vector object 
from the dense vector\n Eigen::VectorXd vector_b(N);\n for (size_t i = 0; i < N; ++i) {\n vector_b(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd vector_x = matrix.ldlt().solve(vector_b);\n\n // Copy the result to the output vector\n for (size_t i = 0; i < N; ++i) {\n x[i] = vector_x(i);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a sparse matrix object\n MPI_Sparse_matrix A_mpi;\n MPI_Sparse_matrix_create_coo(MPI_COMM_WORLD, N, N, A.size(), &A[0], &A_mpi);\n\n // Create a dense vector object for b\n MPI_Vector b_mpi;\n MPI_Vector_create(MPI_COMM_WORLD, N, &b_mpi);\n MPI_Vector_set_values(b_mpi, N, &b[0]);\n\n // Create a dense vector object for x\n MPI_Vector x_mpi;\n MPI_Vector_create(MPI_COMM_WORLD, N, &x_mpi);\n\n // Solve the linear system\n MPI_Sparse_solve(A_mpi, b_mpi, x_mpi);\n\n // Get the result from rank 0\n if (rank == 0) {\n MPI_Vector_get_values(x_mpi, N, &x[0]);\n }\n\n // Free the MPI objects\n MPI_Sparse_matrix_free(&A_mpi);\n MPI_Vector_free(&b_mpi);\n MPI_Vector_free(&x_mpi);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Scatter b to all ranks\n std::vector local_b(b.begin() + rank * b.size() / size, b.begin() + (rank + 1) * b.size() / size);\n\n // Solve the local system Ax=b for x\n std::vector local_x(N, 0.0);\n for (auto const& e : A) {\n local_x[e.row] += e.value * local_b[e.column];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_x(N, 0.0);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n \n int *rows = new int[A.size()];\n int *cols = new int[A.size()];\n double *vals = new double[A.size()];\n for (size_t i = 0; i < A.size(); i++) {\n rows[i] = A[i].row;\n cols[i] = A[i].column;\n vals[i] = A[i].value;\n }\n \n // Create a sparse matrix object\n int nrows = N;\n int ncols = N;\n int nnz = A.size();\n int *ia = new int[nrows + 1];\n int *ja = new int[nnz];\n double *a = new double[nnz];\n \n // Convert the COO format to CSR format\n ia[0] = 0;\n for (int i = 0; i < nnz; i++) {\n ja[i] = cols[i];\n a[i] = vals[i];\n }\n for (int i = 1; i < nrows + 1; i++) {\n ia[i] = ia[i - 1];\n for (int j = ia[i - 1]; j < nnz; j++) {\n if (ja[j] >= i) {\n ia[i] = j;\n break;\n }\n }\n }\n \n // Create a parallel sparse matrix object\n Mat A_csr(nrows, ncols, nnz, ia, ja, a);\n \n // Create a parallel dense vector for b\n Vec b_vec(nrows);\n for (int i = 0; i < nrows; i++) {\n b_vec[i] = b[i];\n }\n \n // Create a parallel dense vector for x\n Vec x_vec(nrows);\n \n // Solve the linear system\n A_csr.solve(b_vec, x_vec);\n \n // Gather the solution from all ranks to rank 0\n if (rank == 0) {\n for (int i = 0; i < nrows; i++) {\n x[i] = x_vec[i];\n }\n }\n \n // Clean up\n delete[] rows;\n delete[] cols;\n delete[] vals;\n delete[] ia;\n delete[] ja;\n 
delete[] a;\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the number of elements in each row of A\n std::vector rowCounts(N, 0);\n for (auto const& e : A) {\n rowCounts[e.row]++;\n }\n\n // Compute the offsets of each row in the CSR format\n std::vector rowOffsets(N, 0);\n for (size_t i = 1; i < N; i++) {\n rowOffsets[i] = rowOffsets[i - 1] + rowCounts[i - 1];\n }\n\n // Convert A to CSR format\n std::vector colIndices(A.size());\n std::vector values(A.size());\n for (auto const& e : A) {\n colIndices[rowOffsets[e.row] + rowCounts[e.row] - 1] = e.column;\n values[rowOffsets[e.row] + rowCounts[e.row] - 1] = e.value;\n rowCounts[e.row]--;\n }\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivotRow = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(values[rowOffsets[j] + rowCounts[j] - 1]) > std::abs(values[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1])) {\n pivotRow = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivotRow != i) {\n std::swap(rowOffsets[i], rowOffsets[pivotRow]);\n std::swap(rowCounts[i], rowCounts[pivotRow]);\n std::swap(values[rowOffsets[i] + rowCounts[i] - 1], values[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1]);\n std::swap(colIndices[rowOffsets[i] + rowCounts[i] - 1], colIndices[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = values[rowOffsets[j] + rowCounts[j] - 1] / values[rowOffsets[i] + rowCounts[i] - 1];\n for (size_t k = rowOffsets[j] + rowCounts[j] - 1; k >= rowOffsets[j]; k--) {\n values[k] -= factor * values[rowOffsets[i] + rowCounts[i] - 1 + k - rowOffsets[j]];\n colIndices[k] = colIndices[rowOffsets[i] + rowCounts[i] - 1 + k - rowOffsets[j]];\n }\n rowCounts[j] = rowCounts[i] - 1;\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = rowOffsets[i] + rowCounts[i] - 1; j < rowOffsets[i] + rowCounts[i]; j++) {\n x[i] -= values[j] * x[colIndices[j]];\n }\n x[i] /= values[rowOffsets[i] + rowCounts[i] - 1];\n }\n\n // Gather the solution on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N / size], N / size, MPI_DOUBLE, i, 0, comm, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N / size], N / size, MPI_DOUBLE, 0, 0, comm);\n }\n}\n", + "\n\n // Create a new MPI datatype for COOElement\n MPI_Datatype coo_element_type;\n MPI_Type_contiguous(3, MPI_DOUBLE, &coo_element_type);\n MPI_Type_commit(&coo_element_type);\n\n // Scatter the COO matrix and b vector to all ranks\n std::vector local_A(A.size() / MPI::COMM_WORLD.Get_size());\n std::vector local_b(b.size() / MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Scatter(A.data(), A.size(), coo_element_type, local_A.data(), local_A.size(), coo_element_type, 0);\n MPI::COMM_WORLD.Scatter(b.data(), b.size(), MPI_DOUBLE, local_b.data(), local_b.size(), MPI_DOUBLE, 0);\n\n // Solve the local linear system\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (auto& element : local_A) {\n if (element.row == i) {\n sum += element.value * local_b[element.column];\n }\n }\n local_x[i] = sum;\n }\n\n // Gather the local solutions to rank 0\n MPI::COMM_WORLD.Gather(local_x.data(), local_x.size(), MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, 
size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the number of elements per rank\n size_t num_elements_per_rank = A.size() / size;\n // Compute the starting index of the elements for this rank\n size_t start_index = rank * num_elements_per_rank;\n // Compute the ending index of the elements for this rank\n size_t end_index = (rank + 1) * num_elements_per_rank;\n\n // Create a local copy of the elements for this rank\n std::vector local_A(A.begin() + start_index, A.begin() + end_index);\n\n // Create a local copy of the vector b for this rank\n std::vector local_b(b.begin() + start_index, b.begin() + end_index);\n\n // Create a local copy of the vector x for this rank\n std::vector local_x(N);\n\n // Solve the local linear system\n for (size_t i = 0; i < local_A.size(); i++) {\n const COOElement& element = local_A[i];\n local_x[element.row] += element.value * local_b[element.column];\n }\n\n // Reduce the local solutions to the global solution on rank 0\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local row indices and values for this rank\n std::vector local_rows;\n std::vector local_values;\n for (auto const& element : A) {\n if (element.row / N == rank) {\n local_rows.push_back(element.row % N);\n local_values.push_back(element.value);\n }\n }\n\n // Create a sparse matrix in CSR format\n std::vector csr_row_offsets(size + 1);\n std::vector csr_column_indices(A.size());\n std::vector csr_values(A.size());\n MPI_Allgather(&local_rows[0], local_rows.size(), MPI_UNSIGNED_LONG, &csr_column_indices[0], local_rows.size(), MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n MPI_Allgather(&local_values[0], local_values.size(), MPI_DOUBLE, &csr_values[0], local_values.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n for (size_t i = 0; i < size; ++i) {\n csr_row_offsets[i + 1] = csr_row_offsets[i] + local_rows.size();\n }\n\n // Solve the linear system using the CSR matrix\n std::vector local_x(N);\n if (rank == 0) {\n x.resize(N);\n }\n sparse_solve(csr_row_offsets, csr_column_indices, csr_values, b, local_x);\n\n // Gather the solution from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the local number of rows and the local row indices\n size_t local_N = N / size;\n size_t start_row = rank * local_N;\n size_t end_row = (rank + 1) * local_N;\n\n // Compute the local contribution to the solution\n std::vector local_x(local_N);\n for (size_t i = start_row; i < end_row; ++i) {\n local_x[i - start_row] = b[i];\n for (COOElement const& e : A) {\n if (e.row == i) {\n local_x[i - start_row] -= e.value * x[e.column];\n }\n }\n local_x[i - start_row] /= A[i].value;\n }\n\n // Gather the local solutions into the global solution\n MPI_Gather(local_x.data(), local_N, MPI_DOUBLE, x.data(), local_N, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a vector object from the dense vector\n Eigen::VectorXd vector(N);\n for (size_t i = 0; i < N; ++i) {\n vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd result = 
matrix.ldlt().solve(vector);\n\n // Copy the result to the output vector\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = result(i);\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n int nrows = N / size + (rank < N % size);\n std::vector localA(A.begin() + rank * (N / size), A.begin() + rank * (N / size) + nrows);\n std::vector localb(b.begin() + rank * (N / size), b.begin() + rank * (N / size) + nrows);\n std::vector localx(nrows);\n \n // Solve the local system\n for (size_t i = 0; i < nrows; i++) {\n double sum = 0;\n for (size_t j = 0; j < localA.size(); j++) {\n if (localA[j].row == i) {\n sum += localA[j].value * localx[localA[j].column];\n }\n }\n localx[i] = (localb[i] - sum) / localA[i].value;\n }\n \n // Gather the local solutions\n MPI_Gather(localx.data(), nrows, MPI_DOUBLE, x.data(), nrows, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the local part of the matrix\n size_t local_size = A.size() / size;\n\n // Compute the local part of the matrix\n std::vector local_A(A.begin() + rank * local_size, A.begin() + (rank + 1) * local_size);\n\n // Compute the local part of the vector b\n std::vector local_b(b.begin() + rank * local_size, b.begin() + (rank + 1) * local_size);\n\n // Compute the local part of the solution vector\n std::vector local_x(N);\n\n // Solve the local linear system\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (auto& element : local_A) {\n if (element.column == i) {\n sum += element.value * local_x[element.row];\n }\n }\n local_x[i] = (local_b[i] - sum) / local_A[i].value;\n }\n\n // Gather the local solutions into the global solution vector\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the number of elements per rank\n size_t elementsPerRank = A.size() / size;\n \n // Compute the starting index of the elements for this rank\n size_t startIndex = rank * elementsPerRank;\n \n // Compute the ending index of the elements for this rank\n size_t endIndex = (rank + 1) * elementsPerRank;\n \n // Create a local copy of the elements for this rank\n std::vector localA(A.begin() + startIndex, A.begin() + endIndex);\n \n // Create a local copy of the vector b for this rank\n std::vector localb(b.begin() + startIndex, b.begin() + endIndex);\n \n // Create a local copy of the vector x for this rank\n std::vector localx(N);\n \n // Solve the local linear system\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < localA.size(); j++) {\n if (localA[j].column == i) {\n sum += localA[j].value * localb[j];\n }\n }\n localx[i] = sum;\n }\n \n // Gather the local solutions into the global solution\n MPI_Gather(localx.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_x(N, 0.0);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_x.data(), x.data(), N, 
MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the local part of the matrix\n size_t local_size = A.size() / size;\n size_t start_index = rank * local_size;\n size_t end_index = (rank + 1) * local_size;\n\n // Compute the local part of the matrix-vector product\n std::vector local_Ax(N, 0.0);\n for (size_t i = start_index; i < end_index; ++i) {\n local_Ax[A[i].column] += A[i].value * b[A[i].row];\n }\n\n // Reduce the local results to the global result\n MPI_Allreduce(local_Ax.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Rank 0 stores the final result\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Each rank computes its part of the result\n std::vector local_x(N);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n \n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format with sizeA elements. x and b are dense vectors with N elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\n__global__ void solveLinearSystem(const COOElement *A, size_t sizeA, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = tid / N;\n int col = tid % N;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column == col) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[tid].value;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n // Each thread computes one element of x.\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = A[row].column[col]; k < A[row].column[col + 1]; ++k) {\n sum += A[row].value[k] * x[A[row].column[k]];\n }\n b[row] = (b[row] - sum) / A[row].value[A[row].column[col + 1] - 1];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n 
x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n double sum = 0;\n for (int k = 0; k < size; ++k) {\n if (A[k].row == row && A[k].column == col) {\n sum += A[k].value * x[A[k].column];\n }\n }\n x[row] = (b[row] - sum) / A[size - 1].value;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int tid_offset = tid + bid * blockDim.x;\n if (tid_offset < sizeA) {\n size_t row = A[tid_offset].row;\n size_t column = A[tid_offset].column;\n double value = A[tid_offset].value;\n atomicAdd(&x[column], value * b[row]);\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = tid + bid * blockDim.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "&", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the products of A and b for each row\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value * b[A[i].column];\n }\n\n // Gather the row sums to rank 0\n std::vector globalRowSums(N);\n MPI_Allgather(rowSums.data(), N, MPI_DOUBLE, globalRowSums.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = globalRowSums[i];\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a dense vector for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; ++i) {\n b_eigen(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_eigen = matrix.ldlt().solve(b_eigen);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_eigen(i);\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix A_sparse(N, N);\n for (auto const& e : A) {\n A_sparse.insert(e.row, e.column) = e.value;\n }\n A_sparse.makeCompressed();\n\n // Create dense vectors for x and b\n Eigen::VectorXd x_dense(N), b_dense(N);\n for (size_t i = 0; i < N; ++i) {\n x_dense[i] = x[i];\n b_dense[i] = b[i];\n }\n\n // Solve the linear system using Eigen\n x_dense = A_sparse.colPivHouseholderQr().solve(b_dense);\n\n // Copy the result back to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_dense[i];\n }\n }\n}\n", + "\n\n // Get the number of processes and the rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per process\n int elementsPerProcess = N / size;\n\n // Create a buffer to store the local part of A and b\n std::vector localA(elementsPerProcess);\n std::vector localb(elementsPerProcess);\n\n // Scatter A and b to all processes\n MPI_Scatter(A.data(), elementsPerProcess, MPI_DOUBLE, localA.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(b.data(), elementsPerProcess, MPI_DOUBLE, localb.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Create a local copy of x\n std::vector localx(N, 0.0);\n\n // Perform the Gaussian elimination in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n // Find the pivot element\n int pivotRow = i;\n double maxPivot = std::abs(localA[i].value);\n for (int j = i + 1; j < elementsPerProcess; j++) {\n if (std::abs(localA[j].value) > maxPivot) {\n pivotRow = j;\n maxPivot = std::abs(localA[j].value);\n }\n }\n\n // Swap the pivot row with the first row\n std::swap(localA[i], localA[pivotRow]);\n std::swap(localb[i], localb[pivotRow]);\n\n // Eliminate the non-zero elements below the pivot\n for (int j = i + 1; j < elementsPerProcess; j++) {\n double factor = localA[j].value / localA[i].value;\n localA[j].value -= 
factor * localA[i].value;\n localb[j] -= factor * localb[i];\n }\n }\n\n // Gather the local solutions into a global solution\n MPI_Gather(localx.data(), elementsPerProcess, MPI_DOUBLE, x.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform the back substitution to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (int j = i + 1; j < N; j++) {\n sum += localA[j].value * x[j];\n }\n x[i] = (localb[i] - sum) / localA[i].value;\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the local contribution of each rank to the solution\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column];\n }\n\n // Reduce the local contributions to the global solution on rank 0\n MPI_Reduce(x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0);\n\n // Compute the sum of the products of A and x for each row\n std::vector rowSums(N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value * x[A[i].column];\n }\n\n // Gather the row sums to rank 0\n std::vector globalRowSums(N);\n MPI_Allgather(rowSums.data(), N, MPI_DOUBLE, globalRowSums.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Solve the system of equations on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = (b[i] - globalRowSums[i]) / A[i].value;\n }\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Iterate over the elements of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement& element = A[i];\n x[element.row] += element.value * b[element.column];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(&rowSums[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n\n // Gather x from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "<}", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Step 1: Distribute the rows of A and b to each rank\n std::vector localA;\n std::vector localb;\n distributeRows(A, b, localA, localb, rank, size, N);\n\n // Step 2: Solve the local system on each rank\n std::vector localx(N, 0.0);\n for (auto& element : localA) {\n localx[element.row] += element.value * localb[element.column];\n }\n\n // Step 3: Gather the local solutions to rank 0\n MPI_Gather(localx.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(rowSums.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if 
(MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n}\n", + "\n\n // Initialize x to 0\n x.resize(N);\n std::fill(x.begin(), x.end(), 0.0);\n\n // Perform Gauss-Seidel iteration\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n x[i] = (b[i] - sum) / A[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.resize(N);\n std::fill(x.begin(), x.end(), 0.0);\n\n // Perform Gauss-Jordan elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i].value) > std::abs(A[pivot_row * N + i].value)) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the ith row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the ith column\n double pivot = A[i * N + i].value;\n for (size_t j = 0; j < N; ++j) {\n if (j != i) {\n double factor = A[j * N + i].value / pivot;\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k].value -= factor * A[i * N + k].value;\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j].value * x[j];\n }\n x[i] /= A[i * N + i].value;\n }\n\n // Gather the solution on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n MPI::COMM_WORLD.Gather(&x[0], N, MPI::DOUBLE, &x[0], N, MPI::DOUBLE, 0);\n } else {\n MPI::COMM_WORLD.Gather(&x[0], N, MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n\n // Create a dense vector for b\n Eigen::VectorXd b_vector(N);\n for (size_t i = 0; i < N; ++i) {\n b_vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_vector = matrix.ldlt().solve(b_vector);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_vector(i);\n }\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0);\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] += element.value * b[element.column];\n }\n }\n }\n\n // Gather the results to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the local contribution of each rank to the solution\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n x[e.row] += e.value * b[e.column];\n }\n\n // Reduce the local contributions to the global solution on rank 0\n MPI_Reduce(x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix A_sparse(N, N);\n for (auto const& element : A) {\n A_sparse.insert(element.row, 
element.column) = element.value;\n }\n\n // Create dense vectors for x and b\n Eigen::VectorXd x_dense(N), b_dense(N);\n for (size_t i = 0; i < N; ++i) {\n x_dense[i] = x[i];\n b_dense[i] = b[i];\n }\n\n // Solve the linear system\n Eigen::SimplicialLDLT> solver;\n solver.compute(A_sparse);\n x_dense = solver.solve(b_dense);\n\n // Copy the result back to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_dense[i];\n }\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(rowSums.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n\n // Gather x from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Perform Gauss-Jordan elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i].value) > std::abs(A[pivot_row * N + i].value)) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j].value, A[pivot_row * N + j].value);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n double pivot = A[i * N + i].value;\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j].value /= pivot;\n }\n b[i] /= pivot;\n for (size_t j = 0; j < N; ++j) {\n if (j != i) {\n double factor = A[j * N + i].value;\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k].value -= factor * A[i * N + k].value;\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(Kokkos::View &A, Kokkos::View &L, Kokkos::View &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Iterate over the rows of A\n Kokkos::parallel_for(A.extent(0), KOKKOS_LAMBDA(size_t i) {\n // Get the row of A\n const COOElement* row = &A(i);\n\n // Iterate over the columns of the row\n for (size_t j = 0; j < row->column; ++j) {\n // Get the value of the element\n double value = row->value;\n\n // Update L and U\n for (size_t k = 0; k < i; ++k) {\n L(i, k) -= value * U(k, j);\n }\n U(i, j) = value;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Create a parallel for loop to iterate over the rows of A\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n // Initialize the ith row of L and U to the identity matrix\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n\n // Iterate over the non-zero elements in the ith row of A\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n // Update the ith row of L and U\n L(i, A(j).column) = A(j).value;\n U(i, A(j).column) = A(j).value;\n\n // Iterate over the rows below the ith row\n for (size_t k = i + 1; k < N; ++k) {\n // Update the kth row of L and U\n L(k, A(j).column) -= A(j).value * L(k, i);\n U(k, A(j).column) -= A(j).value * U(k, i);\n }\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = A(i, i).value;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n L(i, k) = A(i, k).value / U(k, k);\n for (size_t j = k + 1; j < N; ++j) {\n U(i, j) -= L(i, k) * U(k, j);\n 
}\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Iterate over the rows of A\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n // Iterate over the columns of A\n for (size_t j = 0; j < N; j++) {\n // If A(i, j) is non-zero, then update L and U\n if (A(i, j).value != 0.0) {\n // Update L\n L(i, j) = A(i, j).value / U(j, j);\n\n // Update U\n for (size_t k = j + 1; k < N; k++) {\n U(j, k) -= L(i, j) * A(i, k).value;\n }\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy>(0, N), KOKKOS_LAMBDA(const int &i, const int &j) {\n if (i == j) {\n L(i, j) = 1.0;\n U(i, j) = A(i, j);\n } else if (i > j) {\n L(i, j) = A(i, j) / U(j, j);\n U(i, j) = 0.0;\n } else {\n U(i, j) = A(i, j) / L(i, i);\n L(i, j) = 0.0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n L(j, i) -= L(j, k) * U(k, i);\n }\n L(j, i) /= U(i, i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a copy of A\n auto Acopy = KokKOS::make_view(A);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n Acopy(j, i) -= Acopy(j, i) * Acopy(i, i);\n }\n }\n\n // Store L and U\n for (size_t i = 0; i < N; ++i) {\n for 
(size_t j = 0; j < N; ++j) {\n if (i == j) {\n L(i, j) = 1.0;\n } else if (i < j) {\n L(i, j) = Acopy(i, j);\n } else {\n U(i, j) = Acopy(i, j);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0.0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n L(i, j) -= L(i, k) * U(k, j);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n U(i, j) /= L(i, i);\n }\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n U(i, j) += A(i, j).value;\n } else {\n L(i, j) += A(i, j).value;\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0;\n U(i, j) = 0;\n }\n L(i, i) = 1;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), KOKKOS_LAMBDA(const Kokkos::TeamPolicy::member_type &team) {\n const size_t i = team.league_rank();\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < i; k++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n if (i == j) {\n L(i, i) = 1 / 
U(i, i);\n } else {\n L(i, j) = -U(i, j) * L(j, j);\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L(i, k) = A(i, k) / U(k, k);\n for (size_t j = k + 1; j < N; j++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA(const size_t k) {\n const auto &e = A(k);\n const double a_kk = U(e.row, e.row);\n const double l_ik = L(e.column, e.row);\n U(e.row, e.column) = e.value - l_ik * a_kk;\n for (size_t i = e.row + 1; i < N; i++) {\n const double l_ik = L(e.column, e.row);\n const double a_ki = U(e.row, i);\n L(i, e.column) = (e.value - a_ki * a_kk) / l_ik;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t row) {\n double sumL = 0.0;\n double sumU = 0.0;\n for (size_t i = 0; i < row; i++) {\n sumL += L(row, i) * U(i, row);\n sumU += L(i, row) * U(row, i);\n }\n L(row, row) = 1.0;\n U(row, row) = A(row, row) - sumU;\n for (size_t col = row + 1; col < N; col++) {\n L(row, col) = (A(row, col) - sumL) / U(row, row);\n U(row, col) = A(row, col) - sumU;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + 
"generated_output": "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA(const int& k) {\n const COOElement& element = A(k);\n const size_t i = element.row;\n const size_t j = element.column;\n const double value = element.value;\n U(i, j) = value;\n for (size_t p = 0; p < i; p++) {\n L(i, p) += value * L(j, p);\n U(j, p) -= value * U(i, p);\n }\n L(i, i) -= value * L(j, i);\n U(j, j) += value * U(i, i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; ++j) {\n double sum = 0;\n for (size_t k = 0; k < j; ++k) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U using the k-th row of A\n for (size_t i = k + 1; i < N; i++) {\n L(i, k) = A(i, k) / U(k, k);\n for (size_t j = k + 1; j < N; j++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sumL = 0.0, sumU = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + 
i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization in parallel\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k].value / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * U[i * N + k];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n U[i * N + j] = A[i * N + j] - L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n #pragma opm parallel for\n for (size_t k = 0; k < A.size(); ++k) {\n size_t row = A[k].row;\n size_t column = A[k].column;\n double value = A[k].value;\n #pragma opm atomic\n L[row * N + column] += value;\n #pragma opm atomic\n U[column * N + row] += value;\n }\n #pragma opm parallel for\n for (size_t i = 0; i < N; ++i) {\n #pragma opm atomic\n L[i * N + i] = 1;\n }\n}\n", + "<=====,", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n if (row == column)\n L[row * N + column] = 1;\n if (row < column)\n U[row * N + column] = value;\n if (row > column)\n L[row * N + column] = value;\n }\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = 0; i < k; ++i) {\n U[k * N + j] -= L[k * N + i] * U[i * N + j];\n }\n }\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = 0; j < k; ++j) {\n L[i * N + k] -= U[i * N + j] * L[j * N + k];\n }\n L[i * N + k] /= U[k * N + k];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement &e = A[i];\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1.0;\n U[e.row * N + e.column] = e.value;\n } else if (e.row < e.column) {\n L[e.row * N + e.column] = e.value / U[e.row * N + e.row];\n 
U[e.column * N + e.row] = 0.0;\n } else {\n U[e.row * N + e.column] = e.value;\n L[e.column * N + e.row] = 0.0;\n }\n }\n\n #pragma omp parallel for\n for (size_t k = 1; k < N; k++) {\n for (size_t i = k; i < N; i++) {\n for (size_t j = 0; j < k; j++) {\n U[i * N + k] -= L[i * N + j] * U[j * N + k];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n L[k * N + j] -= L[k * N + k] * U[k * N + j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sumL = 0.0, sumU = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n U[i * N + j] = A[i * N + j] - L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L[i * N + k] * U[k * N + j];\n }\n if (i == j) {\n L[i * N + i] = 1;\n U[i * N + i] = A[i].value - sum;\n } else {\n L[i * N + j] = (A[i].value - sum) / U[j * N + j];\n U[i * N + j] = 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform the factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n }\n\n // Update U\n for (size_t j = k + 1; j < N; j++) {\n for (size_t i = k + 1; i < N; i++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N 
* N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n A[i * N + j] -= U[k * N + j] * A[i * N + k];\n L[i * N + j] = A[i * N + j] / L[k * N + k];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n double sum = 0;\n for (size_t k = 0; k < j; ++k) {\n sum += L[i][k] * U[k][j];\n }\n L[i][j] = (A[i][j] - sum) / U[j][j];\n }\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[i][k] * U[k][i];\n }\n U[i][i] = A[i][i] - sum;\n for (size_t j = i + 1; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[j][k] * U[k][j];\n }\n U[i][j] = (A[i][j] - sum) / L[i][i];\n }\n }\n\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n A[i * N + j] -= U[k * N + j] * A[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.emplace_back(e.row, e.column, 1);\n U.emplace_back(e.row, e.column, e.value);\n } else {\n L.emplace_back(e.row, e.column, e.value / rowSum[e.row]);\n U.emplace_back(e.row, e.column, 0);\n }\n }\n for (size_t i = 1; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n for (size_t k = 0; k < N; k++) {\n U[i * N + k] -= L[i * N + j] * U[j * N + k];\n }\n }\n }\n}\n", + "\n\n std::vector row(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1;\n U[e.row * N + e.column] = e.value;\n } else {\n row[e.column] = e.value / U[e.row * N + e.row];\n for (size_t i = e.column; i < N; ++i) {\n L[e.row * N + i] = row[i];\n U[e.row * N + i] -= row[i] * U[e.column * N + i];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& e : A) {\n L[e.row * N + e.column] = (e.row == e.column) ? 
1 : 0;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = 0; i < k; ++i) {\n U[k * N + j] -= L[k * N + i] * U[i * N + j];\n }\n L[j * N + k] = U[j * N + k] / U[k * N + k];\n for (size_t i = k + 1; i < N; ++i) {\n U[j * N + i] -= L[j * N + k] * U[k * N + i];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& element : A) {\n L[element.row * N + element.column] = element.value;\n U[element.row * N + element.column] = element.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (L[i * N + j] != 0) {\n double factor = L[i * N + j] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= factor * L[i * N + k];\n U[i * N + k] -= factor * U[j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n L[i * N + j] = 1;\n } else if (i > j) {\n L[i * N + j] = L_temp[i][j] / L_temp[j][j];\n for (size_t k = 0; k < N; k++) {\n U_temp[i][k] -= L[i * N + j] * U_temp[j][k];\n }\n } else {\n U[i * N + j] = U_temp[i][j];\n }\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.push_back(1);\n U.push_back(e.value / rowSum[e.row]);\n } else if (e.row < e.column) {\n L.push_back(0);\n U.push_back(e.value / rowSum[e.row]);\n } else {\n L.push_back(e.value / rowSum[e.column]);\n U.push_back(0);\n }\n }\n}\n", + "\n\n // Initialize L and U to be identity matrices\n L.resize(N * N);\n U.resize(N * N);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n U[i * N + i] = 1;\n }\n\n // Iterate over the elements of A\n for (auto const& element : A) {\n // Update L and U\n for (size_t i = 0; i < element.row; i++) {\n L[element.row * N + i] -= L[element.column * N + i] * element.value;\n }\n for (size_t j = element.column + 1; j < N; j++) {\n U[element.row * N + j] -= U[element.column * N + j] * element.value;\n }\n }\n}\n", + "\n\n std::vector> row_ptr(N + 1);\n std::vector col_idx(A.size());\n std::vector values(A.size());\n for (size_t i = 0; i < A.size(); i++) {\n row_ptr[A[i].row + 1]++;\n col_idx[i] = A[i].column;\n values[i] = A[i].value;\n }\n for (size_t i = 1; i <= N; i++) {\n row_ptr[i] += row_ptr[i - 1];\n }\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < A.size(); i++) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double value = A[i].value;\n U[row_ptr[row] + col - 1] = value;\n for (size_t j = row_ptr[row]; j < row_ptr[row + 1]; j++) {\n if (col_idx[j] < col) {\n L[row_ptr[col] + col_idx[j] - 1] += value * U[row_ptr[row] + col_idx[j] - 1];\n } else if (col_idx[j] > col) {\n U[row_ptr[row] + col_idx[j] - 1] -= value * L[row_ptr[col] + col_idx[j] - 1];\n }\n }\n }\n for (size_t i = 0; i < N; i++) {\n L[row_ptr[i] + i - 1] = 1;\n }\n}\n", + "\n\n std::vector L_row, L_col, U_row, U_col;\n std::vector L_val, U_val;\n L_row.reserve(A.size());\n L_col.reserve(A.size());\n L_val.reserve(A.size());\n U_row.reserve(A.size());\n U_col.reserve(A.size());\n U_val.reserve(A.size());\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 
0;\n for (size_t k = 0; k < A.size(); k++) {\n if (A[k].row == i && A[k].column == j) {\n sumU = A[k].value;\n U_row.push_back(i);\n U_col.push_back(j);\n U_val.push_back(sumU);\n }\n if (A[k].row == i && A[k].column < j) {\n sumL += A[k].value;\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(sumL);\n }\n if (A[k].row < i && A[k].column == j) {\n sumL += A[k].value;\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(sumL);\n }\n }\n if (sumU == 0) {\n U_row.push_back(i);\n U_col.push_back(j);\n U_val.push_back(0);\n }\n if (sumL == 0) {\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(0);\n }\n }\n }\n L.resize(L_row.size());\n U.resize(U_row.size());\n for (size_t i = 0; i < L_row.size(); i++) {\n L[i].row = L_row[i];\n L[i].column = L_col[i];\n L[i].value = L_val[i];\n }\n for (size_t i = 0; i < U_row.size(); i++) {\n U[i].row = U_row[i];\n U[i].column = U_col[i];\n U[i].value = U_val[i];\n }\n}\n", + "\n\n std::vector> L_matrix(N, std::vector(N, 0.0));\n std::vector> U_matrix(N, std::vector(N, 0.0));\n\n for (auto const& element : A) {\n L_matrix[element.row][element.column] = element.value;\n U_matrix[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n L_matrix[i][j] = 1.0;\n } else if (i > j) {\n L_matrix[i][j] = 0.0;\n } else {\n U_matrix[i][j] = 0.0;\n }\n }\n }\n\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n double factor = L_matrix[i][k] / L_matrix[k][k];\n for (size_t j = k; j < N; ++j) {\n L_matrix[i][j] -= factor * U_matrix[k][j];\n U_matrix[i][j] -= factor * L_matrix[k][j];\n }\n }\n }\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n L[i * N + j] = L_matrix[i][j];\n U[i * N + j] = U_matrix[i][j];\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n\n for (size_t i = 0; i < A.size(); i++) {\n L[A[i].row * N + A[i].column] = A[i].value;\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = L[i * N + k] / L[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] = U[i * N + j] - L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector row(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.emplace_back(COOElement{e.row, e.column, 1});\n row[e.column] = e.value;\n } else {\n U.emplace_back(COOElement{e.row, e.column, e.value});\n }\n }\n for (auto const& e : A) {\n if (e.row != e.column) {\n L.emplace_back(COOElement{e.row, e.column, e.value / row[e.column]});\n for (auto& u : U) {\n if (u.row == e.row && u.column == e.column) {\n u.value -= e.value * L.back().value;\n }\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n U[row * N + column] = value;\n for (size_t j = 0; j < row; ++j) {\n L[row * N + j] -= L[i * N + j] * value / U[j * N + j];\n }\n for (size_t j = row + 1; j < N; ++j) {\n U[row * N + j] -= L[i * N + j] * value / U[j * N + j];\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = 
element.value;\n }\n for (size_t i = 0; i < N; ++i) {\n L_temp[i][i] = 1;\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n L_temp[i][k] = L_temp[i][k] / L_temp[k][k];\n for (size_t j = k + 1; j < N; ++j) {\n U_temp[k][j] = U_temp[k][j] - L_temp[i][k] * U_temp[i][j];\n }\n }\n }\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n L.push_back(L_temp[i][j]);\n U.push_back(U_temp[i][j]);\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n rowSum[e.row] += e.value;\n }\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.push_back({e.row, e.column, 1});\n U.push_back({e.row, e.column, e.value / rowSum[e.row]});\n } else if (e.row < e.column) {\n L.push_back({e.row, e.column, e.value / rowSum[e.row]});\n U.push_back({e.row, e.column, 0});\n } else {\n L.push_back({e.row, e.column, 0});\n U.push_back({e.row, e.column, e.value});\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n L_temp[i][j] = 1;\n } else if (i > j) {\n L_temp[i][j] = 0;\n }\n }\n }\n\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n double factor = U_temp[i][k] / U_temp[k][k];\n L_temp[i][k] = factor;\n for (size_t j = k; j < N; j++) {\n U_temp[i][j] -= factor * U_temp[k][j];\n }\n }\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n L.push_back(L_temp[i][j]);\n U.push_back(U_temp[i][j]);\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto &e : A) {\n L[e.row * N + e.column] = e.value;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (L[j * N + i] != 0) {\n double factor = L[j * N + i] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= factor * L[i * N + k];\n U[i * N + k] -= factor * U[j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n std::vector rowSum(N, 0);\n for (auto const& element : A) {\n rowSum[element.row] += element.value;\n }\n\n for (auto const& element : A) {\n if (element.row == element.column) {\n L.push_back(1);\n U.push_back(element.value / rowSum[element.row]);\n } else if (element.row < element.column) {\n L.push_back(0);\n U.push_back(element.value / rowSum[element.row]);\n } else {\n L.push_back(element.value / rowSum[element.column]);\n U.push_back(0);\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& element : A) {\n L[element.row * N + element.column] = element.value;\n U[element.row * N + element.column] = element.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = L[j * N + i] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= multiplier * L[i * N + k];\n U[i * N + k] -= multiplier * U[j * N + k];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (auto const& e : A) {\n L[e.row * N + e.column] = e.value;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n L[j * N + i] = U[j * N + i] / U[i * N + i];\n for (size_t k = i; k < N; k++) {\n U[j * N + k] -= L[j * N + i] * U[i * 
N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format with sizeA elements.\n Use AMD HIP to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\n__global__ void luFactorize(const COOElement *A, size_t sizeA, double *L, double *U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[row * N + col];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n COOElement element = A[tid];\n int row = element.row;\n int col = element.column;\n double val = element.value;\n\n if (row == col) {\n L[row * N + row] = 1.0;\n U[row * N + col] = val;\n } else if (row < col) {\n L[row * N + col] = val / U[col * N + col];\n U[row * N + col] = 0.0;\n } else {\n U[row * N + col] -= L[row * N + row] * val;\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x;\n int start = tid + bid * num_threads;\n for (int i = start; i < sizeA; i += bid_size * num_threads) {\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = A[i].value / U[A[i].column];\n U[i] = 0.0;\n }\n }\n __syncthreads();\n for (int i = start; i < sizeA; i += bid_size * num_threads) {\n if (A[i].row > A[i].column) {\n U[i] -= L[i] * U[A[i].column];\n } else {\n L[i] -= L[i] * U[A[i].column];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n COOElement e = A[i];\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1.0;\n U[e.row * N + e.column] = e.value;\n for (int k = e.row + 1; k < N; k++) {\n L[k * N + e.column] = A[i].value / e.value;\n U[e.row * N + k] -= L[k * N + e.column] * e.value;\n }\n }\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1;\n } else {\n U[row * N + column] = value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n for (size_t i = tid; i < sizeA; i += stride) {\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = 0.0;\n U[i] = 0.0;\n }\n }\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n for (size_t j = tid; j < sizeA; j += stride) {\n if (A[j].row == i && A[j].column == k) {\n L[j] = A[j].value / U[k];\n U[j] = 0.0;\n }\n }\n for (size_t j = tid; j < sizeA; j += stride) {\n if (A[j].row == i && A[j].column > k) {\n U[j] -= L[j] * U[A[j].column];\n }\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n 
U[tid] = value;\n } else {\n L[tid] = 0.0;\n U[tid] = 0.0;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t i = A[tid].row;\n size_t j = A[tid].column;\n double v = A[tid].value;\n\n if (i == j) {\n L[i * N + i] = 1.0;\n U[i * N + j] = v;\n } else if (i < j) {\n L[i * N + j] = v;\n } else {\n U[i * N + j] = v;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[col * N + row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n\n if (row == column) {\n L[row * N + row] = 1.0;\n U[row * N + column] = value;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n U[row * N + column] = 0.0;\n } else {\n U[row * N + column] -= L[row * N + row] * value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n if (A[tid].row == A[tid].column) {\n L[tid] = 1;\n U[tid] = A[tid].value;\n } else if (A[tid].row < A[tid].column) {\n L[tid] = 0;\n U[tid] = A[tid].value;\n } else {\n L[tid] = A[tid].value / U[A[tid].column];\n U[tid] = 0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n const COOElement &a = A[tid];\n if (a.row == a.column) {\n L[a.row * N + a.column] = 1.0;\n U[a.row * N + a.column] = a.value;\n } else if (a.row < a.column) {\n L[a.row * N + a.column] = 0.0;\n U[a.row * N + a.column] = a.value;\n } else {\n L[a.row * N + a.column] = a.value / U[a.column * N + a.column];\n U[a.row * N + a.column] = 0.0;\n for (int k = a.column + 1; k < a.row; k++) {\n U[a.row * N + k] -= L[a.row * N + a.column] * U[a.column * N + k];\n }\n }\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n size_t col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row].value;\n } else if (row < col) {\n L[row * N + col] = 0;\n U[row * N + col] = A[row].value;\n } else {\n L[row * N + col] = A[row].value / U[col * N + col];\n U[row * N + col] = 0;\n }\n }\n}\n", + "\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= sizeA) return;\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = 0.0;\n U[i] = 0.0;\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n U[row * N + col] = 0.0;\n }\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column == col) {\n U[row * N + col] = A[i].value;\n } else if (A[i].row == row) {\n L[row * N + col] -= A[i].value * U[A[i].column * N + col];\n } else if (A[i].column == col) {\n U[row * N + col] -= A[i].value * L[row * N + A[i].row];\n }\n }\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n }\n U[row * N + col] = A[row * N + col].value;\n }\n __syncthreads();\n for (int k = 0; k < row; k++) {\n if (U[k * N + col] != 0.0) {\n L[row * N + k] = A[row * N + k].value / U[k * N + col];\n for (int j = col; j < N; j++) {\n U[row * N + j] -= L[row * N + 
k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n if (A[tid].row == A[tid].column) {\n L[tid * N + A[tid].column] = 1;\n U[tid * N + A[tid].column] = A[tid].value;\n } else {\n L[tid * N + A[tid].column] = 0;\n U[tid * N + A[tid].column] = A[tid].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int i = A[tid].row;\n int j = A[tid].column;\n double v = A[tid].value;\n if (i == j) {\n L[i * N + i] = 1;\n U[i * N + i] = v;\n } else if (i < j) {\n L[i * N + j] = v / U[j * N + j];\n U[i * N + j] = 0;\n } else {\n U[i * N + j] -= L[i * N + i] * v;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) {\n return;\n }\n const COOElement &a = A[tid];\n if (a.row == a.column) {\n L[a.row * N + a.column] = 1.0;\n U[a.row * N + a.column] = a.value;\n } else if (a.row > a.column) {\n L[a.row * N + a.column] = a.value / U[a.column * N + a.column];\n U[a.row * N + a.column] = 0.0;\n } else {\n U[a.row * N + a.column] = a.value;\n L[a.row * N + a.column] = 0.0;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else if (row < column) {\n L[tid] = 0.0;\n U[tid] = value;\n } else {\n L[tid] = value / U[column * N + column];\n U[tid] = 0.0;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
Store the result in L and U on rank 0.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Find the pivot element in column k\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k * N + k].value);\n for (size_t i = k + 1; i < N; ++i) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows k and pivot_row in L and U\n for (size_t j = 0; j < N; ++j) {\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n std::swap(U[k * N + j], U[pivot_row * N + j]);\n }\n\n // Update the remaining rows in column k\n for (size_t i = k + 1; i < N; ++i) {\n double multiplier = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j].value -= multiplier * A[k * N + j].value;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector L_recv(N * N, 0.0);\n std::vector U_recv(N * N, 0.0);\n MPI::COMM_WORLD.Recv(L_recv.data(), N * N, MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), N * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; ++j) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), N * N, MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector L_recv, U_recv;\n MPI::COMM_WORLD.Recv(L_recv.data(), L_recv.size(), MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), U_recv.size(), MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; j++) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), L.size(), MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), U.size(), MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Compute L and U in parallel\n for (size_t i = 0; i < N; ++i) {\n // Compute L(i, :)\n for (size_t j = 0; j < i; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < j; ++k) {\n sum += L[i * N + k] * U[k * N + j];\n }\n L[i * N + j] = (A[i].value - sum) / U[j * N + j];\n }\n\n // Compute U(i, :)\n for (size_t j = i; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[i * N + k] * U[k * N + j];\n }\n U[i * N + j] = A[i].value - sum;\n }\n }\n\n // Gather L and U on rank 0\n if (MPI::COMM_WORLD.Get_rank() 
== 0) {\n MPI::COMM_WORLD.Gather(&L[0], N * N, MPI::DOUBLE, &L[0], N * N, MPI::DOUBLE, 0);\n MPI::COMM_WORLD.Gather(&U[0], N * N, MPI::DOUBLE, &U[0], N * N, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j].value, A[pivot_row * N + j].value);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Eliminate the elements below the pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = factor;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= factor * A[k * N + j].value;\n }\n }\n }\n\n // Copy the upper triangular matrix to U\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i; j < N; j++) {\n U[i * N + j] = A[i * N + j].value;\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A and store the result in L\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k; j < N; ++j) {\n L[i * N + k] -= U[i * N + j] * U[k * N + j];\n }\n }\n\n // For each column j > k, subtract the k-th column of L from the j-th column of A and store the result in U\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = k; i < N; ++i) {\n U[k * N + j] -= L[i * N + k] * U[i * N + j];\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A\n for (size_t i = k + 1; i < N; i++) {\n double factor = U[i * N + k] / U[k * N + k];\n for (size_t j = k; j < N; j++) {\n U[i * N + j] -= factor * U[k * N + j];\n }\n L[i * N + k] = factor;\n }\n }\n\n // Gather the L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n std::vector L(A);\n std::vector U(A);\n\n // Perform Gaussian elimination\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the pivot row\n MPI_Bcast(&A[k], sizeof(COOElement) * A[k].size(), MPI_BYTE, k, 
MPI_COMM_WORLD);\n\n // Eliminate the pivot column in the remaining rows\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n double factor = A[i][k].value / A[k][k].value;\n for (size_t j = k; j < N; ++j) {\n A[i][j].value -= factor * A[k][j].value;\n }\n }\n\n // Update L and U\n L[k].value = 1.0;\n for (size_t j = k + 1; j < N; ++j) {\n L[j].value = A[j][k].value;\n U[k].value = A[k][j].value;\n }\n }\n\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A\n for (size_t i = k + 1; i < N; ++i) {\n double factor = U[i * N + k] / U[k * N + k];\n for (size_t j = k; j < N; ++j) {\n U[i * N + j] -= factor * U[k * N + j];\n }\n L[i * N + k] = factor;\n }\n }\n\n // Gather the L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot row\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (A[i * N + k] > A[pivot_row * N + k]) {\n pivot_row = i;\n }\n }\n\n // Swap the pivot row with the k-th row in L and U\n for (size_t j = 0; j < N; j++) {\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n std::swap(U[k * N + j], U[pivot_row * N + j]);\n }\n\n // Eliminate the k-th column in the remaining rows\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n L[i * N + k] = factor;\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(A.size());\n U.resize(A.size());\n for (size_t i = 0; i < A.size(); ++i) {\n L[i].row = A[i].row;\n L[i].column = A[i].column;\n L[i].value = 0;\n U[i].row = A[i].row;\n U[i].column = A[i].column;\n U[i].value = 
0;\n }\n\n // Perform LU factorization\n for (size_t i = 0; i < N; ++i) {\n // Update L\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i && A[j].column < i) {\n L[j].value = A[j].value / U[j].value;\n }\n }\n\n // Update U\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row < i && A[j].column == i) {\n U[j].value = A[j].value / L[j].value;\n }\n }\n\n // Update A\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row >= i && A[j].column >= i) {\n A[j].value -= L[j].value * U[j].value;\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recvL(A.size());\n std::vector recvU(A.size());\n MPI::COMM_WORLD.Recv(recvL.data(), A.size(), MPI_DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(recvU.data(), A.size(), MPI_DOUBLE, i, 0);\n for (size_t j = 0; j < A.size(); ++j) {\n L[j].value += recvL[j].value;\n U[j].value += recvU[j].value;\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), A.size(), MPI_DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), A.size(), MPI_DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < A.size(); i++) {\n if (std::abs(A[i].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i].value);\n }\n }\n\n // Swap the pivot row with the k-th row in A\n std::swap(A[k], A[pivot_row]);\n\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n double multiplier = A[i].value / A[k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; j++) {\n U[k * N + j] -= multiplier * A[i].value;\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n\n // Initialize the diagonal of L to 1\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Iterate over the rows of A\n for (size_t k = 0; k < A.size(); k++) {\n // Get the row, column, and value of the current element\n size_t row = A[k].row;\n size_t column = A[k].column;\n double value = A[k].value;\n\n // Update the elements of L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = value / U[k * N + k];\n U[i * N + column] -= L[i * N + k] * U[k * N + column];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i > k\n for (size_t i = k + 1; i < N; i++) {\n // Calculate the multiplier\n double multiplier = A[i].value / U[k * N + k];\n\n // Update the i-th row of L\n for (size_t j = 0; j < N; j++) {\n L[i * N + j] += multiplier * U[k * N + j];\n }\n\n // Update the i-th row of U\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= multiplier * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform 
LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = A[k].value;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > std::abs(pivot_value)) {\n pivot_row = i;\n pivot_value = A[i * N + k].value;\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j].value, A[pivot_row * N + j].value);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Eliminate the elements below the pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k].value / pivot_value;\n L[i * N + k] = factor;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= factor * A[k * N + j].value;\n U[i * N + j] = A[i * N + j].value;\n }\n }\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector L_recv(N * N, 0.0);\n std::vector U_recv(N * N, 0.0);\n MPI::COMM_WORLD.Recv(L_recv.data(), N * N, MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), N * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; j++) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), N * N, MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each rank, update the k-th row of L and the remaining rows of U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n\n // Gather the updated L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each rank, update the k-th row of L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "<,", + "_t* 0." + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format with sizeA elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\n__global__ void luFactorize(const COOElement *A, size_t sizeA, double *L, double *U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[row * N + col].value;\n for (int k = 0; k < row; k++) {\n U[row * N + col] -= L[row * N + k] * U[k * N + col];\n }\n L[row * N + col] = U[row * N + col] / U[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < i; j++) {\n sum += L[row * N + j] * U[j * N + A[i].column];\n }\n U[row * N + A[i].column] = A[i].value - sum;\n }\n if (A[i].column == row) {\n L[row * N + A[i].row] = A[i].value;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else if (row < column) {\n L[tid] = 0.0;\n U[tid] = value;\n } else {\n L[tid] = value / U[column * N + column];\n U[tid] = 0.0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + column] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) {\n return;\n }\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n }\n }\n}\n", + "\n\n int row = block.y * block.x + thread.y * N + thread.x;\n if (row < sizeA) {\n if (A[row].row == A[row].column) {\n L[A[row].row * N + A[row].column] = 1.0;\n U[A[row].row * N + A[row].column] = A[row].value;\n } else {\n L[A[row].row * N + A[row].column] = A[row].value / U[A[row].column * N + A[row].column];\n U[A[row].row * N + A[row].column] = 0.0;\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n double sum = 0.0;\n for (int k = col; k < N; k++) {\n sum += L[row * N + k] * U[k * N + col];\n }\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < 
sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row < col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n for (size_t k = col + 1; k < N; k++) {\n U[row * N + k] -= L[row * N + col] * U[col * N + k];\n }\n }\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n double sum = 0.0;\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row && A[k].column == col) {\n sum += A[k].value * U[col * N + A[k].column];\n }\n }\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row >= N || col >= N) return;\n\n double sum = 0;\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row && A[k].column < col) {\n sum += L[row * N + A[k].column] * U[A[k].column * N + col];\n }\n }\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row * N + col] - sum;\n } else {\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n U[row * N + col] = 0;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n }\n U[row * N + col] = A[row * N + col].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n } else if (row < column) {\n L[tid] = 0.0;\n } else {\n U[tid] = value / L[column * N + column];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0;\n U[row * N + col] = A[row * N + 
col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n U[row * N + column] = value;\n } else if (row < column) {\n L[row * N + column] = value;\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < sizeA; ++i) {\n const auto row = A[i].row;\n const auto column = A[i].column;\n const auto value = A[i].value;\n if (row == column) {\n L[row * N + column] = 1;\n } else if (row < column) {\n U[row * N + column] = value;\n } else {\n L[row * N + column] = value;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. Store the result in L and U on rank 0.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n 
// Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Initialize the diagonal elements of L to 1\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update the k-th row of L and U\n #pragma omp parallel for\n for (size_t j = k + 1; j < N; j++) {\n L[k * N + j] = A[k].value / U[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n U[i * N + j] -= L[k * N + j] * A[i].value;\n }\n }\n\n // Update the remaining rows of L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n for (size_t j = k + 1; j < N; j++) {\n L[i * N + j] -= L[i * N + k] * L[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Factorize A\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n 
L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n A[i].value -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Update U\n #pragma omp parallel for\n for (size_t j = k + 1; j < N; ++j) {\n U[k * N + j] = A[k].value / L[k * N + k];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; ++j) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform Gauss elimination\n for (size_t k = 0; k < N; ++k) {\n // Rank 0 broadcasts the k-th row of U to all other ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 updates the k-th row of L\n for (size_t j = k + 1; j < N; ++j) {\n L[k * N + j] = A[j].value / U[k * N + k];\n }\n\n // All ranks update 
the remaining rows of U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[k * N + j] * U[k * N + i];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k * N + k].value);\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n double multiplier = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= multiplier * A[k * N + j].value;\n U[k * N + j].value += multiplier * A[i * N + j].value;\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < N; ++i) {\n if (std::abs(A[i].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i].value);\n }\n }\n\n // Swap the pivot row with the k-th row in A, L, and U\n if (pivot_row != k) {\n std::swap(A[k], A[pivot_row]);\n for (size_t i = 0; i < N; ++i) {\n std::swap(L[k * N + i], L[pivot_row * N + i]);\n std::swap(U[k * N + i], U[pivot_row * N + i]);\n }\n }\n\n // Eliminate the elements below the pivot element\n for (size_t i = k + 1; i < N; ++i) {\n double multiplier = A[i].value / A[k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; ++j) {\n A[i].value -= multiplier * A[k].value;\n U[i * N + j] -= multiplier * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 
0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, Kokkos::View &x, Kokkos::View &y, Kokkos::View &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x[j].index == i) sum += alpha * x[j].value;\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y[j].index == i) sum += y[j].value;\n }\n z[i] = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA (const int i) {\n z[i] = alpha * x[i].value + y[i].value;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kok\u30b3\u30af::parallel_for(\"sparse_py\", 0, z.extent(0), KOKKOK_LAMBDA(size_t i) {\n z[i] = alpha * x[i].value + y[i].value;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n sum += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n sum += y[j].value;\n }\n }\n z[i] = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, 
+ { + "generated_output": "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) sum += alpha * x(j).value;\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) sum += y(j).value;\n }\n z(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a Kokkos kernel to perform the sparse axpy operation\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x(j).index == i) sum += alpha * x(j).value;\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y(j).index == i) sum += y(j).value;\n }\n z(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy<>(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i, double &sum) {\n sum += alpha * x(i).value + y(i).value;\n }, z(0));\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const size_t num_elements_x = x.extent(0);\n const size_t num_elements_y = y.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, num_elements_x + num_elements_y),\n KOKKOS_LAMBDA(const size_t i) {\n if (i < num_elements_x) {\n const size_t index = x(i).index;\n z(index) += alpha * x(i).value;\n } else {\n const size_t index = y(i - num_elements_x).index;\n z(index) += y(i - num_elements_x).value;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i, double &sum) {\n sum += alpha * x(i).value + y(i).value;\n }, z(0));\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"sparseAxpy\", 
Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t n = x.size();\n size_t m = y.size();\n z.resize(std::max(n, m));\n #pragma omp parallel for\n for (size_t i = 0; i < 
std::max(n, m); i++) {\n z[i] = 0;\n }\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n z[x[i].index] += alpha * x[i].value;\n }\n #pragma omp parallel for\n for (size_t i = 0; i < m; i++) {\n z[y[i].index] += y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n" + ] 
+ }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "serial", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0, j = 0; i < x.size() || j < y.size();) {\n if (j == y.size() || (i < x.size() && x[i].index < y[j].index)) {\n z[x[i].index] += alpha * x[i].value;\n i++;\n } else if (i == x.size() || (j < y.size() && y[j].index < x[i].index)) {\n z[y[j].index] += y[j].value;\n j++;\n } else {\n z[x[i].index] += alpha * x[i].value + y[j].value;\n i++;\n j++;\n }\n }\n}\n", + "\n\n z.clear();\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z.push_back(alpha * x[i].value);\n i++;\n } else {\n z.push_back(y[j].value);\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(alpha * x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n z.clear();\n z.resize(x.size() + y.size());\n auto itx = x.begin();\n auto ity = y.begin();\n auto itz = z.begin();\n while (itx != x.end() && ity != y.end()) {\n if (itx->index < ity->index) {\n *itz = itx->value;\n ++itx;\n } else if (itx->index > ity->index) {\n *itz = ity->value;\n ++ity;\n } else {\n *itz = alpha * itx->value + ity->value;\n ++itx;\n ++ity;\n }\n ++itz;\n }\n while (itx != x.end()) {\n *itz = itx->value;\n ++itx;\n ++itz;\n }\n while (ity != y.end()) {\n *itz = ity->value;\n ++ity;\n ++itz;\n }\n}\n", + ".", + "\n\n // Initialize z with zeros\n z.assign(z.size(), 0);\n\n // Iterate over x and y\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n // If the indices match, add the values and increment both i and j\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n }\n // If the index of x is smaller, add the value to z and increment i\n else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n // If the index of y is smaller, add the value to z and increment j\n else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n\n // Add the remaining elements of x\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n\n // Add the remaining elements of y\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n z.clear();\n z.resize(x.size() + y.size());\n size_t i = 0, j = 0, k = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z[k].index = x[i].index;\n z[k].value = alpha * x[i].value;\n ++i;\n } else 
if (x[i].index > y[j].index) {\n z[k].index = y[j].index;\n z[k].value = y[j].value;\n ++j;\n } else {\n z[k].index = x[i].index;\n z[k].value = alpha * x[i].value + y[j].value;\n ++i;\n ++j;\n }\n ++k;\n }\n while (i < x.size()) {\n z[k].index = x[i].index;\n z[k].value = alpha * x[i].value;\n ++i;\n ++k;\n }\n while (j < y.size()) {\n z[k].index = y[j].index;\n z[k].value = y[j].value;\n ++j;\n ++k;\n }\n z.resize(k);\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n size_t m = z.size();\n for (size_t i = 0, j = 0, k = 0; i < m; ++i) {\n if (j < x.size() && x[j].index == i) {\n z[i] += alpha * x[j].value;\n ++j;\n }\n if (k < y.size() && y[k].index == i) {\n z[i] += y[k].value;\n ++k;\n }\n }\n}\n", + "\n\n for (size_t i = 0, j = 0; i < x.size() || j < y.size();) {\n if (j == y.size() || (i < x.size() && x[i].index < y[j].index)) {\n z[x[i].index] += alpha * x[i].value;\n ++i;\n } else if (i == x.size() || (j < y.size() && y[j].index < x[i].index)) {\n z[y[j].index] += y[j].value;\n ++j;\n } else {\n z[x[i].index] += alpha * x[i].value + y[j].value;\n ++i;\n ++j;\n }\n }\n}\n", + "\n\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n } else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n i++;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n j++;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n z.clear();\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z.push_back({x[i].index, alpha * x[i].value + y[j].value});\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z.push_back({x[i].index, alpha * x[i].value});\n i++;\n } else {\n z.push_back({y[j].index, y[j].value});\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back({x[i].index, alpha * x[i].value});\n i++;\n }\n while (j < y.size()) {\n z.push_back({y[j].index, y[j].value});\n j++;\n }\n}\n", + "\n\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n } else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n 
z[e.index] += e.value;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n i++;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n j++;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n size_t x_size = x.size();\n size_t y_size = y.size();\n size_t z_size = 0;\n size_t x_index = 0;\n size_t y_index = 0;\n while (x_index < x_size && y_index < y_size) {\n if (x[x_index].index == y[y_index].index) {\n z.push_back(alpha * x[x_index].value + y[y_index].value);\n z_size++;\n x_index++;\n y_index++;\n } else if (x[x_index].index < y[y_index].index) {\n z.push_back(alpha * x[x_index].value);\n z_size++;\n x_index++;\n } else {\n z.push_back(y[y_index].value);\n z_size++;\n y_index++;\n }\n }\n while (x_index < x_size) {\n z.push_back(alpha * x[x_index].value);\n z_size++;\n x_index++;\n }\n while (y_index < y_size) {\n z.push_back(y[y_index].value);\n z_size++;\n y_index++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n auto itx = x.begin();\n auto ity = y.begin();\n while (itx != x.end() && ity != y.end()) {\n if (itx->index == ity->index) {\n z.push_back(alpha * itx->value + ity->value);\n ++itx;\n ++ity;\n } else if (itx->index < ity->index) {\n z.push_back(alpha * itx->value);\n ++itx;\n } else {\n z.push_back(ity->value);\n ++ity;\n }\n }\n while (itx != x.end()) {\n z.push_back(alpha * itx->value);\n ++itx;\n }\n while (ity != y.end()) {\n z.push_back(ity->value);\n ++ity;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "hip", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors of size Nx and Ny. Store the result in z.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x or y.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\n__global__ void sparseAxpy(double alpha, const Element *x, const Element *y, double *z, size_t Nx, size_t Ny, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < Nx) {\n size_t index = x[tid].index;\n z[index] = alpha * x[tid].value + y[index];\n } else if (tid < Nx + Ny) {\n size_t index = y[tid - Nx].index;\n z[index] = alpha * 0 + y[tid - Nx].value;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = 0;\n\t\tfor (size_t j = 0; j < Nx; j++) {\n\t\t\tif (x[j].index == i) {\n\t\t\t\tz[i] += alpha * x[j].value;\n\t\t\t}\n\t\t}\n\t\tfor (size_t j = 0; j < Ny; j++) {\n\t\t\tif (y[j].index == i) {\n\t\t\t\tz[i] += y[j].value;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n int i;\n for (i = tid; i < N; i += blockDim.x * gridDim.x) {\n if (i < Nx) {\n z[x[i].index] += alpha * x[i].value;\n }\n if (i < Ny) {\n z[y[i].index] += y[i].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double val = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) val += alpha * x[i].value;\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) val += y[i].value;\n }\n z[tid] = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double val = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n val += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n val += y[i].value;\n }\n }\n z[tid] = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = alpha * x[i].value + y[i].value;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < Nx) {\n\t\tz[x[tid].index] += alpha * x[tid].value;\n\t}\n\telse if (tid < Nx + Ny) {\n\t\tz[y[tid - Nx].index] += y[tid - Nx].value;\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = 
alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) z[i] += alpha * x[j].value;\n }\n for (size_t j = 0; j < N; j++) {\n if (y[j].index == i) z[i] += y[j].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n\tsize_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tdouble sum = 0;\n\t\tfor (size_t i = 0; i < Nx; i++) {\n\t\t\tif (x[i].index == tid) {\n\t\t\t\tsum += alpha * x[i].value;\n\t\t\t}\n\t\t}\n\t\tfor (size_t i = 0; i < Ny; i++) {\n\t\t\tif (y[i].index == tid) {\n\t\t\t\tsum += y[i].value;\n\t\t\t}\n\t\t}\n\t\tz[tid] = sum;\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n \n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& e : x) {\n z_size = std::max(z_size, e.index + 1);\n }\n for (auto const& e : y) {\n z_size = std::max(z_size, e.index + 1);\n }\n \n // Allocate memory for the output vector\n z.resize(z_size);\n \n // Compute the output vector on rank 0\n if (rank == 0) {\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n }\n \n // Broadcast the output vector to all other ranks\n MPI_Bcast(&z[0], z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the partial sums\n std::vector partial_sums(z_size);\n for (auto const& element : x) {\n partial_sums[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_sums[element.index] += element.value;\n }\n\n // Reduce the partial sums\n std::vector global_sums(z_size);\n MPI_Allreduce(partial_sums.data(), global_sums.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (my_rank == 0) {\n z = global_sums;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the size of the result vector\n size_t zSize = 0;\n for (auto const& element : x) {\n zSize = std::max(zSize, element.index + 1);\n }\n for (auto const& element : y) {\n zSize = std::max(zSize, element.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(zSize);\n\n // Compute the result vector\n for (auto const& element : x) {\n z[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n z[element.index] += element.value;\n }\n\n // Gather the result vector on rank 0\n if (myRank == 0) {\n for (int i = 1; i < numRanks; ++i) {\n std::vector recvBuffer;\n MPI_Recv(&recvBuffer[0], recvBuffer.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recvBuffer.size(); ++j) {\n z[j] += recvBuffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n std::vector local_x, local_y;\n \n // Scatter x and y to all ranks\n MPI_Scatter(&x[0], x.size(), MPI_DOUBLE, &local_x[0], local_x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(&y[0], y.size(), MPI_DOUBLE, &local_y[0], local_y.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n \n // Compute local z\n std::vector local_z(z.size(), 0);\n for (size_t i = 0; i < local_x.size(); ++i) {\n local_z[local_x[i].index] += alpha * 
local_x[i].value;\n }\n for (size_t i = 0; i < local_y.size(); ++i) {\n local_z[local_y[i].index] += local_y[i].value;\n }\n \n // Gather local z to rank 0\n MPI_Gather(&local_z[0], local_z.size(), MPI_DOUBLE, &z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of elements in the sparse vector\n size_t num_elements = x.size();\n\n // Create a vector of zeros to store the result\n z.resize(num_elements);\n\n // Loop over the elements in the sparse vector\n for (size_t i = 0; i < num_elements; i++) {\n // Get the index and value of the element\n size_t index = x[i].index;\n double value = x[i].value;\n\n // Add the value of the element to the result\n z[index] += alpha * value;\n }\n\n // Loop over the elements in the sparse vector\n for (size_t i = 0; i < num_elements; i++) {\n // Get the index and value of the element\n size_t index = y[i].index;\n double value = y[i].value;\n\n // Add the value of the element to the result\n z[index] += value;\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements = x.size();\n\n // Create a buffer to store the result on each rank\n std::vector local_z(num_elements, 0.0);\n\n // Compute the local result\n for (size_t i = 0; i < num_elements; ++i) {\n local_z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_z.data(), z.data(), num_elements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n std::vector result;\n\n // Iterate over the elements in x and y\n for (size_t i = 0; i < num_elements_x; i++) {\n // Get the index and value of the current element in x\n size_t index_x = x[i].index;\n double value_x = x[i].value;\n\n // Iterate over the elements in y\n for (size_t j = 0; j < num_elements_y; j++) {\n // Get the index and value of the current element in y\n size_t index_y = y[j].index;\n double value_y = y[j].value;\n\n // If the indices of the current elements in x and y are equal, then add the values of the elements to the result\n if (index_x == index_y) {\n result.push_back({index_x, alpha * value_x + value_y});\n }\n }\n }\n\n // Sort the result vector by index\n std::sort(result.begin(), result.end(), [](Element const& a, Element const& b) { return a.index < b.index; });\n\n // Store the result in z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(result.size());\n for (size_t i = 0; i < result.size(); i++) {\n z[i] = result[i].value;\n }\n }\n}\n", + "\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the output vector\n z.resize(z_size, 0.0);\n\n // Compute the partial sums\n std::vector partial_sums(z_size, 0.0);\n for (auto const& element : x) {\n partial_sums[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_sums[element.index] += element.value;\n }\n\n // Reduce the partial sums\n std::vector global_sums(z_size, 0.0);\n MPI_Allreduce(partial_sums.data(), global_sums.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result in z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t resultSize = 0;\n for (auto const& element : x) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n for (auto const& element : y) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n\n // Create a vector to store the result on rank 0\n std::vector result(resultSize, 0);\n\n // Compute the result on each rank\n for (auto const& elementX : x) {\n result[elementX.index] += alpha * elementX.value;\n }\n for (auto const& elementY : y) {\n result[elementY.index] += elementY.value;\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recvBuffer(resultSize);\n MPI_Recv(recvBuffer.data(), resultSize, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < resultSize; j++) {\n result[j] += recvBuffer[j];\n }\n }\n z = result;\n } else {\n MPI_Send(result.data(), resultSize, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t result_size = 0;\n for (auto const& element : x) {\n result_size = std::max(result_size, element.index + 1);\n }\n for (auto const& element : y) {\n result_size = std::max(result_size, element.index + 1);\n }\n\n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(result_size, 0);\n }\n\n // Compute the partial result on each rank\n std::vector partial_result(result_size, 0);\n for (auto const& element : x) {\n partial_result[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_result[element.index] += element.value;\n }\n\n // Reduce the partial results to rank 0\n MPI_Reduce(partial_result.data(), z.data(), result_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the partial sum on each rank\n for (auto const& elem : x) {\n z[elem.index] += alpha * elem.value;\n }\n for (auto const& elem : y) {\n z[elem.index] += elem.value;\n }\n\n // Reduce the partial sums to rank 0\n MPI_Reduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the number of elements in the result vector\n size_t num_elements = x.size() + y.size();\n \n // Allocate memory for the result vector\n z.resize(num_elements);\n \n // Compute the local indices of the elements in the result vector\n std::vector local_indices(num_elements);\n for (size_t i = 0; i < num_elements; ++i) {\n local_indices[i] = i;\n }\n \n // Compute the local values of the elements in the result vector\n std::vector local_values(num_elements);\n for (size_t i = 0; i < num_elements; ++i) {\n local_values[i] = 0.0;\n }\n \n // Compute the global indices of the elements in the result vector\n std::vector global_indices(num_elements);\n MPI_Allgatherv(&local_indices[0], local_indices.size(), MPI_UNSIGNED_LONG, &global_indices[0], nullptr, nullptr, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n 
\n // Compute the global values of the elements in the result vector\n std::vector global_values(num_elements);\n MPI_Allgatherv(&local_values[0], local_values.size(), MPI_DOUBLE, &global_values[0], nullptr, nullptr, MPI_DOUBLE, MPI_COMM_WORLD);\n \n // Compute the final values of the elements in the result vector\n for (size_t i = 0; i < num_elements; ++i) {\n z[global_indices[i]] = alpha * x[global_indices[i]].value + y[global_indices[i]].value;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n \n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(z_size, 0);\n }\n \n // Compute the partial sums on each rank\n for (auto const& element : x) {\n if (rank == 0) {\n z[element.index] += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (rank == 0) {\n z[element.index] += element.value;\n }\n }\n \n // Reduce the partial sums to rank 0\n MPI_Reduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t result_size = 0;\n for (auto const& element : x) {\n result_size = std::max(result_size, element.index + 1);\n }\n for (auto const& element : y) {\n result_size = std::max(result_size, element.index + 1);\n }\n\n // Create a vector to store the result on rank 0\n std::vector result(result_size, 0.0);\n\n // Compute the result on each rank\n for (auto const& element : x) {\n result[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n result[element.index] += element.value;\n }\n\n // Gather the results from all ranks on rank 0\n MPI_Gather(result.data(), result_size, MPI_DOUBLE, z.data(), result_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n int num_elements = x.size();\n\n // Create a buffer to store the results\n std::vector z(num_elements);\n\n // Compute the results in parallel\n #pragma omp parallel for\n for (int i = 0; i < num_elements; i++) {\n z[i] = alpha * x[i].value * y[i].value;\n }\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = x.size() + y.size();\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the local size of the output vector\n size_t local_z_size = z_size / size;\n\n // Compute the starting index of the local output vector\n size_t local_z_start = rank * local_z_size;\n\n // Compute the local output vector\n for (size_t i = local_z_start; i < local_z_start + local_z_size; ++i) {\n z[i] = 0;\n }\n\n // Compute the local contribution to the output vector\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value;\n }\n\n for (size_t i = 0; i < y.size(); ++i) {\n z[y[i].index] += y[i].value;\n }\n\n // Reduce the local output vectors to the global output vector\n MPI_Allreduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 stores the result\n if (rank == 0) {\n // Sort the output vector by index\n std::sort(z.begin(), z.end(), [](const double& a, const 
double& b) { return a.index < b.index; });\n\n // Remove duplicate elements from the output vector\n z.erase(std::unique(z.begin(), z.end()), z.end());\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the size of the result vector\n size_t resultSize = 0;\n for (auto const& element : x) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n for (auto const& element : y) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n\n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(resultSize, 0);\n }\n\n // Compute the partial sums on each rank\n for (auto const& element : x) {\n if (rank == 0) {\n z[element.index] += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (rank == 0) {\n z[element.index] += element.value;\n }\n }\n\n // Reduce the partial sums to rank 0\n MPI_Allreduce(MPI_IN_PLACE, z.data(), resultSize, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a new vector z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(&z[0], z.size(), MPI::DOUBLE, &z[0], z.size(), MPI::DOUBLE, 0);\n}\n", + "\n\n // Create a new vector z with the same size as x\n z.resize(x.size());\n\n // Loop over the elements of x and y and perform the operation z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[i].index = x[i].index;\n z[i].value = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks into rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a new vector z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(&z[0], z.size(), MPI::DOUBLE, &z[0], z.size(), MPI::DOUBLE, 0);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "cuda", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors of size Nx and Ny. Store the result in z.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x or y.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\n__global__ void sparseAxpy(double alpha, const Element *x, const Element *y, double *z, size_t Nx, size_t Ny, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) sum += alpha * x[j].value;\n }\n for (size_t j = 0; j < Ny; j++) {\n if (y[j].index == i) sum += y[j].value;\n }\n z[i] = sum;\n }\n}\n", + "sparseSparseArray(1,", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) { sum += alpha * x[i].value; }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) { sum += y[i].value; }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = alpha * x[i].value + y[i].value;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n\tsize_t i = threadIdx.x;\n\tif (i < Nx) {\n\t\tz[x[i].index] += alpha * x[i].value;\n\t}\n\tif (i < Ny) {\n\t\tz[y[i].index] += y[i].value;\n\t}\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tz[tid].value = alpha * x[tid].value + y[tid].value;\n\t}\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n size_t i = tid;\n double sum = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) {\n sum += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < Ny; j++) {\n if (y[j].index == i) {\n sum += y[j].value;\n }\n }\n z[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + 
"\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) sum += alpha * x[i].value;\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) sum += y[i].value;\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == idx) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == idx) {\n sum += y[i].value;\n }\n }\n z[idx] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < Nx) {\n atomicAdd(&z[x[tid].index], alpha * x[tid].value);\n } else if (tid < Nx + Ny) {\n atomicAdd(&z[y[tid - Nx].index], y[tid - Nx].value);\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Store the result in z on rank 0.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the output vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the corresponding element in x\n for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n break;\n }\n }\n\n // Find the corresponding element in y\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n break;\n }\n }\n\n // Compute the output value\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the output vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buffer;\n MPI_Recv(&recv_buffer[0], recv_buffer.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_buffer.size(); ++j) {\n z[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n 
for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n }\n }\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n }\n }\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the result vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buffer;\n MPI_Recv(&recv_buffer[0], z_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < z_size; ++j) {\n z[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n z.resize(num_elements_x + num_elements_y);\n\n // Loop over the elements in x and y and add them to the result\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x + num_elements_y; i++) {\n if (i < num_elements_x) {\n z[i] += alpha * x[i].value;\n }\n if (i < num_elements_y) {\n z[i] += y[i].value;\n }\n }\n\n // Reduce the result to rank 0\n MPI_Reduce(&z[0], z.size(), MPI_DOUBLE, &z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize z on rank 0\n if (rank == 0) {\n z.resize(x.size());\n }\n\n // Compute the result in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (rank == 0) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n }\n\n // Gather the results to rank 0\n MPI_Gather(z.data(), x.size(), MPI_DOUBLE, z.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n z.resize(num_elements_x + num_elements_y);\n\n // Perform the sparse axpy operation in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x + num_elements_y; ++i) {\n // Initialize the result to 0\n z[i] = 0.0;\n\n // Add the corresponding elements from x and y\n for (size_t j = 0; j < num_elements_x; ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < num_elements_y; ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Initialize the output vector\n z.resize(z_size, 0.0);\n\n // Compute the partial sums\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value;\n }\n\n #pragma omp parallel for\n for (size_t i = 0; i < y.size(); ++i) {\n z[y[i].index] += y[i].value;\n }\n\n // Reduce the partial sums\n MPI_Allreduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a temporary vector to store the partial results on each rank\n std::vector partial_z(z.size(), 0.0);\n\n // Compute the partial results in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n for (auto& element : x) {\n if (element.index == i) {\n partial_z[i] += 
alpha * element.value;\n }\n }\n for (auto& element : y) {\n if (element.index == i) {\n partial_z[i] += element.value;\n }\n }\n }\n\n // Reduce the partial results to rank 0 using MPI\n MPI_Reduce(partial_z.data(), z.data(), z.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = x.size() + y.size();\n\n // Allocate the output vector on rank 0\n if (rank == 0) {\n z.resize(z_size);\n }\n\n // Compute the local size of the output vector\n size_t local_z_size = (z_size + size - 1) / size;\n\n // Allocate the local output vector\n std::vector local_z(local_z_size);\n\n // Compute the local indices of the output vector\n std::vector local_z_indices(local_z_size);\n for (size_t i = 0; i < local_z_size; i++) {\n local_z_indices[i] = i + rank * local_z_size;\n }\n\n // Compute the local contributions to the output vector\n #pragma omp parallel for\n for (size_t i = 0; i < local_z_size; i++) {\n size_t index = local_z_indices[i];\n double value = 0.0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == index) {\n value += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == index) {\n value += y[j].value;\n }\n }\n local_z[i] = value;\n }\n\n // Gather the local contributions to the output vector on rank 0\n MPI_Gather(local_z.data(), local_z_size, MPI_DOUBLE, z.data(), local_z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements = x.size() + y.size();\n\n // Create a vector to store the result\n std::vector result(num_elements);\n\n // Merge the two sparse vectors into the result vector\n size_t i = 0, j = 0, k = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n result[k++] = x[i++];\n } else if (x[i].index > y[j].index) {\n result[k++] = y[j++];\n } else {\n result[k++] = {x[i].index, x[i++].value + y[j++].value};\n }\n }\n while (i < x.size()) {\n result[k++] = x[i++];\n }\n while (j < y.size()) {\n result[k++] = y[j++];\n }\n\n // Reduce the result vector to rank 0\n MPI_Reduce(result.data(), z.data(), num_elements, MPI_DOUBLE_INT, MPI_SUM, 0);\n}\n", + "\n\n // Initialize z to 0\n z.assign(x.size(), 0);\n\n // Compute z = alpha*x+y in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t z_size = x.size() + y.size();\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma openmp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n // Find the index of the element in x that matches the index of the current element in z\n auto it_x = std::find_if(x.begin(), x.end(), [&](Element const& element) { return element.index == i; });\n\n // Find the index of the element in y that matches the index of the current element in z\n auto it_y = std::find_if(y.begin(), y.end(), [&](Element const& element) { return element.index == i; });\n\n // Compute the value of the current element in z\n z[i] = (it_x != x.end() ? 
alpha * it_x->value : 0.0) + (it_y != y.end() ? it_y->value : 0.0);\n }\n\n // Gather the result vector on rank 0\n MPI_Gather(&z[0], z_size, MPI_DOUBLE, &z[0], z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vectors\n size_t n = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(n);\n\n // Compute the results in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results on rank 0\n MPI_Gather(temp.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z on all ranks\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0);\n}\n", + "\n\n // Check if the rank is 0\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n if (rank == 0) {\n // Initialize z with zeros\n z.resize(x.size());\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n }\n }\n\n // Perform sparse axpy operation in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (rank == 0) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t numElements = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(numElements);\n\n // Compute the result in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < numElements; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(temp.data(), numElements, MPI_DOUBLE, z.data(), numElements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t numElements = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(numElements);\n\n // Compute the result in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < numElements; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results on rank 0\n MPI_Gather(&temp[0], numElements, MPI_DOUBLE, &z[0], numElements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast alpha to all ranks\n MPI_Bcast(&alpha, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the output vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(z_size);\n }\n\n // Compute the output vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double value = 0.0;\n for (auto const& element : x) {\n if (element.index == i) {\n value += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (element.index == i) {\n value += element.value;\n }\n }\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z[i] = value;\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n 
for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the value of x at index i\n for (auto const& elem : x) {\n if (elem.index == i) {\n x_value = elem.value;\n break;\n }\n }\n\n // Find the value of y at index i\n for (auto const& elem : y) {\n if (elem.index == i) {\n y_value = elem.value;\n break;\n }\n }\n\n // Compute the value of z at index i\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the result vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::vector recv_buf;\n MPI_Recv(&recv_buf[0], recv_buf.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_buf.size(); ++j) {\n z[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the result vector\n z.resize(z_size, 0.0);\n\n // Compute the partial result on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the value of x at index i\n for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n break;\n }\n }\n\n // Find the value of y at index i\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n break;\n }\n }\n\n // Compute the partial result\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the partial results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::vector partial_result(z_size);\n MPI_Recv(partial_result.data(), z_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < z_size; ++j) {\n z[j] += partial_result[j];\n }\n }\n } else {\n MPI_Send(z.data(), z_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n std::vector result(num_elements_x);\n\n // Compute z = alpha*x+y in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x; i++) {\n result[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(result.data(), num_elements_x, MPI_DOUBLE, z.data(), num_elements_x, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(Kokkos::View const& s, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < s.extent(0); ++j) {\n if (s(j)[0] == 'a' + i) {\n ++bins[i];\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < s.extent(0); ++j) {\n if (s(j)[0] == 'a' + i) {\n ++bins[i];\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kok\u30b3\u30af::parallel_for(0, s.extent(0), [&](size_t i) {\n char first = s(i)[0];\n size_t index = first - 'a';\n ++output(index);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[s[i][0] - 'a']++;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"firstLetterCounts\", Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + 
"source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins[s[i][0] - 'a']++;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "Counts, 0 0, 0,0,0, 0,0, ~:~:", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + 
"source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n int idx = s(i)[0] - 'a';\n if (idx >= 0 && idx < 26) {\n ++bins[idx];\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s[i][0] - 'a']++;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "omp", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. 
Store the output in `bins` array.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < 26; ++i) {\n char c = 'a' + i;\n for (auto const& str : s) {\n if (str[0] == c) {\n ++counts[i];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n size_t index = static_cast(s[i][0] - 'a');\n #pragma omp atomic\n bins[index]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n size_t index = s[i][0] - 'a';\n ++ didik[index];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "serial", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. 
Store the output in `bins` array.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n ++bins[str[0] - 'a'];\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n bins[str[0] - 'a']++;\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n ++bins[str[0] - 'a'];\n }\n}\n", + "\n\n for (std::string const& str : s) {\n if (str.empty()) {\n continue;\n }\n char first = str[0];\n if (first >= 'a' && first <= 'z') {\n bins[first - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n bins[str[0] - 'a']++;\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "hip", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\n__global__ void firstLetterCounts(const char **s, size_t N, size_t bins[26]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n char first = s[tid][0];\n atomicAdd(&bins[first - 'a'], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid < N) {\n char c = s[gid][0];\n atomicAdd(&bins[c - 'a'], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int index = s[gid][0] - 'a';\n atomicAdd(&bins[index], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n char first = s[idx][0];\n atomicAdd(&bins[first - 'a'], 1);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int bin = s[gid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n if (idx >= 0 && idx < 26) {\n atomicAdd(&bins[idx], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (size_t i = tid; i < N; i += stride) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n int bin = s[tid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n bins[s[gid][0] - 'a']++;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int index = s[gid][0] - 'a';\n atomicAdd(&bins[index], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) return;\n char first = s[idx][0];\n int offset = first - 'a';\n atomicAdd(&firstCounts[offset], 1);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int bin = s[gid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = 
bid * hipBlockDim_x + tid;\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of s. The result is stored in bins on rank 0.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ": \u51a2, Synag\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2 Saxe\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2zernezerne.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (const auto& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, 
MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet.\n std::array local_bins;\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n local_bins.fill(0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t n = s.size();\n std::array local_bins;\n local_bins.fill(0);\n for (size_t i = 0; i < n; i++) {\n if (rank == i % size) {\n local_bins[s[i][0] - 'a']++;\n }\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 
0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n std::fill(local_bins.begin(), local_bins.end(), 0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings starting with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) continue;\n ++local_bins[str[0] - 'a'];\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = s.size() / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = s.size();\n }\n std::array local_bins = {};\n for (int i = start; i < end; i++) {\n local_bins[s[i][0] - 
'a']++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "cuda", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\n__global__ void firstLetterCounts(const char **s, size_t N, size_t bins[26]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int firstLetter = s[idx][0] - 'a';\n atomicAdd(&bins[firstLetter], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int start = tid * 26 / N;\n int end = (tid + 1) * 26 / N;\n for (int i = start; i < end; i++) {\n bins[i] = 0;\n }\n for (int i = 0; i < N; i++) {\n int idx = s[i][0] - 'a';\n if (idx >= 0 && idx < 26) {\n atomicAdd(&bins[idx], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int firstLetter = s[idx][0] - 'a';\n atomicAdd(&bins[firstLetter], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n char firstLetter = s[idx][0];\n atomicAdd(&bins[firstLetter - 'a'], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int bin = s[tid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n 
}\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx >= N) {\n return;\n }\n bins[s[idx][0] - 'a']++;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int idx = s[tid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of s. The result is stored in bins on rank 0.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n local_bins.fill(0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the first letter of its own strings\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n #pragma omp parallel for\n for 
(size_t i = 0; i < s.size(); i++) {\n local_bins[s[i][0] - 'a']++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk = s.size() / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = s.size();\n }\n std::array local_bins;\n local_bins.fill(0);\n for (int i = start; i < end; i++) {\n local_bins[s[i][0] - 'a']++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins;\n local_bins.fill(0);\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n local_bins[s[i][0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank counts the first letter of its own strings\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = my_rank; i < s.size(); i += num_ranks) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the counts for a subset of the alphabet\n int start = rank * 26 / size;\n int end = (rank + 1) * 26 / size;\n\n std::array local_bins;\n local_bins.fill(0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n int index = s[i][0] - 'a';\n if (index >= start && index < end) {\n local_bins[index]++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the first letters in its portion of the vector\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = rank; i < s.size(); i += size) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce 
the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(std::begin( bins ), std::end( bins ), 0);\n\n // Count the number of words starting with each letter in parallel\n #pragma openmp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n ++ bins[s[i][0] - 'a'];\n }\n\n // Gather the results on rank 0\n MPI_Gather(MPI_IN_PLACE, 26, MPI_UNSIGNED_LONG, bins.data(), 26, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the first letter of the strings in its portion of the vector\n std::array local_bins = {};\n #pragma omp parallel for\n for (size_t i = rank * s.size() / nprocs; i < (rank + 1) * s.size() / nprocs; ++i) {\n local_bins[s[i][0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings in s that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins = {};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": 
"21_histogram_bin_0-100", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(Kokkos::View const& x, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(i / 10) += (x(i) < 100) ? 1 : 0;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[x(i) / 10]++;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA(const size_t& i) {\n 
bins(x(i) / 10) += 1;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[x(i) / 10]++;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(i / 10)++;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i / 10] += (x(i) >= 10 * i / 10 && x(i) < 10 * (i / 10 + 1));\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(bins.size(), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j] >= 10 * i && x[j] < 10 * (i + 1)) { bins[i] += 1; }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0252551472, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.3795785722, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.7090535368, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.4546251872, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3543502484, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.227521434, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.376637304, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n const double val = x(i);\n update += (val >= 0 && val < 10) + (val >= 10 && val < 20) + (val >= 20 && val < 30) +\n (val >= 30 && val < 40) + (val >= 40 && val < 50) + (val >= 50 && val < 60) +\n (val >= 60 && val < 70) + (val >= 70 && val < 80) + (val >= 80 && val < 90) +\n (val >= 90 && val < 100);\n }, bins);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n const double val = x(i);\n 
bins(val / 10) += 1;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n const double val = x(i);\n bins(val / 10) += 1;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i / 10] += 1;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Vector x 
contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n size_t idx = static_cast(x[i] / 10);\n ++counts[idx];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n size_t bin = static_cast(x[i] / 10);\n bins[bin]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)] += 1;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "serial", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... 
and store the counts in `bins`.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n std::fill(\u5803.begin(), \u5803.end(), 0);\n for (double const val : x) {\n \u5803[static_cast(val / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double val : x) {\n bins[static_cast(val / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n int i = std::min((int)(v - bins.front()), (int)bins.size() - 1);\n bins[i]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "hip", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use AMD HIP to compute in parallel. 
The kernel is initialized with at least as many threads as values in x.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\n__global__ void binsBy10Count(const double *x, size_t N, size_t bins[10]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipGridDim_x;\n for (size_t i = idx; i < N; i += stride) {\n bins[x[i] / 10] += 1;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipGridDim_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n int bin = (int)(x[i] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n int bin = (int)(x[i] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n int bin = x[i] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n int bin = (int)x[tid] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n bins[(int)(x[i] / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use MPI to compute in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. The result is stored in bins on rank 0.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n bins[bin]++;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv;\n MPI_Recv(recv.data(), recv.size(), MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < bins.size(); j++) {\n bins[j] += recv[j];\n }\n }\n } else {\n MPI_Send(bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of values in each bin\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n\n // Reduce the bins across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local range.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n bins[bin]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of values in each bin on each rank\n std::array local_count = {};\n for (auto v : x) {\n local_count[static_cast(v / 10)]++;\n }\n\n // Gather the local counts into bins on rank 0\n MPI_Gather(local_count.data(), 10, MPI_UNSIGNED_LONG, bins.data(), 10, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its subset of x\n std::array local_bins = {};\n for (double xi : x) {\n local_bins[static_cast(xi / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double xi : x) {\n int bin = static_cast(xi / 10);\n bins[bin]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), 
MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n for (double val : x) {\n int bin = static_cast(val / 10);\n if (bin < 10) {\n bins[bin]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array localBins{};\n for (double v : x) {\n localBins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(localBins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (auto v : x) {\n bins[v / 10]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] >= 0 && x[i] < 10) {\n bins[0]++;\n } else if (x[i] >= 10 && x[i] < 20) {\n bins[1]++;\n } else if (x[i] >= 20 && x[i] < 30) {\n bins[2]++;\n } else if (x[i] >= 30 && x[i] < 40) {\n bins[3]++;\n } else if (x[i] >= 40 && x[i] < 50) {\n bins[4]++;\n } else if (x[i] >= 50 && x[i] < 60) {\n bins[5]++;\n } else if (x[i] >= 60 && x[i] < 70) {\n bins[6]++;\n } else if (x[i] >= 70 && x[i] < 80) {\n bins[7]++;\n } else if (x[i] >= 80 && x[i] < 90) {\n bins[8]++;\n } else if (x[i] >= 90 && x[i] < 100) {\n bins[9]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local part of x.\n std::array localBins = {};\n for (double v : x) {\n localBins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts.\n MPI_Reduce(localBins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its subset of x\n std::array local_bins = {};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), recv_bins.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < bins.size(); j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), local_bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local range.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int 
rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n if (bin < 10) {\n bins[bin]++;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array tmp;\n MPI_Recv(tmp.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; j++) {\n bins[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its own bins.\n std::array local_bins = {};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), recv_bins.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_bins.size(); j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), local_bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "cuda", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use CUDA to compute in parallel. 
The kernel is initialized with at least as many threads as values in x.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\n__global__ void binsBy10Count(const double *x, size_t N, size_t bins[10]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x;\nif (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = (int)floor(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of values in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of values in its local portion of x\n std::array local_bins;\n std::fill(local_bins.begin(), local_bins.end(), 0);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce local bins to global bins\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins = {};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the bin size\n const double bin_size = (x.back() - x.front()) / bins.size();\n\n // Compute the local histogram\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n const double value = x[i];\n const int bin_index = std::min(static_cast(bins.size() - 1),\n static_cast(std::floor((value - x.front()) / bin_size)));\n ++local_bins[bin_index];\n }\n\n // Reduce the local histograms to the global histogram\n MPI_Reduce(MPI_IN_PLACE, local_bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ", 10]", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double value : x) {\n local_bins[static_cast(value / 10)]++;\n }\n\n // Reduce the local counts into the global bins array on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n int bin = static_cast(x[i] / 10);\n if (bin < 10) {\n #pragma omp atomic\n ++bins[bin];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::array local_bins = {};\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n int bin = static_cast(x[i] / 10);\n local_bins[bin]++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the counts for a subset of the bins.\n size_t start_bin = rank * bins.size() / nprocs;\n size_t end_bin = (rank + 1) * bins.size() / nprocs;\n\n std::fill(bins.begin() + start_bin, bins.begin() + end_bin, 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n size_t bin = static_cast(x[i] / 10);\n if (bin >= start_bin && bin < end_bin) {\n ++bins[bin];\n }\n }\n\n // Gather the counts from all ranks into bins on rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n int bin = static_cast(x[i]) / 10;\n #pragma omp atomic\n ++bins[bin];\n }\n\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::array recv;\n MPI_Recv(recv.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; ++j) {\n bins[j] += recv[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its portion of x\n std::array local_bins = {0};\n #pragma omp parallel for\n for (size_t i = rank * x.size() / size; i < (rank + 1) * x.size() / size; ++i) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the local counts into the global bins on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; ++i) {\n std::array recv_bins;\n 
MPI_Recv(recv_bins.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of values in its local portion of x\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins = {};\n for (size_t i = rank; i < x.size(); i += nprocs) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x\n std::array local_bins = {0};\n #pragma omp parallel for\n for (size_t i = rank; i < x.size(); i += nprocs) {\n int bin = static_cast(x[i] / 10);\n local_bins[bin]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use Kokkos to count in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(Kokkos::View const& points, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n using Kokkos::parallel_for;\n using Kokkos::Functor;\n\n struct CountQuadrants {\n Kokkos::View bins;\n Kokkos::View points;\n\n KOKKOS_FUNCTION\n void operator()(const int i) const {\n const auto& point = points[i];\n if (point.x > 0) {\n if (point.y > 0) {\n ++bins[0];\n } else {\n ++bins[2];\n }\n } else {\n if (point.y > 0) {\n ++bins[1];\n } else {\n ++bins[3];\n }\n }\n }\n };\n\n parallel_for(Kokkos::RangePolicy(0, points.size()), CountQuadrants{bins, points});\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA (const size_t& i, size_t& update) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++update[0];\n } else {\n ++update[3];\n }\n } else {\n if (points(i)->y > 0) {\n ++update[1];\n } else {\n ++update[2];\n }\n }\n }, bins);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"countQuadrants\", Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0436136976, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0370959748, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"countQuadrants\", points.size(), KOKKOS_LAMBDA (const size_t i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0448698244, + "runs": [ + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.0433225064, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x >= 0 && y >= 0) {\n ++bins(0);\n } else if (x < 0 && y >= 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "{x=1.5,y=1.1}", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0449596554, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0435664028, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x >= 0) {\n if (y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n const Point& p = points(i);\n if (p.x > 0) {\n if (p.y > 0) {\n ++update[0];\n } else {\n ++update[3];\n }\n } else {\n if (p.y > 0) {\n ++update[1];\n } else {\n ++update[2];\n }\n }\n }, bins);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n double x = points[i]->x;\n double y = points[i]->y;\n if (x >= 0 && y >= 0) {\n ++update;\n } else if (x < 0 && y >= 0) {\n ++update;\n } else if (x < 0 && y < 0) {\n ++update;\n } else {\n ++update;\n }\n }, bins);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + 
"generated_output": "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = points(i);\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.046168253, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0473000776, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"countQuadrants\", Kokkos::RangePolicy<>(0, points.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x > 0 && y > 0) {\n ++bins(0);\n } else if (x < 0 && y > 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else if (x > 0 && y < 0) {\n ++bins(3);\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy<>(0, points.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x > 0 && y > 0) {\n ++bins(0);\n } else if (x < 0 && y > 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else if (x > 0 && y < 0) {\n ++bins(3);\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n size_t idx = 0;\n if (x >= 0) {\n idx += 1;\n }\n if (y >= 0) {\n idx += 2;\n }\n ++bins(idx);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n size_t bin = 0;\n if (x > 0) {\n bin += 1;\n }\n if (y > 0) {\n bin += 2;\n }\n Kokkos::atomic_fetch_add(&bins(bin), 1);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using exec_space = KokKos::DefaultExecutionSpace;\n using league_space = KokKos::TeamVectorRange;\n\n league_space range(exec_space{}, points.size());\n\n auto policy = KokKos::TeamVectorRange\u653f\u7b56(range);\n\n auto lambda = KOKKOS_LAMBDA (const Point point, size_t* count) {\n if (point.x > 0) {\n if (point.y > 0) {\n ++count[0];\n } else {\n ++count[3];\n }\n } else {\n if (point.y > 0) {\n ++count[1];\n } else {\n ++count[2];\n }\n }\n };\n\n exec_space exec;\n exec.parallel_reduce(\n \"count_quadrants\", policy, lambda, Rcpp::as(count));\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy>(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = *points(i);\n if (p.x > 0) {\n if (p.y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. 
The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use OpenMP to count in parallel.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const Point& p = points[i];\n if (p.x >= 0 && p.y >= 0) {\n ++bins[0];\n } else if (p.x < 0 && p.y >= 0) {\n ++bins[1];\n } else if (p.x < 0 && p.y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); i++) {\n const auto& p = points[i];\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n 
if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++oient[0];\n } else {\n ++oient[3];\n }\n } else {\n if (points[i].y > 0) {\n ++oient[1];\n } else {\n ++oient[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n double x = points[i].x;\n double y = points[i].y;\n if (x > 0) {\n if (y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "serial", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. 
The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (const Point& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (const auto& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (point.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (Point const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (Point const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else if (p.x < 0) {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else if 
(p.y < 0) {\n bins[3]++;\n }\n } else if (p.x < 0) {\n if (p.y > 0) {\n bins[1]++;\n } else if (p.y < 0) {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& pt : points) {\n if (pt.x > 0) {\n if (pt.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (pt.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "(", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "hip", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use AMD HIP to count in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\n__global__ void countQuadrants(const Point *points, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n while (idx < N) {\n if (points[idx].x >= 0) {\n if (points[idx].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[idx].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n idx += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int quadrant = 0;\n if (points[gid].x > 0) {\n quadrant += 1;\n }\n if (points[gid].y > 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int quadrant = 0;\n if (points[tid].x > 0) {\n quadrant += 1;\n }\n if (points[tid].y > 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid >= N) return;\n const Point &p = points[tid];\n int quadrant = 0;\n if (p.x > 0) quadrant |= 1;\n if (p.y > 0) quadrant |= 2;\n atomicAdd(&bins[quadrant], 1);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n const Point &p = points[gid];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n Point p = points[tid];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if 
(points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "(4)", + "(4)", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n if (idx >= N) {\n return;\n }\n\n const Point &p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int blockId = hipBlockIdx_x;\n int blockDim = hipBlockDim_x;\n\n for (int i = tid; i < N; i += blockDim) {\n if (points[i].x > 0 && points[i].y > 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y > 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[i].x > 0 && points[i].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n const Point p = points[gid];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 
1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[i].x >= 0 && points[i].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n Point p = points[tid];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = tid + bid * hipBlockDim_x;\n if (idx >= N) {\n return;\n }\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use MPI to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The result is stored in bins on rank 0.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its own quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (const auto& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : 
points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0 && point.y >= 0) {\n local_bins[0]++;\n } else if (point.x < 0 && point.y >= 0) {\n local_bins[1]++;\n } else if (point.x >= 0 && point.y < 0) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (auto const& p : points) {\n int quadrant = 0;\n if (p.x > 0) quadrant += 1;\n if (p.y > 0) quadrant += 2;\n bins[quadrant]++;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array tmp;\n MPI_Recv(tmp.data(), tmp.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(bins.data(), bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_bins[4] = {0};\n for (Point const& p : points) {\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for 
(size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y > 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y > 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 
0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant.\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array localBins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n localBins[0]++;\n } else {\n localBins[3]++;\n }\n } else {\n if (point.y > 0) {\n localBins[1]++;\n } else {\n localBins[2]++;\n }\n }\n }\n\n MPI_Reduce(localBins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "cuda", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use CUDA to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\n__global__ void countQuadrants(const Point *points, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int bin = 0;\n if (points[tid].x > 0) bin += 1;\n if (points[tid].y > 0) bin += 2;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int bin = 0;\n if (points[tid].x >= 0) {\n bin += 1;\n }\n if (points[tid].y >= 0) {\n bin += 2;\n }\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += gridDim.x * blockDim.x;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n while (idx < N) {\n if (points[idx].x >= 0 && points[idx].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[idx].x < 0 && points[idx].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[idx].x < 0 && points[idx].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[idx].x >= 0 && points[idx].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n idx += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n int quadrant = 0;\n if (p.x > 0) {\n quadrant |= 1;\n }\n if (p.y > 0) {\n quadrant |= 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n Point p = points[idx];\n int quadrant = 0;\n if (p.x >= 0) {\n quadrant += 2;\n }\n if (p.y >= 0) {\n quadrant += 1;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else 
{\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx >= N) {\n return;\n }\n\n const Point &p = points[idx];\n int quadrant = 0;\n if (p.x >= 0) {\n quadrant += 1;\n }\n if (p.y >= 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant - 1], 1);\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[1], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n const Point p = points[tid];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n if (idx >= N) {\n return;\n }\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n while (i < N) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[i].y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += gridDim.x * blockDim.x;\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int count = 0;\n for (int i = idx; i < N; i += blockDim.x) {\n if (points[i].x > 0 && points[i].y > 0) {\n count++;\n } else if (points[i].x < 0 && points[i].y > 0) {\n count++;\n } else if (points[i].x < 0 && points[i].y < 0) {\n count++;\n } else if (points[i].x > 0 && points[i].y < 0) {\n count++;\n }\n }\n bins[idx] = count;\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n 
atomicAdd(&bins[3], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use MPI and OpenMP to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. The result is stored in bins on rank 0.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); i++) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to zero\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n ++bins[0];\n } else if (points[i].x < 0 && points[i].y >= 0) {\n ++bins[1];\n } else if (points[i].x < 0 && points[i].y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the vector\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n 
local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& point = points[i];\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+:local_bins)\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n ++local_bins[0];\n } else if (points[i].x < 0 && points[i].y >= 0) {\n ++local_bins[1];\n } else if (points[i].x < 0 && points[i].y < 0) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n Point p = points[i];\n if (p.x >= 0 && p.y >= 0) {\n local_bins[0]++;\n } else if (p.x < 0 && p.y >= 0) {\n local_bins[1]++;\n } else if (p.x < 0 && p.y < 0) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to zero\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const Point& point = points[i];\n if (point.x >= 0 && point.y >= 0) {\n ++bins[0];\n } else if (point.x < 0 && point.y >= 0) {\n ++bins[1];\n } else if (point.x < 0 && point.y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local subset of points\n std::array local_bins = {0, 0, 0, 0};\n #pragma opm parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n if (points[i].x > 0 && points[i].y > 0) {\n ++local_bins[0];\n } else if (points[i].x < 0 && points[i].y > 0) {\n ++local_bins[1];\n } else if (points[i].x < 0 && points[i].y 
< 0) {\n ++local_bins[2];\n } else if (points[i].x > 0 && points[i].y < 0) {\n ++local_bins[3];\n }\n }\n\n // Gather the local counts into bins on rank 0\n MPI_Gather(local_bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for reduction(+: bins)\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the points in its portion of the image\n #pragma omp parallel for reduction(+: bins)\n for (size_t i = rank; i < points.size(); i += nprocs) {\n const Point& point = points[i];\n if (point.x >= 0) {\n if (point.y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the array\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Use Kokkos to compute in parallel. Assume that Kokkos has already been initialized.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(Kokkos::View const& x, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(\"countQuartiles\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, 
+ { + "generated_output": "\n\n const size_t num_elements = x.extent(0);\n Kokkos::parallel_for(\"countQuartiles\", Kokkos::RangePolicy(0, num_elements), KOKKOS_LAMBDA(const size_t i) {\n const double frac = x(i) - floor(x(i));\n if (frac < 0.25) {\n ++quart[0];\n } else if (frac < 0.5) {\n ++quart[1];\n } else if (frac < 0.75) {\n ++quart[2];\n } else {\n ++quart[3];\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n\n parallel_for(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i) {\n double frac = x(i) - floor(x(i));\n if (frac < 0.25) {\n ++quart[0];\n } else if (frac < 0.5) {\n ++quart[1];\n } else if (frac < 0.75) {\n ++quart[2];\n } else {\n ++quart[3];\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const size_t n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n auto exec_space = Kokkos::DefaultExecutionSpace::instance();\n auto policy = Kokkos::RangePolicy<>(exec_space, 0, x.extent(0));\n\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const size_t& i) {\n double frac = std::modf(x(i), &x(i));\n if (frac < 0.25)\n ++bins(0);\n else if (frac < 0.5)\n ++bins(1);\n else if (frac < 0.75)\n ++bins(2);\n else\n ++bins(3);\n });\n}\n", + "source_write_success": true, + 
"did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double fract = std::fmod(x[i], 1.0);\n if (fract < 0.25) {\n ++bins[0];\n } else if (fract < 0.5) {\n ++bins[1];\n } else if (fract < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } 
else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "<,\u51a2~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma simd\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = std::fmod(x[i], 1.0);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++(std::get<0>(std::ref(x)));\n } else if (frac < 0.5) {\n ++(std::get<1>(std::ref(x)));\n } else if (frac < 0.75) {\n ++(std::get<2>(std::ref(x)));\n } else {\n ++(std::get<3>(std::ref(x)));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n #pragma omp parallel for\n for (size_t 
i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = std::fmod(x[i], 1.0);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "serial", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double const& xi : x) {\n if (xi < 0.25) {\n ++\u0648\u064a\u0627\u062a[0];\n } else if (xi < 0.5) {\n ++\u0648\u064a\u0627\u062a[1];\n } else if (xi < 0.75) {\n ++\u0648\u064a\u0627\u062a[2];\n } else {\n ++\u0648\u064a\u0627\u062a[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (auto v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n bins[0]++;\n else if (frac < 0.5)\n bins[1]++;\n else if (frac < 0.75)\n bins[2]++;\n else\n bins[3]++;\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n bins[0]++;\n else if (frac < 0.5)\n bins[1]++;\n else if (frac < 0.75)\n 
bins[2]++;\n else\n bins[3]++;\n }\n}\n", + "\n\n bins.fill(0);\n for (const auto &xi : x) {\n const auto frac = std::modf(xi, nullptr);\n if (frac >= 0.25 && frac < 0.5) {\n ++bins[0];\n } else if (frac >= 0.5 && frac < 0.75) {\n ++bins[1];\n } else if (frac >= 0.75 && frac < 1.0) {\n ++bins[2];\n } else if (frac == 0.0) {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (auto const& v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "hip", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\n__global__ void countQuartiles(const double *x, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = hipGetGlobalIndex();\n if (idx >= N) return;\n\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&quartiles[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&quartiles[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&quartiles[2], 1);\n } else {\n atomicAdd(&quartiles[3], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n double frac = fmod(x[i], 1);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n 
} else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n 
atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n double f = fmod(x[idx], 1.0);\n if (f < 0.25)\n atomicAdd(&bins[0], 1);\n else if (f < 0.5)\n atomicAdd(&bins[1], 1);\n else if (f < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid >= N) {\n return;\n }\n\n double frac = x[gid] - floor(x[gid]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use MPI to compute in parallel. Assume that MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double d : x) {\n double frac = fmod(d, 1.0);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Count the number of doubles in each quartile.\n for (double v : x) {\n if (v < 0.25) {\n ++bins[0];\n } else if (v < 0.5) {\n ++bins[1];\n } else if (v < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Gather the counts from all ranks on rank 0.\n MPI_Gather(bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each quartile.\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n for (auto v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of the vector\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n 
bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(&recv_bins, 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(&local_bins, 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each quartile\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the local bins\n std::array local_bins = {0, 0, 0, 0};\n\n // Count the number of doubles in each quartile\n for (double d : x) {\n double frac = fmod(d, 1.0);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local counts\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_bins[4] = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 
0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n MPI_Reduce(local_bins, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n size_t n = x.size();\n std::array local_bins = {0, 0, 0, 0};\n\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n\n for (double val : x) {\n double frac = val - std::floor(val);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, 
MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (auto xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "cuda", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\n__global__ void countQuartiles(const double *x, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n double frac = x[idx] - floor(x[idx]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double frac = x[tid] - floor(x[tid]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n 
atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + ", 4]", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n for (size_t i = tid + bid * stride; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double f = fmod(x[idx], 1.0);\n if (f < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (f < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (f < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use MPI and OpenMP to compute in parallel. Assume that MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each quartile\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n size_t const rank = ::rank();\n size_t const size = x.size();\n std::fill(counts.begin(), counts.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = rank; i < size; i += ::size()) {\n double const value = x[i];\n if (value < 0.25) {\n ++counts[0];\n } else if (value < 0.5) {\n ++counts[1];\n } else if (value < 0.75) {\n ++counts[2];\n } else {\n ++counts[3];\n }\n }\n\n #pragma omp parallel sections\n {\n #pragma omp section\n {\n if (rank == 0) {\n std::fill(counts.begin(), \n counts.end(), \n 0);\n }\n }\n #pragma omp section\n {\n if (rank == 0) {\n std::transform(counts.begin(), \n counts.end(), \n counts.begin(), \n [](size_t const a, size_t const b) {\n return a + b;\n });\n }\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of doubles in its portion of x that have a fractional part in each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < x.size(); i += size) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Rank 0 gathers the local counts from all ranks and stores the result in bins\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n 
++bins[3];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each bin\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each quartile\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the counts to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of elements in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of 
elements in its local vector that fall into each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its local copy of x that fall into each quartile.\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+:local_bins)\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Rank 0 gathers the local counts from all ranks and stores the result in bins.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; ++i) {\n std::array recv_bins;\n MPI_Recv(&recv_bins, 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(&local_bins, 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n 
MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of elements in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the counts from all ranks to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local counts\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Gather the local counts into bins on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of ranks\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < nranks; ++i) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use Kokkos to count in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(Kokkos::View const& image, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(\"pixelCounts\", image.size(), KOKKOS_LAMBDA (const size_t i) {\n ++bins(image(i));\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0264794484, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0446387642, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n ++bins[image[i]];\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0268380084, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0383572952, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const size_t& i, size_t& count) {\n count += Kokkos::count(image, i);\n }, bins);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n bins[image[i]]++;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0268683976, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0383104148, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n ++bins[image[i]];\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0269796666, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0382935434, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256),\n KOKKOS_LAMBDA(const int& i, size_t& update) {\n update += Kokkos::count(Kokkos::View(image.data(), 
image.size()), i);\n },\n bins);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n ++bins[image[i]];\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0269021648, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0382778634, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0367885306, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 4 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA (const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0260634026, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0409484288, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA(const int& i) {\n bins[image(i)]++;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0269729198, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0422024574, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA(const int& i) {\n ++bins[image(i)];\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0268752402, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0421839724, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy<>(0, 256), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy<>(0, image.extent(0)), KOKKOS_LAMBDA (const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, 
+ "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0267001388, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0409940224, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n ++bins[image[i]];\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.026856994, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0384024942, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0368161928, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 4 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = Kokkos::atomic_fetch_add(&bins[image[i]], 1);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0260538742, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0409338412, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"pixel_count\", Kokkos::RangePolicy<>(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixel_count\", Kokkos::RangePolicy<>(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n ++bins[image[i]];\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0268842348, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0383458116, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i, size_t& sum) {\n sum += Kokkos::count(image, i);\n }, bins);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA (const int& i) {\n bins[image(i)]++;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": 
true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0263957412, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0422322196, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA(const int& i) {\n ++bins(image(i));\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0261864898, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0460629626, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n bins[image[i]]++;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0268609472, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0384079908, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const size_t& i, size_t& count) {\n count += Kokkos::count(Kokkos::ViewConstRange(image, image.extent(0)), i);\n }, bins);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use OpenMP to count in parallel.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n 
#pragma omp parallel for\n for (int i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "serial", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int pixel : image) {\n ++Counts[pixel];\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (auto const& pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "hip", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use AMD HIP to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n__global__ void pixelCounts(const int *image, size_t N, size_t bins[256]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx.x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = image[idx];\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx >= N) return;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n ++bins[image[idx]];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = image[idx];\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) { bins[image[idx]]++; }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (size_t i = idx; i < N; i += stride) {\n atomicAdd(&bins[image[i]], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use MPI to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. 
The result is stored in bins on rank 0.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array localBins{};\n for (int pixel : image) {\n localBins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = localBins;\n for (int i = 1; i < size; i++) {\n std::array recvBins;\n MPI_Recv(recvBins.data(), 256, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recvBins[j];\n }\n }\n } else {\n MPI_Send(localBins.data(), 256, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n for (int pixel : image) {\n bins[pixel]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array localBins;\n for (size_t i = 0; i < image.size(); ++i) {\n localBins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(localBins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = rank; i < image.size(); i += size) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t chunk_size = image.size() / size;\n size_t start = rank * chunk_size;\n size_t end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = image.size();\n }\n std::array local_bins;\n local_bins.fill(0);\n for (size_t i = start; i < end; i++) {\n local_bins[image[i]]++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n size_t n = image.size();\n size_t local_bins[256] = {0};\n for (size_t i = rank; i < n; i += size) {\n local_bins[image[i]]++;\n }\n MPI_Reduce(local_bins, bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = rank; i < image.size(); i += size) {\n bins[image[i]]++;\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "cuda", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use CUDA to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n__global__ void pixelCounts(const int *image, size_t N, size_t bins[256]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n ++bins[image[idx]];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use MPI and OpenMP to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. 
The result is stored in bins on rank 0.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the results from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (size_t i = rank; i < image.size(); i += nprocs) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t const n = image.size();\n size_t const chunk = (n + size - 1) / size;\n size_t const start = rank * chunk;\n size_t const end = std::min(start + chunk, n);\n std::array localBins = {};\n for (size_t i = start; i < end; ++i) {\n ++localBins[image[i]];\n }\n MPI_Reduce(localBins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels with each intensity in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int i = rank; i < image.size(); i += size) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n size_t local_bins[256] = {0};\n #pragma omp parallel for\n for (size_t i = rank * image.size() 
/ nprocs; i < (rank + 1) * image.size() / nprocs; ++i) {\n ++local_bins[image[i]];\n }\n\n // Gather the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins, bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n\n // Reduce the bins from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to all zeros\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of pixels in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the bins across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in each bin on each rank.\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++local_bins[image[i]];\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ".", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n // Use OpenMP to count in parallel.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int i = rank; i < image.size(); i += nprocs) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the results from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image\n std::array local_bins;\n 
local_bins.fill(0);\n #pragma omp parallel for\n for (size_t i = rank * image.size() / nprocs; i < (rank + 1) * image.size() / nprocs; ++i) {\n ++local_bins[image[i]];\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use Kokkos to parallelize the search. Assume Kokkos has already been initialized.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n const size_t n = x.extent(0);\n size_t result = n;\n Kokkos::parallel_reduce(\n RangePolicy<>(0, n),\n KOKKOS_LAMBDA(const size_t i) {\n if (x(i) % 2 == 0) {\n result = std::min(result, i);\n }\n },\n result);\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag, int>;\n auto result = Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, int& result) {\n if (x(i) % 2 == 0) {\n result = i;\n }\n }, result, tag_type());\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even(i) = x(i) % 2 == 0;\n });\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even(i)) {\n firstEven = i;\n }\n }, firstEven);\n return firstEven;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "_", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t result = x.size();\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(size_t i, Kokkos::Experimental::Automic& atom) {\n if (x(i) % 2 == 0) {\n atom.fetch_min(result, i);\n }\n });\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t result = -1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const 
size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n Kokkos::fence();\n }\n },\n result);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even(i) = (x(i) % 2 == 0);\n });\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even(i)) {\n firstEven = i;\n }\n }, firstEven);\n return firstEven;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t result = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n Kokkos::parallel_scan(Kokkos::RangePolicy(i + 1, x.size()), KOKKOS_LAMBDA(const size_t& j, size_t& result) {\n result = 0;\n });\n }\n });\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t n = x.extent(0);\n size_t result = n;\n Kokkos::parallel_for(\"findFirstEven\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) { result = i; }\n });\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"findFirstEven\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n Kokkos::atomic_fetch_add(&findFirstEven_result, 1);\n Kokkos::fence();\n if (findFirstEven_result == 1) {\n findFirstEven_index = i;\n Kokkos::fence();\n Kokkos::parallel_scan(\"findFirstEven_break\", x.extent(0),\n KOKKOS_LAMBDA(const size_t& i, const size_t& my_result) {\n if (findFirstEven_index == i) {\n return my_result + 1;\n }\n return my_result;\n },\n 0);\n Kokkos::fence();\n Kokkos::parallel_for(\"findFirstEven_break\", x.extent(0),\n KOKKOS_LAMBDA(const size_t& i) {\n if (findFirstEven_index == i) {\n Kokkos::fence();\n Kokkos::break_();\n }\n });\n }\n }\n });\n return findFirstEven_index;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t n = x.extent(0);\n size_t result = n;\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "&", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t N = x.extent(0);\n size_t result = N;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using policy_type = Kokkos::RangePolicy;\n size_t result = Kokkos::parallel_reduce(\n policy_type(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n },\n x.size());\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t N = x.size();\n size_t result = N;\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even[i] = (x[i] % 2 == 0);\n });\n\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even[i] && i < firstEven) {\n firstEven = i;\n }\n }, firstEven);\n\n return firstEven;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t result = Kokkos::parallel_reduce(\n x.extent(0), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n },\n result);\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n result = Kokkos::atomic_fetch_add(&result, 1);\n }\n },\n result);\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t n = x.extent(0);\n size_t firstEven = n;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n firstEven = i;\n return;\n }\n });\n return 
firstEven;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n Kokkos::atomic_fetch_max(&findFirstEven_result, i);\n }\n });\n return findFirstEven_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use OpenMP to parallelize the search.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n#pragma omp cancel for\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t result = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0 && i < result) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t first_even = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n return first_even;\n}\n", + "\n\n size_t firstEven = -1;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (firstEven == -1) {\n 
firstEven = i;\n }\n }\n }\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t firstEven = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0 && i < firstEven) {\n#pragma omp critical\n firstEven = i;\n }\n }\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t firstEven = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < firstEven) {\n firstEven = i;\n }\n }\n }\n return firstEven;\n}\n", + "\n\n size_t n = x.size();\n size_t first_even = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n return first_even;\n}\n", + "\n\n size_t index = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < index) {\n index = i;\n }\n }\n }\n return index;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t result = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "serial", + "prompt": "/* Return the index of the first even number in the vector x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return 
x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "hip", + "prompt": "/* Find the index of the first even number in the vector x. Store it in firstEvenIndex.\n Use AMD HIP to parallelize the search. The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\n__global__ void findFirstEven(const int *x, size_t N, size_t *firstEvenIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int myIndex = hipGetGlobalIndex();\n if (myIndex < N && x[myIndex] % 2 == 0) {\n *firstEvenIndex = myIndex;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N && x[i] % 2 == 0) {\n *firstEvenIndex = i;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n 
}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int blockId = hipBlockIdx_x;\n int blockDim = hipBlockDim_x;\n int gridDim = hipGridDim_x;\n\n *firstEvenIndex = N;\n\n if (x[tid] % 2 == 0) {\n atomicMin(firstEvenIndex, tid);\n }\n\n __syncthreads();\n\n if (tid == 0) {\n if (*firstEvenIndex == N) {\n *firstEvenIndex = -1;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = block.thread.x;\n if (tid == 0) {\n *firstEvenIndex = N;\n }\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = min(*firstEvenIndex, tid);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use MPI to parallelize the search. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t n = x.size();\n size_t result = n;\n for (size_t i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, &result, 1, MPI::INT, MPI::MINLOC);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = -1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result = -1;\n MPI_Allgather(&result, 1, MPI_INT, &global_result, 1, MPI_INT, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n for (size_t i = 0; i < x.size(); ++i) {\n if (global_result != -1 && global_result <= i) {\n return global_result;\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return n;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t n = x.size();\n size_t firstEven = n;\n for (size_t i = rank; i < n; i += size) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &firstEven, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t result = -1;\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return 
global_result;\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n size_t firstEven = x.size();\n for (size_t i = myRank; i < x.size(); i += numRanks) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = -1;\n for (size_t i = my_rank; i < x.size(); i += my_size) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t firstEven = -1;\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_result;\n } else {\n return x.size();\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_first_even = -1;\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n local_first_even = i;\n break;\n }\n }\n int global_first_even = -1;\n MPI_Allreduce(&local_first_even, &global_first_even, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_first_even;\n}\n", + "\n\n size_t result = x.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n size_t n = x.size();\n size_t firstEven = n;\n for (size_t i = myRank; i < n; i += MPI::procs()) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_MINLOC, MPI_INT);\n return firstEven;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank searches for the first even number in its portion of the vector.\n size_t my_first_even = x.size();\n for (size_t i = my_rank; i < x.size(); i += num_ranks) {\n if (x[i] % 2 == 0) {\n my_first_even = i;\n break;\n }\n }\n\n // Gather the results from all ranks.\n std::vector first_evens(num_ranks);\n MPI_Gather(&my_first_even, 1, MPI_UNSIGNED_LONG, first_evens.data(), 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Find the smallest 
index among the gathered results.\n size_t first_even = x.size();\n if (my_rank == 0) {\n for (size_t i = 0; i < num_ranks; ++i) {\n if (first_evens[i] < first_even) {\n first_even = first_evens[i];\n }\n }\n }\n\n return first_even;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t n = x.size();\n size_t local_first_even = n;\n for (size_t i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n local_first_even = i;\n break;\n }\n }\n size_t first_even;\n MPI_Allreduce(&local_first_even, &first_even, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return first_even;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_result;\n } else {\n return x.size();\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = x.size();\n for (size_t i = my_rank; i < x.size(); i += my_size) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t local_min = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n local_min = i;\n break;\n }\n }\n size_t global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_UNSIGNED_LONG, MPI_MIN,\n MPI_COMM_WORLD);\n return global_min;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "cuda", + "prompt": "/* Find the index of the first even number in the vector x. Store it in firstEvenIndex.\n Use CUDA to parallelize the search. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\n__global__ void findFirstEven(const int *x, size_t N, size_t *firstEvenIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "(", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the index of the first even number in the vector x.\n Use MPI and OpenMP to parallelize the search. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n #pragma openmp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma openmp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n int result = -1;\n #pragma omp parallel for reduction(min:result)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (i < result) {\n result = i;\n }\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_INT, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_result = -1;\n int global_result = -1;\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (my_result == -1) {\n my_result = i;\n }\n }\n }\n }\n\n MPI_Allreduce(&my_result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int my_rank, n_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);\n\n size_t first_even = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n }\n\n if (my_rank == 0) {\n MPI_Reduce(&first_even, &first_even, 1, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(&first_even, nullptr, 1, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n\n return first_even;\n}\n", + "\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int my_result = -1;\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n my_result = i;\n break;\n }\n }\n if (my_result != -1) {\n #pragma omp critical\n {\n if (result == -1 || my_result < result) {\n result = my_result;\n }\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t result = -1;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (result == -1) {\n result = i;\n }\n }\n break;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "_", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n #pragma omp cancel for\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n size_t first_even = x.size();\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n first_even = i;\n break;\n }\n }\n }\n\n MPI_Allreduce(&first_even, &first_even, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return first_even;\n } else {\n return x.size();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t firstEven = -1;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (firstEven == -1) {\n firstEven = i;\n }\n }\n }\n\n if (rank == 0) {\n MPI_Bcast(&firstEven, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel\n {\n size_t local_result = x.size();\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n local_result = i;\n break;\n }\n }\n #pragma omp critical\n result = std::min(result, local_result);\n }\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = x.size();\n #pragma omp parallel for reduction(min : result)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n\n if (my_rank == 0) {\n return result;\n } else {\n return x.size();\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. 
Return false otherwise.\n Use Kokkos to search in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(Kokkos::View const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0038077422, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0136633524, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0068249604, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003436176, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017261822, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008914632, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004631164, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0038057102, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0135376842, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006834807, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0034530856, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017023534, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000851224, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004472064, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.003811483, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0135772722, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0068712314, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0034248726, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017301516, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008642826, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000437957, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto contains_lambda = 
KOKKOS_LAMBDA(const int& v) { return v == target; };\n return any(policy_t(x.extent(0)), contains_lambda, x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0038466736, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0135817806, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0067655328, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0034184102, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017579658, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008764752, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000456514, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n\n return found;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using policy_type = Kokkos::RangePolicy;\n auto contains = Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) {\n contains |= (x(i) == target);\n },\n false);\n return contains;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto contains_target = Kokkos::scan(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, const bool& contains_target) {\n return contains_target || (x(i) == target);\n },\n false, tag_type());\n\n return contains_target;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0038663302, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0136042794, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0068242318, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.0034315932, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001702245, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008429042, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004467096, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0038067946, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.013569877, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0068302808, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003389879, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017337608, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008593952, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004455292, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0038094214, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.013697819, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0068259304, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0034030794, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017070602, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008460582, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000439426, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, false);\n\n return found;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n Kokkos::Scan(policy_type(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& result) {\n if (x(i) == target) { result = 1; }\n }, tag_type());\n\n return Kokkos::Max(Kokkos::View(1)) != 0;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n\n return found;\n}\n", + "source_write_success": true, + "did_build": 
false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using policy_type = Kokkos::RangePolicy;\n auto contains_lambda = KOKKOS_LAMBDA(const int& value) { return value == target; };\n return Kokkos::any(policy_type(0, x.extent(0)), contains_lambda, x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto result = Kokkos::parallel_reduce(\n policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n },\n false, tag_type());\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto contains = Kokkos::scan(policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, const bool& contains_so_far) {\n return contains_so_far || (x(i) == target);\n }, false, tag_type());\n\n return contains;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool found = false;\n Kokkos::parallel_reduce(\n x.extent(0), KOKKOS_LAMBDA(const int& i, bool& found) {\n if (x(i) == target) { found = true; }\n },\n found);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0038069788, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01358025, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0067928242, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003435158, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017222108, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008458598, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005981042, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00381403, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0135559266, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0067967274, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003429828, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": 
true, + "runtime": 0.0016953164, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008447578, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004951026, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0038646108, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0135544858, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0068011296, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0034307012, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016941082, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008421988, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005020554, + "num_threads": 32 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use OpenMP to search in parallel.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n 
break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for shared(found)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n #pragma omp critical\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "serial", + "prompt": "/* Return true if the vector x contains the value `target`. 
Return false otherwise.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "<", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "&", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "hip", + "prompt": "/* Set `found` to true if the vector x contains the value `target`. Set it to false otherwise.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\n__global__ void contains(const int *x, size_t N, int target, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + ", 8, 2, 6, 4, 6", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int offset = bid * hipBlockDim_x + tid;\n if (offset < N && x[offset] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n bool found = false;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (found) {\n return true;\n }\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n found = true;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (found) {\n return true;\n }\n }\n }\n return false;\n}\n", + "\n\n int result = 0;\n for (int i = 0; i < x.size(); i++) {\n result |= (x[i] == target);\n }\n int global_result;\n MPI_Allreduce(&result, 1, MPI_INT, &global_result, 1, MPI_INT, MPI_BOR, MPI_COMM_WORLD);\n if (global_result) {\n return true;\n }\n return false;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int has_target = (std::find(x.begin(), x.end(), target) != x.end());\n int global_has_target;\n MPI_Allreduce(&has_target, &global_has_target, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n return global_has_target;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return global_found;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_contains = 0;\n for (int i = rank; i < n; i += size) {\n if (x[i] == target) {\n local_contains = 
1;\n break;\n }\n }\n\n int global_contains;\n MPI_Allreduce(&local_contains, &global_contains, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_contains;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n bool found = false;\n for (int i = rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n MPI::COMM_WORLD.Allreduce(MPI::IN_PLACE, &found, 1, MPI::BOOL, MPI::LOR);\n return found;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n bool local_contains = false;\n for (int i = start; i < end; i++) {\n if (x[i] == target) {\n local_contains = true;\n break;\n }\n }\n\n bool global_contains;\n MPI_Allreduce(&local_contains, &global_contains, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_contains;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n bool found = false;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool found = std::find(x.begin(), x.end(), target) != x.end();\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "cuda", + "prompt": "/* Set `found` to true if the vector x contains the value `target`. Set it to false otherwise.\n Use CUDA to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\n__global__ void contains(const int *x, size_t N, int target, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n found[0] = true;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n int local_result = 0;\n #pragma omp parallel for reduction(+:local_result)\n for (int i = rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n local_result = 1;\n }\n }\n int global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_result > 0;\n }\n return false;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "(", + "\n\n int rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel shared(result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n result = true;\n break;\n }\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n MPI_Reduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, 
MPI_COMM_WORLD);\n if (rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int found = 0;\n #pragma omp parallel for reduction(+:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = 1;\n break;\n }\n }\n int result;\n MPI_Allreduce(&found, &result, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int found = 0;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = 1;\n }\n }\n int result;\n MPI_Allreduce(&found, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n bool found = false;\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n }\n }\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_result = 0;\n #pragma omp parallel for reduction(||:local_result)\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n local_result = 1;\n }\n }\n int result;\n MPI_Reduce(&local_result, &result, 1, MPI_INT, MPI_LOR, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_size = n / 
MPI_COMM_SIZE;\n int start = my_rank * local_size;\n int end = start + local_size;\n bool found = false;\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_found;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use Kokkos to search in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(Kokkos::View const& x, Kokkos::View const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::View found_x(\"found_x\", x.extent(0));\n Kokkos::View found_y(\"found_y\", y.extent(0));\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n found_x(i) = (x(i) == val);\n });\n\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(const int& i) {\n found_y(i) = (y(i) == val);\n });\n\n Kokkos::View found_both(\"found_both\", 1);\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& found_both) {\n found_both |= (found_x(i) && found_y(i));\n });\n\n return !found_both(0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n return Kok \u86cb\u7cd5::parallel_reduce(Kok\u86cb\u7cd5::make_range(x.size()),\n Kok\u86cb\u7cd5::plus(),\n [x, y, val](int i) { return (x[i] == val) ^ (y[i] == val); },\n 0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a Kokkos::View of the union of the two input vectors\n Kokkos::View union_xy(\"union_xy\", x.extent(0) + y.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(int i) {\n union_xy(i) = x(i);\n });\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(int i) {\n union_xy(x.extent(0) + i) = y(i);\n });\n\n // Create a Kokkos::View of the intersection of the two input vectors\n Kokkos::View intersection_xy(\"intersection_xy\", x.extent(0) + y.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(int i) {\n for (int j = 0; j < y.extent(0); j++) {\n if (x(i) == y(j)) {\n intersection_xy(i) = x(i);\n break;\n }\n }\n });\n\n // Check if the value is in the union but not in the intersection\n bool found_in_union = false;\n bool found_in_intersection = false;\n Kokkos::parallel_for(union_xy.extent(0), KOKKOS_LAMBDA(int i) {\n if (union_xy(i) == val) {\n found_in_union = true;\n }\n if (intersection_xy(i) == val) {\n found_in_intersection = true;\n }\n });\n\n return found_in_union && !found_in_intersection;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, 
+ "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n const int N = x.extent(0);\n const int M = y.extent(0);\n\n auto lambda = KOKKOS_LAMBDA(const int i) {\n bool found_x = false;\n bool found_y = false;\n for (int j = 0; j < N; ++j) {\n if (x(j) == val) { found_x = true; }\n }\n for (int j = 0; j < M; ++j) {\n if (y(j) == val) { found_y = true; }\n }\n return found_x != found_y;\n };\n\n bool result = parallel_reduce(N, lambda, (bool)false);\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n auto lambda = KOKKOS_LAMBDA(int i) {\n bool found_x = false;\n bool found_y = false;\n for (int j = 0; j < x.extent(0); ++j) {\n if (x(j) == val) {\n found_x = true;\n }\n }\n for (int j = 0; j < y.extent(0); ++j) {\n if (y(j) == val) {\n found_y = true;\n }\n }\n return found_x != found_y;\n };\n\n return parallel_reduce(x.extent(0), lambda, (bool)false);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = find(x, val) != x.end();\n auto y_contains = find(y, val) != y.end();\n return x_contains ^ y_contains;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a Kokkos::View of the result\n Kokkos::View result(\"result\", x.extent(0));\n\n // Use Kokkos to search for `val` in `x` and `y` in parallel\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n result(i) = (x(i) == val) ^ (y(i) == val);\n });\n\n // Reduce the result to a single boolean value\n bool xorContains = Kokkos::reduce(result, Kokkos::And(), true);\n\n return xorContains;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View result(\"result\", x.size());\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int i) {\n result[i] = (x[i] == val) ^ (y[i] == val);\n });\n return Kokkos::any(result);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n bool found_x = false;\n bool found_y = false;\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (x(i) == val) {\n found_x = true;\n }\n });\n\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (y(i) == val) {\n found_y = true;\n }\n });\n\n return found_x != found_y;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": 
false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View result(\"result\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(int i) {\n result[i] = (x[i] == val) ^ (y[i] == val);\n });\n return Kokkos::any(result);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View found_x(\"found_x\", x.extent(0));\n Kokkos::View found_y(\"found_y\", y.extent(0));\n\n Kokkos::parallel_for(\"xorContains_x\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n found_x(i) = (x(i) == val);\n });\n\n Kokkos::parallel_for(\"xorContains_y\", y.extent(0), KOKKOS_LAMBDA(const int i) {\n found_y(i) = (y(i) == val);\n });\n\n bool found_in_x = false;\n bool found_in_y = false;\n\n Kokkos::parallel_reduce(\"xorContains_reduce\", x.extent(0), KOKKOS_LAMBDA(const int i, bool& update) {\n update = update || found_x(i);\n }, found_in_x);\n\n Kokkos::parallel_reduce(\"xorContains_reduce\", y.extent(0), KOKKOS_LAMBDA(const int i, bool& update) {\n update = update || found_y(i);\n }, found_in_y);\n\n return (found_in_x != found_in_y);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = Kokkos::parallel_reduce(\n policy_t(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) { contains |= (x(i) == val); },\n false);\n auto y_contains = Kokkos::parallel_reduce(\n policy_t(0, y.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) { contains |= (y(i) == val); },\n false);\n return x_contains ^ y_contains;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n auto lambda = KOKKOS_LAMBDA(const int& x_i, const int& y_i) {\n return (x_i == val) ^ (y_i == val);\n };\n\n auto result = reduce(x.extent(0), lambda, (bool)false);\n\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n auto x_contains = find(x, val, policy_type(0, x.extent(0)));\n auto y_contains = find(y, val, policy_type(0, y.extent(0)));\n return (x_contains != x.extent(0)) ^ (y_contains != y.extent(0));\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n auto x_contains = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& contains) {\n contains |= (x(i) == val);\n }, (int)0);\n\n auto y_contains = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 
y.extent(0)), KOKKOS_LAMBDA(int i, int& contains) {\n contains |= (y(i) == val);\n }, (int)0);\n\n return (x_contains && !y_contains) || (!x_contains && y_contains);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n auto x_contains = Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int sum) {\n return sum + (x(i) == val);\n }, Kokkos::Sum(0));\n\n auto y_contains = Kokkos::parallel_reduce(y.extent(0), KOKKOS_LAMBDA(int i, int sum) {\n return sum + (y(i) == val);\n }, Kokkos::Sum(0));\n\n return x_contains != y_contains;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a Kokkos::View to store the results of the search\n Kokkos::View result(\"result\", x.extent(0));\n\n // Perform the search in parallel\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i) {\n result(i) = (x(i) == val) ^ (y(i) == val);\n });\n\n // Reduce the results to a single boolean value\n bool found = false;\n Kokkos::parallel_reduce(\"xorContains_reduce\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i, bool& found) {\n found |= result(i);\n }, found);\n\n return found;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a Kokkos view for the result\n Kokkos::View result(\"result\", x.extent(0));\n\n // Parallel for loop to check if val is in only one of x or y\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n bool found_in_x = false;\n bool found_in_y = false;\n for (int j = 0; j < x.extent(0); ++j) {\n if (x(j) == val) {\n found_in_x = true;\n }\n if (y(j) == val) {\n found_in_y = true;\n }\n }\n result(i) = (found_in_x != found_in_y);\n });\n\n // Reduce the result to a single boolean value\n bool final_result = Kokkos::reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(bool a, bool b) { return a || b; }, result);\n\n return final_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = find(x, val, policy_t(0, x.extent(0)));\n auto y_contains = find(y, val, policy_t(0, y.extent(0)));\n return (x_contains != x.extent(0)) ^ (y_contains != y.extent(0));\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n auto x_contains = Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& contains) {\n if (x(i) == val) { contains = 1; }\n }, Kokkos::Scan::plus());\n\n auto y_contains = 
Kokkos::parallel_reduce(y.extent(0), KOKKOS_LAMBDA(int i, int& contains) {\n if (y(i) == val) { contains = 1; }\n }, Kokkos::Scan::plus());\n\n return (x_contains == 1) ^ (y_contains == 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use OpenMP to search in parallel.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n #pragma omp critical\n {\n found_x = true;\n }\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n #pragma omp critical\n {\n found_y = true;\n }\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n #pragma omp critical\n found_x = true;\n }\n }\n\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n #pragma omp critical\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n #pragma omp critical\n {\n found_x = true;\n }\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n #pragma omp critical\n {\n found_y = true;\n }\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n return x_count 
!= y_count;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return (x_count + y_count) == 1;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool in_x = false;\n bool in_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n in_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n in_y = true;\n }\n }\n }\n\n return in_x ^ in_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return x_count != y_count;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n } else {\n found = false;\n #pragma omp cancel for\n }\n }\n }\n }\n #pragma omp parallel for\n for (size_t i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n } else {\n found = false;\n #pragma omp cancel for\n }\n }\n }\n }\n return found;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (size_t i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return x_count != y_count;\n}\n", + 
"\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n return x_count != y_count;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "serial", + "prompt": "/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) !=\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) !=\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n bool in_x = false;\n bool in_y = false;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n in_x = true;\n }\n }\n for (int i = 0; i < y.size(); i++) 
{\n if (y[i] == val) {\n in_y = true;\n }\n }\n return in_x != in_y;\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "hip", + "prompt": "/* Set `found` to true if `val` is only in one of vectors x or y.\n Set it to false if it is in both or neither.\n Use AMD HIP to search in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\n__global__ void xorContains(const int *x, const int *y, size_t N, int val, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = tid + block * blockDim;\n int end = min(start + blockDim, N);\n bool found_x = false;\n bool found_y = false;\n for (int i = start; i < end; i++) {\n found_x |= (x[i] == val);\n found_y |= (y[i] == val);\n }\n found[tid] = found_x ^ found_y;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n *found = found_x ^ found_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipGridSize_x) {\n if (x[i] == val) found_x = true;\n if (y[i] == val) found_y = true;\n }\n found[tid] = found_x ^ found_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool x_contains = x[tid] == val;\n bool y_contains = y[tid] == val;\n found[tid] = x_contains ^ y_contains;\n }\n}\n", + ".", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool x_has = (x[tid] == val);\n bool y_has = (y[tid] == val);\n found[tid] = (x_has ^ y_has);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int block_size = hipBlockDim_x;\n int block_id = hipBlockIdx_x;\n int i = block_id * block_size + tid;\n\n if (i < N) {\n bool x_contains = x[i] == val;\n bool y_contains = y[i] == val;\n found[i] = x_contains ^ y_contains;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n *found = inX ^ inY;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n bool found_x = (x[i] == val);\n bool found_y = (y[i] == val);\n found[i] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool found_x = (x[tid] == 
val);\n bool found_y = (y[tid] == val);\n found[tid] = (found_x ^ found_y);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int idx = tid + N;\n if (idx < 2 * N) {\n *found = (x[tid] == val) || (y[idx - N] == val);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n found[tid] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n *found = (x[gid] == val) ^ (y[gid] == val);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = (x_has_val != y_has_val);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool found_x = (x[tid] == val);\n bool found_y = (y[tid] == val);\n found[tid] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = (in_x != in_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n *found = (x[gid] == val) ^ (y[gid] == val);\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Return the result on rank 0.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int myRank, nRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &nRanks);\n\n bool xContains = std::find(x.begin(), x.end(), val) != x.end();\n bool yContains = std::find(y.begin(), y.end(), val) != y.end();\n\n int xContainsInt = xContains ? 1 : 0;\n int yContainsInt = yContains ? 
1 : 0;\n\n int xorContainsInt = xContainsInt ^ yContainsInt;\n\n int globalXorContainsInt;\n MPI_Reduce(&xorContainsInt, &globalXorContainsInt, 1, MPI_INT, MPI_LOR, 0, MPI_COMM_WORLD);\n\n bool globalXorContains = globalXorContainsInt != 0;\n\n if (myRank == 0) {\n return globalXorContains;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int sum;\n MPI_Allreduce(&x_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return sum == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int total_count;\n MPI_Allreduce(&x_count, &total_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &total_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return total_count == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_contains = std::find(x.begin(), x.end(), val) != x.end();\n int y_contains = std::find(y.begin(), y.end(), val) != y.end();\n\n int result = x_contains ^ y_contains;\n 
MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int sum;\n MPI_Allreduce(&x_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return sum == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n\n int global_x_count;\n int global_y_count;\n\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) x_count++;\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) y_count++;\n }\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (global_x_count == 0 && global_y_count > 0) return true;\n if (global_x_count > 0 && global_y_count == 0) return true;\n return false;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) x_count++;\n if (y[i] == val) y_count++;\n }\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n\n int x_count_sum;\n int y_count_sum;\n\n MPI_Allreduce(&x_count, &x_count_sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &y_count_sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (x_count_sum + y_count_sum) % 2 == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int global_x_count;\n int global_y_count;\n\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (global_x_count == 1 || global_y_count == 1);\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, 
&global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "cuda", + "prompt": "/* Set `found` to true if `val` is only in one of vectors x or y.\n Set it to false if it is in both or neither.\n Use CUDA to search in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\n__global__ void xorContains(const int *x, const int *y, size_t N, int val, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool x_has = false;\n bool y_has = false;\n for (int i = 0; i < N; i++) {\n if (x[i] == val) x_has = true;\n if (y[i] == val) y_has = true;\n }\n found[tid] = x_has != y_has;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *found = (*found) ^ (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n found[tid] = (inX ^ inY);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n *found = *found || (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *found = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = x_has_val ^ y_has_val;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + 
"=cernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecerne,cernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecerne)cerne)cerne)cerne)", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = x_has_val ^ y_has_val;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = (in_x ^ in_y);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = (x_has_val != y_has_val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = 0; i < N; i++) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n found[tid] = (found_x != found_y);\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Return the result on rank 0.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int xor_count = x_count + y_count;\n\n int result;\n MPI_Allreduce(&xor_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (global_x_count == 1 || global_y_count == 1) {\n return true;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count;\n int global_y_count;\n MPI_Reduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_x_count != global_y_count;\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int result;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < n; i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int j = 0; j < m; j++) {\n if (y[j] == val) {\n y_count++;\n }\n }\n }\n\n int result = x_count + y_count;\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return result == 1;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel for reduction(+:x_count)\n for (int i = 0; i < n; i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = 0; i < m; i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result = x_count + y_count;\n bool ans = (result == 1);\n\n 
MPI_Reduce(&ans, &ans, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n return ans;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int local_x_count = 0;\n int local_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n local_x_count++;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n local_y_count++;\n }\n }\n #pragma omp critical\n {\n x_count += local_x_count;\n y_count += local_y_count;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(MPI_IN_PLACE, &y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return (x_count == 1 || y_count == 1);\n }\n return false;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n#pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int nt = omp_get_num_threads();\n for (int i = tid; i < n; i += nt) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = tid; i < m; i += nt) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int x_count_global;\n int y_count_global;\n MPI_Allreduce(&x_count, &x_count_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &y_count_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (x_count_global == 1 || y_count_global == 1) {\n return true;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int n_threads = omp_get_num_threads();\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel for\n for (int i = rank; i < n; i += n_threads) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found_x, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n MPI_Allreduce(&found_y, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank searches for `val` in its portion of x and y.\n int my_start = my_rank * n / nprocs;\n int my_end = (my_rank + 1) * n / nprocs;\n bool found_in_x = false;\n bool found_in_y = false;\n #pragma omp parallel for\n for (int i = my_start; i < my_end; i++) {\n if (x[i] == val) {\n found_in_x = true;\n }\n if (y[i] == val) {\n found_in_y = true;\n }\n }\n\n // Reduce the results from all ranks to rank 0.\n bool result;\n MPI_Reduce(&found_in_x, &result, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n MPI_Reduce(&found_in_y, &result, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n\n // Rank 0 returns the final result.\n if (my_rank == 0) {\n return result != (found_in_x && found_in_y);\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n ++x_count;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n ++y_count;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n 
x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int count_x = 0;\n int count_y = 0;\n\n #pragma omp parallel for reduction(+:count_x)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n count_x++;\n }\n }\n\n #pragma omp parallel for reduction(+:count_y)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n count_y++;\n }\n }\n\n int result = count_x + count_y;\n bool ans = (result == 1);\n\n MPI_Allreduce(&ans, &ans, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n return ans;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n #pragma omp parallel\n {\n int my_x_count = 0, my_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) my_x_count++;\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) my_y_count++;\n }\n #pragma omp critical\n {\n x_count += my_x_count;\n y_count += my_y_count;\n }\n }\n int result;\n MPI_Reduce(&x_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (result == 1) return true;\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result = x_count + y_count;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result == 1;\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel reduction(+:x_count, y_count)\n {\n int tid = omp_get_thread_num();\n int nthreads = omp_get_num_threads();\n for (int i = tid; i < x.size(); i += nthreads) {\n if (x[i] == val) {\n x_count++;\n }\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count, global_y_count;\n MPI_Reduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return (global_x_count == 1 || global_y_count == 1);\n }\n return false;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n bool result = false;\n MPI_Allreduce(&found_x, &result, 1, MPI_C_BOOL, MPI_BXOR, MPI_COMM_WORLD);\n MPI_Allreduce(&found_y, &result, 1, MPI_C_BOOL, MPI_BXOR, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return result;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel for 
reduction(+:x_count)\n for (int i = rank; i < x.size(); i += nprocs) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = rank; i < y.size(); i += nprocs) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result;\n MPI_Reduce(&x_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result == 1;\n }\n return false;\n}\n", + "\n\n int x_contains = 0;\n int y_contains = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_contains)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_contains = 1;\n }\n }\n\n #pragma omp for reduction(+:y_contains)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_contains = 1;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_contains, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n MPI_Allreduce(&y_contains, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int local_x_count = 0;\n int local_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n local_x_count++;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n local_y_count++;\n }\n }\n #pragma omp critical\n {\n x_count += local_x_count;\n y_count += local_y_count;\n }\n }\n if (my_rank == 0) {\n return x_count != y_count;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Book {\n const char *title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use Kokkos to search in parallel. 
Assume Kokkos is already initialized.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(Kokkos::View const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n using policy_t = Kokkos::RangePolicy;\n using range_t = Kokkos::Range;\n\n size_t result = books.size();\n Kokkos::parallel_reduce(range_t(0, books.size()), KOKKOS_LAMBDA(size_t i, size_t& result) {\n if (books(i)->pages < 100) {\n result = i;\n }\n });\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t last_short_book_index = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t last_short_book_index = books.size() - 1;\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& last_short_book_index) {\n if (books[i]->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_for(books.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t result = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& result) {\n if (books(i)->pages < 100) {\n result = i;\n }\n });\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t last_short_book = books.size() - 1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book) {\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n });\n return last_short_book;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View lastShortBook(\"lastShortBook\", 
books.extent(0));\n Kokkos::parallel_for(\"findLastShortBook\", books.extent(0), KOKKOS_LAMBDA(const size_t i) {\n if (books(i).pages < 100) {\n lastShortBook(i) = i;\n }\n });\n size_t result = books.extent(0);\n for (size_t i = books.extent(0) - 1; i >= 0; i--) {\n if (lastShortBook(i) != 0) {\n result = lastShortBook(i);\n break;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t last_short_book = books.size();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& last) {\n if (books[i].pages < 100) {\n last = i;\n }\n }, last_short_book);\n return last_short_book;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.003968025, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0115808392, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t last_idx = books.size();\n Kokkos::parallel_for(books.size(), KOKKOS_LAMBDA(const size_t& i) {\n if (books[i]->pages < 100) {\n last_idx = i;\n }\n });\n return last_idx;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const int i, const size_t& last) {\n if (books(i).pages < 100) {\n return i;\n }\n return last;\n }, 0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t last_index = books.size() - 1;\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books(i).pages >= 100) {\n last_index = i - 1;\n }\n }\n return last_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n size_t last_short_book = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last) {\n if (books(i)->pages < 100) {\n last = i;\n }\n }, last_short_book);\n return last_short_book;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t last_short_book = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& last_short_book) {\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n });\n return last_short_book;\n}\n", + "source_write_success": true, 
+ "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t last_short_book = books.size();\n\n Kokkos::parallel_reduce(books.extent(0), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book) {\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n }, last_short_book);\n\n return last_short_book;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t lastShortBook = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& lastShortBook) {\n if (books[i]->pages < 100) {\n lastShortBook = i;\n }\n });\n return lastShortBook;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t lastShortBook = books.size() - 1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& lastShortBook) {\n if (books(i)->pages < 100) {\n lastShortBook = i;\n }\n });\n return lastShortBook;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t last_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_index) {\n if (books(i)->pages < 100) {\n last_index = i;\n }\n });\n return last_index;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n using l_type = long;\n\n auto last_short_book = Kokkos::parallel_scan(\n policy_type(0, books.size()),\n KOKKOS_LAMBDA(const size_t& i, l_type& last) {\n if (books[i]->pages < 100) {\n last = i;\n }\n },\n last_short_book);\n\n return last_short_book;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n size_t last_short_book_index = 0;\n 
Kokkos::parallel_scan(policy_type(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n }, tag_type());\n\n return last_short_book_index;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use OpenMP to search in parallel.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t last_short_book_index = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n last_short_book_index = i;\n }\n }\n return last_short_book_index;\n}\n", + "\n\n size_t last_short = books.size();\n #pragma omp parallel for reduction(max:last_short)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n last_short = i;\n }\n }\n return last_short;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n#pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 
100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "_", + "\n\n size_t last_idx = books.size() - 1;\n#pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n last_idx = i - 1;\n break;\n }\n }\n return last_idx;\n}\n", + "\n\n size_t last_short_book_index = books.size() - 1;\n#pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages >= 100) {\n#pragma omp critical\n last_short_book_index = i - 1;\n }\n }\n return last_short_book_index;\n}\n", + "\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n#pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "serial", + "prompt": "struct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n 
return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "hip", + "prompt": "struct Book {\n const char* title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Store the result in lastShortBookIndex.\n Use AMD HIP to search in parallel. 
The kernel is launched with one thread for every book element.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\n__global__ void findLastShortBook(const Book *books, size_t N, size_t *lastShortBookIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use MPI to search in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of books. 
Return the result on rank 0.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return lastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "_", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = -1;\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return lastShortBook;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n lastShortBook = i - 1;\n break;\n }\n }\n\n size_t globalLastShortBook;\n MPI_Allreduce(&lastShortBook, &globalLastShortBook, 1, MPI_UNSIGNED_LONG, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalLastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n for (size_t i = result + 1; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n result = i - 1;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = 0;\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n int globalResult;\n MPI_Allreduce(&result, &globalResult, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalResult;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = 
books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n if (result == books.size()) {\n result = -1;\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "cuda", + "prompt": "struct Book {\n const char* title;\n int pages;\n};\n\n/* Find the index of the last Book item in the vector books where Book.pages is less than 100.\n Store the result in lastShortBookIndex.\n Use CUDA to search in parallel. The kernel is launched with one thread for every book element.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\n__global__ void findLastShortBook(const Book *books, size_t N, size_t *lastShortBookIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n if (books[idx].pages < 100) {\n *lastShortBookIndex = idx;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 
100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid >= N) return;\n\tif (books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx >= N) {\n return;\n }\n if (books[idx].pages < 100) {\n *lastShortBookIndex = idx;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use MPI and OpenMP to search in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of books. Return the result on rank 0.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = 0;\n #pragma omp parallel for reduction(max:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = 0;\n #pragma omp parallel for reduction(max:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n if (rank == 0) {\n return lastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 
100) {\n result = i - 1;\n }\n }\n return result;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_UNSIGNED_LONG, MPI_MAX, MPI_COMM_WORLD);\n return lastShortBook;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n size_t local_result = books.size();\n\n #pragma omp parallel reduction(min: local_result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n local_result = i;\n }\n }\n\n MPI_Allreduce(&local_result, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min: result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n int myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n size_t result = books.size();\n #pragma omp parallel for reduction(min: result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n if (myRank == 0) {\n return result;\n }\n return books.size();\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n size_t rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n\n if (rank == 0) {\n return result;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "_", + "\n\n size_t result = books.size() - 1;\n size_t myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n #pragma openmp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n MPI_Allgather(&result, 1, MPI_UNSIGNED_LONG, &result, 1, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n if (myRank == 0) {\n result = books.size() - 1;\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use Kokkos to search in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n auto min_distance = Kokkos::Max();\n size_t min_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n },\n min_distance, min_index);\n return min_index;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n size_t N = x.extent(0);\n size_t closest_idx = 0;\n double closest_diff = std::abs(x(0) - M_PI);\n Kokkos::parallel_for(\n policy_type(0, N), KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n });\n return closest_idx;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const double pi = M_PI;\n Kokkos::View closest_distance = Kokkos::View(1);\n Kokkos::View closest_index = Kokkos::View(1);\n closest_distance[0] = std::numeric_limits::max();\n closest_index[0] = 0;\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double distance = std::abs(x[i] - pi);\n if (distance < closest_distance[0]) {\n closest_distance[0] = distance;\n closest_index[0] = i;\n }\n });\n\n return closest_index[0];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n size_t closest_index = 0;\n double closest_distance = std::abs(x(0) - M_PI);\n Kokkos::parallel_reduce(policy_t(0, x.size()), KOKKOS_LAMBDA(size_t i, double& closest_distance, size_t& closest_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }, closest_distance, closest_index);\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n const double pi = M_PI;\n size_t N = x.extent(0);\n size_t closest_idx = 0;\n double min_dist = std::abs(x(0) - pi);\n\n Kokkos::parallel_for(\n policy_t(0, N), KOKKOS_LAMBDA(const size_t& i) {\n double dist = std::abs(x(i) - pi);\n if (dist < min_dist) {\n closest_idx = i;\n min_dist = dist;\n }\n });\n\n return 
closest_idx;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using Kokkos::parallel_for;\n using Kokkos::Functor;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n struct findToPiFunctor {\n Kokkos::View const x;\n double min_diff;\n int min_index;\n\n findToPiFunctor(Kokkos::View const x) : x(x), min_diff(std::numeric_limits::max()), min_index(-1) {}\n\n KOKKOS_INLINE_FUNCTION\n void operator()(const Kokkos::Work& work) const {\n const int i = work.league_rank();\n const double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n };\n\n findToPiFunctor f(x);\n\n Kokkos::parallel_for(RangePolicy<>(0, x.size()), f);\n\n return f.min_index;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t n = x.extent(0);\n Kokkos::View min_diff = Kokkos::View(1);\n Kokkos::View min_index = Kokkos::View(1);\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff(0) || (diff == min_diff(0) && x(i) > x(min_index(0)))) {\n min_diff(0) = diff;\n min_index(0) = i;\n }\n });\n\n return min_index(0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n auto closest_idx = 0;\n auto min_diff = std::numeric_limits::max();\n auto pi = M_PI;\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& closest_idx) {\n auto diff = std::abs(x(i) - pi);\n if (diff < min_diff) {\n min_diff = diff;\n closest_idx = i;\n }\n }, min_diff, closest_idx);\n\n return closest_idx;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Policy = Kokkos::RangePolicy;\n\n auto min_distance = Kokkos::Max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(Policy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, double& min_distance_local) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < min_distance_local) {\n min_distance_local = distance;\n min_index = i;\n }\n }, min_distance);\n\n return min_index;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using namespace std;\n\n const double pi = M_PI;\n const size_t n = x.extent(0);\n double min_diff = numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(const size_t& i, double& min_diff_local, size_t& min_idx_local) {\n double diff = abs(x(i) - 
pi);\n if (diff < min_diff_local) {\n min_diff_local = diff;\n min_idx_local = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n size_t N = x.extent(0);\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& min_idx) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n size_t N = x.extent(0);\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& min_idx) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x(0) - M_PI);\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < closest_diff) {\n closest_idx = i;\n closest_diff = diff;\n }\n });\n\n return closest_idx;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n auto min_distance = std::numeric_limits::max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(policy_t(0, x.size()), KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n }, min_distance, min_index);\n\n return min_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n auto min_distance = std::numeric_limits::max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n 
min_index = i;\n }\n },\n min_distance, min_index);\n\n return min_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n using Kokkos::parallel_reduce;\n using Kokkos::value_type;\n\n size_t N = x.size();\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const size_t& i, value_type& min_diff_local, value_type& min_idx_local) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff_local) {\n min_diff_local = diff;\n min_idx_local = i;\n }\n },\n min_diff, min_idx);\n\n return min_idx;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n size_t n = x.extent(0);\n size_t closest_index = 0;\n double closest_distance = std::abs(x(0) - M_PI);\n\n Kokkos::parallel_for(\n policy_type(0, n), KOKKOS_LAMBDA(const size_t& i) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n });\n\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Min;\n\n auto min_dist = Reducer(x.data(), x.size());\n double min_val = min_dist.template reduce();\n\n auto dist_from_pi = Reducer(x.data(), x.size(),\n [min_val](double& a, const double& b) {\n a = std::min(a, std::abs(b - M_PI));\n });\n double closest_dist = dist_from_pi.template reduce();\n\n auto closest_idx = Reducer(x.data(), x.size(),\n [closest_dist, min_val](size_t& a, const double& b) {\n if (std::abs(b - M_PI) == closest_dist &&\n std::abs(b - M_PI) < std::abs(min_val - M_PI)) {\n a = a + 1;\n }\n });\n size_t idx = closest_idx.template reduce();\n\n return idx - 1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n auto min_dist = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(size_t i, size_t& min_idx, double& min_dist) {\n double dist = std::abs(x(i) - M_PI);\n if (dist < min_dist) {\n min_dist = dist;\n min_idx = i;\n }\n }, min_idx, min_dist);\n\n return min_idx;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n auto closest = 
Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(size_t i, size_t closest) {\n auto diff = std::abs(x(i) - M_PI);\n if (closest == -1 || diff < std::abs(x(closest) - M_PI)) {\n closest = i;\n }\n return closest;\n },\n -1);\n return closest;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use OpenMP to search in parallel.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n#pragma omp critical\n {\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n #pragma omp parallel for reduction(min:min_diff) reduction(min:min_idx)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n\n return min_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::numeric_limits::max();\n\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n#pragma omp critical\n {\n if (diff < closest_diff) {\n closest_idx = i;\n closest_diff = diff;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_dist = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double dist = std::abs(x[i] - M_PI);\n if (dist < closest_dist) {\n #pragma omp critical\n {\n if (dist < closest_dist) {\n closest_dist = dist;\n closest_idx = i;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma for reduction(min:closest_diff)\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for 
(size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n // Find the index of the value in the vector x that is closest to the math constant PI\n // Use M_PI for the value of PI\n // Use OpenMP to search in parallel\n\n // Get the number of threads to use\n int num_threads = omp_get_num_threads();\n\n // Create a reduction variable to store the index of the closest value\n int closest_index = 0;\n\n // Create a reduction variable to store the smallest difference between a value in x and PI\n double smallest_difference = std::numeric_limits::max();\n\n // Parallel for loop to search for the closest value\n #pragma omp parallel for reduction(min : smallest_difference) reduction(min : closest_index)\n for (int i = 0; i < static_cast(x.size()); i++) {\n // Calculate the difference between the current value and PI\n double difference = std::abs(x[i] - M_PI);\n\n // If the difference is smaller than the smallest difference found so far, update the closest index and smallest difference\n if (difference < smallest_difference) {\n closest_index = i;\n smallest_difference = difference;\n }\n }\n\n // Return the index of the closest value\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n\n#pragma omp parallel for reduction(min : min_diff) reduction(min : min_index)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n#pragma omp parallel for reduction(min : min_diff) reduction(min : min_idx)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n #pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double min_diff = std::abs(x[0] - M_PI);\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n min_diff = diff;\n closest_index = i;\n }\n }\n }\n }\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + 
"\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n }\n }\n\n return min_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n #pragma omp critical\n {\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n }\n }\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double min_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n#pragma omp critical\n {\n if (diff < min_diff) {\n closest_index = i;\n min_diff = diff;\n }\n }\n }\n }\n\n return closest_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "serial", + "prompt": "/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double minDiff = std::numeric_limits::max();\n size_t minDiffIndex = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n return minDiffIndex;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + ": 2", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = 
diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_dist = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double dist = std::abs(x[i] - M_PI);\n if (dist < min_dist) {\n min_dist = dist;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double closest = std::abs(x[0] - M_PI);\n size_t 
closest_index = 0;\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n return closest_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "hip", + "prompt": "/* Find the index of the value in the vector x that is closest to the math constant PI. Store the index in closestToPiIndex.\n Use M_PI for the value of PI.\n Use AMD HIP to search in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\n__global__ void findClosestToPi(const double *x, size_t N, size_t *closestToPiIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n double minDiff = abs(x[tid] - M_PI);\n int minDiffIndex = tid;\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n double diff = abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n\n if (tid == 0) {\n *closestToPiIndex = minDiffIndex;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n double diff = fabs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < minDiff) {\n minDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "losest", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double diff = abs(x[gid] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = gid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n 
}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n double diff = abs(x[gid] - M_PI);\n if (gid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = gid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double diff = fabs(x[gid] - M_PI);\n if (gid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = gid;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n double closest = std::numeric_limits::max();\n size_t closest_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n\n // Reduce the closest values to rank 0\n MPI_Reduce(&closest_index, &closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return closest_index;\n } else {\n return 0;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n size_t closest_index = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_index = i;\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(size);\n MPI_Gather(&closest_index, 1, MPI_UNSIGNED_LONG, closest_indices.data(), 1, MPI_UNSIGNED_LONG, 0);\n\n // Find the overall closest index\n size_t overall_closest_index = 0;\n double overall_closest_diff = std::abs(x[closest_indices[0]] - M_PI);\n for (size_t i = 1; i < size; ++i) {\n double diff = std::abs(x[closest_indices[i]] - M_PI);\n if (diff < overall_closest_diff) {\n overall_closest_diff = diff;\n overall_closest_index = closest_indices[i];\n }\n }\n\n // Return the overall closest index\n return overall_closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n\n int global_min_diff_index;\n MPI_Allreduce(&min_diff_index, &global_min_diff_index, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n\n return global_min_diff_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - 
M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n size_t n = x.size();\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n for (size_t i = 1; i < n; ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t n = x.size();\n size_t closest = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < n; i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest = i;\n }\n }\n return closest;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n int result;\n MPI_Reduce(&min_index, &result, 1, MPI_INT, MPI_MINLOC, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Find the index of the value in the vector that is closest to PI on each rank\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n // Reduce the closest indices to find the global closest index\n size_t globalClosestIndex;\n MPI_Reduce(&closestIndex, &globalClosestIndex, 1, MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the global closest index on rank 0\n if (rank == 0) {\n return globalClosestIndex;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n return result;\n}\n", + "\n\n double closest = std::numeric_limits::max();\n size_t closest_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_idx = i;\n }\n }\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::numeric_limits::max();\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t closest = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest = i;\n }\n }\n return closest;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < 
x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n int global_min_idx;\n MPI_Allreduce(&min_idx, &global_min_idx, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return global_min_idx;\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n double closest = std::numeric_limits::max();\n size_t closest_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n\n // Reduce the closest values to rank 0\n MPI_Reduce(&closest_index, &closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return closest_index;\n } else {\n return 0;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "cuda", + "prompt": "/* Find the index of the value in the vector x that is closest to the math constant PI. Store the index in closestToPiIndex.\n Use M_PI for the value of PI.\n Use CUDA to search in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\n__global__ void findClosestToPi(const double *x, size_t N, size_t *closestToPiIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + 
"\n\n int tid = threadIdx.x;\n double minDiff = abs(x[tid] - M_PI);\n int minDiffIndex = tid;\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n double diff = abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n if (tid == 0) {\n *closestToPiIndex = minDiffIndex;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = fabs(x[i] - M_PI);\n if (i == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < minDiff) {\n minDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double diff = abs(x[idx] - M_PI);\n if (idx == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = idx;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestDistance = distance;\n closestIndex = i;\n }\n }\n\n int globalClosestIndex;\n MPI_Allreduce(&closestIndex, &globalClosestIndex, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return globalClosestIndex;\n}\n", + "\n\n // Find the closest value to PI in parallel\n size_t closest_index = 0;\n double min_diff = std::numeric_limits::max();\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest_index = i;\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Allgather(&closest_index, 1, MPI::UNSIGNED_LONG, &closest_indices[0], 1, MPI::UNSIGNED_LONG);\n\n // Find the global closest index\n size_t global_closest_index = 0;\n min_diff = std::numeric_limits::max();\n for (size_t i = 0; i < closest_indices.size(); ++i) {\n double diff = std::abs(x[closest_indices[i]] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n global_closest_index = closest_indices[i];\n }\n }\n\n return global_closest_index;\n}\n", + "\n\n // Find the closest value to PI on each rank\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_index = i;\n closest_distance = distance;\n }\n }\n\n // Reduce the closest index to rank 0\n size_t global_closest_index;\n MPI_Reduce(&closest_index, &global_closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the global closest index on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double min_diff = std::numeric_limits::max();\n int min_diff_index = -1;\n\n #pragma omp parallel for reduction(min : min_diff) reduction(min : min_diff_index)\n for (int i = 0; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n\n return min_diff_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_distance)\n for (size_t i = 0; i < x.size(); ++i) {\n double distance 
= std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n\n return result;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the index of the value in the vector x that is closest to the math constant PI.\n size_t index = 0;\n double min_diff = std::numeric_limits::max();\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n index = i;\n }\n }\n\n // Return the result on rank 0.\n if (rank == 0) {\n return index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n // Get the number of ranks\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Find the closest value to PI on this rank\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_index = i;\n closest_distance = distance;\n }\n }\n\n // Reduce the closest indices and distances to find the overall closest value\n size_t global_closest_index;\n double global_closest_distance;\n MPI_Reduce(&closest_index, &global_closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n MPI_Reduce(&closest_distance, &global_closest_distance, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_distance)\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double minDiff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < minDiff) {\n 
#pragma omp critical\n if (diff < minDiff) {\n closestIndex = i;\n minDiff = diff;\n }\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t closest_idx = 0;\n double min_diff = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Initialize the index of the closest value to -1\n size_t closest_index = -1;\n\n // Initialize the minimum distance to a large value\n double min_distance = std::numeric_limits::max();\n\n // Loop over the elements of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n // Calculate the distance between the current element and PI\n double distance = std::abs(x[i] - M_PI);\n\n // Update the closest index and minimum distance if the current distance is smaller\n #pragma omp critical\n {\n if (distance < min_distance) {\n closest_index = i;\n min_distance = distance;\n }\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(num_ranks);\n MPI_Gather(&closest_index, 1, MPI_UNSIGNED_LONG, closest_indices.data(), 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Find the index with the smallest distance\n if (rank == 0) {\n size_t global_closest_index = -1;\n double global_min_distance = std::numeric_limits::max();\n for (size_t i = 0; i < num_ranks; ++i) {\n if (closest_indices[i] != -1 && x[closest_indices[i]] < global_min_distance) {\n global_closest_index = closest_indices[i];\n global_min_distance = x[closest_indices[i]];\n }\n }\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t closest_idx = 0;\n double min_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n closest_idx = i;\n min_diff = diff;\n }\n }\n }\n }\n\n return closest_idx;\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(Kokkos::View*> const& x, Kokkos::View &r, Kokkos::View &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n // Compute the Fourier transform of x\n auto y = fft_transform(x);\n\n // Extract the real and imaginary parts of the result\n r = real(y);\n i = imag(y);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& n) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double angle = 2 * M_PI * k * n / x.extent(0);\n real += x(k).real() * cos(angle) + x(k).imag() * sin(angle);\n imag += x(k).imag() * cos(angle) - x(k).real() * sin(angle);\n }\n r(n) = real;\n i(n) = imag;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Compute the FFT of x\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2.0 * M_PI * j * i / x.extent(0);\n sum += x(j) * std::exp(-std::complex(0.0, angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n real += x(j).real() * cos(angle) - x(j).imag() * sin(angle);\n imag += x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n 
Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA (const int& n) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double angle = 2 * M_PI * k * n / x.extent(0);\n real += x(k).real() * cos(angle) - x(k).imag() * sin(angle);\n imag += x(k).real() * sin(angle) + x(k).imag() * cos(angle);\n }\n r(n) = real;\n i(n) = imag;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Compute the DFT of x\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& n) {\n double re = 0.0;\n double im = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double theta = 2 * M_PI * k * n / x.extent(0);\n re += x(k).real() * cos(theta) + x(k).imag() * sin(theta);\n im += x(k).real() * sin(theta) - x(k).imag() * cos(theta);\n }\n r(n) = re;\n i(n) = im;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double re = 0.0;\n double im = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n re += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n im += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = re;\n i(i) = im;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += -x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n\n const int n = x.extent(0);\n const int n_half = n / 2;\n\n // Create a plan for the FFT\n 
FFTPlan plan(n);\n\n // Create a view of the input and output arrays\n View x_real = create_view(x, 0);\n View x_imag = create_view(x, 1);\n View r_view = create_view(r);\n View i_view = create_view(i);\n\n // Execute the FFT\n plan.execute(x_real, x_imag, r_view, i_view);\n\n // Normalize the output\n r_view /= n;\n i_view /= n;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\nKokkos::parallel_for(Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA (const int i) {\n r(i) = 0;\n i(i) = 0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n r(i) += x(j) * cos(angle);\n i(i) -= x(j) * sin(angle);\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += -x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + 
"did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n\n // Create a plan for the FFT\n auto plan = FFTPlan(x.extent(0), true);\n\n // Create a workspace for the FFT\n auto workspace = plan.get_workspace();\n\n // Execute the FFT\n plan.execute(x, workspace);\n\n // Extract the real and imaginary parts of the results\n r = plan.get_output_real();\n i = plan.get_output_imag();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n // Create a team policy for parallel execution\n TeamPolicy policy(x.extent(0));\n\n // Define the kernel lambda to compute the FFT\n KernelFunctor kernel;\n kernel.template run(policy, [=](const int& i) {\n std::complex sum = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sum += x(j) * std::exp(-std::complex(0, angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x. 
Store real part of results in r and imaginary in i.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) - x[t].imag() * sin(angle);\n sumi += x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int N = x.size();\n r.resize(N);\n i.resize(N);\n\n #pragma omp parallel for\n for (int k = 0; k < N; ++k) {\n std::complex sum = 0;\n for (int n = 0; n < N; ++n) {\n double angle = 2 * M_PI * n * k / N;\n sum += x[n] * std::polar(1.0, -angle);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const size_t n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (size_t j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const size_t n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute the fourier transform of x.\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (size_t j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::exp(-std::complex(0, 2 * M_PI * k * t / n));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n int num_threads = omp_get_max_threads();\n int chunk_size = n / num_threads;\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int start = tid * chunk_size;\n int end = start + chunk_size;\n if (tid == num_threads - 1) {\n end = n;\n }\n\n for (int j = start; j < end; j++) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < n; k++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[k].real() * cos(angle) + x[k].imag() * sin(angle);\n imag += -x[k].real() * sin(angle) + x[k].imag() * cos(angle);\n }\n r[j] = real;\n i[j] = imag;\n }\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n std::complex omega = std::polar(1.0, -2 * 
M_PI * t * k / n);\n sum += x[t] * omega;\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, -angle);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumi += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute the DFT of x\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::exp(-2 * M_PI * std::complex(0, 1) * k * t / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::exp(std::complex(0, -angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * 
t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumi += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = 
x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sumr += x[j].real() * cos(angle) + x[j].imag() * sin(angle);\n sumi += -x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int k = 0; k < n / 2; k++) {\n even[k] = x[2 * k];\n odd[k] = x[2 * k + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int k = 0; k < n / 2; k++) {\n std::complex t = w * odd[k];\n r[k] = r[k] + t.real();\n i[k] = i[k] + t.imag();\n r[k + n / 2] = r[k] - t.real();\n i[k + n / 2] = i[k] - t.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 0) {\n return;\n }\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2);\n std::vector> odd(n / 2);\n for (int k = 0; k < n / 2; k++) {\n even[k] = x[2 * k];\n odd[k] = x[2 * k + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double pi = std::acos(-1);\n for (int k = 0; k < n / 2; k++) {\n double t = r[k];\n r[k] = t + std::cos(2 * pi * k / n) * r[k + n / 2] - std::sin(2 * pi * k / n) * i[k + n / 2];\n i[k] = t - std::cos(2 * pi * k / n) * r[k + n / 2] + std::sin(2 * pi * k / n) * i[k + n / 2];\n }\n}\n", + "\n\n int n = x.size();\n if (n == 0) {\n return;\n }\n r.resize(n);\n i.resize(n);\n for (int k = 0; k < n; k++) {\n double real = 0;\n double imag = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[j].real() * cos(angle) - x[j].imag() * sin(angle);\n imag += x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = real;\n i[k] = imag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = 
x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "fft, 4,", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double real = 0;\n double imag = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[j].real() * cos(angle) - x[j].imag() * sin(angle);\n imag += x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r.push_back(real);\n i.push_back(imag);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sumr += x[j].real() * cos(angle) + x[j].imag() * sin(angle);\n sumi += -x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n std::vector> X(n);\n for (int k = 0; k < n; ++k) {\n for (int j = 0; j < n; ++j) {\n X[k] += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n }\n for (int i = 0; i < n; ++i) {\n r[i] = X[i].real();\n i[i] = X[i].imag();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\n__global__ void fft(const hipDoubleComplex *x, double *r, double *i, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n hipDoubleComplex sum = {0, 0};\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n if (i < N) {\n hipDoubleComplex sum = hipCreateDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex rot = hipCreateDoubleComplex(cos(angle), sin(angle));\n sum = hipCadd(sum, hipCmul(x[j], rot));\n }\n r[i] = hipCreal(sum);\n i[i] = hipCimag(sum);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; k++) {\n double arg = -2 * M_PI * k * gid / N;\n hipDoubleComplex w = {cos(arg), sin(arg)};\n sum += x[k] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n}\n", + "fft:", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[i] = sum.x;\n i[i] = sum.y;\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n size_t N2 = N / 2;\n if (tid < N2) {\n r[tid] = x[tid].x + x[tid + N2].x;\n i[tid] = x[tid].y + x[tid + N2].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * gid / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[i] = sum.x;\n i[i] = sum.y;\n}\n", + "\n\n int id = blockIdx.x * blockDim.x + threadIdx.x;\n if (id < N) {\n r[id] = x[id].real();\n i[id] = x[id].imag();\n }\n __syncbarriers();\n for (size_t n = 2; n <= N; n *= 2) {\n double a = 2 * M_PI / n;\n for (size_t k = 0; k < N; k += n) {\n double c = cos(a * k);\n double s = sin(a * k);\n for (size_t j = 0; j < n / 2; j++) {\n size_t p = k + j;\n size_t q = p + n / 2;\n double t = r[q] * c - i[q] * s;\n i[q] = r[q] * s + i[q] * c;\n r[q] = r[p] - t;\n i[p] = i[p] - t;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * i / N;\n hipDoubleComplex w = hipMakeDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = 
hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n if (i < N) {\n r[i] = x[i].x;\n i[i] = x[i].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n r[gid] = x[gid].x;\n i[gid] = x[gid].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * gid * j / N});\n sum += x[j] * w;\n }\n r[gid] = hipCreal(sum);\n i[gid] = hipCimag(sum);\n }\n}\n", + "\n\n size_t idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx >= N) return;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * idx * k / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[k] * w;\n }\n r[idx] = sum.x;\n i[idx] = sum.y;\n}\n", + "\n\n size_t k = blockIdx.x * blockDim.x + threadIdx.x;\n if (k < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0, -2 * M_PI * j * k / N));\n sum += x[j] * w;\n }\n r[k] = hipCreal(sum);\n i[k] = hipCimag(sum);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx >= N) return;\n hipDoubleComplex sum = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * idx * k / N;\n hipDoubleComplex w = hipMakeDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[idx] = creal(sum);\n i[idx] = cimag(sum);\n}\n", + "\n\n // Compute the fourier transform of x.\n hipDoubleComplex *X = hipMalloc(N);\n hipMemcpy(X, x, N * sizeof(hipDoubleComplex), hipMemcpyHostToDevice);\n hipfftPlan plan;\n hipfftPlanCreate(&plan, N, HIPFFT_C2C, HIPFFT_FORWARD);\n hipfftExecC2C(plan, X, X, HIPFFT_FORWARD);\n hipfftDestroyPlan(plan);\n hipMemcpy(r, X, N * sizeof(double), hipMemcpyDeviceToHost);\n hipMemcpy(i, X + 1, N * sizeof(double), hipMemcpyDeviceToHost);\n hipFree(X);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n int stride = hipBlockDim_x * hipGridDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * k * gid / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[k] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n}\n", + "fft," + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y);\n\n // Gather the results on rank 0\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT of z on rank 0\n if (rank == 0) {\n fft_local(z, y);\n }\n\n // Scatter the results to all ranks\n MPI_Scatter(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the results\n for (int j = 0; j < n; j++) {\n r[j] = z[j].real();\n i[j] = z[j].imag();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n y[k] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Send the DFT of x to rank 0\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse DFT of z\n if (rank == 0) {\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n x[k] += z[j] * std::exp(2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n x[k] /= n;\n r[k] = x[k].real();\n i[k] = x[k].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the global result\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n 
i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector r_local(n), i_local(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y, r_local, i_local);\n\n // Gather the results on rank 0\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < n; i++) {\n r[i] = r_local[i];\n i[i] = i_local[i];\n }\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n), i_recv(n);\n MPI_Recv(r_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n r[j] += r_recv[j];\n i[j] += i_recv[j];\n }\n }\n } else {\n MPI_Send(r_local.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i_local.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_fft(n);\n fft_local(x, local_fft);\n\n // Gather local FFTs\n std::vector> global_fft(n * size);\n MPI_Gather(local_fft.data(), n, MPI_DOUBLE, global_fft.data(), n, MPI_DOUBLE, 0);\n\n // Compute global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter global FFT\n MPI_Scatter(global_fft.data(), n, MPI_DOUBLE, local_fft.data(), n, MPI_DOUBLE, 0);\n\n // Extract real and imaginary parts\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = local_fft[j].real();\n i[j] = local_fft[j].imag();\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n);\n std::vector> local_r(local_n), local_i(local_n);\n fft_recursive(local_x, local_r, local_i);\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].real();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n std::vector> Z(n);\n\n for (int j = 0; j < n; j++) {\n X[j] = x[j];\n }\n\n MPI_Alltoall(X.data(), 1, MPI_DOUBLE_INT, Y.data(), 1, MPI_DOUBLE_INT, MPI_COMM_WORLD);\n\n for (int j = 0; j < n; 
j++) {\n Z[j] = Y[j] * std::exp(-2.0 * M_PI * std::complex(0.0, 1.0) * j / n);\n }\n\n MPI_Alltoall(Z.data(), 1, MPI_DOUBLE_INT, Y.data(), 1, MPI_DOUBLE_INT, MPI_COMM_WORLD);\n\n for (int j = 0; j < n; j++) {\n std::complex sum = std::complex(0.0, 0.0);\n for (int k = 0; k < n; k++) {\n sum += Y[k] * std::exp(2.0 * M_PI * std::complex(0.0, 1.0) * j * k / n);\n }\n X[j] = sum;\n }\n\n for (int j = 0; j < n; j++) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n if (rank == 0) {\n for (int j = 1; j < size; j++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Status status;\n MPI_Request request;\n MPI_Irecv(r_recv.data(), n, MPI_DOUBLE, j, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n MPI_Irecv(i_recv.data(), n, MPI_DOUBLE, j, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_impl(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> r_all(x.size());\n std::vector> i_all(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_all.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_all.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r = std::vector(r_all.size());\n i = std::vector(i_all.size());\n for (size_t j = 0; j < r_all.size(); ++j) {\n r[j] = r_all[j].real();\n i[j] = r_all[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y);\n\n // Gather the results on rank 0\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < size; i++) {\n std::vector> y_i(n);\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &y_i[0], n, MPI_DOUBLE_COMPLEX, i, MPI_COMM_WORLD);\n for (int j = 0; j < n; j++) {\n r[j] += y_i[j].real();\n i[j] += y_i[j].imag();\n }\n }\n } else {\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); j++) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n if (n != (1 << (int)log2(n))) {\n throw std::invalid_argument(\"x must have a power of 2 length\");\n }\n\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = x[i];\n }\n\n for (int s = 1; s <= (int)log2(n); s++) {\n int m = 1 << s;\n int m2 = m / 2;\n double theta = 2 * M_PI / m;\n std::complex w(cos(theta), sin(theta));\n std::complex wn(1, 0);\n for (int j = 0; j < m2; j++) {\n for (int k = j; k < n; k += m) {\n std::complex t = wn * y[k + m2];\n y[k + m2] = y[k] - t;\n y[k] = y[k] + t;\n }\n wn = wn * w;\n }\n }\n\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n r[i] = y[i].real();\n i[i] = y[i].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_impl(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_r(n_local);\n std::vector> local_i(n_local);\n\n // Compute local FFT\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(n);\n std::vector> global_i(n);\n MPI_Gather(local_r.data(), n_local, MPI_DOUBLE, global_r.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n_local, MPI_DOUBLE, global_i.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n 
// Compute the local part of the transform\n std::vector> local_r(n), local_i(n);\n for (int j = 0; j < n; ++j) {\n std::complex sum_r(0, 0), sum_i(0, 0);\n for (int k = 0; k < n; ++k) {\n double angle = -2 * M_PI * j * k / n;\n sum_r += std::exp(std::complex(0, angle)) * x[k];\n sum_i += std::exp(std::complex(0, angle)) * x[k] * std::complex(0, 1);\n }\n local_r[j] = sum_r;\n local_i[j] = sum_i;\n }\n\n // Gather the local transforms into a global array\n std::vector> global_r(n), global_i(n);\n MPI_Allgather(local_r.data(), n, MPI_DOUBLE_COMPLEX, global_r.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n MPI_Allgather(local_i.data(), n, MPI_DOUBLE_COMPLEX, global_i.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the global transform\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * n / size, x.begin() + (rank + 1) * n / size);\n std::vector> local_r(n / size), local_i(n / size);\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(n), i_gathered(n);\n MPI_Gather(local_r.data(), n / size, MPI_DOUBLE, r_gathered.data(), n / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n / size, MPI_DOUBLE, i_gathered.data(), n / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(n);\n i = std::vector(n);\n for (int j = 0; j < n; j++) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n\n // Compute the local Fourier transform\n std::vector> local_x(n);\n std::vector> local_r(n);\n std::vector> local_i(n);\n for (int j = 0; j < n; ++j) {\n local_x[j] = x[j];\n }\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(n * size);\n std::vector> i_gathered(n * size);\n MPI_Gather(local_r.data(), n, MPI_DOUBLE_COMPLEX, r_gathered.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n, MPI_DOUBLE_COMPLEX, i_gathered.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n for (int j = 0; j < n; ++j) {\n r[j] = r_gathered[j].real();\n i[j] = i_gathered[j].real();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Allgather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), local_r.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n MPI_Allgather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), local_i.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].real();\n }\n }\n}\n" + ] + }, + { + 
"problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\n__global__ void fft(const cuDoubleComplex *x, double *r, double *i, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) {\n return;\n }\n double sum_r = 0.0;\n double sum_i = 0.0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * idx / N;\n sum_r += x[k].x * cos(angle) + x[k].y * sin(angle);\n sum_i += -x[k].x * sin(angle) + x[k].y * cos(angle);\n }\n r[idx] = sum_r;\n i[idx] = sum_i;\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t k = blockIdx.x * blockDim.x + threadIdx.x;\n if (k < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * k * j / N;\n cuDoubleComplex rot = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * rot;\n }\n r[k] = creal(sum);\n i[k] = cimag(sum);\n }\n}\n", + "\n\n // Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n // Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n // Example:\n\n // input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n // output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[gid] = creal(sum);\n i[gid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = block.thread.x + block.block.x * block.grid.x;\n if (tid >= N) {\n return;\n }\n double re = 0, im = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * tid * k / N;\n re += x[k] * cos(angle);\n im += x[k] * sin(angle);\n }\n r[tid] = re;\n i[tid] = im;\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i0 = tid + bid * blockDim.x;\n if (i0 < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i0 * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i0] = creal(sum);\n i[i0] = cimag(sum);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n int stride = blockDim.x * gridDim.x;\n for (int k = tid; k < N; k += stride) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * k / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(x[j], term));\n }\n r[k] = sum.x;\n i[k] = sum.y;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * k * tid / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx >= N) return;\n\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * idx * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[idx] = creal(sum);\n i[idx] = cimag(sum);\n}\n", + "\n\n size_t tid = 
threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = sum.x;\n i[tid] = sum.y;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n size_t offset = tid % N;\n double sumr = 0, sumi = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * offset / N;\n sumr += real(x[k]) * cos(angle) + imag(x[k]) * sin(angle);\n sumi += -real(x[k]) * sin(angle) + imag(x[k]) * cos(angle);\n }\n r[tid] = sumr;\n i[tid] = sumi;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid >= N) return;\n\n double re = 0, im = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n re += x[k].x * cos(angle) - x[k].y * sin(angle);\n im += x[k].x * sin(angle) + x[k].y * cos(angle);\n }\n r[tid] = re;\n i[tid] = im;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i0 = tid + bid * blockDim.x;\n if (i0 < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * i0 / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[i0] = creal(sum);\n i[i0] = cimag(sum);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> x_local(x.begin() + start, x.begin() + end);\n std::vector> y_local(n_local);\n std::vector> y(n);\n\n // Compute the DFT of the local data\n fft_local(x_local, y_local);\n\n // Gather the results from all ranks\n MPI_Allgather(y_local.data(), n_local, MPI_DOUBLE_COMPLEX, y.data(), n_local, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Compute the global DFT\n fft_global(y, n);\n\n // Extract the real and imaginary parts of the result\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = y[j].real();\n i[j] = y[j].imag();\n }\n\n // Send the result to rank 0\n if (rank != 0) {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n // Receive the results from all ranks\n for (int i = 1; i < size; i++) {\n MPI_Recv(r.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_fft(n);\n fftw_plan plan = fftw_plan_dft_1d(n, reinterpret_cast(&x[0]), reinterpret_cast(&local_fft[0]), FFTW_FORWARD, FFTW_ESTIMATE);\n fftw_execute(plan);\n fftw_destroy_plan(plan);\n\n // Gather local FFT results\n std::vector> global_fft(n * size);\n MPI_Gather(local_fft.data(), n, MPI_DOUBLE, global_fft.data(), n, MPI_DOUBLE, 0);\n\n // Compute global FFT\n fftw_plan plan2 = fftw_plan_dft_1d(n * size, reinterpret_cast(&global_fft[0]), reinterpret_cast(&global_fft[0]), FFTW_BACKWARD, FFTW_ESTIMATE);\n fftw_execute(plan2);\n fftw_destroy_plan(plan2);\n\n // Extract real and imaginary parts\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = real(global_fft[j] / n);\n i[j] = imag(global_fft[j] / n);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(local_n);\n std::vector> local_r(local_n);\n std::vector> local_i(local_n);\n std::copy(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n, local_x.begin());\n fft_serial(local_x, local_r, local_i);\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int n_local_padded = n_local * 2;\n\n std::vector> x_local(n_local);\n std::vector> x_local_padded(n_local_padded);\n std::vector> y_local(n_local_padded);\n std::vector> y(n);\n\n MPI_Scatter(x.data(), n_local, MPI_DOUBLE_COMPLEX, x_local.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Pad with zeros\n for (int i = 0; i < n_local; i++) {\n x_local_padded[i] = x_local[i];\n x_local_padded[i + n_local] = 0;\n }\n\n // Compute FFT in parallel\n fft_recursive(x_local_padded, y_local);\n\n // Gather results\n MPI_Gather(y_local.data(), n_local_padded, MPI_DOUBLE_COMPLEX, y.data(), n_local_padded, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < n; i++) {\n r[i] = y[i].real();\n i[i] = y[i].imag();\n }\n }\n}\n", + "\n\n // Compute the FFT of x.\n std::vector> X(x);\n fft_mpi(X);\n\n // Store the real and imaginary parts of the FFT in r and i.\n r.resize(X.size());\n i.resize(X.size());\n for (size_t j = 0; j < X.size(); ++j) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n // Gather the results on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector r_all(X.size() * MPI::COMM_WORLD.Get_size());\n std::vector i_all(X.size() * MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Gather(&r[0], X.size(), MPI::DOUBLE, &r_all[0], X.size(), MPI::DOUBLE, 0);\n MPI::COMM_WORLD.Gather(&i[0], X.size(), MPI::DOUBLE, &i_all[0], X.size(), MPI::DOUBLE, 0);\n r = r_all;\n i = i_all;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_all(x.size());\n std::vector> 
i_all(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_all.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_all.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); j++) {\n r[j] = r_all[j].real();\n i[j] = r_all[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n std::complex sum = std::complex(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n X[k] = sum;\n }\n\n // Compute the inverse DFT of X\n for (int k = 0; k < n; k++) {\n std::complex sum = std::complex(0, 0);\n for (int j = 0; j < n; j++) {\n sum += X[j] * std::exp(2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n Y[k] = sum / n;\n }\n\n // Store the real and imaginary parts of Y in r and i\n for (int k = 0; k < n; k++) {\n r[k] = Y[k].real();\n i[k] = Y[k].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Recv(&r_recv[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(&i_recv[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(&r[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(&i[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n y[k] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j] += y[k] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Store the real and imaginary parts of the DFT in r and i\n for (int j = 0; j < n; j++) {\n r[j] = z[j].real();\n i[j] = z[j].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_i(n);\n std::vector i_i(n);\n MPI_Recv(r_i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n r[j] += r_i[j];\n i[j] += i_i[j];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute local FFTs\n std::vector> local_r(n);\n std::vector> local_i(n);\n fft_local(x, local_r, local_i);\n\n // Gather local FFTs\n std::vector> global_r(n * size);\n std::vector> global_i(n * size);\n MPI_Gather(local_r.data(), n, MPI_DOUBLE_COMPLEX, global_r.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n, MPI_DOUBLE_COMPLEX, global_i.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute global FFT\n if (rank == 0) {\n 
fft_global(global_r, global_i);\n }\n\n // Scatter global FFT\n MPI_Scatter(global_r.data(), n, MPI_DOUBLE_COMPLEX, r.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Scatter(global_i.data(), n, MPI_DOUBLE_COMPLEX, i.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local partial sums\n std::vector> local_r(n, 0.0);\n std::vector> local_i(n, 0.0);\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n double angle = -2 * M_PI * j * k / n;\n local_r[k] += x[j] * std::cos(angle);\n local_i[k] += x[j] * std::sin(angle);\n }\n }\n\n // Reduce partial sums to rank 0\n std::vector> global_r(n, 0.0);\n std::vector> global_i(n, 0.0);\n MPI_Reduce(local_r.data(), n, MPI_DOUBLE, global_r.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Reduce(local_i.data(), n, MPI_DOUBLE, global_i.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts on rank 0\n if (rank == 0) {\n for (int k = 0; k < n; k++) {\n r[k] = global_r[k].real();\n i[k] = global_i[k].imag();\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n\n // Compute the FFT of x\n #pragma opm parallel for\n for (int j = 0; j < n; j++) {\n X[j] = x[j];\n }\n fft(X);\n\n // Compute the real and imaginary parts of the FFT\n #pragma opm parallel for\n for (int j = 0; j < n; j++) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int j = 1; j < size; j++) {\n std::vector r_j(n);\n std::vector i_j(n);\n MPI_Gather(&r[0], n, MPI_DOUBLE, &r_j[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&i[0], n, MPI_DOUBLE, &i_j[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (int k = 0; k < n; k++) {\n r[k] += r_j[k];\n i[k] += i_j[k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Allgather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n 
MPI_Allgather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_r(n_local);\n std::vector> local_i(n_local);\n\n // Compute the local FFT\n fft_local(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> r_gathered(n);\n std::vector> i_gathered(n);\n MPI_Gather(local_r.data(), n_local, MPI_DOUBLE, r_gathered.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n_local, MPI_DOUBLE, i_gathered.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r = std::vector(n);\n i = std::vector(n);\n for (int j = 0; j < n; j++) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(local_n);\n std::vector> local_r(local_n);\n std::vector> local_i(local_n);\n\n std::copy(x.begin() + local_start, x.begin() + local_end, local_x.begin());\n\n #pragma omp parallel for\n for (int j = 0; j < local_n; j++) {\n double sum_r = 0.0;\n double sum_i = 0.0;\n for (int k = 0; k < n; k++) {\n double theta = 2 * M_PI * k / n;\n sum_r += x[k] * cos(theta * j);\n sum_i += x[k] * sin(theta * j);\n }\n local_r[j] = sum_r;\n local_i[j] = sum_i;\n }\n\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> 
i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n y[k] = sum;\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j * n + k] = y[k * n + j];\n }\n }\n\n // Compute the DFT of z\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += z[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n y[k] = sum;\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j * n + k] = y[k * n + j];\n }\n }\n\n // Store the real and imaginary parts of the DFT in r and i\n for (int k = 0; k < n; k++) {\n r[k] = z[k].real();\n i[k] = z[k].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Recv(r_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const size_t n = x.extent(0);\n const size_t n_half = n / 2;\n\n // Compute the FFT of the first half of x.\n auto x_half = x.slice(0, n_half);\n fft(x_half);\n\n // Compute the FFT of the second half of x.\n auto x_half_plus = x.slice(n_half, n_half);\n fft(x_half_plus);\n\n // Compute the imaginary conjugate of the second half of x.\n for (size_t i = 0; i < n_half; ++i) {\n x_half_plus(i) = conj(x_half_plus(i));\n }\n\n // Swap the first and second halves of x.\n auto temp = x_half;\n x_half = x_half_plus;\n x_half_plus = temp;\n\n // Compute the inverse FFT of x.\n fft(x, Inverse);\n\n // Normalize the FFT by dividing by n.\n for (size_t i = 0; i < n; ++i) {\n x(i) /= n;\n }\n}\n", + "source_write_success": true, + "did_build": false, + 
"is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + 
"did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. 
Return the imaginary conjugate of each value.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const size_t n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n std::vector> w(n);\n std::vector> u(n);\n std::vector> t(n);\n\n#pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n y[i] = x[i];\n }\n\n for (size_t i = 0; i < n; i++) {\n for (size_t j = 0; j < n; j++) {\n w[j] = std::exp(std::complex(0, -2 * M_PI * i * j / n));\n }\n for (size_t j = 0; j < n; j++) {\n z[j] = y[j] * w[j];\n }\n for (size_t j = 0; j < n; j++) {\n u[j] = z[j];\n }\n for (size_t j = 0; j < n / 2; j++) {\n t[j] = u[j] + std::conj(u[n - j - 1]);\n t[n - j - 1] = std::complex(0, 1) * (u[j] - std::conj(u[n - j - 1]));\n }\n for (size_t j = 0; j < n; j++) {\n y[j] = t[j];\n }\n }\n\n#pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n x[i] = y[i];\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // Compute the fourier transform of x in-place.\n fft(x);\n\n // Compute the imaginary conjugate of each value.\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n int size = 1 << log_n;\n std::vector> y(size);\n for (int i = 0; i < n; i++) {\n y[i] = x[i];\n }\n for (int i = n; i < size; i++) {\n y[i] = 0;\n }\n std::vector> roots(size / 2);\n for (int i = 0; i < size / 2; i++) {\n double angle = 2 * M_PI * i / size;\n roots[i] = std::complex(cos(angle), sin(angle));\n }\n for (int i = 0; i < log_n; i++) {\n int step = 1 << i;\n int half_step = step / 2;\n for (int j = 0; j < size; j += step) {\n for (int k = 0; k < half_step; k++) {\n std::complex u = y[j + k];\n std::complex v = y[j + k + half_step] * roots[size / (2 * step) * k];\n y[j + k] = u + v;\n y[j + k + half_step] = u - v;\n }\n }\n }\n for (int i = 0; i < size; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int numThreads = omp_get_max_threads();\n const int chunkSize = n / numThreads;\n\n #pragma omp parallel\n {\n #pragma omp for schedule(static, chunkSize)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n }\n}\n", + "\n\n const int n = x.size();\n const int lg = __lg(n);\n int i0 = 0;\n for (int l = 2; l <= n; l <<= 1) {\n for (int i = 0; i < n; i += l) {\n for (int j = 0; j < l / 2; j++) {\n std::complex u = x[i0 + j];\n std::complex v = x[i0 + j + l / 2];\n x[i0 + j] = u + v;\n x[i0 + j + l / 2] = std::complex(u.real() - v.real(), -v.imag());\n }\n i0 += l;\n }\n i0 = 0;\n }\n for (int i = 0; i < n; i++) {\n int j = __builtin_reverse(i, lg);\n if (i < j) {\n std::swap(x[i], x[j]);\n }\n }\n}\n", + "\n\n int n = x.size();\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n fft(y);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n // Compute the Fourier transform of x in-place.\n fft(x);\n\n // Compute the imaginary conjugate of each value.\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 
std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x\n fft(x, y);\n\n // Compute the conjugate of each value in y\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n\n // Copy the conjugate values back into x\n x = z;\n}\n", + "\n\n const int n = x.size();\n if (n == 1) return;\n std::vector> even(n / 2), odd(n / 2);\n#pragma omp parallel for\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int n_threads = omp_get_max_threads();\n const int n_threads_per_group = 4;\n const int n_groups = n_threads / n_threads_per_group;\n\n std::vector> y(n);\n std::vector> z(n);\n\n #pragma omp parallel num_threads(n_threads)\n {\n const int tid = omp_get_thread_num();\n const int group_id = tid / n_threads_per_group;\n const int local_id = tid % n_threads_per_group;\n\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n y[rev] = x[i];\n }\n\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const double omega = 2 * M_PI / m;\n const std::complex w(cos(omega), sin(omega));\n\n #pragma omp for schedule(static)\n for (int k = 0; k < n; k += m) {\n std::complex wn(1, 0);\n for (int j = 0; j < m / 2; j++) {\n const std::complex t1 = y[k + j];\n const std::complex t2 = wn * y[k + j + m / 2];\n y[k + j] = t1 + t2;\n y[k + j + m / 2] = t1 - t2;\n wn *= w;\n }\n }\n }\n\n #pragma omp for schedule(static)\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n }\n\n x = z;\n}\n", + "\n\n const int n = x.size();\n const int n_half = n / 2;\n const double pi = 3.14159265358979323846;\n\n // Reverse-bit permutation\n for (int i = 0; i < n; i++) {\n int j = 0;\n for (int k = 0; k < n_half; k++) {\n if (i & (1 << k)) {\n j |= (1 << (n_half - k - 1));\n }\n }\n if (i < j) {\n std::swap(x[i], x[j]);\n }\n }\n\n // Compute the FFT in-place\n for (int s = 1; s <= n_half; s *= 2) {\n const double theta = pi / s;\n const std::complex wn(std::cos(theta), std::sin(theta));\n for (int i = 0; i < n; i += 2 * s) {\n std::complex w(1, 0);\n for (int j = 0; j < s; j++) {\n const std::complex u = x[i + j];\n const std::complex v = x[i + j + s] * w;\n x[i + j] = u + v;\n x[i + j + s] = u - v;\n w *= wn;\n }\n }\n }\n\n // Conjugate the imaginary part of each value\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x\n fft(x, y);\n\n // Compute the imaginary conjugate of each value in y\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n\n // Copy the values of z back into x\n x = z;\n}\n", + "\n\n const int n = x.size();\n const int num_threads = omp_get_max_threads();\n const int chunk_size = n / num_threads;\n\n #pragma omp parallel for schedule(static, chunk_size)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int nthreads = omp_get_max_threads();\n const int chunksize = n / nthreads;\n\n #pragma omp 
parallel for schedule(static, chunksize)\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n if (i < rev) {\n std::swap(x[i], x[rev]);\n }\n }\n\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const double wn = std::polar(1.0, -2.0 * M_PI / m);\n const double w = 1.0;\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int j = 0; j < n; j += m) {\n double wr = w;\n double wi = 0.0;\n for (int k = 0; k < m / 2; k++) {\n const std::complex t = x[j + k + m / 2] * std::complex(wr, wi);\n x[j + k + m / 2] = x[j + k] - t;\n x[j + k] += t;\n const double newwr = wr * w.real() - wi * w.imag();\n const double newwi = wr * w.imag() + wi * w.real();\n wr = newwr;\n wi = newwi;\n }\n }\n }\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n std::vector> y(n);\n std::vector> w(n);\n for (int i = 0; i < n; i++) {\n w[i] = std::polar(1.0, -2 * M_PI * i / n);\n }\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * w[j * k % n];\n }\n y[k] = sum;\n }\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Take the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int n_threads = omp_get_max_threads();\n\n // Reverse bits\n #pragma omp parallel for schedule(static)\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n if (i < rev) {\n std::swap(x[i], x[rev]);\n }\n }\n\n // Compute the FFT\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const int m2 = m >> 1;\n const double theta = 2 * M_PI / m;\n std::complex w(1, 0);\n std::complex wn(std::cos(theta), std::sin(theta));\n\n #pragma omp parallel for schedule(static)\n for (int k = 0; k < n; k += m) {\n for (int j = 0; j < m2; j++) {\n std::complex t = w * x[k + j + m2];\n x[k + j + m2] = x[k + j] - t;\n x[k + j] += t;\n }\n w *= wn;\n }\n }\n\n // Conjugate the imaginary part\n #pragma omp parallel for schedule(static)\n for (int i = 0; i < n; i++) {\n x[i].imag(-x[i].imag());\n }\n}\n", + "\n\n // Compute the FFT in-place\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int num_threads = omp_get_max_threads();\n const int chunk_size = n / num_threads;\n\n #pragma omp parallel for schedule(static, chunk_size)\n for (int i = 0; i < n; ++i) {\n x[i] = std::conj(x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x in-place. 
Return the imaginary conjugate of each value.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) return;\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::polar(1.0, ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n x[i][1] = -x[i][1];\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex tmp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(tmp);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n return;\n }\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n fft(y);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) return;\n std::vector> even = x;\n std::vector> odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const size_t n = x.size();\n if (n <= 1) return;\n\n std::vector> even = x[std::slice(0, n / 2, 2)];\n std::vector> odd = x[std::slice(1, n / 2, 2)];\n\n fftConjugate(even);\n fftConjugate(odd);\n\n for (size_t i = 0; i < n / 2; 
++i) {\n std::complex t = std::exp(std::complex(0, -2 * M_PI * i / n)) * odd[i];\n x[i] = even[i] + t;\n x[i + n / 2] = even[i] - t;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n for (auto &v : x) {\n v = std::conj(v);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fftConjugate(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[N - 1 - i].x;\n x[i].y = -x[N - 1 - i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = hipThreadIdx.x;\n if (i < size) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[N - 1 - i].x;\n x[i].y = -x[N - 1 - i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * 
hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); ++i) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector> y_recv(x.size());\n MPI_Recv(y_recv.data(), y_recv.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < y_recv.size(); ++j) {\n y[j] += y_recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT of x in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int log2n = 0;\n while ((1 << log2n) < n) log2n++;\n\n std::vector> y(n);\n for (int i = 0; i < n; i++) y[i] = x[i];\n\n for (int s = 1; s <= log2n; s++) {\n int m = 1 << s;\n int m2 = m / 2;\n double theta = 2 * M_PI / m;\n std::complex w(cos(theta), sin(theta));\n std::complex wn(1, 0);\n for (int j = 0; j < m2; j++) {\n for (int k = j; k < n; k += m) {\n std::complex t = wn * y[k + m2];\n y[k + m2] = y[k] - t;\n y[k] = y[k] + t;\n }\n wn = wn * w;\n }\n }\n\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n }\n}\n", + ".", + "\n\n size_t n = x.size();\n std::transform(x.begin(), x.end(), x.begin(), [](std::compelex z) { return std::compelex(z.re, -z.im); });\n 
std::sort(x.begin(), x.end(), [](std::compelex a, std::compelex b) { return std::im(a) < std::im(b); });\n std::transform(x.begin(), x.end(), x.begin(), [](std::compelex z) { return std::compelex(z.re, -z.im); });\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the global FFT\n if (rank == 0) {\n fft(global_fft, x);\n }\n\n // Scatter the global FFT\n MPI_Scatter(x.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the conjugate of the local FFT\n for (size_t i = 0; i < local_fft.size(); i++) {\n local_fft[i] = std::conj(local_fft[i]);\n }\n\n // Gather the conjugate FFTs\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the inverse global FFT\n if (rank == 0) {\n ifft(global_fft, x);\n }\n\n // Scatter the inverse global FFT\n MPI_Scatter(x.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_fft = fft(x);\n\n // Gather the local Fourier transforms on rank 0\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (auto &value : global_fft) {\n value = std::conj(value);\n }\n }\n\n // Scatter the imaginary conjugate Fourier transforms to all ranks\n MPI_Scatter(global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse Fourier transform\n x = ifft(local_fft);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv(x.size());\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n y[j] += recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of the result\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results from all ranks\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, 
MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place on each rank\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x, y);\n\n // Conjugate the local FFT\n for (int i = 0; i < n_local; i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the local FFTs\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the result to the input vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place on each rank\n std::fft(x.begin(), x.end());\n\n // Conjugate the imaginary part of each value\n for (auto &v : x) {\n v.imag() = -v.imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, y.data(), x.size(), MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n x = std::move(y);\n } else {\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, nullptr, 0, MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft(x);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (auto &value : global_fft) {\n value = std::conj(value);\n }\n }\n\n // Scatter the global FFT back to each rank\n MPI_Scatter(global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the local FFT on rank 0\n if (rank == 0) {\n x = local_fft;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> y_recv(x.size());\n MPI_Recv(y_recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < y.size(); j++) {\n y[j] += y_recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, 
size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n MPI_FFT(x.data(), 1, x.size(), MPI_COMPLEX, MPI_FORWARD, MPI_COMM_WORLD, y.data(), &y.size());\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n x = y;\n for (int i = 1; i < size; i++) {\n std::vector> tmp(x.size());\n MPI_Recv(tmp.data(), tmp.size(), MPI_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(y.data(), y.size(), MPI_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv(x.size());\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n y[j] += recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT of x in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of the local data\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the local data to rank 0\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the gathered data to the output vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft(x);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (int i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the global FFT back to each rank\n MPI_Scatter(global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), 
MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = local_fft;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fftConjugate(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[N - i - 1].x;\n x[i].y = -x[N - i - 1].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n // Compute the FFT\n fft(x, N);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < N; i++) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "fft{4,0}", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid].x = x[tid].x;\n x[tid].y = -x[tid].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = cuConjf(x[i]);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[N - i - 1].x;\n x[i].y = -x[N - i - 1].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = make_cuDoubleComplex(creal(x[i]), -cimag(x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use MPI and OpenMP to compute in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data in parallel\n fft(x, start, end);\n\n // Conjugate the imaginary part of the local data\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i].imag() = -x[i].imag();\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[start], n_local, MPI_DOUBLE_COMPLEX, &x[0], n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + ".4.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x.begin() + rank * n_local, x.begin() + rank * n_local + n_local, y.begin());\n\n // Conjugate the local FFT\n for (int i = 0; i < n_local; i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the local FFTs\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the result to rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel using MPI and OpenMP\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag() = -x[i].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform local FFT on each rank\n std::vector> localX = x;\n fft(localX);\n\n // Gather all the local FFT results on rank 0\n std::vector> globalX(x.size());\n MPI_Gather(localX.data(), localX.size(), MPI_DOUBLE_COMPLEX, globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value on rank 0\n if (rank == 0) {\n for (auto &value : globalX) {\n value = std::conj(value);\n }\n }\n\n // Scatter the imaginary conjugate results back to each rank\n 
MPI_Scatter(globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform local FFT on each rank\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather all the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (int i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the imaginary conjugate FFTs back to each rank\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Perform inverse FFT on each rank\n ifft(local_fft, x);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of the global FFT\n if (rank == 0) {\n for (size_t i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the imaginary conjugate of the global FFT to all ranks\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT\n ifft(local_fft, x);\n\n // Gather the local FFTs on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n 
// Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results on rank 0\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the results to the input vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n MPI_Gather(y.data(), n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> localX = x;\n fft(localX);\n\n // Conjugate the local FFT\n for (auto &v : localX) {\n v = std::conj(v);\n }\n\n // Gather the local FFTs on rank 0\n std::vector> globalX(x.size());\n MPI_Gather(localX.data(), localX.size(), MPI_DOUBLE_COMPLEX, globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, store the global FFT in x\n if (rank == 0) {\n x = globalX;\n }\n}\n", + "\n\n // Compute the FFT in parallel using MPI and OpenMP\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(&recv, x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x, x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int 
i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT on each rank\n std::vector> y(x.size());\n fft(x, y);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < y.size(); i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n x.resize(y.size());\n MPI_Gather(y.data(), y.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the conjugate of the global FFT\n if (rank == 0) {\n for (size_t i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the conjugate global FFT\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT\n ifft(local_fft, x);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary parts\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(Kokkos::View*> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x[i] = x[i] / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x[i] = std::conj(x[i]);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / static_cast(x.extent(0));\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) /= x.extent(0);\n });\n 
fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // compute fourier transform\n fft(x);\n\n // compute inverse fourier transform\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ifft\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* forward declare fft. 
computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use OpenMP to compute in parallel.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n fft(x);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugated input\n fft(x);\n\n // compute the conjugate of the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "-", + "\n\n // Compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated input\n fft(x);\n\n // Conjugate the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Divide the output by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n // conjugate the complex numbers\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute forward FFT\n fft(x);\n\n // conjugate the complex numbers again\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide by the size of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n fft(x);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n #pragma openmp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n fft(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n fft(x);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // divide by n to normalize\n const double n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] /= n;\n }\n}\n", + "\n\n // compute the forward FFT\n fft(x);\n\n // compute the inverse FFT\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute the forward FFT again to get the inverse FFT\n fft(x);\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n // compute the conjugate of each element\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugated vector\n fft(x);\n\n // divide each element by the size of the vector\n #pragma omp parallel for\n for (size_t i = 
0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // compute the conjugate of each element\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n const size_t n = x.size();\n std::vector> y(n);\n std::copy(x.begin(), x.end(), y.begin());\n fft(y);\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n y[i] = std::conj(y[i]) / n;\n }\n std::reverse(y.begin(), y.end());\n fft(y);\n std::copy(y.begin(), y.end(), x.begin());\n}\n", + "\n\n fft(x);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute forward fft\n fft(x);\n\n // compute inverse fft\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute forward fft again to get inverse fft\n fft(x);\n}\n", + "\n\n // compute the conjugate of x\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugate of x\n fft(x);\n\n // compute the conjugate of x again\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide each element of x by the size of x\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n fft(x);\n const double n = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // Compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the forward FFT\n fft(x);\n\n // Divide each element by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Compute the conjugate of the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "serial", + "prompt": "/* forward declare fft. 
computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n for (auto &xi : x) {\n xi /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v) / x.size();\n }\n std::reverse(x.begin(), x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi) / x.size();\n }\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v = std::conj(v) / x.size();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "hip", + "prompt": "/* Compute the inverse fourier transform of x in-place.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\n__global__ void ifft(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i >= N) return;\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipDoubleComplex(cos(2 * M_PI * i * j / N),\n -sin(2 * M_PI * i * j / N));\n sum += w * x[j];\n }\n x[i] = sum / N;\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n hipDoubleComplex c = {cos(angle), sin(angle)};\n sum = hipCadd(sum, hipCmul(x[j], c));\n }\n x[i] = hipCdiv(sum, N);\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (int j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0.0, 0.0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex c = {cos(angle), sin(angle)};\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i >= N) return;\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex t = {cos(angle), sin(angle)};\n sum += x[j] * t;\n }\n x[i] = sum / N;\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n hipDoubleComplex w = hipDoubleComplex(cos(2 * M_PI * i * j / N),\n sin(2 * M_PI * i * j / N));\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0.0, 0.0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0.0, -2.0 * M_PI * i * j / N));\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 
* M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex t = {cos(angle), sin(angle)};\n sum = hipCadd(sum, hipCmul(x[j], t));\n }\n x[i] = hipCdiv(sum, N);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex term = hipExp(hipCreateDoubleComplex(0, -2 * M_PI * i * j / N));\n sum = hipAdd(sum, hipMul(x[j], term));\n }\n x[i] = hipDiv(sum, hipCreateDoubleComplex(N, 0));\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0, -2 * M_PI * i * j / N));\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex c = hipDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n size_t j = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (i < N && j < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double theta = 2 * M_PI * (i * k / N);\n sum.x += x[k].x * cos(theta) + x[k].y * sin(theta);\n sum.y += -x[k].x * sin(theta) + x[k].y * cos(theta);\n }\n sum.x /= N;\n sum.y /= N;\n x[i * N + j] = sum;\n }\n}\n", + "\n\n size_t i = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(recv_buffer.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&recv_buf[0], x.size(), 
MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, y.data(), x.size(),\n MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, nullptr, 0,\n MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the FFT\n fft(x);\n\n // compute the inverse FFT\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n\n // normalize the result\n int n = x.size();\n for (auto &xi : x) {\n xi /= n;\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(n);\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, y.data(), n, MPI::DOUBLE, 0);\n x = y;\n } else {\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, y.data(), x.size(),\n MPI::DOUBLE, 0);\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n 
std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= num_ranks;\n }\n\n // gather the results on rank 0\n if (my_rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n\n // normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> result(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, result.data(),\n x.size(), MPI_DOUBLE_COMPLEX, 0);\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&recv_buffer[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&x[0], 
x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute the inverse fft in parallel\n fft(x);\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fourier transform\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fourier transform\n fft(x);\n\n // normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(n);\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, y.data(), n, MPI::DOUBLE, 0);\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "cuda", + "prompt": "/* Compute the inverse fourier transform of x in-place.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\n__global__ void ifft(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n sum += x[j] * cuCexp(make_cuDoubleComplex(0, -2 * M_PI * i * j / N));\n }\n x[i] = sum / make_cuDoubleComplex(N, 0);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = cuCdiv(x[i], N);\n x[i] = cuCmulf(x[i], cuCexp(cuCmulf(cuCmul(cuCimagf(x[i]), -2 * M_PI), i / N)));\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= size) {\n return;\n }\n cuDoubleComplex result = cuCexp(cuCmul(cuDoubleComplex(0, -2 * M_PI * idx / size), x[idx]));\n x[idx] = result;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n cuDoubleComplex term = x[j] * cexp(-2 * M_PI * i * j / N);\n sum = cuCadd(sum, term);\n }\n x[i] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= N) return;\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double re = x[i].x;\n double im = x[i].y;\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n x[i].x += re * cos(angle) - im * sin(angle);\n x[i].y += re * 
sin(angle) + im * cos(angle);\n }\n x[i].x /= N;\n x[i].y /= N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = x[i] / N;\n x[i] = cuCmul(x[i], make_cuDoubleComplex(1.0, 0.0));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * term;\n }\n x[i] = sum / N;\n }\n}\n", + "_t_ = 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= N) return;\n cuDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = {cos(angle), sin(angle)};\n sum += c * x[j];\n }\n x[i] = sum / N;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform FFT on each rank\n fft(x);\n\n // Gather results from all ranks to rank 0\n std::vector> gathered_x;\n if (rank == 0) {\n gathered_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n // Compute the conjugate of x\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated x\n fft(x);\n\n // Normalize the result\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double factor = 1.0 / size;\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= factor;\n }\n\n // If rank 0, gather the results from all ranks\n if (rank == 0) {\n std::vector> recv_buffer(x.size() * size);\n MPI_Allgather(x.data(), x.size(), MPI_COMPLEX, recv_buffer.data(), x.size(), MPI_COMPLEX, MPI_COMM_WORLD);\n x = recv_buffer;\n }\n\n // Compute the inverse FFT of the normalized result\n fft(x);\n\n // Conjugate the result again to get the original values\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // compute the fourier transform\n fft(x);\n\n // compute the inverse fourier transform\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, y.data(), x.size(), MPI::DOUBLE, 0);\n x = y;\n } else {\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide the result among the ranks\n std::vector> recv(x.size() / size);\n MPI_Allgather(&x[rank * x.size() / size], x.size() / size, MPI_COMPLEX, &recv[0], x.size() / size, MPI_COMPLEX, MPI_COMM_WORLD);\n\n // only rank 0 has the complete result\n if (rank == 0) {\n x = recv;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide by the number of ranks to normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buffer[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the inverse fourier transform of x in-place.\n // Use MPI and OpenMP to compute in parallel.\n // Assume MPI has already been 
initialized.\n // Every rank has a complete copy of x.\n // The final result is stored on rank 0.\n\n // Get the number of ranks and the rank of the current process.\n int num_procs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local size of the vector.\n int local_size = x.size() / num_procs;\n\n // Compute the local inverse fourier transform.\n fft(x);\n\n // Reduce the local inverse fourier transforms to the root process.\n std::vector> reduced_x(x.size());\n MPI_Reduce(x.data(), reduced_x.data(), x.size(), MPI_COMPLEX, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // If the current process is the root process, store the reduced inverse fourier transform in the output vector.\n if (rank == 0) {\n x = reduced_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform\n fft(x);\n\n // divide by the number of ranks to normalize\n for (auto &xi : x) {\n xi /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // normalize the result\n std::transform(x.begin(), x.end(), x.begin(), [size](std::complex c) { return c / size; });\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the 
fft in-place\n fft(x);\n\n // compute the inverse fft in-place\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide the result by the number of ranks\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results from all ranks to rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += y[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n // Compute the conjugate of x\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated x\n fft(x);\n\n // Normalize the result\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double factor = 1.0 / size;\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= factor;\n }\n\n // If rank 0, gather the results from all ranks\n if (rank == 0) {\n std::vector> recv_buffer(x.size() * size);\n MPI_Allgather(x.data(), x.size(), MPI_COMPLEX, recv_buffer.data(), x.size(), MPI_COMPLEX, MPI_COMM_WORLD);\n x = recv_buffer;\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv(x.size() * size);\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, recv.data(), x.size(), MPI_DOUBLE_COMPLEX, 0);\n x = recv;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft of x\n fft(x);\n\n // compute the inverse fft of x\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) 
{\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT of the conjugated result\n fft(x);\n\n // Normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv(n);\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n MPI::COMM_WORLD.Recv(&recv[0], n, MPI::DOUBLE_COMPLEX, i, 0);\n for (int j = 0; j < n; ++j) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], n, MPI::DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the inverse Fourier transform in-place\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fourier transform\n fft(x);\n\n // compute the inverse fourier transform\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size() * size);\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = recv_buf;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(Kokkos::View &x, Kokkos::View*> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * std::exp(Kokkos::complex(0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * Kokkos::Math::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(std::cos(angle), std::sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int N = x.extent(0);\n const double PI = 3.14159265358979323846;\n\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n sum += x(j) * std::exp(-2 * PI * std::complex(0, 1) * i * j / N);\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = -2 * Kokkos::Math::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(std::cos(angle), std::sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int N = x.extent(0);\n const double PI = 3.14159265358979323846;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * PI * j * i / N;\n sum += x[j] * Kokkos::complex(cos(angle), 
sin(angle));\n }\n output[i] = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::Range(0, N), [=](const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * j * i / N;\n sum += x(j) * std::exp(std::complex(0.0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n const int n = x.size();\n Kokkos::parallel_for(\"DFT\", Kokkos::RangePolicy<>(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": 
"\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < N; j++) {\n sum += x(j) * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / N);\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n const int N = x.extent(0);\n Kokkos::parallel_for(\"DFT\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const auto n = x.size();\n const auto pi = 3.14159265358979323846;\n Kok\u30b3\u30af::Range range(0, n);\n Kok\u30b3\u30af::ParallelFor(\"dft\", range, KOKKOK_LAMBDA(const int i) {\n Kok\u30b3\u30af::\u8907 sum(0.0, 0.0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2.0 * pi * i * j / n);\n }\n output[i] = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int n = x.extent(0);\n const double pi = 3.14159265358979323846;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x(j) * Kokkos::complex(cos(-2 * pi * i * j / n), sin(-2 * pi * i * j / n));\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * j * i / N;\n sum += x(j) * std::polar(1.0, angle);\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "(121:1:1:1:1:1:1:", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int n = x.extent(0);\n const double two_pi = 2.0 * Kokkos::Pi;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < n; ++j) {\n sum += x(j) * std::exp(-two_pi * Kokkos::complex(0.0, 1.0) * i * j / n);\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. 
Store the result in output.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n output.resize(n);\n #pragma opm parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (size_t j = 0; j < n; ++j) {\n const double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp 
parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (size_t j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + ")", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "serial", + "prompt": "/* Compute the discrete fourier transform of x. 
Store the result in output.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "<", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "<", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::polar(1.0, -2 * M_PI * k * t / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, -angle);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::polar(1.0, -2 * M_PI * k * t / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int i = 0; i < n; ++i) {\n for (int j = 0; j < n; ++j) {\n output[i] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, angle);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n 
output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "hip", + "prompt": "/* Compute the discrete fourier transform of x. Store the result in output.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\n__global__ void dft(const double *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "(x,y)", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum_real = 0;\n double sum_imag = 0;\n\n for (size_t i = 0; i < N; i++) {\n double angle = 2 * M_PI * gid * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n\n output[gid] = make_cuDoubleComplex(sum_real, sum_imag);\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCadd(0, 0);\n for (int k = 0; k < N; k++) 
{\n double angle = -2 * M_PI * i * k / N;\n output[i] = cuCadd(output[i], cuCmulf(x[k], cuCexp(cuCmulf(0, angle))));\n }\n }\n}\n", + "\n\n int id = blockIdx.x * blockDim.x + threadIdx.x;\n if (id < N) {\n double sum_real = 0, sum_imag = 0;\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * id * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n output[id] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double real = 0;\n double imag = 0;\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n real += x[i] * cos(angle);\n imag += x[i] * sin(angle);\n }\n output[tid] = make_cuDoubleComplex(real, imag);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleReal sum_real = 0;\n cuDoubleReal sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplexf(sum_real, sum_imag);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid >= N) {\n return;\n }\n\n double real = 0;\n double imag = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * gid / N;\n real += x[k] * cos(angle);\n imag += x[k] * sin(angle);\n }\n\n output[gid] = make_cuDoubleComplex(real, imag);\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum_real = 0;\n double sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double real = 0;\n double imag = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * gid / 
N;\n real += x[k] * cos(angle);\n imag += x[k] * sin(angle);\n }\n output[gid] = make_cuDoubleComplex(real, imag);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n double angle = 2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[i] * term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n local_output[i] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / x.size());\n }\n }\n\n // Gather the local DFTs\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes a part of the output\n int start = rank * n / size;\n int end = (rank + 1) * n / size;\n\n for (int i = start; i < end; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * i * j / n;\n sum += std::complex(x[j], 0) * std::exp(std::complex(0, angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT for each rank\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n\n // Gather the results from all ranks\n std::vector> all_output(n * size);\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, all_output.data(), 
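[Reference sketch, not part of the recorded results.] Several of the HIP samples above apply += and * directly to cuDoubleComplex, which does not compile without user-defined operators; others use the explicit cuComplex helpers. A hedged one-thread-per-output-bin sketch using only make_cuDoubleComplex/cuCadd/cuCmul is shown below, keeping the prompt's cuDoubleComplex signature (under HIP this presumably relies on hipify-style compatibility headers; plain HIP would use hipDoubleComplex/hipCadd/hipCmul from <hip/hip_complex.h>). The positive sign again follows the prompt's worked example. The same body applies to the CUDA variant of this problem further down.

#include <cuComplex.h>

// One thread per output bin; assumes the launch provides at least N threads.
__global__ void dft(const double *x, cuDoubleComplex *output, size_t N) {
    const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N) return;
    cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);
    for (size_t j = 0; j < N; ++j) {
        const double angle = 2.0 * M_PI * (double)i * (double)j / (double)N;
        const cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));
        // cuDoubleComplex has no built-in arithmetic operators in device code,
        // so accumulate with the cuComplex helper functions.
        sum = cuCadd(sum, cuCmul(make_cuDoubleComplex(x[j], 0.0), w));
    }
    output[i] = sum;
}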
n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the gathered results to output\n if (rank == 0) {\n output = all_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / 
n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local contribution of each process\n for (int i = rank; i < n; i += size) {\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n output[j] += x[i] * std::complex(std::cos(angle), std::sin(angle));\n }\n }\n\n // Reduce the local contributions to the global result\n MPI_Allreduce(MPI_IN_PLACE, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local DFT\n for (int i = rank; i < n; i += size) {\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n output[i] += x[j] * std::complex(cos(angle), sin(angle));\n }\n }\n\n // Reduce the local DFTs to the global DFT\n MPI_Allreduce(MPI_IN_PLACE, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "cuda", + "prompt": "/* Compute the discrete fourier transform of x. Store the result in output.\n Use CUDA to compute in parallel. 
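[Reference sketch, not part of the recorded results.] Most of the MPI samples above either compute the full transform redundantly on every rank or gather buffers whose counts do not match the distribution. A hedged alternative: give each rank a strided subset of output bins, leave the rest zero, and sum-reduce so every rank ends with the full spectrum. The reconstructed template arguments and the choice of MPI_CXX_DOUBLE_COMPLEX (the MPI-3 handle matching std::complex<double>) are assumptions about what the benchmark harness expects.

#include <mpi.h>
#include <cmath>
#include <complex>
#include <vector>

// Strided ownership: rank r computes bins r, r+size, r+2*size, ...
void dft(std::vector<double> const& x, std::vector<std::complex<double>> &output) {
    int rank = 0, size = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int n = static_cast<int>(x.size());
    output.assign(n, std::complex<double>(0.0, 0.0));

    for (int k = rank; k < n; k += size) {
        std::complex<double> sum(0.0, 0.0);
        for (int j = 0; j < n; ++j) {
            const double angle = 2.0 * M_PI * static_cast<double>(j) * k / n;
            sum += x[j] * std::complex<double>(std::cos(angle), std::sin(angle));
        }
        output[k] = sum;
    }

    // Bins owned by other ranks are still zero, so a sum-reduction
    // assembles the complete spectrum on every rank.
    MPI_Allreduce(MPI_IN_PLACE, output.data(), n,
                  MPI_CXX_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
}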
The kernel is launched with at least N threads.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\n__global__ void dft(const double *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[i];\n sum += term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[i];\n sum += term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int k = 0; k < N; ++k) {\n double angle = -2 * M_PI * i * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t kBlockSize = 
256;\n const size_t kGridSize = (N + kBlockSize - 1) / kBlockSize;\n const size_t kSharedMemorySize = 2 * kBlockSize;\n\n dim3 blockDim(kBlockSize, 1, 1);\n dim3 gridDim(kGridSize, 1, 1);\n shared double sdata[2 * kBlockSize];\n\n for (size_t i = 0; i < N; ++i) {\n output[i] = make_cuDoubleComplex(0, 0);\n }\n\n for (size_t i = 0; i < N; ++i) {\n sdata[2 * threadIdx.x + 0] = x[i];\n sdata[2 * threadIdx.x + 1] = 0;\n }\n\n __syncthreads();\n\n for (size_t k = 0; k < N; ++k) {\n double e = -2 * M_PI * k / N;\n double c = cos(e);\n double s = sin(e);\n double t0 = 1;\n double t1 = 0;\n for (size_t i = 0; i < N / 2; ++i) {\n double a = sdata[2 * i + 0];\n double b = sdata[2 * i + 1];\n double ta = t0 * a - t1 * b;\n double tb = t0 * b + t1 * a;\n sdata[2 * i + 0] = ta;\n sdata[2 * i + 1] = tb;\n t0 = c;\n t1 = s;\n c = t0 * c - t1 * s;\n s = t0 * s + t1 * c;\n }\n __syncthreads();\n output[k] = make_cuDoubleComplex(sdata[0], sdata[1]);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = threadIdx.x;\n double sum_real = 0;\n double sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplex(sum_real, sum_imag);\n}\n", + "\n const int tid = threadIdx.x + blockIdx.x * blockDim.x;\n const int stride = blockDim.x * gridDim.x;\n for (size_t i = tid; i < N; i += stride) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n const double angle = 2 * M_PI * j * i / N;\n const cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(x[j], term));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n if (gid < N) {\n double sum_real = 0;\n double sum_imag = 0;\n for (size_t i = 0; i < N; i++) {\n double angle = -2 * M_PI * gid * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n output[gid] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += w * x[j];\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(c, make_cuDoubleComplex(x[j], 0)));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n 
cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(local_n);\n std::vector> local_output(local_n);\n std::copy(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n, local_x.begin());\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n sum += local_x[j] * std::exp(-2 * M_PI * i * j / local_n);\n }\n local_output[i] = sum;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_REAL, output.data(), local_n, MPI_DOUBLE_REAL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT for each frequency\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = rank; i < n; i += nprocs) {\n for (int j = 0; j < n; j++) {\n std::complex sum(0, 0);\n for (int k = 0; k < n; k++) {\n sum += x[k] * std::exp(-2 * M_PI * std::complex(0, 1) * i * k / n);\n }\n local_output[i] = sum;\n }\n }\n\n // Gather the local results into the output vector\n MPI_Allgather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the DFT of each element of x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex 
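[Reference sketch, not part of the recorded results.] The CUDA prompt states only that the kernel "is launched with at least N threads"; the launch itself is the driver's job. A hedged host-side sketch of such a launch is below. The function name launch_dft and the device pointers d_x / d_out are illustrative, not taken from the benchmark driver, and the buffers are assumed to be already allocated and populated.

#include <cuComplex.h>
#include <cuda_runtime.h>

// Kernel defined elsewhere (as in the samples above).
__global__ void dft(const double *x, cuDoubleComplex *output, size_t N);

// Round the grid up so gridDim.x * blockDim.x >= N, satisfying the
// "at least N threads" contract from the prompt.
void launch_dft(const double *d_x, cuDoubleComplex *d_out, size_t N) {
    const int threadsPerBlock = 256;
    const int blocks = static_cast<int>((N + threadsPerBlock - 1) / threadsPerBlock);
    dft<<<blocks, threadsPerBlock>>>(d_x, d_out, N);
    cudaDeviceSynchronize();  // in real code, also check cudaGetLastError()
}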
sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial DFT for this rank\n std::vector> partial_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n partial_output[i] = sum;\n }\n\n // Gather the partial results into the output vector\n MPI_Gather(partial_output.data(), n, MPI_DOUBLE_COMPLEX,\n output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Allgather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the DFT of each element in x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n MPI_Allgather(MPI_IN_PLACE, n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element of x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[0], n, MPI_DOUBLE_COMPLEX, &output[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n std::vector> global_output(n * size);\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is the root process, copy the results to the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX,\n output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store the result in output.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(Kokkos::View*> &x, Kokkos::View*> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n using namespace Kokkos::Impl;\n\n const int n = x.extent(0);\n const int n_half = n / 2;\n const int n_quarter = n / 4;\n const int n_eighth = n / 8;\n\n // Create a plan for the FFT\n auto plan = FFTPlan::create(n);\n\n // Execute the FFT\n plan->execute(x, output);\n\n // Reorder the output to be in the correct order\n parallel_for(n, KOKKOS_LAMBDA(int i) {\n if (i < n_half) {\n output(i) = output(i);\n } else {\n output(i) = output(n - i);\n }\n });\n\n // Compute the magnitude of the output\n parallel_for(n, KOKKOS_LAMBDA(int i) { output(i) = abs(output(i)); });\n\n // Normalize the output\n parallel_for(n, KOKKOS_LAMBDA(int i) { output(i) /= n; });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.extent(0);\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n int m = 1 << log_n;\n Kokkos::View*> w(m);\n for (int i = 0; i < m; i++) {\n w(i) = Kokkos::complex(cos(2 * M_PI * i / m), sin(2 * M_PI * i / m));\n }\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int &i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < m; j++) {\n sum += x((i + j) % m) * w(j);\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n using namespace Kokkos;\n using namespace std;\n const int N = x.extent(0);\n const int M = output.extent(0);\n const double pi = 3.14159265358979323846;\n parallel_for(\"Fourier Transform\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const int i) {\n output(i) = {0.0, 0.0};\n for (int j = 0; j < N; j++) {\n output(i) += x(j) * exp(complex(0, -2 * pi * i * j / N));\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.extent(0);\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n\n Kokkos::View*> temp(\"temp\", n);\n\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 
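[Reference sketch, not part of the recorded results.] Most of the mpi+omp samples above parallelize only with OpenMP and either skip MPI entirely or gather full-length buffers from every rank. A hedged hybrid sketch: ranks own contiguous blocks of output bins, OpenMP threads share the local loop, and an Allgatherv reassembles the spectrum. As before, the template arguments and MPI_CXX_DOUBLE_COMPLEX are assumptions.

#include <mpi.h>
#include <cmath>
#include <complex>
#include <vector>

// Hybrid MPI+OpenMP: block distribution over ranks, threads within a rank.
void dft(std::vector<double> const& x, std::vector<std::complex<double>> &output) {
    int rank = 0, size = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int n = static_cast<int>(x.size());
    output.assign(n, std::complex<double>(0.0, 0.0));

    const int start = rank * n / size;
    const int end   = (rank + 1) * n / size;
    std::vector<std::complex<double>> local(end - start);

    #pragma omp parallel for
    for (int k = start; k < end; ++k) {
        std::complex<double> sum(0.0, 0.0);
        for (int j = 0; j < n; ++j) {
            const double angle = 2.0 * M_PI * static_cast<double>(j) * k / n;
            sum += x[j] * std::complex<double>(std::cos(angle), std::sin(angle));
        }
        local[k - start] = sum;
    }

    // Per-rank block sizes and offsets for the gather.
    std::vector<int> counts(size), displs(size);
    for (int r = 0; r < size; ++r) {
        counts[r] = (r + 1) * n / size - r * n / size;
        displs[r] = r * n / size;
    }
    MPI_Allgatherv(local.data(), counts[rank], MPI_CXX_DOUBLE_COMPLEX,
                   output.data(), counts.data(), displs.data(),
                   MPI_CXX_DOUBLE_COMPLEX, MPI_COMM_WORLD);
}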
0; j < log_n; j++) {\n rev |= ((i >> j) & 1) << (log_n - j - 1);\n }\n temp(rev) = x(i);\n }\n\n for (int s = 1; s <= log_n; s++) {\n int m = 1 << s;\n int m2 = m >> 1;\n Kokkos::complex w(cos(2 * Kokkos::Pi / m), sin(2 * Kokkos::Pi / m));\n for (int k = 0; k < n; k += m) {\n Kokkos::complex wm(1, 0);\n for (int j = 0; j < m2; j++) {\n Kokkos::complex t = temp(k + j);\n Kokkos::complex u = temp(k + j + m2) * wm;\n temp(k + j) = t + u;\n temp(k + j + m2) = t - u;\n wm *= w;\n }\n }\n }\n\n output = temp;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int N = x.extent(0);\n const int logN = std::log2(N);\n const int halfN = N / 2;\n\n // Reverse bits\n Kokkos::View reversed = Kokkos::View(\"reversed\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) {\n int rev = 0;\n int bit = 1;\n for (int j = 0; j < logN; j++) {\n if (i & bit) {\n rev |= 1 << (logN - j - 1);\n }\n bit <<= 1;\n }\n reversed(i) = rev;\n });\n\n // Compute the FFT\n Kokkos::View*> y = Kokkos::View*>(\"y\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { y(i) = x(reversed(i)); });\n\n for (int s = 1; s <= logN; s++) {\n const int m = 1 << s;\n const int m2 = m / 2;\n const double theta = 2 * M_PI / m;\n Kokkos::complex w(std::cos(theta), std::sin(theta));\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) {\n if (i % m < m2) {\n const Kokkos::complex t = w * y(i + m2);\n y(i + m2) = y(i) - t;\n y(i) = y(i) + t;\n }\n });\n }\n\n // Copy the result to the output\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { output(i) = y(i); });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int N = x.extent(0);\n const int logN = std::log2(N);\n\n // Reverse bits\n Kokkos::View reversed(N);\n Kokkos::parallel_for(\"fft_reverse_bits\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int reversed_i = 0;\n int j = i;\n for (int k = 0; k < logN; ++k) {\n reversed_i |= ((j & 1) << (logN - k - 1));\n j >>= 1;\n }\n reversed(i) = reversed_i;\n });\n\n // Compute the FFT\n Kokkos::parallel_for(\"fft_compute\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int j = reversed(i);\n output(i) = x(j);\n });\n\n // Compute the twiddle factors\n Kokkos::View*> twiddle(N);\n Kokkos::parallel_for(\"fft_twiddle\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n double angle = -2 * M_PI * i / N;\n twiddle(i) = Kokkos::complex(std::cos(angle), std::sin(angle));\n });\n\n // 
Compute the FFT\n for (int s = 1; s <= logN; ++s) {\n const int m = 1 << s;\n const int m2 = m / 2;\n for (int k = 0; k < m2; ++k) {\n Kokkos::parallel_for(\"fft_compute_s\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int j = i & (m - 1);\n if (j < m2) {\n const Kokkos::complex t = twiddle(k * N / m) * output(i + m2);\n output(i + m2) = output(i) - t;\n output(i) += t;\n }\n });\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int N = x.extent(0);\n const int logN = Kokkos::ilog2(N);\n\n auto bit_reverse = [&](int i) {\n int r = 0;\n for (int j = 0; j < logN; ++j) {\n r |= ((i >> j) & 1) << (logN - 1 - j);\n }\n return r;\n };\n\n auto twiddle = [&](int k) {\n return Kokkos::complex(cos(2 * M_PI * k / N), sin(2 * M_PI * k / N));\n };\n\n auto fft_rec = [&](View*> x, int n) {\n if (n == 1) {\n return;\n }\n auto x_even = View*>(x, Range(0, n / 2));\n auto x_odd = View*>(x, Range(n / 2, n));\n fft_rec(x_even, n / 2);\n fft_rec(x_odd, n / 2);\n for (int k = 0; k < n / 2; ++k) {\n auto t = x_odd(k) * twiddle(k);\n x_odd(k) = x_even(k) - t;\n x_even(k) += t;\n }\n };\n\n auto x_bit_reversed = View*>(x.extent(0));\n for (int i = 0; i < N; ++i) {\n x_bit_reversed(bit_reverse(i)) = x(i);\n }\n fft_rec(x_bit_reversed, N);\n output = x_bit_reversed;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.extent(0);\n output.resize(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.extent(0);\n if (n == 1) {\n output(0) = x(0);\n return;\n }\n Kokkos::View*> even = Kokkos::subview(output, Kokkos::ALL(), Kokkos::Slice(0, n / 2));\n Kokkos::View*> odd = Kokkos::subview(output, Kokkos::ALL(), Kokkos::Slice(n / 2, n));\n Kokkos::parallel_for(n / 2, KOKKOS_LAMBDA(int i) {\n even(i) = x(2 * i);\n odd(i) = x(2 * i + 1);\n });\n fft(even, even);\n fft(odd, odd);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex w = Kokkos::complex(cos(2 * M_PI * i / n), sin(2 * M_PI * i / n));\n output(i) = even(i % (n / 2)) + w * odd(i % (n / 2));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Compute the size of the input\n const int n = x.extent(0);\n\n // Create a scratchpad for the intermediate results\n Kokkos::View*> scratchpad(\"scratchpad\", n);\n\n // Compute the FFT in parallel\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int 
i) {\n // Compute the FFT of the i-th element of x\n Kokkos::complex result = std::complex(0.0, 0.0);\n for (int j = 0; j < n; ++j) {\n result += x(j) * std::exp(-2.0 * M_PI * std::complex(0.0, 1.0) * i * j / n);\n }\n\n // Store the result in the scratchpad\n scratchpad(i) = result;\n });\n\n // Copy the results from the scratchpad to the output\n Kokkos::deep_copy(output, scratchpad);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n output(i) = Kokkos::complex(real, imag);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n output[i] = Kokkos::complex(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n output[i] += x[j] * Kokkos::exp(Kokkos::complex(0.0, -2.0 * Kokkos::Pi * i * j / n));\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.extent(0);\n output(0) = x(0);\n for (int k = 1; k < n; ++k) {\n Kokkos::complex w = std::polar(1.0, -2 * Kokkos::Math::pi * k / n);\n Kokkos::complex t = x(0);\n for (int j = 1; j < n; ++j) {\n t += w * x(j);\n w *= w;\n output(j) = t;\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.extent(0);\n output.resize(n);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * Kokkos::Impl::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * std::exp(Kokkos::complex(0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get 
the size of the input\n int n = x.extent(0);\n\n // Create a Kokkos view for the output\n Kokkos::View*> output_view(\"output\", n);\n\n // Compute the FFT in parallel\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) {\n output_view(i) = fft_helper(x, i, n);\n });\n\n // Copy the output view to the output array\n Kokkos::deep_copy(output, output_view);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x(j) * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.extent(0);\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy<>(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int n = x.extent(0);\n output.resize(n);\n\n Kokkos::parallel_for(\"fft\", n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x. 
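[Reference sketch, not part of the recorded results.] None of the Kokkos samples for 09_fft_fft_out_of_place both built and validated, and several appear to reference APIs that do not exist (a Kokkos::FFT namespace, Kokkos::Pi, resizing a View argument). A hedged baseline is the direct O(n^2) transform written as one parallel_for over output bins with Kokkos::complex; it is not an O(n log n) FFT, but it is a compilable, race-free starting point. The negative-exponent sign matches this problem's worked example, and the reconstructed View element type Kokkos::complex<double> is an assumption.

#include <cmath>
#include <Kokkos_Core.hpp>

// Direct O(n^2) transform as a correctness baseline (not a true FFT).
void fft(Kokkos::View<Kokkos::complex<double>*> &x,
         Kokkos::View<Kokkos::complex<double>*> &output) {
    const int n = static_cast<int>(x.extent(0));
    Kokkos::parallel_for("dft_baseline", n, KOKKOS_LAMBDA(const int k) {
        Kokkos::complex<double> sum(0.0, 0.0);
        for (int j = 0; j < n; ++j) {
            const double angle = -2.0 * M_PI * static_cast<double>(j) * k / n;
            // Plain cos/sin resolve to device intrinsics on GPU backends.
            sum += x(j) * Kokkos::complex<double>(cos(angle), sin(angle));
        }
        output(k) = sum;
    });
    Kokkos::fence();
}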
Store the result in output.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n output.resize(n);\n #pragma openmp parallel for\n for (int i = 0; i < n; ++i) {\n double real = 0, imag = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n real += x[j].first * cos(angle) + x[j].second * sin(angle);\n imag += x[j].first * sin(angle) - x[j].second * cos(angle);\n }\n output[i] = {real, imag};\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex arg(2 * M_PI * i * j / n, 0);\n sum += x[j] * std::exp(-arg);\n }\n output[i] = sum;\n }\n}\n", + "<", + "\n\n const int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::polar(1.0, angle);\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n const double pi = std::acos(-1);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = {0, 0};\n for (size_t t = 0; t < n; ++t) {\n double angle = 2 * pi * t * k / n;\n sum += x[t] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex omega = std::polar(1.0, -2 * M_PI * i * j / n);\n sum += x[j] * omega;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n std::complex arg(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * arg;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of x\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 
0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex omega = std::polar(1.0, -2 * M_PI * i * j / n);\n sum += x[j] * omega;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x. 
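[Reference sketch, not part of the recorded results.] The OpenMP samples for 09_fft_fft_out_of_place all fall back to the O(n^2) direct transform, while the serial samples that follow recurse. For comparison, here is a hedged out-of-place radix-2 Cooley-Tukey sketch (power-of-two length assumed, negative-exponent convention matching this problem's worked example, template arguments reconstructed as std::complex<double>). It deliberately avoids the fft(even, even) aliasing pattern seen in the serial samples by transforming into separate buffers.

#include <cmath>
#include <complex>
#include <vector>

// Recursive radix-2 Cooley-Tukey, out of place; assumes x.size() is a power of two.
void fft(std::vector<std::complex<double>> const& x,
         std::vector<std::complex<double>> &output) {
    const std::size_t n = x.size();
    output.resize(n);
    if (n == 1) {
        output[0] = x[0];
        return;
    }
    // Split into even- and odd-indexed subsequences and transform each half.
    std::vector<std::complex<double>> even(n / 2), odd(n / 2);
    for (std::size_t i = 0; i < n / 2; ++i) {
        even[i] = x[2 * i];
        odd[i]  = x[2 * i + 1];
    }
    std::vector<std::complex<double>> evenOut, oddOut;
    fft(even, evenOut);
    fft(odd, oddOut);
    // Combine with twiddle factors e^{-2*pi*i*k/n}.
    for (std::size_t k = 0; k < n / 2; ++k) {
        const std::complex<double> w =
            std::polar(1.0, -2.0 * M_PI * static_cast<double>(k) / static_cast<double>(n));
        output[k]         = evenOut[k] + w * oddOut[k];
        output[k + n / 2] = evenOut[k] - w * oddOut[k];
    }
}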
Store the result in output.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double pi = std::acos(-1);\n std::complex w(1), wn(std::cos(2 * pi / n), std::sin(2 * pi / n));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(cos(angle), sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n 
int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2);\n std::vector> odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n 
double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int i = 0; i < n; ++i) {\n for (int j = 0; j < n; ++j) {\n output[i] += x[j] * std::exp(-2 * std::complex(0, 1) * M_PI * i * j / n);\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x. Store the result in output.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fft(const hipDoubleComplex *x, hipDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid >= N) {\n return;\n }\n double arg = -2 * M_PI * gid / N;\n hipDoubleComplex w = hipDoubleComplex(cos(arg), sin(arg));\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n sum += x[i] * pow(w, i * gid);\n }\n output[gid] = sum;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(omega), sin(omega)};\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(theta), sin(theta));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex wp = hipDoubleComplex(1, 0);\n for (size_t k = j; k < j + m; k++) {\n hipDoubleComplex t = wp * output[k + m];\n output[k + m] = output[k] - t;\n output[k] += t;\n wp *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = make_hipDoubleComplex(cos(theta), sin(theta));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex t = make_hipDoubleComplex(1.0, 0.0);\n for 
(size_t k = 0; k < m; k++) {\n hipDoubleComplex u = output[j + k];\n hipDoubleComplex v = output[j + k + m] * t;\n output[j + k] = u + v;\n output[j + k + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / s), sin(M_PI / s));\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex(1.0, 0.0);\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / m), sin(M_PI / m));\n hipDoubleComplex wm = hipDoubleComplex(1.0, 0.0);\n for (size_t j = 0; j < m; j++) {\n for (size_t k = j; k < N; k += s) {\n hipDoubleComplex t = wm * output[k + m];\n output[k + m] = output[k] - t;\n output[k] += t;\n }\n wm *= w;\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n hipDoubleComplex wm = hipDoubleComplex(1, 0);\n for (size_t k = 0; k < m; k++) {\n for (size_t j = 0; j < N / s; j++) {\n size_t index = j * s + k;\n hipDoubleComplex t = wm * output[index + m];\n output[index + m] = output[index] - t;\n output[index] += t;\n }\n wm *= w;\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t half_s = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / half_s), sin(M_PI / half_s));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t k = 0; k < half_s; k++) {\n hipDoubleComplex u = output[j + k];\n hipDoubleComplex v = output[j + k + half_s] * t;\n output[j + k] = u + v;\n output[j + k + half_s] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t tid = hipGetGlobalThreadIdx();\n const size_t N_2 = N / 2;\n const size_t N_4 = N / 4;\n const double arg = -2 * M_PI / N;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n hipDoubleComplex w = {cos(arg * k), sin(arg * k)};\n hipDoubleComplex term = hipCMul(x[k], w);\n sum = hipCAdd(sum, term);\n }\n\n const size_t index = tid;\n if (tid >= N_2) {\n index -= N_2;\n index = N_4 - index - 1;\n index += N_2;\n }\n\n output[index] = sum;\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n double omega = M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(omega), sin(omega)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n 
hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + s] * t;\n output[k + j] = u + v;\n output[k + j + s] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(theta), sin(theta)};\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 1; s <= N; s *= 2) {\n for (size_t k = 0; k < N; k += 2 * s) {\n for (size_t j = k; j < k + s; j++) {\n hipDoubleComplex twiddle = hipDoubleComplex{cos(M_PI * j / s),\n -sin(M_PI * j / s)};\n hipDoubleComplex temp = output[j + s] * twiddle;\n output[j + s] = output[j] - temp;\n output[j] += temp;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / m), sin(M_PI / m));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the 
fourier transform of x. Store the result in output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFT results\n std::vector> global_output(n);\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, global_output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the result in output\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_output(n_local);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), n_local, MPI_DOUBLE_COMPLEX, output.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(n / size);\n fft_local(x, local_fft);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(n);\n MPI_Gather(local_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, global_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter the global FFT to the local FFTs\n MPI_Scatter(global_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, local_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Convert the local FFTs to the output format\n output.resize(n);\n for (int i = 0; i < n; i++) {\n output[i] = local_fft[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = (rank + 1) * local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> 
local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i / x); });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the output vector\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFT results\n std::vector> global_output(n);\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, global_output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the global FFT result to the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(),\n [](std::complex const& x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) {\n return std::exp(std::complex(0, -2 * M_PI * x.real()));\n });\n\n // Gather the local FFTs into 
the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft_local(x, rank, size);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(x.size());\n MPI_Allgather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the global FFT in the output vector\n output = global_fft;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) {\n return std::exp(std::complex(0, -2 * M_PI * x.real()));\n });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform of x.\n std::vector> local_fft = fft(x, rank, size);\n\n // Gather the local Fourier transforms into output.\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX,\n output.data(), output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int n_padded = n_local + 2;\n std::vector> padded_x(n_padded, 0.0);\n std::copy(x.begin() + rank * n_local, x.begin() + (rank + 1) * n_local, padded_x.begin() + 1);\n\n std::vector> padded_output(n_padded, 0.0);\n fft(padded_x, padded_output);\n\n std::vector> output_local(n_local, 0.0);\n std::copy(padded_output.begin() + 1, padded_output.begin() + n_local + 1, output_local.begin());\n\n MPI_Allgather(output_local.data(), n_local, MPI_DOUBLE_PRECISION, output.data(), n_local, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft(local_x, local_output);\n\n // Gather the results\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, 
MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If we are not rank 0, we can return now\n if (rank != 0) {\n return;\n }\n\n // If we are rank 0, we need to combine the results\n for (int i = 1; i < size; i++) {\n int start = i * local_n;\n int end = start + local_n;\n for (int j = start; j < end; j++) {\n output[j] += output[j - start];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local FFT\n std::vector> local_fft(n / size);\n MPI_Scatter(x.data(), n / size, MPI_DOUBLE_COMPLEX, local_fft.data(), n / size, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n fft_local(local_fft);\n\n // Gather the local FFTs\n MPI_Gather(local_fft.data(), n / size, MPI_DOUBLE_COMPLEX, output.data(), n / size, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(output);\n }\n\n // Broadcast the global FFT\n MPI_Bcast(output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft_local(x, rank, size);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(x.size());\n MPI_Allgather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the global FFT in the output vector\n output = global_fft;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x. Store the result in output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fft(const cuDoubleComplex *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmul(2 * M_PI * i / N, cuCmulf(I, x[i])), -1));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(-2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmulf(2 * M_PI * i / N, cuCmulf(0, 1)), x[i]));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuDoubleComplex(0, -2 * M_PI * i / N), x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuDoubleComplex{0, -2 * M_PI * i / N}, x[i]));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmulf(2 * M_PI * i / N, cuDoubleComplex(0, 1)), x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(-2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex y = cuCexp(cuCmul(cuCbrt(-1.0), 2.0 * M_PI * i / N));\n output[i] = cuCmulf(x[i], y);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCmul(x[i], cuCexp(cuCmulf(-2 * M_PI * i / N, cuDoubleComplex{0, 1})));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + ":", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "mpi+omp", + "prompt": 
"#include \n#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local FFT results\n std::vector> global_output(x.size());\n MPI_Allgather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n global_output.data(), global_output.size(), MPI_DOUBLE_COMPLEX,\n MPI_COMM_WORLD);\n\n // Store the result in output\n output = global_output;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_C_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the global DFT into the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output.\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT of the gathered local DFTs.\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local results\n std::vector> global_output(x.size());\n MPI_Allgather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, global_output.data(), global_output.size(), 
MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the result in output\n output = global_output;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(n * size);\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is the root process, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n // Get the size of the input vector.\n int n = x.size();\n\n // Create a MPI communicator for the FFT.\n MPI_Comm fft_comm;\n MPI_Comm_split(MPI_COMM_WORLD, 0, MPI_THREAD_SINGLE, &fft_comm);\n\n // Create an OpenMP parallel region for the FFT.\n#pragma openmp parallel\n {\n // Get the rank of the current thread.\n int tid = openmp_get_thread_num();\n\n // Compute the local FFT of the input vector.\n std::vector> local_fft(n);\n fft(x, local_fft);\n\n // Gather the local FFTs into the output vector.\n#pragma openmp critical\n {\n for (int i = 0; i < n; ++i) {\n output[i] += local_fft[i];\n }\n }\n }\n\n // Free the MPI communicator.\n MPI_Comm_free(&fft_comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_output(n_local);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < n_local; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n_local; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), n_local, MPI_DOUBLE_COMPLEX, output.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = (rank + 1) * local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n std::complex w(cos(2 * M_PI * i * j / local_n), sin(2 * M_PI * i * j / local_n));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local FFT results\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // 
Compute the local DFT of x\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output.\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output\n MPI_Allgather(&local_output[0], n, MPI_DOUBLE_COMPLEX, &output[0], n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n double angle = 2 * M_PI * i * j / local_n;\n std::complex w(cos(angle), sin(angle));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX,\n output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft;\n fft_local(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX,\n global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, 0,\n MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter the global FFT\n MPI_ Scatter(global_fft.data(), global_fft.size() / size, MPI_DOUBLE_COMPLEX,\n local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0,\n MPI_COMM_WORLD);\n\n // Convert the local FFT to the output format\n output.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n output[i] = {local_fft[i].real(), local_fft[i].imag()};\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n local_output[i] = std::complex(0, 0);\n for (int j = 0; j < local_n; j++) {\n double angle = -2 * M_PI * i * j / local_n;\n local_output[i] += local_x[j] * std::exp(std::complex(0, angle));\n }\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), local_n, 
MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(n * size);\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is rank 0, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, store the global DFT in output\n if (rank == 0) {\n output = global_output;\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, Kokkos::View &x, Kokkos::View &y, Kokkos::View &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0129569236, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0338533354, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0170148504, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0084906056, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042834804, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021352972, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001872345, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0126109072, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0339721144, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0169949066, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085704276, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042668472, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002116712, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022807968, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"axpy\", Kokkos::RangePolicy<>(0, z.extent(0)), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0125039358, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.033890164, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0171559454, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085276492, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004328435, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021690242, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001797969, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 
0.0124669508, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.033901439, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0170149304, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085506482, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042631706, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021193852, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002424124, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0126901866, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0338447128, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0170203828, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085971438, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042593514, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002182955, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017942942, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0122641318, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0339063406, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0169768828, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085761522, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0043734014, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021448032, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002326404, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"axpy\", z.size(), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0126115706, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0338419558, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0169657862, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008508393, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042916738, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021303538, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019249218, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) 
{\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0125624302, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0339372928, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0169744166, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0084914432, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0043013362, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021529604, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020337946, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int &i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0125324056, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0346282858, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0172793412, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0087159572, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004401883, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021934646, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017209862, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0125660454, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.033831474, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0169688098, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085593586, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0043133706, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002126443, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020477964, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z[i] = alpha * x[i] + y[i];\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0134694232, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0280641846, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.014077652, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0071427442, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0036289604, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.0017921684, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016646486, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int &i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0125959692, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0344807666, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0173148358, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0086789576, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0044148496, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021658204, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022458488, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"axpy\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0127237794, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0338812312, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0168875068, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085960978, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00431945, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021821672, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001725713, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0124858362, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0339220218, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0169091918, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085529248, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042785894, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021151632, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021119152, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0125557416, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0338556154, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.0170315358, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0086265068, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042988996, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021422542, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015754146, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0126558918, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0338047396, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0170715008, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085051406, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042946696, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021167684, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022559938, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0124727294, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0339094948, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0169191324, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085745014, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042708726, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002142232, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016981518, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01263456, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.034042789, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0170638404, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008604744, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042870512, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021338724, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021059378, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": 
true, + "are_all_valid": true, + "best_sequential_runtime": 0.0124071084, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0338006362, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0170103916, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0086012194, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042957136, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021180628, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001664905, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0125713968, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.033995003, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0169480168, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085185342, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0043051632, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021077116, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022738394, + "num_threads": 32 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. 
Store the result in z.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma simd\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "serial", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. 
Store the result in z.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (std::size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (std::size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "hip", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\n__global__ void axpy(double alpha, const double *x, const double *y, double *z, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "(x,y,z,t)", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n z[i] = alpha * x[i] + y[i];\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of z\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n z.resize(x.size());\n for (size_t i = 0; i < x.size(); i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Barrier(MPI_COMM_WORLD);\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n\n // Compute the local portion of z\n std::vector local_z(n);\n for (int i = 0; i < n; ++i) {\n local_z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the local portions of z into the global vector z on 
rank 0\n if (rank == 0) {\n z.resize(n);\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its part of the result\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of the vectors\n int local_size = x.size() / size;\n\n // Allocate memory for the local portion of the vectors\n std::vector local_x(local_size);\n std::vector local_y(local_size);\n std::vector local_z(local_size);\n\n // Scatter the vectors to the ranks\n MPI_Scatter(&x[0], local_size, MPI_DOUBLE, &local_x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(&y[0], local_size, MPI_DOUBLE, &local_y[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local portion of the result\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * local_x[i] + local_y[i];\n }\n\n // Gather the results from the ranks\n MPI_Gather(&local_z[0], local_size, MPI_DOUBLE, &z[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n int n = x.size();\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n \n if (rank != 0) {\n MPI_Send(&z[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector tmp(n);\n MPI_Recv(&tmp[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n z[j] += tmp[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of z\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n 
MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "cuda", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\n__global__ void axpy(double alpha, const double *x, const double *y, double *z, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "{,", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, n_threads;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n #pragma omp parallel\n {\n n_threads = omp_get_num_threads();\n }\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "(", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector local_z(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n local_z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n z.resize(n);\n\n#pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the vector\n int local_size = x.size() / size;\n\n // Compute the local start index\n int local_start = rank * local_size;\n\n // Compute the local end index\n int local_end = (rank + 1) * local_size;\n\n // Compute the local part of the result\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_z[i] = alpha * x[local_start + i] + y[local_start + i];\n }\n\n // Gather the local results into the global result\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank == 0) {\n z.resize(x.size());\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the local portion of x and y\n int local_size = x.size() / nprocs;\n\n // Compute the starting index of the local portion of x and y\n int start_index = rank * local_size;\n\n // Compute the local portion of z\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_z[i] = alpha * x[start_index + i] + y[start_index + i];\n }\n\n // Gather the local portions of z into the global vector z on rank 0\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, 
size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of the vector\n int local_size = x.size() / size;\n\n // Compute the starting index of the local portion of the vector\n int start_index = rank * local_size;\n\n // Compute the local portion of the vector\n std::vector local_x(x.begin() + start_index, x.begin() + start_index + local_size);\n std::vector local_y(y.begin() + start_index, y.begin() + start_index + local_size);\n\n // Compute the local portion of the result\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * local_x[i] + local_y[i];\n }\n\n // Gather the local portions of the result into the global result\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of x and y\n int local_size = x.size() / size;\n\n // Compute the starting index of the local portion of x and y\n int start_index = rank * local_size;\n\n // Compute the local portion of z\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * x[start_index + i] + y[start_index + i];\n }\n\n // Gather the local portions of z into the global vector z on rank 0\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of each block\n int block_size = x.size() / size;\n\n // Compute the starting index of each block\n int 
start_index = rank * block_size;\n\n // Compute the ending index of each block\n int end_index = start_index + block_size;\n\n // If the rank is the last rank, then the ending index is the size of the vector\n if (rank == size - 1) {\n end_index = x.size();\n }\n\n // Compute the local result\n std::vector local_z(block_size);\n for (int i = start_index; i < end_index; i++) {\n local_z[i - start_index] = alpha * x[i] + y[i];\n }\n\n // Reduce the local results to the global result\n MPI_Allreduce(local_z.data(), z.data(), block_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n z.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of each chunk of data that each rank will process\n int chunk_size = x.size() / size;\n\n // Compute the starting index of the data that each rank will process\n int start_index = rank * chunk_size;\n\n // Compute the ending index of the data that each rank will process\n int end_index = (rank + 1) * chunk_size;\n\n // If the rank is the last rank, then the ending index is the size of the data\n if (rank == size - 1) {\n end_index = x.size();\n }\n\n // Compute the local result for this rank\n std::vector local_result(chunk_size);\n #pragma omp parallel for\n for (int i = start_index; i < end_index; i++) {\n local_result[i - start_index] = alpha * x[i] + y[i];\n }\n\n // Gather the local results from all ranks into the global result on rank 0\n MPI_Gather(local_result.data(), chunk_size, MPI_DOUBLE, z.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix. x and b have N elements.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(Kokkos::View &A, Kokkos::View &b, Kokkos::View &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::Range(0, N), [=](const int i) {\n double sum = 0;\n for (int j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n KokConfigService config;\n config.set_num_threads(KokConfigService::num_hardware_threads());\n KokConfigService::set_configuration(config);\n KokConfigService::print_configuration();\n\n KokConfigService::set_configuration(config);\n KokConfigService::print_configuration();\n\n KokKok::parallel_for(N, KOK_LAMBDA(size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A[i][j] * x[j]; }\n x[i] = (b[i] - sum) / A[i][i];\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a Kokkos view for the matrix A.\n Kokkos::View A_view = Kokkos::View(\"A\", N, N);\n Kokkos::deep_copy(A_view, A);\n\n // Create a Kokkos view for the vector b.\n Kokkos::View b_view = Kokkos::View(\"b\", N);\n Kokkos::deep_copy(b_view, b);\n\n // Create a Kokkos view for the vector x.\n Kokkos::View x_view = Kokkos::View(\"x\", N);\n\n // Solve the linear system using the Kokkos parallel backend.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A_view(i, j) * x_view(j);\n }\n x_view(i) = (b_view(i) - sum) / A_view(i, i);\n });\n\n // Copy the solution vector x back to the output view.\n Kokkos::deep_copy(x, x_view);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Create a Kokkos team policy for parallel execution.\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define a Kokkos kernel functor to perform the computation.\n Kokkos::KernelFunctor kernelFunctor([&](const int i) {\n // Initialize the solution vector x.\n x(i) = 0.0;\n\n // Perform forward substitution to solve the linear system.\n for (int j = 0; j < i; ++j) {\n x(i) -= A(i, j) * x(j);\n 
}\n\n // Perform backward substitution to solve the linear system.\n for (int j = i + 1; j < N; ++j) {\n x(i) -= A(i, j) * x(j);\n }\n\n // Divide by the diagonal element to get the final solution.\n x(i) /= A(i, i);\n });\n\n // Execute the kernel functor using the Kokkos team policy.\n Kokkos::parallel_for(policy, kernelFunctor);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a Kokkos team policy for parallel execution.\n Kokkos::TeamPolicy policy(N);\n\n // Create a Kokkos parallel loop to solve the linear system.\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Solve the ith equation of the linear system.\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int& i) {\n // Initialize the solution vector x\n x[i] = 0.0;\n\n // Compute the dot product of the ith row of A and x\n double dot_product = 0.0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A(i, j) * x[j];\n }\n\n // Update the solution vector x\n x[i] = (b[i] - dot_product) / A(i, i);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View A_copy = Kokkos::View(\"A_copy\", N, N);\n Kokkos::deep_copy(A_copy, A);\n Kokkos::View b_copy = Kokkos::View(\"b_copy\", N);\n Kokkos::deep_copy(b_copy, b);\n Kokkos::View x_copy = Kokkos::View(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n\n for (size_t i = 0; i < N; i++) {\n double max_val = std::abs(A_copy(i, i));\n size_t max_idx = i;\n for (size_t j = i + 1; 
j < N; j++) {\n if (std::abs(A_copy(j, i)) > max_val) {\n max_val = std::abs(A_copy(j, i));\n max_idx = j;\n }\n }\n if (max_val == 0) {\n std::cout << \"The matrix is singular and cannot be inverted.\" << std::endl;\n return;\n }\n for (size_t j = 0; j < N; j++) {\n std::swap(A_copy(i, j), A_copy(max_idx, j));\n }\n std::swap(b_copy(i), b_copy(max_idx));\n for (size_t j = 0; j < N; j++) {\n if (i != j) {\n double factor = A_copy(j, i) / A_copy(i, i);\n for (size_t k = 0; k < N; k++) {\n A_copy(j, k) -= factor * A_copy(i, k);\n }\n b_copy(j) -= factor * b_copy(i);\n }\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A_copy(i, j) * x_copy(j);\n }\n x_copy(i) = (b_copy(i) - sum) / A_copy(i, i);\n }\n Kokkos::deep_copy(x, x_copy);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int& i) {\n // Initialize the solution vector x\n x[i] = 0.0;\n\n // Iterate over the rows of the matrix A\n for (size_t j = 0; j < N; ++j) {\n // Compute the dot product of the ith row of A and the solution vector x\n double dot_product = 0.0;\n for (size_t k = 0; k < N; ++k) {\n dot_product += A(i, k) * x[k];\n }\n\n // Update the ith element of the solution vector x\n x[i] = (b[i] - dot_product) / A(i, i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to be executed in parallel\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Initialize the solution vector x\n x(i) = 0.0;\n\n // Compute the dot product of the ith row of A and x\n for (int j = 0; j < N; ++j) {\n x(i) += A(i, j) * x(j);\n }\n\n // Subtract the dot product from b(i) to get the solution\n x(i) = b(i) - x(i);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a copy of A for in-place LU decomposition\n auto A_copy = Kokkos::subview(A, Kokkos::ALL(), Kokkos::ALL(), Kokkos::NO_OFFSET, Kokkos::NO_OFFSET);\n\n // Perform LU decomposition\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n for (size_t j = 0; j < N; ++j) {\n A_copy(i, j) /= A_copy(i, i);\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = 0; j < N; ++j) {\n if (k > i) {\n A_copy(k, j) -= A_copy(k, i) * A_copy(i, j);\n } else if (k < i) {\n A_copy(k, j) -= A_copy(k, i) * A_copy(i, j);\n }\n }\n }\n });\n\n // Solve the system\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n x(i) = b(i) / A_copy(i, i);\n for 
(size_t j = 0; j < N; ++j) {\n if (j > i) {\n x(j) -= x(i) * A_copy(j, i);\n }\n }\n });\n\n // Backward substitution\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n for (size_t j = 0; j < N; ++j) {\n if (j < i) {\n x(j) -= x(i) * A_copy(i, j);\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Initialize the solution vector x\n x(i) = 0.0;\n\n // Compute the dot product of the ith row of A and x\n double dot_product = 0.0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A(i, j) * x(j);\n }\n\n // Update the solution vector x\n x(i) = (b(i) - dot_product) / A(i, i);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "_t", + 
"source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use OpenMP to compute in parallel.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n #pragma omp parallel for\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Forward elimination\n for (size_t j = i + 1; j < N; j++) {\n double factor = A_copy[j][i] / A_copy[i][i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j][k] -= factor * A_copy[i][k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Back substitution\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A_copy[i][j] * x[j];\n }\n x[i] /= A_copy[i][i];\n }\n}\n", + "\n\n // Solve the system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n x[N - 1] = b[N - 1] / A[(N - 1) * N + (N - 1)];\n for (int i = N - 2; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum 
+= A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Solve the system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double inv_aii = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] *= inv_aii;\n }\n b[i] *= inv_aii;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n x[N - 1] = b[N - 1];\n for (int i = N - 2; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = b[i] - sum;\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A_local[i * N + i];\n for (size_t j = i; j < N; ++j) {\n A_local[i * N + j] *= factor;\n }\n b_local[i] *= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A_local[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A_local[j * N + k] -= multiplier * A_local[i * N + k];\n }\n b_local[j] -= multiplier * b_local[i];\n }\n }\n\n // Perform back substitution in parallel\n #pragma omp parallel for\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_local[i * N + j] * x[j];\n }\n x[i] = (b_local[i] - sum) / A_local[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = i; j < N; j++) {\n A_copy[i * N + j] *= factor;\n }\n b_copy[i] *= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A_copy[j * N + i];\n for (size_t k = i; k < N; k++) {\n A_copy[j * N + k] -= multiplier * A_copy[i * N + k];\n }\n b_copy[j] -= multiplier * b_copy[i];\n }\n }\n\n // Perform back substitution to solve for x\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (int j = i + 1; j < N; j++) {\n x[i] -= A_copy[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an NxN matrix in row-major. 
x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a copy of A and b for each thread.\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Solve the linear system for each thread.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Forward elimination.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_local[j][i] / A_local[i][i];\n for (size_t k = 0; k < N; ++k) {\n A_local[j][k] -= factor * A_local[i][k];\n }\n b_local[j] -= factor * b_local[i];\n }\n\n // Back substitution.\n x[i] = b_local[i] / A_local[i][i];\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A_local[i * N + j] * x[j];\n }\n x[i] = (b_local[i] - sum) / A_local[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A_copy[i][i];\n for (size_t j = i; j < N; ++j) {\n A_copy[i][j] /= factor;\n }\n b_copy[i] /= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A_copy[j][i];\n for (size_t k = i; k < N; ++k) {\n A_copy[j][k] -= multiplier * A_copy[i][k];\n }\n b_copy[j] -= multiplier * b_copy[i];\n }\n }\n\n // Perform back substitution in parallel\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_copy[i][j] * x[j];\n }\n x[i] = (b_copy[i] - sum) / A_copy[i][i];\n }\n}\n", + "\n\n // Create a copy of A and b to avoid modifying the input\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A_copy[i * N + j] *= factor;\n }\n b_copy[i] *= factor;\n\n // Subtract the ith row from all other rows\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_copy[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A_copy[j * N + k] -= factor * A_copy[i * N + k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Perform back substitution\n x.resize(N);\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_copy[i * N + j] * x[j];\n }\n x[i] = (b_copy[i] - sum) / A_copy[i * N + i];\n 
}\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an N*N matrix in row-major. x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a vector to store the solution.\n x.resize(N);\n\n // Create a vector to store the residual.\n std::vector r(N);\n\n // Initialize the residual.\n for (size_t i = 0; i < N; i++) {\n r[i] = b[i];\n }\n\n // Solve the linear system using the conjugate gradient method.\n for (size_t k = 0; k < N; k++) {\n // Compute the dot product of the residual and the residual.\n double r_dot_r = 0.0;\n #pragma omp parallel for reduction(+:r_dot_r)\n for (size_t i = 0; i < N; i++) {\n r_dot_r += r[i] * r[i];\n }\n\n // Check if the residual is small enough.\n if (r_dot_r < 1e-10) {\n break;\n }\n\n // Compute the dot product of the residual and the matrix-vector product.\n double r_dot_Ax = 0.0;\n #pragma omp parallel for reduction(+:r_dot_Ax)\n for (size_t i = 0; i < N; i++) {\n double tmp = 0.0;\n for (size_t j = 0; j < N; j++) {\n tmp += A[i * N + j] * x[j];\n }\n r_dot_Ax += r[i] * tmp;\n }\n\n // Compute the step size.\n double alpha = r_dot_r / r_dot_Ax;\n\n // Update the solution.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] += alpha * r[i];\n }\n\n // Update the residual.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n double tmp = 0.0;\n for (size_t j = 0; j < N; j++) {\n tmp += A[i * N + j] * x[j];\n }\n r[i] -= alpha * tmp;\n }\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Forward elimination\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_copy[j][i] / A_copy[i][i];\n for (size_t k = 0; k < N; ++k) {\n A_copy[j][k] -= factor * A_copy[i][k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Back substitution\n x.resize(N);\n for (int i = N - 1; i >= 0; --i) {\n x[i] = b_copy[i];\n for (int j = i + 1; j < N; ++j) {\n x[i] -= A_copy[i][j] * x[j];\n }\n x[i] /= A_copy[i][i];\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n #pragma omp parallel for\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel\n {\n // Each thread works on a different row of A\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A_local[i][i];\n for (size_t j = i; j < N; ++j) {\n A_local[i][j] /= factor;\n }\n b_local[i] /= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_local[j][i];\n for (size_t k = i; k < N; ++k) {\n A_local[j][k] -= factor * A_local[i][k];\n }\n 
b_local[j] -= factor * b_local[i];\n }\n }\n }\n\n // Back substitution to solve for x\n for (int i = N - 1; i >= 0; --i) {\n x[i] = b_local[i];\n for (int j = i + 1; j < N; ++j) {\n x[i] -= A_local[i][j] * x[j];\n }\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an NxN matrix in row-major. x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a vector to store the solution.\n x.resize(N);\n\n // Create a temporary vector to store the intermediate results.\n std::vector tmp(N);\n\n // Solve the linear system in parallel.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Compute the ith row of the solution.\n tmp[i] = b[i];\n for (size_t j = 0; j < i; ++j) {\n tmp[i] -= A[i * N + j] * x[j];\n }\n tmp[i] /= A[i * N + i];\n }\n\n // Copy the solution to the output vector.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = tmp[i];\n }\n}\n", + "\n\n // Create OpenMP parallel region\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Initialize x to 0.\n x.assign(N, 0);\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Forward substitution\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < i; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Backward substitution\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] -= sum;\n }\n}\n", + "\n\n // Create a copy of A and b for each thread\n std::vector> A_local(omp_get_num_threads(), std::vector(N * N));\n std::vector> b_local(omp_get_num_threads(), std::vector(N));\n\n // Copy A and b to the local arrays\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n A_local[tid][i * N + j] = A[i * N + j];\n }\n b_local[tid][i] = b[i];\n }\n }\n\n // Solve the linear system for each thread\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n std::vector x_local(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < i; j++) {\n sum += A_local[tid][i * N + j] * x_local[j];\n }\n x_local[i] = (b_local[tid][i] - sum) / A_local[tid][i * N + i];\n }\n\n // Copy the local solution to the global solution\n #pragma omp critical\n {\n for (size_t i = 0; i < N; i++) {\n x[i] += x_local[i];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "serial", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. 
x and b have N elements.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n x.resize(N);\n std::vector y(N);\n for (size_t i = 0; i < N; i++) {\n y[i] = b[i];\n }\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] -= sum;\n x[i] = y[i] / A[i * N + i];\n }\n for (size_t i = N - 1; i > 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] -= sum;\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(Ab[j * (N + 1) + i]) > std::abs(Ab[pivot_row * (N + 1) + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N + 1; ++j) {\n std::swap(Ab[i * (N + 1) + j], Ab[pivot_row * (N + 1) + j]);\n }\n }\n\n // Eliminate the current column below the pivot row\n for (size_t j = i + 1; j < N; ++j) {\n double factor = Ab[j * (N + 1) + i] / Ab[i * (N + 1) + i];\n for (size_t k = 0; k < N + 1; ++k) {\n Ab[j * (N + 1) + k] -= factor * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Check if the system is consistent\n for (size_t i = 0; i < N; ++i) {\n if (std::abs(Ab[i * (N + 1) + N]) < 1e-9) {\n 
throw std::runtime_error(\"The system is inconsistent.\");\n }\n }\n\n // Solve the system by back substitution\n x.resize(N);\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = Ab[i * (N + 1) + N];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= Ab[i * (N + 1) + j] * x[j];\n }\n x[i] /= Ab[i * (N + 1) + i];\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n std::vector a = A;\n std::vector bb = b;\n for (size_t i = 0; i < N; i++) {\n size_t pivot = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(a[j * N + i]) > std::abs(a[pivot * N + i])) {\n pivot = j;\n }\n }\n if (pivot != i) {\n std::swap(a[i * N + i], a[pivot * N + i]);\n std::swap(bb[i], bb[pivot]);\n }\n for (size_t j = i + 1; j < N; j++) {\n double factor = a[j * N + i] / a[i * N + i];\n for (size_t k = i; k < N; k++) {\n a[j * N + k] -= factor * a[i * N + k];\n }\n bb[j] -= factor * bb[i];\n }\n }\n for (size_t i = N - 1; i < N; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += a[i * N + j] * x[j];\n }\n x[i] = (bb[i] - sum) / a[i * N + i];\n }\n}\n", + "\n\n std::vector c(N);\n for (size_t i = 0; i < N; i++) {\n c[i] = b[i];\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double t = A[j][i] / A[i][i];\n for (size_t k = 0; k < N; k++) {\n A[j][k] -= t * A[i][k];\n }\n c[j] -= t * c[i];\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n double t = c[i];\n for (size_t j = i + 1; j < N; j++) {\n t -= A[i][j] * x[j];\n }\n x[i] = t / A[i][i];\n }\n}\n", + "\n\n // Create a copy of A\n std::vector A_copy(A);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A_copy[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A_copy[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j * N + k] -= factor * A_copy[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Perform back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A_copy[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A_copy[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find pivot\n size_t pivot = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot * N + i])) {\n pivot = j;\n }\n }\n // Swap rows\n if (pivot != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot * N + j]);\n }\n std::swap(b[i], b[pivot]);\n }\n // Eliminate\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] 
* x[j];\n }\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector> aug(N, std::vector(N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n aug[i][j] = A[i * N + j];\n }\n aug[i][N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double div = aug[i][i];\n for (size_t j = 0; j <= N; ++j) {\n aug[i][j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows\n for (size_t j = i + 1; j < N; ++j) {\n double mult = aug[j][i];\n for (size_t k = 0; k <= N; ++k) {\n aug[j][k] -= mult * aug[i][k];\n }\n }\n }\n\n // Extract the solution from the augmented matrix\n for (size_t i = 0; i < N; ++i) {\n x[i] = aug[i][N];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gauss-Jordan elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows to zero out the ith column\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; i++) {\n x[i] = Ab[i * (N + 1) + N];\n }\n}\n", + "\n\n std::vector A_copy = A;\n std::vector b_copy = b;\n for (size_t i = 0; i < N; i++) {\n double factor = A_copy[i][i];\n for (size_t j = 0; j < N; j++) {\n A_copy[i][j] /= factor;\n }\n b_copy[i] /= factor;\n for (size_t j = i + 1; j < N; j++) {\n double factor2 = A_copy[j][i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j][k] -= factor2 * A_copy[i][k];\n }\n b_copy[j] -= factor2 * b_copy[i];\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A_copy[i][j] * x[j];\n }\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = b[i] - sum;\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; i++) {\n x[i] = 
Ab[i * (N + 1) + N];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap pivot row with current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector augmentedMatrix(N * (N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n augmentedMatrix[i * (N + 1) + j] = A[i * N + j];\n }\n augmentedMatrix[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = augmentedMatrix[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; ++j) {\n augmentedMatrix[i * (N + 1) + j] /= factor;\n }\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = 0; j < N; ++j) {\n if (i == j) continue;\n double factor = augmentedMatrix[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; ++k) {\n augmentedMatrix[j * (N + 1) + k] -= factor * augmentedMatrix[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; ++i) {\n x[i] = augmentedMatrix[i * (N + 1) + N];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i 
* N + i];\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from all other rows\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution from the augmented matrix\n for (size_t i = 0; i < N; i++) {\n x[i] = Ab[i * (N + 1) + N];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "hip", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\n__global__ void solveLinearSystem(const double *A, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] -= sum * A[row * N + col] / A[col * N + col];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x + hipGridDim_x * hipBlockIdx_y;\n if (tid < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[bid * N + i] * x[i];\n }\n x[tid] = (b[bid] - sum) / A[bid * N + tid];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n x[i] += A[i * N + j] * b[j];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n x[row] = b[row] / A[row * N + col];\n } else {\n x[row] -= A[row * N + col] * x[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n int col = hipBlockIdx_x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n 
sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n }\n __syncthreads();\n for (int k = row + 1; k < N; k++) {\n if (A[k * N + col] != 0) {\n double factor = A[k * N + col] / A[row * N + col];\n for (int j = 0; j < N; j++) {\n A[k * N + j] -= factor * A[row * N + j];\n }\n b[k] -= factor * b[row];\n }\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n }\n __syncthreads();\n for (int i = row + 1; i < N; i++) {\n double factor = A[i * N + col] / A[row * N + col];\n for (int k = 0; k < N; k++) {\n A[i * N + k] -= factor * A[row * N + k];\n }\n b[i] -= factor * b[row];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; ++k) {\n sum += A[row * N + k] * x[k];\n }\n A[row * N + col] -= A[col * N + col] * sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[col] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n x[i] += A[i * N + j] * b[j];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row 
= blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. Store the result in x on rank 0.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the linear system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solutions from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n // Create a matrix object\n Eigen::MatrixXd A_eigen(N, N);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n A_eigen(i, j) = A[i * N + j];\n }\n }\n\n // Create a vector object for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; i++) {\n b_eigen(i) = b[i];\n }\n\n // Solve the system\n Eigen::VectorXd x_eigen = A_eigen.colPivHouseholderQr().solve(b_eigen);\n\n // Copy the solution to x\n for (size_t i = 0; i < N; i++) {\n x[i] = x_eigen(i);\n }\n }\n\n // Broadcast x to all ranks\n MPI_Bcast(x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "_t*", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector local_x(N, 0);\n std::vector local_b(N, 0);\n for (size_t i = 0; i < N; i++) {\n local_b[i] = b[i];\n }\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_x[i] += A[i * N + j] * local_b[j];\n }\n }\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct solution\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves for a portion of x\n size_t chunk_size = (N + size - 
1) / size;\n size_t start_idx = rank * chunk_size;\n size_t end_idx = std::min(start_idx + chunk_size, N);\n\n std::vector local_x(chunk_size);\n for (size_t i = start_idx; i < end_idx; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n local_x[i - start_idx] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solutions from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Rank 0 gathers the solution\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n x.resize(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n }\n\n // Gather the solution from rank 0 to all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Solve the linear system on rank 0\n if (rank == 0) {\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i > 0; --i) {\n for (size_t j = i - 1; j >= 0; --j) {\n b[j] -= A[j * N + i] * b[i];\n }\n }\n\n // Store the solution in x\n x = b;\n }\n\n // Broadcast the solution to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system locally on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the solution\n std::vector local_x(N);\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on rank 0\n if (rank == 0) {\n // Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (int j = i + 1; j < N; j++) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n }\n\n // Broadcast the solution to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Each rank solves a part of the system\n for (size_t i = rank; i < N; i += size) {\n double factor = A[i * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Broadcast the updated row\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, rank, comm);\n MPI_Bcast(&b[i], 1, MPI_DOUBLE, rank, comm);\n\n // Update the remaining rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Gather the solution from all ranks\n MPI_Gather(&b[rank], 1, MPI_DOUBLE, &x[0], 1, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system locally\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions to rank 0\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system on rank 0\n if (rank == 0) {\n // Create a matrix object\n Eigen::MatrixXd A_eigen(N, N);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n 
A_eigen(i, j) = A[i * N + j];\n }\n }\n\n // Create a vector for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; i++) {\n b_eigen(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_eigen = A_eigen.colPivHouseholderQr().solve(b_eigen);\n\n // Copy the result to x\n for (size_t i = 0; i < N; i++) {\n x[i] = x_eigen(i);\n }\n }\n\n // Broadcast the result to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Check if the input is valid\n if (A.size() != N * N || b.size() != N || x.size() != N) {\n throw std::invalid_argument(\"Invalid input\");\n }\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (int j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "cuda", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\n__global__ void solveLinearSystem(const double *A, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + ":", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * 
N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n for (int k = 0; k < N; k++) {\n x[row] -= A[row * N + k] * b[k];\n }\n x[row] /= A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int i = 0; i < col; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n } else if (row > col) {\n double sum = 0;\n for (int i = 0; i < col; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n for (int i = col + 1; i < row; i++) {\n A[row * N + i] -= A[row * N + col] * A[i * N + col] / A[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n } else {\n x[row] -= A[row * N + col] * x[col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n x[row] = b[row] / A[row * N + col];\n }\n for (int i = col + 1; i < N; i++) {\n x[row] -= A[row * N + i] * x[i] / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Each rank solves for a portion of x\n #pragma omp parallel for\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data() + rank * N / size, N / size, MPI_DOUBLE, x.data(), N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Compute the inverse of A on rank 0.\n std::vector invA;\n if (my_rank == 0) {\n invA.resize(N * N);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n invA[i * N + j] = (i == j) ? 
1.0 : 0.0;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n double inv = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n invA[i * N + j] *= inv;\n A[i * N + j] *= inv;\n }\n b[i] *= inv;\n }\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n invA[j * N + k] -= factor * invA[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n for (size_t i = N - 1; i > 0; --i) {\n for (size_t j = i - 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n invA[j * N + k] -= factor * invA[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Broadcast the inverse of A to all ranks.\n MPI_Bcast(invA.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on all ranks.\n x.resize(N);\n for (size_t i = 0; i < N; ++i) {\n x[i] = 0.0;\n for (size_t j = 0; j < N; ++j) {\n x[i] += invA[i * N + j] * b[j];\n }\n }\n\n // Gather the results to rank 0.\n if (my_rank == 0) {\n for (size_t i = 1; i < num_ranks; ++i) {\n std::vector recv_x(N);\n MPI_Recv(recv_x.data(), N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N; ++j) {\n x[j] += recv_x[j];\n }\n }\n } else {\n MPI_Send(x.data(), N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Rank 0 gathers the solution from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the solution\n std::vector local_x(N);\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_x[i] += A[i * N + j] * b[j];\n }\n }\n\n // Gather the local solutions into the global solution\n MPI_Allgather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete solution\n if (rank == 0) {\n for (size_t i = 1; i < size; i++) {\n for (size_t j = 0; j < N; j++) {\n x[j] += local_x[i * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves the system for a subset of the rows\n size_t rowsPerRank = (N + size - 1) / size;\n size_t startRow = rank * 
rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n // Solve the system for the subset of rows\n std::vector localX(endRow - startRow);\n for (size_t i = startRow; i < endRow; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n localX[i - startRow] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(localX.data(), endRow - startRow, MPI_DOUBLE, x.data(), endRow - startRow, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Each rank computes its part of the solution\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n \n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system on each rank\n std::vector local_x(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row.\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row.\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n\n // Eliminate the current column.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n std::vector local_x(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Check if the input is valid\n if (A.size() != N * N || b.size() != N || x.size() != N) {\n throw std::invalid_argument(\"Invalid input\");\n }\n\n // Initialize x to 0\n std::fill(x.begin(), x.end(), 0.0);\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for 
(size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(x.data(), N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the local portion of the solution\n std::vector local_x(N, 0.0);\n for (size_t i = my_rank; i < N; i += num_procs) {\n for (size_t j = 0; j < N; ++j) {\n local_x[i] += A[i * N + j] * b[j];\n }\n }\n\n // Reduce the local solutions to get the global solution\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves for a subset of the rows of x.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Solve for the subset of rows of x on this rank.\n for (size_t i = start_row; i < end_row; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks.\n MPI_Gather(x.data() + start_row, end_row - start_row, MPI_DOUBLE,\n x.data(), end_row - start_row, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the matrix\n size_t local_N = (N + size - 1) / size;\n\n // Allocate memory for the local matrix and vector\n std::vector local_A(local_N * N);\n std::vector local_b(local_N);\n\n // Scatter the matrix and vector to the local arrays\n MPI_Scatter(A.data(), N * N, MPI_DOUBLE, local_A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(b.data(), N, MPI_DOUBLE, local_b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the local linear system\n std::vector 
local_x(local_N);\n for (size_t i = 0; i < local_N; i++) {\n double sum = 0.0;\n for (size_t j = 0; j < N; j++) {\n sum += local_A[i * N + j] * local_x[j];\n }\n local_x[i] = (local_b[i] - sum) / local_A[i * N + i];\n }\n\n // Gather the local solutions to the global vector\n MPI_Gather(local_x.data(), local_N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Normalize the global solution\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n x[i] /= size;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n local_x[i] = b[i];\n for (size_t j = 0; j < i; ++j) {\n local_x[i] -= A[i * N + j] * x[j];\n }\n }\n for (size_t i = N - 1; i >= 0; --i) {\n for (size_t j = i + 1; j < N; ++j) {\n local_x[i] -= A[i * N + j] * local_x[j];\n }\n local_x[i] /= A[i * N + i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the linear system in parallel\n #pragma omp parallel\n {\n // Each thread solves a part of the system\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row.\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row.\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n\n // Eliminate the current column.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use Kokkos to compute in parallel. 
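
Note on the 01_dense_la_solve / mpi+omp entry above: the recorded outputs are model generations kept verbatim as data; several of them read x before it has been computed, or gather buffers whose sizes do not match. For reference only, the following is a minimal sketch of one approach that satisfies the stated contract (every rank holds full copies of A and b, and only rank 0 needs x). The include lines, the double element type for the stripped std::vector template arguments, and the choice to do all arithmetic on rank 0 with OpenMP inside each elimination step are assumptions of this sketch, not details taken from the source.

#include <mpi.h>
#include <omp.h>
#include <vector>
#include <cstddef>

// Sketch only: rank 0 runs Gaussian elimination (no pivoting) on private copies
// of A and b, sharing each elimination step across OpenMP threads; the other
// ranks return immediately, since the prompt only requires x on rank 0.
void solveLinearSystem(std::vector<double> const& A, std::vector<double> const& b,
                       std::vector<double> &x, size_t N) {
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    x.assign(N, 0.0);
    if (rank != 0) return;

    std::vector<double> U(A), y(b);          // work on copies; A and b are const
    for (size_t p = 0; p < N; ++p) {
        #pragma omp parallel for
        for (size_t i = p + 1; i < N; ++i) { // eliminate column p below the pivot
            double f = U[i * N + p] / U[p * N + p];
            for (size_t j = p; j < N; ++j) U[i * N + j] -= f * U[p * N + j];
            y[i] -= f * y[p];
        }
    }
    for (size_t i = N; i-- > 0; ) {          // back substitution
        double s = y[i];
        for (size_t j = i + 1; j < N; ++j) s -= U[i * N + j] * x[j];
        x[i] = s / U[i * N + i];
    }
}

On the example from the prompt (A=[[1,4,2],[1,2,3],[2,1,3]], b=[11,11,13]) this yields x=[3,1,2].
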
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(Kokkos::View &A, Kokkos::View &B, Kokkos::View &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.079840285, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.186691699, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0939653256, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0544934856, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0277353358, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.026233786, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0071710044, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; ++j) {\n double sum = 0.0;\n for (int k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0816845888, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1861361592, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0934453204, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0572062998, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0286588596, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.014244173, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0074576554, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0804994338, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1866587966, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0933991684, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0556180186, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0281944596, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0138227808, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0126816848, + "num_threads": 32 + 
} + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0799629678, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1863631126, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0933324448, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0558419796, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0279494152, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0142193502, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0073441014, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.080023776, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1861074262, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1113559046, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.055868481, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0279527036, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0261300802, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.011294109, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.080438363, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1864293974, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1105726666, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0562923414, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0328072686, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0139706912, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.015453255, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": 
true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0799262106, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1867833526, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.09383644, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0537577626, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0272952976, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0208665244, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0137526292, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.080225593, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1862822628, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1113569564, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0470888016, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.027836539, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0171083838, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0071437932, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0800036044, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.18664168, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0936279664, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.056042839, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.028054659, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0141969022, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0073345358, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.080011137, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1864698762, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.111907862, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0556689182, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0279898452, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0140989662, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0119803466, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0802786656, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1867442604, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1109029556, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0549109842, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0272603404, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0138047956, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0143557746, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k) * B(k, j);\n }\n C(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0799417126, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1864464624, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1116924416, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0558788352, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0279997498, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0140467482, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0072874772, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.080208598, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2068525054, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0935894706, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0544767482, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0275711326, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0185668044, + "num_threads": 16 + }, + { 
+ "did_run": true, + "is_valid": true, + "runtime": 0.007117107, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0804101588, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.18736346, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0936059272, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0550811176, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0274564186, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0139586724, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0119424574, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0804462624, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1861877904, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1113034862, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0560394006, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0280076544, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01403611, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0080227714, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0800212374, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1861809936, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0932905766, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0564431814, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0282730072, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0140194648, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0072259338, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n 
C(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0799003906, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1862222552, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0937213664, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05504907, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0277549972, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0150371666, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0072079316, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0799802952, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1863117044, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1112726274, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0562558878, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0280864782, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.014015529, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007242877, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0799244598, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1860259832, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1108712432, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0568265604, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0283485526, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0189664688, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007302313, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.079990159, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 
0.186484479, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0933507152, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0562975306, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.028135236, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0140498354, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0073278852, + "num_threads": 32 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma opm parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < 
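
Note on the 02_dense_la_gemm / kokkos entry above: the angle-bracket contents of its prompt (the #include line and the Kokkos::View template arguments) appear to have been stripped during extraction. A plausible reconstruction of the signature, assuming rank-2 double views in the default host execution space, is sketched below; the exact view types used by the original benchmark are not recoverable from this excerpt.

#include <Kokkos_Core.hpp>
#include <cstddef>

// Assumed signature; the body mirrors the recorded outputs: one row of C per
// parallel iteration, with the j and k loops run sequentially by each thread.
// For a device backend the views would normally be passed by value instead.
void gemm(Kokkos::View<double**> &A, Kokkos::View<double**> &B,
          Kokkos::View<double**> &C, size_t M, size_t K, size_t N) {
    Kokkos::parallel_for("gemm", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const size_t i) {
        for (size_t j = 0; j < N; ++j) {
            double sum = 0.0;
            for (size_t k = 0; k < K; ++k) sum += A(i, k) * B(k, j);
            C(i, j) = sum;
        }
    });
}
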
K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (int i = 0; i < M; ++i) {\n for (int j = 0; j < N; ++j) {\n C[i * N + j] = 0;\n for (int k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n C.resize(M * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n C.resize(M * N);\n omp_set_num_threads(omp_get_max_threads());\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "serial", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. 
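
Note on the 02_dense_la_gemm / omp entry above: the first recorded output spells the directive "#pragma opm parallel for"; compilers silently ignore unknown pragmas, so that variant builds but runs the loop serially. A minimal corrected sketch is shown below, with std::vector<double> assumed for the stripped template arguments.

#include <vector>
#include <cstddef>

// Corrected pragma; collapse(2) lets OpenMP distribute the full M*N iteration
// space across threads rather than only the M outer iterations.
void gemm(std::vector<double> const& A, std::vector<double> const& B,
          std::vector<double> &C, size_t M, size_t K, size_t N) {
    #pragma omp parallel for collapse(2)
    for (size_t i = 0; i < M; ++i) {
        for (size_t j = 0; j < N; ++j) {
            double sum = 0.0;
            for (size_t k = 0; k < K; ++k) sum += A[i * K + k] * B[k * N + j];
            C[i * N + j] = sum;
        }
    }
}
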
The matrices are stored in row-major.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n C[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A[i * K + k] * B[k * N + j]; }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 
0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "hip", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an MxN grid of threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\n__global__ void gemm(const double *A, const double *B, double *C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) { sum += A[row * K + k] * B[k * N + col]; }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n size_t row = blockIdx.x;\n size_t col = blockIdx.y;\n C[row * N + col] = 0;\n for (size_t k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = 
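
Note on the 02_dense_la_gemm / serial entry above: the recorded outputs all implement the same triple loop, so no alternative sketch is needed. As a quick check of the worked example in its prompt, the small self-contained program below (element type double assumed) multiplies A=[[1,-1,2],[0,-2,1]] by B=[[4,1],[-1,0],[2,2]] and prints C, which comes out as [[9,5],[4,2]].

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    const size_t M = 2, K = 3, N = 2;
    std::vector<double> A = {1, -1, 2, 0, -2, 1};  // 2x3, row-major
    std::vector<double> B = {4, 1, -1, 0, 2, 2};   // 3x2, row-major
    std::vector<double> C(M * N, 0.0);
    for (size_t i = 0; i < M; ++i)
        for (size_t j = 0; j < N; ++j)
            for (size_t k = 0; k < K; ++k)
                C[i * N + j] += A[i * K + k] * B[k * N + j];
    std::printf("[[%g, %g], [%g, %g]]\n", C[0], C[1], C[2], C[3]);  // [[9, 5], [4, 2]]
    return 0;
}
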
blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n C[row * N + col] = 0;\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0.0;\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n size_t col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and B. 
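
Note on the 02_dense_la_gemm / hip entry above: the recorded outputs fall into two families, ones that index C purely by blockIdx and rely on the grid being exactly MxN single-thread blocks, and ones that build a global index from blockIdx, blockDim and threadIdx with a bounds check. The second pattern is the more defensive of the two; a minimal sketch of it follows, with the launch geometry left to the driver and therefore assumed here.

#include <hip/hip_runtime.h>

// Sketch: one thread per (row, col) element of C, guarded so that any launch
// geometry covering at least MxN threads produces the same result.
__global__ void gemm(const double *A, const double *B, double *C,
                     size_t M, size_t K, size_t N) {
    size_t row = blockIdx.y * blockDim.y + threadIdx.y;
    size_t col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < M && col < N) {
        double sum = 0.0;
        for (size_t k = 0; k < K; ++k) sum += A[row * K + k] * B[k * N + col];
        C[row * N + col] = sum;
    }
}
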
Store the result in C on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the local portion of C.\n std::vector local_C(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to the global C on rank 0.\n MPI_Reduce(local_C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of processes and the rank of the current process\n int num_procs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the number of rows each process will compute\n int rows_per_proc = M / num_procs;\n\n // Compute the starting row for each process\n int start_row = rank * rows_per_proc;\n\n // Compute the ending row for each process\n int end_row = start_row + rows_per_proc;\n\n // Initialize the result matrix on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n\n // Compute the partial result on each process\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n double dot_product = 0;\n for (int k = 0; k < K; ++k) {\n dot_product += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = dot_product;\n }\n }\n\n // Gather the partial results on rank 0\n if (rank != 0) {\n MPI_Gather(&C[start_row * N], rows_per_proc * N, MPI_DOUBLE, &C[0], rows_per_proc * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast M, K, and N to all ranks.\n MPI_Bcast(&M, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n MPI_Bcast(&K, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n MPI_Bcast(&N, 1, MPI_UNSIGNED_LONG, 0, 
MPI_COMM_WORLD);\n\n // Compute the local portion of C.\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Reduce the local portions of C to rank 0.\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast A and B to all ranks\n MPI_Bcast(&A[0], M * K, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&B[0], K * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute C on each rank\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Gather C from all ranks to rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local part of C\n std::vector local_C(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n MPI_Gather(local_C.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_C.data(), M * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and the number of processes\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the local size of the matrix C\n size_t local_M = M / num_procs;\n if (rank == num_procs - 1) {\n local_M += M % num_procs;\n }\n\n // Compute the local part of the matrix C\n std::vector local_C(local_M * N);\n for (size_t i = 0; i < local_M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N + j] = sum;\n }\n }\n\n // Gather the local parts of the matrix C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 0; i < num_procs; ++i) {\n size_t local_M = M / num_procs;\n if (i == num_procs - 1) {\n local_M += M % num_procs;\n }\n MPI_Recv(&C[i * local_M * N], local_M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_C[0], local_M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the local portion of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to the root rank.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N, 0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * 
B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(M * N);\n MPI_Recv(recv_buf.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n C[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(C.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C\n std::vector localC(M * N, 0);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Each rank has a complete copy of A and B.\n // Store the result in C on rank 0.\n if (rank == 0) {\n C.clear();\n C.reserve(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C.push_back(sum);\n }\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result matrix\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local results into the global result matrix\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the matrix C\n size_t local_M = M / size;\n size_t local_N = N;\n\n // Compute the local part of the matrix C\n std::vector local_C(local_M * local_N);\n for (size_t i = 0; i < local_M; ++i) {\n for (size_t j = 0; j < local_N; ++j) 
{\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * local_N + j] = sum;\n }\n }\n\n // Gather the local parts of the matrix C into the global matrix C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(local_M * local_N);\n MPI_Recv(recv_buf.data(), local_M * local_N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < local_M * local_N; ++j) {\n C[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(local_C.data(), local_M * local_N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and the number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N / size);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N / size + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "cuda", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an MxN grid of threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\n__global__ void gemm(const double *A, const double *B, double *C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t row = blockIdx.x;\n size_t col = blockIdx.y;\n size_t i = threadIdx.x;\n size_t j = threadIdx.y;\n C[row * N + col] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * 
N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n double sum = 0;\n for (k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "(", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.y * blockDim.y + threadIdx.y;\n int j = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M && j < N) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and B. 
Store the result in C on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C to rank 0\n MPI_Gather(localC.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to zero\n C.assign(M * N, 0.0);\n\n // Compute the product of A and B in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(C.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the product of A and B\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n // Initialize C to zero\n C.assign(M * N, 0.0);\n\n // Compute C using MPI and OpenMP\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce C to rank 0\n MPI_Reduce(C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the local part of C\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the partial sum for each element of C\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the partial sums from all ranks to rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.resize(M * N);\n std::fill(C.begin(), C.end(), 0);\n\n // Compute the local part of C\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the 
local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; i++) {\n std::vector recvC(M * N / size);\n MPI_Recv(recvC.data(), M * N / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N / size; j++) {\n C[j] += recvC[j];\n }\n }\n } else {\n MPI_Send(localC.data(), M * N / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; ++i) {\n std::vector recvC(M * N / size);\n MPI_Recv(recvC.data(), M * N / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n std::copy(recvC.begin(), recvC.end(), C.begin() + i * M * N / size);\n }\n } else {\n MPI_Send(localC.data(), M * N / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n // Initialize C to 0\n C.assign(M * N, 0.0);\n\n // Perform the matrix multiplication in parallel\n #pragma opm parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result matrix\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local results into the global result matrix\n MPI_Gather(&local_C[0], M * N / size, MPI_DOUBLE, &C[0], M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot_product = 0.0;\n for (size_t k = 0; k < K; ++k) {\n dot_product += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = dot_product;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local portion of C\n std::vector local_C(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to rank 0\n MPI_Reduce(local_C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n 
double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix, x has N elements, and y has M elements.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(Kokkos::View &A, Kokkos::View &x, Kokkos::View &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1732092232, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3471880506, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.175371324, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0873190602, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0436340868, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0232400284, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0122906558, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1751338142, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.348657358, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1755634114, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0875826052, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0442893056, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.023023862, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0143018832, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) { y[i] += A(i, j) * x[j]; }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1742618996, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.4503799422, + "num_threads": 1 + 
}, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2264814502, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.11388019, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0570759192, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0293589878, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0158814396, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M),\n KOKKOS_LAMBDA(const size_t i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.176617811, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.5119341072, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2569302876, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1296025664, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0648937692, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.033404119, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0176059204, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1723435084, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3486609008, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1746742604, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0874493832, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.044008013, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.023213661, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0124340788, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1740990076, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.5116168268, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2574175274, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.128996834, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0647996238, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0322870122, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.017736277, + "num_threads": 32 + } + ] + }, + { + 
"generated_output": "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.177705593, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.347595304, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1746569438, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0880306734, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0442037246, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0231755212, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0123437592, + "num_threads": 32 + } + ] + }, + { + "generated_output": "mv", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1744589056, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3475181688, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1743717214, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0878151606, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0440533422, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0230352296, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0112914858, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1766940762, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3486009378, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1748605024, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.088065879, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0440863668, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0232079122, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.012351131, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * 
x(j); }\n y(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1745771104, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3482546368, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1743094828, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.087954776, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0441411316, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0223655472, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0139331842, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n double dot = 0.0;\n for (size_t j = 0; j < N; ++j) { dot += A(i, j) * x(j); }\n y(i) = dot;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1750257792, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3476423004, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1747963198, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08779202, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0440089954, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0230915334, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.012269163, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1729584802, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3476945264, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1746466746, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0874473806, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0441630328, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0223104374, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0123005596, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1779115158, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3480063062, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1740880608, + "num_threads": 2 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.0877617652, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0440417748, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0231069224, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0122678428, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA (const int i) {\n y(i) = 0;\n for (int j = 0; j < N; ++j) {\n y(i) += A(i, j) * x(j);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.173751302, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.5106888732, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2574658776, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1292728846, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0648648392, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.033146987, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.017051172, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1730134434, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.511258213, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.256912261, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1290191898, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0653001634, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0331333992, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0172590446, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.172087353, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.34798155, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.174932011, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0878862212, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0442106944, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.023207221, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0114106458, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 
0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1720627788, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3505357962, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.176966941, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0884849704, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0444012724, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.022638771, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0124877878, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.173466527, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.5121918998, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2559798206, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1293224234, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0647112166, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0333269888, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0169223876, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1734710492, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3480487062, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1744660048, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0879454282, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0438722888, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0230419866, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0122349492, + "num_threads": 32 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. 
Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma openmp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n // Initialize y to 0\n y.assign(M, 0);\n\n // Compute the matrix-vector product in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double dot_product = 0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A[i * N + j] * x[j];\n }\n y[i] = dot_product;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0.0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; 
++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "serial", + "prompt": "/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) { dot += A[i * N + j] * x[j]; }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) {\n dot += A[i * N + j] * x[j];\n }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) {\n dot += A[i * N + j] * x[j];\n }\n y[i] = dot;\n 
}\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "hip", + "prompt": "/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use AMD HIP to compute in parallel. The kernel is launched with at least M threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\n__global__ void gemv(const double *A, const double *x, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int block_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = bid * block_size + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int i = tid / N;\n int j = tid % N;\n if (i < M && j < N) {\n y[i] += A[i * N + j] * x[j];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = bid * blockDim.x + tid;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; ++col) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) {\n y[tid] += A[tid * N + j] * x[j];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; ++col) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + 
threadIdx.x;\n if (tid < M) {\n y[tid] = 0.0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n int i = tid + bid * blockSize;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) { sum += A[i * N + j] * x[j]; }\n y[i] = sum;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n\n double sum = 0;\n for (int i = tid; i < M; i += blockSize) {\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n }\n y[tid] = sum;\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n\n y[tid] = 0;\n for (int j = 0; j < N; j++) {\n y[tid] += A[tid + bid * blockSize + j * M] * x[j];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and x. Store the result in y on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector multiplication\n std::vector local_y(M / size);\n for (size_t i = rank * (M / size); i < (rank + 1) * (M / size); ++i) {\n local_y[i - rank * (M / size)] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i - rank * (M / size)] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the local results into the global vector y on rank 0\n MPI_Gather(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial dot product on each rank\n std::vector partial_y(M, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n partial_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the partial dot products to get the final result\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank != 0) {\n y.clear();\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n 
MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product.\n std::vector local_y(M);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the local results into the global vector y on rank 0.\n MPI_Gather(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M / size);\n for (size_t i = 0; i < M / size; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Each rank computes a partial sum of the dot product\n std::vector partial_sums(M, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n partial_sums[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sums(M, 0.0);\n MPI_Reduce(partial_sums.data(), global_sums.data(), M, MPI_DOUBLE, MPI_SUM, 0, comm);\n\n // Store the result in y on rank 0\n if (rank == 0) {\n y = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector 
local_y(M);\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n y = local_y;\n for (int i = 1; i < size; ++i) {\n std::vector recv_y(M);\n MPI_Recv(recv_y.data(), M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M; ++j) {\n y[j] += recv_y[j];\n }\n }\n } else {\n MPI_Send(local_y.data(), M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) { local_y[i] += A[i * N + j] * x[j]; }\n }\n\n MPI_Reduce(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the dot product\n std::vector 
local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local dot products into the global vector y on rank 0\n MPI_Reduce(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "cuda", + "prompt": "/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use CUDA to compute in parallel. The kernel is launched with at least M threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\n__global__ void gemv(const double *A, const double *x, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n int tid = block.thread.x;\n int row = block.block.x;\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) {\n y[row] += A[row * N + col] * x[col];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int i = 0; i < N; i++) {\n y[tid] += A[tid * N + i] * x[i];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n for (int i = bid * bid_size + tid; i < M; i += bid_size) {\n double sum = 0;\n for (int j = 0; j < N; j++) { sum += A[i * N + j] * x[j]; }\n y[i] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n y[row] = 0;\n for (int k = 0; k < N; k++) {\n y[row] += A[row * N + k] * x[k];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; 
}\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n y[row] = 0;\n for (int i = 0; i < N; i++) {\n y[row] += A[row * N + i] * x[i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n\n double sum = 0;\n\n for (int i = bid * bid_size + tid; i < M; i += grid_size * bid_size) {\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n }\n\n y[tid] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = bid * blockDim.x + tid;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and x. Store the result in y on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n\n // Gather the results on rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M, 0.0);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root rank\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute local dot products\n std::vector local_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce local dot products to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, 
MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n\n // Gather the results on rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize y to 0 on all ranks\n y.assign(M, 0.0);\n\n // Compute the local portion of y on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + 
"name": "00_dense_la_lu_decomp", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A(k, j) -= A(k, i) * A(i, j);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2184958334, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.7373475128, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.7361067468, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.743380677, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.7408793784, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.733545071, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.7280202846, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) -= A(j, i) * A(i, i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Iterate over the columns of the matrix\n for (size_t j = 0; j < N; ++j) {\n // Divide the elements of the current column by the diagonal element\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n A(i, j) /= A(j, j);\n });\n\n // Subtract the current column from the remaining columns\n for (size_t k = j + 1; k < N; ++k) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n A(i, k) -= A(i, j) * A(j, k);\n });\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 
null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A(i, j) /= A(i, i);\n } else if (i > j) {\n A(i, j) -= A(i, j) * A(j, j);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = 0; k < i; k++) {\n A(j, k) -= multiplier * A(i, k);\n }\n A(j, i) = multiplier;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2065711034, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.738450053, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.7383398318, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.7425981038, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.7405704278, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.733161388, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.728512267, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; ++k) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j][i] /= A[i][i];\n }\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::RangePolicy(i + 1, N),\n KOK_LAMBDA(const Kok\u30b3\u30af::IndexType &j) {\n for (size_t k = 0; k < i; k++) {\n A[j][i] -= A[j][k] * A[i][k];\n }\n });\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) = A(j, i) / A(i, i);\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A(k, j) = A(k, j) - A(k, i) * A(i, j);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.20421609, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.982831415, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.9823220196, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.9874805076, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.985145662, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.9782737436, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.970712051, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n double m = A(j, i) / A(i, i);\n for (size_t k = i; k < N; k++) {\n A(j, k) -= m * A(i, k);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; ++k) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += A(i, k) * A(k, j);\n }\n A(i, j) -= sum;\n } else if (i > j) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A(i, k) * A(k, j);\n }\n A(i, j) = (A(i, j) - sum) / A(j, j);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * 
A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; k++) {\n // Normalize the kth row\n double norm = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k; j < N; j++) {\n A[k * N + j] /= norm;\n }\n\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k];\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; ++k) {\n // Normalize the k-th row\n double norm = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k; j < N; ++j) {\n A[k * N + j] /= norm;\n }\n\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n double factor = A[i * N + k];\n for (size_t j = k; j < N; ++j) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n double factor = A[i * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= A[i * N + k] * factor;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n // Divide the current row by the diagonal element\n double factor = A[i * N + i];\n#pragma 
omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n\n // Subtract multiples of the current row from all the rows below it\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + ".", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n#pragma omp parallel for\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = i; j < N; j++) {\n#pragma omp parallel for\n for (size_t k = 0; k < i; k++) {\n A[j * N + i] -= A[j * N + k] * A[k * N + i];\n }\n A[j * N + i] /= A[i * N + i];\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; ++k) {\n // Compute the multiplier for row k\n double multiplier = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k + 1; j < N; ++j) {\n A[j * N + k] /= multiplier;\n }\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= 
A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "serial", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + i] * A[j * N + j];\n } else {\n A[i * N + j] /= A[j * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; ++k) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] = A[j * N + i] / A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] = A[j * N + k] - A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + j];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + j] * A[j * N + j];\n } else {\n A[i * N + j] /= A[j * N + j];\n A[i * N + j] *= -1;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for 
(size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i * N + k] * A[k * N + j];\n }\n A[i * N + j] = (A[i * N + j] - sum) / A[j * N + j];\n }\n for (size_t j = i; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += A[i * N + k] * A[k * N + j];\n }\n A[i * N + j] = A[i * N + j] - sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A[j][i] / A[i][i];\n for (size_t k = i; k < N; k++) {\n A[j][k] -= multiplier * A[i][k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1;\n } else if (i > j) {\n A[i * N + j] = 0;\n }\n }\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "hip", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\n__global__ void luFactorize(double *A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x + blockIdx.y * gridDim.x;\n int col = threadIdx.x + threadIdx.y * blockDim.x;\n if (row < N && col < N) {\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double temp = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= temp * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = A[row * N + col] - sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x + blockIdx.y * gridDim.x;\n int col = threadIdx.x + threadIdx.y * blockDim.x;\n if (row < col) {\n A[row * N + col] = 0.0;\n } else if (row == col) {\n A[row * N + col] = 1.0;\n } else {\n A[row * N + col] = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= A[row * N + col] * A[col * N + k];\n }\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * gridDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.x;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double temp = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= temp * A[j * N + k];\n }\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double temp = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= temp * 
A[j * N + k];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[col * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[col * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int tid = blockIdx.x + blockIdx.y * gridDim.x;\n int i = tid / N;\n int j = tid % N;\n if (i < j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; ++k) { A[k * N + j] -= A[k * N + i] * A[i * N + j]; }\n } else if (i == j) {\n A[i * N + j] = 1;\n for (int k = i + 1; k < N; ++k) { A[k * N + j] /= A[i * N + i]; }\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n\n if (row == col) {\n double pivot = A[row * N + col];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] /= pivot;\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col + 1; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int k = row + 1; k < N; k++) {\n A[k * N + col] -= A[k * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; 
k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double factor = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= factor * A[j * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. Store the result in A on rank 0.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Forward elimination\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row to all other ranks\n MPI_Bcast(&A[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row below the k-th row\n for (size_t i = k + 1; i < N; i++) {\n // Compute the multiplier\n double multiplier = A[i * N + k] / A[k * N + k];\n\n // Subtract the k-th row from the i-th row\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= multiplier * A[k * N + j];\n }\n }\n }\n\n // Gather the result to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // For each column j in row i\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in the lower triangular matrix L\n if (i > j) {\n // Subtract the product of the ith row of L and the jth column of U from the ith row of A\n for (size_t k = 0; k < j; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n // If i <= j, then A[i][j] is in the upper triangular matrix U\n else {\n // Divide the ith row of A by the jth column of L\n for (size_t k = 0; k < j; k++) {\n A[i * N + j] /= A[j * N + k];\n }\n\n // Subtract the product of the ith row of L and the jth column of U from the ith row of A\n for (size_t k = j + 1; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations to eliminate elements below the diagonal\n for (size_t j = i + 1; j < N; j++) {\n double factor 
= A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // For each column j\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in the lower triangular matrix L\n if (i > j) {\n // Compute A[i][j] as A[i][j] - A[i][0] * A[0][j]\n A[i * N + j] -= A[i * N] * A[j];\n }\n // Otherwise, A[i][j] is in the upper triangular matrix U\n else {\n // Compute A[i][j] as A[i][j] - A[i][0] * A[0][j] / A[0][0]\n A[i * N + j] -= A[i * N] * A[j] / A[0];\n }\n }\n\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N + i], 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // For each column j\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in L\n if (i > j) {\n // Compute A[i][j]\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i][k] * A[k][j];\n }\n A[i][j] = (A[i][j] - sum) / A[j][j];\n }\n // If i <= j, then A[i][j] is in U\n else {\n // Compute A[i][j]\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i][k] * A[k][j];\n }\n A[i][j] = A[i][j] - sum;\n }\n }\n\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * 
N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization on rank 0\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n\n // Gather the result from rank 0 to all other ranks\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Normalize the k-th row of A\n double pivot = A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[k * N + j] /= pivot;\n }\n\n // Update the remaining rows of A\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all ranks\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix A from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Forward elimination\n for (size_t i = 0; i < N; i++) {\n // Broadcast the pivot row to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all other ranks\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 
0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // Rank 0 broadcasts the ith row of L to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each column j\n for (size_t j = i; j < N; j++) {\n // Compute A[i][j] -= A[i][k] * A[k][j] for k < i\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n\n // Divide A[i][j] by A[i][i]\n A[i * N + j] /= A[i * N + i];\n }\n }\n\n // Gather the result from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + j];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + j] * A[j * N + j];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i in A\n for (size_t i = 1; i < N; i++) {\n // If rank 0, compute the multiplier for row i\n double multiplier = 0;\n if (rank == 0) {\n multiplier = A[i * N] / A[0];\n }\n\n // Broadcast the multiplier to all ranks\n MPI_Bcast(&multiplier, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each element in row i, subtract the corresponding element in row 0 multiplied by the multiplier\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] -= multiplier * A[j];\n }\n }\n\n // If rank 0, divide each element in row 0 by A[0][0]\n if (rank == 0) {\n for (size_t j = 0; j < N; j++) {\n A[j] /= A[0];\n }\n }\n\n // For each row i in A\n for (size_t i = 1; i < N; i++) {\n // For each element in row i, divide by A[i][i]\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n\n // For each row k after row i, subtract the corresponding element in row i multiplied by A[k][i]\n for (size_t k = i + 1; k < N; k++) {\n double multiplier = A[k * N + i];\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= multiplier * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(A.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Each rank computes its own row of L and U\n for (size_t i = 1; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A[i * N + j] /= A[j * N + j];\n }\n for (size_t j = i; j < N; j++) {\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks into rank 0\n MPI_Gather(A.data(), N * N, MPI_DOUBLE, A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "cuda", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. 
\n A is an NxN matrix stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\n__global__ void luFactorize(double *A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double temp = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= temp * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row < col) {\n A[row * N + col] = A[row * N + col] / A[row * N + row];\n } else if (row == col) {\n for (int k = 0; k < row; k++) {\n A[row * N + col] -= A[row * N + k] * A[k * N + col];\n }\n } else {\n for (int k = 0; k < col; k++) {\n A[row * N + col] -= A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] /= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i == j) {\n A[i * N + i] /= A[i * N + j];\n for (int k = i + 
1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n for (int k = j; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "(", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) return;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else if (row > col) {\n double factor = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= factor * A[col * N + k];\n }\n A[row * N + col] = factor;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else {\n double factor = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= factor * A[col * N + k];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == 
col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col + 1; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
Store the result in A on rank 0.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "(5.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n // Divide the ith row by A[i][i]\n double div = A[i * N + i];\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= div;\n }\n\n // Subtract the ith row from all rows below it\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n }\n\n // Broadcast the result to all ranks\n MPI_Bcast(A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization using Gaussian elimination\n for (size_t k = 0; k < N; k++) {\n // Pivot if necessary\n size_t max_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > std::abs(A[max_row * N + k])) {\n max_row = i;\n }\n }\n if (max_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[max_row * N + j]);\n }\n }\n\n // Eliminate the k-th column below the k-th row\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&A[i * N * N], N * N, MPI::DOUBLE, i, 0);\n }\n } else {\n MPI::COMM_WORLD.Send(&A[0], N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // MPI rank\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // MPI size\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Number of rows per rank\n int rows_per_rank = N / size;\n\n // Local matrix\n std::vector local_A(rows_per_rank * N);\n\n // Scatter the matrix to all ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Local lu factorization\n for (size_t i = 0; i < rows_per_rank; i++) {\n for (size_t j = i + 1; j < N; j++) {\n local_A[i * N + j] /= local_A[i * N + i];\n }\n for (size_t k = i + 1; k < rows_per_rank; k++) {\n for (size_t j = i + 1; j < N; j++) {\n local_A[k * N + j] -= local_A[k * N + i] * local_A[i * N + j];\n }\n }\n }\n\n // Gather the results back to rank 0\n MPI_Gather(&local_A[0], rows_per_rank * N, MPI_DOUBLE, &A[0], rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n 
if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= multiplier * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the rank and number of threads\n int rank = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n // Compute the block size for each thread\n size_t block_size = N / num_threads;\n\n // Each thread factorizes its own block of rows\n for (size_t i = rank * block_size; i < (rank + 1) * block_size; i++) {\n // Pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap pivot row with current row\n if (pivot_row != i) {\n for (size_t k = 0; k < N; k++) {\n std::swap(A[i * N + k], A[pivot_row * N + k]);\n }\n }\n\n // Eliminate lower triangular elements\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n }\n\n // Gather the factorized matrix from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix A from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Divide the current row by the pivot element\n double pivot = A[i * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= pivot;\n }\n\n // Subtract multiples of the current row from the remaining rows\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (MPI_Comm_rank(MPI_COMM_WORLD) == 0) {\n for (int i = 1; i < 
MPI_Comm_size(MPI_COMM_WORLD); i++) {\n MPI_Recv(&A[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the rank and number of threads\n int rank = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n // Compute the block size for each thread\n size_t block_size = N / num_threads;\n\n // Each thread factorizes its own block of the matrix\n for (size_t i = rank * block_size; i < (rank + 1) * block_size; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n }\n\n // Gather the results from all ranks and store them in A on rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform Gaussian elimination\n for (size_t k = 0; k < N; k++) {\n // Pivot row\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > std::abs(A[pivot_row * N + k])) {\n pivot_row = i;\n }\n }\n\n // Swap pivot row with row k\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n }\n }\n\n // Eliminate non-zero elements below pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Get the number of processes\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes a part of the matrix\n size_t chunk_size = N / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = start_row + chunk_size;\n\n // Perform LU factorization on the assigned part of the matrix\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < i; j++) {\n double multiplier = A[i][j] / A[j][j];\n for (size_t k = j; k < N; k++) {\n A[i][k] -= multiplier * A[j][k];\n }\n }\n for (size_t j = i; j < N; j++) {\n A[i][j] /= A[i][i];\n }\n }\n\n // Gather the results from all processes\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * chunk_size * N], chunk_size * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[start_row * N], chunk_size * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows per process\n int 
rowsPerProcess = N / size;\n\n // Compute the starting row index for this process\n int startRow = rank * rowsPerProcess;\n\n // Compute the ending row index for this process\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Factorize the submatrix of A owned by this process\n for (int i = startRow; i < endRow; ++i) {\n // Compute the pivot element\n double pivot = A[i][i];\n\n // Broadcast the pivot element to all other processes\n MPI_Bcast(&pivot, 1, MPI_DOUBLE, rank, MPI_COMM_WORLD);\n\n // Eliminate the elements below the pivot element\n for (int j = i + 1; j < N; ++j) {\n A[j][i] -= pivot * A[j][i] / A[i][i];\n }\n }\n\n // Reduce the matrix A to upper triangular form\n for (int k = 0; k < N - 1; ++k) {\n // Compute the pivot element\n double pivot = A[k][k];\n\n // Broadcast the pivot element to all other processes\n MPI_Bcast(&pivot, 1, MPI_DOUBLE, rank, MPI_COMM_WORLD);\n\n // Eliminate the elements above the pivot element\n for (int i = 0; i < k; ++i) {\n A[i][k] -= pivot * A[i][k] / A[k][k];\n }\n }\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Broadcast the pivot row to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all ranks\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 performs the row reduction on the ith row\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int myRank, numProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numProcs);\n\n // Each rank computes its own part of the matrix\n #pragma omp parallel for\n for (size_t i = myRank; i < N; i += numProcs) {\n for (size_t j = 0; j < i; j++) {\n A[i * N + j] /= A[j * N + j];\n for (size_t k = j + 1; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n }\n\n // Gather the 
results to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize L and U\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n double max = std::abs(A[k * N + k]);\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > max) {\n max = std::abs(A[i * N + k]);\n pivot_row = i;\n }\n }\n\n // Swap the pivot row with the k-th row\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n }\n }\n\n // Divide the pivot row by the pivot element\n double pivot = A[k * N + k];\n for (size_t j = 0; j < N; j++) {\n A[k * N + j] /= pivot;\n }\n\n // Subtract multiples of the pivot row from the other rows\n for (size_t i = 0; i < N; i++) {\n if (i != k) {\n double factor = A[i * N + k];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n }\n\n // Gather the result to rank 0\n if (rank != 0) {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(N * N);\n MPI_Recv(&recv_buf[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n A[j] += recv_buf[j];\n }\n }\n }\n}\n", + "\n\n // Copy the matrix A to rank 0\n MPI_Allgather(MPI_IN_PLACE, 0, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_WORLD);\n\n // Perform LU factorization on rank 0\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] -= A[i * N + i] * A[j * N + i];\n }\n\n for (size_t k = i + 1; k < N; k++) {\n for (size_t l = i + 1; l < N; l++) {\n A[k * N + l] -= A[k * N + i] * A[l * N + i];\n }\n }\n }\n\n // Scatter the factorized matrix A from rank 0 to all other processes\n MPI_Scatter(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_WORLD);\n}\n", + "\n for (size_t k = 0; k < N; ++k) {\n // MPI_Bcast the k-th row of A to all ranks.\n MPI_Bcast(&A[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Update the k-th row of A on all ranks.\n for (size_t j = k + 1; j < N; ++j) {\n A[k * N + j] /= A[k * N + k];\n }\n\n // Update the remaining rows of A on all ranks.\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n\n // MPI_Reduce the updated A from all ranks to rank 0.\n MPI_Reduce(&A[0], &A[0], N * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* For each value in the array x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(Kokkos::View const& x, Kokkos::View &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ranks\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ranks\", Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + 
"did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n auto rank = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n auto sorted_x = x;\n Kokkos::sort(sorted_x.data(), sorted_x.extent(0));\n ranks(i) = Kokkos::find(sorted_x, x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::find(sorted_x, x(i));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n using policy_t = Kokkos::RangePolicy;\n Kokkos::parallel_for(policy_t(x.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n auto rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using policy_type = Kokkos::RangePolicy;\n using rank_functor = Kokkos::Functor>;\n\n Kokkos::parallel_for(\n \"ranks\", Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = Kokkos::rank(x, x(i), Kokkos::greater());\n });\n}\n", + "source_write_success": true, + "did_build": false, + 
"is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) > x(j)) { ranks(i) += 1; }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) { ranks(i) += 1; }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) {\n ranks(i) += 1;\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x);\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n std::vector sorted_x(x.data(), x.data() + x.extent(0));\n std::sort(sorted_x.begin(), sorted_x.end());\n ranks(i) = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x(i)));\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": 
"\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = x;\n std::sort(sorted_x.data(), sorted_x.data() + sorted_x.extent(0));\n ranks(i) = std::distance(sorted_x.data(),\n std::lower_bound(sorted_x.data(), sorted_x.data() + sorted_x.extent(0), x(i)));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "omp", + "prompt": "#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs;\n pairs.reserve(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs.push_back(std::make_pair(x[i], i));\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j] < x[i]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n for (size_t j = 0; j < n; ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> y(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n y[i] = std::make_pair(x[i], i);\n }\n std::sort(y.begin(), y.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[y[i].second] = i;\n }\n}\n", + "\n\n // Create a copy of the input vector\n std::vector sorted_x = x;\n\n // Sort the copy\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "(", + "\n\n size_t n = x.size();\n std::vector> pairs(n);\n for (size_t i = 0; i < n; ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n ranks.resize(n);\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "ranks", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if 
(x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n std::vector idx(n);\n std::iota(idx.begin(), idx.end(), 0);\n std::sort(idx.begin(), idx.end(), [&x](size_t i1, size_t i2) { return x[i1] < x[i2]; });\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n ranks[i] = idx[i];\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] == x[j]) {\n ranks[i] = j;\n break;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector indices(x.size());\n std::iota(indices.begin(), indices.end(), 0);\n std::sort(indices.begin(), indices.end(), [&x](size_t i1, size_t i2) { return x[i1] < x[i2]; });\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = indices[i];\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "serial", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n // Sort the vector x\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Create a map from the sorted values to their original indices\n std::map sorted_index_map;\n for (size_t i = 0; i < sorted_x.size(); ++i) {\n sorted_index_map[sorted_x[i]] = i;\n }\n\n // Iterate over the original vector and store the rank of each value\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = sorted_index_map[x[i]];\n }\n}\n", + "\n\n std::map value_to_rank;\n for (size_t i = 0; i < x.size(); ++i) {\n auto it = value_to_rank.find(x[i]);\n if (it == value_to_rank.end()) {\n value_to_rank[x[i]] = i;\n }\n }\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = value_to_rank[x[i]] + 1;\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) 
{\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks.push_back(std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i])));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(), std::find(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(), std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector x_sorted(x);\n std::sort(x_sorted.begin(), x_sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]) - x_sorted.begin();\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n std::iota(ranks.begin(), ranks.end(), 0);\n std::sort(ranks.begin(), ranks.end(), [&](size_t i, size_t j) { return x[i] < x[j]; });\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n 
std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "hip", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use AMD HIP to compute in parallel. The kernel will be launched with at least as many threads as elements in x.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\n__global__ void ranks(const float *x, size_t *ranks, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t tid = hipThreadIdx_x;\n if (tid < N) {\n float v = x[tid];\n size_t r = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < v) r++;\n }\n ranks[tid] = r;\n }\n}\n", + "\n\n // Get the index of the current thread\n const size_t tid = hipThreadIdx_x;\n\n // Compute the rank of the current element\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n float *ptr = x;\n size_t rank = 0;\n while (ptr < x + N && *ptr <= val) {\n if (*ptr == val) {\n rank++;\n }\n ptr++;\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float v = x[tid];\n int r = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < v) {\n ++r;\n }\n }\n ranks[tid] = r;\n }\n}\n", + "\n\n int idx = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (idx >= N) {\n return;\n }\n\n float my_value = x[idx];\n int my_rank = 1;\n for (int i = 0; i < N; i++) {\n if (x[i] < my_value) {\n my_rank++;\n }\n }\n ranks[idx] = my_rank;\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for 
(size_t j = 0; j < N; j++) {\n if (x[j] < v) r++;\n }\n ranks[i] = r;\n }\n}\n", + "\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] < v) r++;\n }\n ranks[i] = r;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n float *sorted_x = (float *)hipMalloc(N * sizeof(float));\n hipMemcpy(sorted_x, x, N * sizeof(float), hipMemcpyHostToDevice);\n hipSort(sorted_x, sorted_x + N, hipSortAscending);\n for (int i = 0; i < N; i++) {\n if (sorted_x[i] == val) {\n ranks[gid] = i;\n break;\n }\n }\n hipFree(sorted_x);\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = hipThreadIdx_x;\n\n // Compute the rank of the current element\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n size_t idx = hipThreadIdx_x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n ranks[tid] = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < x[tid]) {\n ranks[tid]++;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n float val = x[idx];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every process has a complete copy of x. 
Store the result in ranks on process 0.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), ranks.size(), MPI_SIZE_T, MPI_MAX, 0,\n MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, ranks.size(), MPI_SIZE_T, MPI_MAX, 0,\n MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; i++) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Create a vector of ranks\n ranks.resize(n);\n\n // Get the rank of each value in x\n for (size_t i = 0; i < n; i++) {\n ranks[i] = std::find(indices.begin(), indices.end(), i) - indices.begin();\n }\n\n // Gather the ranks on process 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector ranks_i;\n MPI::COMM_WORLD.Recv(ranks_i.data(), ranks_i.size(), MPI::UNSIGNED_LONG, i, 0);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI::COMM_WORLD.Send(ranks.data(), ranks.size(), MPI::UNSIGNED_LONG, 0, 0);\n }\n}\n", + "\n\n // sort a copy of x on each process\n auto x_sorted = std::vector(x);\n std::sort(x_sorted.begin(), x_sorted.end());\n\n // find the rank of each value in x on each process\n std::vector> local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n auto it = std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]);\n local_ranks[i].push_back(std:: distance(x_sorted.begin(), it));\n }\n\n // gather the local_ranks on process 0\n std::gather(local_ranks, std::back_inserter(ranks), 0);\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; ++i) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Get the rank of each value in x\n ranks.resize(n);\n for (size_t i = 0; i < n; ++i) {\n ranks[i] = std::find(indices.begin(), indices.end(), i) - indices.begin();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, ranks.size(), MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n local_ranks[i]++;\n }\n }\n }\n\n std::vector global_ranks(x.size());\n MPI_Allgather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG,\n global_ranks.data(), x.size(), MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n ranks = global_ranks;\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; ++i) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Get the rank of each value in x\n ranks.resize(n);\n for (size_t i = 0; i < n; ++i) {\n ranks[indices[i]] = i;\n }\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n ranks = local_ranks;\n }\n\n MPI_Gather(&local_ranks[0], local_ranks.size(), MPI_UNSIGNED_LONG, &ranks[0],\n ranks.size(), MPI_UNSIGNED_LONG, 0, comm);\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the rank of each element in the sorted vector\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; ++i) {\n std::vector ranks_i;\n MPI_Recv(&ranks_i[0], ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Recv(recv_ranks.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n ranks[j] += recv_ranks[j];\n }\n }\n } else {\n MPI_Send(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = i;\n }\n\n std::sort(local_ranks.begin(), local_ranks.end(),\n [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n if (rank == 0) {\n ranks.resize(x.size());\n }\n\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(),\n MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector ranks_i;\n MPI_Recv(ranks_i.data(), ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // compute the ranks on each process\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // gather the ranks from all processes on process 0\n if (my_rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(),\n MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n 
MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG,\n 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the rank of each element in the sorted vector\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; ++i) {\n std::vector ranks_i;\n MPI_Recv(&ranks_i[0], ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector local_x = x;\n std::sort(local_x.begin(), local_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(local_x.begin(),\n std::lower_bound(local_x.begin(), local_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Recv(recv_ranks.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n ranks[j] += recv_ranks[j];\n }\n }\n } else {\n MPI_Send(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n size_t n = x.size();\n std::vector local_ranks(n);\n std::sort(x.begin(), x.end());\n for (size_t i = 0; i < n; ++i) {\n local_ranks[i] = std::lower_bound(x.begin(), x.end(), x[i]) - x.begin();\n }\n MPI_Gather(local_ranks.data(), n, MPI_UNSIGNED_LONG, ranks.data(), n, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "cuda", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use CUDA to compute in parallel. 
The kernel will be launched with at least as many threads as elements in x.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\n__global__ void ranks(const float *x, size_t *ranks, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n float v = x[i];\n int r = 0;\n for (size_t j = 0; j < N; ++j) {\n if (x[j] < v) {\n ++r;\n }\n }\n ranks[i] = r;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n ranks[i] = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n ranks[tid] = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ranks[tid]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float value = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n float *d_x = (float *)x;\n for (size_t i = 0; i < N; i++) {\n if (d_x[i] < val) {\n ranks[idx]++;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) rank++;\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t i = block.thread_id.x + block.block_id.x * block.block_dim.x;\n if (i < N) {\n float val = x[i];\n std::vector y(x, x + N);\n std::sort(y.begin(), y.end());\n ranks[i] = std::lower_bound(y.begin(), y.end(), val) - y.begin();\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] <= v) {\n r++;\n }\n }\n ranks[i] = r - 1;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n float *sorted_x = new float[N];\n thrust::copy(x, x + N, sorted_x);\n thrust::sort(sorted_x, sorted_x + N);\n ranks[idx] = thrust::lower_bound(sorted_x, sorted_x + N, val) - sorted_x;\n delete[] sorted_x;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n float *d_x = (float *)x;\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (d_x[i] < val) {\n 
++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = threadIdx.x;\n\n // Compute the rank of the current element\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n ranks[idx] = 0;\n for (int i = 0; i < N; ++i) {\n if (x[i] < x[idx]) {\n ranks[idx]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = threadIdx.x;\n\n // Compute the rank of the current element\n float current_element = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < current_element) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every process has a complete copy of x. 
Store the result in ranks on process 0.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the number of processes\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Create a vector of ranks on each process\n std::vector local_ranks(x.size());\n\n // Compute the ranks on each process\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n // Sort the local copy of x\n std::vector local_x = x;\n std::sort(local_x.begin(), local_x.end());\n\n // Find the rank of x[i] in the sorted vector\n auto it = std::lower_bound(local_x.begin(), local_x.end(), x[i]);\n local_ranks[i] = std::distance(local_x.begin(), it);\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG,\n ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]) - sorted_x.begin();\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector to store the ranks\n ranks.resize(n);\n\n // Sort the vector\n std::sort(x.begin(), x.end());\n\n // Find the rank of each element in the sorted vector\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n ranks[i] = std::distance(x.begin(), std::find(x.begin(), x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (ranks.size() > 0) {\n MPI_Gather(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG,\n &ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int world_size;\n MPI_Comm_size(MPI_COMM_WORLD, &world_size);\n\n std::vector local_x(x.begin() + my_rank * x.size() / world_size,\n x.begin() + (my_rank + 1) * x.size() / world_size);\n std::vector local_ranks(local_x.size());\n\n #pragma omp parallel for\n for (size_t i = 0; i < local_x.size(); 
++i) {\n for (size_t j = 0; j < local_x.size(); ++j) {\n if (local_x[i] < local_x[j]) {\n ++local_ranks[i];\n }\n }\n }\n\n MPI_Reduce(local_ranks.data(), ranks.data(), local_ranks.size(), MPI_SIZE_T, MPI_SUM, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector x_sorted = x;\n std::sort(x_sorted.begin(), x_sorted.end());\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n ranks[i] = std::distance(x_sorted.begin(), std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]));\n }\n if (my_rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), n, MPI_SIZE_T, MPI_MAX, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, n, MPI_SIZE_T, MPI_MAX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the ranks of each element in the sorted vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_ranks;\n MPI_Recv(&recv_ranks[0], x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), recv_ranks.begin(), recv_ranks.end());\n }\n } else {\n MPI_Send(&ranks[0], x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::sort(local_x.begin(), local_x.end());\n\n std::vector local_ranks(local_x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < local_x.size(); ++i) {\n local_ranks[i] = std::distance(x.begin(), std::lower_bound(x.begin(), x.end(), local_x[i]));\n }\n\n MPI_Gather(local_ranks.data(), local_ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), local_ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a copy of x on each process\n std::vector local_x = x;\n\n // Sort the local copy of x\n std::sort(local_x.begin(), local_x.end());\n\n // Create a vector to store the ranks\n std::vector local_ranks(x.size());\n\n // Compute the ranks of the elements in the local copy of x\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(local_x.begin(), std::find(local_x.begin(), local_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Only process 0 has the complete list of ranks\n if (rank == 0) {\n ranks = std::move(ranks);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]) - sorted_x.begin();\n }\n\n // Gather the ranks 
from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n ranks.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n if (my_rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), n, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, n, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n local_ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector local_ranks(x.size());\n #pragma opm parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(x.begin(), std::min_element(x.begin(), x.end()));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Status status;\n MPI_Probe(i, 0, MPI_COMM_WORLD, &status);\n MPI_Get_count(&status, MPI_BYTE, &ranks.size());\n MPI_Send(&local_ranks, ranks.size(), MPI_BYTE, i, 0, MPI_COMM_WORLD);\n }\n } else {\n MPI_Send(&local_ranks, local_ranks.size(), MPI_BYTE, 0, 0, 
MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t n = x.size();\n std::vector local_ranks(n);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n local_ranks[i] = std::distance(x.begin(), std::min_element(x.begin(), x.end()));\n }\n\n MPI_Gather(local_ranks.data(), n, MPI_UNSIGNED_LONG, ranks.data(), n, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int world_size;\n MPI_Comm_size(MPI_COMM_WORLD, &world_size);\n int local_size = x.size() / world_size;\n int start = my_rank * local_size;\n int end = start + local_size;\n if (my_rank == world_size - 1) {\n end = x.size();\n }\n std::vector local_ranks(local_size);\n std::vector local_x(local_size);\n std::copy(x.begin() + start, x.begin() + end, local_x.begin());\n std::sort(local_x.begin(), local_x.end());\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_ranks[i] = std::distance(local_x.begin(), std::lower_bound(local_x.begin(), local_x.end(), x[start + i]));\n }\n if (my_rank == 0) {\n ranks.resize(x.size());\n }\n MPI_Gather(local_ranks.data(), local_size, MPI_UNSIGNED_LONG, ranks.data(), local_size, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), x.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "(", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n // Each process sorts its own copy of x\n std::vector x_sorted = x;\n std::sort(x_sorted.begin(), x_sorted.end());\n\n // Each process computes the ranks of its own elements\n std::vector ranks_local(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks_local[i] = std::distance(x_sorted.begin(), std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]));\n }\n\n // Process 0 gathers the ranks from all processes\n if (my_rank == 0) {\n ranks = ranks_local;\n for (int i = 1; i < MPI_COMM_WORLD->size; ++i) {\n std::vector ranks_recv(x.size());\n MPI_Recv(ranks_recv.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_recv.begin(), ranks_recv.end());\n }\n } else {\n MPI_Send(ranks_local.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use Kokkos to sort in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n using namespace Kokkos;\n\n // Create a view of the input array that ignores zero valued elements\n auto x_nz = view(\"x_nz\", x, LayoutRight(), {x.extent(0), 0});\n // Sort the non-zero elements in ascending order\n parallel_sort(x_nz.extent(0), KOKKOS_LAMBDA(const int i) {\n x_nz(i) = x(i);\n });\n // Copy the sorted non-zero elements back to the input array\n parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x_nz(i);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n auto exec_space = x.execution_space();\n auto view_size = x.extent(0);\n auto lambda = KOKKOS_LAMBDA(const int& i) {\n return x(i) != 0;\n };\n auto mask = Kokkos::create_mask(exec_space, view_size, lambda);\n auto sorted = Kokkos::sort(exec_space, mask, x);\n Kokkos::deep_copy(x, sorted);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = std::numeric_limits::max();\n }\n });\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i == std::numeric_limits::max()) {\n x_i = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"sortIgnoreZero\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Create a mask to identify non-zero elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"init_mask\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the non-zero elements\n Kokkos::sort(mask, x);\n\n // Shift the non-zero elements to the beginning of the array\n int non_zero_count = 0;\n Kokkos::parallel_for(\"shift_non_zero\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (mask(i)) {\n x(non_zero_count++) = x(i);\n }\n });\n\n // Fill the remaining elements with zeros\n Kokkos::parallel_for(\"fill_zeros\", Kokkos::RangePolicy(non_zero_count, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = 0;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a mask to identify non-zero elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"mask_init\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the non-zero elements\n Kokkos::sort(mask, x);\n\n // Shift the non-zero elements to the beginning of the array\n Kokkos::parallel_for(\"shift_non_zero\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (mask(i)) {\n Kokkos::atomic_fetch_add(&x(0), x(i));\n x(i) = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i != 0) {\n x_i = std::numeric_limits::max();\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i == std::numeric_limits::max()) {\n x_i = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n auto lambda = KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n x(i) = x(i) < 0 ? 
-x(i) : x(i);\n }\n };\n parallel_for(\"sortIgnoreZero\", policy, lambda);\n parallel_sort(policy, x.data(), x.data() + x.extent(0));\n lambda = KOKKOS_LAMBDA(const int &i) {\n if (x(i) < 0) {\n x(i) = -x(i);\n }\n };\n parallel_for(\"sortIgnoreZero\", policy, lambda);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"sortIgnoreZero\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kok\u30b3\u30af::View y = x.filter([](int x) { return x != 0; });\n Kok\u30b3\u30af::sort(y);\n Kok\u30b3\u30af::View z = x.map([](int x) { return x == 0 ? 0 : y[0]; });\n y.copy_to(z);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a mask to ignore zero valued elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"mask\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the array using the mask\n Kokkos::sort(mask, x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Count the number of non-zero elements\n int count = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i, int &count) {\n if (x(i) != 0) { count++; }\n });\n\n // Create a temporary array to store the non-zero elements\n Kokkos::View y(\"y\", count);\n\n // Copy the non-zero elements to the temporary array\n int j = 0;\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) { y(j++) = x(i); }\n });\n\n // Sort the temporary array\n Kokkos::sort(y);\n\n // Copy the sorted non-zero elements back to the original array\n j = 0;\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) { x(i) = y(j++); }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n using exec_space = Kokkos::DefaultExecutionSpace;\n using view_t = Kokkos::View;\n\n auto x_nonzero = view_t(\"x_nonzero\", x.extent(0));\n auto x_nonzero_count = 0;\n\n Kokkos::parallel_for(\n \"sortIgnoreZero\", policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) {\n x_nonzero(x_nonzero_count++) = x(i);\n }\n });\n\n Kokkos::sort(x_nonzero, x_nonzero_count);\n\n x_nonzero_count = 0;\n Kokkos::parallel_for(\n \"sortIgnoreZero\", policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) {\n x(i) = x_nonzero(x_nonzero_count++);\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i == 0) {\n x_i = x(i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i == 0) {\n x_i = x(i);\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use OpenMP to sort x in parallel. 
\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n ++count;\n }\n }\n\n // Create a vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j];\n j++;\n }\n }\n}\n", + "\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(non_zero.begin(), non_zero.end());\n }\n }\n\n // Insert the non-zero elements back into x\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + ".", + "\n\n std::vector temp;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n temp.push_back(x[i]);\n }\n }\n std::sort(temp.begin(), temp.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = temp[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n 
std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int n = 0;\n #pragma omp parallel for reduction(+:n)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n n++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(n);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[index++] = x[i];\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[index++];\n }\n }\n}\n", + "\n\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n std::sort(non_zero.begin(), non_zero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + 
"\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count non-zero elements\n int n = 0;\n #pragma omp parallel for reduction(+:n)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n n++;\n }\n }\n\n // Create a vector to store non-zero elements\n std::vector y(n);\n\n // Copy non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the non-zero elements\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n ++count;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Create a vector to store the non-zero 
elements\n std::vector non_zero;\n\n // Iterate over the input vector and copy non-zero elements to the new vector\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Copy the sorted non-zero elements back to the input vector\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j++];\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "serial", + "prompt": "/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place.\n Example:\n\n\t input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a == 0 && b != 0) || a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return a != 0 && (b == 0 || a < b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a != 0 && b != 0) ? a < b : a == 0;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::stable_partition(x.begin(), x.end(),\n [](int a) { return a != 0; });\n std::sort(x.begin(), x.end());\n\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a == 0 && b != 0) || (a != 0 && b == 0) ? 
a > b : a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(),\n [](int a, int b) { return a != 0 && (a < b || b == 0); });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) {\n return false;\n } else if (b == 0) {\n return true;\n } else {\n return a < b;\n }\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::vector y;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) { y.push_back(x[i]); }\n }\n std::sort(y.begin(), y.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector sorted;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted.push_back(x[i]);\n }\n }\n std::sort(sorted.begin(), sorted.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sorted[j];\n j++;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "hip", + "prompt": "/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use AMD HIP to compute in parallel. The kernel will be launched with 1 thread per element.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\n__global__ void sortIgnoreZero(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int temp = x[tid];\n int i = tid - 1;\n while (i >= 0 && x[i] > temp) {\n x[i + 1] = x[i];\n i--;\n }\n x[i + 1] = temp;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n // sort non-zero elements\n __shared__ int sdata[256];\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n int stride = blockDim.x * gridDim.x;\n int offset = bid * stride;\n int count = 0;\n int val = x[gid];\n if (val != 0) {\n sdata[tid] = val;\n count++;\n }\n __syncthreads();\n int n = count;\n int i = tid;\n while (i < n) {\n sdata[i] = sdata[tid];\n i += stride;\n }\n __syncthreads();\n i = tid;\n while (i < n) {\n int j = i + 1;\n while (j < n) {\n if (sdata[i] > sdata[j]) {\n int tmp = sdata[i];\n sdata[i] = sdata[j];\n sdata[j] = tmp;\n }\n j += stride;\n }\n i += stride;\n }\n __syncthreads();\n i = tid;\n while (i < n) {\n x[offset + i] = sdata[i];\n i += stride;\n }\n}\n", + "\n\n // Find the number of non-zero elements\n int num_non_zero = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n num_non_zero++;\n }\n }\n\n // Create a temporary array to store the non-zero elements\n int *temp = new int[num_non_zero];\n\n // Copy the non-zero elements to the temporary array\n int j = 0;\n for (size_t i = 
0; i < N; i++) {\n if (x[i] != 0) {\n temp[j] = x[i];\n j++;\n }\n }\n\n // Sort the temporary array\n std::sort(temp, temp + num_non_zero);\n\n // Copy the sorted non-zero elements back to the original array\n j = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n x[i] = temp[j];\n j++;\n }\n }\n\n // Delete the temporary array\n delete[] temp;\n}\n", + "\n\n\t// Find the number of non-zero elements\n\tint count = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tcount++;\n\t\t}\n\t}\n\n\t// Create an array to store the non-zero elements\n\tint *y = new int[count];\n\n\t// Copy the non-zero elements to the new array\n\tint j = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\ty[j] = x[i];\n\t\t\tj++;\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tstd::sort(y, y + count);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tj = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = y[j];\n\t\t\tj++;\n\t\t}\n\t}\n\n\t// Delete the temporary array\n\tdelete[] y;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i - 1] > x[i]) {\n int temp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = temp;\n i--;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\tint j = tid + 1;\n\t\twhile (j < N && x[j] == 0) {\n\t\t\tj++;\n\t\t}\n\t\tif (j < N) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[j];\n\t\t\tx[j] = tmp;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N && x[i] != 0) {\n int j = i + 1;\n while (j < N && x[j] == 0) {\n j++;\n }\n if (j < N) {\n int temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i - 1] > x[i]) {\n int tmp = x[i - 1];\n x[i - 1] = x[i];\n x[i] = tmp;\n i--;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = tmp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint temp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n for (int i = tid + 1; i < N; i++) {\n if (x[i] != 0 && x[i] < x[tid]) {\n int tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n 
i--;\n }\n }\n }\n}\n", + "\n\n // Sort the non-zero elements\n thrust::sort(thrust::device_ptr(x), thrust::device_ptr(x + N), thrust::greater());\n\n // Count the number of non-zero elements\n int num_non_zero = thrust::count(thrust::device_ptr(x), thrust::device_ptr(x + N), thrust::not_equal_to(0));\n\n // Move the non-zero elements to the beginning of the array\n thrust::copy(thrust::device_ptr(x), thrust::device_ptr(x + num_non_zero), thrust::device_ptr(x));\n\n // Fill the remaining elements with zeros\n thrust::fill(thrust::device_ptr(x + num_non_zero), thrust::device_ptr(x + N), 0);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tif (x[tid] != 0) {\n\t\t\tint i = tid;\n\t\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\t\tint temp = x[i];\n\t\t\t\tx[i] = x[i - 1];\n\t\t\t\tx[i - 1] = temp;\n\t\t\t\ti--;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Sort the array in ascending order ignoring elements with value 0.\n\t// Leave zero valued elements in-place.\n\tint my_rank = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (my_rank < N && x[my_rank] != 0) {\n\t\tfor (int i = my_rank + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[my_rank]) {\n\t\t\t\tint temp = x[my_rank];\n\t\t\t\tx[my_rank] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use MPI to sort x in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Store the result in x on rank 0.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n for (int i = count; i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n // Gather all non-zero elements to rank 0\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(nonZero.data(), nonZero.size(), MPI_INT, x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Get the size of the communicator\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the sorted elements\n std::vector sorted_x;\n\n // Gather all the non-zero elements from all the ranks\n std::vector non_zero_elements;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_elements.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n\n // Scatter the sorted non-zero elements to all the ranks\n MPI_Scatter(non_zero_elements.data(), non_zero_elements.size() / size, MPI_INT,\n sorted_x.data(), non_zero_elements.size() / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into the original vector\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sorted_x[index++];\n }\n }\n}\n", + "\n\n // Get the size and rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size() * size);\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n if (rank != 0) {\n non_zero.resize(sorted_non_zero.size() / size);\n }\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into the original vector\n int index = 0;\n for (int i = 0; i < 
x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[index++];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size());\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, x.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with zeros\n for (int i = non_zero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int n = x.size();\n std::vector y(n);\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n y[count] = x[i];\n count++;\n }\n }\n std::sort(y.begin(), y.begin() + count);\n int index = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = y[index];\n index++;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_elements[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(non_zero_elements.data(), count / size, MPI_INT, x.data(), count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n for (int i = count / size; i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector x_non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x_non_zero.push_back(x[i]);\n }\n }\n std::sort(x_non_zero.begin(), x_non_zero.end());\n int count = x_non_zero.size();\n MPI_Gather(&count, 1, MPI_INT, NULL, 0, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n int total_count = 0;\n for (int i = 0; i < size; i++) {\n int count_i;\n 
MPI_Gather(&count_i, 1, MPI_INT, &count_i, 1, MPI_INT, i, MPI_COMM_WORLD);\n total_count += count_i;\n }\n x.resize(total_count);\n }\n MPI_Gatherv(x_non_zero.data(), x_non_zero.size(), MPI_INT, x.data(), NULL, NULL, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sortedNonZero;\n if (rank == 0) {\n sortedNonZero.resize(nonZero.size() * size);\n }\n MPI_Gather(nonZero.data(), nonZero.size(), MPI_INT, sortedNonZero.data(), nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements to all ranks\n if (rank != 0) {\n sortedNonZero.resize(nonZero.size());\n }\n MPI_Scatter(sortedNonZero.data(), nonZero.size(), MPI_INT, sortedNonZero.data(), nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into x\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sortedNonZero[index++];\n }\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Insert the non-zero elements back into the original vector\n int j = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Gather the non-zero elements from all ranks\n MPI_Gather(&x[0], x.size(), MPI_INT, &y[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements\n if (rank == 0) {\n std::sort(y.begin(), y.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&y[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); 
i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size());\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, x.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with zeros\n for (int i = non_zero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sortedNonZero;\n MPI_Gather(&nonZero[0], nonZero.size(), MPI_INT, &sortedNonZero[0], nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements to all ranks\n MPI_Scatter(&sortedNonZero[0], nonZero.size(), MPI_INT, &x[0], nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n for (int i = nonZero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int myrank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (myrank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // 
Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero_x.begin(), non_zero_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(non_zero_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n MPI_Gather(&non_zero[0], non_zero.size(), MPI_INT, &sorted_non_zero[0], non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&sorted_non_zero[0], non_zero.size(), MPI_INT, &x[0], non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the zero elements back into the vector\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = sorted_non_zero[index++];\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "cuda", + "prompt": "/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use CUDA to compute in parallel. 
The kernel will be launched with 1 thread per element.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\n__global__ void sortIgnoreZero(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the number of non-zero elements\n\tint num_non_zero = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnum_non_zero++;\n\t\t}\n\t}\n\n\t// Allocate memory for the non-zero elements\n\tint *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n\t// Copy the non-zero elements to the new array\n\tint non_zero_index = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnon_zero_elements[non_zero_index] = x[i];\n\t\t\tnon_zero_index++;\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tthrust::sort(non_zero_elements, non_zero_elements + num_non_zero);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tnon_zero_index = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = non_zero_elements[non_zero_index];\n\t\t\tnon_zero_index++;\n\t\t}\n\t}\n\n\t// Free the memory allocated for the non-zero elements\n\tfree(non_zero_elements);\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid >= N) return;\n\tif (x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[tid] > x[i]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = tmp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + ".", + "\n\n\tint tid = threadIdx.x;\n\tif (x[tid] != 0) {\n\t\tx[tid] = x[tid] > 0 ? 
x[tid] : -x[tid];\n\t}\n}\n", + "\n\n\t// Each thread sorts its own element\n\tif (x[threadIdx.x] != 0) {\n\t\tfor (int i = threadIdx.x + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[threadIdx.x] > x[i]) {\n\t\t\t\tint temp = x[threadIdx.x];\n\t\t\t\tx[threadIdx.x] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Find the number of non-zero elements\n\tint num_non_zero = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnum_non_zero++;\n\t\t}\n\t}\n\n\t// Allocate memory for the non-zero elements\n\tint *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n\t// Copy the non-zero elements to the new array\n\tint idx = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnon_zero_elements[idx++] = x[i];\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tthrust::sort(non_zero_elements, non_zero_elements + num_non_zero);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tidx = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = non_zero_elements[idx++];\n\t\t}\n\t}\n\n\t// Free the memory allocated for the non-zero elements\n\tfree(non_zero_elements);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = bid * blockDim.x + tid;\n\n\tif (i < N && x[i] != 0) {\n\t\tint j = i + 1;\n\t\twhile (j < N && x[j] == 0) {\n\t\t\tj++;\n\t\t}\n\t\tif (j < N) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[j];\n\t\t\tx[j] = tmp;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i - 1] > x[i]) {\n\t\t\tint temp = x[i - 1];\n\t\t\tx[i - 1] = x[i];\n\t\t\tx[i] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "Ignore the zero value in the list.", + "\n\n\t// Get the thread index\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\n\t// If the element is not zero, sort it\n\tif (x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint start = bid * bid_size + tid;\n\tint end = start + bid_size;\n\tif (start >= N) return;\n\tif (end > N) end = N;\n\tint count = 0;\n\tfor (int i = start; i < end; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[count++] = x[i];\n\t\t}\n\t}\n\tfor (int i = count; i < end; i++) {\n\t\tx[i] = 0;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tswap(x[i], x[i - 1]);\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint temp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\t// Sort the non-zero elements\n\tint *d_sorted = (int *)malloc(N * sizeof(int));\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\td_sorted[count] = x[i];\n\t\t\tcount++;\n\t\t}\n\t}\n\tthrust::sort(d_sorted, d_sorted + count);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tcount = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = d_sorted[count];\n\t\t\tcount++;\n\t\t}\n\t}\n\tfree(d_sorted);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint val = x[tid];\n\tif (val == 0) {\n\t\treturn;\n\t}\n\tint i = tid;\n\twhile (i > 0 && x[i - 1] > val) 
{\n\t\tx[i] = x[i - 1];\n\t\ti--;\n\t}\n\tx[i] = val;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tif (x[tid] != 0) {\n\t\t\tint i = tid;\n\t\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\t\tswap(x[i], x[i - 1]);\n\t\t\t\ti--;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n // Find the number of non-zero elements\n int num_non_zero = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n num_non_zero++;\n }\n }\n\n // Create an array to store the non-zero elements\n int *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n // Copy the non-zero elements to the new array\n int index = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n non_zero_elements[index] = x[i];\n index++;\n }\n }\n\n // Sort the non-zero elements\n qsort(non_zero_elements, num_non_zero, sizeof(int), compare);\n\n // Copy the sorted non-zero elements back to the original array\n index = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n x[i] = non_zero_elements[index];\n index++;\n }\n }\n\n // Free the memory allocated for the non-zero elements array\n free(non_zero_elements);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use MPI and OpenMP to sort x in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the number of processes\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Create a vector to store the sorted values\n std::vector sorted(x.size());\n\n // Gather the sorted values from each rank\n MPI_Gather(x.data(), x.size(), MPI_INT, sorted.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // If the current rank is 0, store the sorted values in x\n if (rank == 0) {\n x = sorted;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort x on each rank\n std::sort(x.begin(), x.end());\n\n // Gather the sorted x from all ranks to rank 0\n std::vector gathered_x;\n if (rank == 0) {\n gathered_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, gathered_x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the gathered x on rank 0\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end());\n x = gathered_x;\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Scatter the sorted non-zero elements to all ranks\n MPI_Scatter(non_zero.data(), non_zero.size() / nprocs, MPI_INT,\n x.data(), non_zero.size() / nprocs, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Gather the sorted non-zero elements from all ranks to rank 0\n MPI_Gather(x.data(), non_zero.size() / nprocs, MPI_INT,\n non_zero.data(), non_zero.size() / nprocs, MPI_INT, 0, MPI_COMM_WORLD);\n\n // If rank is 0, insert the sorted non-zero elements into x\n if (rank == 0) {\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n continue;\n }\n x[i] = non_zero[index++];\n }\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector x_nonzero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x_nonzero.push_back(x[i]);\n }\n }\n std::sort(x_nonzero.begin(), x_nonzero.end());\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = x_nonzero[count];\n count++;\n }\n }\n if (my_rank == 0) {\n x.resize(n);\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n std::vector x_local(n);\n std::vector x_sorted(n);\n std::copy(x.begin(), x.end(), x_local.begin());\n std::sort(x_local.begin(), x_local.end(), [](int a, int b) { return a != 0 && (a < b); });\n MPI_Gather(x_local.data(), n, MPI_INT, x_sorted.data(), n, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n x = x_sorted;\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector non_zero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n std::sort(non_zero.begin(), non_zero.end());\n int non_zero_size = non_zero.size();\n int recv_count[MPI_COMM_SIZE];\n int recv_displs[MPI_COMM_SIZE];\n MPI_Gather(&non_zero_size, 1, MPI_INT, recv_count, 1, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n recv_displs[0] = 0;\n for (int i = 1; i < MPI_COMM_SIZE; i++) {\n recv_displs[i] = recv_displs[i - 1] + recv_count[i - 1];\n }\n }\n MPI_Gatherv(non_zero.data(), non_zero_size, MPI_INT, x.data(), recv_count, recv_displs, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n int j = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] == 0) {\n continue;\n }\n x[j] = x[i];\n j++;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Allocate memory for the non-zero elements\n std::vector 
y(count);\n\n // Copy the non-zero elements to y\n int index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[index++] = x[i];\n }\n }\n\n // Sort y\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to x on rank 0\n if (rank == 0) {\n index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[index++];\n }\n }\n }\n}\n", + "\n\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n int n = x.size();\n std::vector x_nonzero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x_nonzero.push_back(x[i]);\n }\n }\n std::sort(x_nonzero.begin(), x_nonzero.end());\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = x_nonzero[count];\n count++;\n }\n }\n if (myrank == 0) {\n x.resize(n);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the gathered non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector nonZero;\n if (rank == 0) {\n nonZero.resize(count);\n }\n MPI_Gather(&x[0], x.size(), MPI_INT, &nonZero[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(nonZero.begin(), nonZero.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&nonZero[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted.begin(), sorted.end());\n\n // Scatter the 
sorted non-zero elements to rank 0\n MPI_Scatter(sorted.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Allocate memory for the sorted array\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n MPI_Gather(&x[0], x.size(), MPI_INT, &sorted_x[0], global_count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the gathered array on rank 0\n if (my_rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end());\n }\n\n // Scatter the sorted array back to all ranks\n MPI_Scatter(&sorted_x[0], global_count / nprocs, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 fills in the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Create a vector of size x.size() to store the sorted values\n std::vector sorted_x(x.size());\n\n // Create a vector of size x.size() to store the indices of the sorted values\n 
std::vector sorted_indices(x.size());\n\n // Sort the indices of x in ascending order\n std::iota(sorted_indices.begin(), sorted_indices.end(), 0);\n std::sort(sorted_indices.begin(), sorted_indices.end(),\n [&x](int i, int j) { return x[i] < x[j]; });\n\n // Create a vector of size x.size() to store the sorted values\n std::vector sorted_x(x.size());\n\n // Copy the sorted values from x into sorted_x\n for (int i = 0; i < x.size(); i++) {\n sorted_x[i] = x[sorted_indices[i]];\n }\n\n // Broadcast the sorted values to all ranks\n MPI_Bcast(&sorted_x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Copy the sorted values into x on rank 0\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to their original positions\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = i;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted.begin(), sorted.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) 
{\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) count++;\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) std::sort(non_zero.begin(), non_zero.end());\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) x[i] = -1;\n }\n\n // Sort the elements in-place on each rank\n std::sort(x.begin(), x.end());\n\n // Replace -1 with 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == -1) x[i] = 0;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(Kokkos::View const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::View sorted_x(\"sorted_x\", x.size());\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i) {\n sorted_x(i) = x(i);\n });\n Kokkos::sort(sorted_x.data(), sorted_x.data() + sorted_x.size());\n return sorted_x(k - 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n // Create a Kokkos view of the input vector\n auto x_view = Kokkos::create_mirror_view(x);\n\n // Create a Kokkos view of the output vector\n auto y_view = Kokkos::create_mirror_view(1);\n\n // Create a Kokkos kernel to find the k-th smallest element\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x_view.extent(0)),\n KOKKOS_LAMBDA(const int i, int& y) {\n if (i < k) {\n y = std::min(y, x_view(i));\n }\n },\n y_view);\n\n // Copy the output vector back to the host\n int y;\n Kokkos::deep_copy(y, y_view);\n\n return y;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n\n // Find the k-th smallest element using a parallel reduction.\n int kth_smallest = parallel_reduce(\n Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i < k) {\n kth_smallest = std::min(kth_smallest, x(i));\n }\n },\n k);\n\n return kth_smallest;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + 
"are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n auto x_copy = Kokkos::View(n);\n Kokkos::deep_copy(x_copy, x);\n\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& result) {\n if (i == k - 1) {\n result = x_copy[i];\n }\n }, tag_type());\n\n return x_copy[k - 1];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::View y(\"y\", x.size());\n Kokkos::parallel_for(\"copy\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i) {\n y(i) = x(i);\n });\n Kokkos::sort(y);\n return y(k - 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.033259011, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1785607426, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1799032418, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1806880486, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1808169188, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1793919308, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.177656423, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n using l_d_type = Kokkos::LayoutLeft;\n using l_d_view_type = Kokkos::View;\n\n const int n = x.extent(0);\n l_d_view_type y(\"y\", n);\n\n parallel_for(policy_type(0, n), KOKKOS_LAMBDA(const int& i) { y(i) = x(i); }, tag_type());\n\n Kokkos::sort(y);\n\n return y(k - 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n auto x_copy = Kokkos::View(n);\n Kokkos::deep_copy(x_copy, x);\n\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& result) {\n if (i == k - 1) {\n result = x_copy(i);\n }\n }, tag_type());\n\n return x_copy(k - 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n int n = x.extent(0);\n if (k > n) {\n throw std::invalid_argument(\"k cannot be greater than the size of the vector\");\n }\n\n auto kth_smallest = parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(const int& i, int& kth) {\n if (i < k) {\n kth = std::min(kth, x(i));\n }\n },\n std::numeric_limits::max());\n\n return kth_smallest;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": 
true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Tag = Kokkos::Tag;\n\n int n = x.extent(0);\n if (k > n) {\n throw std::invalid_argument(\"k must be less than or equal to the size of the vector\");\n }\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(Policy(0, n), x, Tag());\n\n // Return the k-th smallest element\n return x(k - 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kok\u30b3\u30af::View y = Kok\u30b3\u30af::sort(x);\n return y[k - 1];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a Kokkos view for the input vector x\n Kokkos::View x_host = Kokkos::create_mirror_view(x);\n\n // Create a Kokkos view for the output vector y\n Kokkos::View y_host = Kokkos::create_mirror_view(Kokkos::View(1));\n\n // Create a Kokkos parallel for loop to find the k-th smallest element\n Kokkos::parallel_for(\"findKthSmallest\", Kokkos::RangePolicy(0, x_host.size()), KOKKOS_LAMBDA(int i) {\n // Sort the input vector x\n std::sort(x_host.begin(), x_host.end());\n\n // Get the k-th smallest element\n y_host(0) = x_host(k - 1);\n });\n\n // Copy the output vector y back to the device\n Kokkos::deep_copy(x, y_host);\n\n // Return the k-th smallest element\n return y_host(0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n int kth_smallest = 0;\n Kokkos::parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i == k - 1) {\n kth_smallest = x(i);\n }\n },\n kth_smallest, tag_type());\n return kth_smallest;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n // Create a Kokkos view of the input vector\n View y = Kokkos::create_mirror_view(x);\n\n // Sort the vector in parallel\n parallel_sort(y.extent(0), KOKKOS_LAMBDA(int i, int j) { return y(i) < y(j); });\n\n // Return the k-th smallest element\n return y(k - 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n\n int n = x.extent(0);\n int kth_smallest = 0;\n Kokkos::parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(const 
int& i, int& kth_smallest) {\n if (i < k) {\n kth_smallest = std::max(kth_smallest, x(i));\n }\n },\n kth_smallest);\n return kth_smallest;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(int i, int j) { return x(i) < x(j); });\n\n // Return the k-th smallest element\n return x(k - 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using Kokkos::parallel_for;\n using Kokkos::Work;\n\n const int n = static_cast(x.extent(0));\n Kokkos::View y(\"y\", n);\n\n parallel_for(Work(n), KOKKOS_LAMBDA(int i) {\n y[i] = x[i];\n });\n\n std::sort(y.data(), y.data() + n);\n\n return y[k - 1];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n int n = x.extent(0);\n int kth = 0;\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(policy_t(0, n), x);\n\n // Get the k-th smallest element\n kth = x(k - 1);\n\n return kth;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n int kth_smallest = -1;\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i == k - 1) {\n kth_smallest = x(i);\n }\n }, tag_type());\n return kth_smallest;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n\n // Create a parallel reduction to find the k-th smallest element\n auto kth_smallest = reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& min) {\n if (i < k) {\n min = std::min(min, x(i));\n }\n }, min);\n\n return kth_smallest;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using view_type = Kokkos::View;\n\n // Create a copy of the input vector\n view_type y = Kokkos::create_mirror_view(x);\n\n // Sort the copy in parallel\n Kokkos::parallel_sort(y.extent(0), KOKKOS_LAMBDA(int i, int j) { return y(i) < y(j); });\n\n // Return the k-th smallest element\n return y(k - 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + 
"did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n int kth = 0;\n #pragma omp parallel for reduction(min:kth)\n for (int i = 0; i < n; i++) {\n if (x[i] < kth || kth == 0) {\n kth = x[i];\n }\n }\n return kth;\n}\n", + "\n\n // Sort the vector in parallel\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n for (int j = i + 1; j < x.size(); ++j) {\n if (x[i] > x[j]) {\n std::swap(x[i], x[j]);\n }\n }\n }\n }\n\n // Return the k-th smallest element\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n // Create a shared variable to store the k-th smallest element\n int kth_smallest;\n\n // Create a parallel region\n #pragma omp parallel\n {\n // Each thread computes the k-th smallest element of its local subarray\n int local_kth_smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n\n // Update the shared variable with the minimum of the local k-th smallest elements\n #pragma omp critical\n {\n if (local_kth_smallest < kth_smallest) {\n kth_smallest = local_kth_smallest;\n }\n }\n }\n\n // Return the k-th smallest element\n return kth_smallest;\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector 
sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(sorted_x.begin(), sorted_x.end());\n }\n }\n return sorted_x[k - 1];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "serial", + "prompt": "/* Find the k-th smallest element of the vector x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th element of the sorted vector.\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th smallest element.\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n if (k < 1 || k > n) {\n throw std::invalid_argument(\"k must be between 1 and n\");\n }\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n if (k < 1 || k > n) {\n throw std::invalid_argument(\"k must be between 1 and the size of the vector.\");\n }\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "=4", + "\n\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n", + "\n\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n", + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th smallest element.\n return x[k - 1];\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": 
"cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "hip", + "prompt": "/* Find the k-th smallest element of the vector x.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\n__global__ void findKthSmallest(const int *x, size_t N, int k, int *kthSmallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int local_min = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n local_min = min(local_min, x[i]);\n }\n atomicMin(kthSmallest, local_min);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int stride = blockDim.x;\n while (stride < N) {\n if (tid < N / stride) {\n if (kthSmallest[tid] > kthSmallest[tid + stride]) {\n int tmp = kthSmallest[tid];\n kthSmallest[tid] = kthSmallest[tid + stride];\n kthSmallest[tid + stride] = tmp;\n }\n }\n __syncthreads();\n stride *= 2;\n }\n if (tid == 0) {\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n hipSort(kthSmallest, kthSmallest + N, hipSortAscending);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int local_min = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n local_min = min(local_min, x[i]);\n }\n atomicMin(kthSmallest, local_min);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid == 0) {\n *kthSmallest = x[tid];\n } else {\n if (x[tid] < *kthSmallest) {\n *kthSmallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int numBlocks = hipGridDim_x;\n int numThreads = hipBlockDim_x;\n int numElements = numBlocks * numThreads;\n int numLevels = 0;\n while (numElements > 1) {\n numLevels++;\n numElements = (numElements + numThreads - 1) / numThreads;\n }\n for (int level = 0; level < numLevels; level++) {\n int 
stride = 1 << level;\n int offset = tid & (stride - 1);\n int other = tid ^ offset;\n if (offset < other && other < numElements) {\n if (kthSmallest[tid] > kthSmallest[other]) {\n int tmp = kthSmallest[tid];\n kthSmallest[tid] = kthSmallest[other];\n kthSmallest[other] = tmp;\n }\n }\n __syncthreads();\n }\n if (tid == 0) {\n *kthSmallest = kthSmallest[0];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = blockId * blockDim + tid;\n int end = min(start + blockDim, N);\n int local_kthSmallest = x[start];\n for (int i = start + 1; i < end; i++) {\n if (x[i] < local_kthSmallest) {\n local_kthSmallest = x[i];\n }\n }\n atomicMin(kthSmallest, local_kthSmallest);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n __shared__ int s[hipBlockDim_x];\n s[tid] = x[tid];\n __syncthreads();\n if (tid == 0) {\n hipSort(s, hipBlockDim_x);\n *kthSmallest = s[k - 1];\n }\n }\n}\n", + "K", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int *d_temp = new int[N];\n for (int i = 0; i < N; i += 2) {\n if (i + 1 < N) {\n if (kthSmallest[i] > kthSmallest[i + 1]) {\n d_temp[i / 2] = kthSmallest[i + 1];\n d_temp[i / 2 + N / 2] = kthSmallest[i];\n } else {\n d_temp[i / 2] = kthSmallest[i];\n d_temp[i / 2 + N / 2] = kthSmallest[i + 1];\n }\n } else {\n d_temp[i / 2] = kthSmallest[i];\n }\n }\n kthSmallest = d_temp;\n __syncthreads();\n if (N % 2 == 1) {\n N = N / 2 + 1;\n } else {\n N = N / 2;\n }\n while (N > 1) {\n for (int i = 0; i < N; i += 2) {\n if (i + 1 < N) {\n if (kthSmallest[i] > kthSmallest[i + 1]) {\n d_temp[i / 2] = kthSmallest[i + 1];\n d_temp[i / 2 + N / 2] = kthSmallest[i];\n } else {\n d_temp[i / 2] = kthSmallest[i];\n d_temp[i / 2 + N / 2] = kthSmallest[i + 1];\n }\n } else {\n d_temp[i / 2] = kthSmallest[i];\n }\n }\n kthSmallest = d_temp;\n __syncthreads();\n if (N % 2 == 1) {\n N = N / 2 + 1;\n } else {\n N = N / 2;\n }\n }\n *kthSmallest = kthSmallest[0];\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n int local_kthSmallest = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < local_kthSmallest) {\n local_kthSmallest = x[i];\n }\n }\n __shared__ int shared_kthSmallest;\n shared_kthSmallest = local_kthSmallest;\n for (int i = hipBlockDim_x / 2; i > 0; i /= 2) {\n __syncthreads();\n if (hipThreadIdx_x < i) {\n if (shared_kthSmallest > x[tid + i]) {\n shared_kthSmallest = x[tid + i];\n }\n }\n }\n __syncthreads();\n if (hipThreadIdx_x == 0) {\n if (k == 0) {\n *kthSmallest = shared_kthSmallest;\n } else {\n k--;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n 
kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_ranks);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_ranks);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int local_kth = x[k - 1];\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n MPI_Allreduce(&local_kth, &kth, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return kth;\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth_smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n\n int global_kth_smallest;\n 
MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements to rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among all the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the k-th smallest elements from all ranks\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int n = x.size();\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kthSmallest = x[k - 1];\n MPI_Gather(&kthSmallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (myRank == 0) {\n std::sort(kthSmallest, kthSmallest + numRanks);\n return kthSmallest[k - 1];\n }\n 
return -1;\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + ".", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(nprocs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1; // Return -1 on all other ranks\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int n = x.size();\n int local_k = k / num_procs;\n int local_start = my_rank * local_k;\n int local_end = (my_rank + 1) * local_k;\n if (my_rank == num_procs - 1) {\n local_end = n;\n }\n\n std::nth_element(x.begin() + local_start, x.begin() + local_start + local_k - 1,\n x.begin() + local_end);\n int local_kth_smallest = x[local_start + local_k - 1];\n\n int global_kth_smallest;\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN,\n MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(comm_size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (comm_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the k-th smallest element on each rank\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n int local_kth = x[k - 1];\n\n // Gather the k-th smallest elements from all ranks\n std::vector gathered_kth(size);\n MPI_Gather(&local_kth, 1, MPI_INT, gathered_kth.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element\n if (rank == 0) {\n std::nth_element(gathered_kth.begin(), gathered_kth.begin() + k - 1, gathered_kth.end());\n return gathered_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int n = x.size();\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes its local k-th smallest element\n std::sort(x.begin(), x.end());\n int local_kth_smallest = x[k - 1];\n\n // Gather all local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "cuda", + "prompt": "/* Find the k-th smallest element of the vector x.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\n__global__ void findKthSmallest(const int *x, size_t N, int k, int *kthSmallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n int val = x[gid];\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < val) {\n count++;\n }\n }\n if (count == k - 1) {\n *kthSmallest = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[0] = min(kthSmallest[0], x[tid]);\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n for (int i = 1; i < blockDim.x; i++) {\n kthSmallest[0] = min(kthSmallest[0], kthSmallest[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = tid + blockId * blockDim;\n int end = min(start + blockDim, N);\n int localKthSmallest = x[start];\n for (int i = start + 1; i < end; i++) {\n if (x[i] < localKthSmallest) {\n localKthSmallest = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n int globalKthSmallest = localKthSmallest;\n for (int i = 1; i < gridDim; i++) {\n int otherKthSmallest;\n cudaMemcpy(&otherKthSmallest, &globalKthSmallest, sizeof(int), cudaMemcpyDeviceToDevice);\n if (otherKthSmallest < globalKthSmallest) {\n globalKthSmallest = otherKthSmallest;\n }\n }\n *kthSmallest = globalKthSmallest;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockDim = blockDim.x;\n int blockIdx = blockIdx.x;\n int i = blockIdx * blockDim + tid;\n if (i < N) {\n kthSmallest[tid] = x[i];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + blockDim);\n if (blockIdx == 0) {\n *kthSmallest = kthSmallest[k - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n __shared__ int shared[blockDim.x];\n shared[threadIdx.x] = x[tid];\n __syncthreads();\n for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {\n if (threadIdx.x < stride) {\n shared[threadIdx.x] = min(shared[threadIdx.x], shared[threadIdx.x + stride]);\n }\n __syncthreads();\n }\n if (threadIdx.x == 0) {\n if (k == blockIdx.x) {\n *kthSmallest = shared[0];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n int temp;\n for (int i = 1; i < blockDim.x; i++) {\n temp = *kthSmallest;\n *kthSmallest = min(*kthSmallest, x[i]);\n x[i] = min(temp, x[i]);\n }\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n temp = *kthSmallest;\n *kthSmallest = min(*kthSmallest, x[i]);\n x[i] = min(temp, x[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int idx = bid * bid_size + tid;\n \n // Find the k-th smallest element in the current block\n int block_kth_smallest = x[idx];\n for (int i = idx + bid_size; i < N; i += grid_size * bid_size) {\n if (x[i] < block_kth_smallest) {\n block_kth_smallest = x[i];\n }\n }\n \n // Find the k-th smallest element among all 
blocks\n if (tid == 0) {\n for (int i = 1; i < grid_size; i++) {\n int other_block_kth_smallest;\n cudaMemcpy(&other_block_kth_smallest, &x[i * bid_size], sizeof(int), cudaMemcpyDeviceToDevice);\n if (other_block_kth_smallest < block_kth_smallest) {\n block_kth_smallest = other_block_kth_smallest;\n }\n }\n *kthSmallest = block_kth_smallest;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[0] = min(kthSmallest[0], x[tid]);\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < blockDim.x; i++) {\n kthSmallest[0] = min(kthSmallest[0], kthSmallest[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockDim = blockDim.x;\n int blockIdx = blockIdx.x;\n int idx = blockIdx * blockDim + tid;\n if (idx < N) {\n if (tid == 0) {\n *kthSmallest = x[idx];\n } else {\n if (x[idx] < *kthSmallest) {\n *kthSmallest = x[idx];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[0] = x[tid];\n }\n __syncthreads();\n int numThreads = blockDim.x * gridDim.x;\n while (numThreads > 1) {\n if (tid < numThreads / 2) {\n kthSmallest[tid] = min(kthSmallest[tid], kthSmallest[tid + numThreads / 2]);\n }\n __syncthreads();\n numThreads /= 2;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int val = x[tid];\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < val) {\n count++;\n }\n }\n if (count == k - 1) {\n *kthSmallest = val;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n __shared__ int shared[blockDim.x];\n shared[threadIdx.x] = x[tid];\n __syncthreads();\n for (int i = 0; i < blockDim.x; i++) {\n if (shared[threadIdx.x] > shared[i]) {\n shared[threadIdx.x] = shared[i];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n if (k == blockIdx.x + 1) {\n *kthSmallest = shared[threadIdx.x];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int my_kth_smallest = x[tid];\n for (int i = tid + 1; i < N; i++) {\n if (x[i] < my_kth_smallest) {\n my_kth_smallest = x[i];\n }\n }\n atomicMin(kthSmallest, my_kth_smallest);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n int tid_warp = tid % 32;\n int warp_id = tid / 32;\n int warp_kthSmallest = *kthSmallest;\n for (int i = 1; i < 32; i *= 2) {\n __syncthreads();\n if (tid_warp < i) {\n if (warp_kthSmallest > x[tid + i * blockDim.x]) {\n warp_kthSmallest = x[tid + i * blockDim.x];\n }\n }\n }\n __syncthreads();\n if (warp_id == 0) {\n *kthSmallest = warp_kthSmallest;\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n int other_kthSmallest;\n cudaMemcpy(&other_kthSmallest, &x[i * blockDim.x], sizeof(int), cudaMemcpyDeviceToDevice);\n if (*kthSmallest > other_kthSmallest) {\n *kthSmallest = other_kthSmallest;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n if (tid == k - 1) {\n *kthSmallest = x[tid];\n }\n }\n}\n", + "\n\n 
int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n int temp;\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (x[i] > x[j]) {\n temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n *kthSmallest = x[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int start_idx = bid * bid_size + tid;\n int end_idx = min(start_idx + bid_size, N);\n int local_kth_smallest = x[start_idx];\n for (int i = start_idx + 1; i < end_idx; i++) {\n if (x[i] < local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n atomicMin(kthSmallest, local_kth_smallest);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Find the k-th smallest element of the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth = x[k - 1];\n int global_kth;\n\n // Find the k-th smallest element on each rank\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < local_kth) {\n local_kth = x[i];\n }\n }\n }\n\n // Reduce the local k-th smallest elements to find the global k-th smallest element\n MPI_Allreduce(&local_kth, &global_kth, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n // Return the global k-th smallest element on rank 0\n if (rank == 0) {\n return global_kth;\n } else {\n return -1;\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector to store the sorted elements\n std::vector sorted_x(n);\n\n // Sort the vector in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n sorted_x[i] = x[i];\n }\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Get the k-th smallest element\n int kth_smallest = sorted_x[k - 1];\n\n // Return the k-th smallest element on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n return all_kth_smallest[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, 
nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the global k-th smallest element on rank 0\n return global_kth_smallest;\n}\n", + "\n\n int n = x.size();\n int kth = -1;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_k = k / size;\n int local_start = rank * local_k;\n int local_end = (rank + 1) * local_k;\n if (local_end > n) {\n local_end = n;\n }\n std::nth_element(x.begin() + local_start, x.begin() + local_start + local_k - 1, x.begin() + local_end);\n int local_kth = x[local_start + local_k - 1];\n MPI_Gather(&local_kth, 1, MPI_INT, &kth, 1, MPI_INT, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n std::vector kth_vec(size);\n for (int i = 0; i < size; i++) {\n kth_vec[i] = kth[i];\n }\n std::nth_element(kth_vec.begin(), kth_vec.begin() + k - 1, kth_vec.end());\n kth = kth_vec[k - 1];\n }\n return kth;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kth_smallest;\n MPI_Gather(&x[k - 1], 1, MPI_INT, &kth_smallest, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the local kth smallest element\n std::nth_element(x.begin() + rank * x.size() / size,\n x.begin() + rank * x.size() / size + x.size() / size - 1,\n x.begin() + rank * x.size() / size + k - 1);\n int local_kth = *(x.begin() + rank * x.size() / size + k - 1);\n\n // Gather all local kth smallest elements\n std::vector all_local_kth(size);\n MPI_Gather(&local_kth, 1, MPI_INT, all_local_kth.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global kth smallest element\n if (rank == 0) {\n std::nth_element(all_local_kth.begin(), all_local_kth.begin() + k - 1, all_local_kth.end());\n return all_local_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_k = k / size;\n int local_remainder = k % size;\n int local_start = rank * local_k;\n int local_end = local_start + local_k;\n if (rank < local_remainder) {\n local_start += rank;\n local_end += rank + 1;\n } else {\n local_start += local_remainder;\n local_end += local_remainder;\n }\n\n std::vector local_x(x.begin() + local_start, x.begin() + local_end);\n std::sort(local_x.begin(), local_x.end());\n\n int local_kth_smallest = local_x[local_k - 1];\n if (rank == 0) {\n std::vector kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest.begin(), kth_smallest.end());\n return kth_smallest[k - 1];\n } else {\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kth_smallest;\n MPI_Gather(&x[k - 1], 1, MPI_INT, &kth_smallest, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (rank == 0) {\n std::vector received_elements(size);\n MPI_Gather(&x[k - 1], 1, MPI_INT, received_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(received_elements.begin(), received_elements.end());\n return received_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the global k-th smallest element on rank 0\n return global_kth_smallest;\n}\n", + "\n\n // Find the local k-th smallest element on each rank.\n int local_kth_smallest = *std::max_element(x.begin(), x.end());\n int global_kth_smallest;\n\n // Find the global k-th smallest element.\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN);\n\n // Find the k-th smallest element on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n } else {\n return global_kth_smallest;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), 
all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the result on rank 0\n return global_kth_smallest;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int local_k = k / nprocs;\n int local_remainder = k % nprocs;\n\n // Find the local k-th smallest element on each rank\n std::nth_element(x.begin(), x.begin() + local_k + (rank == nprocs - 1 ? local_remainder : 0), x.end());\n int local_kth_smallest = x[local_k + (rank == nprocs - 1 ? local_remainder : 0)];\n\n // Gather all local k-th smallest elements on rank 0\n std::vector kth_smallest_elements(nprocs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (rank == 0) {\n std::nth_element(kth_smallest_elements.begin(), kth_smallest_elements.begin() + k - 1, kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth = x[0];\n int global_kth;\n\n // Find the k-th smallest element on each rank\n #pragma omp parallel reduction(min : local_kth)\n {\n #pragma omp for\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < local_kth) {\n local_kth = x[i];\n }\n }\n }\n\n // Gather the local k-th smallest elements from all ranks\n MPI_Gather(&local_kth, 1, MPI_INT, &global_kth, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element\n if (rank == 0) {\n std::vector gathered_kth(size);\n for (int i = 0; i < size; i++) {\n gathered_kth[i] = global_kth[i];\n }\n std::sort(gathered_kth.begin(), gathered_kth.end());\n return gathered_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector gathered_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, gathered_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(gathered_elements.begin(), gathered_elements.end());\n global_kth_smallest = gathered_elements[k - 1];\n }\n\n return global_kth_smallest;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x.\n std::sort(x.begin(), x.end());\n\n // Each rank 
sends its k-th smallest element to rank 0.\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element.\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n return all_kth_smallest[k - 1];\n }\n\n return -1; // Dummy return value for non-rank-0 processes.\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int local_k = k / num_ranks;\n int local_start = my_rank * local_k;\n int local_end = (my_rank + 1) * local_k;\n\n std::sort(x.begin() + local_start, x.begin() + local_end);\n\n int local_kth_smallest = x[local_start + local_k - 1];\n\n int global_kth_smallest;\n MPI_Reduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the k-th smallest element of its local data\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n int local_kth_smallest = x[k - 1];\n\n // Gather all local k-th smallest elements to rank 0\n std::vector all_local_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_local_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among all local k-th smallest elements\n if (rank == 0) {\n std::nth_element(all_local_kth_smallest.begin(), all_local_kth_smallest.begin() + k - 1, all_local_kth_smallest.end());\n return all_local_kth_smallest[k - 1];\n } else {\n return -1; // Dummy return value for non-rank-0 processes\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use Kokkos to sort in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(Kokkos::View &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA (const int& i, const int& j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return 
results(i)->startTime < results(j)->startTime;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using exec_space = KokKOS::EXECSPACE;\n using device_type = KokKOS::DEVICE_TYPE;\n\n auto policy = KokKOS::parallel_sort_ascending(results,\n [](const Result &a, const Result &b) { return a.duration < b.duration; },\n exec_space(),\n device_type());\n auto result = KokKOS::sort_view(results, policy);\n results.copy(result);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) -> bool { return results(i)->startTime < results(j)->startTime; });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int i, const int j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int& i, const int& j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) 
{\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::sort(results, Kokkos::Experimental::compare_fun(\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; }));\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, results.extent(0)),\n [results](const int &i, const int &j) { return results(i)->startTime < results(j)->startTime; });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use OpenMP to sort in parallel.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + ".", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = 0; j < results.size() - 1; j++) {\n if (results[j].startTime > results[j + 1].startTime) {\n std::swap(results[j], results[j + 1]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < 
results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < static_cast(results.size()); i++) {\n for (int j = i + 1; j < static_cast(results.size()); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size() - 1; i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n#pragma opm parallel\n#pragma opm reduction(merge:results:opm_sort(results, [](const Result& a, const Result& b) { return a.startTime < b.startTime; }))\n{\n#pragma opm sections\n{\n#pragma opm section\n{\n std::sort(results.begin(), results.end(), [](const Result& a, const Result& b) { return a.startTime < b.startTime; });\n}\n}\n}\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size() - 1; i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma opm parallel\n {\n #pragma opm for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n 
for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "serial", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "<*>", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.start < b.start;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < 
b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "hip", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use AMD HIP to sort in parallel. The kernel is launched with at least as many threads as there are elements.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\n__global__ void sortByStartTime(Result *results, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) {\n return;\n }\n\n Result temp = results[idx];\n int minIdx = idx;\n for (int i = idx + 1; i < N; i++) {\n if (results[i].startTime < results[minIdx].startTime) {\n minIdx = i;\n }\n }\n\n results[idx] = results[minIdx];\n results[minIdx] = temp;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid >= N) return;\n for (int i = gid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > 
results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n __shared__ Result sharedResults[256];\n sharedResults[tid] = results[gid];\n __syncthreads();\n int stride = 256;\n while (stride > 0) {\n if (tid < stride && tid + stride < N && sharedResults[tid].startTime > sharedResults[tid + stride].startTime) {\n Result temp = sharedResults[tid];\n sharedResults[tid] = sharedResults[tid + stride];\n sharedResults[tid + stride] = temp;\n }\n __syncthreads();\n stride /= 2;\n }\n results[gid] = sharedResults[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid >= N) {\n return;\n }\n for (int i = gid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if 
(results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use MPI to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of results. Store the output in results on rank 0.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n results = gatheredResults;\n } else {\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks 
to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.start < b.start; });\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n std::vector gatheredResults;\n for (int i = 1; i < size; i++) {\n int numResults;\n MPI_Recv(&numResults, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n std::vector recvResults(numResults);\n MPI_Recv(recvResults.data(), numResults * sizeof(Result), MPI_BYTE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n gatheredResults.insert(gatheredResults.end(), recvResults.begin(), recvResults.end());\n }\n results = gatheredResults;\n } else {\n // Send the number of results and the results to rank 0\n int numResults = results.size();\n MPI_Send(&numResults, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);\n MPI_Send(results.data(), numResults * sizeof(Result), MPI_BYTE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), 
results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(size * results.size());\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; 
});\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the sorted results on rank 0\n if (rank == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort locally\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather all results to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_DOUBLE, gatheredResults.data(), results.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Sort gathered results\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "cuda", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use CUDA to sort in parallel. 
The kernel is launched with at least as many threads as there are elements.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\n__global__ void sortByStartTime(Result *results, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[i].startTime < results[tid].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for 
(int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + ".", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result tmp = results[tid];\n results[tid] = results[i];\n results[i] = tmp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use MPI and OpenMP to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of results. 
Store the outputs in results on rank 0.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks on rank 0\n std::vector allResults(results.size() * numRanks);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, allResults.data(), results.size(), MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n\n // Sort all results on rank 0\n if (myRank == 0) {\n std::sort(allResults.begin(), allResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = allResults;\n }\n}\n", + "\n\n int numRanks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather all results to rank 0\n std::vector allResults;\n if (rank == 0) {\n allResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, allResults.data(), results.size(),\n MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort all results on rank 0\n if (rank == 0) {\n std::sort(allResults.begin(), allResults.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = allResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(size * results.size());\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, gatheredResults.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the gathered results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, gatheredResults.data(), results.size(), MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, nullptr, 0, MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), 
results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, gatheredResults.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results = gatheredResults;\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the outputs in results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result 
&b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Sort results by start time using MPI_Allgather and OpenMP\n int numRanks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort results on each rank\n #pragma openmp parallel\n {\n #pragma openmp for\n for (int i = 0; i < results.size(); i++) {\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n }\n }\n\n // Gather sorted results on rank 0\n std::vector sortedResults;\n if (rank == 0) {\n int totalSize = results.size() * numRanks;\n MPI_Gather(&results[0], results.size(), MPI_DATATYPE_RESULT, &results[0], results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results.resize(totalSize);\n } else {\n MPI_Gather(&results[0], results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Get the number of processes\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Get the rank of the process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the results on each process\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather the sorted results from all processes\n std::vector sorted_results(results.size() * num_procs);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT,\n sorted_results.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n\n // Copy the sorted results back to the input vector on rank 0\n if (rank == 0) {\n results = std::move(sorted_results);\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the gathered results in results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n 
std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recvResults;\n MPI_Recv(&recvResults, results.size(), MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n results.insert(results.end(), recvResults.begin(), recvResults.end());\n }\n } else {\n MPI_Send(&results, results.size(), MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Sort the array x of complex numbers by their magnitude in ascending order.\n Use Kokkos to sort in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": 
"\n\n Kokkos::parallel_sort(x.extent(0), [&](int i, int j) -> bool {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use OpenMP to sort in parallel.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Sort the vector in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the number of threads\n int num_threads = omp_get_num_threads();\n\n // Get the thread ID\n int thread_id = omp_get_thread_num();\n\n // Sort the portion of the vector assigned to this thread\n std::sort(x.begin() + thread_id, x.begin() + thread_id + num_threads);\n }\n\n // Merge the sorted portions of the vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "Magnitude:1. Moderato Moderato Moderato: 0. 
Moderato: Moderato: Moderato:forte forte forte piano piano piano piano piano piano piano: piano: piano: Moderato: piano: piano: forte forte forte: forte forte forte: forte: forte: piano piano piano piano piano piano: Piano: Piano: Moderato: Moderato: Moderato: piano: piano piano: forte forte forte: forte: forte: forte: Moderato:", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::polar(std::abs(x[i]), std::arg(x[i]));\n }\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n for (size_t j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::complex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "(x)", + "\n\n // Sort the vector x by the magnitude of its elements in ascending order.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::complex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n // Sort the vector x by the magnitude of its elements in ascending order.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp 
parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "serial", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + ":", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return abs(a) < abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "hip", + "prompt": 
"/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use AMD HIP to sort in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\n__global__ void sortComplexByMagnitude(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double mag = hipCrealf(x[tid]) * hipCrealf(x[tid]) + hipCrealf(x[tid]) * hipCrealf(x[tid]);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n double temp = x[tid].x;\n x[tid].x = x[i].x;\n x[i].x = temp;\n hipDoubleComplex temp_c = x[tid];\n x[tid] = x[i];\n x[i] = temp_c;\n }\n }\n }\n}\n", + "\n\n // Compute the magnitude of each complex number\n float *mag = new float[N];\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes\n std::sort(mag, mag + N);\n\n // Sort the complex numbers by their magnitudes\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (mag[i] == sqrt(x[j].x * x[j].x + x[j].y * x[j].y)) {\n x[i] = x[j];\n break;\n }\n }\n }\n\n delete[] mag;\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Get the block index\n int bid = hipBlockIdx_x;\n\n // Calculate the index of the element to be sorted\n int i = bid * hipBlockDim_x + tid;\n\n // Check if the index is within the bounds of the array\n if (i < N) {\n // Get the magnitude of the complex number\n double magnitude = hip_abs(x[i]);\n\n // Sort the complex number by its magnitude\n for (int j = i + 1; j < N; j++) {\n double magnitude2 = hip_abs(x[j]);\n if (magnitude > magnitude2) {\n // Swap the complex numbers\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n\n // Update the magnitude of the complex number\n magnitude = magnitude2;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double mag = hip_abs(x[gid]);\n x[gid] = make_hipDoubleComplex(mag, 0.0);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double magnitude = hip_abs(x[tid]);\n x[tid] = make_hipDoubleComplex(magnitude, 0.0);\n }\n __syncthreads();\n hipSort(x, x + N, hipDoubleComplexGreater);\n __syncthreads();\n if (tid < N) {\n double real = hip_real(x[tid]);\n double imag = hip_imag(x[tid]);\n x[tid] = make_hipDoubleComplex(real, imag);\n }\n}\n", + "\n\n // Get thread and block indices\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n\n // Compute the index of the element to be processed by this thread\n int index = bid * bid_size + tid;\n\n // If the index is out of range, exit\n if (index >= N) {\n return;\n }\n\n // Get the current element\n hipDoubleComplex current_element = x[index];\n\n // Compute the magnitude of the current element\n double magnitude = hip_abs(current_element);\n\n // Find the index of the element with the smallest magnitude so far\n int min_index = index;\n for (int i = index + 1; i < N; i++) {\n double current_magnitude = hip_abs(x[i]);\n if (current_magnitude < magnitude) {\n min_index = i;\n magnitude = current_magnitude;\n }\n }\n\n // Swap the current element with the element 
with the smallest magnitude so far\n hipDoubleComplex temp = x[index];\n x[index] = x[min_index];\n x[min_index] = temp;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = hipMag(x[tid]);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n hipSort(x, x + N, hipDoubleComplexCompare);\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n hipDoubleComplex c = x[tid];\n x[tid] = hipDoubleComplex{sqrt(c.x * c.x + c.y * c.y), 0.0};\n }\n}\n", + "\n\n // Sort the array using the radix sort algorithm\n for (int i = 0; i < N; i++) {\n int key = abs(x[i]);\n int j = i - 1;\n\n while (j >= 0 && abs(x[j]) > key) {\n x[j + 1] = x[j];\n j--;\n }\n x[j + 1] = hipDoubleComplex{key, 0};\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Sort the local part of the array\n std::sort(x + tid, x + tid + hipBlockDim_x);\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted parts of the array\n for (int stride = hipBlockDim_x; stride < N; stride *= 2) {\n for (int i = 0; i < N; i += 2 * stride) {\n if (hipAbs(x[i]) > hipAbs(x[i + stride])) {\n std::swap(x[i], x[i + stride]);\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Get the block index\n int bid = hipBlockIdx_x;\n\n // Calculate the index of the element to be sorted\n int index = bid * hipBlockDim_x + tid;\n\n // Check if the index is within the bounds of the array\n if (index < N) {\n // Get the magnitude of the complex number\n double magnitude = hipMag(x[index]);\n\n // Sort the complex number by its magnitude\n for (int i = index + 1; i < N; i++) {\n double magnitude_i = hipMag(x[i]);\n if (magnitude > magnitude_i) {\n // Swap the complex numbers\n hipDoubleComplex temp = x[index];\n x[index] = x[i];\n x[i] = temp;\n\n // Update the magnitude of the complex number\n magnitude = magnitude_i;\n }\n }\n }\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Sort the local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + hipBlockDim_x; j < N; j += hipBlockDim_x) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n // Compute the magnitude of each complex number\n float *mag = new float[N];\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes in ascending order\n std::sort(mag, mag + N);\n\n // Sort the complex numbers by their magnitude\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (mag[i] == sqrt(x[j].x * x[j].x + x[j].y * x[j].y)) {\n x[i] = x[j];\n break;\n }\n }\n }\n\n delete[] mag;\n}\n", + "\n\n // Calculate the magnitude of each complex number\n float *mag = (float *)malloc(N * sizeof(float));\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrtf(x[i].x * x[i].x + 
x[i].y * x[i].y);\n }\n\n // Sort the magnitudes using the hipSort function\n hipSort(mag, N, hipFloat);\n\n // Sort the complex numbers based on the sorted magnitudes\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (mag[i] > mag[j]) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n free(mag);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double magnitude = hipMagnitude(x[tid]);\n x[tid].x = magnitude;\n }\n __syncthreads();\n if (tid < N) {\n hipSort(x, N, sizeof(hipDoubleComplex), hipDoubleComplexMagnitudeCompare);\n }\n}\n", + "\n\n // Sort the vector x by magnitude in ascending order.\n // Use AMD HIP to sort in parallel. The kernel is launched with at least as many threads as elements in x.\n // Example:\n //\n // input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n // output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n\n // Get the thread index.\n int tid = hipThreadIdx_x;\n\n // Get the block index.\n int bid = hipBlockIdx_x;\n\n // Get the block size.\n int bsize = hipBlockDim_x;\n\n // Get the grid size.\n int gsize = hipGridDim_x;\n\n // Calculate the starting index for this thread.\n int start = tid + bid * bsize;\n\n // Calculate the ending index for this thread.\n int end = min(start + bsize, N);\n\n // Sort the elements in this thread's range.\n sort(x + start, x + end, [](hipDoubleComplex a, hipDoubleComplex b) { return abs(a) < abs(b); });\n\n // Synchronize all threads.\n hipDeviceSynchronize();\n\n // If this is the first thread in the grid, merge the sorted ranges.\n if (tid == 0) {\n for (int i = 1; i < gsize; i++) {\n // Get the starting index for this thread.\n int start = i * bsize;\n\n // Get the ending index for this thread.\n int end = min(start + bsize, N);\n\n // Merge the sorted ranges.\n merge(x, x + start, x + start, x + end, x);\n }\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Get the block ID\n int bid = hipBlockIdx_x;\n\n // Get the block size\n int bsize = hipBlockDim_x;\n\n // Get the number of blocks\n int nblocks = hipGridDim_x;\n\n // Calculate the starting index for this thread\n int start = tid + bid * bsize;\n\n // Calculate the ending index for this thread\n int end = min(start + bsize, N);\n\n // Sort the local portion of the array\n sort(x + start, x + end, [](hipDoubleComplex a, hipDoubleComplex b) {\n return abs(a) < abs(b);\n });\n\n // Synchronize all threads\n hipDeviceSynchronize();\n\n // Merge the sorted local portions of the array\n for (int i = 1; i < nblocks; i *= 2) {\n if (bid % (2 * i) == 0) {\n int partner = bid + i;\n if (partner < nblocks) {\n int start = i * bsize;\n int end = min(start + 2 * bsize, N);\n merge(x + start, x + start + bsize, x + start + bsize, x + end,\n x + start);\n }\n }\n hipDeviceSynchronize();\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Sort the local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted local elements\n for (int s = 1; s < N; s *= 2) {\n for (int i = tid; i < N; i += 2 * s) {\n int j = i + s;\n if (j < N && hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n // Sort the elements in the current block\n 
__shared__ hipDoubleComplex block[1024];\n block[threadIdx.x] = x[threadIdx.x];\n __syncthreads();\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(block[i]) > hipAbs(block[j])) {\n hipDoubleComplex temp = block[i];\n block[i] = block[j];\n block[j] = temp;\n }\n }\n }\n __syncthreads();\n\n // Copy the sorted elements back to the global memory\n x[threadIdx.x] = block[threadIdx.x];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use MPI to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered;\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, store the gathered vector in x\n if (rank == 0) {\n x = gathered;\n }\n}\n", + "\n\n // Get the size and rank of the current process\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the local vector of complex numbers\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), 
MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector to rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vectors on rank 0\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vectors from rank 0 to all ranks\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) 
{\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::sort(x.begin(), x.end(), [&magnitudes](std::complex a, std::complex b) {\n return magnitudes[a] < magnitudes[b];\n });\n\n // Gather the sorted vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector> recv;\n MPI::COMM_WORLD.Recv(recv, i, 0);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI::COMM_WORLD.Send(x, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Collect the sorted vectors on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(&recv, x.size(), MPI_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = std::move(sorted_x);\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), 
MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector to rank 0\n MPI_Scatter(gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Replace the original vector on rank 0 with the sorted vector\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the size and rank of the communicator\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted;\n if (rank == 0) {\n sorted.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, comm);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(sorted.begin(), sorted.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(sorted.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, comm);\n}\n", + "\n\n // Get the size and rank of the communicator\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), 
magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::vector> sorted_x(n);\n for (int i = 0; i < n; i++) {\n for (int j = 0; j < n; j++) {\n if (std::abs(x[j]) == magnitudes[i]) {\n sorted_x[i] = x[j];\n break;\n }\n }\n }\n\n // Gather the sorted vector on rank 0\n MPI_Gather(&sorted_x[0], n, MPI_DOUBLE_COMPLEX, &x[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n // Get the magnitude of the complex number\n double magnitude = std::abs(x[i]);\n // Store the magnitude in the vector\n mag[i] = magnitude;\n }\n\n // Sort the vector of magnitudes\n std::sort(mag.begin(), mag.end());\n\n // Sort the vector of complex numbers by magnitude\n std::sort(x.begin(), x.end(), [&](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::vector> sorted_x(n);\n for (int i = 0; i < n; i++) {\n for (int j = 0; j < n; j++) {\n if (std::abs(x[j]) == magnitudes[i]) {\n sorted_x[i] = x[j];\n break;\n }\n }\n }\n\n // Gather the sorted vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector> recv_x(n);\n MPI::COMM_WORLD.Recv(recv_x.data(), n, MPI::DOUBLE, i, 0);\n sorted_x.insert(sorted_x.end(), recv_x.begin(), recv_x.end());\n }\n } else {\n MPI::COMM_WORLD.Send(sorted_x.data(), n, MPI::DOUBLE, 0, 0);\n }\n\n // Update the vector x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n x = sorted_x;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "cuda", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use CUDA to sort in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\n__global__ void sortComplexByMagnitude(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(mag, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n cuDoubleComplex tmp = x[tid];\n int i = tid;\n while (i > 0 && cuCabs(x[i - 1]) > cuCabs(tmp)) {\n x[i] = x[i - 1];\n i--;\n }\n x[i] = tmp;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid].x = x[tid].y;\n x[tid].y = 0.0;\n }\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ cuDoubleComplex block_data[256];\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int block_size = blockDim.x;\n int start = bid * block_size;\n int end = min(start + block_size, N);\n for (int i = start; i < end; i++) {\n block_data[tid] = x[i];\n __syncthreads();\n for (int j = tid + 1; j < block_size; j++) {\n if (abs(block_data[tid]) > abs(block_data[j])) {\n cuDoubleComplex temp = block_data[tid];\n block_data[tid] = block_data[j];\n block_data[j] = temp;\n }\n }\n __syncthreads();\n }\n\n // Copy the sorted elements back to the global memory\n for (int i = start; i < end; i++) {\n x[i] = block_data[tid];\n __syncthreads();\n }\n}\n", + "\n\n // Get the thread ID\n int tid = threadIdx.x;\n\n // Sort the local part of the array\n thrust::sort(x + tid, x + tid + min(N - tid, blockDim.x),\n [](cuDoubleComplex a, cuDoubleComplex b) { return cuCabs(a) < cuCabs(b); });\n\n // Synchronize threads\n __syncthreads();\n\n // Merge the sorted parts of the array\n for (int s = blockDim.x; s <= N; s *= 2) {\n if (tid < N - s) {\n cuDoubleComplex a = x[tid];\n cuDoubleComplex b = x[tid + s];\n if (cuCabs(a) > cuCabs(b)) {\n x[tid] = b;\n x[tid + s] = a;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = cuCabs(x[tid]);\n x[tid] = cuCmul(x[tid], cuCdiv(1.0, magnitude));\n }\n}\n", + "\n\n // Get the thread index\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n\n // If the thread index is within the bounds of the array, sort the element\n if (tid < N) {\n // Calculate the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n\n // Sort the element by magnitude\n for (int i = tid + 1; i < N; i++) {\n double otherMagnitude = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n if (magnitude > otherMagnitude) {\n // Swap the elements\n cuDoubleComplex temp = x[tid];\n x[tid] = x[i];\n x[i] = temp;\n\n // Update the magnitude of the current element\n magnitude = otherMagnitude;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = mag;\n x[tid].y = 0.0;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n cuDoubleComplex 
tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid].x = creal(x[tid]);\n x[tid].y = cimag(x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n while (i < N) {\n cuDoubleComplex temp;\n for (int j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = magnitude;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n double temp = x[tid].x;\n x[tid].x = x[i].x;\n x[i].x = temp;\n cuDoubleComplex temp2 = x[tid];\n x[tid] = x[i];\n x[i] = temp2;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n for (int i = tid + 1; i < N; i++) {\n double mag_i = cuCabs(x[i]);\n if (mag > mag_i) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n mag = mag_i;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= N) return;\n \n // Compute the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n \n // Store the magnitude in shared memory\n __shared__ double magnitudes[N];\n magnitudes[tid] = magnitude;\n \n // Synchronize threads\n __syncthreads();\n \n // Sort the magnitudes in shared memory\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (magnitudes[i] > magnitudes[j]) {\n double temp = magnitudes[i];\n magnitudes[i] = magnitudes[j];\n magnitudes[j] = temp;\n }\n }\n }\n \n // Synchronize threads\n __syncthreads();\n \n // Find the index of the complex number with the given magnitude\n int index = -1;\n for (int i = 0; i < N; i++) {\n if (magnitudes[i] == magnitude) {\n index = i;\n break;\n }\n }\n \n // Swap the complex numbers\n cuDoubleComplex temp = x[tid];\n x[tid] = x[index];\n x[index] = temp;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= N) return;\n \n double magnitude = cuCabs(x[tid]);\n x[tid] = cuCmul(x[tid], cuCdiv(1.0, magnitude));\n \n __shared__ double shared_magnitude[blockDim.x];\n shared_magnitude[threadIdx.x] = magnitude;\n __syncthreads();\n \n for (int i = 0; i < blockDim.x; i++) {\n if (tid + i + 1 < N && shared_magnitude[tid] > shared_magnitude[tid + i + 1]) {\n double temp = shared_magnitude[tid];\n shared_magnitude[tid] = shared_magnitude[tid + i + 1];\n shared_magnitude[tid + i + 1] = temp;\n \n cuDoubleComplex temp_complex = x[tid];\n x[tid] = x[tid + i + 1];\n x[tid + i + 1] = temp_complex;\n }\n }\n \n}\n", + "\n\n // Each thread sorts a single element of x.\n int tid = threadIdx.x;\n if (tid < N) {\n // Compute the magnitude of the complex number.\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n // Store the magnitude in the complex number.\n x[tid].x = magnitude;\n }\n // Sort the complex numbers by their magnitude.\n thrust::sort(x, x + N, thrust::less());\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ cuDoubleComplex block[32];\n block[threadIdx.x] = x[blockIdx.x * blockDim.x + threadIdx.x];\n __syncthreads();\n for (int i = 0; i < blockDim.x; i++) {\n for (int j = i + 1; j < blockDim.x; j++) {\n if (cuCabsf(block[i]) > cuCabsf(block[j])) {\n cuDoubleComplex temp = block[i];\n block[i] = 
block[j];\n block[j] = temp;\n }\n }\n }\n __syncthreads();\n\n // Copy the sorted elements back to the global memory\n x[blockIdx.x * blockDim.x + threadIdx.x] = block[threadIdx.x];\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n cuDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n // Each thread sorts a single element\n int tid = threadIdx.x;\n if (tid < N) {\n // Compute the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n // Store the magnitude in shared memory\n __shared__ double magnitudes[N];\n magnitudes[tid] = magnitude;\n // Synchronize threads\n __syncthreads();\n // Sort the magnitudes in shared memory\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (magnitudes[i] > magnitudes[j]) {\n double temp = magnitudes[i];\n magnitudes[i] = magnitudes[j];\n magnitudes[j] = temp;\n }\n }\n }\n // Synchronize threads\n __syncthreads();\n // Find the index of the sorted magnitude in the original array\n int index = -1;\n for (int i = 0; i < N; i++) {\n if (magnitudes[tid] == sqrt(x[i].x * x[i].x + x[i].y * x[i].y)) {\n index = i;\n break;\n }\n }\n // Swap the complex number at the current index with the complex number at the sorted index\n if (index != -1) {\n cuDoubleComplex temp = x[tid];\n x[tid] = x[index];\n x[index] = temp;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(magnitude, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n x[tid] = make_cuDoubleComplex(creal(x[tid]), cimag(x[tid]));\n }\n}\n", + "\n\n // Get the index of the current element\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n\n // If the index is out of bounds, return\n if (idx >= N) {\n return;\n }\n\n // Get the magnitude of the current element\n double magnitude = abs(x[idx]);\n\n // Find the index of the element with the smallest magnitude\n int minIdx = idx;\n for (int i = idx + 1; i < N; i++) {\n double otherMagnitude = abs(x[i]);\n if (otherMagnitude < magnitude) {\n minIdx = i;\n magnitude = otherMagnitude;\n }\n }\n\n // Swap the current element with the element with the smallest magnitude\n if (minIdx != idx) {\n double temp = x[idx];\n x[idx] = x[minIdx];\n x[minIdx] = temp;\n }\n}\n", + "\n\n // Sort the complex numbers by their magnitude in ascending order.\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n cuDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(mag, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n int i = tid;\n while (i > 0 && cuCreal(x[i]) < cuCreal(x[i - 1])) {\n swap(x[i], x[i - 1]);\n i--;\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid] = make_cuDoubleComplex(cuCreal(x[tid]), cuCimag(x[tid]));\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use MPI and OpenMP to sort in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector x on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector of complex numbers.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(size * x.size());\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector x on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector x.\n if (rank != 0) {\n x.clear();\n }\n\n // Copy the sorted vector to x on rank 0.\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Sort the vector x on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vector x from all ranks on rank 0\n std::vector> sorted_x(x.size() * nranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_PRECISION, sorted_x.data(), x.size(), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector x on rank 0\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Scatter the sorted vector x from rank 0 to all ranks\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_PRECISION, x.data(), x.size(), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n 
int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and 
store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the local vector.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all other ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int nprocs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * nprocs);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vector and store it in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Sort the vector x on each rank by magnitude in ascending order.\n 
std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> gathered_x(x.size() * MPI::world().size());\n MPI::world().all_gather(x.data(), x.size(), MPI::DOUBLE_REAL, gathered_x.data(), x.size(), MPI::DOUBLE_REAL);\n\n // Replace the vector x on rank 0 with the gathered sorted vectors.\n if (MPI::world().rank() == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, copy the sorted vector back to x.\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector x.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ii++) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); jj++) {\n if (input(ii, jj) == 1) { num_neighbors++; }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++num_ones;\n }\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ii++) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); jj++) {\n if (ii == i && jj == j) continue;\n num_ones += (input(ii, jj) == 1);\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++num_ones;\n }\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": 
false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int numNeighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && input(ii, jj) == 1) {\n ++numNeighbors;\n }\n }\n }\n output(i, j) = (numNeighbors == 1) ? 1 : 0;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "Kokkoss_t_t", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy<>(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++count;\n }\n }\n }\n output(i, j) = (count == 1);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 
0.131227913, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.4386392404, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2199941088, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1111058794, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0553032454, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0278194742, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0136047368, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const size_t i, const size_t j) {\n int sum = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input(ii, jj);\n }\n }\n }\n output(i, j) = (sum == 1);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& j) {\n int count = 0;\n if (i > 0) count += input(i - 1, j);\n if (i < N - 1) count += 
input(i + 1, j);\n if (j > 0) count += input(i, j - 1);\n if (j < N - 1) count += input(i, j + 1);\n output(i, j) = (count == 1);\n });\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { sum += input(ii, jj); }\n }\n }\n output(i, j) = (sum == 1);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (ii == i && jj == j) continue;\n num_neighbors += input(ii, jj);\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && input(ii, jj) == 1) {\n num_neighbors++;\n }\n }\n }\n output(i, j) = (num_neighbors == 1) ? 1 : 0;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"cells\", Kokkos::RangePolicy>(N, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++count;\n }\n }\n }\n output(i, j) = (count == 1);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. 
Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1][j];\n if (i < N - 1) num_ones += input[i + 1][j];\n if (j > 0) num_ones += input[i][j - 1];\n if (j < N - 1) num_ones += input[i][j + 1];\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) {\n count += input[i - 1][j];\n }\n if (i < N - 1) {\n count += input[i + 1][j];\n }\n if (j > 0) {\n count += input[i][j - 1];\n }\n if (j < N - 1) {\n count += input[i][j + 1];\n }\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma opm simd\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) 
num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0 && input[i - 1][j] == 1) count++;\n if (i < N - 1 && input[i + 1][j] == 1) count++;\n if (j > 0 && input[i][j - 1] == 1) count++;\n if (j < N - 1 && input[i][j + 1] == 1) count++;\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1 + j * N];\n if (i < N - 1) num_neighbors += input[i + 1 + j * N];\n if (j > 0) num_neighbors += input[i + (j - 1) * N];\n if (j < N - 1) num_neighbors += input[i + (j + 1) * N];\n output[i + j * N] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j] == 1) ++num_neighbors;\n if (j > 0 && input[i][j - 1] == 1) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1] == 1) ++num_neighbors;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1][j];\n if (i < N - 1) num_ones += input[i + 1][j];\n if (j > 0) num_ones += input[i][j - 1];\n if (j < N - 1) num_ones += input[i][j + 1];\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n size_t num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if 
(j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "serial", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0 && input[i - 1][j] == 1) ++count;\n if (i < N - 1 && input[i + 1][j] == 1) ++count;\n if (j > 0 && input[i][j - 1] == 1) ++count;\n if (j < N - 1 && input[i][j + 1] == 1) ++count;\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 
0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (i + di >= 0 && i + di < N && j + dj >= 0 && j + dj < N) {\n num_ones += input[(i + di) * N + j + dj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (size_t k = i - 1; k <= i + 1; k++) {\n for (size_t l = j - 1; l <= j + 1; l++) {\n if (k >= 0 && k < N && l >= 0 && l < N && (k != i || l != j)) {\n num_neighbors += input[k * N + l];\n }\n }\n }\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j] == 1) ++num_neighbors;\n if (j > 0 && input[i][j - 1] == 1) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1] == 1) ++num_neighbors;\n output[i * N + j] = (num_neighbors == 1);\n 
}\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i][j] = (num_neighbors == 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "hip", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\n__global__ void cellsXOR(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) { sum += input[ny * N + nx]; }\n }\n }\n output[y * N + x] = (sum == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = count == 1 ? 
1 : 0;\n }\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n if (x > 0) num_neighbors += input[y * N + x - 1];\n if (x < N - 1) num_neighbors += input[y * N + x + 1];\n if (y > 0) num_neighbors += input[(y - 1) * N + x];\n if (y < N - 1) num_neighbors += input[(y + 1) * N + x];\n output[y * N + x] = (num_neighbors == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int count = 0;\n if (i > 0) count += input[i - 1 + j * N];\n if (i < N - 1) count += input[i + 1 + j * N];\n if (j > 0) count += input[i + (j - 1) * N];\n if (j < N - 1) count += input[i + (j + 1) * N];\n output[i + j * N] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1 + j * N];\n if (i < N - 1) num_ones += input[i + 1 + j * N];\n if (j > 0) num_ones += input[i + (j - 1) * N];\n if (j < N - 1) num_ones += input[i + (j + 1) * N];\n output[i + j * N] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int num_ones = 0;\n if (x - 1 >= 0) num_ones += input[(x - 1) * N + y];\n if (x + 1 < N) num_ones += input[(x + 1) * N + y];\n if (y - 1 >= 0) num_ones += input[x * N + y - 1];\n if (y + 1 < N) num_ones += input[x 
* N + y + 1];\n output[x * N + y] = (num_ones == 1);\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int sum = 0;\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n int idy = blockIdx.y * blockDim.y + threadIdx.y;\n if (idx < N && idy < N) {\n int left = idx > 0 ? input[idx - 1 + idy * N] : 0;\n int right = idx < N - 1 ? input[idx + 1 + idy * N] : 0;\n int up = idy > 0 ? input[idx + (idy - 1) * N] : 0;\n int down = idy < N - 1 ? input[idx + (idy + 1) * N] : 0;\n sum = left + right + up + down;\n }\n output[idx + idy * N] = input[idx + idy * N] ? (sum == 1 ? 1 : 0) : 0;\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0)\n continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n if (x > 0) {\n count += input[y * N + x - 1];\n }\n if (x < N - 1) {\n count += input[y * N + x + 1];\n }\n if (y > 0) {\n count += input[(y - 1) * N + x];\n }\n if (y < N - 1) {\n count += input[(y + 1) * N + x];\n }\n output[y * N + x] = (count == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t num_rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * num_rows_per_rank;\n size_t end_row = std::min((rank + 1) * num_rows_per_rank, N);\n\n // Each rank computes its subset of the output.\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks.\n MPI_Gather(output.data() + start_row * N, (end_row - start_row) * N, MPI_INT,\n output.data(), (end_row - start_row) * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n std::vector rowXOR(N, 0);\n for (size_t j = 0; j < N; ++j) {\n if (input[rank * N + j] == 1) {\n rowXOR[j] = 1;\n }\n }\n\n // Reduce the XORs of all rows to rank 0\n std::vector allRowXORs(N, 0);\n MPI_Reduce(rowXOR.data(), allRowXORs.data(), N, MPI_INT, MPI_BXOR, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the final output\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int numNeighbors = 0;\n if (i > 0 && allRowXORs[i - 1] == 1) {\n ++numNeighbors;\n }\n if (i < N - 1 && allRowXORs[i + 1] == 1) {\n ++numNeighbors;\n }\n if (j > 0 && input[i * N + j - 1] == 1) {\n ++numNeighbors;\n }\n if (j < N - 1 && input[i * N + j + 1] == 1) {\n ++numNeighbors;\n }\n output[i * N + j] = (numNeighbors == 1);\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv(N * N);\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] ^= recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, (rank + 1) * nRowsPerRank);\n std::vector localInput(nRowsPerRank * N);\n std::vector localOutput(nRowsPerRank * N);\n MPI_Scatter(input.data(), N * N, MPI_INT, localInput.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int numNeighbors = 0;\n if (i > 0) numNeighbors += localInput[(i - 1) * N + j];\n if (i < N - 1) numNeighbors += localInput[(i + 1) * N + j];\n if (j > 0) numNeighbors += localInput[i * N + j - 1];\n if (j < N - 1) numNeighbors += localInput[i * N + j + 1];\n localOutput[i * N + j] = (numNeighbors == 1);\n }\n }\n MPI_Gather(localOutput.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Every rank computes the XOR of its own cells\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (my_rank == 0) {\n for (int i = 1; i < num_ranks; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] |= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], 
(end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n // Check the neighbors\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] ^= recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n std::vector local_output(N, 0);\n for (size_t i = 0; i < N; ++i) {\n int count = 0;\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n continue;\n }\n count += input[i * N + j];\n }\n local_output[i] = count == 1 ? 
1 : 0;\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_output.data(), N, MPI_INT, output.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of the output\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Each rank computes its subset of output.\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int rows_per_rank = (N + size - 1) / size;\n int start_row = i * rows_per_rank;\n int end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min((rank + 1) * chunk_size, N);\n\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min((i + 1) * chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n 
int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Gather all the results to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] ^= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own rows\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] ^= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, 
MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of neighbors that are 1.\n std::vector num_neighbors(N * N, 0);\n MPI_Allreduce(MPI_IN_PLACE, num_neighbors.data(), num_neighbors.size(), MPI_INT, MPI_SUM, comm);\n\n // Set the output to 1 if there is exactly one neighbor that is 1.\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n output[i * N + j] = (num_neighbors[i * N + j] == 1);\n }\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes its local part of the output.\n for (size_t i = my_rank; i < N; i += num_procs) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; i++) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] |= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Each rank computes its subset of the output.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks onto rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "cuda", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\n__global__ void cellsXOR(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0 && input[y * N + x - 1]) num_ones++;\n if (x < N - 1 && input[y * N + x + 1]) num_ones++;\n if (y > 0 && input[(y - 1) * N + x]) num_ones++;\n if (y < N - 1 && input[(y + 1) * N + x]) num_ones++;\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int sum = 0;\n if (x > 0) {\n sum += input[y * N + x - 1];\n }\n if (x < N - 1) {\n sum += input[y * N + x + 1];\n }\n if (y > 0) {\n sum += input[(y - 1) * N + x];\n }\n if (y < N - 1) {\n sum += input[(y + 1) * N + x];\n }\n output[y * N + x] = (sum == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[(y * N) + (x - 1)];\n if (x < N - 1) sum += input[(y * N) + (x + 1)];\n if (y > 0) sum += input[((y - 1) * N) + x];\n if (y < N - 1) sum += input[((y + 1) * N) + x];\n output[(y * N) + x] = (sum == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[(y - 1) * N + x - 1];\n if (x < N - 1) sum += input[(y - 1) * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n if (y > 0 && x > 0) sum += input[(y - 1) * N + x - 1];\n if (y > 0 && x < N - 1) sum += input[(y - 1) * 
N + x + 1];\n if (y < N - 1 && x > 0) sum += input[(y + 1) * N + x - 1];\n if (y < N - 1 && x < N - 1) sum += input[(y + 1) * N + x + 1];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n sum += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n int num_ones = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n num_ones += input[r * N + c];\n }\n }\n }\n output[row * N + col] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1] == 1) count++;\n if (x < N - 1 && input[y * N + x + 1] == 1) count++;\n if (y > 0 && input[(y - 1) * N + x] == 1) count++;\n if (y < N - 1 && input[(y + 1) * N + x] == 1) count++;\n output[y * N + x] = (count == 1) ? 
1 : 0;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1]) count++;\n if (x < N - 1 && input[y * N + x + 1]) count++;\n if (y > 0 && input[(y - 1) * N + x]) count++;\n if (y < N - 1 && input[(y + 1) * N + x]) count++;\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1]) count++;\n if (x < N - 1 && input[y * N + x + 1]) count++;\n if (y > 0 && input[(y - 1) * N + x]) count++;\n if (y < N - 1 && input[(y + 1) * N + x]) count++;\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int count = 0;\n if (i > 0) count += input[i - 1 + j * N];\n if (i < N - 1) count += input[i + 1 + j * N];\n if (j > 0) count += input[i + (j - 1) * N];\n if (j < N - 1) count += input[i + (j + 1) * N];\n output[i + j * N] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t start = rank * N / size;\n size_t end = (rank + 1) * N / size;\n\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n // Check the neighbors of input_{i,j}.\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (count == 1);\n }\n }\n\n // Gather the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start = i * N / size;\n size_t end = (i + 1) * N / size;\n for (size_t j = start; j < end; ++j) {\n for (size_t k = 0; k < N; ++k) {\n output[j * N + k] += output[i * N * N + j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n int startRow = rank * nRows;\n int endRow = startRow + nRows;\n if (rank == size - 1) {\n endRow = N;\n }\n int nCols = N;\n int nNeighbors = 0;\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < nCols; j++) {\n nNeighbors = 0;\n if (i - 1 >= 0) {\n nNeighbors += input[(i - 1) * N + j];\n }\n if (i + 1 < N) {\n nNeighbors += input[(i + 1) * N + j];\n }\n if (j - 1 >= 0) {\n nNeighbors += input[i * N + j - 1];\n }\n if (j + 1 < N) {\n nNeighbors += input[i * N + j + 1];\n }\n if (nNeighbors == 1) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int nRowsRecv = N / size;\n int startRowRecv = i * nRowsRecv;\n int endRowRecv = startRowRecv + nRowsRecv;\n if (i == 
size - 1) {\n endRowRecv = N;\n }\n for (int j = startRowRecv; j < endRowRecv; j++) {\n MPI_Recv(&output[j * N], N * sizeof(int), MPI_INT, i, 0, MPI_STATUS_IGNORE);\n }\n }\n } else {\n MPI_Send(&output[startRow * N], (endRow - startRow) * N * sizeof(int), MPI_INT, 0, 0, MPI_STATUS_IGNORE);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nThreads = omp_get_max_threads();\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, startRow + nRowsPerRank);\n std::vector localOutput(N * nRowsPerRank);\n #pragma omp parallel num_threads(nThreads)\n {\n #pragma omp for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int numNeighbors = 0;\n if (i > 0) numNeighbors += input[i - 1][j];\n if (i < N - 1) numNeighbors += input[i + 1][j];\n if (j > 0) numNeighbors += input[i][j - 1];\n if (j < N - 1) numNeighbors += input[i][j + 1];\n localOutput[i * N + j] = (numNeighbors == 1);\n }\n }\n }\n MPI_Gather(&localOutput[0], N * nRowsPerRank, MPI_INT, &output[0], N * nRowsPerRank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + num_procs - 1) / num_procs;\n size_t start_row = my_rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of rows in parallel\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check neighbors\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row 
- start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its own part of the output\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += nprocs) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n std::vector recv(N * N);\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Each rank computes its subset of the output in parallel\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the XOR of its own rows\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += nprocs) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n MPI_Recv(&output[i * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different part of the output\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[(i - 1) * N + j];\n if (i < N - 1) num_neighbors += input[(i + 1) * N + j];\n if (j > 0) num_neighbors += input[i * N + j - 1];\n if (j < N - 1) num_neighbors += input[i * N + j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all other ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N], N * N, 
MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = (N + num_ranks - 1) / num_ranks;\n int start_row = my_rank * rows_per_rank;\n int end_row = std::min(N, (my_rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (my_rank == 0) {\n for (int i = 1; i < num_ranks; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = N / num_ranks;\n int start_row = my_rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j]) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j]) ++num_neighbors;\n if (j > 0 && input[i][j - 1]) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1]) ++num_neighbors;\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all other ranks.\n if (my_rank == 0) {\n for (int rank = 1; rank < num_ranks; ++rank) {\n int 
rows_per_rank = N / num_ranks;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row][0], rows_per_rank * N, MPI_INT, rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n // Other ranks send their results to rank 0.\n MPI_Send(&output[start_row][0], rows_per_rank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local part of the output.\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Each rank computes its subset of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows.\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of the rows in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all the other ranks.\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * 
rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n std::vector local_input(input.begin() + start_row * N, input.begin() + end_row * N);\n std::vector local_output(rows_per_rank * N, 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < rows_per_rank; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += local_input[(i - 1) * N + j];\n if (i < rows_per_rank - 1) count += local_input[(i + 1) * N + j];\n if (j > 0) count += local_input[i * N + j - 1];\n if (j < N - 1) count += local_input[i * N + j + 1];\n local_output[i * N + j] = (count == 1);\n }\n }\n\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows in output.\n size_t rows_per_rank = (N + num_procs - 1) / num_procs;\n size_t start_row = my_rank * rows_per_rank;\n size_t end_row = std::min(N, (my_rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of rows in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks into output on rank 0.\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // TODO: Implement cellsXOR\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows of output\n int rows_per_rank = N / num_procs;\n int start_row = my_rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes its subset of the rows of output in parallel\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. 
Otherwise set it to 0.\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) continue;\n num_neighbors += input(ii, jj);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output(i, j) = 0;\n else\n output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n }\n });\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { count += input(ii, jj); }\n }\n }\n count -= input(i, j);\n if (input(i, j) == 1) {\n output(i, j) = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output(i, j) = (count == 3) ? 
1 : 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int numNeighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n numNeighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (numNeighbors < 2 || numNeighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (numNeighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n count += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (count == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, 
+ "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (count == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && !(ii == i && jj == j)) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (num_neighbors == 3) { output(i, j) = 1; } else { output(i, j) = 0; }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + 
"are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) { num_neighbors += input(x, y); }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) output(i, j) = 0;\n else output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n else output(i, j) = 0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (num_neighbors == 3) { output(i, j) = 1; } else { output(i, j) = 0; }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + 
"generated_output": "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n num_neighbors += input(x, y);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output(i, j) = 0;\n else\n output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n }\n });\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x < 0 || x >= N || y < 0 || y >= N) continue;\n num_neighbors += input(x, y);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": 
false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) { count += input(ii, jj); }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (count == 3) { output(i, j) = 1; } else { output(i, j) = 0; }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. 
Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n count += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n count += input[(i + ii) * N + j + jj];\n }\n }\n }\n count -= input[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (int)i + di;\n int jj = (int)j + dj;\n if (ii < 0 || ii >= (int)N || jj < 0 || jj >= (int)N) continue;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = num_neighbors == 2 || num_neighbors == 3;\n } else {\n output[i * N + j] = num_neighbors == 3;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } 
else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n neighbors += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = (i + dx + N) % N;\n int y = (j + dy + N) % N;\n num_neighbors += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "serial", + "prompt": "/* Simulate one generation of Game of Life on `input`. 
Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (size_t di = -1; di <= 1; di++) {\n for (size_t dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ni = i + di;\n int nj = j + dj;\n if (ni < 0 || ni >= N || nj < 0 || nj >= N) {\n continue;\n }\n num_neighbors += input[ni * N + nj];\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int x = -1; x <= 1; ++x) {\n for (int y = -1; y <= 1; ++y) {\n if (x == 0 && y == 0) continue;\n int nx = i + x;\n int ny = j + y;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[nx * N + ny];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if 
(ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = num_neighbors == 2 || num_neighbors == 3 ? 1 : 0;\n } else {\n output[i * N + j] = num_neighbors == 3 ? 1 : 0;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t x = i - 1; x <= i + 1; ++x) {\n for (size_t y = j - 1; y <= j + 1; ++y) {\n if (x >= 0 
&& x < N && y >= 0 && y < N && (x != i || y != j)) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n neighbors += input[(i + ii) * N + j + jj];\n }\n }\n }\n neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n size_t num_neighbors = 0;\n for (size_t x = i - 1; x <= i + 1; ++x) {\n for (size_t y = j - 1; y <= j + 1; ++y) {\n if (x >= 0 && x < N && y >= 0 && y < N && (x != i || y != j)) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int x = i + di;\n int y = j + dj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "hip", + "prompt": "/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\n__global__ void gameOfLife(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n num_neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && 
neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) return;\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n count -= input[y * N + x];\n if (input[y * N + x] == 1) {\n output[y * 
N + x] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[y * N + x] = (count == 3) ? 1 : 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0)\n continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output[y * N + x] = 0;\n else\n output[y * N + x] = 1;\n } else {\n if (num_neighbors == 3)\n output[y * N + x] = 1;\n else\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipThreadIdx().x;\n int y = hipThreadIdx().y;\n int neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = (x + i + N) % N;\n int neighbor_y = (y + j + N) % N;\n neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n if (input[y * N + x] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny 
< N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n if (x + i >= 0 && x + i < N && y + j >= 0 && y + j < N) {\n count += input[(x + i) * N + y + j];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (count < 2 || count > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (count == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. 
Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n_rows = N / size;\n int n_cols = N;\n int n_elements = n_rows * n_cols;\n std::vector local_input(n_elements);\n std::vector local_output(n_elements);\n MPI_Scatter(input.data(), n_elements, MPI_INT, local_input.data(), n_elements, MPI_INT, 0, MPI_COMM_WORLD);\n for (int i = 0; i < n_rows; i++) {\n for (int j = 0; j < n_cols; j++) {\n int n_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int ii_global = i + ii;\n int jj_global = j + jj;\n if (ii_global >= 0 && ii_global < n_rows && jj_global >= 0 && jj_global < n_cols) {\n n_neighbors += local_input[ii_global * n_cols + jj_global];\n }\n }\n }\n if (local_input[i * n_cols + j] == 1) {\n if (n_neighbors < 2 || n_neighbors > 3) {\n local_output[i * n_cols + j] = 0;\n } else {\n local_output[i * n_cols + j] = 1;\n }\n } else {\n if (n_neighbors == 3) {\n local_output[i * n_cols + j] = 1;\n } else {\n local_output[i * n_cols + j] = 0;\n }\n }\n }\n }\n MPI_Gather(local_output.data(), n_elements, MPI_INT, output.data(), n_elements, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of live neighbors for each cell.\n std::vector counts(N * N, 0);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n counts[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n }\n\n // Update the output based on the counts.\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = counts[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 
1 : 0;\n }\n }\n }\n\n // Gather the results on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recv_buf(N * N);\n MPI::COMM_WORLD.Recv(recv_buf.data(), N * N, MPI::INT, i, 0);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(output.data(), N * N, MPI::INT, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks.\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Compute the next generation for the subset of rows assigned to this rank\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = static_cast(i) + di;\n int jj = static_cast(j) + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = N / size;\n\n // Compute the start and end rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = (rank + 1) * rows_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(N * rows_per_rank);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n\n // Check the neighbors of the current cell\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n\n // Check if the neighbor is within the grid\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), N * rows_per_rank, MPI_INT,\n output.data(), N * rows_per_rank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of rows\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes the next generation for its subset of rows\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N && input[ni * N + nj]) {\n num_neighbors++;\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j]) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n for (size_t i = 
start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_input(input.begin() + rank * N * N, input.begin() + (rank + 1) * N * N);\n std::vector local_output(N * N, 0);\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += local_input[ii * N + jj];\n }\n }\n }\n if (local_input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n local_output[i * N + j] = 1;\n }\n }\n }\n }\n\n MPI_Gather(local_output.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output.\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the 8 neighbors of the cell.\n for 
(int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life.\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks to rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of the rows.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = static_cast(i) + di;\n int neighbor_j = static_cast(j) + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n if (rank != 0) {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n MPI_Recv(&output[start_row * N], rows_per_rank * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * N);\n\n // Compute the local results\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n\n // Check the neighbors of the current cell\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n\n // Check if the neighbor is within the grid\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT, output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (size_t ii = std::max(0, (int)i - 1); ii <= std::min((int)N - 1, (int)i + 1); ii++) {\n for (size_t jj = std::max(0, (int)j - 1); jj <= std::min((int)N - 1, (int)j + 1); jj++) {\n if (input[ii * N + jj] == 1) {\n num_neighbors++;\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of live neighbors for each cell.\n std::vector neighbor_counts(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t di = -1; di <= 1; di++) {\n for (size_t dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n neighbor_counts[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Update the output based on the neighbor counts.\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbor_counts[i * N + j] == 2 || neighbor_counts[i * N + j] == 3);\n } else {\n output[i * N + j] = (neighbor_counts[i * N + j] == 3);\n }\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbors == 2 || neighbors == 3);\n } else {\n output[i * N + j] = (neighbors == 3);\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n std::vector next_gen(N * N);\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j]) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of the output\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, 
MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = N / size;\n\n // Compute the starting and ending rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = (rank + 1) * rows_per_rank;\n\n // Compute the number of columns each rank is responsible for\n int cols_per_rank = N / size;\n\n // Compute the starting and ending columns for this rank\n int start_col = rank * cols_per_rank;\n int end_col = (rank + 1) * cols_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * N, 0);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = start_col; j < end_col; j++) {\n int num_neighbors = 0;\n // Check the neighbors of the current cell\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n\n // Apply the rules of the game of life\n if (input[i][j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT, output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "cuda", + "prompt": "/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\n__global__ void gameOfLife(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if 
(input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n 
}\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = threadIdx.x;\n int y = threadIdx.y;\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = (x + i + N) % N;\n int neighbor_y = (y + j + N) % N;\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "_", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n num_neighbors -= input[x * N + y];\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = 
blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) {\n return;\n }\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the rank of the current process.\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the number of rows that each process will handle.\n int rows_per_process = N / MPI_Comm_size(MPI_COMM_WORLD);\n\n // Compute the starting and ending rows for the current process.\n int start_row = rank * rows_per_process;\n int end_row = start_row + rows_per_process;\n\n // Compute the number of columns in the input grid.\n int num_cols = input.size() / N;\n\n // Create a buffer to store the results for the current process.\n std::vector buffer(N * num_cols);\n\n // Iterate over the rows assigned to the current process.\n for (int i = start_row; i < end_row; ++i) {\n // Iterate over the columns in the input grid.\n for (int j = 0; j < num_cols; ++j) {\n // Compute the number of live neighbors for the current cell.\n int num_live_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n // Skip the current cell.\n if (ii == i && jj == j) {\n continue;\n }\n\n // Check if the neighbor cell is alive.\n if (input[ii * num_cols + jj] == 1) {\n ++num_live_neighbors;\n }\n }\n }\n\n // Apply the rules of the Game of Life to the current cell.\n if (input[i * num_cols + j] == 1) {\n if (num_live_neighbors < 2 || num_live_neighbors > 3) {\n buffer[i * num_cols + j] = 0;\n } else {\n buffer[i * num_cols + j] = 1;\n }\n } else {\n if (num_live_neighbors == 3) {\n buffer[i * num_cols + j] = 1;\n } else {\n buffer[i * num_cols + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all processes.\n MPI_Gather(buffer.data(), N * num_cols, MPI_INT, output.data(), N * num_cols, MPI_INT, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = (N + size - 1) / size;\n int start = rank * local_size;\n int end = std::min(start + local_size, N);\n\n std::vector local_input(input.begin() + start * N, input.begin() + end * N);\n std::vector local_output(local_input.size());\n\n #pragma omp parallel for\n for (int i = 0; i < local_input.size(); i++) {\n int x = i / N;\n int y = i % N;\n int count = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int nx = x + dx;\n int ny = y + dy;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += local_input[nx * N + ny];\n }\n }\n }\n if (local_input[i] == 1) {\n if (count < 2 || count > 3) {\n local_output[i] = 0;\n } else {\n local_output[i] = 1;\n }\n } else {\n if (count == 3) {\n local_output[i] = 1;\n } else {\n local_output[i] = 0;\n }\n }\n }\n\n MPI_Gather(local_output.data(), local_output.size(), MPI_INT, output.data(), local_output.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of the rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute 
the next generation for the subset of rows\n std::vector next_gen(N * N, 0);\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Compute the next generation for the subset of rows\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n // Count the number of live neighbors\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N && input[x * N + y]) count++;\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j]) {\n if (count < 2 || count > 3) output[i * N + j] = 0;\n else output[i * N + j] = 1;\n } else {\n if (count == 3) output[i * N + j] = 1;\n else output[i * N + j] = 0;\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a chunk of the output\n size_t chunk_size = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min((rank + 1) * chunk_size, N);\n\n // Compute the chunk of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t ii = std::max(0, (int)i - 1); 
ii <= std::min(N - 1, (int)i + 1); ++ii) {\n for (size_t jj = std::max(0, (int)j - 1); jj <= std::min(N - 1, (int)j + 1); ++jj) {\n num_neighbors += input[ii * N + jj];\n }\n }\n num_neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min((i + 1) * chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of live neighbors for each cell.\n std::vector neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Update the state of each cell.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (input[i * N + j] == 1) {\n if (neighbors[i * N + j] < 2 || neighbors[i * N + j] > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors[i * N + j] == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = N / nprocs;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Compute the next generation for the subset of rows\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int ii = i - 
1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize output to 0\n output.assign(N * N, 0);\n\n // Get rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Divide the grid among the processes\n int rows_per_proc = N / nprocs;\n int start_row = rank * rows_per_proc;\n int end_row = start_row + rows_per_proc;\n\n // Compute the next generation for each row in parallel\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n // Count the number of live neighbors\n int num_neighbors = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n if (k == 0 && l == 0) continue;\n int neighbor_row = i + k;\n int neighbor_col = j + l;\n if (neighbor_row >= 0 && neighbor_row < N && neighbor_col >= 0 && neighbor_col < N) {\n num_neighbors += input[neighbor_row * N + neighbor_col];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n }\n }\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[start_row * N], rows_per_proc * N, MPI_INT,\n &output[0], rows_per_proc * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n #pragma omp parallel for\n for (int i = 0; i < N; ++i) {\n for (int j = 0; j < N; ++j) {\n int count = 0;\n // Count the number of live neighbors\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input[x * N + y];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n for (int di = -1; di <= 1; 
di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its subgrid's neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbors == 2 || neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (neighbors == 3) ? 1 : 0;\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n\n std::vector localInput(nRows * N);\n std::vector localOutput(nRows * N);\n\n MPI_Scatter(input.data(), nRows * N, MPI_INT, localInput.data(), nRows * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < nRows; i++) {\n for (int j = 0; j < N; j++) {\n int count = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int ni = i + ii;\n int nj = j + jj;\n if (ni >= 0 && ni < nRows && nj >= 0 && nj < N) {\n count += localInput[ni * N + nj];\n }\n }\n }\n if (localInput[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n localOutput[i * N + j] = 0;\n } else {\n localOutput[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n localOutput[i * N + j] = 1;\n } else {\n localOutput[i * N + j] = 0;\n }\n }\n }\n }\n\n MPI_Gather(localOutput.data(), nRows * N, MPI_INT, output.data(), nRows * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes the number of live neighbors for each cell in its subset of rows\n std::vector neighbor_counts(N * N, 0);\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n // Count the number of live neighbors for cell (i, j)\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n count += input[ii * N + jj];\n }\n }\n }\n neighbor_counts[i * N + j] = count;\n }\n }\n\n // Each rank updates its subset of rows of the output based on the neighbor counts\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int count = neighbor_counts[i * N + j];\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks into the output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, 
MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows assigned to this rank\n std::vector next_gen(N * N);\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = static_cast(i) + di;\n int neighbor_j = static_cast(j) + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks into the output on rank 0\n MPI_Gather(next_gen.data(), N * rows_per_rank, MPI_INT,\n output.data(), N * rows_per_rank, MPI_INT,\n 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its portion of the grid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Count the number of live neighbors for cell (i, j).\n int neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n neighbors += input[ii * N + jj];\n }\n }\n }\n local_neighbors[i * N + j] = neighbors;\n }\n }\n\n // Each rank sends its portion of the neighbor counts to rank 0.\n std::vector global_neighbors(N * N);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell based on the number of live neighbors.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n", + "_t", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a chunk of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Compute the next generation for the assigned rows\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || 
num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank != 0) {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the next generation for the subset of rows\n std::vector next_gen(N * N, 0);\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n next_gen[i * N + j] = 0;\n } else {\n next_gen[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n next_gen[i * N + j] = 1;\n } else {\n next_gen[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its local grid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Reduce(local_neighbors.data(), global_neighbors.data(), N * N, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(Kokkos::View &input, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n KokConfigService::instance().set_num_teams(1);\n KokConfigService::instance().set_num_nodes(1);\n KokConfigService::instance().set_num_ranks_per_node(1);\n KokConfigService::instance().set_num_devices_per_rank(1);\n KokConfigService::instance().set_device_type(\"CPU\");\n\n Kok\u30b3\u30af::TeamPolicy policy(Kok\u30b3\u30af::TeamThreadLayout::RowMajor);\n Kok\u30b3\u30af::Team team(policy);\n\n team.execute_task([&](Kok\u30b3\u30af::TeamMember rank) {\n const int num_elements = input.extent(0);\n for (int i = 1; i < num_elements - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n using Kokkos::TeamPolicy;\n using Kokkos::WorkSpace;\n\n const int width = input.extent(0);\n const int height = input.extent(1);\n\n Kokkos::TeamPolicy policy(Kokkos::RangePolicy(0, width * height));\n Kokkos::WorkSpace ws;\n\n parallel_for(policy, KOKKOS_LAMBDA(const int i) {\n const int x = i % width;\n const int y = i / width;\n\n if (x == 0 || x == width - 1 || y == 0 || y == height - 1) {\n output(x, y) = input(x, y);\n } else {\n output(x, y) = (input(x - 1, y) + input(x + 1, y) + input(x, y - 1) + input(x, y + 1)) / 4.0;\n }\n }, ws);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.075749051, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.223741811, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1121141758, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0564422028, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.028193044, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0141594874, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0069630884, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kok c\u00f3c::parallel_for(\"jacobi1D\", 0, output.extent(0), KOK c\u00f3c::RangePolicy>(KOK c\u00f3c::get_num_threads()),\n KOK c\u00f3c::KOK c\u00f3c_LAMBDA(int64_t i) {\n if (i == 0) {\n output[i] = input[i] / 3;\n } else if (i == output.extent(0) - 1) {\n output[i] = input[i] / 
3;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const auto N = input.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == N - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0764099498, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1882084382, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0942700988, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.047258813, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0237865508, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.011917128, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0058747184, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy<>(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.076131436, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2232956026, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1122678768, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0562770622, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0283399764, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0140394034, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0069748346, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.075736057, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2235276262, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1113750202, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.056709351, + "num_threads": 4 + }, + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.0281767488, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0140316086, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085462664, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0755849788, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2233476262, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1117608036, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0560365342, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.028254032, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0141382572, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0069627034, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n const auto N = input.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = 0;\n } else if (i == N - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0848281914, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1583931152, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.079706346, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0400898938, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0202773346, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0103217494, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0049534932, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi1D\", input.size(), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0824856118, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3356595274, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.167845807, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0845978402, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0422427464, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": 
true, + "runtime": 0.0209948914, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0147721086, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n const auto exec_space = Kokkos::DefaultExecutionSpace();\n const auto exec_policy = Kokkos::RangePolicy(0, input.extent(0));\n Kokkos::parallel_for(exec_policy, KOKKOS_LAMBDA(const int &i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(input.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0801973928, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.223814704, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1120549808, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0560724176, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0305752196, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.014236117, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0069857648, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n double left = i > 0 ? input(i - 1) : 0;\n double center = input(i);\n double right = i < input.extent(0) - 1 ? 
input(i + 1) : 0;\n output(i) = (left + center + right) / 3;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0757763264, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.234626681, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.117708235, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0591573504, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0295698798, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.014812721, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0074763172, + "num_threads": 32 + } + ] + }, + { + "generated_output": "Kokkoss vector space.", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(input.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0799889426, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.223768822, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1125869706, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0561619216, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0284903122, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0151965742, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0069430486, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n const int num_elements = input.extent(0);\n Kokkos::parallel_for(num_elements, KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == num_elements - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0785557752, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1645578188, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0799101948, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.039822443, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.020046196, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0102308666, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0049807322, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n const int num_elements = 
input.extent(0);\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::Range(0, num_elements), [=](int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == num_elements - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0765699404, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2241150688, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.112501137, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0564349812, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0281727134, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0142367024, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007285857, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0759227196, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.223596412, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1209515054, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0562327192, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.028278338, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0141909964, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0069876424, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0755997128, + "runs": [ + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.2238797044, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1127944184, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0564285912, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.028213059, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.014007726, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0080426954, + "num_threads": 32 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < static_cast(input.size()) - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n output[0] = (input[0] + input[1]) / 2;\n output[input.size() - 1] = (input[input.size() - 2] + input[input.size() - 1]) / 2;\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n 
output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < static_cast(input.size()) - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "serial", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 0; i < input.size(); i++) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i + 1 < input.size() ? input[i + 1] : 0);\n output[i] = output[i] / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n output[0] = (input[0] + input[1]) / 2;\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n output[input.size() - 1] = (input[input.size() - 2] + input[input.size() - 1]) / 2;\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < input.size() - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n double left = i > 0 ? 
input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = (i > 0) ? input[i - 1] : 0;\n double right = (i < input.size() - 1) ? input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "hip", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\n__global__ void jacobi1D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n output[idx] = (idx > 0 ? input[idx - 1] : 0) + input[idx] + (idx < N - 1 ? input[idx + 1] : 0);\n output[idx] /= 3;\n }\n}\n", + "\n int i = hipThreadIdx_x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int tid = hipGetGlobalThreadIdx().x;\n if (tid < N) {\n output[tid] = (input[tid - 1] + input[tid] + input[tid + 1]) / 3;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n\n if (i < N) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < N - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] +\n (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3.0;\n }\n}\n", + "\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = (tid > 0 ? input[tid - 1] : 0) + input[tid] +\n (tid < N - 1 ? input[tid + 1] : 0);\n output[tid] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? 
input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\nfor (size_t i = 1; i < N - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_output(local_n);\n\n for (int i = start; i < end; i++) {\n if (i == 0) {\n local_output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == n - 1) {\n local_output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n local_output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n std::vector local_input(input.begin() + start, 
input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * local_size], local_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], local_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n output.resize(n);\n\n MPI_Datatype vec_type;\n 
MPI_Type_vector(n, 1, 1, MPI_DOUBLE, &vec_type);\n MPI_Type_commit(&vec_type);\n\n MPI_Allgather(&input[0], 1, vec_type, &output[0], 1, vec_type, MPI_COMM_WORLD);\n\n for (int i = 1; i < n - 1; i++) {\n output[i] = (output[i - 1] + output[i] + output[i + 1]) / 3;\n }\n\n MPI_Type_free(&vec_type);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int remainder = input.size() % size;\n\n std::vector local_input(local_size + (rank < remainder ? 1 : 0));\n std::vector local_output(local_size);\n\n MPI_Scatter(input.data(), local_size + (rank < remainder ? 1 : 0), MPI_DOUBLE, local_input.data(), local_size + (rank < remainder ? 1 : 0), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n for (int i = 0; i < local_size; i++) {\n local_output[i] = (local_input[std::max(0, i - 1)] + local_input[std::min(i + 1, local_size - 1)] + local_input[i]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * chunk], chunk, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n for (int i = start; i < end; i++) {\n double left = (i > 0) ? input[i - 1] : 0;\n double right = (i < n - 1) ? input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * local_n;\n int end = start + local_n;\n MPI_Recv(&output[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], local_n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = (n + size - 1) / size;\n int start = rank * chunk_size;\n int end = std::min(start + chunk_size, n);\n\n for (int i = start; i < end; i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double right = i < n - 1 ? 
input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * chunk_size;\n int end = std::min(start + chunk_size, n);\n MPI_Recv(&output[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_size = n / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n output.resize(n);\n\n // Compute the local part of the output\n for (int i = 1; i < n - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n\n // Send the local part of the output to rank 0\n MPI_Gather(&output[1], n - 2, MPI_DOUBLE, &output[1], n - 2, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Replace the local part of the output with 0 on all ranks except rank 0\n if (rank != 0) {\n for (int i = 1; i < n - 1; i++) {\n output[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; ++i) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n 
std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n local_output[0] = (local_input[0] + local_input[1]) / 2;\n local_output[local_n - 1] = (local_input[local_n - 2] + local_input[local_n - 1]) / 2;\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "cuda", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\n__global__ void jacobi1D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n } else {\n output[i] = 0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = (tid > 0 ? input[tid - 1] : 0) + input[tid] + (tid < N - 1 ? 
input[tid + 1] : 0);\n output[tid] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n #pragma omp parallel for\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n std::vector local_output(local_n);\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (input[start + i - 1] + input[start + i] + input[start + i + 1]) / 3.0;\n }\n MPI_Gather(&local_output[0], local_n, MPI_DOUBLE, &output[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n\n std::vector local_output(chunk_size);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n local_output[i] = (input[i] + input[i + 1]) / 3;\n } else if (i == n - 1) {\n local_output[i] = (input[i - 1] + input[i]) / 3;\n } else {\n local_output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(local_output.data(), chunk_size, MPI_DOUBLE, output.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n\n #pragma parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == input.size() - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), chunk_size, MPI_DOUBLE, output.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per rank\n int elements_per_rank = input.size() / size;\n\n // Compute the start and end indices for this rank\n int start_index = rank * elements_per_rank;\n int end_index = 
(rank + 1) * elements_per_rank;\n\n // Compute the local output\n std::vector local_output(elements_per_rank);\n #pragma omp parallel for\n for (int i = 0; i < elements_per_rank; i++) {\n int global_index = start_index + i;\n if (global_index == 0 || global_index == input.size() - 1) {\n local_output[i] = 0;\n } else {\n local_output[i] = (input[global_index - 1] + input[global_index] + input[global_index + 1]) / 3;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(input.size());\n for (int i = 0; i < size; i++) {\n MPI_Recv(&output[i * elements_per_rank], elements_per_rank, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], elements_per_rank, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n std::vector local_output(local_n);\n\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (input[start + i - 1] + input[start + i] + input[start + i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n // Compute the local part of the output\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n // Compute the local part of the output\n #pragma omp parallel for\n for (int i = start; i < end; ++i) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n 
MPI_Gather(output.data() + start, chunk, MPI_DOUBLE, output.data(), chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = (n + size - 1) / size;\n int start = rank * local_n;\n int end = std::min(start + local_n, n);\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "kokkos", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images.\n Store the output of the computation in imageOut.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(Kokkos::View &imageIn, Kokkos::View &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(N, N), [=](int i, int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn(ii, jj) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::min(255, std::max(0, sum));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_clamped = std::clamp(i + ii, 0, N - 1);\n int jj_clamped = std::clamp(j + jj, 0, N - 1);\n sum += imageIn(ii_clamped, jj_clamped) * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut(i, j) = std::clamp(sum, -128, 127);\n });\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= 
(int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_in = i + ii;\n int jj_in = j + jj;\n if (ii_in >= 0 && ii_in < N && jj_in >= 0 && jj_in < N) {\n sum += imageIn(ii_in, jj_in) * edgeKernel[ii + 1][jj + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n int val = 0;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n val = 
imageIn(ii, jj);\n }\n sum += edgeKernel[ki + 1][kj + 1] * val;\n }\n }\n imageOut(i, j) = std::max(0, std::min(sum, 255));\n });\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::min(255, std::max(0, sum));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_clamped = std::clamp(i + ii, 0, N - 1);\n int jj_clamped = std::clamp(j + jj, 0, N - 1);\n sum += imageIn(ii_clamped, jj_clamped) * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut(i, j) = std::clamp(sum, -128, 127);\n });\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) 
* edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kok\uf09e::parallel_for(Kok\uf09e::Range(0, N), [&](const Kok\uf09e::Range &r) {\n for (size_t i = r.begin(); i < r.end(); ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n size_t ii = i + k - 1;\n size_t jj = j + l - 1;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn(ii, jj) * edge[k][l];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n }\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kok\u30b3\u30af::parallel_for(0, N, [&](size_t i) {\n Kok\u30b3\u30af::parallel_for(0, N, [&](size_t j) {\n int sum = 0;\n for (int k = 0; k < 3; k++) {\n for (int l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edge[k][l];\n }\n }\n }\n 
imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "omp", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "_", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n if (i + k < 0 || i + k >= N || j + l < 0 || j + l >= N) {\n sum += 0;\n } 
else {\n sum += imageIn[(i + k) * N + (j + l)] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x >= 0 && x < static_cast(N) && y >= 0 && y < static_cast(N)) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x >= 0 && x < static_cast(N) && y >= 0 && y < static_cast(N)) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += imageIn[(i + k) * N + (j + l)] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma opm parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + (j + l)] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "serial", + "prompt": "const int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. 
Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (int i = 1; i < N - 1; i++) {\n for (int j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n sum += imageIn[(i + ii) * N + (j + jj)] * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = (int)i + k - 1;\n 
int y = (int)j + l - 1;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t x = 0; x < 3; x++) {\n for (size_t y = 0; y < 3; y++) {\n int ii = i + x - 1;\n int jj = j + y - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[x][y];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t y = 0; y < N; ++y) {\n for (size_t x = 0; x < N; ++x) {\n int sum = 0;\n for (int ky = -1; ky <= 1; ++ky) {\n for (int kx = -1; kx <= 1; ++kx) {\n int pixelX = x + kx;\n int pixelY = y + ky;\n if (pixelX >= 0 && pixelX < N && pixelY >= 0 && pixelY < N) {\n sum += imageIn[pixelY * N + pixelX] * edgeKernel[ky + 1][kx + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = (int)i + k - 1;\n int y = (int)j + l - 1;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t y = 0; y < N; ++y) {\n for (size_t x = 0; x < N; ++x) {\n int sum = 0;\n for (int ky = -1; ky <= 1; ++ky) {\n for (int kx = -1; kx <= 1; ++kx) {\n int inX = x + kx;\n int inY = y + ky;\n if (inX >= 0 && inX < N && inY >= 0 && inY < N) {\n sum += imageIn[inY * N + inX] * edgeKernel[ky + 1][kx + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; 
++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii < 0 || ii >= static_cast(N) || jj < 0 || jj >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "hip", + "prompt": "__constant__ int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\n__global__ void convolveKernel(const int *imageIn, int *imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\nint sum = 0;\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = blockIdx.x + i;\n int y = blockIdx.y + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n sum = max(0, min(255, sum));\n imageOut[blockIdx.y * N + blockIdx.x] = sum;\n}\n", + "\nint sum = 0;\nint index = hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x;\nint x = index % N;\nint y = index / N;\nfor (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (x + i >= 0 && x + i < N && y + j >= 0 && y + j < N) {\n sum += imageIn[(y + j) * N + x + i] * edgeKernel[i + 1][j + 1];\n }\n }\n}\nimageOut[index] = __clamp(sum, 0, 255);\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x2 = x + i;\n int y2 = y + j;\n if (x2 >= 0 && x2 < N && y2 >= 0 && y2 < N) {\n sum += imageIn[y2 * N + x2] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y 
< N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = min(255, max(0, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX + neighborY * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = std::max(0, std::min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ < 0 || x_ >= N || y_ < 0 || y_ >= N)\n sum += 0;\n else\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int dotProduct = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n dotProduct += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = dotProduct > 
255 ? 255 : (dotProduct < 0 ? 0 : dotProduct);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = 0; i < 3; i++) {\n for (int j = 0; j < 3; j++) {\n int x_offset = x + i - 1;\n int y_offset = y + j - 1;\n if (x_offset >= 0 && x_offset < N && y_offset >= 0 && y_offset < N) {\n sum += imageIn[y_offset * N + x_offset] * edge[i][j];\n }\n }\n }\n imageOut[y * N + x] = __clamp(sum, 0, 255);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_index = x + i;\n int y_index = y + j;\n if (x_index >= 0 && x_index < N && y_index >= 0 && y_index < N) {\n sum += imageIn[y_index * N + x_index] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[x_idx + y_idx * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = max(0, min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX + neighborY * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = hipMin(hipMax(sum, 0), 255);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "mpi", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of imageIn. 
The result is stored in imageOut on rank 0.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image.\n size_t chunkSize = N / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = (rank + 1) * chunkSize;\n\n // Compute the convolution for the assigned rows.\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = (i + 1) * chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n size_t chunkSize = N / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = (rank + 1) * chunkSize;\n\n // Compute the convolution for the assigned rows\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = (i + 1) * chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows and columns in the image\n size_t numRows = imageIn.size() / N;\n size_t numCols = N;\n\n // Compute the number of rows and columns in the output image\n size_t outNumRows = numRows - 2;\n size_t outNumCols = numCols - 2;\n\n // Compute the number of rows and columns in the output image for each rank\n size_t outNumRowsPerRank = outNumRows / size;\n size_t outNumColsPerRank = outNumCols;\n\n // Compute the starting row and column for each rank\n size_t startRow = rank * outNumRowsPerRank;\n size_t startCol = 0;\n\n // Compute the ending row and column for each rank\n size_t endRow = startRow + outNumRowsPerRank;\n size_t endCol = startCol + outNumColsPerRank;\n\n // Allocate memory for the 
output image\n std::vector outImage(outNumRowsPerRank * outNumColsPerRank);\n\n // Compute the convolution for each pixel in the output image\n for (size_t i = startRow; i < endRow; i++) {\n for (size_t j = startCol; j < endCol; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int row = i + k - 1;\n int col = j + l - 1;\n if (row >= 0 && row < numRows && col >= 0 && col < numCols) {\n sum += imageIn[row * numCols + col] * edgeKernel[k][l];\n }\n }\n }\n outImage[i * outNumColsPerRank + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the output images from all ranks\n MPI_Gather(outImage.data(), outNumRowsPerRank * outNumColsPerRank, MPI_INT,\n imageOut.data(), outNumRowsPerRank * outNumColsPerRank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the output image.\n std::vector localImageOut(N * N / size);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the local results into the output image on rank 0.\n if (rank == 0) {\n imageOut.resize(N * N);\n MPI_Gather(localImageOut.data(), N * N / size, MPI_INT, imageOut.data(), N * N / size, MPI_INT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(localImageOut.data(), N * N / size, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localN = N / size;\n int localStart = rank * localN;\n int localEnd = localStart + localN;\n\n std::vector localImageOut(localN * N);\n\n for (int i = localStart; i < localEnd; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), localN * N, MPI_INT, imageOut.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows in imageOut\n size_t rowsPerRank = N / size;\n size_t startRow = rank * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks and store them in imageOut on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n for (size_t j = startRow; j < endRow; ++j) {\n MPI_Recv(&imageOut[j * N], N, MPI_INT, i, 0, MPI_COMM_WORLD, 
MPI_STATUS_IGNORE);\n }\n }\n } else {\n MPI_Send(&imageOut[startRow * N], N * rowsPerRank, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image\n int rowsPerRank = (N + size - 1) / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min(startRow + rowsPerRank, N);\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = std::min(startRow + rowsPerRank, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the assigned rows\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "{", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n int chunkSize = N / size;\n int startRow = rank * chunkSize;\n int endRow = startRow + chunkSize;\n\n // Pad the input image with zeros to handle edge cases\n std::vector paddedImage(N * N);\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n paddedImage[i * N + j] = imageIn[i * N + j];\n }\n }\n\n // Compute the convolution for the assigned rows\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int neighborRow = i + k;\n int neighborCol = j + l;\n if (neighborRow >= 0 && neighborRow < N && neighborCol >= 0 && neighborCol < N) {\n sum += paddedImage[neighborRow * N + neighborCol] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * chunkSize;\n int endRow = 
startRow + chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel's neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = i + k;\n int neighborY = j + l;\n // Handle pixels outside the image boundaries\n if (neighborX < 0 || neighborX >= N || neighborY < 0 || neighborY >= N) {\n sum += 0;\n } else {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of imageOut\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks into imageOut on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localN = N / size;\n int localStart = rank * localN;\n int localEnd = localStart + localN;\n\n std::vector localImageIn(localN * N);\n std::vector localImageOut(localN * N);\n\n MPI_Scatter(imageIn.data(), N * N, MPI_INT, localImageIn.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n for (int i = localStart; i < localEnd; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += localImageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n 
MPI_Gather(localImageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localSize = N / size;\n int remainder = N % size;\n int startIndex = rank * localSize;\n int endIndex = startIndex + localSize;\n if (rank == size - 1) {\n endIndex += remainder;\n }\n\n std::vector localImageIn(localSize * N);\n std::vector localImageOut(localSize * N);\n\n MPI_Scatter(imageIn.data(), N * N, MPI_INT, localImageIn.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n for (int i = startIndex; i < endIndex; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += localImageIn[(x + y * N)];\n }\n }\n }\n localImageOut[i * N + j] = std::min(255, std::max(0, sum * edgeKernel[k + 1][l + 1]));\n }\n }\n\n MPI_Gather(localImageOut.data(), localSize * N, MPI_INT, imageOut.data(), localSize * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of pixels per process\n int pixelsPerProcess = N * N / size;\n\n // Compute the starting and ending indices for this process\n int startIndex = rank * pixelsPerProcess;\n int stopIndex = (rank + 1) * pixelsPerProcess;\n\n // Allocate memory for the local image\n std::vector localImage(pixelsPerProcess);\n\n // Scatter the image data to each process\n MPI_Scatter(&imageIn[0], pixelsPerProcess, MPI_INT, &localImage[0], pixelsPerProcess, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Convolve the local image with the edge kernel\n for (int i = 1; i < N - 1; i++) {\n for (int j = 1; j < N - 1; j++) {\n int dotProduct = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n dotProduct += localImage[i + k][j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::clip(dotProduct, -128, 127);\n }\n }\n\n // Gather the results from each process\n MPI_Gather(&imageOut[startIndex], pixelsPerProcess, MPI_INT, &imageOut[0], pixelsPerProcess, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n size_t chunkSize = (N + size - 1) / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n\n // Compute the convolution for the subset of the image\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n int localN = N / size;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local portion of the image\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n size_t localN = (N + size - 1) / size;\n size_t startRow = rank * localN;\n size_t endRow = std::min(startRow + localN, N);\n\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * localN;\n size_t endRow = std::min(startRow + localN, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n // Compute the dot product of the pixel and its neighbors with the edge kernel\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Calculate the number of rows per process\n int rowsPerProcess = N / size;\n\n // Calculate the starting row for this process\n int startRow = rank * rowsPerProcess;\n\n // Calculate the ending row for this process\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Loop over the rows assigned to this process\n for (int i = startRow; i < endRow; i++) {\n 
// Loop over the columns in the row\n for (int j = 0; j < N; j++) {\n // Calculate the dot product of the pixel and its neighbors with the edge kernel\n int dotProduct = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n // Check if the pixel is within the image boundaries\n if (i + k >= 0 && i + k < N && j + l >= 0 && j + l < N) {\n // Calculate the dot product\n dotProduct += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::max(0, std::min(255, dotProduct));\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(imageOut.data() + startRow * N, rowsPerProcess * N, MPI_INT,\n imageOut.data(), rowsPerProcess * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "cuda", + "prompt": "__constant__ int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\n__global__ void convolveKernel(const int *imageIn, int *imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x;\n int idy = threadIdx.y + blockIdx.y * blockDim.y;\n if (idx < N && idy < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = idx + i;\n int y = idy + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x + y * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[idx + idy * N] = clamp(sum, 0, 255);\n }\n}\n", + "\nint sum = 0;\n for (int i = 0; i < 3; i++) {\n for (int j = 0; j < 3; j++) {\n int x = blockIdx.x * blockDim.x + i - 1;\n int y = blockIdx.y * blockDim.y + j - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i][j];\n }\n }\n }\n imageOut[blockIdx.y * N + blockIdx.x] = max(0, min(255, sum));\n}\n", + "\nint x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x 
* blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = sum > 255 ? 255 : (sum < 0 ? 0 : sum);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) {\n return;\n }\n\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_offset = x + i;\n int y_offset = y + j;\n if (x_offset < 0 || x_offset >= N || y_offset < 0 || y_offset >= N) {\n sum += 0;\n } else {\n sum += imageIn[y_offset * N + x_offset] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x2 = x + i;\n int y2 = y + j;\n if (x2 < 0 || x2 >= N || y2 < 0 || y2 >= N) {\n sum += 0;\n } else {\n sum += edgeKernel[i + 1][j + 1] * imageIn[y2 * N + x2];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y 
= blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_offset = x + i;\n int y_offset = y + j;\n if (x_offset >= 0 && x_offset < N && y_offset >= 0 && y_offset < N) {\n sum += imageIn[y_offset * N + x_offset] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_coord = x + i;\n int y_coord = y + j;\n if (x_coord >= 0 && x_coord < N && y_coord >= 0 && y_coord < N) {\n sum += imageIn[y_coord * N + x_coord] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x + (threadIdx.y + blockIdx.y * blockDim.y) * N;\n if (idx < N * N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = threadIdx.x + i;\n int y = threadIdx.y + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[idx] = max(0, min(255, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + 
"parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of imageIn. The result is stored in imageOut on rank 0.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n int localN = N / size;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local subset of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, (rank + 1) * nRowsPerRank);\n\n std::vector localImageOut(nRowsPerRank * N);\n\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = 0; k < 3; k++) {\n for (int l = 0; l < 3; l++) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), nRowsPerRank * N, MPI_INT,\n imageOut.data(), nRowsPerRank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize imageOut to 0\n imageOut.assign(N * N, 0);\n\n // Each rank computes its own part of the output\n #pragma opm parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel and its neighbors\n for (int ki = 0; ki < 3; ki++) {\n for (int kj = 0; kj < 3; kj++) {\n int ii = i + ki - 1;\n int jj = j + kj - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki][kj];\n }\n 
}\n }\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(imageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n size_t chunkSize = (N + size - 1) / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n\n // Compute the convolution for the assigned rows\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < static_cast(N) && jj >= 0 && jj < static_cast(N)) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image\n size_t rowsPerRank = (N + size - 1) / size;\n size_t startRow = rank * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n // Each rank computes its subset of the output image in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < static_cast(N) && jj >= 0 && jj < static_cast(N)) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a portion of the output image\n int localN = (N + nprocs - 1) / nprocs;\n int startRow = rank * localN;\n int endRow = std::min(startRow + localN, N);\n\n // Compute the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from 
all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n int startRow = i * localN;\n int endRow = std::min(startRow + localN, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the assigned rows\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel's neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = i + k;\n int neighborY = j + l;\n // Check if the neighbor is within the image boundaries\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a portion of the output image\n int localN = N / nprocs;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * 
N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the image\n int rowsPerRank = (N + size - 1) / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min((rank + 1) * rowsPerRank, N);\n\n // Allocate memory for the rank's portion of the output image\n std::vector rankImageOut(rowsPerRank * N);\n\n // Compute the convolution for the rank's subset of rows\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = 0; k < 3; ++k) {\n for (int l = 0; l < 3; ++l) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n rankImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(rankImageOut.data(), rowsPerRank * N, MPI_INT,\n imageOut.data(), rowsPerRank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the dot product of the kernel and the image for each pixel\n #pragma opm parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int dotProduct = 0;\n for (int ki = 0; ki < 3; ki++) {\n for (int kj = 0; kj < 3; kj++) {\n dotProduct += edgeKernel[ki][kj] * imageIn[(i + ki - 1) * N + (j + kj - 1)];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, dotProduct));\n }\n }\n\n // Set the boundaries to 0\n for (size_t i = 0; i < N; i++) {\n imageOut[i] = 0;\n imageOut[i * N] = 0;\n imageOut[i * N + N - 1] = 0;\n }\n for (size_t j = 0; j < N; j++) {\n imageOut[j * N + N - 1] = 0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows\n int rowsPerRank = N / nprocs;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Pad the image with zeros\n std::vector paddedImage(N + 2 * rowsPerRank, 0);\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n paddedImage[i + rowsPerRank + 1][j + rowsPerRank + 1] = imageIn[i * N + j];\n }\n }\n\n // Convolve the kernel with the padded image\n #pragma omp parallel for\n for (int i = startRow + 1; i < endRow + 1; i++) {\n for (int j = 1; j < N + 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += edgeKernel[k + 1][l + 1] * paddedImage[i + k][j + l];\n }\n }\n imageOut[(i - rowsPerRank - 1) * N + j - 1] = 
std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Rank 0 gathers the results from all other ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min((rank + 1) * rowsPerRank, N);\n\n // Each rank computes its subset of the output image\n std::vector localImageOut(N * N);\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(localImageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int localN = N / nprocs;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n std::vector localImageIn(localN * N);\n std::vector localImageOut(localN * N);\n\n MPI_Scatter(imageIn.data(), localN * N, MPI_INT, localImageIn.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < localN; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= localN || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += localImageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), localN * N, MPI_INT, imageOut.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x < 0 || x >= static_cast(N) || y < 0 || y >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edge[k + 
1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image.\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank.\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Each rank computes a portion of the output image\n size_t myStartRow = myRank * N / numRanks;\n size_t myEndRow = (myRank + 1) * N / numRanks;\n\n // Compute the convolution for each pixel in the assigned rows\n #pragma omp parallel for\n for (size_t i = myStartRow; i < myEndRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n\n // Apply the kernel to the pixel and its neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = static_cast(i) + k;\n int neighborY = static_cast(j) + l;\n\n // Handle pixels outside the image boundaries\n if (neighborX < 0 || neighborX >= static_cast(N) || neighborY < 0 || neighborY >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the partial results from all ranks on rank 0\n if (myRank == 0) {\n for (int i = 1; i < numRanks; ++i) {\n size_t startRow = i * N / numRanks;\n size_t endRow = (i + 1) * N / numRanks;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[myStartRow * N], (myEndRow - myStartRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each process computes a subset of the rows\n int rowsPerProcess = N / nprocs;\n int startRow = rank * rowsPerProcess;\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Allocate memory for the local portion of the output image\n std::vector localImageOut(rowsPerProcess * N);\n\n // Perform the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 
1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the local results into the global output image\n MPI_Gather(localImageOut.data(), rowsPerProcess * N, MPI_INT,\n imageOut.data(), rowsPerProcess * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i, const int &j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (i + dx >= 0 && i + dx < N && j + dy >= 0 && j + dy < N) {\n sum += input(i + dx, j + dy);\n }\n }\n }\n output(i, j) = sum / 5;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "Kokkosspecific", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t& i) {\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t& j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n sum += input(i + ii, j + jj);\n }\n }\n }\n output(i, j) = sum / 5;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy>(N, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == 
N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n using team_member = KokKOS::TeamMember;\n using team_view = KokKOS::TeamView;\n using team_view_const = KokKOS::TeamView;\n using view_type = KokKOS::View;\n using view_type_const = KokKOS::View;\n\n auto policy = KokKOS::TeamStriped<>(team_member(0, N), team_member(0, N));\n\n team_view output_team(\"output_team\", output, policy);\n team_view_const input_team(\"input_team\", input, policy);\n\n auto lambda = KOKKOS_LAMBDA(team_member member) {\n const size_t i = member.team_member_rank(0);\n const size_t j = member.team_member_rank(1);\n if (i == 0 || j == 0 || i == N - 1 || j == N - 1) {\n output_team(i, j) = input_team(i, j);\n } else {\n output_team(i, j) = (input_team(i, j - 1) + input_team(i, j + 1) + input_team(i - 1, j) + input_team(i + 1, j) + input_team(i, j)) / 5.0;\n }\n };\n\n auto result = KokKOS::parallel_reduce(\n output_team, lambda, KokKOS::PerTeamReduce(std::plus(), 0.0));\n output_team.wait_until_done();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), [=](const int i) {\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), [=](const int j) {\n double left = (i == 0) ? 0 : input(i, j - 1);\n double right = (i == N - 1) ? 0 : input(i, j + 1);\n double up = (j == 0) ? 0 : input(i - 1, j);\n double down = (j == N - 1) ? 
0 : input(i + 1, j);\n output(i, j) = (left + right + up + down + input(i, j)) / 5.0;\n });\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\nfor (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2257794856, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.5799058302, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.5804175004, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.5806649672, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.5856878086, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.57880015, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.5743564416, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy>({0, N}, {0, N}), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n sum += (i == 0 || j == 0) ? 0 : input(i - 1, j - 1);\n sum += (i == 0) ? 0 : input(i - 1, j);\n sum += (i == 0 || j == N - 1) ? 0 : input(i - 1, j + 1);\n sum += (j == 0) ? 0 : input(i, j - 1);\n sum += input(i, j);\n sum += (j == N - 1) ? 0 : input(i, j + 1);\n sum += (i == N - 1 || j == 0) ? 0 : input(i + 1, j - 1);\n sum += (i == N - 1) ? 0 : input(i + 1, j);\n sum += (i == N - 1 || j == N - 1) ? 
0 : input(i + 1, j + 1);\n output(i, j) = sum / 5;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = 0;\n if (i > 0) output(i, j) += input(i - 1, j);\n if (i < N - 1) output(i, j) += input(i + 1, j);\n if (j > 0) output(i, j) += input(i, j - 1);\n if (j < N - 1) output(i, j) += input(i, j + 1);\n output(i, j) += input(i, j);\n output(i, j) /= 5;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. 
output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n output.resize(input.size());\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i - 1][j] + input[i + 1][j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5.0;\n }\n }\n}\n", + "\n\n 
#pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "serial", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. 
output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < N - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n output[i * N + j] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) 
* N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t idx = i * N + j;\n if (i > 0) sum += input[idx - N];\n if (i < N - 1) sum += input[idx + N];\n if (j > 0) sum += input[idx - 1];\n if (j < N - 1) sum += input[idx + 1];\n sum += input[idx];\n output[idx] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t idx = i * N + j;\n if (i > 0) sum += input[idx - N];\n if (i < N - 1) sum += input[idx + N];\n if (j > 0) sum += input[idx - 1];\n if (j < N - 1) sum += input[idx + 1];\n sum += input[idx];\n output[idx] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += 
input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "hip", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\n__global__ void jacobi2D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\nint i = blockIdx.x * blockDim.x + threadIdx.x;\nint j = blockIdx.y * blockDim.y + threadIdx.y;\nif (i < N && j < N) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) /\n 5;\n}\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n output[y * N + x] =\n (input[y * N + (x - 1 < 0 ? 0 : x - 1)] +\n input[y * N + (x + 1 >= N ? N - 1 : x + 1)] +\n input[(y - 1 < 0 ? 0 : y - 1) * N + x] +\n input[(y + 1 >= N ? N - 1 : y + 1) * N + x] + input[y * N + x]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = i > 0 ? input[i - 1 + j * N] : 0;\n double right = i < N - 1 ? input[i + 1 + j * N] : 0;\n double up = j > 0 ? input[i + (j - 1) * N] : 0;\n double down = j < N - 1 ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? 
input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * blockDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.y;\n\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (i < N && j < N) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n double left = (x > 0) ? input[y * N + x - 1] : 0;\n double right = (x < N - 1) ? input[y * N + x + 1] : 0;\n double up = (y > 0) ? input[(y - 1) * N + x] : 0;\n double down = (y < N - 1) ? input[(y + 1) * N + x] : 0;\n double center = input[y * N + x];\n output[y * N + x] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i >= N || j >= N) {\n return;\n }\n double sum = 0;\n if (i > 0) {\n sum += input[i - 1 + j * N];\n }\n if (i < N - 1) {\n sum += input[i + 1 + j * N];\n }\n if (j > 0) {\n sum += input[i + (j - 1) * N];\n }\n if (j < N - 1) {\n sum += input[i + (j + 1) * N];\n }\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? 
input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i < N && j < N) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i >= N || j >= N) return;\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) /\n 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + ",", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x >= 0 && x < N && y >= 0 && y < N) {\n double left = (x > 0) ? input[y * N + x - 1] : 0;\n double right = (x < N - 1) ? input[y * N + x + 1] : 0;\n double up = (y > 0) ? input[(y - 1) * N + x] : 0;\n double down = (y < N - 1) ? input[(y + 1) * N + x] : 0;\n double center = input[y * N + x];\n output[y * N + x] = (left + right + up + down + center) / 5;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t local_N = N / size;\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n if (rank != 0) {\n MPI_Send(&output[start * N], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N * N], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n int nCols = N;\n std::vector localInput(nRows * nCols);\n std::vector localOutput(nRows * nCols);\n MPI_Scatter(input.data(), nRows * nCols, MPI_DOUBLE, localInput.data(), nRows * nCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (int i = 0; i < nRows; i++) {\n for (int j = 0; j < nCols; j++) {\n double left = (j == 0) ? 0 : localInput[i * nCols + j - 1];\n double right = (j == nCols - 1) ? 0 : localInput[i * nCols + j + 1];\n double up = (i == 0) ? 0 : localInput[(i - 1) * nCols + j];\n double down = (i == nRows - 1) ? 
0 : localInput[(i + 1) * nCols + j];\n localOutput[i * nCols + j] = (left + right + up + down + localInput[i * nCols + j]) / 5;\n }\n }\n MPI_Gather(localOutput.data(), nRows * nCols, MPI_DOUBLE, output.data(), nRows * nCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = (rank + 1) * local_N;\n\n // Compute the local output grid\n std::vector local_output(local_N * N, 0);\n\n // Compute the local jacobi stencil\n for (size_t i = 1; i < local_N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n local_output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local output grids into the global output grid\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the average in the local output\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t 
local_N = N / size;\n\n // Compute the offset of the local grid\n size_t offset = rank * local_N;\n\n // Compute the local grid\n std::vector local_input(local_N * N);\n std::vector local_output(local_N * N);\n\n // Copy the relevant part of the input grid to the local grid\n MPI_Scatter(input.data(), local_N * N, MPI_DOUBLE, local_input.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local jacobi iteration\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the output grid on rank 0\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min((rank + 1) * local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N * N], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += 
input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output grid\n std::vector local_output(local_N * N, 0);\n\n // Iterate over the local grid\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output grid\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local output grids into the global output grid\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Iterate over the local grid\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the average in the output grid\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(&output[start][0], local_N * N, MPI_DOUBLE, &output[0][0], local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * 
local_N;\n size_t end = (rank + 1) * local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N, 0);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min(start_row + local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N);\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min((rank + 1) * local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < N - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n\n // Store the average in the output\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n 
// Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows and columns per rank\n int rows_per_rank = N / size;\n int cols_per_rank = N;\n\n // Compute the starting row and column for this rank\n int start_row = rank * rows_per_rank;\n int start_col = 0;\n\n // Compute the ending row and column for this rank\n int end_row = start_row + rows_per_rank;\n int end_col = start_col + cols_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * cols_per_rank);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = start_col; j < end_col; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local results\n local_output[(i - start_row) * cols_per_rank + (j - start_col)] = avg;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), rows_per_rank * cols_per_rank, MPI_DOUBLE, output.data(), rows_per_rank * cols_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_size = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_size;\n size_t end = (rank + 1) * local_size;\n\n // Compute the local output\n std::vector local_output(local_size * N);\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_size * N, MPI_DOUBLE, output.data(), local_size * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the number of rows per process\n int rows_per_proc = N / num_procs;\n\n // Compute the starting and ending rows for this process\n int start_row = rank * rows_per_proc;\n int end_row = (rank + 1) * rows_per_proc;\n\n // Iterate over the rows assigned to this process\n for (int i = start_row; i < end_row; ++i) {\n // Iterate over the columns\n for (int j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the output\n output[i * N + j] = avg;\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), N * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible 
for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "cuda", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\n__global__ void jacobi2D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] +\n input[i * N + j + N] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i == 0) ? 0 : input[i - 1 + j * N];\n double right = (i == N - 1) ? 0 : input[i + 1 + j * N];\n double up = (j == 0) ? 0 : input[i + (j - 1) * N];\n double down = (j == N - 1) ? 0 : input[i + (j + 1) * N];\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i == 0) ? 0 : input[i - 1 + j * N];\n double right = (i == N - 1) ? 0 : input[i + 1 + j * N];\n double up = (j == 0) ? 
0 : input[i + (j - 1) * N];\n double down = (j == N - 1) ? 0 : input[i + (j + 1) * N];\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5.0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i >= N || j >= N) return;\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n}\n", + "\nint i = blockIdx.x + blockIdx.y * blockDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] +\n input[i * N + j - 1] + input[i * N + j + 1] +\n input[i * N + j]) /\n 5;\n }\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i >= N || j >= N) return;\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + (j - 1 + N) % N] + input[i * N + (j + 1) % N] +\n input[(i - 1 + N) % N * N + j] + input[(i + 1) % N * N + j] +\n input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * gridDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.x;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? 
input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n output[y * N + x] = (input[y * N + x - 1] + input[y * N + x + 1] + input[(y - 1) * N + x] + input[(y + 1) * N + x] + input[y * N + x]) / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n double sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n sum += input[y * N + x];\n output[y * N + x] = sum / 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockDim.x * gridDim.x;\n int j = blockIdx.y + blockDim.y * gridDim.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different portion of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the result in the output\n output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local grid\n std::vector local_input(local_N * N);\n std::vector local_output(local_N * N);\n for (size_t i = 0; i < local_N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_input[i * N + j] = input[(start + i) * N + j];\n }\n }\n\n // Compute the local jacobi stencil\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n_local = N / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n for (int j = 0; j < N; j++) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * n_local * N], n_local * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start * N], n_local * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local grid\n std::vector local_input(input.begin() + start * N, input.begin() + end * N);\n std::vector local_output(local_N * N);\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] +\n local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] +\n local_input[i * N + j]) /\n 5.0;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min(start_row + local_N, N);\n\n // Create a local copy of the input\n std::vector local_input(local_N * N);\n MPI_Scatter(input.data(), N * N, MPI_DOUBLE, local_input.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Create a local copy of the output\n std::vector local_output(local_N * N);\n\n // Compute the local part of the jacobi stencil\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < end_row - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n size_t start_row = rank * local_N;\n size_t end_row = start_row + local_N;\n\n // Compute the local part of the output\n std::vector local_output(local_N * N, 0);\n #pragma omp parallel for\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global 
output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local portion of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n // Check if we are on the boundary\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n // If we are on the boundary, set the value to 0\n local_output[i * N + j] = 0;\n } else {\n // Otherwise, compute the average of the neighbors\n sum += input[(i - 1) * N + j];\n sum += input[(i + 1) * N + j];\n sum += input[i * N + (j - 1)];\n sum += input[i * N + (j + 1)];\n sum += input[i * N + j];\n local_output[i * N + j] = sum / 5;\n }\n }\n }\n\n // Gather the local results into the output on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the jacobi stencil for the subset of rows assigned to this rank\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the local subset of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int 
rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local size of the grid\n size_t local_N = N / nprocs;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n // Check if we are at the boundary\n if (i == 0) {\n sum += 0;\n } else {\n sum += input[i - 1][j];\n }\n if (i == N - 1) {\n sum += 0;\n } else {\n sum += input[i + 1][j];\n }\n if (j == 0) {\n sum += 0;\n } else {\n sum += input[i][j - 1];\n }\n if (j == N - 1) {\n sum += 0;\n } else {\n sum += input[i][j + 1];\n }\n sum += input[i][j];\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[start][0], local_N * N, MPI_DOUBLE, &output[0][0], local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the local subset of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n local_output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output on rank 0\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE, output.data(), output.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local size of the grid\n size_t local_size = N / nprocs;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_size;\n size_t end = (rank + 1) * local_size;\n\n // Compute the local grid\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size * local_size);\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = 1; i < local_size - 1; i++) {\n for (size_t j = 1; j < local_size - 1; j++) {\n local_output[i * local_size + j] = (local_input[i * local_size + j - 1] + local_input[i * local_size + j + 1] +\n local_input[(i - 1) * local_size + j] + local_input[(i + 1) * local_size + j] +\n local_input[i * local_size + j]) /\n 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_size * local_size, MPI_DOUBLE, output.data(), local_size * local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local bounds\n size_t local_N = N / size;\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute local jacobi2D\n #pragma omp parallel for\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i][j] = 
sum / 5;\n }\n }\n\n // Gather results to rank 0\n if (rank != 0) {\n MPI_Send(&output[start][0], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N][0], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_size = (N + size - 1) / size;\n size_t start = rank * local_size;\n size_t end = std::min((rank + 1) * local_size, N);\n\n // Compute the local part of the output\n std::vector local_output(local_size * N);\n #pragma omp parallel for\n for (size_t i = 0; i < local_size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < local_size - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_size * N, MPI_DOUBLE, output.data(), local_size * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N;\n int nCols = N;\n int nRowsPerRank = nRows / size;\n int startRow = rank * nRowsPerRank;\n int endRow = (rank + 1) * nRowsPerRank;\n if (rank == size - 1) {\n endRow = nRows;\n }\n #pragma omp parallel for collapse(2)\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < nCols; j++) {\n double left = (j == 0) ? 0 : input[i * nCols + j - 1];\n double right = (j == nCols - 1) ? 0 : input[i * nCols + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * nCols + j];\n double down = (i == nRows - 1) ? 
0 : input[(i + 1) * nCols + j];\n output[i * nCols + j] = (left + right + up + down + input[i * nCols + j]) / 5;\n }\n }\n if (rank != 0) {\n MPI_Send(&output[startRow * nCols], (endRow - startRow) * nCols, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * nRowsPerRank;\n int endRow = (i + 1) * nRowsPerRank;\n if (i == size - 1) {\n endRow = nRows;\n }\n MPI_Recv(&output[startRow * nCols], (endRow - startRow) * nCols, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) { sum += input[i - 1][j]; }\n if (i < N - 1) { sum += input[i + 1][j]; }\n if (j > 0) { sum += input[i][j - 1]; }\n if (j < N - 1) { sum += input[i][j + 1]; }\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the local portion of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = N / nprocs;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = start_row + rows_per_rank;\n\n // Create a buffer to store the results for this rank\n std::vector local_output(N * rows_per_rank);\n\n // Compute the Jacobi stencil for this rank\n #pragma omp parallel for\n for (size_t i 
= start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n MPI_Gather(local_output.data(), N * rows_per_rank, MPI_DOUBLE, output.data(), N * rows_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + } +] \ No newline at end of file diff --git a/results/015cff6f/gemini-pro_temp0.2/mpi+omp.json b/results/015cff6f/gemini-pro_temp0.2/mpi+omp.json new file mode 100644 index 0000000..626f9e0 --- /dev/null +++ b/results/015cff6f/gemini-pro_temp0.2/mpi+omp.json @@ -0,0 +1,57611 @@ +[ + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); j++) {\n for (int k = j+1; k < points.extent(0); k++) {\n double area = triangleArea(points(i), points(j), points(k));\n minArea = std::min(minArea, area);\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy>(\n 0, points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const Kokkos::IndexType i, double &minArea) {\n for (Kokkos::IndexType j = i + 1; j < points.extent(0); ++j) {\n for (Kokkos::IndexType k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n },\n minArea);\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "0.0)", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n double 
area = triangleArea(points(i), points(j), points(k));\n minArea = std::min(minArea, area);\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(\"smallestArea\", points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); j++) {\n for (int k = j+1; k < points.extent(0); k++) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using exec_space = Kokkos::DefaultExecutionSpace;\n using view_type = Kokkos::View;\n\n const int num_points = points.extent(0);\n view_type areas(\"areas\", num_points * num_points * num_points);\n\n parallel_for(policy_type(0, num_points * num_points * num_points), KOKKOS_LAMBDA(const int i) {\n const int i1 = i / (num_points * num_points);\n const int i2 = (i / num_points) % num_points;\n const int i3 = i % num_points;\n if (i1 != i2 && i1 != i3 && i2 != i3) {\n areas(i) = triangleArea(points(i1), points(i2), points(i3));\n } else {\n areas(i) = std::numeric_limits::max();\n }\n });\n\n return *min_element(areas.data(), areas.data() + areas.extent(0));\n}\n", + "\n\n double area = 0;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return area;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(\"smallestArea\", policy, KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n \"smallestArea\",\n policy_type(0, points.extent(0) * points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const int& i) {\n const int i1 = i / (points.extent(0) * points.extent(0));\n const int i2 = (i / points.extent(0)) % points.extent(0);\n const int i3 = i % points.extent(0);\n if (i1 != i2 && i2 != i3 && i1 != i3) {\n minArea = std::min(minArea, triangleArea(points(i1), points(i2), points(i3)));\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), 
points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n policy_type(0, points.extent(0) - 2),\n KOKKOS_LAMBDA(const int i) {\n for (int j = i + 1; j < points.extent(0) - 1; ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(\"smallestArea\", points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const int& i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n \"smallestArea\",\n policy_type(0, points.extent(0) * points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const int i) {\n const int i1 = i / (points.extent(0) * points.extent(0));\n const int i2 = (i / points.extent(0)) % points.extent(0);\n const int i3 = i % points.extent(0);\n if (i1 != i2 && i1 != i3 && i2 != i3) {\n minArea = std::min(minArea, triangleArea(points(i1), points(i2), points(i3)));\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = 
std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n#pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for 
(size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double total_area = 0;\n #pragma omp parallel for reduction(+: total_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n total_area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return total_area;\n}\n", + "\n\n double area = 0;\n#pragma omp parallel for reduction(+:area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return area;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min:minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 
5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], 
points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "<", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n if (n < 3) {\n return 0;\n }\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double area = 0;\n for (size_t i = 0; i < points.size(); ++i) {\n area += ::Area(points[i], points[(i + 1) % points.size()], points[(i + 2) % points.size()]);\n }\n return area;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * fabs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Compute the area of the smallest triangle that can be formed by any 3 points.\n Return the result in area.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\n__global__ void smallestArea(const Point *points, size_t N, double *area) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid >= N) return;\n\n double minArea = DBL_MAX;\n\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double area = triangleArea(points[gid], points[i], points[j]);\n if (area < minArea) minArea = area;\n }\n }\n\n if (tid == 0) *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double a = triangleArea(points[i], points[j], points[k]);\n minArea = min(minArea, a);\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = 1e100;\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n for (int k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid >= N) {\n return;\n }\n double minArea = triangleArea(points[tid], points[(tid + 1) % N], points[(tid + 2) % N]);\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n minArea = fmin(minArea, triangleArea(points[tid], points[i], points[j]));\n }\n }\n area[tid] = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for 
(size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = triangleArea(points[0], points[1], points[2]);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n int tid = get_global_id(0);\n if (tid < N) {\n area[tid] = ::Area(points[tid], points[(tid + 1) % N], points[(tid + 2) % N]);\n }\n}\n", + "\n\n Point A = points[threadIdx.x];\n Point B = points[threadIdx.y];\n Point C = points[threadIdx.z];\n double t = triangleArea(A, B, C);\n atomicMin(area, t);\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Each thread computes the area of the triangle formed by itself and the next two points.\n // The smallest area is stored in the shared memory.\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n int tid = threadIdx.x;\n if (tid < N - 2) {\n double area = triangleArea(points[tid], points[tid + 1], points[tid + 2]);\n if (area < minArea) {\n minArea = area;\n }\n }\n\n // Reduce the shared memory to find the smallest area.\n __syncthreads();\n if (tid == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n // Return the result in area.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n __shared__ Point minA, minB, minC;\n\n double localArea = triangleArea(points[threadIdx.x], points[threadIdx.y], points[threadIdx.z]);\n if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {\n minArea = localArea;\n minA = points[threadIdx.x];\n minB = points[threadIdx.y];\n minC = points[threadIdx.z];\n }\n\n __syncthreads();\n\n if (localArea < minArea) {\n minArea = localArea;\n minA = points[threadIdx.x];\n minB = points[threadIdx.y];\n minC = points[threadIdx.z];\n }\n\n __syncthreads();\n\n if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t 
j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Store the result in the output array.\n *area = minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the areas to find the minimum\n double minArea;\n MPI_Reduce(&areas[0], &minArea, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the minimum area from process 0\n if (rank == 0) {\n return minArea;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n minArea = std::min(minArea, area);\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int myRank, numProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numProcs);\n\n // Each process computes the area of all possible triangles\n std::vector localAreas;\n for (size_t i = 0; i < points.size(); ++i) {\n for 
(size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n localAreas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all areas to the root process\n std::vector globalAreas;\n MPI_Gather(localAreas.data(), localAreas.size(), MPI_DOUBLE,\n globalAreas.data(), localAreas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double smallestArea = std::numeric_limits::max();\n if (myRank == 0) {\n for (double area : globalAreas) {\n smallestArea = std::min(smallestArea, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&smallestArea, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return smallestArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all areas to the root process\n std::vector global_areas;\n MPI_Gather(local_areas.data(), local_areas.size(), MPI_DOUBLE,\n global_areas.data(), local_areas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double smallest_area = std::numeric_limits::max();\n if (rank == 0) {\n for (double area : global_areas) {\n smallest_area = std::min(smallest_area, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&smallest_area, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return smallest_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of points per process\n int points_per_process = points.size() / size;\n\n // Compute the starting and ending indices for this process\n int start = rank * points_per_process;\n int end = (rank + 1) * points_per_process;\n\n // Compute the smallest area for this process\n double min_area = std::numeric_limits::max();\n for (int i = start; i < end; ++i) {\n for (int j = i + 1; j < end; ++j) {\n for (int k = j + 1; k < end; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < min_area) {\n min_area = area;\n }\n }\n }\n }\n\n // Reduce the minimum areas from all processes\n double global_min_area;\n MPI_Allreduce(&min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN);\n\n return global_min_area;\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n 
// Reduce the areas to find the smallest one\n double smallest;\n MPI_Allreduce(areas.data(), &smallest, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return smallest;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the global minimum\n double global_min;\n MPI_Allreduce(local_areas.data(), &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n // Find the minimum area triangle\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Return the minimum area\n return minArea;\n}\n\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(&local_areas[0], &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area from process 0\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1; // Other processes return -1 to avoid printing multiple times\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the areas to find the smallest one\n double smallestArea;\n MPI_Allreduce(areas.data(), &smallestArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return smallestArea;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each process computes the area of the smallest triangle formed by any 3 of its points\n double min_area = std::numeric_limits::max();\n for (size_t i = my_rank; i < points.size(); i += num_procs) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n min_area = std::min(min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the minimum area across all processes\n double global_min_area;\n MPI_Allreduce(&min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < 
points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all local areas into a single vector on process 0\n std::vector global_areas;\n if (rank == 0) {\n global_areas.resize(local_areas.size() * size);\n }\n MPI_Gather(local_areas.data(), local_areas.size(), MPI_DOUBLE,\n global_areas.data(), local_areas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double min_area = std::numeric_limits::max();\n if (rank == 0) {\n for (double area : global_areas) {\n min_area = std::min(min_area, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&min_area, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n for (size_t i = rank; i < points.size(); i += size) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "&" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * fabs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Compute the area of the smallest triangle that can be formed by any 3 points.\n Return the result in area.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\n__global__ void smallestArea(const Point *points, size_t N, double *area) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Each thread computes the area of a triangle formed by itself and the next two points.\n // The minimum area is stored in the shared memory.\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N - 2) {\n double a = triangleArea(points[tid], points[tid + 1], points[tid + 2]);\n if (a < minArea) {\n minArea = a;\n }\n }\n __syncthreads();\n\n // The thread with the minimum area writes the result to the output pointer.\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid >= N) return;\n\n double minArea = DBL_MAX;\n\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double area = triangleArea(points[gid], points[i], points[j]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[gid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n double minArea = triangleArea(points[0], points[1], points[2]);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) 
{\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Each thread computes the area of a triangle formed by 3 points.\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[tid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n // Compute the area of the smallest triangle formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Store the result in the output array.\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n __shared__ Point minA, minB, minC;\n\n // Compute the area of all triangles\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n minA = points[i];\n minB = points[j];\n minC = points[k];\n }\n }\n }\n }\n\n // Reduce the minimum area across all threads\n __syncthreads();\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n // Return the result in area.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[tid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < minArea) {\n minArea = a;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Initialize the area to a large value\n *area = DBL_MAX;\n\n // Loop over all possible combinations of 3 points\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n // Compute the area of the triangle formed by the 3 points\n double triangle_area = triangleArea(points[i], points[j], points[k]);\n\n // Update the area if the triangle area is smaller than the current minimum\n if (triangle_area < *area) {\n *area = triangle_area;\n }\n }\n }\n }\n}\n", + "\n\n *area = DBL_MAX;\n for 
(size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < minArea) {\n minArea = a;\n }\n }\n }\n }\n *area = minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int num_points = points.size();\n int points_per_proc = num_points / num_procs;\n int start_index = my_rank * points_per_proc;\n int end_index = (my_rank + 1) * points_per_proc;\n if (my_rank == num_procs - 1) {\n end_index = num_points;\n }\n\n double local_area = 0.0;\n for (int i = start_index; i < end_index; i++) {\n for (int j = i + 1; j < num_points; j++) {\n for (int k = j + 1; k < num_points; k++) {\n local_area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n\n double global_area = 0.0;\n MPI_Reduce(&local_area, &global_area, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_area;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n // Use OpenMP to parallelize the loop over all possible triples of points\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0029543604, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029814972, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002600821, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017161086, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000978967, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.000524825, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002732046, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000145453, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001444027, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00014487035, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002953006, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029421056, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025692898, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016943442, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000967009, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000518471, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000270788, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000142686, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001464645, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001479141, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0029516474, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029731712, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025997146, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017135602, + "num_procs": 1, + 
"num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009780552, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000522665, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002739142, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001452126, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001454043, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001509036, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(&local_areas[0], &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": ".", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Allreduce(local_areas.data(), &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min_area;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n 
MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002953699, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029725186, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002597769, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017133056, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009784962, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005242156, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000274381, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001461482, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001477221, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001493508, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector localAreas;\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n localAreas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double globalMinArea;\n MPI_Allreduce(localAreas.data(), &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return globalMinArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002952755, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029724664, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025991756, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017128504, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": 
true, + "runtime": 0.000978306, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005241838, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002733832, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001450016, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001477592, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00014966845, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = points.size();\n double local_min = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0029537896, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029801586, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026046102, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017165798, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000980396, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005250134, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002750438, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001443006, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001470559, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001496279, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0029524664, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029730592, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026203018, + 
"num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017134582, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009787028, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005241058, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000272846, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001440924, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001474825, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015160075, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0029515132, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029385124, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025694784, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016946704, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009672772, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005181382, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002702774, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001440082, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001462033, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00014714875, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = points.size();\n double local_min = std::numeric_limits::max();\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n local_min = std::min(local_min, area);\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0029522446, + "runs": [ + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.0040321952, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int n = points.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each process calculates the area of all possible triangles\n std::vector local_areas(n * (n - 1) * (n - 2) / 6);\n int count = 0;\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n local_areas[count++] = triangleArea(points[i], points[j], points[k]);\n }\n }\n }\n\n // Reduce the local areas to find the global minimum\n double global_min;\n MPI_Allreduce(local_areas.data(), &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = rank; i < points.size(); i += size) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(local_areas.data(), &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n int n = points.size();\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n minArea = std::min(minArea, area);\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN);\n return globalMinArea;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int num_points = points.size();\n double min_area = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: min_area)\n for (int i = 0; i < num_points; i++) {\n for (int j = i + 1; j < num_points; j++) {\n for (int k = j + 1; k < num_points; k++) {\n double area = 
triangleArea(points[i], points[j], points[k]);\n min_area = std::min(min_area, area);\n }\n }\n }\n\n return min_area;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0029543306, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029763656, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026036538, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017169322, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009796966, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005247146, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002729746, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001448794, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001444439, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00014482335, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0029542182, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029379816, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025691254, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016954958, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009685116, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005191162, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000270738, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001433186, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001462963, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00014928465, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < points.size(); ++i) {\n for (int 
j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &minArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0029545268, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029392442, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025751188, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016941236, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000967063, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005183408, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002700686, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000143762, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00014512, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001492912, + "num_procs": 4, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "kokkos", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Create a team policy for parallel execution\n TeamPolicy policy(x.extent(0));\n\n // Define the kernel lambda\n auto kernel = KOKKOS_LAMBDA(const TeamPolicy<> &policy, int i) {\n double min_dist = std::numeric_limits::max();\n for (int j = i + 1; j < x.extent(0); ++j) {\n min_dist = std::min(min_dist, distance(x(i), x(j)));\n }\n return min_dist;\n };\n\n // Execute the kernel in parallel\n auto min_dists = parallel_reduce(policy, kernel);\n\n // Find the minimum distance among all pairs\n double min_dist = std::numeric_limits::max();\n for (int i = 0; i < min_dists.extent(0); ++i) {\n min_dist = std::min(min_dist, min_dists(i));\n }\n\n return min_dist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist[0] = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist[0]) {\n\t\t\t\tmin_dist[0] = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist[0];\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double &min_dist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist(0);\n}\n", + "\n\n\tconst int n = x.extent(0);\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(\"closestPair\", Kokkos::RangePolicy(0, n - 1), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing policy = Kok\u043a\u043e\u0441::Range\u653f\u7b56;\n\tusing view_type = Kok\u043a\u043e\u0441::View;\n\n\tview_type min_dist(\"min_dist\", std::numeric_limits::max());\n\n\tauto lambda = KOKKOS_LAMBDA(const double x1, const double x2) {\n\t\tdouble dist = ::abs(x1 - x2);\n\t\tif (dist < min_dist()) {\n\t\t\tmin_dist() = dist;\n\t\t}\n\t};\n\n\t// Iterate over all possible un-sorted pair of elements\n\t// and find the minimum distance between them.\n\t// Note that we are iterating over all the elements in the vector\n\t// multiple times. 
This is not efficient.\n\t// A more efficient approach would be to sort the vector first\n\t// and then only consider the minimum distance between each element\n\t// and its immediate neighbor.\n\t// However, sorting is not supported by Kok\u043a\u043e\u0441.\n\t// So, we are using this simple approach for now.\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < x.size(); ++j) {\n\t\t\tlambda(x[i], x[j]);\n\t\t}\n\t}\n\n\treturn min_dist();\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int i, double &min_dist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist(0);\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\tKok\u30b3\u30af::TaskGraph tg;\n\tKok\u30b3\u30af::Task task([&](Kok\u30b3\u30af::TaskContext ctx) -> double {\n\t\tdouble min = std::abs(x[0] - x[1]);\n\t\tfor (size_t i = 0; i < x.size(); i++) {\n\t\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\t\tmin = std::min(min, distance(x[i], x[j]))\n\t\t\t}\n\t\t}\n\t\treturn min;\n\t});\n\ttg.run(task);\n\treturn task.get();\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::RangePolicy;\n\tusing tag_type = Kokkos::pair;\n\n\tauto min_distance = Kokkos::parallel_reduce(\n\t\tpolicy_type(0, x.size() * x.size()),\n\t\tKOKKOS_LAMBDA(const tag_type &idx, double &min_distance) {\n\t\t\tint i = idx.first / x.size();\n\t\t\tint j = idx.first % x.size();\n\t\t\tif (i != j) {\n\t\t\t\tmin_distance = std::min(min_distance, distance(x(i), x(j)));\n\t\t\t}\n\t\t},\n\t\tmin_distance);\n\n\treturn min_distance;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in x.\n\tKokkos::View distances(\"distances\", x.size() * x.size());\n\tKokkos::parallel_for(Kokkos::RangePolicy(0, x.size() * x.size()), KOKKOS_LAMBDA(const int i) {\n\t\tdistances(i) = distance(x(i / x.size()), x(i % x.size()));\n\t});\n\n\t// Find the minimum distance.\n\tdouble minDistance = Kokkos::min_reduce(distances, Kokkos::Min());\n\n\treturn minDistance;\n}\n", + "\n\n\tKokkos::View min_dist(\"min_dist\", 1);\n\tmin_dist[0] = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) 
{\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist[0]) {\n\t\t\t\tmin_dist[0] = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist[0];\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist(0) = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tdouble d = distance(x(i), x(j));\n\t\t\tif (d < min_dist(0)) {\n\t\t\t\tmin_dist(0) = d;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist(0);\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int &i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\n\treturn minDist;\n}\n", + "\n\n\tconst int n = x.extent(0);\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tauto min_distance = Kokkos::parallel_reduce(\n\t\tx.extent(0), KOKKOS_LAMBDA(const int i, double min) {\n\t\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\t\tmin = std::min(min, distance(x(i), x(j)));\n\t\t\t}\n\t\t\treturn min;\n\t\t},\n\t\tstd::numeric_limits::max());\n\n\treturn min_distance;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist(0) = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tdouble dist = distance(x(i), x(j));\n\t\t\tif (dist < min_dist(0)) {\n\t\t\t\tmin_dist(0) = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist(0);\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "omp", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use OpenMP to compute in parallel.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor 
(int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble min_distance = std::numeric_limits::max();\n\tint n = x.size();\n\t#pragma omp parallel for reduction(min: min_distance)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_distance = std::min(min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "&", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_distance = std::abs(x[0] - x[1]);\n#pragma openmp parallel for reduction(min:min_distance)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_distance = std::min(min_distance, distance(x[i], x[j]);\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n#pragma omp parallel for reduction(min: min_dist)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = 
std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n - 1; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "serial", + "prompt": "double distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 1; i < x.size(); ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i - 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn -1;\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = distance(x[0], x[1]);\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn 
std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tminDist = std::min(minDist, distance(x[i], x[i + 1]));\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = distance(x[0], x[1]);\n\tfor (size_t i = 1; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn -1;\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble minDist = distance(x[0], x[1]);\n\tfor (int i = 1; i < x.size() - 1; ++i) {\n\t\tminDist = std::min(minDist, distance(x[i], x[i + 1]));\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 1; i < x.size(); ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i - 1]));\n\t}\n\treturn min_dist;\n}\n", + "(", + "\n\n double min_dist = std::numeric_limits::max();\n for (size_t i = 0; i < 
x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n min_dist = std::min(min_dist, distance(x[i], x[j]));\n }\n }\n return min_dist;\n}\n", + "\n\n double min_distance = std::numeric_limits::max();\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n min_distance = std::min(min_distance, distance(x[i], x[j]));\n }\n }\n return min_distance;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "hip", + "prompt": "__device__ double distanceBetweenPoints(double x1, double x2) {\n\treturn fabs(x1 - x2);\n}\n\n/* Compute the distance between the closest two elements in the vector x.\n Store the result in distance.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\n__global__ void closestPair(const double *x, size_t N, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tdouble minDist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tfor (int i = tid + 2; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\tdistance[tid] = minDist;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= N) {\n\t\treturn;\n\t}\n\n\tdouble minDist = distanceBetweenPoints(x[gid], x[gid + 1]);\n\tfor (int i = gid + 2; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[gid], x[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "losest", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tdouble minDistance = DBL_MAX;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDistance) {\n\t\t\t\tminDistance = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDistance;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint grid_size = gridDim.x;\n\tint i = bid * bid_size + tid;\n\tif (i >= N) {\n\t\treturn;\n\t}\n\tdouble min_dist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t}\n\tdistance[tid] = 
min_dist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\nfor (size_t i = 0; i < N; ++i) {\n\t\tdistance[i] = distance(x[i], x[i + 1]);\n\t}\n}\n", + "\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint bid_size = hipBlockDim_x;\n\n\tdouble min_dist = DBL_MAX;\n\n\tfor (int i = bid * bid_size + tid; i < N; i += bid_size) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tif (tid == 0 || dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble min_dist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = min_dist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "losest", + "\n\n\tint tid = threadIdx.x;\n\tint blockDim = blockDim.x;\n\tint blockIdx = blockIdx.x;\n\tint i = blockIdx * blockDim + tid;\n\n\tdouble min_dist = DBL_MAX;\n\twhile (i < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[i + 1]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t\ti += blockDim * gridDim.x;\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble min_dist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = min_dist;\n}\n" + ] + 
}, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "mpi", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process finds the closest pair in its local part of the vector.\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j += size) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between its elements\n\tfor (int i = my_rank; i < n; i += num_procs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = n / size;\n\tint start = rank * chunk_size;\n\tint end = start + chunk_size;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn globalMinDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = (n + size - 1) / size;\n\tint start = rank * chunk_size;\n\tint end = std::min(start + chunk_size, n);\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tstd::vector local_x(x.begin() + start, x.begin() + end);\n\tstd::sort(local_x.begin(), local_x.end());\n\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < local_n - 1; i++) {\n\t\tlocal_min_dist = std::min(local_min_dist, distance(local_x[i], local_x[i + 1]));\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tif (rank == size - 1) {\n\t\tend = x.size();\n\t}\n\tdouble local_min_dist = 
std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\tint n = x.size();\n\tint local_n = n / num_procs;\n\tint start = my_rank * local_n;\n\tint end = (my_rank + 1) * local_n;\n\tif (my_rank == num_procs - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between its elements\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn globalMinDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the distance between its elements and the elements of the next process.\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = (rank + 1) * local_size;\n\tif (rank == size - 1) {\n\t\tend = x.size();\n\t}\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble 
global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = n / size;\n\tint start = rank * chunk_size;\n\tint end = (rank + 1) * chunk_size;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the distance between its elements\n\tstd::vector local_distances;\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tfor (size_t j = i + 1; j < x.size(); j += size) {\n\t\t\tlocal_distances.push_back(distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Gather all distances to the root process\n\tstd::vector global_distances;\n\tMPI_Gather(local_distances.data(), local_distances.size(), MPI_DOUBLE,\n\t\tglobal_distances.data(), local_distances.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the minimum distance\n\tdouble min_distance = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double d : global_distances) {\n\t\t\tmin_distance = std::min(min_distance, d);\n\t\t}\n\t}\n\n\t// Broadcast the minimum distance to all processes\n\tMPI_Bcast(&min_distance, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn min_distance;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "cuda", + "prompt": "__device__ double distanceBetweenPoints(double x1, double x2) {\n\treturn fabs(x1 - x2);\n}\n\n/* Compute the distance between the closest two elements in the vector x.\n Store the result in distance.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\n__global__ void closestPair(const double *x, size_t N, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= N) return;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = bid * blockDim.x + tid;\n\n\tif (i >= N) {\n\t\treturn;\n\t}\n\n\tdouble min_dist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint blockId = blockIdx.x;\n\tint blockDim = blockDim.x;\n\tint gridDim = gridDim.x;\n\n\t// Each thread computes the distance between its element and the next element.\n\tif (tid < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tif (tid == 0 || dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n\n\t// Each block reduces the distances computed by its threads.\n\t__shared__ double blockDistance;\n\tif (tid == 0) {\n\t\tblockDistance = *distance;\n\t}\n\t__syncthreads();\n\tif (tid < blockDim - 1) {\n\t\tdouble dist = distanceBetweenPoints(blockDistance, x[blockId * blockDim + tid + 1]);\n\t\tif (dist < blockDistance) {\n\t\t\tblockDistance = dist;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*distance = blockDistance;\n\t}\n\n\t// The block with the smallest distance broadcasts it to all other blocks.\n\t__shared__ double gridDistance;\n\tif (blockId == 0) {\n\t\tgridDistance = *distance;\n\t}\n\t__syncthreads();\n\tif (blockId < gridDim - 1) {\n\t\tdouble dist = distanceBetweenPoints(gridDistance, x[(blockId + 1) * blockDim]);\n\t\tif (dist < gridDistance) {\n\t\t\tgridDistance = dist;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (blockId == 0) {\n\t\t*distance = gridDistance;\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\t__shared__ double minDistance;\n\tminDistance = DBL_MAX;\n\n\tfor (size_t i = threadIdx.x; i < N - 1; i += blockDim.x) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDistance) {\n\t\t\t\tminDistance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (threadIdx.x == 0) {\n\t\t*distance = minDistance;\n\t}\n}\n", + "\n\n\t// Each thread computes the distance between its element and all subsequent elements.\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\t// Compute the distance between the current element and all subsequent elements.\n\tfor (int i = tid + bid * stride; i < N - 1; i += stride) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[i + 1]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = 
DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\tdouble min_dist = DBL_MAX;\n\twhile (i < N) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t\ti += blockDim.x * gridDim.x;\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = min_dist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x * gridDim.x;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tdouble minDist = DBL_MAX;\n\twhile (i < N) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\ti += blockDim.x * gridDim.x;\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (i == tid + 1 || dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= N) {\n\t\treturn;\n\t}\n\tdouble minDist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) 
{\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\tdistance[tid] = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i < N - 1) {\n\t\tdistance[i] = distance(x[i], x[i + 1]);\n\t}\n}\n", + "\n\n\t// Each thread computes the distance between its element and all elements to its right.\n\tint tid = threadIdx.x;\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min : local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0830644342, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0641417658, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0482196928, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0280994846, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0150701864, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0078004814, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0039718944, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020179008, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tint size;\n\tMPI_Comm_size(MPI_COMM_WORLD, 
&size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = my_rank * local_n;\n\tint end = (my_rank + 1) * local_n;\n\tif (my_rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min : local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0830692872, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0641951058, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.048157153, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0281008074, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.015067872, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077908632, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004149297, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020341136, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between any two elements in its local subvector.\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all processes to find the global minimum distance.\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.083077118, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.083105842, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0623395314, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0363658204, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0194977758, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0100983256, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0051519388, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026344442, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015793323, + "num_procs": 2, + "num_threads": 64 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.00077099825, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.08307076, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0642389982, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0481540232, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0280840758, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.015054485, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0078052982, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0039800534, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020422726, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020285606, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020306034, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint n = x.size();\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\t// Each process computes the distance between all pairs of elements in its local subvector.\n\tstd::vector local_min_distances(n / num_procs);\n\t#pragma omp parallel for\n\tfor (int i = my_rank * (n / num_procs); i < (my_rank + 1) * (n / num_procs); i++) {\n\t\tfor (int j = i + 1; j < (my_rank + 1) * (n / num_procs); j++) {\n\t\t\tlocal_min_distances[i - my_rank * (n / num_procs)] = std::min(local_min_distances[i - my_rank * (n / num_procs)], distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Each process sends its local minimum distance to the root process.\n\tstd::vector global_min_distances(num_procs);\n\tMPI_Gather(local_min_distances.data(), n / num_procs, MPI_DOUBLE, global_min_distances.data(), n / num_procs, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// The root process finds the minimum distance among all the local minimum distances.\n\tdouble global_min_distance = std::numeric_limits::max();\n\tif (my_rank == 0) {\n\t\tfor (int i = 0; i < num_procs; i++) {\n\t\t\tglobal_min_distance = std::min(global_min_distance, global_min_distances[i]);\n\t\t}\n\t}\n\n\t// Return the global minimum distance.\n\treturn global_min_distance;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, 
+ "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tint n = x.size();\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0616075902, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0642119622, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0481311506, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0281789516, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0150714268, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0078054664, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0039815864, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020244094, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020417598, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002210848, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0616147872, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0642758972, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0482230986, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0280744716, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.015098133, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007799688, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0039746034, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002047196, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + 
"generated_output": "\n\n\tint my_rank, nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the closest pair in its local subvector\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = my_rank; i < n; i += nprocs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0830676862, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0831013978, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0623502696, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0363611332, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0195052078, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0100933424, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0051372752, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026489152, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015160389, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00086290845, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min_dist)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0830689104, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0641525642, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.048115407, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.028142259, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0150694266, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0078032366, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.003977292, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020242852, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\t// Get the number of processes and the rank of the current process\n\tint num_procs, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\t// Divide the vector into chunks and assign each chunk to a process\n\tint chunk_size = x.size() / num_procs;\n\tstd::vector local_x(x.begin() + rank * chunk_size, x.begin() + (rank + 1) * chunk_size);\n\n\t// Find the closest pair in the local chunk\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min_dist)\n\tfor (int i = 0; i < local_x.size(); i++) {\n\t\tfor (int j = i + 1; j < local_x.size(); j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(local_x[i], local_x[j]));\n\t\t}\n\t}\n\n\t// Gather the local minimum distances from all processes\n\tstd::vector global_min_dists(num_procs);\n\tMPI_Gather(&local_min_dist, 1, MPI_DOUBLE, global_min_dists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the smallest global minimum distance\n\tdouble global_min_dist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double dist : global_min_dists) {\n\t\t\tglobal_min_dist = std::min(global_min_dist, dist);\n\t\t}\n\t}\n\n\t// Broadcast the global minimum distance to all processes\n\tMPI_Bcast(&global_min_dist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0830624002, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0688190034, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0525548244, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.030802694, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163611072, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008562706, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0043691756, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022350426, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint n = x.size();\n\tif (n <= 1) {\n\t\treturn -1;\n\t}\n\n\t// Sort the vector in ascending order\n\tstd::sort(x.begin(), x.end());\n\n\t// Find the closest pair in each half of the vector\n\tdouble minDist = distance(x[0], x[1]);\n\t#pragma omp parallel\n\t{\n\t\tint tid = omp_get_thread_num();\n\t\tint numThreads = omp_get_num_threads();\n\t\tint start = tid * n / numThreads;\n\t\tint end = (tid + 1) * n / numThreads;\n\t\tdouble localMinDist = minDist;\n\t\tfor (int i = start; i < end; i++) {\n\t\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\t\tif (dist < localMinDist) {\n\t\t\t\t\tlocalMinDist = 
dist;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t#pragma omp critical\n\t\t{\n\t\t\tif (localMinDist < minDist) {\n\t\t\t\tminDist = localMinDist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0615843222, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0832123938, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0623354458, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0367699186, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0195162966, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0100846838, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0051348766, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026610764, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026796608, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00266886585, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint n = x.size();\n\tint my_rank, nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\t// Each process computes the closest pair in its local subvector.\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = my_rank; i < n; i += nprocs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum.\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0616055716, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.083109064, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0623107596, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0363798446, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.019485523, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0100965444, + "num_procs": 1, + 
"num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0051477488, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026320336, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001340192, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00086244965, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the closest pair in its local subvector\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.083055219, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.083100512, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.062339401, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0364111954, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0195692982, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0100826502, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0051372832, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026431004, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014862309, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00077293205, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.083053788, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0831499128, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0833342784, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0836836686, + "num_procs": 
1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.083249704, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0835302524, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0832012728, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.083241823, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint n = x.size();\n\tint n_local = n / omp_get_num_threads();\n\tint tid = omp_get_thread_num();\n\tdouble min_local = std::numeric_limits::max();\n\t\n\t#pragma omp parallel reduction(min:min_local)\n\tfor (int i = tid * n_local; i < (tid + 1) * n_local; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_local = std::min(min_local, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble min_global;\n\tMPI_Allreduce(&min_local, &min_global, 1, MPI_DOUBLE, MPI_MIN);\n\treturn min_global;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0830597714, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0831703224, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.083160328, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0831273964, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0833215826, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0832132934, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0831553466, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0831949952, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Use OpenMP to parallelize the computation of distances between pairs of elements\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], 
x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.08306748, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.083120229, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0623688444, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0366781262, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0195015464, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010094298, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0051458192, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00263138, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026739771, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026582298, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0615737582, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0641180754, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0481471664, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0280993162, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0150602456, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0078055712, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003970716, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020241104, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; 
j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Reduce(&local_min_dist, 1, MPI_DOUBLE, &global_min_dist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance.\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t}\n\treturn -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n double min_dist = std::numeric_limits::max();\n\n // Compute the distance between each pair of points\n parallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int& i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n double dist = distance(points(i), points(j));\n if (dist < min_dist) {\n min_dist = dist;\n }\n }\n });\n\n return min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View distances(\"Distances\", points.size());\n\tKokkos::parallel_for(\"Compute distances\", points.size(), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tdistances[i] = std::min(distances[i], distance(points[i], points[j]));\n\t\t}\n\t});\n\treturn *Kokkos::min_element(distances);\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(\"Closest Pair\", points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = 
std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tparallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tparallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int j) {\n\t\t\tif (i != j) {\n\t\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\t\tif (dist < minDist) {\n\t\t\t\t\tminDist = dist;\n\t\t\t\t}\n\t\t\t}\n\t\t});\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tminDist = std::min(minDist, dist);\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing TeamPolicy = Kokkos::TeamPolicy>;\n\tusing TeamMember = Kokkos::TeamMember;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const TeamMember& team) {\n\t\tfor (int i = team.league_rank(); i < points.extent(0); i += team.league_size()) {\n\t\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\t\tminDist = std::min(minDist, distance(points(i), points(j)));\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing namespace std;\n\n\t// Initialize the minimum distance to a large value\n\tdouble minDist = numeric_limits::max();\n\n\t// Create a team policy for parallel execution\n\tTeamPolicy policy(Kokkos::DefaultExecutionSpace(), points.extent(0));\n\n\t// Define the kernel function to compute the distance between each pair of points\n\tKokkos::parallel_for(policy, KOKKOS_LAMBDA(const int& i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::RangePolicy;\n\tusing view_type = Kokkos::View;\n\n\tconst int num_points = points.extent(0);\n\tview_type distances(\"Distances\", num_points * num_points);\n\n\tKokkos::parallel_for(policy_type(0, num_points * num_points), KOKKOS_LAMBDA(const int i) {\n\t\tconst int p1_idx = i / num_points;\n\t\tconst int p2_idx = i % num_points;\n\t\tif (p1_idx != p2_idx) {\n\t\t\tdistances(i) = distance(points(p1_idx), points(p2_idx));\n\t\t}\n\t\telse {\n\t\t\tdistances(i) = std::numeric_limits::max();\n\t\t}\n\t});\n\n\tdouble min_distance = 
std::numeric_limits::max();\n\tfor (int i = 0; i < num_points * num_points; i++) {\n\t\tmin_distance = std::min(min_distance, distances(i));\n\t}\n\n\treturn min_distance;\n}\n", + "\n\n\tKok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::make_span(points.begin(), points.end()), 0.0,\n\t\t[](double d, const Point* p) {\n\t\t\treturn d + distance(*p, {0, 0});\n\t\t},\n\t\t[](double a, double b) { return a + b; });\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Initialize the minimum distance to a large value\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\t// Calculate the distance between the two points\n\t\t\tdouble dist = distance(points(i), points(j));\n\n\t\t\t// Update the minimum distance if necessary\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\t// Return the minimum distance\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all points\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\t// Iterate over all points after i\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\t// Calculate the distance between the two points\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\t// Check if the distance is less than the current minimum distance\n\t\t\tif (dist < minDist) {\n\t\t\t\t// Update the minimum distance\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Return the minimum distance\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View distances(\"Distances\", points.extent(0));\n\tKokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdistances(i) = std::min(distances(i), distance(points(i), points(j)));\n\t\t}\n\t});\n\treturn Kokkos::min_reduce(Kokkos::RangePolicy(0, points.extent(0)), distances, distances(0));\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use OpenMP to compute in parallel.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); 
++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = 
std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = points.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_distance = std::numeric_limits::max();\n#pragma omp parallel for reduction(min:min_distance)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tmin_distance = std::min(min_distance, dist);\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "\n\n\t// Check if the vector is empty or has only one point\n\tif (points.empty() || points.size() == 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\t// Sort the points by x-coordinate\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Initialize the minimum distance to the maximum possible value\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = 
std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn 
-1;\n\t}\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tdouble minDist 
= std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tauto compareX = [](Point const& p1, Point const& p2) { return p1.x < p2.x; };\n\tauto compareY = [](Point const& p1, Point const& p2) { return p1.y < p2.y; };\n\n\tstd::sort(points.begin(), points.end(), compareX);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tauto compareX = [](Point const& p1, Point const& p2) { return p1.x < p2.x; };\n\tauto compareY = [](Point const& p1, Point const& p2) { return p1.y < p2.y; };\n\tstd::sort(points.begin(), points.end(), compareX);\n\tdouble minDist = distance(points[0], points[1]);\n\tstd::sort(points.begin(), points.end(), compareY);\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tif (points[j].y - points[i].y >= minDist) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) 
{\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn 0.0;\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = distance(points[0], points[1]);\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distanceBetweenPoints(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the distance between the closest two points in the vector points.\n Store the result in distance.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\n__global__ void closestPair(const Point *points, size_t numPoints, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Initialize the distance to a large value\n\t*distance = DBL_MAX;\n\n\t// Iterate over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid < numPoints) {\n\t\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < num) {\n\t\tdistance[i] = distance(points[i], points[(i + 1) % num]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < num) {\n\t\tdistance[idx] = distance(points[idx], points[(idx + 1) % num]);\n\t}\n}\n", + "\n\n\t// Initialize the distance to a large value\n\t*distance = DBL_MAX;\n\n\t// Loop over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) 
{\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) return;\n\tdouble minDist = distanceBetweenPoints(points[tid], points[(tid+1)%numPoints]);\n\tfor (int i = tid+1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\tatomicMin(distance, minDist);\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid < numPoints) {\n\t\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = gid; i < numPoints; i += hipBlockDim_x) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint idx = hipGetGlobalIndex();\n\tif (idx < num) {\n\t distance[idx] = distance(points[idx], points[(idx + 1) % num]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\t// Initialize the distance to a large value\n\t*distance = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "(1.21,12.12.1222.12.12.", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = distanceBetweenPoints(points[gid], points[(gid + 1) % numPoints]);\n\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Compute the distance between each pair of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) 
{\n\t\treturn;\n\t}\n\tdouble minDist = distanceBetweenPoints(points[tid], points[(tid + 1) % numPoints]);\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\t__shared__ double minDist;\n\t__shared__ Point p1, p2;\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t\tp1 = points[tid];\n\t\t\t\tp2 = points[i];\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local copy of points.\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0.\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\t// Find the closest pair of points on this rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, 
MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_distance = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); ++i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_distance) {\n\t\t\t\tmin_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_distance;\n\tMPI_Allreduce(&min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_distance;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tmin_dist = std::min(min_dist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t}\n\n\treturn -1;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\tif (rank == 0) {\n\t\treturn min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn 
-1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the closest pair on this rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_distance) {\n\t\t\t\tlocal_min_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_distance;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its subset of points\n\tstd::vector> closestPairs;\n\tfor (size_t i = rank; i < points.size(); i += size) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tclosestPairs.push_back({points[i], points[j]});\n\t\t}\n\t}\n\n\t// Each rank sends its closest pairs to rank 0\n\tstd::vector> allClosestPairs;\n\tMPI_Gather(&closestPairs, closestPairs.size(), MPI_DOUBLE, &allClosestPairs, 
allClosestPairs.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 finds the closest pair of points among all the closest pairs\n\tdouble minDist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (auto const& pair : allClosestPairs) {\n\t\t\tdouble dist = distance(pair.first, pair.second);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Rank 0 broadcasts the minimum distance to all other ranks\n\tMPI_Bcast(&minDist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its subset of points.\n\tstd::vector local_points = points;\n\tstd::sort(local_points.begin(), local_points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (size_t i = 0; i < local_points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < local_points.size(); ++j) {\n\t\t\tdouble dist = distance(local_points[i], local_points[j]);\n\t\t\tif (dist < local_min_distance) {\n\t\t\t\tlocal_min_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Each rank sends its local minimum distance to rank 0.\n\tMPI_Gather(&local_min_distance, 1, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 computes the global minimum distance.\n\tdouble global_min_distance = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (int i = 0; i < size; ++i) {\n\t\t\tdouble 
recv_min_distance;\n\t\t\tMPI_Gather(&recv_min_distance, 1, MPI_DOUBLE, &recv_min_distance, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\t\t\tif (recv_min_distance < global_min_distance) {\n\t\t\t\tglobal_min_distance = recv_min_distance;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the closest pair on each rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Reduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distanceBetweenPoints(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the distance between the closest two points in the vector points.\n Store the result in distance.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\n__global__ void closestPair(const Point *points, size_t numPoints, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) return;\n\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = 
blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint num_blocks = gridDim.x;\n\n\tdouble min_dist = DBL_MAX;\n\tfor (int i = tid; i < numPoints; i += bid_size * num_blocks) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = min_dist;\n}\n", + "\n\n\tint idx = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (idx >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDistance = DBL_MAX;\n\tfor (int i = idx + 1; i < numPoints; i++) {\n\t\tdouble distance = distanceBetweenPoints(points[idx], points[i]);\n\t\tif (distance < minDistance) {\n\t\t\tminDistance = distance;\n\t\t}\n\t}\n\n\tif (idx == 0) {\n\t\t*distance = minDistance;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tdouble minDist = DBL_MAX;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int j = i + 1; j < numPoints; j++) {\n\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tdouble minDist = DBL_MAX;\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\tdistance[tid] = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= 
numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int j = i + 1; j < numPoints; j++) {\n\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all ranks\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.3730929544, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3734311356, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2801822708, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1634401904, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0875692792, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04530094, + "num_procs": 1, + 
"num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0230492434, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116848646, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116614698, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116876067, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.3731386562, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.373521729, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2801635756, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.163451027, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0875731064, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0453183566, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0230736054, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0117177884, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116690559, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116834264, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_distance = DBL_MAX;\n\t#pragma omp parallel for reduction(min: min_distance)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t for (size_t j = i + 1; j < points.size(); ++j) {\n\t min_distance = std::min(min_distance, distance(points[i], points[j]));\n\t }\n\t}\n\n\tif (rank == 0) {\n\t return min_distance;\n\t}\n\treturn 0.0;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.3721705172, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3733058998, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2800284464, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1633107238, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0876279574, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.0453230356, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.023082724, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0117450316, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116240037, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01163812945, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Gather all the local minimum distances on rank 0\n\tstd::vector allMinDists(size);\n\tMPI_Gather(&minDist, 1, MPI_DOUBLE, allMinDists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the global minimum distance\n\tif (rank == 0) {\n\t\tfor (double dist : allMinDists) {\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.3719684762, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3735450166, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.280495242, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1635287268, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0875988488, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0453751532, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0231239738, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116663558, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116668988, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116881758, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocal_min_dist = std::min(local_min_dist, dist);\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn 
-1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.3722333808, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3734143214, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2801051122, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1635079618, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0875022434, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.045281127, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0231939226, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116443686, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116310196, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01164383315, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all ranks\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.3730205164, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3736040172, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2801981448, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1634618916, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0876039362, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0452949408, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0230505558, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.011672916, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116843677, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116755228, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local subvector\n\tdouble localMinDist = 
std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (int i = 0; i < points.size(); ++i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tlocalMinDist = std::min(localMinDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.3730874538, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3733890238, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.279994623, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1633914952, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0875169328, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0453565026, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0230333554, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116417358, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116310295, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01168214285, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t} else {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the closest pair on this rank\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN);\n\n\t// Return the global minimum distance\n\treturn 
global_min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma opm parallel for reduction(min:min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i+1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\treturn min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.373239682, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3735919046, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3734942014, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3737879906, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3740627184, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3735058692, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3735139662, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3735443814, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3736846999, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.373643705, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\telse {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, 
&minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t} else {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocalMinDist = std::min(localMinDist, dist);\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.3730814062, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3733531404, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2800114566, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.163486349, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0876147646, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0452082686, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0230420078, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116768834, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116465645, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116739759, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.3729656728, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3735673036, + 
"num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2803040582, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1636547216, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0877833916, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.045281142, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.023049264, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116778512, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0117018517, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01172964755, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min_dist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocal_min_dist = std::min(local_min_dist, dist);\n\t\t}\n\t}\n\n\t// Gather all local minima on rank 0\n\tstd::vector all_min_dists(size);\n\tMPI_Gather(&local_min_dist, 1, MPI_DOUBLE, all_min_dists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 finds the global minimum\n\tdouble global_min_dist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double dist : all_min_dists) {\n\t\t\tglobal_min_dist = std::min(global_min_dist, dist);\n\t\t}\n\t}\n\n\t// Return the global minimum on rank 0\n\treturn global_min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.3730386696, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3734235836, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2800173642, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1635116192, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0874939002, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0452704834, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.023162007, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116993118, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116233332, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.011662135, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble 
dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.372980773, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3737430448, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2803123298, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.163509551, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0876315362, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0454345156, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.023071772, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116813078, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116734842, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01169230445, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "(1.41421, 1.41421)", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tlocalMinDist = std::min(localMinDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.3720219334, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.373403554, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2801706038, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.163398705, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0875446794, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0453688404, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0230580384, + "num_procs": 1, + "num_threads": 32 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.0117287504, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0130571192, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116515605, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.3730506822, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3736703244, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2802158044, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.163437476, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0876258716, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0453300676, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0230973398, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116797454, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116765771, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0117283994, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t}\n\treturn -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.373076847, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3735565748, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2801126052, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.163545374, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0876045336, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.04542749, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0231740478, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.011672862, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116781317, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116877968, + "num_procs": 4, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(Kokkos::View &points, Kokkos::View &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Sort points by x-coordinate\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return points_sorted(i)->x < points_sorted(j)->x;\n });\n\n // Find the upper and lower hulls\n auto upper_hull = hull.slice(0, hull.extent(0) / 2);\n auto lower_hull = hull.slice(hull.extent(0) / 2, hull.extent(0));\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n while (upper_hull_size >= 2 &&\n (upper_hull(upper_hull_size - 1)->y - upper_hull(upper_hull_size - 2)->y) *\n (points_sorted(i)->x - upper_hull(upper_hull_size - 1)->x) <\n (upper_hull(upper_hull_size - 1)->x - upper_hull(upper_hull_size - 2)->x) *\n (points_sorted(i)->y - upper_hull(upper_hull_size - 1)->y)) {\n upper_hull_size--;\n }\n upper_hull(upper_hull_size++) = points_sorted(i);\n }\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = points_sorted.extent(0) - 1; i >= 0; i--) {\n while (lower_hull_size >= 2 &&\n (lower_hull(lower_hull_size - 1)->y - lower_hull(lower_hull_size - 2)->y) *\n (points_sorted(i)->x - lower_hull(lower_hull_size - 1)->x) <\n (lower_hull(lower_hull_size - 1)->x - lower_hull(lower_hull_size - 2)->x) *\n (points_sorted(i)->y - lower_hull(lower_hull_size - 1)->y)) {\n lower_hull_size--;\n }\n lower_hull(lower_hull_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_hull_size + lower_hull_size - 1;\n for (int i = 0; i < lower_hull_size - 1; i++) {\n hull(hull_size--) = lower_hull(i);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int &i) {\n points_sorted(i) = points(i);\n });\n\n // Find the upper and lower hulls\n std::vector upper_hull;\n std::vector lower_hull;\n\n for (int i = 0; i < points_sorted.extent(0); i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upper_hull.size() >= 2 &&\n (upper_hull[upper_hull.size() - 2]->y - upper_hull[upper_hull.size() - 1]->y) *\n (points_sorted(i)->x - upper_hull[upper_hull.size() - 1]->x) <=\n (upper_hull[upper_hull.size() - 1]->y - 
points_sorted(i)->y) *\n (upper_hull[upper_hull.size() - 2]->x - upper_hull[upper_hull.size() - 1]->x)) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lower_hull.size() >= 2 &&\n (lower_hull[lower_hull.size() - 2]->y - lower_hull[lower_hull.size() - 1]->y) *\n (points_sorted(i)->x - lower_hull[lower_hull.size() - 1]->x) >=\n (lower_hull[lower_hull.size() - 1]->y - points_sorted(i)->y) *\n (lower_hull[lower_hull.size() - 2]->x - lower_hull[lower_hull.size() - 1]->x)) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 2);\n int j = 0;\n for (int i = 0; i < upper_hull.size() - 1; i++) {\n hull(j++) = upper_hull[i];\n }\n for (int i = lower_hull.size() - 2; i >= 0; i--) {\n hull(j++) = lower_hull[i];\n }\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_dimensions = points.extent(1);\n\n // Find the point with the minimum y-coordinate.\n int min_y_index = 0;\n for (int i = 1; i < num_points; ++i) {\n if (points(i, 1) < points(min_y_index, 1)) {\n min_y_index = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points(0, 0);\n points(0, 0) = points(min_y_index, 0);\n points(min_y_index, 0) = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double angle_a = atan2(a.y - points(0, 1), a.x - points(0, 0));\n double angle_b = atan2(b.y - points(0, 1), b.x - points(0, 0));\n return angle_a < angle_b;\n };\n parallel_sort(Kokkos::RangePolicy(1, num_points), points, compare_polar_angle);\n\n // Find the convex hull using Graham's scan.\n int hull_size = 0;\n for (int i = 0; i < num_points; ++i) {\n while (hull_size >= 2 && (points(hull_size - 1, 0) - points(hull_size - 2, 0)) * (points(i, 1) - points(hull_size - 2, 1)) - (points(hull_size - 1, 1) - points(hull_size - 2, 1)) * (points(i, 0) - points(hull_size - 2, 0)) < 0) {\n hull_size--;\n }\n hull(hull_size, 0) = points(i, 0);\n hull_size++;\n }\n\n // Resize the hull to its final size.\n hull.resize(hull_size, num_dimensions);\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the upper and lower hulls\n Kokkos::View upperHull(\"Upper Hull\", points.extent(0));\n Kokkos::View lowerHull(\"Lower Hull\", points.extent(0));\n int upperHullSize = 0;\n int lowerHullSize = 0;\n for (int i = 0; i < points.extent(0); i++) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHullSize >= 2 &&\n (upperHull(upperHullSize - 1)->y - upperHull(upperHullSize - 2)->y) *\n (points(i)->x - upperHull(upperHullSize - 2)->x) <\n (upperHull(upperHullSize - 1)->x - upperHull(upperHullSize - 2)->x) *\n (points(i)->y - upperHull(upperHullSize - 2)->y)) {\n upperHullSize--;\n }\n upperHull(upperHullSize++) = points(i);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while (lowerHullSize >= 2 &&\n (lowerHull(lowerHullSize - 1)->y - lowerHull(lowerHullSize - 2)->y) *\n (points(i)->x - lowerHull(lowerHullSize - 2)->x) >\n (lowerHull(lowerHullSize - 1)->x - lowerHull(lowerHullSize - 2)->x) *\n 
(points(i)->y - lowerHull(lowerHullSize - 2)->y)) {\n lowerHullSize--;\n }\n lowerHull(lowerHullSize++) = points(i);\n }\n\n // Merge the upper and lower hulls into the convex hull\n int hullSize = upperHullSize + lowerHullSize - 2;\n for (int i = 0; i < upperHullSize; i++) {\n hull(i) = upperHull(i);\n }\n for (int i = upperHullSize; i < hullSize; i++) {\n hull(i) = lowerHull(i - upperHullSize + 1);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the upper and lower hulls\n Kokkos::View upperHull(\"upperHull\", points.extent(0));\n Kokkos::View lowerHull(\"lowerHull\", points.extent(0));\n\n int upperHullSize = 0;\n int lowerHullSize = 0;\n\n for (int i = 0; i < points.extent(0); i++) {\n // Add the point to the upper hull if it is to the right of the current rightmost point\n while (upperHullSize >= 2 &&\n (upperHull(upperHullSize - 1)->y - upperHull(upperHullSize - 2)->y) *\n (points(i)->x - upperHull(upperHullSize - 2)->x) <=\n (upperHull(upperHullSize - 1)->x - upperHull(upperHullSize - 2)->x) *\n (points(i)->y - upperHull(upperHullSize - 2)->y)) {\n upperHullSize--;\n }\n upperHull(upperHullSize++) = points(i);\n\n // Add the point to the lower hull if it is to the left of the current leftmost point\n while (lowerHullSize >= 2 &&\n (lowerHull(lowerHullSize - 1)->y - lowerHull(lowerHullSize - 2)->y) *\n (points(i)->x - lowerHull(lowerHullSize - 2)->x) >=\n (lowerHull(lowerHullSize - 1)->x - lowerHull(lowerHullSize - 2)->x) *\n (points(i)->y - lowerHull(lowerHullSize - 2)->y)) {\n lowerHullSize--;\n }\n lowerHull(lowerHullSize++) = points(i);\n }\n\n // Merge the upper and lower hulls\n int hullSize = upperHullSize + lowerHullSize - 2;\n for (int i = 0; i < upperHullSize; i++) {\n hull(i) = upperHull(i);\n }\n for (int i = upperHullSize; i < hullSize; i++) {\n hull(i) = lowerHull(i - upperHullSize + 1);\n }\n\n // Return the hull\n hull.resize(hullSize);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points_sorted.extent(0)), points_sorted, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n while (upper_hull_size >= 2 && cross(upper_hull[upper_hull_size - 2], upper_hull[upper_hull_size - 1], points_sorted[i]) <= 0) {\n upper_hull_size--;\n }\n upper_hull[upper_hull_size++] = points_sorted[i];\n }\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = points_sorted.extent(0) - 1; i >= 0; i--) {\n while (lower_hull_size >= 2 && cross(lower_hull[lower_hull_size - 2], lower_hull[lower_hull_size - 1], points_sorted[i]) <= 0) {\n lower_hull_size--;\n }\n lower_hull[lower_hull_size++] = points_sorted[i];\n }\n\n // Merge the upper and lower hulls\n int hull_size = 0;\n for (int i = 0; i < upper_hull_size; i++) {\n hull[hull_size++] = upper_hull[i];\n }\n for (int i = lower_hull_size - 2; i >= 0; i--) {\n hull[hull_size++] = lower_hull[i];\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_size);\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_hull_points = 4;\n\n // Find the leftmost point\n int leftmost_index = 0;\n for (int i = 
1; i < num_points; ++i) {\n if (points(i).x < points(leftmost_index).x) {\n leftmost_index = i;\n }\n }\n\n // Sort the points by their polar angle with respect to the leftmost point\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double dx1 = a.x - points(leftmost_index).x;\n double dy1 = a.y - points(leftmost_index).y;\n double dx2 = b.x - points(leftmost_index).x;\n double dy2 = b.y - points(leftmost_index).y;\n return (atan2(dy1, dx1) < atan2(dy2, dx2));\n };\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, num_points), points_sorted, compare_polar_angle);\n\n // Find the convex hull using Graham's scan\n int hull_index = 0;\n for (int i = 0; i < num_points; ++i) {\n while (hull_index >= 2 && (points_sorted(i).x - hull(hull_index - 1).x) * (hull(hull_index - 2).y - hull(hull_index - 1).y) - (points_sorted(i).y - hull(hull_index - 1).y) * (hull(hull_index - 2).x - hull(hull_index - 1).x) < 0) {\n hull_index--;\n }\n hull(hull_index++) = points_sorted(i);\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_index);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, points_sorted.extent(0)),\n KOKKOS_LAMBDA(const int &i, const auto &upper_hull) {\n auto &hull = upper_hull.access();\n if (i == 0) {\n hull[0] = points_sorted(i);\n return hull;\n }\n while (hull.extent(0) >= 2 &&\n (hull(hull.extent(0) - 2).y - hull(hull.extent(0) - 1).y) *\n (points_sorted(i).x - hull(hull.extent(0) - 1).x) <\n (hull(hull.extent(0) - 1).y - points_sorted(i).y) *\n (hull(hull.extent(0) - 2).x - hull(hull.extent(0) - 1).x)) {\n hull.resize(hull.extent(0) - 1);\n }\n hull.resize(hull.extent(0) + 1);\n hull(hull.extent(0) - 1) = points_sorted(i);\n return hull;\n },\n Kokkos::View(nullptr, 0));\n\n auto lower_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1),\n KOKKOS_LAMBDA(const int &i, const auto &lower_hull) {\n auto &hull = lower_hull.access();\n if (i == points_sorted.extent(0) - 1) {\n hull[0] = points_sorted(i);\n return hull;\n }\n while (hull.extent(0) >= 2 &&\n (hull(hull.extent(0) - 2).y - hull(hull.extent(0) - 1).y) *\n (points_sorted(i).x - hull(hull.extent(0) - 1).x) <\n (hull(hull.extent(0) - 1).y - points_sorted(i).y) *\n (hull(hull.extent(0) - 2).x - hull(hull.extent(0) - 1).x)) {\n hull.resize(hull.extent(0) - 1);\n }\n hull.resize(hull.extent(0) + 1);\n hull(hull.extent(0) - 1) = points_sorted(i);\n return hull;\n },\n Kokkos::View(nullptr, 0));\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.extent(0) + lower_hull.extent(0) - 2);\n Kokkos::parallel_for(\n Kokkos::RangePolicy(0, upper_hull.extent(0)),\n KOKKOS_LAMBDA(const int &i) { hull(i) = upper_hull(i); });\n Kokkos::parallel_for(\n Kokkos::RangePolicy(0, lower_hull.extent(0) - 1),\n KOKKOS_LAMBDA(const int &i) { hull(upper_hull.extent(0) + i) = lower_hull(i); });\n}\n", + "\n\n using Kok\u30b3\u30af::parallel_for;\n using Kok\u30b3\u30af::sort;\n\n // Sort the points by their x-coordinates.\n sort(Kok\u30b3\u30af::begin(points), Kok\u30b3\u30af::end(points), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper hull.\n int upperHullSize = 0;\n for (int i = 0; i < points.size(); ++i) {\n while (upperHullSize >= 2 && Kok\u30b3\u30af::cross(hull[upperHullSize - 
2], hull[upperHullSize - 1], points[i]) <= 0) {\n --upperHullSize;\n }\n hull[upperHullSize++] = points[i];\n }\n\n // Find the lower hull.\n int lowerHullSize = 0;\n for (int i = points.size() - 1; i >= 0; --i) {\n while (lowerHullSize >= 2 && Kok\u30b3\u30af::cross(hull[lowerHullSize - 2], hull[lowerHullSize - 1], points[i]) <= 0) {\n --lowerHullSize;\n }\n hull[lowerHullSize++] = points[i];\n }\n\n // Remove the duplicate points from the lower hull.\n while (lowerHullSize > 0 && hull[lowerHullSize - 1] == hull[lowerHullSize - 2]) {\n --lowerHullSize;\n }\n\n // Combine the upper and lower hulls.\n hull.resize(upperHullSize + lowerHullSize - 1);\n std::copy(hull.begin(), hull.begin() + upperHullSize, hull.begin());\n std::copy(hull.begin() + upperHullSize, hull.begin() + upperHullSize + lowerHullSize - 1, hull.begin() + upperHullSize);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points_sorted.extent(0)), KOKKOS_LAMBDA(const int &i, auto &upper_hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n upper_hull.push_back(points_sorted(i));\n } else {\n while (upper_hull.size() >= 2 && cross(upper_hull[upper_hull.size() - 2], upper_hull[upper_hull.size() - 1], points_sorted(i)) <= 0) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n }\n });\n\n auto lower_hull = Kokkos::parallel_reduce(Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1), KOKKOS_LAMBDA(const int &i, auto &lower_hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n lower_hull.push_back(points_sorted(i));\n } else {\n while (lower_hull.size() >= 2 && cross(lower_hull[lower_hull.size() - 2], lower_hull[lower_hull.size() - 1], points_sorted(i)) <= 0) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n });\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 2);\n std::copy(upper_hull.begin(), upper_hull.end() - 1, hull.begin());\n std::copy(lower_hull.begin() + 1, lower_hull.end(), hull.begin() + upper_hull.size() - 1);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points_sorted(i)->x < points_sorted(j)->x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n int upper_hull_size = 0;\n int lower_hull_size = 0;\n\n // Add the first point to the upper hull\n upper_hull(upper_hull_size++) = points_sorted(0);\n\n // Add the last point to the lower hull\n lower_hull(lower_hull_size++) = points_sorted(points.extent(0) - 1);\n\n // Iterate over the remaining points\n for (int i = 1; i < points.extent(0) - 1; i++) {\n // Add the point to the upper hull\n while (upper_hull_size >= 2 &&\n (points_sorted(i)->y - upper_hull(upper_hull_size - 2)->y) *\n (upper_hull(upper_hull_size - 1)->x - upper_hull(upper_hull_size - 2)->x) <=\n (upper_hull(upper_hull_size - 1)->y - upper_hull(upper_hull_size - 2)->y) *\n (points_sorted(i)->x - upper_hull(upper_hull_size - 2)->x)) {\n upper_hull_size--;\n }\n upper_hull(upper_hull_size++) = points_sorted(i);\n\n // Add the point to the lower hull\n while (lower_hull_size >= 2 &&\n (points_sorted(i)->y - lower_hull(lower_hull_size - 
2)->y) *\n (lower_hull(lower_hull_size - 1)->x - lower_hull(lower_hull_size - 2)->x) >=\n (lower_hull(lower_hull_size - 1)->y - lower_hull(lower_hull_size - 2)->y) *\n (points_sorted(i)->x - lower_hull(lower_hull_size - 2)->x)) {\n lower_hull_size--;\n }\n lower_hull(lower_hull_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_hull_size + lower_hull_size - 2;\n for (int i = 0; i < upper_hull_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_hull_size; i < hull_size; i++) {\n hull(i) = lower_hull(i - upper_hull_size + 1);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinate\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int &i) {\n points_sorted(i) = points(i);\n });\n\n // Find the upper and lower hulls\n std::vector upper_hull, lower_hull;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upper_hull.size() >= 2 && cross(upper_hull[upper_hull.size() - 2], upper_hull[upper_hull.size() - 1], points_sorted(i)) <= 0) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lower_hull.size() >= 2 && cross(lower_hull[lower_hull.size() - 2], lower_hull[lower_hull.size() - 1], points_sorted(i)) >= 0) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n\n // Merge the upper and lower hulls\n for (int i = lower_hull.size() - 2; i >= 0; i--) {\n upper_hull.push_back(lower_hull[i]);\n }\n\n // Copy the convex hull to the output view\n Kokkos::parallel_for(Kokkos::RangePolicy(0, upper_hull.size()), KOKKOS_LAMBDA(const int &i) {\n hull(i) = upper_hull[i];\n });\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, points_sorted.extent(0)),\n KOKKOS_LAMBDA(const int &i, auto &hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n hull.push_back(points_sorted(i));\n } else {\n while (hull.size() >= 2 &&\n (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points_sorted(i).x - hull[hull.size() - 1].x) <\n (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points_sorted(i).y - hull[hull.size() - 1].y)) {\n hull.pop_back();\n }\n hull.push_back(points_sorted(i));\n }\n },\n hull);\n\n auto lower_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1),\n KOKKOS_LAMBDA(const int &i, auto &hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n hull.push_back(points_sorted(i));\n } else {\n while (hull.size() >= 2 &&\n (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points_sorted(i).x - hull[hull.size() - 1].x) <\n (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points_sorted(i).y - hull[hull.size() - 1].y)) {\n hull.pop_back();\n }\n hull.push_back(points_sorted(i));\n }\n },\n hull);\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 1);\n std::copy(upper_hull.begin(), upper_hull.end(), hull.begin());\n std::copy(lower_hull.begin() + 1, lower_hull.end(), hull.begin() + upper_hull.size());\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int 
num_dimensions = points.extent(1);\n const int num_hull_points = 0;\n\n // Find the point with the minimum y-coordinate.\n int min_y_index = 0;\n for (int i = 1; i < num_points; i++) {\n if (points(i, 1) < points(min_y_index, 1)) {\n min_y_index = i;\n }\n }\n\n // Swap the point with the minimum y-coordinate to the first position.\n Point temp = points(0, 0);\n points(0, 0) = points(min_y_index, 0);\n points(min_y_index, 0) = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double angle_a = atan2(a.y - points(0, 1), a.x - points(0, 0));\n double angle_b = atan2(b.y - points(0, 1), b.x - points(0, 0));\n return angle_a < angle_b;\n };\n Kokkos::sort(Kokkos::RangePolicy(1, num_points), points, compare_polar_angle);\n\n // Find the convex hull using Graham's scan.\n int hull_size = 0;\n for (int i = 0; i < num_points; i++) {\n while (hull_size >= 2 && (points(hull[hull_size - 2], 0) - points(hull[hull_size - 1], 0)) * (points(i, 1) - points(hull[hull_size - 1], 1)) - (points(i, 0) - points(hull[hull_size - 1], 0)) * (points(hull[hull_size - 2], 1) - points(hull[hull_size - 1], 1)) < 0) {\n hull_size--;\n }\n hull[hull_size++] = i;\n }\n\n // Resize the hull to the correct size.\n hull = Kokkos::View(hull, hull_size);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)), points_sorted, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n int upper_size = 0;\n int lower_size = 0;\n\n // Add the first point to the upper hull\n upper_hull(upper_size++) = points_sorted(0);\n\n // Add the last point to the lower hull\n lower_hull(lower_size++) = points_sorted(points_sorted.extent(0) - 1);\n\n // Iterate over the remaining points\n for (int i = 1; i < points_sorted.extent(0) - 1; i++) {\n // Add the point to the upper hull\n while (upper_size >= 2 && (upper_hull(upper_size - 2).y - upper_hull(upper_size - 1).y) * (points_sorted(i).x - upper_hull(upper_size - 1).x) < (upper_hull(upper_size - 1).y - points_sorted(i).y) * (upper_hull(upper_size - 2).x - upper_hull(upper_size - 1).x)) {\n upper_size--;\n }\n upper_hull(upper_size++) = points_sorted(i);\n\n // Add the point to the lower hull\n while (lower_size >= 2 && (lower_hull(lower_size - 2).y - lower_hull(lower_size - 1).y) * (points_sorted(i).x - lower_hull(lower_size - 1).x) < (lower_hull(lower_size - 1).y - points_sorted(i).y) * (lower_hull(lower_size - 2).x - lower_hull(lower_size - 1).x)) {\n lower_size--;\n }\n lower_hull(lower_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_size + lower_size - 2;\n for (int i = 0; i < upper_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_size; i < hull_size; i++) {\n hull(i) = lower_hull(lower_size - (i - upper_size + 1));\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_size);\n}\n", + "\n\n // Find the leftmost point\n auto leftmost = points.reduce(\n KOKKOS_LAMBDA(const Point &a, const Point &b) { return a.x < b.x ? 
a : b; },\n points.extent(0));\n\n // Sort the points by the angle they make with the leftmost point\n Kokkos::View sorted_points(\"sorted_points\", points.extent(0));\n Kokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i) {\n sorted_points[i] = &points[i];\n });\n\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n sorted_points,\n KOKKOS_LAMBDA(const Point *&a, const Point *&b) {\n return std::atan2(a->y - leftmost->y, a->x - leftmost->x) <\n std::atan2(b->y - leftmost->y, b->x - leftmost->x);\n });\n\n // Find the convex hull using Graham scan\n Kokkos::View stack(\"stack\", points.extent(0));\n int top = 0;\n for (int i = 0; i < points.extent(0); i++) {\n Point *p = sorted_points[i];\n while (top >= 2 &&\n (stack[top - 1]->y - stack[top - 2]->y) * (p->x - stack[top - 2]->x) <\n (stack[top - 1]->x - stack[top - 2]->x) * (p->y - stack[top - 2]->y))\n top--;\n stack[top++] = p;\n }\n\n // Copy the convex hull to the output view\n Kokkos::parallel_for(Kokkos::RangePolicy(0, top),\n KOKKOS_LAMBDA(const int &i) { hull[i] = stack[i]; });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int N = points.extent(0);\n const int HULL_SIZE = 2 * N;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n parallel_sort(Kokkos::Cuda(), points_sorted, [=](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the lower and upper hulls\n auto lower_hull = hull.slice(0, N);\n auto upper_hull = hull.slice(N, HULL_SIZE);\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = 0; i < N; i++) {\n while (lower_hull_size >= 2 && cross(lower_hull[lower_hull_size - 2], lower_hull[lower_hull_size - 1], points_sorted[i]) <= 0) {\n lower_hull_size--;\n }\n lower_hull[lower_hull_size++] = points_sorted[i];\n }\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = N - 1; i >= 0; i--) {\n while (upper_hull_size >= 2 && cross(upper_hull[upper_hull_size - 2], upper_hull[upper_hull_size - 1], points_sorted[i]) <= 0) {\n upper_hull_size--;\n }\n upper_hull[upper_hull_size++] = points_sorted[i];\n }\n\n // Merge the lower and upper hulls\n int hull_size = lower_hull_size + upper_hull_size - 1;\n for (int i = 0; i < hull_size; i++) {\n hull[i] = (i < lower_hull_size) ? 
lower_hull[i] : upper_hull[i - lower_hull_size];\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort points by x-coordinate\n auto points_view = Kokkos::subview(points, Kokkos::ALL(), Kokkos::ALL(), Kokkos::Rank<2>{0, 1});\n auto sorted_points = points_view;\n Kokkos::sort(Kokkos::RangePolicy(0, points_view.extent(0)),\n KOKKOS_LAMBDA(const int i, const int j) {\n return sorted_points(i, 0).x < sorted_points(j, 0).x;\n });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::View(Kokkos::ALL(), \"Upper Hull\");\n auto lower_hull = Kokkos::View(Kokkos::ALL(), \"Lower Hull\");\n int upper_size = 0, lower_size = 0;\n\n // Find the upper hull\n for (int i = 0; i < points_view.extent(0); i++) {\n while (upper_size >= 2 &&\n (upper_hull(upper_size - 1).y - upper_hull(upper_size - 2).y) *\n (sorted_points(i, 0).x - upper_hull(upper_size - 2).x) <\n (upper_hull(upper_size - 1).x - upper_hull(upper_size - 2).x) *\n (sorted_points(i, 0).y - upper_hull(upper_size - 2).y)) {\n upper_size--;\n }\n upper_hull(upper_size++) = sorted_points(i, 0);\n }\n\n // Find the lower hull\n for (int i = points_view.extent(0) - 1; i >= 0; i--) {\n while (lower_size >= 2 &&\n (lower_hull(lower_size - 1).y - lower_hull(lower_size - 2).y) *\n (sorted_points(i, 0).x - lower_hull(lower_size - 2).x) <\n (lower_hull(lower_size - 1).x - lower_hull(lower_size - 2).x) *\n (sorted_points(i, 0).y - lower_hull(lower_size - 2).y)) {\n lower_size--;\n }\n lower_hull(lower_size++) = sorted_points(i, 0);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_size + lower_size - 2;\n hull = Kokkos::View(hull_size, \"Convex Hull\");\n for (int i = 0; i < upper_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_size; i < hull_size; i++) {\n hull(i) = lower_hull(i - upper_size + lower_size - 1);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n const int N = points.extent(0);\n const int M = hull.extent(0);\n\n auto points_d = create_mirror_view(points);\n auto hull_d = create_mirror_view(hull);\n\n parallel_for(N, KOKKOS_LAMBDA(const int i) {\n points_d(i) = points(i);\n });\n\n parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &min_index) {\n if (points_d(i).y < points_d(min_index).y || (points_d(i).y == points_d(min_index).y && points_d(i).x < points_d(min_index).x)) {\n min_index = i;\n }\n }, 0);\n\n int p = 0;\n hull_d(p) = points_d(min_index);\n\n int q;\n do {\n q = (p + 1) % N;\n for (int i = 0; i < N; i++) {\n if (orientation(points_d(p), points_d(i), points_d(q)) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != min_index);\n\n int count = 0;\n do {\n hull_d(count) = points_d(p);\n count++;\n q = (p + 1) % N;\n for (int i = 0; i < N; i++) {\n if (orientation(points_d(p), points_d(i), points_d(q)) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != min_index);\n\n parallel_for(count, KOKKOS_LAMBDA(const int i) {\n hull(i) = hull_d(i);\n });\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the leftmost and rightmost points\n const Point *leftmost = points(0);\n const Point *rightmost = points(points.extent(0) - 1);\n\n // Create a stack to store the points on the hull\n std::stack stack;\n\n // Add the leftmost and rightmost points to the stack\n stack.push(leftmost);\n stack.push(rightmost);\n\n // Iterate over the remaining points\n for (int i = 1; i < points.extent(0) - 1; i++) {\n const Point *p = points(i);\n\n // 
Check if the point is to the left of the line defined by the last two points on the stack\n if (orientation(stack.top(), stack.top()->next, p) == LEFT) {\n // If it is, pop the last point from the stack\n stack.pop();\n // And add the new point to the stack\n stack.push(p);\n }\n }\n\n // Copy the points from the stack to the hull\n int hull_size = stack.size();\n for (int i = 0; i < hull_size; i++) {\n hull(i) = stack.top();\n stack.pop();\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Find the point with the smallest y-coordinate.\n int min_index = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].y < points[min_index].y) {\n min_index = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n std::swap(points[0], points[min_index]);\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points.begin() + 1, points.end(), [&](const Point& a, const Point& b) {\n return (atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x));\n });\n\n // Initialize the convex hull with the first three points.\n hull.push_back(points[0]);\n hull.push_back(points[1]);\n hull.push_back(points[2]);\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < points.size(); i++) {\n while (hull.size() >= 2 && (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y) - (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) >= 0) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Find the first point in the upper hull that is to the right of the current point\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Find the first point in the lower hull that is to the left of the current point\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), 
upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n Point leftmost = points[0];\n for (const Point& p : points) {\n if (p.x < leftmost.x) {\n leftmost = p;\n }\n }\n\n // Sort the points by their angle with the leftmost point\n std::sort(points.begin(), points.end(), [&leftmost](const Point& a, const Point& b) {\n return std::atan2(a.y - leftmost.y, a.x - leftmost.x) < std::atan2(b.y - leftmost.y, b.x - leftmost.x);\n });\n\n // Find the convex hull using Graham's scan\n std::stack convex_hull;\n convex_hull.push(leftmost);\n convex_hull.push(points[1]);\n for (size_t i = 2; i < points.size(); i++) {\n Point top = convex_hull.top();\n while (convex_hull.size() >= 2 && (top.x - convex_hull.at(convex_hull.size() - 2).x) * (points[i].y - top.y) - (top.y - convex_hull.at(convex_hull.size() - 2).y) * (points[i].x - top.x) < 0) {\n convex_hull.pop();\n top = convex_hull.top();\n }\n convex_hull.push(points[i]);\n }\n\n // Copy the convex hull to the output vector\n while (!convex_hull.empty()) {\n hull.push_back(convex_hull.top());\n convex_hull.pop();\n }\n}\n", + ".", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the leftmost and rightmost points\n Point leftmost = points[0];\n Point rightmost = points[points.size() - 1];\n\n // Find the points that are on the upper and lower hulls\n std::vector upperHull;\n std::vector lowerHull;\n\n #pragma omp parallel sections\n {\n #pragma omp section\n {\n upperHull.push_back(leftmost);\n for (int i = 1; i < points.size(); i++) {\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n }\n }\n\n #pragma omp section\n {\n lowerHull.push_back(rightmost);\n for (int i = points.size() - 2; i >= 0; i--) {\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) <= 0) {\n 
lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n }\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size());\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.begin(), lowerHull.end());\n\n // Remove the duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the lower hull\n for (const Point& point : points) {\n while (hull.size() >= 2 && (hull.back() - hull[hull.size() - 2]).cross(point - hull.back()) <= 0) {\n hull.pop_back();\n }\n hull.push_back(point);\n }\n\n // Find the upper hull\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() >= 2 && (hull.back() - hull[hull.size() - 2]).cross(points[i] - hull.back()) <= 0) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n int n = points.size();\n if (n < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n int l = 0, r = 0;\n hull.push_back(points[l]);\n for (int i = 1; i < n; i++) {\n while (l >= 1 && (points[i].y - hull[l - 1].y) * (hull[l].x - hull[l - 1].x) <= (points[i].x - hull[l - 1].x) * (hull[l].y - hull[l - 1].y))\n l--;\n while (r >= 1 && (points[i].y - hull[r - 1].y) * (hull[r].x - hull[r - 1].x) <= (points[i].x - hull[r - 1].x) * (hull[r].y - hull[r - 1].y))\n r--;\n hull.push_back(points[i]);\n l++;\n r++;\n }\n\n hull.push_back(points[n - 1]);\n for (int i = n - 2; i >= 0; i--) {\n while (l >= 1 && (points[i].y - hull[l - 1].y) * (hull[l].x - hull[l - 1].x) <= (points[i].x - hull[l - 1].x) * (hull[l].y - hull[l - 1].y))\n l--;\n while (r >= 1 && (points[i].y - hull[r - 1].y) * (hull[r].x - hull[r - 1].x) <= (points[i].x - hull[r - 1].x) * (hull[r].y - hull[r - 1].y))\n r--;\n hull.push_back(points[i]);\n l++;\n r++;\n }\n\n hull.pop_back();\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n hull.pop_back();\n}\n", + "\n\n // Find the leftmost 
point\n int l = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[l].x) {\n l = i;\n }\n }\n\n int p = l, q;\n do {\n hull.push_back(points[p]);\n q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != l);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) *\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) *\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Combine the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * 
(point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n int l = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[l].x) {\n l = i;\n }\n }\n\n int p = l;\n int q;\n do {\n hull.push_back(points[p]);\n q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == -1) {\n q = i;\n }\n }\n p = q;\n } while (p != l);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n int leftMost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftMost].x) {\n leftMost = i;\n }\n }\n\n // Start from leftmost point, keep moving counterclockwise until we reach the start point again\n int p = leftMost;\n do {\n // Add current point to result\n hull.push_back(points[p]);\n\n // Search for a point 'q' such that the vector 'p-q' has the smallest positive orientation.\n int q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == 2) {\n q = i;\n }\n }\n\n // If 'q' is not 'p', then update 'p' and continue\n p = q;\n\n } while (p != leftMost);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (point.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - point.x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n 
while (lowerHull.size() >= 2 &&\n (point.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - point.x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n Point p = points[leftmost];\n hull.push_back(p);\n points.erase(points.begin() + leftmost);\n\n while (points.size() > 0) {\n int next = -1;\n for (int i = 0; i < points.size(); i++) {\n if (next == -1 || (orientation(p, points[i], points[next]) == 2)) {\n next = i;\n }\n }\n p = points[next];\n hull.push_back(p);\n points.erase(points.begin() + next);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) *\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) *\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower 
hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "serial", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Find the leftmost point\n int leftMost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftMost].x) {\n leftMost = i;\n }\n }\n\n // Start from the leftmost point and keep moving counter-clockwise until we reach the leftmost point again\n Point p = points[leftMost];\n do {\n hull.push_back(p);\n int next = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i] != p && orientation(p, points[i], points[next]) == 2) {\n next = i;\n }\n }\n p = points[next];\n } while (p != points[leftMost]);\n}\n", + "\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n int n = points.size();\n std::vector up(n), down(n);\n int m = 0, k = 0;\n\n for (int i = 0; i < n; i++) {\n if ((i == 0) || ((i == 1) && (points[i].x == points[i - 1].x) && (points[i].y == points[i - 1].y)))\n continue;\n down[m++] = points[i];\n }\n\n for (int i = n - 1; i >= 0; i--) {\n if ((i == n - 1) || ((i == n - 2) && (points[i].x == points[i + 1].x) && (points[i].y == points[i + 1].y)))\n continue;\n up[k++] = points[i];\n }\n\n hull.clear();\n int t[m + k + 1];\n int q = 0;\n for (int i = 0; i < m; i++)\n t[q++] = i;\n for (int i = k - 1; i >= 0; i--)\n t[q++] = i + m;\n t[q] = t[0];\n t[q + 1] = t[1];\n\n for (int i = 0; i <= q; i++) {\n int a = t[i], b = t[i + 1];\n if (a == b)\n continue;\n int c = t[i + 2];\n if (a != c) {\n hull.push_back(down[a]);\n continue;\n }\n int x1 = down[a].x, y1 = down[a].y;\n int x2 = down[b].x, y2 = down[b].y;\n int x3 = down[c].x, y3 = down[c].y;\n int dx1 = x2 - x1, dy1 = y2 - y1;\n int dx2 = x3 - x2, dy2 = y3 - y2;\n int cross = dx1 * dy2 - dx2 * dy1;\n if (cross < 0)\n hull.push_back(down[a]);\n else if (cross > 0)\n hull.push_back(down[b]);\n else {\n hull.push_back(down[a]);\n hull.push_back(down[b]);\n }\n }\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n int n = points.size();\n std::vector upper(n);\n std::vector lower(n);\n\n int m = 0;\n for (int i = 0; i < n; i++) {\n while (m >= 2 && (upper[m - 1].y - upper[m - 2].y) * (points[i].x - upper[m - 2].x) <=\n (upper[m - 1].x - upper[m - 2].x) * (points[i].y - upper[m - 2].y)) {\n m--;\n }\n upper[m++] = points[i];\n }\n\n m = 0;\n for (int i = n - 1; i >= 0; i--) {\n while (m >= 2 && (lower[m - 1].y - lower[m - 2].y) * (points[i].x - lower[m - 2].x) <=\n (lower[m - 1].x - lower[m - 2].x) * (points[i].y - lower[m - 2].y)) {\n m--;\n }\n lower[m++] = points[i];\n }\n\n for (int i = 0; i < m; i++) {\n hull.push_back(upper[i]);\n }\n for (int i = m 
- 2; i >= 0; i--) {\n hull.push_back(lower[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x;\n });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Find the first point in the upper hull that is to the right of the current point\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x) *\n (point.y - upperHull[upperHull.size() - 1].y) -\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Find the first point in the lower hull that is to the left of the current point\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x) *\n (point.y - lowerHull[lowerHull.size() - 1].y) -\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n hull.pop_back();\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n hull.push_back(points[0]);\n hull.push_back(points[1]);\n\n for (size_t i = 2; i < points.size(); i++) {\n while (hull.size() >= 2 &&\n (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <\n (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n int t = hull.size();\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() > t &&\n (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <\n (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n hull.pop_back();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n 
// Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n std::vector upper;\n for (auto const& point : points) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper.back(), point) <= 0) {\n upper.pop_back();\n }\n upper.push_back(point);\n }\n upper.pop_back();\n\n std::vector lower;\n for (auto const& point : points) {\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower.back(), point) >= 0) {\n lower.pop_back();\n }\n lower.push_back(point);\n }\n lower.pop_back();\n\n hull.insert(hull.end(), upper.begin(), upper.end());\n hull.insert(hull.end(), lower.rbegin(), lower.rend());\n}\n", + "\n\n if (points.size() <= 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n std::vector upper;\n for (const Point& p : points) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], p) <= 0)\n upper.pop_back();\n upper.push_back(p);\n }\n upper.pop_back();\n\n std::vector lower;\n for (int i = points.size() - 1; i >= 0; i--) {\n const Point& p = points[i];\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], p) <= 0)\n lower.pop_back();\n lower.push_back(p);\n }\n lower.pop_back();\n\n hull.insert(hull.end(), upper.begin(), upper.end());\n hull.insert(hull.end(), lower.rbegin(), lower.rend());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) == -1) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) == 1) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) 
{ return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Combine the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n int n = points.size();\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n std::vector upper;\n for (int i = 0; i < n; i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], points[i]) <= 0)\n upper.pop_back();\n upper.push_back(points[i]);\n }\n\n std::vector lower;\n for (int i = n - 1; i >= 0; i--) {\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], points[i]) <= 0)\n lower.pop_back();\n lower.push_back(points[i]);\n }\n\n for (int i = 1; i < upper.size() - 1; i++)\n hull.push_back(upper[i]);\n for (int i = lower.size() - 2; i >= 1; i--)\n hull.push_back(lower[i]);\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the 
points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull.back(), point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull.back(), point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper hull\n for (const Point& point : points) {\n while (hull.size() >= 2 && (hull.back().y - hull[hull.size() - 2].y) * (point.x - hull[hull.size() - 2].x) <=\n (hull.back().x - hull[hull.size() - 2].x) * (point.y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(point);\n }\n\n // Find the lower hull\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() >= 2 && (hull.back().y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) <=\n (hull.back().x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinate\n std::sort(points.begin(), points.end(), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper convex hull\n for (const Point &p : points) {\n while (hull.size() >= 2 && cross(hull[hull.size() - 2], hull[hull.size() - 1], p) <= 0) {\n hull.pop_back();\n }\n hull.push_back(p);\n }\n\n // Find the lower convex hull\n std::reverse(points.begin(), points.end());\n for (const Point &p : points) {\n while (hull.size() >= 2 && cross(hull[hull.size() - 2], hull[hull.size() - 1], p) <= 0) {\n hull.pop_back();\n }\n hull.push_back(p);\n }\n\n // Remove the last point from the lower convex hull\n 
hull.pop_back();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x;\n });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point a, Point b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point p : points) {\n // Build the lower hull\n while (lowerHull.size() >= 2 &&\n orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], p) <= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(p);\n\n // Build the upper hull\n while (upperHull.size() >= 2 &&\n orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], p) >= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(p);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), lowerHull.begin(), lowerHull.end());\n for (int i = upperHull.size() - 2; i >= 0; i--) {\n hull.push_back(upperHull[i]);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "hip", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\n__global__ void convexHull(const Point *points, size_t numPoints, Point *hull, size_t hullSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int numThreads = hipGetNumThreads();\n int threadIdx = hipGetThreadIdx();\n int blockIdx = hipGetBlockIdx();\n int blockDim = hipGetBlockDim();\n int gridDim = hipGetGridDim();\n\n // Each thread computes the convex hull of a subset of points\n int start = threadIdx.x + blockIdx.x * blockDim.x;\n int end = fmin(start + numThreads, numPoints);\n\n // Compute the convex hull of the subset of points\n vector subset(end - start);\n for (int i = start; i < end; i++) {\n subset[i - start] = points[i];\n }\n vector hullSubset = convexHull(subset);\n\n // Merge the convex hull of the subset of points with the overall convex hull\n for (int i = 0; i < hullSubset.size(); i++) {\n hull[hullSize + i] = hullSubset[i];\n }\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize > 2 && crossProduct(hull[hullSize - 2], hull[hullSize - 1], points[i]) <= 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size() - 2;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[i] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[upperHull.size() + i - 1] = lowerHull[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by polar angle with respect to the 
leftmost point\n std::sort(points, points + numPoints, [leftmost](const Point &a, const Point &b) {\n return atan2(a.y - points[leftmost].y, a.x - points[leftmost].x) < atan2(b.y - points[leftmost].y, b.x - points[leftmost].x);\n });\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last two points in the hull, add it to the hull\n while (hullSize >= 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < numPoints) {\n hull[gid] = points[gid];\n }\n\n __syncthreads();\n\n if (tid == 0) {\n std::sort(hull, hull + numPoints, [](const Point &a, const Point &b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n\n hullSize = m;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < numPoints) {\n hull[gid] = points[gid];\n }\n __syncthreads();\n\n if (tid == 0) {\n sort(hull, hull + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n for (int i = numPoints - 2, t = m + 1; i >= 0; i--) {\n while (t >= m + 1 && (hull[t - 1].y - hull[t - 2].y) * (hull[i].x - hull[t - 1].x) <\n (hull[t - 1].x - hull[t - 2].x) * (hull[i].y - hull[t - 1].y)) {\n t--;\n }\n hull[t++] = hull[i];\n }\n hullSize = t - m;\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n thrust::convex_hull(thrust::device, points, points + numPoints,\n std::back_inserter(upperHull), std::back_inserter(lowerHull));\n\n // Merge the upper and lower hulls into the final hull\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n\n // Return the size of the hull\n *hullSize = upperHull.size() + lowerHull.size();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while 
(lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Combine the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size() - 2;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[i] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[i + upperHull.size() - 1] = lowerHull[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the minimum point with the first point.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the remaining points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [&](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize > 2 && crossProduct(hull[hullSize - 2], hull[hullSize - 1], points[i]) <= 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by their polar angle with respect to the leftmost point\n auto cmp = [&](const Point &a, const Point &b) {\n return (a.y - points[leftmost].y) * (b.x - points[leftmost].x) - (a.x - points[leftmost].x) * (b.y - points[leftmost].y);\n };\n std::sort(points + leftmost + 1, points + numPoints, cmp);\n\n // Build the convex hull using Graham's scan\n int hullIndex = 0;\n for (int i = leftmost + 1; i < numPoints; i++) {\n while (hullIndex >= 2 && cmp(hull[hullIndex - 1], hull[hullIndex - 2], points[i])) {\n hullIndex--;\n }\n hull[hullIndex++] = points[i];\n }\n\n // Copy the convex hull to the output buffer\n std::copy(hull, hull + hullIndex, hull);\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((hull[hullSize - 1].x - points[i].x) * (hull[hullSize - 1].y - points[leftmost].y) -\n (hull[hullSize - 1].y - points[i].y) * (hull[hullSize - 1].x - points[leftmost].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n\n // If the current point is to the right of the last point in the hull, remove the last point from the hull\n else if ((hull[hullSize - 1].x - points[i].x) * (hull[hullSize - 1].y - points[leftmost].y) -\n (hull[hullSize - 1].y - points[i].y) * (hull[hullSize - 1].x - points[leftmost].x) >\n 0) {\n hullSize--;\n }\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n 
}\n\n // Initialize the hull\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the point is to the left of the line defined by the last two points in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hull[hullSize++] = points[i];\n }\n }\n\n // If the last point is not the same as the first point, add it to the hull\n if (hull[hullSize - 1] != hull[0]) {\n hull[hullSize++] = hull[0];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n\n // Find the convex hull using Graham's scan\n int hullSize = 1;\n for (int i = 1; i < numPoints; i++) {\n // Find the next point that is not on the same line as the previous two points\n while (hullSize > 1 &&\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 2].x - hull[hullSize - 1].x) <=\n (points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 2].y - hull[hullSize - 1].y)) {\n hullSize--;\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n }\n\n // If the last point in the hull is not the leftmost point, add it to the hull\n if (hull[hullSize - 1] != points[leftmost]) {\n hull[hullSize] = points[leftmost];\n hullSize++;\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Find the convex hull using Graham's scan\n int p = leftmost;\n do {\n // Find the next point that is not on the same line as the previous two points\n int q = (p + 1) % numPoints;\n for (int i = 0; i < numPoints; i++) {\n if (i != p && i != q &&\n (points[i].y - points[p].y) * (points[q].x - points[p].x) >\n (points[i].x - points[p].x) * (points[q].y - points[p].y)) {\n q = i;\n }\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[q];\n p = q;\n\n } while (p != leftmost);\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the remaining points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [&](const Point &a, const Point &b) {\n double 
angleA = atan2(a.y - points[0].y, a.x - points[0].x);\n double angleB = atan2(b.y - points[0].y, b.x - points[0].x);\n return angleA < angleB;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // TODO: Implement the convex hull algorithm\n // The kernel is launched with at least as many threads as points.\n // Store the result in `hull`.\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hullSize = 0;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[hullSize++] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[hullSize++] = lowerHull[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the minimum point with the first point.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize > 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n 
points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of points. The final hull is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n 
convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Send the leftmost point to all other ranks\n MPI_Bcast(&leftmost, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the convex hull on each rank\n std::vector localHull;\n grahamScan(points, localHull, leftmost);\n\n // Gather all the local hulls on rank 0\n if (rank == 0) {\n hull = localHull;\n for (int i = 1; i < size; i++) {\n std::vector recvHull;\n MPI_Recv(recvHull.data(), recvHull.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n hull.insert(hull.end(), recvHull.begin(), recvHull.end());\n }\n } else {\n MPI_Send(localHull.data(), localHull.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n\n // Remove duplicate points from the hull\n std::sort(hull.begin(), hull.end());\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Find the leftmost point\n int 
leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull.push_back(points[leftmost]);\n\n // Find the convex hull using Graham's scan\n int p = leftmost;\n do {\n // Find the next point that is not on the same line as the previous two points\n int q = (p + 1) % points.size();\n while (orientation(points[p], points[q], points[(q + 1) % points.size()]) == 0) {\n q = (q + 1) % points.size();\n }\n\n // Add the next point to the hull\n hull.push_back(points[q]);\n\n // Update the current point\n p = q;\n } while (p != leftmost);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the points by x-coordinate\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n\n // Return the hull on rank 0\n if (rank == 0) {\n return hull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n if (rank == 0) {\n hull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, hull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(hull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Start from the leftmost point, keep moving counter-clockwise\n // until we reach the start point again\n int p = leftmost;\n do {\n hull.push_back(points[p]);\n int q = (p + 1) % points.size();\n\n // Find the most counter-clockwise point from p\n for (int i = 0; i < points.size(); i++) {\n // Skip if i is the same as p or q\n if (i == p || i == q) {\n continue;\n }\n\n // Calculate the cross product of (q - p) x (i - p)\n double cross = (points[q].x - points[p].x) * (points[i].y - points[p].y) -\n (points[q].y - points[p].y) * (points[i].x - points[p].x);\n\n // If the cross product is positive, then i is more counter-clockwise than q\n if (cross > 0) {\n q = i;\n }\n }\n\n // Update p to be q\n p = q;\n } while (p != leftmost);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n 
MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the leftmost point\n Point leftmost = points[0];\n for (size_t i = 1; i < points.size(); i++) {\n if (points[i].x < leftmost.x) {\n leftmost = points[i];\n }\n }\n\n // Broadcast the leftmost point to all ranks\n MPI_Bcast(&leftmost, sizeof(Point), MPI_BYTE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull on each rank\n std::vector localHull;\n convexHull(points, leftmost, localHull);\n\n // Gather the local convex hull from all ranks\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "cuda", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\n__global__ void convexHull(const Point *points, size_t numPoints, Point *hull, size_t hullSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n if (tid == 0) {\n qsort(hull, numPoints, sizeof(Point), [](const void *a, const void *b) {\n Point *pa = (Point *)a;\n Point *pb = (Point *)b;\n if (pa->x != pb->x) {\n return pa->x < pb->x ? -1 : 1;\n }\n return pa->y < pb->y ? 
-1 : 1;\n });\n int m = 0;\n for (int i = 0; i < numPoints; ++i) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m].y - hull[m - 1].y) * (hull[m - 1].x - hull[m - 2].x)) {\n --m;\n }\n hull[m++] = hull[i];\n }\n hullSize = m;\n }\n __syncthreads();\n if (tid < hullSize) {\n hull[tid] = hull[tid];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(b.y - points[0].y, b.x - points[0].x) < atan2(a.y - points[0].y, a.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHull(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.begin(), lowerHull.end(), hull + upperHull.size());\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the point with the 
minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n double angleA = atan2(a.y - points[0].y, a.x - points[0].x);\n double angleB = atan2(b.y - points[0].y, b.x - points[0].x);\n return angleA < angleB;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n\n if (numPoints <= 1) {\n return;\n }\n\n // Sort the points by their x-coordinates\n thrust::sort(hull, hull + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the leftmost and rightmost points\n Point leftmost = hull[0];\n Point rightmost = hull[numPoints - 1];\n\n // Find the convex hull of the points to the left of the leftmost point\n int leftHullSize = 0;\n convexHull(hull, numPoints - 1, hull, leftHullSize);\n\n // Find the convex hull of the points to the right of the rightmost point\n int rightHullSize = 0;\n convexHull(hull + 1, numPoints - 1, hull + leftHullSize, rightHullSize);\n\n // Merge the two convex hulls\n int hullIndex = 0;\n int leftIndex = 0;\n int rightIndex = 0;\n while (leftIndex < leftHullSize && rightIndex < rightHullSize) {\n if (hull[leftIndex].y < hull[rightIndex].y) {\n hull[hullIndex++] = hull[leftIndex++];\n } else {\n hull[hullIndex++] = hull[rightIndex++];\n }\n }\n while (leftIndex < leftHullSize) {\n hull[hullIndex++] = hull[leftIndex++];\n }\n while (rightIndex < rightHullSize) {\n hull[hullIndex++] = hull[rightIndex++];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device_ptr(points), thrust::device_ptr(points + numPoints),\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(thrust::device_ptr(points), numPoints, &upperHull, &lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), thrust::device_ptr(hull));\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), thrust::device_ptr(hull + upperHull.size()));\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first 
point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current convex hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n double angle1 = atan2(a.y - points[0].y, a.x - points[0].x);\n double angle2 = atan2(b.y - points[0].y, b.x - points[0].x);\n return angle1 < angle2;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Start from the leftmost point and keep moving counter-clockwise\n int hullSize = 0;\n int current = leftmost;\n do {\n // Add the current point to the hull\n hull[hullSize] = points[current];\n hullSize++;\n\n // Find the next point that is most counter-clockwise from the current point\n int next = (current + 1) % numPoints;\n for (int i = 0; i < numPoints; i++) {\n if (orientation(points[current], points[next], points[i]) == 2) {\n next = i;\n }\n }\n\n // Set the current point to the next point\n current = next;\n } while (current != leftmost);\n\n // Return the size of the hull\n return hullSize;\n}\n", + "\n\n // Sort the points by their x-coordinate\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, 
points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), hull);\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= numPoints) {\n return;\n }\n\n // Find the point with the smallest y-coordinate\n int minIdx = idx;\n for (int i = idx + 1; i < numPoints; i++) {\n if (points[i].y < points[minIdx].y) {\n minIdx = i;\n }\n }\n\n // Swap the point with the smallest y-coordinate to the front of the array\n Point temp = points[idx];\n points[idx] = points[minIdx];\n points[minIdx] = temp;\n\n // Sort the points by their angle with respect to the point with the smallest y-coordinate\n std::sort(points + idx + 1, points + numPoints, [points, idx](const Point &a, const Point &b) {\n return (a.x - points[idx].x) * (b.y - points[idx].y) > (b.x - points[idx].x) * (a.y - points[idx].y);\n });\n\n // Initialize the hull with the first three points\n hull[0] = points[idx];\n hull[1] = points[idx + 1];\n hull[2] = points[idx + 2];\n int hullSize = 3;\n\n // Iterate over the remaining points\n for (int i = idx + 3; i < numPoints; i++) {\n // Check if the current point is inside the hull\n while (hullSize > 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) <=\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x)) {\n hullSize--;\n }\n\n // Add the current point to the hull\n hull[hullSize] = points[i];\n hullSize++;\n }\n\n // Return the hull\n hullSize = hullSize;\n}\n\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device_ptr(points), thrust::device_ptr(points + numPoints),\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), thrust::device_ptr(hull));\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), thrust::device_ptr(hull + upperHull.size()));\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n\n // Find the convex hull using Graham's scan\n int hullSize = 1;\n for (int i = 1; i < numPoints; i++) {\n // Find the next point that is not on the same line as the previous two points\n while (hullSize >= 2 &&\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 2].x - hull[hullSize - 1].x) 
<=\n (points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 2].y - hull[hullSize - 1].y)) {\n hullSize--;\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates.\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the leftmost point.\n Point leftmostPoint = points[0];\n\n // Find the rightmost point.\n Point rightmostPoint = points[numPoints - 1];\n\n // Find the points that are above the line defined by the leftmost and rightmost points.\n thrust::device_vector pointsAboveLine;\n thrust::copy_if(thrust::device, points, points + numPoints,\n thrust::make_device_vector(pointsAboveLine),\n [leftmostPoint, rightmostPoint](const Point &point) {\n return point.y > leftmostPoint.y + (rightmostPoint.y - leftmostPoint.y) *\n (point.x - leftmostPoint.x) /\n (rightmostPoint.x - leftmostPoint.x);\n });\n\n // Find the points that are below the line defined by the leftmost and rightmost points.\n thrust::device_vector pointsBelowLine;\n thrust::copy_if(thrust::device, points, points + numPoints,\n thrust::make_device_vector(pointsBelowLine),\n [leftmostPoint, rightmostPoint](const Point &point) {\n return point.y < leftmostPoint.y + (rightmostPoint.y - leftmostPoint.y) *\n (point.x - leftmostPoint.x) /\n (rightmostPoint.x - leftmostPoint.x);\n });\n\n // Find the convex hull of the points above the line.\n thrust::device_vector hullAboveLine;\n convexHull(thrust::device_ptr(pointsAboveLine.data()), pointsAboveLine.size(),\n thrust::device_ptr(hullAboveLine.data()), hullAboveLine.size());\n\n // Find the convex hull of the points below the line.\n thrust::device_vector hullBelowLine;\n convexHull(thrust::device_ptr(pointsBelowLine.data()), pointsBelowLine.size(),\n thrust::device_ptr(hullBelowLine.data()), hullBelowLine.size());\n\n // Merge the two convex hulls.\n thrust::copy(thrust::device, hullAboveLine.begin(), hullAboveLine.end(), hull);\n thrust::copy(thrust::device, hullBelowLine.begin(), hullBelowLine.end(), hull + hullAboveLine.size());\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(hull, hull + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <= (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n hullSize = m;\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < 
numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n }\n\n // If the hull has more than 2 points, remove the last point\n if (hullSize > 2) {\n hullSize--;\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of points. The final hull is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, 
globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n\n // Return the hull on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return hull;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = points.size();\n std::vector localHull;\n // Find the convex hull of the local points\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n std::vector upper, lower;\n for (int i = 0; i < n; i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], 
upper[upper.size() - 1], points[i]) <= 0)\n upper.pop_back();\n upper.push_back(points[i]);\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], points[i]) >= 0)\n lower.pop_back();\n lower.push_back(points[i]);\n }\n localHull.insert(localHull.end(), upper.begin(), upper.end());\n for (int i = lower.size() - 2; i >= 0; i--)\n localHull.push_back(lower[i]);\n }\n }\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n // Find the convex hull of the global points\n if (rank == 0) {\n std::sort(globalHull.begin(), globalHull.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n std::vector upper, lower;\n for (int i = 0; i < globalHull.size(); i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], globalHull[i]) <= 0)\n upper.pop_back();\n upper.push_back(globalHull[i]);\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], globalHull[i]) >= 0)\n lower.pop_back();\n lower.push_back(globalHull[i]);\n }\n hull.insert(hull.end(), upper.begin(), upper.end());\n for (int i = lower.size() - 2; i >= 0; i--)\n hull.push_back(lower[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by polar angle with respect to the leftmost point\n std::sort(points.begin(), points.end(), [leftmost](Point a, Point b) {\n double dx1 = a.x - points[leftmost].x;\n double dy1 = a.y - points[leftmost].y;\n double dx2 = b.x - points[leftmost].x;\n double dy2 = b.y - points[leftmost].y;\n return (atan2(dy1, dx1) < atan2(dy2, dx2));\n });\n\n // Use Graham's scan to find the convex hull\n hull.push_back(points[leftmost]);\n int m = 1;\n for (int i = 1; i < points.size(); i++) {\n while (m >= 2 && (points[i].x - hull[m - 1].x) * (hull[m].y - hull[m - 1].y) - (points[i].y - hull[m - 1].y) * (hull[m].x - hull[m - 1].x) <= 0) {\n hull.pop_back();\n m--;\n }\n hull.push_back(points[i]);\n m++;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + 
"num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Reduce the local hulls to a single global hull\n std::vector globalHull;\n MPI_Reduce(&localHull[0], &globalHull[0], localHull.size() * sizeof(Point), MPI_BYTE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // If this is rank 0, copy the global hull to the output hull\n if (rank == 0) {\n hull = globalHull;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(points.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": 
"\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Find the convex hull of the points on each rank\n std::vector local_hull;\n convexHull(points, local_hull);\n\n // Gather all the local hulls on rank 0\n std::vector global_hull;\n if (my_rank == 0) {\n global_hull.resize(local_hull.size() * num_procs);\n }\n MPI_Gather(local_hull.data(), local_hull.size(), MPI_DOUBLE, global_hull.data(), local_hull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (my_rank == 0) {\n convexHull(global_hull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n if (rank == 0) {\n hull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, hull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(hull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(*convexHull[i], *convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n\t\tKOKKOS_LAMBDA(const int& i, const int& j) {\n\t\t\treturn points(i)->x < points(j)->x;\n\t\t});\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\thull.push_back(points(0));\n\thull.push_back(points(1));\n\tfor (int i = 2; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(hull[hull.size() - 2]->x - hull[hull.size() - 1]->x) * (points(i)->y - hull[hull.size() - 1]->y) -\n\t\t\t(hull[hull.size() - 2]->y - hull[hull.size() - 1]->y) * (points(i)->x - hull[hull.size() - 1]->x) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::TeamPolicy>;\n\tusing team_member_type = typename policy_type::member_type;\n\n\tconst size_t num_points = points.extent(0);\n\tconst size_t num_teams = (num_points + policy_type::team_size - 1) / policy_type::team_size;\n\n\t// Find the leftmost point\n\tteam_member_type team_member;\n\tPoint leftmost_point = points(0);\n\tfor (size_t i = 1; i < num_points; ++i) {\n\t\tif (points(i).x < leftmost_point.x) {\n\t\t\tleftmost_point = points(i);\n\t\t}\n\t}\n\n\t// Sort the points by polar angle with respect to the leftmost point\n\tstd::vector sorted_points(num_points);\n\tfor (size_t i = 0; i < num_points; ++i) {\n\t\tsorted_points[i] = points(i);\n\t}\n\tstd::sort(sorted_points.begin(), sorted_points.end(), [leftmost_point](Point const& p1, Point const& p2) {\n\t\tdouble angle1 = std::atan2(p1.y - leftmost_point.y, p1.x - leftmost_point.x);\n\t\tdouble angle2 = std::atan2(p2.y - leftmost_point.y, p2.x - leftmost_point.x);\n\t\treturn angle1 < angle2;\n\t});\n\n\t// Compute the perimeter of the convex hull using Graham's scan\n\tstd::vector convex_hull;\n\tconvex_hull.push_back(sorted_points[0]);\n\tconvex_hull.push_back(sorted_points[1]);\n\tfor (size_t i = 2; i < num_points; ++i) {\n\t\twhile (convex_hull.size() >= 2 &&\n\t\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t (sorted_points[i].y - convex_hull[convex_hull.size() - 1].y) -\n\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) *\n\t\t\t\t 
(sorted_points[i].x - convex_hull[convex_hull.size() - 1].x) < 0) {\n\t\t\tconvex_hull.pop_back();\n\t\t}\n\t\tconvex_hull.push_back(sorted_points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convex_hull.size(); ++i) {\n\t\tperimeter += distance(convex_hull[i], convex_hull[(i + 1) % convex_hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.data(), points.data() + points.extent(0), [](const Point* p1, const Point* p2) { return p1->x < p2->x; });\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector hull;\n\tfor (int i = 0; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tconst Point* p1 = hull[i];\n\t\t\tconst Point* p2 = hull[i + 1];\n\t\t\tconst Point* p3 = points[i];\n\t\t\tdouble crossProduct = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\tsum += crossProduct;\n\t\t}, Kokkos::Sum(0.0)) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tfor (int i = points.extent(0) - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tconst Point* p1 = hull[i];\n\t\t\tconst Point* p2 = hull[i + 1];\n\t\t\tconst Point* p3 = points[i];\n\t\t\tdouble crossProduct = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\tsum += crossProduct;\n\t\t}, Kokkos::Sum(0.0)) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points\n\tstd::vector hull;\n\tconvex_hull(points.data(), points.size(), std::back_inserter(hull));\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n\t\tKOKKOS_LAMBDA(const int& i, const int& j) {\n\t\t\treturn points(i)->x < points(j)->x;\n\t\t});\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points(0));\n\tconvexHull.push_back(points(points.extent(0) - 1));\n\tfor (int i = 1; i < points.extent(0) - 1; i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2]->y - convexHull[convexHull.size() - 1]->y) *\n\t\t\t(convexHull[convexHull.size() - 1]->x - points(i)->x) <\n\t\t\t(convexHull[convexHull.size() - 1]->y - points(i)->y) *\n\t\t\t(convexHull[convexHull.size() - 2]->x - convexHull[convexHull.size() - 1]->x)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(*convexHull[i], *convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector 
hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tgrahamScan(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Find the convex hull of the points\n\tstd::vector hull_points;\n\tconvex_hull(points, hull_points);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull_points.size(); ++i) {\n\t\tsize_t j = (i + 1) % hull_points.size();\n\t\tperimeter += distance(hull_points[i], hull_points[j]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points\n\tstd::vector hull;\n\tconvex_hull(points.data(), points.extent(0), std::back_inserter(hull));\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.extent(0));\n\tif (points.extent(0) < 3) {\n\t\tfor (int i = 0; i < points.extent(0); ++i) {\n\t\t\tconvexHull.push_back(points[i]);\n\t\t}\n\t} else {\n\t\t// Find the leftmost point.\n\t\tint leftmost = 0;\n\t\tfor (int i = 1; i < points.extent(0); ++i) {\n\t\t\tif (points[i].x < points[leftmost].x) {\n\t\t\t\tleftmost = i;\n\t\t\t}\n\t\t}\n\n\t\t// Start from the leftmost point and keep moving clockwise until we reach the leftmost point again.\n\t\tint current = leftmost;\n\t\tdo {\n\t\t\tconvexHull.push_back(points[current]);\n\t\t\tint next = (current + 1) % points.extent(0);\n\t\t\tfor (int i = 0; i < points.extent(0); ++i) {\n\t\t\t\tif (orientation(points[current], points[next], points[i]) == 2) {\n\t\t\t\t\tnext = i;\n\t\t\t\t}\n\t\t\t}\n\t\t\tcurrent = next;\n\t\t} while (current != leftmost);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute 
the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int& i, const int& j) {\n\t\treturn points(i)->x < points(j)->x;\n\t});\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\tfor (int i = 0; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tPoint* p1 = hull[i];\n\t\t\tPoint* p2 = hull[i + 1];\n\t\t\tPoint* p3 = points(i);\n\t\t\tdouble cross = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\treturn sum + cross;\n\t\t}, 0.0) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower 
hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[points.size() - 1]);\n\n\tfor (int i = 1; i < points.size() - 1; i++) {\n\t\tPoint const& p = points[i];\n\t\tif (convexHull.size() < 2 || (convexHull.back().x - convexHull[convexHull.size() - 2].x) * (p.y - convexHull.back().y) - (convexHull.back().y - convexHull[convexHull.size() - 2].y) * (p.x - convexHull.back().x) > 0) {\n\t\t\tconvexHull.push_back(p);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && crossProduct(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) 
{\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && (upperHull.back().y - upperHull[upperHull.size() - 2].y) * (point.x - upperHull.back().x) < (upperHull.back().x - upperHull[upperHull.size() - 2].x) * (point.y - upperHull.back().y)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\t\twhile (lowerHull.size() >= 2 && (lowerHull.back().y - lowerHull[lowerHull.size() - 2].y) * (point.x - lowerHull.back().x) > (lowerHull.back().x - lowerHull[lowerHull.size() - 2].x) * (point.y - lowerHull.back().y)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 2; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\tfor (int i = 2; i < n; i++) {\n\t\twhile (hull.size() >= 2 && crossProduct(hull[hull.size() - 2], hull[hull.size() - 1], points[i]) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t(points[i].y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t(points[i].x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper 
hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tstd::vector hull;\n\tstd::vector visited(points.size(), false);\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\tint current = leftmost;\n\tdo {\n\t\thull.push_back(points[current]);\n\t\tvisited[current] = true;\n\n\t\t// Find the next point on the hull\n\t\tint next = -1;\n\t\tfor (int i = 0; i < points.size(); i++) {\n\t\t\tif (!visited[i] && (next == -1 || points[i].x * hull.back().y - points[i].y * hull.back().x > points[next].x * hull.back().y - points[next].y * hull.back().x)) {\n\t\t\t\tnext = i;\n\t\t\t}\n\t\t}\n\t\tcurrent = next;\n\t} while (current != leftmost);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[i + 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (point.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) >\n\t\t\t\t (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - point.x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (point.y - 
lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) <\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - point.x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n\t\t\t\t (point.x - upperHull[upperHull.size() - 1].x) <=\n\t\t\t\t (upperHull[upperHull.size() - 1].y - point.y) *\n\t\t\t\t\t (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n\t\t\t\t (point.x - lowerHull[lowerHull.size() - 1].x) >=\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - point.y) *\n\t\t\t\t\t (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n < 3) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::vector visited(n, false);\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\tint p = leftmost;\n\tdo {\n\t\thull.push_back(points[p]);\n\t\tvisited[p] = true;\n\t\tint q = (p + 1) % n;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\tif (!visited[i] && (orientation(points[p], points[i], points[q]) == 2)) {\n\t\t\t\tq = i;\n\t\t\t}\n\t\t}\n\t\tp = q;\n\t} while (p != leftmost);\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (Point const& p : points) {\n\t\twhile (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], p) <= 0)\n\t\t\tupperHull.pop_back();\n\t\tupperHull.push_back(p);\n\n\t\twhile (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], p) >= 0)\n\t\t\tlowerHull.pop_back();\n\t\tlowerHull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < 
upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::vector visited(n, false);\n\tint p = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x < points[p].x) {\n\t\t\tp = i;\n\t\t}\n\t}\n\tint q = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x > points[q].x) {\n\t\t\tq = i;\n\t\t}\n\t}\n\thull.push_back(points[p]);\n\thull.push_back(points[q]);\n\tvisited[p] = true;\n\tvisited[q] = true;\n\twhile (p != q) {\n\t\tint next = -1;\n\t\tdouble max_dist = -1;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\tif (!visited[i]) {\n\t\t\t\tdouble dist = distance(points[i], hull.back());\n\t\t\t\tif (dist > max_dist) {\n\t\t\t\t\tmax_dist = dist;\n\t\t\t\t\tnext = i;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\thull.push_back(points[next]);\n\t\tvisited[next] = true;\n\t\tp = q;\n\t\tq = next;\n\t}\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && crossProduct(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinate\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the upper 
and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tupperHull.push_back(points[0]);\n\tupperHull.push_back(points[1]);\n\tlowerHull.push_back(points[0]);\n\tlowerHull.push_back(points[n - 1]);\n\n\tfor (int i = 2; i < n; i++) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - upperHull[upperHull.size() - 1].x) <=\n\t\t\t\t (upperHull[upperHull.size() - 1].y - points[i].y) *\n\t\t\t\t\t (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - lowerHull[lowerHull.size() - 1].x) >=\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - points[i].y) *\n\t\t\t\t\t (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (int i = 0; i < lowerHull.size() - 1; i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tperimeter += distance(upperHull[upperHull.size() - 1], lowerHull[lowerHull.size() - 1]);\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(points[hull[i]], points[hull[(i + 1) % hull.size()]]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the lower and upper hulls\n\tstd::vector lowerHull, upperHull;\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) == -1) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) == 1) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < lowerHull.size() - 1; ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tfor (size_t i = upperHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (points.size() <= 2) {\n\t\treturn 
0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (auto& point : points) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t (point.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t\t (hull.back().y - hull[hull.size() - 2].y) * (point.x - hull.back().x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(point);\n\t}\n\n\tint t = hull.size() - 1;\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() > t &&\n\t\t\t (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t\t (hull.back().y - hull[hull.size() - 2].y) * (points[i].x - hull.back().x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <= (points[i].x - hull.back().x) * (hull[hull.size() - 2].y - hull.back().y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (p.x - hull[hull.size() - 2].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan algorithm\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && orientation(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += 
distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.empty()) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t(points[i].y - hull.back().y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() < 3) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n\t\treturn a.x < b.x;\n\t});\n\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n\t\treturn a.y < b.y;\n\t});\n\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (p.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (p.y - hull[hull.size() - 2].y) * (hull.back().x - hull[hull.size() - 1].x)) 
{\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (p.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (p.y - hull[hull.size() - 2].y) * (hull.back().x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (points[i].x - hull.back().x) * (hull[hull.size() - 2].y - hull.back().y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the lower convex hull\n\tfor (Point const& point : points) {\n\t\twhile (hull.size() >= 2 && (point.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) > (point.y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - point.x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(point);\n\t}\n\n\t// Find the upper convex hull\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) > (points[i].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + 
"\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\n\tfor (Point const& p : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t(p.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) >\n\t\t\t(upperHull[upperHull.size() - 1].y - upperHull[upperHull.size() - 2].y) * (p.x - upperHull[upperHull.size() - 2].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(p);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t(p.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) <\n\t\t\t(lowerHull[lowerHull.size() - 1].y - lowerHull[lowerHull.size() - 2].y) * (p.x - lowerHull[lowerHull.size() - 2].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(p);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; i--) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y) - (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the leftmost and rightmost points\n\tPoint leftmost = points[0];\n\tPoint rightmost = points[points.size() - 1];\n\n\t// Add the leftmost and rightmost points to the hull\n\thull.push_back(leftmost);\n\thull.push_back(rightmost);\n\n\t// Find the upper and lower tangents of the convex hull\n\tstd::vector upperTangent;\n\tstd::vector lowerTangent;\n\n\t// Find the upper tangent\n\tfor (int i = 1; i < points.size(); i++) {\n\t\twhile (upperTangent.size() >= 2 && crossProduct(upperTangent[upperTangent.size() - 2], upperTangent[upperTangent.size() - 1], points[i]) <= 0) {\n\t\t\tupperTangent.pop_back();\n\t\t}\n\t\tupperTangent.push_back(points[i]);\n\t}\n\n\t// Find the lower tangent\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (lowerTangent.size() >= 2 && crossProduct(lowerTangent[lowerTangent.size() - 2], lowerTangent[lowerTangent.size() - 1], points[i]) <= 0) 
{\n\t\t\tlowerTangent.pop_back();\n\t\t}\n\t\tlowerTangent.push_back(points[i]);\n\t}\n\n\t// Merge the upper and lower tangents into the hull\n\tfor (int i = 0; i < lowerTangent.size(); i++) {\n\t\thull.push_back(lowerTangent[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y) - (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(points[i].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the lower hull\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(p.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(hull[hull.size() - 1].y - p.y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Find the upper hull\n\tfor (int i = points.size() - 2; i >= 0; i--) 
{\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(p.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(hull[hull.size() - 1].y - p.y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) -\n\t\t\t(points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distance(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Store the result in perimeter.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\n__global__ void convexHullPerimeter(const Point *points, size_t numPoints, double *perimeter) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points.\n\tsize_t numVertices;\n\tPoint *vertices;\n\tconvexHull(points, numPoints, vertices, numVertices);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0.0;\n\tfor (size_t i = 0; i < numVertices; i++) {\n\t\t*perimeter += distance(vertices[i], vertices[i + 1]);\n\t}\n}\n", + "\n\n\tPoint p0 = points[0];\n\tdouble minX = p0.x;\n\tfor (size_t i = 1; i < numPoints; i++) {\n\t\tif (points[i].x < minX) {\n\t\t\tminX = points[i].x;\n\t\t\tp0 = points[i];\n\t\t}\n\t}\n\n\tstd::vector hull;\n\thull.push_back(p0);\n\n\tPoint next = p0;\n\tdo {\n\t\tPoint bestPoint = points[0];\n\t\tdouble maxAngle = -1;\n\t\tfor (size_t i = 1; i < numPoints; i++) {\n\t\t\tif (points[i] == next) continue;\n\t\t\tdouble angle = atan2(points[i].y - next.y, points[i].x - next.x);\n\t\t\tif (angle > maxAngle) {\n\t\t\t\tmaxAngle = angle;\n\t\t\t\tbestPoint = points[i];\n\t\t\t}\n\t\t}\n\t\thull.push_back(bestPoint);\n\t\tnext = bestPoint;\n\t} while (next != p0);\n\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tstd::vector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tPoint p0 = points[0];\n\tdouble minX = p0.x, maxX = p0.x;\n\tdouble minY = p0.y, maxY = p0.y;\n\tfor (size_t i = 1; i < numPoints; i++) {\n\t\tPoint p = points[i];\n\t\tminX = min(minX, p.x);\n\t\tmaxX = max(maxX, p.x);\n\t\tminY = min(minY, p.y);\n\t\tmaxY = max(maxY, p.y);\n\t}\n\n\tvector upper, lower;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tPoint p = points[i];\n\t\tif (p.y == minY) {\n\t\t\tlower.push_back(p);\n\t\t}\n\t\tif (p.y == maxY) {\n\t\t\tupper.push_back(p);\n\t\t}\n\t}\n\n\tsort(lower.begin(), lower.end(), [](Point const& p1, Point const& p2) { return p1.x < 
p2.x; });\n\tsort(upper.begin(), upper.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\tvector ch;\n\tch.push_back(lower[0]);\n\tfor (size_t i = 1; i < lower.size(); i++) {\n\t\twhile (ch.size() >= 2 && (ch.back().y - ch[ch.size() - 2].y) * (lower[i].x - ch.back().x) < (ch.back().x - ch[ch.size() - 2].x) * (lower[i].y - ch.back().y)) {\n\t\t\tch.pop_back();\n\t\t}\n\t\tch.push_back(lower[i]);\n\t}\n\n\tfor (size_t i = upper.size() - 1; i >= 0; i--) {\n\t\twhile (ch.size() >= 2 && (ch.back().y - ch[ch.size() - 2].y) * (upper[i].x - ch.back().x) < (ch.back().x - ch[ch.size() - 2].x) * (upper[i].y - ch.back().y)) {\n\t\t\tch.pop_back();\n\t\t}\n\t\tch.push_back(upper[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < ch.size(); i++) {\n\t\tperimeter += distance(ch[i], ch[(i + 1) % ch.size()]);\n\t}\n\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint numBlocks = hipGridDim_x;\n\tint numThreads = hipBlockDim_x;\n\n\tPoint p1 = points[tid];\n\tdouble minDist = DBL_MAX;\n\tPoint p2;\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tif (i == tid) continue;\n\t\tPoint p3 = points[i];\n\t\tdouble dist = distance(p1, p3);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tp2 = p3;\n\t\t}\n\t}\n\n\t__shared__ double sharedMinDist[numBlocks];\n\t__shared__ Point sharedP2[numBlocks];\n\n\tsharedMinDist[bid] = minDist;\n\tsharedP2[bid] = p2;\n\n\t__syncthreads();\n\n\tdouble globalMinDist = DBL_MAX;\n\tPoint globalP2;\n\tfor (int i = 0; i < numBlocks; i++) {\n\t\tif (sharedMinDist[i] < globalMinDist) {\n\t\t\tglobalMinDist = sharedMinDist[i];\n\t\t\tglobalP2 = sharedP2[i];\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*perimeter = 2 * globalMinDist + distance(p1, globalP2);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < numPoints; ++i) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\t// Initialize the convex hull with the leftmost point\n\tstd::vector hull;\n\thull.push_back(points[leftmost]);\n\n\t// Find the convex hull using Graham's scan\n\tint p = leftmost;\n\tdo {\n\t\tint q = (p + 1) % numPoints;\n\t\tfor (int i = 0; i < numPoints; ++i) {\n\t\t\tif (i == p || i == q) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tif (orientation(points[p], points[q], points[i]) == -1) {\n\t\t\t\tq = i;\n\t\t\t}\n\t\t}\n\t\thull.push_back(points[q]);\n\t\tp = q;\n\t} while (p != leftmost);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (int i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < numPoints; i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\t// Initialize the 
convex hull with the leftmost point\n\tvector hull = { points[leftmost] };\n\n\t// Find the next point to add to the convex hull\n\twhile (true) {\n\t\t// Find the point that is farthest from the line connecting the last two points in the convex hull\n\t\tint farthest = -1;\n\t\tdouble maxDistance = 0;\n\t\tfor (int i = 0; i < numPoints; i++) {\n\t\t\t// Skip points that are already in the convex hull\n\t\t\tif (find(hull.begin(), hull.end(), points[i]) != hull.end()) {\n\t\t\t\tcontinue;\n\t\t\t}\n\n\t\t\t// Compute the distance from the point to the line connecting the last two points in the convex hull\n\t\t\tPoint p1 = hull[hull.size() - 2];\n\t\t\tPoint p2 = hull[hull.size() - 1];\n\t\t\tdouble distance = fabs((p2.y - p1.y) * points[i].x - (p2.x - p1.x) * points[i].y + p2.x * p1.y - p2.y * p1.x) / sqrt(pow(p2.y - p1.y, 2) + pow(p2.x - p1.x, 2));\n\n\t\t\t// If the distance is greater than the maximum distance so far, update the farthest point and the maximum distance\n\t\t\tif (distance > maxDistance) {\n\t\t\t\tfarthest = i;\n\t\t\t\tmaxDistance = distance;\n\t\t\t}\n\t\t}\n\n\t\t// If no point is farther from the line connecting the last two points in the convex hull, the convex hull is complete\n\t\tif (farthest == -1) {\n\t\t\tbreak;\n\t\t}\n\n\t\t// Add the farthest point to the convex hull\n\t\thull.push_back(points[farthest]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[i + 1]);\n\t}\n\n\t// Store the perimeter in the output variable\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points, points + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && (upperHull[upperHull.size() - 1].y - upperHull[upperHull.size() - 2].y) * (points[i].x - upperHull[upperHull.size() - 2].x) < (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) * (points[i].y - upperHull[upperHull.size() - 2].y)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && (lowerHull[lowerHull.size() - 1].y - lowerHull[lowerHull.size() - 2].y) * (points[i].x - lowerHull[lowerHull.size() - 2].x) > (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) * (points[i].y - lowerHull[lowerHull.size() - 2].y)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\t*perimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\t*perimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the 
points using the Graham scan algorithm.\n\tvector hull;\n\tfor (size_t i = 0; i < numPoints; ++i) {\n\t Point p = points[i];\n\t while (hull.size() >= 2 && crossProduct(hull[hull.size() - 2], hull.back(), p) <= 0) {\n\t hull.pop_back();\n\t }\n\t hull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinate\n\tstd::sort(points, points + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\t*perimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\t*perimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points\n\tPoint *hull = new Point[numPoints];\n\tint hullSize = convexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hullSize; i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hullSize]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 
2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[points.size() - 1]);\n\tfor (int i = 1; i < points.size() - 1; i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::vector hullIndices;\n\thullIndices.reserve(points.size());\n\tgrahamScan(points, convexHull, hullIndices);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < 
convexHull.size(); i++) {\n\t\tint nextIndex = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[nextIndex]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2) {\n\t\t\tPoint p1 = convexHull[convexHull.size() - 2];\n\t\t\tPoint p2 = convexHull[convexHull.size() - 1];\n\t\t\tPoint p3 = points[i];\n\t\t\tdouble cross = (p2.x - p1.x) * (p3.y - p2.y) - (p2.y - p1.y) * (p3.x - p2.x);\n\t\t\tif (cross > 0) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint left = 0, right = points.size() - 1;\n\twhile (left < right) {\n\t\tconvexHull.push_back(points[left++]);\n\t\twhile (left < right && (convexHull.back().y - points[left].y) * (convexHull.back().x - points[right].x) <= (convexHull.back().y - points[right].y) * (convexHull.back().x - points[left].x)) {\n\t\t\tright--;\n\t\t}\n\t\tconvexHull.push_back(points[right--]);\n\t\twhile (left < right && (convexHull.back().y - points[left].y) * (convexHull.back().x - points[right].x) <= (convexHull.back().y - points[right].y) * (convexHull.back().x - points[left].x)) {\n\t\t\tleft++;\n\t\t}\n\t}\n\tconvexHull.push_back(points[left]);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y 
!= convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Send the convex hull to rank 0\n\tstd::vector globalConvexHull;\n\tMPI_Gather(&convexHull[0], convexHull.size(), MPI_DOUBLE, &globalConvexHull[0], globalConvexHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Compute the perimeter of the convex hull on rank 0\n\tif (rank == 0) {\n\t\tdouble perimeter = 0;\n\t\tfor (size_t i = 0; i < globalConvexHull.size(); i++) {\n\t\t\tperimeter += distance(globalConvexHull[i], globalConvexHull[(i + 1) % globalConvexHull.size()]);\n\t\t}\n\t\treturn perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint const rank = MPI::COMM_WORLD.Get_rank();\n\tint const size = MPI::COMM_WORLD.Get_size();\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tconvexHull.push_back(points[0]);\n\tfor (size_t i = 1; i < points.size(); ++i) {\n\t\t// Find the first point in the convex hull that is to the left of the line through the last two points\n\t\tsize_t j = convexHull.size() - 1;\n\t\twhile (j > 0 && (convexHull[j].x - convexHull[j-1].x) * (points[i].y - convexHull[j].y) -\n\t\t\t(convexHull[j].y - convexHull[j-1].y) * (points[i].x - convexHull[j].x) < 0) {\n\t\t\t--j;\n\t\t}\n\t\t// Add the point to the convex hull\n\t\tconvexHull.insert(convexHull.begin() + j + 1, points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI::COMM_WORLD.Reduce(&perimeter, &globalPerimeter, 1, MPI::DOUBLE, MPI::SUM, 0);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeters from all ranks to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the global perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the total perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, 
size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (auto const& point : points) {\n\t\tif (convexHull.empty()) {\n\t\t\tconvexHull.push_back(point);\n\t\t} else {\n\t\t\twhile (convexHull.size() >= 2 &&\n\t\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t\t(point.y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t(point.x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\t\tconvexHull.pop_back();\n\t\t\t}\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2 && (points[next] - convexHull.back()).cross(points[i] - convexHull.back()) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || convexHull.back() != point) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t\twhile (convexHull.size() >= 3 &&\n\t\t\t (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 3].x) *\n\t\t\t\t\t (point.y - convexHull[convexHull.size() - 2].y) -\n\t\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 3].y) *\n\t\t\t\t\t (point.x - convexHull[convexHull.size() - 2].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) 
{\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Find the total perimeter of the convex hulls on all ranks\n\tdouble totalPerimeter = 0;\n\tfor (int i = 0; i < size; i++) {\n\t\tif (i == rank) {\n\t\t\tfor (int j = 0; j < convexHull.size(); j++) {\n\t\t\t\ttotalPerimeter += distance(convexHull[j], convexHull[(j + 1) % convexHull.size()]);\n\t\t\t}\n\t\t}\n\t\tMPI_Reduce(&totalPerimeter, &totalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\t}\n\n\t// Return the total perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn totalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (auto const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Reduce the convex hulls from all ranks to rank 0\n\tstd::vector globalConvexHull;\n\tMPI_Reduce(&convexHull[0], &globalConvexHull[0], convexHull.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Compute the perimeter of the global convex hull on rank 0\n\tif (rank == 0) {\n\t\tdouble perimeter = 0;\n\t\tfor (size_t i = 0; i < globalConvexHull.size(); i++) {\n\t\t\tperimeter += distance(globalConvexHull[i], globalConvexHull[(i + 1) % globalConvexHull.size()]);\n\t\t}\n\t\treturn perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (auto it = points.begin(); it != points.end(); ++it) {\n\t\twhile (convexHull.size() >= 2 && (it->y - convexHull[convexHull.size() - 2].y) * (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) < (it->x - convexHull[convexHull.size() - 2].x) * (convexHull[convexHull.size() - 1].y - convexHull[convexHull.size() - 2].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(*it);\n\t}\n\tconvexHull.pop_back();\n\tfor (auto it = points.rbegin(); it != points.rend(); ++it) {\n\t\twhile (convexHull.size() >= 2 && (it->y - convexHull[convexHull.size() - 2].y) * (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) < (it->x - convexHull[convexHull.size() - 2].x) * (convexHull[convexHull.size() - 1].y - convexHull[convexHull.size() - 2].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(*it);\n\t}\n\tconvexHull.pop_back();\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, 
&globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2 && (points[next] - convexHull.back()).cross(points[i] - convexHull.back()) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& p : points) {\n\t}\n\n\t// Compute the total perimeter\n\tdouble totalPerimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t}\n\n\t// Reduce the total perimeter to rank 0\n\tdouble finalPerimeter;\n\tMPI_Reduce(&totalPerimeter, &finalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the final perimeter on rank 0\n\tif (rank == 0) {\n\t}\n}\n", + "Hull: 1. 
Synag", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || convexHull.back() != point) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t\twhile (convexHull.size() >= 3 &&\n\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 3].x) *\n\t\t\t\t(point.y - convexHull[convexHull.size() - 2].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 3].y) *\n\t\t\t\t(point.x - convexHull[convexHull.size() - 2].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distance(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Store the result in perimeter.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\n__global__ void convexHullPerimeter(const Point *points, size_t numPoints, double *perimeter) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint numBlocks = gridDim.x;\n\tint numThreads = blockDim.x;\n\tint start = tid + bid * numThreads;\n\tdouble localPerimeter = 0;\n\tfor (int i = start; i < numPoints - 1; i++) {\n\t\tlocalPerimeter += distance(points[i], points[i + 1]);\n\t}\n\tlocalPerimeter += distance(points[numPoints - 1], points[0]);\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tdouble globalPerimeter = 0;\n\t\tfor (int i = 0; i < numBlocks; i++) {\n\t\t\tdouble blockPerimeter;\n\t\t\tcudaMemcpy(&blockPerimeter, &localPerimeter, sizeof(double), cudaMemcpyDeviceToHost);\n\t\t\tglobalPerimeter += blockPerimeter;\n\t\t}\n\t\t*perimeter = globalPerimeter;\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\tPoint *convexHull;\n\tsize_t convexHullSize;\n\tconvexHull2(points, numPoints, &convexHull, &convexHullSize);\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < convexHullSize; i++) {\n\t\t*perimeter += distance(convexHull[i], convexHull[(i + 1) % convexHullSize]);\n\t}\n\tfree(convexHull);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tPoint p = points[tid];\n\tdouble minDist = DBL_MAX;\n\tPoint closestPoint;\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tif (i == tid) {\n\t\t\tcontinue;\n\t\t}\n\t\tPoint q = points[i];\n\t\tdouble dist = distance(p, q);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tclosestPoint = q;\n\t\t}\n\t}\n\t*perimeter += minDist;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % 
hull.size()]);\n\t}\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tPoint *sortedPoints = new Point[numPoints];\n\tthrust::copy(points, points + numPoints, sortedPoints);\n\tthrust::sort(sortedPoints, sortedPoints + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\twhile (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], sortedPoints[i]) <= 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(sortedPoints[i]);\n\t\twhile (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], sortedPoints[i]) >= 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(sortedPoints[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= numPoints) return;\n\n\tPoint p = points[i];\n\tdouble minDist = DBL_MAX;\n\tPoint minPoint;\n\tfor (int j = 0; j < numPoints; j++) {\n\t\tif (i == j) continue;\n\t\tPoint q = points[j];\n\t\tdouble dist = distance(p, q);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tminPoint = q;\n\t\t}\n\t}\n\t*perimeter += minDist;\n}\n", + "Hull.2)", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tPoint *sortedPoints = new Point[numPoints];\n\tmemcpy(sortedPoints, points, numPoints * sizeof(Point));\n\tstd::sort(sortedPoints, sortedPoints + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], sortedPoints[i]) <= 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(sortedPoints[i]);\n\t\twhile (lowerHull.size() >= 2 && 
crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], sortedPoints[i]) >= 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(sortedPoints[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size() - 1; i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tperimeter += distance(upperHull[upperHull.size() - 1], lowerHull[lowerHull.size() - 1]);\n\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tPoint p = points[tid];\n\t\tdouble minDist = DBL_MAX;\n\t\tPoint closestPoint;\n\t\tfor (int i = 0; i < numPoints; i++) {\n\t\t\tif (i == tid) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tPoint q = points[i];\n\t\t\tdouble dist = distance(p, q);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t\tclosestPoint = q;\n\t\t\t}\n\t\t}\n\t\tatomicAdd(perimeter, minDist);\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. 
The final result is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (int i = 0; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\tint next = i + 1;\n\t\tif (next == points.size()) {\n\t\t\tnext = 0;\n\t\t}\n\t\tint prev = i - 1;\n\t\tif (prev == -1) {\n\t\t\tprev = points.size() - 1;\n\t\t}\n\t\tdouble area = (points[next].x - points[prev].x) * (points[i].y - points[prev].y) - (points[next].x - points[prev].x) * (points[i].y - points[next].y);\n\t\tif (area >= 0) {\n\t\t\tconvexHull.push_back(points[i]);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = i + 1;\n\t\tif (next == convexHull.size()) {\n\t\t\tnext = 0;\n\t\t}\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "source_write_success": true, + 
"did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (int i = 0; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint n = points.size();\n\tif (n <= 2) {\n\t\tconvexHull = points;\n\t} else {\n\t\tint m = 0;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\twhile (m >= 2 && (points[i].x - convexHull[m-1].x) * (convexHull[m-2].y - convexHull[m-1].y) - (points[i].y - convexHull[m-1].y) * (convexHull[m-2].x - convexHull[m-1].x) < 0) {\n\t\t\t\tm--;\n\t\t\t}\n\t\t\tconvexHull[m++] = points[i];\n\t\t}\n\t\tfor (int i = n-2, t = m+1; i >= 0; i--) {\n\t\t\twhile (m >= t && (points[i].x - convexHull[m-1].x) * (convexHull[m-2].y - convexHull[m-1].y) - (points[i].y - convexHull[m-1].y) * (convexHull[m-2].x - convexHull[m-1].x) < 0) {\n\t\t\t\tm--;\n\t\t\t}\n\t\t\tconvexHull[m++] = points[i];\n\t\t}\n\t\tconvexHull.resize(m);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], 
convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham scan algorithm\n\tstd::vector convexHull = grahamScan(points);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t\t (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) *\n\t\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = 
start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t(convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Compute the convex hull of the points using Graham scan\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y); });\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size()-2].y - hull[hull.size()-1].y) * (p.x - hull[hull.size()-1].x) < (hull[hull.size()-1].x - hull[hull.size()-2].x) * (p.y - hull[hull.size()-1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\tint n = hull.size();\n\tfor (int i = n-2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size()-2].y - hull[hull.size()-1].y) * (p.x - hull[hull.size()-1].x) < (hull[hull.size()-1].x - hull[hull.size()-2].x) * (p.y - hull[hull.size()-1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < n; i++) {\n\t\tperimeter += distance(hull[i], hull[(i+1)%n]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the hull in parallel\n\tdouble perimeter = 0.0;\n\t#pragma omp parallel for reduction(+:perimeter)\n\tfor (int i = 0; i < hull.size(); ++i) {\n\t\tint next = (i + 1) % hull.size();\n\t\tperimeter += distance(hull[i], hull[next]);\n\t}\n\n\t// Return the perimeter\n\treturn perimeter;\n}\n\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint const rank = 0;\n\tint const size = MPI::COMM_WORLD.Get_size();\n\tint const points_per_rank = points.size() / size;\n\tint const remainder = points.size() % size;\n\n\tstd::vector local_points = points;\n\tlocal_points.resize(points_per_rank + (rank < remainder));\n\n\tstd::vector convex_hull;\n\tconvexHull(local_points, convex_hull);\n\n\tdouble local_perimeter = 0;\n\tfor (size_t i = 0; i < convex_hull.size(); ++i) {\n\t\tlocal_perimeter += distance(convex_hull[i], convex_hull[(i+1)%convex_hull.size()]);\n\t}\n\n\tdouble global_perimeter;\n\tMPI::COMM_WORLD.Reduce(&local_perimeter, &global_perimeter, 1, MPI_DOUBLE, MPI_SUM, rank);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint n = points.size();\n\tfor (int i = 0; i < n; i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <= (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = n - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <= (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + 
"did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint left = 0, right = 0;\n\twhile (left < points.size()) {\n\t\twhile (right < points.size() && points[right].x == points[left].x) {\n\t\t\tright++;\n\t\t}\n\t\tint n = right - left;\n\t\tif (n > 1) {\n\t\t\tstd::sort(points.begin() + left, points.begin() + right, [](Point const& p1, Point const& p2) { return p1.y < p2.y; });\n\t\t\tfor (int i = 0; i < n; i++) {\n\t\t\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[left + i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) * (points[left + i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\t\t\tconvexHull.pop_back();\n\t\t\t\t}\n\t\t\t\tconvexHull.push_back(points[left + i]);\n\t\t\t}\n\t\t}\n\t\tleft = right;\n\t}\n\tconvexHull.push_back(convexHull[0]);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size() - 1; i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[i + 1]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = points.size();\n\tstd::vector hull;\n\n\t// Find the convex hull of the points using Graham's scan.\n\tfor (int i = 0; i < n; i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points[i].x - hull[hull.size() - 1].x) <\n\t\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tint m = hull.size();\n\tfor (int i = n - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= m &&\n\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points[i].x - hull[hull.size() - 1].x) <\n\t\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\thull.pop_back();\n\n\t// Compute the perimeter of the convex hull in parallel.\n\tdouble perimeter = 0;\n\t#pragma omp parallel for reduction(+ : perimeter)\n\tfor (int i = 0; i < m; i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % m]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble global_perimeter;\n\tMPI_Reduce(&perimeter, &global_perimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (Point const& p : points) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size()-2].y - convexHull[convexHull.size()-1].y) * (p.x - convexHull[convexHull.size()-1].x) < (convexHull[convexHull.size()-1].x - convexHull[convexHull.size()-2].x) * (p.y - convexHull[convexHull.size()-1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(p);\n\t}\n\tconvexHull.pop_back();\n\tfor (int i = points.size()-2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size()-2].y - convexHull[convexHull.size()-1].y) * (points[i].x - convexHull[convexHull.size()-1].x) < (convexHull[convexHull.size()-1].x - convexHull[convexHull.size()-2].x) * (points[i].y - convexHull[convexHull.size()-1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tconvexHull.pop_back();\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * 
points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t\t (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) *\n\t\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint const rank = MPI::COMM_WORLD.Get_rank();\n\tint const size = MPI::COMM_WORLD.Get_size();\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convex_hull;\n\tconvex_hull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tconvex_hull.push_back(points[0]);\n\tconvex_hull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (convex_hull.size() >= 2 &&\n\t\t\t 
(convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t\t (convex_hull[convex_hull.size() - 1].y - points[i].y) -\n\t\t\t\t\t (convex_hull[convex_hull.size() - 1].x - points[i].x) *\n\t\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) <\n\t\t\t\t 0) {\n\t\t\tconvex_hull.pop_back();\n\t\t}\n\t\tconvex_hull.push_back(points[i]);\n\t}\n\tint const convex_hull_size = convex_hull.size();\n\twhile (convex_hull.size() >= 2 &&\n\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t (convex_hull[convex_hull.size() - 1].y - convex_hull[0].y) -\n\t\t\t\t (convex_hull[convex_hull.size() - 1].x - convex_hull[0].x) *\n\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) <\n\t\t\t 0) {\n\t\tconvex_hull.pop_back();\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convex_hull_size; ++i) {\n\t\tperimeter += distance(convex_hull[i], convex_hull[(i + 1) % convex_hull_size]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble global_perimeter;\n\tMPI::COMM_WORLD.Allreduce(&perimeter, &global_perimeter, 1, MPI::DOUBLE, MPI::SUM);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &size) {\n if (visited(i) == 0) {\n size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n });\n }\n }, size);\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices.\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices.\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0.\n int largestComponentSize = 0;\n\n // Iterate over all vertices.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a depth-first search to find the size of its component.\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n\n // Update the largest component size if necessary.\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n // Return the largest component size.\n return largestComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int componentSize = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &componentSize) {\n if (visited[i] == 0) {\n componentSize++;\n visited[i] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[i][j] == 1 && visited[j] == 0) {\n visited[j] = 1;\n }\n }\n }\n });\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n return maxComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n depthFirstSearch(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component\n if 
(!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n\n // Update the largest component size if necessary\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited(i) == 0) {\n component_size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n });\n }\n }, component_size);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, false);\n int largest = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int size = 0;\n std::stack s;\n s.push(i);\n while (!s.empty()) {\n size_t v = s.top();\n s.pop();\n if (!visited[v]) {\n visited[v] = true;\n size++;\n for (size_t j = 0; j < N; j++) {\n if (A(i, j) && !visited[j]) {\n s.push(j);\n }\n }\n }\n }\n largest = std::max(largest, size);\n }\n }\n return largest;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited[i] == 0) {\n component_size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n visited[j] = 1;\n }\n });\n }\n }, component_size);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n using namespace Kokkos;\n\n // Initialize the visited array to false for all vertices\n View visited(\"visited\", N);\n visited() = false;\n\n // Initialize the component size array to 0 for all vertices\n View componentSize(\"componentSize\", N);\n componentSize() = 0;\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component containing it\n if (!visited(i)) {\n int componentSize = 0;\n DFS(A, i, visited, componentSize);\n\n // Update the largest component size if necessary\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n return largestComponentSize;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n int max_component = 0;\n Kokkos::parallel_reduce(\n \"largestComponent\",\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_component) {\n // Perform a depth-first search to count the number of vertices in the\n // connected component containing vertex i.\n std::vector visited(N, false);\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n int v = stack.top();\n stack.pop();\n if (!visited[v]) {\n visited[v] = true;\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A(i, j) && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n },\n max_component);\n return max_component;\n}\n", + "\n\n // Initialize the visited array to false\n 
Kokkos::View visited(\"visited\", N);\n visited() = false;\n\n // Initialize the component size array to 0\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize() = 0;\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited(i)) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n\n // Update the largest component size\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n return largestComponentSize;\n}\n", + "Kokok", + "\n\n // Create a vector to store the size of each component.\n Kokkos::View component_sizes(\"component_sizes\", N);\n\n // Initialize the component sizes to 1.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n component_sizes[i] = 1;\n });\n\n // Iterate over the adjacency matrix.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) {\n // If there is an edge between i and j, then they are in the same component.\n if (A(i, j) == 1) {\n // Find the smaller component size.\n int smaller_component_size = std::min(component_sizes[i], component_sizes[j]);\n\n // Update the component sizes.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int k) {\n if (component_sizes[k] == smaller_component_size) {\n component_sizes[k] = std::max(component_sizes[i], component_sizes[j]);\n }\n });\n }\n }\n });\n\n // Find the largest component size.\n int largest_component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &largest_component_size) {\n largest_component_size = std::max(largest_component_size, component_sizes[i]);\n });\n\n return largest_component_size;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component_size = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int &j, int &local_component_size) {\n if (A(i, j) == 1) {\n local_component_size += 1;\n }\n }, component_size);\n max_component_size = std::max(max_component_size, component_size);\n visited(i) = 1;\n }\n });\n return max_component_size;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t j, int &component_size) {\n if (visited(j) == 0 && A(i, j) == 1) {\n component_size++;\n visited(j) = 1;\n }\n });\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n int max_component = 0;\n Kokkos::parallel_reduce(\n \"Largest Component\",\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_component) {\n int component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n int current = q.front();\n q.pop();\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A(current, j) && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n },\n max_component);\n\n return max_component;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int 
max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited(i) == 0) {\n component_size += 1;\n visited(i) = 1;\n for (size_t j = 0; j < N; ++j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n }\n }\n });\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::TeamPolicy;\n using Kokkos::Work;\n\n // Initialize the visited array\n Kokkos::View visited(\"visited\", N, N);\n parallel_for(TeamPolicy(Kokkos::DefaultExecutionSpace(), N), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) { visited(i, j) = false; }\n });\n\n // Perform DFS on each unvisited node\n int max_component = 0;\n parallel_for(TeamPolicy(Kokkos::DefaultExecutionSpace(), N), KOKKOS_LAMBDA(const int i) {\n if (!visited(i, i)) {\n int component_size = 0;\n dfs(i, i, visited, A, component_size);\n max_component = std::max(max_component, component_size);\n }\n });\n\n return max_component;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::View componentSize(\"componentSize\", N);\n Kokkos::View componentId(\"componentId\", N);\n Kokkos::View queue(\"queue\", N);\n\n int maxComponentSize = 0;\n int componentCount = 0;\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (visited(i) == 0) {\n int start = 0;\n int end = 0;\n queue(end++) = i;\n visited(i) = 1;\n componentId(i) = componentCount;\n while (start != end) {\n int current = queue(start++);\n componentSize(componentId(current))++;\n for (int j = 0; j < N; j++) {\n if (A(current, j) == 1 && visited(j) == 0) {\n queue(end++) = j;\n visited(j) = 1;\n componentId(j) = componentCount;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize(componentCount));\n componentCount++;\n }\n });\n\n return maxComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n int max_component_size = 0;\n\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(Kokkos::View &A, size_t i, Kokkos::View &visited, int &component_size) {\n visited[i] = true;\n component_size++;\n\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(i, j) && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n\n // If the vertex has not been visited, perform a depth-first search to find the size of its component\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit in the stack\n while (!stack.empty()) {\n // Pop the next vertex from the stack\n size_t vertex = stack.top();\n stack.pop();\n\n // If the vertex has not been visited, mark it as visited and add it to the component\n if (!visited[vertex]) {\n visited[vertex] = true;\n componentSize++;\n\n // Push all of the vertex's neighbors onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the largest component size if the current component is larger\n if (componentSize > largestComponentSize) {\n largestComponentSize = componentSize;\n }\n\n // Update the component size array with the size of the current component\n componentSize[i] = componentSize;\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n int max_component_size = 0;\n int component_size = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n #pragma omp parallel for reduction(max:max_component_size)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n delete[] visited;\n return max_component_size;\n}\n", + "\n\n // Create a vector to store the visited nodes.\n std::vector visited(N, false);\n\n // Create a vector to store the component sizes.\n std::vector componentSizes(N, 0);\n\n // Create a parallel region.\n #pragma omp parallel\n {\n // Loop over the nodes.\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, perform a depth-first search to find all the nodes in its component.\n if (! visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n while (! stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (! 
visited[node]) {\n componentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[node * N + j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n componentSizes[i] = componentSize;\n }\n }\n }\n\n // Find the largest component size.\n int largestComponentSize = *std::max_element(componentSizes.begin(), componentSizes.end());\n\n // Return the largest component size.\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, 0);\n int largest_component_size = 0;\n #pragma omp parallel\n {\n int local_largest_component_size = 0;\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n ++component_size;\n visited[vertex] = 1;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n local_largest_component_size = std::max(local_largest_component_size, component_size);\n }\n }\n #pragma omp critical\n largest_component_size = std::max(largest_component_size, local_largest_component_size);\n }\n return largest_component_size;\n}\n", + "\n\n int* visited = new int[N];\n int* componentSize = new int[N];\n int maxComponentSize = 0;\n int componentCount = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = 0;\n dfs(A, i, visited, componentSize, size);\n componentCount++;\n maxComponentSize = std::max(maxComponentSize, size);\n }\n }\n delete[] visited;\n delete[] componentSize;\n return maxComponentSize;\n}\n", + "\n\n int max_size = 0;\n int max_component = -1;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = dfs(A, i, visited);\n if (size > max_size) {\n max_size = size;\n max_component = i;\n }\n }\n }\n delete[] visited;\n return max_size;\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n", + "\n\n int max_component = 0;\n int visited[N];\n memset(visited, 0, sizeof(visited));\n\n #pragma omp parallel for reduction(max:max_component)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n 
component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n\n return max_component;\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n", + "\n\n int N = A.size();\n std::vector component(N, -1);\n int maxComponent = 0;\n int currentComponent = 0;\n #pragma omp parallel for reduction(max:maxComponent) reduction(+:currentComponent)\n for (int i = 0; i < N; i++) {\n if (component[i] == -1) {\n currentComponent = 0;\n dfs(A, i, component, currentComponent);\n maxComponent = std::max(maxComponent, currentComponent);\n }\n }\n return maxComponent;\n}\n\n", + "\n\n int max_component_size = 0;\n int component_size = 0;\n int visited[N];\n memset(visited, 0, sizeof(visited));\n\n #pragma omp parallel for reduction(max: max_component_size)\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0) {\n component_size = 0;\n dfs(i, A, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(int i, std::vector const& A, int visited[], int& component_size) {\n visited[i] = 1;\n component_size++;\n for (int j = 0; j < A.size(); j++) {\n if (A[i][j] == 1 && visited[j] == 0) {\n dfs(j, A, visited, component_size);\n }\n }\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n int max_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n visited[i] = true;\n component_size++;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a 
depth-first search to find the size of the component containing the vertex\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // While the stack is not empty, pop a vertex from the stack and visit all its unvisited neighbors\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n componentSize++;\n\n // Iterate over all neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is unvisited and there is an edge between the vertex and the neighbor, push the neighbor onto the stack and mark it as visited\n if (!visited[j] && A[vertex][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Update the largest component size if the current component size is larger\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Create a vector to store the size of each component.\n std::vector componentSize(N, 0);\n\n // Create a vector to store the component ID for each vertex.\n std::vector componentID(N, -1);\n\n // Initialize the component ID for each vertex.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n componentID[i] = i;\n }\n\n // Iterate over the adjacency matrix.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n // If there is an edge between vertex i and vertex j, then they are in the same component.\n if (A[i][j] == 1) {\n // Find the component ID for vertex i.\n int iComponentID = componentID[i];\n\n // Find the component ID for vertex j.\n int jComponentID = componentID[j];\n\n // If the component IDs are different, then merge the two components.\n if (iComponentID != jComponentID) {\n // Find the smaller component ID.\n int smallerComponentID = std::min(iComponentID, jComponentID);\n\n // Find the larger component ID.\n int largerComponentID = std::max(iComponentID, jComponentID);\n\n // Update the component ID for each vertex in the smaller component.\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n if (componentID[k] == smallerComponentID) {\n componentID[k] = largerComponentID;\n }\n }\n\n // Update the size of the larger component.\n componentSize[largerComponentID] += componentSize[smallerComponentID];\n }\n }\n }\n }\n\n // Find the largest component.\n int largestComponentSize = 0;\n for (size_t i = 0; i < N; i++) {\n largestComponentSize = std::max(largestComponentSize, componentSize[i]);\n }\n\n // Return the size of the largest component.\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component containing the vertex\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // While there are vertices in the stack, pop a vertex and visit all its unvisited neighbors\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n componentSize++;\n\n // Iterate over all neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is unvisited and 
there is an edge between the vertex and the neighbor, push the neighbor onto the stack and mark it as visited\n if (!visited[j] && A[vertex][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Update the largest component size if the current component size is larger\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n int nComponents = 0;\n std::vector visited(N, 0);\n std::vector componentSize(N, 0);\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int size = dfs(A, i, visited, componentSize);\n nComponents++;\n if (size > componentSize[0]) {\n componentSize[0] = size;\n }\n }\n }\n return componentSize[0];\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "serial", + "prompt": "/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector parent(N);\n std::vector size(N);\n for (size_t i = 0; i < N; ++i) {\n parent[i] = i;\n size[i] = 1;\n }\n\n auto find = [&](int x) {\n if (parent[x] != x) {\n parent[x] = find(parent[x]);\n }\n return parent[x];\n };\n\n auto union_ = [&](int x, int y) {\n int rootX = find(x);\n int rootY = find(y);\n if (rootX != rootY) {\n if (size[rootX] < size[rootY]) {\n parent[rootX] = rootY;\n size[rootY] += size[rootX];\n } else {\n parent[rootY] = rootX;\n size[rootX] += size[rootY];\n }\n }\n };\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n if (A[i][j] == 1) {\n union_(i, j);\n }\n }\n }\n\n int maxSize = 0;\n for (size_t i = 0; i < N; ++i) {\n maxSize = std::max(maxSize, size[find(i)]);\n }\n\n return maxSize;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n std::vector parent(N);\n std::vector size(N);\n for (size_t i = 0; i < N; i++) {\n parent[i] = i;\n size[i] = 1;\n }\n\n auto find = [&](int x) {\n if (parent[x] != x) {\n parent[x] = find(parent[x]);\n }\n return parent[x];\n };\n\n auto union_ = [&](int a, int b) {\n int 
rootA = find(a);\n int rootB = find(b);\n if (rootA != rootB) {\n if (size[rootA] > size[rootB]) {\n parent[rootB] = rootA;\n size[rootA] += size[rootB];\n } else {\n parent[rootA] = rootB;\n size[rootB] += size[rootA];\n }\n }\n };\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i][j] == 1) {\n union_(i, j);\n }\n }\n }\n\n int maxSize = 0;\n for (size_t i = 0; i < N; i++) {\n maxSize = std::max(maxSize, size[find(i)]);\n }\n\n return maxSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int max_component_size = 0;\n\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n\n visited[i] = true;\n component_size++;\n\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; i++) {\n int size = 0;\n std::queue q;\n if (visited[i] == 0) {\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 0) {\n visited[v] = 1;\n size++;\n for (size_t j = 0; j < N; j++) {\n if (A[v][j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int max_component_size = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component containing it\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\n// Perform a DFS to find the size of the component containing vertex i\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n // Mark the vertex as visited\n visited[i] = true;\n component_size++;\n\n // Iterate over all adjacent vertices\n for (size_t j = 0; j < A.size(); j++) {\n // If there is an edge between vertex i and vertex j and vertex j has not been visited, perform a DFS on vertex j\n if (A[i][j] == 1 && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = dfs(i, A, visited);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n\nint dfs(size_t i, std::vector const& A, std::vector& visited) {\n if (visited[i] != 0) {\n return 0;\n }\n visited[i] = 1;\n int component_size = 1;\n for (size_t j = 0; j < A.size(); 
++j) {\n if (A[i][j] == 1) {\n component_size += dfs(j, A, visited);\n }\n }\n return component_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int largest = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int size = dfs(A, i, visited);\n largest = std::max(largest, size);\n }\n }\n\n return largest;\n}\n", + "\n\n // Create a vector to store the size of each component.\n std::vector componentSize(N, 0);\n\n // Create a vector to store the component ID for each vertex.\n std::vector componentID(N, -1);\n\n // Initialize the component ID for each vertex.\n for (size_t i = 0; i < N; ++i) {\n componentID[i] = i;\n }\n\n // Iterate over each vertex.\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, perform a DFS to find all vertices in the same component.\n if (componentID[i] == i) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit in the stack, perform a DFS.\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n\n // If the vertex has not been visited, mark it as visited and add it to the stack.\n if (componentID[vertex] == i) {\n componentSize++;\n componentID[vertex] = i;\n\n // Iterate over the neighbors of the vertex.\n for (size_t j = 0; j < N; ++j) {\n // If the neighbor is connected to the vertex and has not been visited, add it to the stack.\n if (A[i][j] == 1 && componentID[j] == i) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the size of the component.\n componentSize[i] = componentSize;\n }\n }\n\n // Return the size of the largest component.\n return *std::max_element(componentSize.begin(), componentSize.end());\n}\n", + "\n\n std::vector visited(N, 0);\n int largest = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = 0;\n dfs(A, visited, i, size);\n largest = std::max(largest, size);\n }\n }\n return largest;\n}\n\nvoid dfs(std::vector const& A, std::vector& visited, size_t i, int& size) {\n if (visited[i] == 1) {\n return;\n }\n visited[i] = 1;\n size++;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] == 1) {\n dfs(A, visited, j, size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n component_size++;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n component_size++;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n dfs(A, i, visited, size);\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited,\n int& size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i * A.size() + j] == 1) {\n dfs(A, j, visited, size);\n }\n }\n}\n", + "\n\n // Perform DFS on each vertex to find all connected vertices\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if 
(!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++component_size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) {\n continue;\n }\n visited[u] = 1;\n size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (int i = 0; i < N; i++) {\n int size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 1) continue;\n visited[v] = 1;\n size++;\n for (int j = 0; j < N; j++) {\n if (A[v][j] == 1 && visited[j] == 0) q.push(j);\n }\n }\n max_size = std::max(max_size, size);\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n if (visited[current] == 0) {\n visited[current] = 1;\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n ++size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++component_size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "hip", + "prompt": "/* Compute the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n Store the result in 
largestComponentSize.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void largestComponent(const int *A, size_t N, int *largestComponentSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n int count = 0;\n if (tid == 0) {\n count = 1;\n for (int i = 1; i < N; i++) {\n if (A[bid * N + i] == 1) {\n count++;\n }\n }\n }\n __syncthreads();\n if (tid == 0) {\n *largestComponentSize = count;\n }\n}\n", + "\n\n\tint tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint local_max = 0;\n\tint global_max = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[tid_y] == 0) {\n\t\tint count = 0;\n\t\tvisited[tid_y] = 1;\n\t\tcount++;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] != 0 && visited[i] == 0) {\n\t\t\t\tcount += dfs(A, N, i, visited);\n\t\t\t}\n\t\t}\n\t\tlocal_max = count;\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (local_max > global_max) {\n\t\t\t\tglobal_max = local_max;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = global_max;\n\t}\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tidy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tif (tidx >= N || tidy >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[tidx] == 0) {\n\t\tcount++;\n\t\tvisited[tidx] = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tidx * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcount += bfs(A, N, i, visited);\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tidx == 0 && tidy == 0) {\n\t\t*largestComponentSize = count;\n\t}\n}\n", + "\n\n\tint tidx = blockIdx.x + blockIdx.y * blockDim.x;\n\tint tidx_x = tidx % N;\n\tint tidx_y = tidx / N;\n\tint componentSize = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tidx * N + i] == 1) {\n\t\t\tcomponentSize++;\n\t\t}\n\t}\n\tatomicAdd(largestComponentSize, componentSize);\n}\n", + "\n\n int localComponentSize = 0;\n int globalComponentSize = 0;\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int start = tid;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (int i = start; i < N; i += stride) {\n if (A[i * N + i] == 0) {\n continue;\n }\n localComponentSize++;\n for (int j = i + 1; j < N; j++) {\n if (A[i * N + j] == 1) {\n localComponentSize++;\n }\n }\n }\n hipAtomicMax(&globalComponentSize, &localComponentSize);\n *largestComponentSize = globalComponentSize;\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tidy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tint size = 0;\n\tif (tidx < N && tidy < N) {\n\t\tif (A[tidx * N + tidy] == 1) {\n\t\t\tsize = 1;\n\t\t\tint stack[N];\n\t\t\tint top = 0;\n\t\t\tstack[top++] = tidx;\n\t\t\twhile (top > 0) {\n\t\t\t\tint v = stack[--top];\n\t\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\t\tif (A[v * N + i] == 1 && i != v) {\n\t\t\t\t\t\tif (i == tidy) {\n\t\t\t\t\t\t\tsize++;\n\t\t\t\t\t\t} else {\n\t\t\t\t\t\t\tstack[top++] = 
i;\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tint max = 0;\n\thipReduceBlockMax(&max, &size, 1);\n\tif (tidx == 0 && tidy == 0) {\n\t\t*largestComponentSize = max;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint blockId = hipBlockIdx_x;\n\tint numBlocks = hipGridDim_x;\n\tint numThreads = hipBlockDim_x * hipGridDim_x;\n\tint visited[N];\n\tint componentSize[N];\n\tint maxComponentSize = 0;\n\tint maxComponentId = -1;\n\tint componentId = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t\tcomponentSize[i] = 0;\n\t}\n\twhile (componentId < N) {\n\t\tint startVertex = -1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (!visited[i]) {\n\t\t\t\tstartVertex = i;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tif (startVertex == -1) {\n\t\t\tbreak;\n\t\t}\n\t\tint queue[N];\n\t\tint head = 0;\n\t\tint tail = 0;\n\t\tqueue[tail++] = startVertex;\n\t\tvisited[startVertex] = 1;\n\t\tcomponentSize[componentId] = 1;\n\t\twhile (head != tail) {\n\t\t\tint vertex = queue[head++];\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tif (A[vertex * N + i] && !visited[i]) {\n\t\t\t\t\tqueue[tail++] = i;\n\t\t\t\t\tvisited[i] = 1;\n\t\t\t\t\tcomponentSize[componentId]++;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tif (componentSize[componentId] > maxComponentSize) {\n\t\t\tmaxComponentSize = componentSize[componentId];\n\t\t\tmaxComponentId = componentId;\n\t\t}\n\t\tcomponentId++;\n\t}\n\t*largestComponentSize = maxComponentSize;\n}\n", + "\n\n int local_max = 0;\n int global_max = 0;\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (A[i * N + j] == 1) {\n local_max = 1;\n for (int k = 0; k < N; k++) {\n if (A[i * N + k] == 1) {\n local_max++;\n }\n if (A[k * N + j] == 1) {\n local_max++;\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx.x == 0 && threadIdx.y == 0) {\n for (int k = 0; k < N * N; k++) {\n if (local_max > global_max) {\n global_max = local_max;\n }\n local_max = 0;\n }\n *largestComponentSize = global_max;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint numThreads = hipGridDim_x * hipBlockDim_x;\n\tint visited[N];\n\tint componentSize = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[tid] == 0) {\n\t\tvisited[tid] = 1;\n\t\tcomponentSize = 1;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tif (A[tid * N + j] == 1 && visited[j] == 0) {\n\t\t\t\tcomponentSize += largestComponent(A, N, &visited[j]);\n\t\t\t}\n\t\t}\n\t}\n\tint maxComponentSize = 0;\n\thipMemcpy(&maxComponentSize, largestComponentSize, sizeof(int), hipMemcpyDeviceToHost);\n\tif (componentSize > maxComponentSize) {\n\t\t*largestComponentSize = componentSize;\n\t}\n\treturn;\n}\n", + "\n\n int *visited = (int *)malloc(N * N * sizeof(int));\n int *componentSize = (int *)malloc(N * N * sizeof(int));\n int maxComponentSize = 0;\n int componentId = 0;\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n visited[i * N + j] = 0;\n componentSize[i * N + j] = 0;\n }\n }\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (visited[i * N + j] == 0 && A[i * N + j] == 1) {\n componentId++;\n int currentComponentSize = 0;\n __syncthreads();\n dfs(A, i, j, N, visited, componentSize, ¤tComponentSize, componentId);\n __syncthreads();\n maxComponentSize = max(maxComponentSize, currentComponentSize);\n }\n }\n }\n\n *largestComponentSize = maxComponentSize;\n free(visited);\n free(componentSize);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + 
hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = max(*largestComponentSize, count);\n\t}\n}\n", + "\n\n int myRow = blockIdx.x * blockDim.x + threadIdx.x;\n int myCol = blockIdx.y * blockDim.y + threadIdx.y;\n int myComponentSize = 0;\n if (myRow < N && myCol < N) {\n if (A[myRow * N + myCol] == 1) {\n myComponentSize = 1;\n for (int i = 0; i < N; i++) {\n if (A[myRow * N + i] == 1) {\n myComponentSize++;\n }\n if (A[i * N + myCol] == 1) {\n myComponentSize++;\n }\n }\n }\n }\n atomicMax(largestComponentSize, myComponentSize);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == 0) {\n *largestComponentSize = 0;\n }\n __shared__ int shared_component_size;\n shared_component_size = 0;\n bool visited = false;\n int component_size = 0;\n while (!visited) {\n visited = true;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (A[bid * N + i] == 1) {\n visited = false;\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n A[i * N + j] = 0;\n A[j * N + i] = 0;\n }\n }\n }\n }\n __syncthreads();\n }\n atomicAdd(largestComponentSize, component_size);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i, j, k, count = 0;\n\tif (tid < N) {\n\t\tbool visited[N] = {false};\n\t\tvisited[tid] = true;\n\t\tfor (i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1 && !visited[i]) {\n\t\t\t\tcount++;\n\t\t\t\tfor (j = 0; j < N; j++) {\n\t\t\t\t\tif (A[i * N + j] == 1) {\n\t\t\t\t\t\tfor (k = 0; k < N; k++) {\n\t\t\t\t\t\t\tif (A[j * N + k] == 1 && !visited[k]) {\n\t\t\t\t\t\t\t\tcount++;\n\t\t\t\t\t\t\t\tvisited[k] = true;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == 0) {\n *largestComponentSize = 0;\n }\n __syncthreads();\n int localComponentSize = 0;\n for (int i = 0; i < N; i++) {\n if (A[tid + i * N] == 1) {\n localComponentSize++;\n }\n }\n __syncthreads();\n if (tid == 0) {\n if (localComponentSize > *largestComponentSize) {\n *largestComponentSize = localComponentSize;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int size = 0;\n if (tid < N && tid_y < N) {\n if (A[tid * N + tid_y] == 1) {\n size = 1;\n int *visited = new int[N];\n for (int i = 0; i < N; i++) {\n visited[i] = 0;\n }\n visited[tid] = 1;\n int *stack = new int[N];\n stack[0] = tid;\n while (stack[0] != -1) {\n int v = stack[0];\n stack[0] = -1;\n for (int i = 0; i < N; i++) {\n if (A[v * N + i] == 1 && visited[i] == 0) {\n visited[i] = 1;\n size++;\n stack[0] = i;\n }\n }\n }\n delete[] visited;\n delete[] stack;\n }\n }\n atomicAdd(largestComponentSize, size);\n}\n", + "\n\n\tint myComponentSize = 0;\n\tint 
myId = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (A[N * myId + myId] == 1) {\n\t\tmyComponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[N * myId + i] == 1 && i != myId) {\n\t\t\t\tmyComponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(&*largestComponentSize, myComponentSize);\n}\n", + "(", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_x;\n\tint componentSize = 0;\n\tif (tid == 0) {\n\t\tcomponentSize = 1;\n\t\tlargestComponentSize[bid] = 0;\n\t}\n\t__syncthreads();\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tif (A[bid * N + i] == 1) {\n\t\t\tatomicAdd(&componentSize, 1);\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tif (componentSize > largestComponentSize[bid]) {\n\t\t\tlargestComponentSize[bid] = componentSize;\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Perform a breadth-first search (BFS) from each vertex to find the connected components.\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the max_component_size across all ranks to find the global maximum.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "_t.", + "\n\n // Perform a breadth-first search from each vertex to find the size of each component.\n std::vector visited(N, false);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++componentSize;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Allreduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank 
computes the size of its own component\n std::vector visited(N, 0);\n std::queue q;\n int componentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n q.push(i);\n visited[i] = 1;\n componentSize = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize++;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest one\n int globalComponentSize;\n MPI_Reduce(&componentSize, &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(my_rank);\n visited[my_rank] = true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n local_component_size++;\n for (size_t v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return global_component_size;\n }\n return -1;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector component_size(num_ranks, 0);\n\n // Perform a DFS from each rank\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int local_component_size = 0;\n std::stack s;\n s.push(i);\n visited[i] = true;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n local_component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n visited[j] = true;\n }\n }\n }\n component_size[my_rank] = std::max(component_size[my_rank], local_component_size);\n }\n }\n\n // Reduce the component size array to find the largest component\n int global_component_size = 0;\n MPI_Reduce(&component_size[0], &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (my_rank == 0) {\n return 
global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_max = 0;\n int global_max = 0;\n\n // Perform a DFS on the graph to find the size of the largest component\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n local_max = std::max(local_max, component_size);\n }\n }\n\n // Reduce the local maximum values to find the global maximum\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = true;\n my_component_size = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n ++my_component_size;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&my_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int max_component = 0;\n int my_component = 0;\n int* visited = new int[N];\n for (size_t i = 0; i < N; i++) {\n visited[i] = 0;\n }\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n my_component = 0;\n dfs(i, A, N, visited, my_component);\n max_component = std::max(max_component, my_component);\n }\n }\n delete[] visited;\n int global_max;\n MPI_Allreduce(&max_component, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n // Perform a breadth-first search from each vertex to find the connected components.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 0) {\n visited[v] = 1;\n componentSize++;\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Allreduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(rank);\n visited[rank] = true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n local_component_size++;\n for (int v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n\n // Reduce the local component sizes 
to find the largest component\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of vertices in its component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n local_component_size = std::max(local_component_size, dfs(A, i, visited));\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n\n// Depth-first search to compute the number of vertices in a component\nint dfs(std::vector const& A, size_t i, std::vector& visited) {\n if (visited[i]) {\n return 0;\n }\n\n visited[i] = true;\n int component_size = 1;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n component_size += dfs(A, j, visited);\n }\n }\n\n return component_size;\n}\n", + "_t.", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n int local_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_component_size = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n ++local_component_size;\n }\n }\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n int local_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_component_size = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n local_component_size++;\n }\n }\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its component\n std::vector visited(N, 0);\n std::queue q;\n int componentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n q.push(i);\n visited[i] = 1;\n componentSize = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize++;\n 
}\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest one\n int globalComponentSize;\n MPI_Reduce(&componentSize, &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, false);\n int largest_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t v = stack.top();\n stack.pop();\n if (!visited[v]) {\n ++component_size;\n visited[v] = true;\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n largest_component_size = std::max(largest_component_size, component_size);\n }\n }\n\n // Reduce the largest component size across all ranks.\n int global_largest_component_size;\n MPI_Allreduce(&largest_component_size, &global_largest_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the largest component size\n int largestComponentSize = 0;\n\n // Loop over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "cuda", + "prompt": "/* Compute the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n Store the result in largestComponentSize.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void largestComponent(const int *A, size_t N, int *largestComponentSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myComponentSize = 0;\n int myId = blockIdx.x * blockDim.x + threadIdx.x;\n if (A[myId * N + myId] == 0) {\n *largestComponentSize = 0;\n return;\n }\n int *visited = new int[N];\n for (int i = 0; i < N; i++) { visited[i] = 0; }\n visited[myId] = 1;\n myComponentSize++;\n for (int i = 0; i < N; i++) {\n if (A[myId * N + i] == 1 && visited[i] == 0) {\n myComponentSize += dfs(A, N, visited, i);\n }\n }\n *largestComponentSize = max(*largestComponentSize, myComponentSize);\n}\n", + "\n\n int localComponentSize = 0;\n int globalComponentSize = 0;\n int threadIdx = threadIdx.x + blockIdx.x * blockDim.x;\n int threadIdy = threadIdx.y + blockIdx.y * blockDim.y;\n if (threadIdx < N && threadIdy < N) {\n if (A[threadIdx * N + threadIdy] == 1) {\n localComponentSize++;\n for (int i = 0; i < N; i++) {\n if (A[threadIdx * N + i] == 1) {\n localComponentSize++;\n }\n if (A[i * N + threadIdy] == 1) {\n localComponentSize++;\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx == 0 && threadIdy == 0) {\n globalComponentSize = localComponentSize;\n }\n for (int i = 1; i < N; i++) {\n if (localComponentSize > globalComponentSize) {\n globalComponentSize = localComponentSize;\n }\n localComponentSize = 0;\n if (threadIdx < N && threadIdy + i < N) {\n if (A[threadIdx * N + threadIdy + i] == 1) {\n localComponentSize++;\n for (int j = 0; j < N; j++) {\n if (A[threadIdx * N + j] == 1) {\n localComponentSize++;\n }\n if (A[(threadIdy + i) * N + j] == 1) {\n localComponentSize++;\n }\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx == 0 && threadIdy == 0) {\n if (localComponentSize > globalComponentSize) {\n globalComponentSize = localComponentSize;\n }\n *largestComponentSize = globalComponentSize;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int tid_row = tid / N;\n int tid_col = tid % N;\n int local_max = 0;\n int global_max = 0;\n\n // Count the number of adjacent vertices\n for (int i = 0; i < N; i++) {\n if (A[tid_row * N + i] == 1) {\n local_max++;\n }\n }\n\n // Find the maximum number of adjacent vertices across all threads\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (local_max > global_max) {\n global_max = local_max;\n }\n }\n\n *largestComponentSize = global_max;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tint count = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n // Each thread computes the size of its component\n int myComponentSize = 0;\n for (int i = 0; i < N; i++) {\n if (A[i * N + blockIdx.x] == 1) {\n myComponentSize++;\n }\n }\n\n // Reduce the component sizes across all threads\n __shared__ int sharedComponentSize[32];\n sharedComponentSize[threadIdx.x] = myComponentSize;\n __syncthreads();\n for (int i = 1; i < 32; i *= 2) {\n if (threadIdx.x % (2 * i) == 0) {\n sharedComponentSize[threadIdx.x] += sharedComponentSize[threadIdx.x + i];\n }\n __syncthreads();\n }\n\n // Store the largest component size in the output variable\n if (threadIdx.x == 0) {\n *largestComponentSize = sharedComponentSize[0];\n 
}\n}\n", + "\n\n\t// Initialize the largest component size to 0\n\t*largestComponentSize = 0;\n\n\t// Iterate over each vertex\n\tfor (int i = 0; i < N; i++) {\n\t\t// If the vertex is not visited, perform a depth-first search to find the size of the component\n\t\tif (A[i * N + i] == 0) {\n\t\t\tint componentSize = 0;\n\t\t\tdepthFirstSearch(A, N, i, &componentSize);\n\n\t\t\t// Update the largest component size if necessary\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\n\tint count = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcount = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*largestComponentSize = max(*largestComponentSize, count);\n\t}\n}\n", + "\n\n // Each thread checks if it is in the largest component.\n int myComponentSize = 0;\n for (int i = blockIdx.x; i < N; i += blockDim.x) {\n for (int j = blockIdx.y; j < N; j += blockDim.y) {\n if (A[i * N + j] == 1) {\n myComponentSize++;\n }\n }\n }\n\n // Each thread writes its component size to shared memory.\n __shared__ int sharedComponentSize[blockDim.x * blockDim.y];\n sharedComponentSize[threadIdx.x * blockDim.y + threadIdx.y] = myComponentSize;\n\n // Each thread block finds the maximum component size in shared memory.\n __syncthreads();\n int maxComponentSize = 0;\n for (int i = 0; i < blockDim.x * blockDim.y; i++) {\n maxComponentSize = max(maxComponentSize, sharedComponentSize[i]);\n }\n\n // The thread with the maximum component size writes it to global memory.\n if (threadIdx.x == 0 && threadIdx.y == 0) {\n *largestComponentSize = maxComponentSize;\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 0) {\n\t\treturn;\n\t}\n\tint componentSize = 0;\n\tint visited[N];\n\tfor (int k = 0; k < N; k++) {\n\t\tvisited[k] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[i] == 0) {\n\t\tcomponentSize = dfs(A, N, i, visited);\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tif (componentSize > *largestComponentSize) {\n\t\t\t*largestComponentSize = componentSize;\n\t\t}\n\t}\n}\n", + "\n\n int myComponentSize = 0;\n int myComponentId = threadIdx.x + blockIdx.x * blockDim.x;\n if (myComponentId < N) {\n bool visited[N];\n for (int i = 0; i < N; i++) { visited[i] = false; }\n visited[myComponentId] = true;\n for (int i = 0; i < N; i++) {\n if (A[myComponentId * N + i] && !visited[i]) {\n myComponentSize += 1;\n dfs(A, N, visited, i);\n }\n }\n }\n __syncthreads();\n int maxComponentSize = 0;\n for (int i = 0; i < N; i++) {\n maxComponentSize = max(maxComponentSize, myComponentSize);\n }\n *largestComponentSize = maxComponentSize;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\n\tint row = tid / N;\n\tint col = tid % N;\n\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\n\tint componentSize = 1;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\n\tvisited[row] = 1;\n\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] == 1 && visited[i] == 0) {\n\t\t\tcomponentSize += dfs(A, N, i, visited);\n\t\t}\n\t}\n\n\tif (componentSize > *largestComponentSize) {\n\t\t*largestComponentSize 
= componentSize;\n\t}\n}\n", + "\n\n\tint myRow = blockIdx.x * blockDim.x + threadIdx.x;\n\tint myCol = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (myRow < N && myCol < N) {\n\t\tif (A[myRow * N + myCol] == 1) {\n\t\t\tint componentSize = 0;\n\t\t\tint visited[N];\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tvisited[i] = 0;\n\t\t\t}\n\t\t\tdfs(myRow, myCol, A, N, visited, &componentSize);\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint componentSize = 0;\n\tint maxComponentSize = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tfor (int i = 0; i < N; i++) {\n\t\tif (visited[i] == 0) {\n\t\t\tcomponentSize = 0;\n\t\t\tdfs(A, i, visited, &componentSize);\n\t\t\tmaxComponentSize = max(maxComponentSize, componentSize);\n\t\t}\n\t}\n\t*largestComponentSize = maxComponentSize;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid_row == tid_col) {\n\t\tint componentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_row * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = max(*largestComponentSize, componentSize);\n\t}\n}\n", + "\n\n\tint myComponentSize = 0;\n\tint myId = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (A[N * myId + myId] == 1) {\n\t\tmyComponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[N * myId + i] == 1 && i != myId) {\n\t\t\t\tmyComponentSize += 1;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, myComponentSize);\n}\n", + "\n\n\tint threadIdx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint threadIdy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (threadIdx < N && threadIdy < N) {\n\t\tif (A[threadIdx * N + threadIdy] == 1) {\n\t\t\t// Found a 1 in the adjacency matrix.\n\t\t\t// Perform a breadth-first search to find the size of the component.\n\t\t\tint componentSize = 0;\n\t\t\tstd::queue q;\n\t\t\tq.push(threadIdx);\n\t\t\twhile (!q.empty()) {\n\t\t\t\tint v = q.front();\n\t\t\t\tq.pop();\n\t\t\t\tcomponentSize++;\n\t\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\t\tif (A[v * N + i] == 1 && i != v) {\n\t\t\t\t\t\tq.push(i);\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t\t// Update the largest component size if necessary.\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, componentSize);\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, componentSize);\n}\n", + "\n\n\tint localLargestComponentSize = 0;\n\tint globalLargestComponentSize = 0;\n\tint myRow = blockIdx.y * blockDim.y + threadIdx.y;\n\tint myCol = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (myRow < N && 
myCol < N) {\n\t\tif (A[myRow * N + myCol] == 1) {\n\t\t\tlocalLargestComponentSize++;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (threadIdx.x == 0 && threadIdx.y == 0) {\n\t\tfor (int i = 1; i < gridDim.x * gridDim.y; i++) {\n\t\t\tif (localLargestComponentSize < globalLargestComponentSize) {\n\t\t\t\tlocalLargestComponentSize = globalLargestComponentSize;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = localLargestComponentSize;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\t*largestComponentSize = max(*largestComponentSize, componentSize);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce to find the largest component size\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0769014772, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.079466995, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0795886372, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0794240824, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0795129228, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0794596408, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.079468167, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.0794585278, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0796931103, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0796646225, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform DFS on each rank\n std::vector visited(N, false);\n int max_component_size = 0;\n #pragma omp parallel\n {\n int local_max_component_size = 0;\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n std::stack s;\n s.push(i);\n int component_size = 0;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n if (!visited[u]) {\n visited[u] = true;\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n }\n }\n }\n }\n local_max_component_size = std::max(local_max_component_size, component_size);\n }\n }\n #pragma omp critical\n {\n max_component_size = std::max(max_component_size, local_max_component_size);\n }\n }\n\n // Gather the maximum component size from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int recv_max_component_size;\n MPI_Recv(&recv_max_component_size, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n max_component_size = std::max(max_component_size, recv_max_component_size);\n }\n } else {\n MPI_Send(&max_component_size, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n\n return max_component_size;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0783555928, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.108703945, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int largest_component_size;\n MPI_Reduce(&my_component_size, &largest_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return largest_component_size;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.076913079, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0794125094, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0792983296, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793188196, + "num_procs": 1, + "num_threads": 4 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.079280676, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793267788, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0794054282, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0794289286, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0794179273, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0794478942, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find the connected components.\n std::vector visited(N, 0);\n std::vector component_size(N, 0);\n int max_component_size = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n int component_size = 0;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n component_size[i] = component_size;\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n MPI_Allreduce(&max_component_size, &max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return max_component_size;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0769041122, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793313532, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0792918128, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793021346, + "num_procs": 1, + "num_threads": 4 + }, + 
{ + "did_run": true, + "is_valid": true, + "runtime": 0.079357466, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793236008, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.079395866, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793221538, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0792950534, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793116281, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int largest_component_size;\n MPI_Reduce(&my_component_size, &largest_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return largest_component_size;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0769295198, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793379696, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0799390652, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793572054, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793616524, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0794013786, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0792934002, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0792917608, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.079353644, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07946250715, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the size of its own component\n int local_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_size = std::max(local_size, size);\n }\n }\n\n // Reduce the local sizes to find the 
global maximum\n int global_size;\n MPI_Reduce(&local_size, &global_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_size;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0769296962, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0792963618, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0795118862, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0794593014, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793480284, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07932326, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0794007194, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0794447958, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793552208, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07935593855, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n // Perform a breadth-first search from each vertex to find all the vertices in the same component.\n int max_component_size = 0;\n #pragma omp parallel for reduction(max : max_component_size)\n for (size_t i = 0; i < N; ++i) {\n if (A[i][i] == 0) {\n continue;\n }\n\n // Initialize the queue with the current vertex.\n std::queue queue;\n queue.push(i);\n\n // Mark the current vertex as visited.\n A[i][i] = 0;\n\n // Perform a breadth-first search to find all the vertices in the same component.\n int component_size = 0;\n while (!queue.empty()) {\n size_t current_vertex = queue.front();\n queue.pop();\n component_size++;\n\n // Add all the unvisited neighbors of the current vertex to the queue.\n for (size_t j = 0; j < N; ++j) {\n if (A[current_vertex][j] == 1 && A[j][j] != 0) {\n queue.push(j);\n A[j][j] = 0;\n }\n }\n }\n\n max_component_size = std::max(max_component_size, component_size);\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = 
q.front();\n q.pop();\n componentSize++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Reduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (rank == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.079558545, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0843587658, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(size, 0);\n\n // Perform a DFS on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int localComponentSize = 0;\n std::stack s;\n s.push(i);\n visited[i] = true;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n localComponentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n visited[j] = true;\n }\n }\n }\n componentSize[rank] = std::max(componentSize[rank], localComponentSize);\n }\n }\n\n // Reduce the component size array to find the largest component\n int globalComponentSize;\n MPI_Reduce(&componentSize[0], &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0819678248, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0921138572, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform a breadth-first search (BFS) from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n componentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Reduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (rank == 0) {\n return 
globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0795935792, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0851532442, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Perform a breadth-first search to find the largest component.\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the max_component_size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the result on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0783925822, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0747237784, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0747244594, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0746707228, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0746193086, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.074726455, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0746394282, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.074688464, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0747377454, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0748639789, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the number of vertices in its largest component\n int local_max = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int count = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n count++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_max = std::max(local_max, count);\n }\n }\n\n // Reduce the local maxima to find the global maximum\n int global_max;\n MPI_Reduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the global maximum on rank 0\n if (my_rank == 0) {\n return global_max;\n } else {\n 
return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0768857078, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793655794, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793590612, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0795456356, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0796049362, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793470752, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793021806, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793072018, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0794279261, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07974014615, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size\n int largestComponentSize = 0;\n\n // Loop over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n\n // Update the largest component size\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Reduce the largest component size to rank 0\n int globalLargestComponentSize;\n MPI_Reduce(&largestComponentSize, &globalLargestComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size\n return globalLargestComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(N, 0);\n\n // Initialize the maximum component size\n int maxComponentSize = 0;\n\n // Loop over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited\n if (!visited[i]) {\n // Perform a depth-first search to find the size of the component containing vertex i\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n componentSize++;\n // Push all adjacent vertices onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the maximum component size\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Return the maximum component size\n return maxComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + 
"is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int max_component_size = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0795992118, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.084635567, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int global_component_size;\n MPI_Reduce(&my_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0769212126, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793335374, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793413078, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793553624, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0794210714, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07933003, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793143646, + 
"num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.079459381, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.079289392, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07953560365, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n std::vector componentSize(N, 0);\n int largest = 0;\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n componentSize[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize[i]++;\n }\n }\n }\n largest = std::max(largest, componentSize[i]);\n }\n }\n\n int globalLargest = 0;\n MPI_Reduce(&largest, &globalLargest, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalLargest;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.079446631, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.084552775, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.076903118, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0792794714, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793939442, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0794169152, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0795745028, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0794069948, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0792939024, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.079407267, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793096179, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07947324225, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector component_size(num_ranks, 0);\n\n // Perform a parallel depth-first search to find the largest component\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int local_component_size = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n ++local_component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex * N + j] && !visited[j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n component_size[my_rank] = std::max(component_size[my_rank], local_component_size);\n }\n }\n\n // Reduce the component size array to find the largest component\n int global_component_size = 0;\n MPI_Reduce(component_size.data(), &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0819094368, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0936870566, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n\t Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n if (visited[i] == 0) {\n visited[i] = 1;\n Kokkos::View stack(\"stack\", N);\n size_t stack_size = 0;\n stack[stack_size++] = i;\n while (stack_size > 0) {\n size_t j = stack[--stack_size];\n for (size_t k = 0; k < N; k++) {\n if (A(j, k) == 1 && visited[k] == 0) {\n visited[k] = 1;\n stack[stack_size++] = k;\n }\n }\n }\n }\n });\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n count += (visited[i] == 1);\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n componentCount(A, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n dfs(A, visited, j, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::atomic_fetch_add(&visited(j), 1);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t k) {\n if (A(j, k) == 1 && visited(k) == 0) {\n Kokkos::atomic_fetch_add(&visited(k), 1);\n }\n });\n }\n });\n }\n }\n return count;\n}\n", + "\n\n KokConfigService::getSingleton().setNumThreads(N);\n KokConfigService::getSingleton().setParallelMode(KokConfigService::ParallelMode::OMP);\n KokConfigService::getSingleton().setOmpNumThreads(N);\n\n Kok::View counts(N);\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) { counts[i] = i; });\n\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) {\n for (size_t j = 0; j < N; j++) {\n if (A(i, j) == 1 && i != j) { counts[j] = counts[i]; }\n }\n });\n\n Kok::View unique_counts(N);\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) { unique_counts[i] = counts[i]; });\n Kok::sort(unique_counts);\n int num_components = 0;\n for (size_t i = 0; i < N; i++) {\n if (i == 0 || unique_counts[i] != unique_counts[i - 1]) { num_components++; }\n }\n return num_components;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n count++;\n dfs(A, i, visited);\n }\n }\n\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n 
visit(A, i, visited);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited[k] == 0) { visited[k] = 1; }\n });\n visited[j] = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n ++count;\n dfs(A, visited, i, count);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View labels(\"labels\", N);\n int num_components = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (labels(i) == 0) {\n int new_label = ++num_components;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &j) {\n if (A(i, j) == 1 && labels(j) == 0) {\n labels(j) = new_label;\n }\n });\n }\n });\n return num_components;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n dfs(A, visited, j, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n dfs(A, visited, i, N);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited[k] == 0) { visited[k] = 1; }\n });\n visited[j] = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::atomic_fetch_add(&visited(j), 1);\n componentCount(A, j);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) 
{\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n componentCount(A, N, j);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1) visited(k) = 1;\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int nComponents = 0;\n std::vector visited(N, 0);\n\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n nComponents++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t curr = q.front();\n q.pop();\n if (visited[curr] == 0) {\n visited[curr] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[curr][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return nComponents;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) continue;\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n count++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n bool visited = false;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n if (!visited) {\n count++;\n visited = true;\n }\n break;\n }\n }\n }\n return count;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t curr = q.front();\n q.pop();\n if (visited[curr] == 0) {\n visited[curr] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[curr][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false for all nodes\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over 
all nodes\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all nodes in the component\n dfs(A, i, visited);\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int* visited = new int[N];\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n delete[] visited;\n return count;\n}\n", + "\n\n int nComponents = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n nComponents++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n delete[] visited;\n return nComponents;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n count++;\n }\n }\n return count;\n}\n", + "\n\n int num_components = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n visited[current] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[current * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n delete[] visited;\n return num_components;\n}\n", + "\n\n int num_components = 0;\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n bool is_new_component = true;\n for (size_t j = 0; j < i; j++) {\n if (A[i * N + j] == 1) {\n is_new_component = false;\n break;\n }\n }\n if (is_new_component) { num_components++; }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) { q.push(j); }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all the vertices in the 
component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t v = stack.top();\n stack.pop();\n if (!visited[v]) {\n visited[v] = true;\n // Push all the unvisited neighbors of v onto the stack\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n num_components++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n num_components++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false for all nodes\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all nodes\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over all neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the number of connected components\n return componentCount;\n}\n", + "_", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n#pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t cur = q.front();\n q.pop();\n if (visited[cur] == 0) {\n visited[cur] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[cur][j] == 1) { q.push(j); }\n }\n }\n }\n }\n }\n delete[] visited;\n return num_components;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, false);\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n dfs(A, visited, i);\n count++;\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": 
"graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "serial", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(i, A, visited);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(size_t i, std::vector const& A, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(j, A, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, visited, i, N);\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, visited, i);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, std::vector& visited, size_t i) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, visited, j);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + ":", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 
0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(i, A, visited);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(size_t i, std::vector const& A, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(j, A, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, visited, i);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n dfs(A, i, visited);\n count++;\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "hip", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void componentCount(const int *A, size_t N, int *numComponents) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tif (count == 0) {\n\t\tcount = 1;\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n\tint tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\tif (tid >= N || tid_y >= N) {\n\t\treturn;\n\t}\n\tint component_id = tid;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tcomponent_id = min(component_id, i);\n\t\t}\n\t}\n\tatomicAdd(&numComponents[component_id], 1);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tint root = tid;\n\twhile (A[root * N + bid] != root) {\n\t\troot = A[root * N + bid];\n\t}\n\tif (tid == 0) {\n\t\tA[bid * N + bid] = root;\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint numThreads = hipGridDim_x * 
hipBlockDim_x;\n\tint label = tid;\n\tint numLabels = numThreads;\n\tbool changed = true;\n\twhile (changed) {\n\t\tchanged = false;\n\t\tfor (int i = tid; i < N; i += numThreads) {\n\t\t\tfor (int j = 0; j < N; j++) {\n\t\t\t\tif (A[i * N + j] && label > j) {\n\t\t\t\t\tlabel = j;\n\t\t\t\t\tchanged = true;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t\tif (tid == 0) {\n\t\t\tif (label < numLabels) {\n\t\t\t\tnumLabels = label;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n\t*numComponents = numLabels;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] != 0) {\n count++;\n for (int j = i + 1; j < N; j++) {\n if (A[tid * N + j] != 0) {\n count++;\n }\n }\n break;\n }\n }\n atomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint component = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcomponent = 1;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tatomicAdd(numComponents, component);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\tif (A[i * N + j] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N || tid_y >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tbool visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = false;\n\t}\n\tvisited[tid] = true;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1 && !visited[i]) {\n\t\t\tcount++;\n\t\t\t__syncblocks();\n\t\t\tDFS(A, i, visited, N);\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx.x + hipBlockIdx.x * hipBlockDim.x;\n\tint bid = hipBlockIdx.y + hipBlockIdx.z * hipGridDim.z;\n\tint i = tid + bid * hipBlockDim.x * hipGridDim.x;\n\tint count = 0;\n\twhile (i < N * N) {\n\t\tif (A[i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t\ti += N * N;\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint id = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (id >= N) return;\n\tint count = 0;\n\tbool visited[N];\n\tmemset(visited, 0, sizeof(visited));\n\tfor (int i = 0; i < N; i++) {\n\t if (visited[i]) continue;\n\t visited[i] = true;\n\t count++;\n\t for (int j = 0; j < N; j++) {\n\t if (A[i * N + j] == 0) continue;\n\t visited[j] = true;\n\t }\n\t}\n\tnumComponents[id] = count;\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tint count = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + 
hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tint n = hipBlockDim_x * hipGridDim_x;\n\tint count = 0;\n\tif (tid == 0) {\n\t\tcount = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*numComponents = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The component count is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n int global_count = 0;\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n local_count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM);\n if (rank == 0) {\n return global_count;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its own subset of rows.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N];\n std::fill(visited, visited + N, false);\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && !visited[j]) {\n ++localCount;\n std::queue q;\n q.push(j);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n for (size_t v = 0; v < N; ++v) {\n if (A[u * N + v] && !visited[v]) {\n q.push(v);\n }\n }\n }\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int component_count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n 
q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n component_count++;\n }\n }\n\n int global_component_count;\n MPI_Reduce(&component_count, &global_component_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_component_count;\n }\n return 0;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit\n while (!stack.empty()) {\n // Pop the top vertex from the stack\n size_t vertex = stack.top();\n stack.pop();\n\n // Mark the vertex as visited\n visited[vertex] = true;\n\n // Iterate over all adjacent vertices\n for (size_t j = 0; j < N; j++) {\n // If the adjacent vertex is connected and has not been visited, push it onto the stack\n if (A[vertex][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n count++;\n std::queue q;\n q.push(i);\n\n // Perform a breadth-first search to visit all nodes in the component\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n\n // Iterate over the neighbors of u\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all the unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n\n // Each rank computes the number of connected components in its portion of the graph.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n localCount++;\n break;\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n count++;\n while (!q.empty()) {\n int cur = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[cur * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) return global_count;\n return -1;\n}\n", + "\n\n int n_components = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n n_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n\n int global_n_components;\n MPI_Allreduce(&n_components, &global_n_components, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_n_components;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n\n int globalCount;\n MPI_Reduce(&count, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then push the neighbor onto the stack\n if (!visited[j] && A[i][j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Count the number of connected components\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) continue;\n visited[u] = true;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its local rows\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited = false;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] && !visited) {\n localCount++;\n visited = true;\n }\n }\n }\n\n // Reduce the local counts to get the global count\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 
0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t u = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[u] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then push the neighbor onto the stack\n if (!visited[j] && A[i][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n // Iterate over the neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its local part of the graph\n int local_count = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N] = {false};\n for (size_t j = 0; j < N; j++) {\n if (!visited[j] && A[i][j] == 1) {\n // Found a new component\n local_count++;\n std::queue q;\n q.push(j);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n for (size_t v = 0; v < N; v++) {\n if (!visited[v] && A[u][v] == 1) {\n q.push(v);\n }\n }\n }\n }\n }\n }\n\n // Reduce the local counts to get the global count\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // 
Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is connected and has not been visited, then add it to the stack\n if (A[node][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of connected components in its own subset of rows.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N] = {false};\n for (size_t j = 0; j < N; j++) {\n if (!visited[j] && A[i][j] == 1) {\n localCount++;\n dfs(A, visited, j, N);\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "cuda", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void componentCount(const int *A, size_t N, int *numComponents) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tint row = tid / N;\n\tint col = tid % N;\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\tatomicAdd(numComponents, 1);\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] != 0) {\n\t\t\tA[row * N + i] = 0;\n\t\t\tA[i * N + row] = 0;\n\t\t\tcomponentCount<<<1, N>>>(A, N, numComponents);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = -1;\n\t\t} else {\n\t\t\tnumComponents[tid] = tid;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && numComponents[tid] != -1) {\n\t\t\tnumComponents[k] = numComponents[tid];\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid < N && tid_y < N) {\n\t if (A[tid * N + tid_y] == 1) {\n\t atomicAdd(numComponents, 1);\n\t }\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tint row = tid / N;\n\tint col = tid % N;\n\tif (A[tid] == 0) {\n\t\tnumComponents[tid] = 0;\n\t\treturn;\n\t}\n\tint component = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] == 1) {\n\t\t\tcomponent = i;\n\t\t\tbreak;\n\t\t}\n\t}\n\tnumComponents[tid] = component;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tbool visited = false;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tvisited = true;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tif (visited) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\t// Each thread checks if the current element is non-zero.\n\t// If so, it increments the component count.\n\tint row = blockIdx.x;\n\tint col = blockIdx.y;\n\tif (A[row * N + col] != 0) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint bid = blockIdx.y * blockDim.y + blockIdx.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[bid] == 0) {\n\t\tvisited[bid] = 1;\n\t\t(*numComponents)++;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcomponentCount(A, N, numComponents);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\tif (A[i * N + 
j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = 0;\n\t\t} else {\n\t\t\tnumComponents[tid] = 1;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tint numComponents_local = numComponents[tid];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && numComponents_local != -1) {\n\t\t\tnumComponents[k] = numComponents_local;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (i == 0) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (numComponents[k] != -1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t*numComponents = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint bid = blockIdx.y * blockDim.y + blockIdx.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[bid] == 0) {\n\t\tvisited[bid] = 1;\n\t\t*numComponents += 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcomponentCount(A, N, numComponents);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tidx = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tidy = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tidx >= N || tidy >= N) {\n\t\treturn;\n\t}\n\tif (A[tidx * N + tidy] == 0) {\n\t\treturn;\n\t}\n\tatomicAdd(numComponents, 1);\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[tid] == 0) {\n\t\tvisited[tid] = 1;\n\t\t__syncthreads();\n\t\tint count = 1;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tif (A[tid * N + j] == 1 && visited[j] == 0) {\n\t\t\t\tvisited[j] = 1;\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t\tatomicAdd(numComponents, count);\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\tint component = tid;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tcomponent = min(component, i);\n\t\t}\n\t}\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tA[tid_y * N + i] = component;\n\t\t}\n\t}\n\tif (tid == component) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = 0;\n\t\t} else {\n\t\t\tnumComponents[tid] = 1;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tint numComponents_local = numComponents[tid];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1) {\n\t\t\tnumComponents_local = min(numComponents_local, numComponents[k * N + j]);\n\t\t}\n\t}\n\tnumComponents[tid] = numComponents_local;\n\t__syncthreads();\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tcount += (numComponents[k * N + j] == numComponents_local);\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x 
* blockDim.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid_row == tid_col) {\n\t\tif (A[tid] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j && A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tidx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tidy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (tidx >= N || tidx >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tidx * N + i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The component count is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n local_count++;\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.076839196, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.089853759, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+: count)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top vertex from the stack\n size_t v = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and is connected to the vertex, add it to the stack\n if (!visited[j] && A[i][j]) {\n visited[j] = true;\n 
stack.push(j);\n }\n }\n }\n\n // Increment the component count\n count++;\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a visited array to keep track of which nodes have been visited\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over all the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then add the neighbor to the stack\n if (!visited[j] && A[node * N + j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0794926432, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0156590784, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int count = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (visited[u] == 0) {\n count++;\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) { q.push(j); }\n }\n }\n }\n local_count += count > 0;\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) { return global_count; }\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0794620278, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.113370439, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1136578344, + 
"num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1134258194, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1134657838, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1136208508, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.113660922, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1133893064, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If the neighbor has not been visited and is connected to the node, then visit the neighbor\n if (!visited[j] && A[i][j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n count++;\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int 
rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Visit all the unvisited neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && !visited[j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n MPI_Reduce(&componentCount, &componentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count\n return componentCount;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while 
(!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0795322908, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1051229564, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a DFS to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // Perform a DFS to visit all connected nodes\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack and mark it as visited\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.081887282, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0944294672, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n while (!stack.empty()) {\n // Get the current node\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the current node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and is connected to the current node, then add it to the stack\n if (!visited[j] && A[i][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to 
rank 0\n MPI_Reduce(&componentCount, nullptr, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return componentCount;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n int global_count = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n int my_start = rank * (N / size);\n int my_end = (rank + 1) * (N / size);\n if (rank == size - 1) {\n my_end = N;\n }\n #pragma omp parallel for reduction(+ : local_count)\n for (int i = my_start; i < my_end; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_count++;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n delete[] visited;\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[j] = true;\n\n // Push all the unvisited neighbors of the node onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[i * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all the neighbors of the node onto the stack\n for (size_t j = 0; j < N; ++j) {\n if (A[node * N + j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n // 
Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0768193782, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1134975802, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0818500096, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1053621032, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n // Push all the unvisited neighbors of the vertex onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", 
+ "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0820238302, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1057867832, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0795240512, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1051253168, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n count++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; k++) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count across all ranks\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0794848436, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 
0.1053944776, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the connected components for a subset of the rows\n int start_row = rank * (N / size);\n int end_row = (rank + 1) * (N / size);\n if (rank == size - 1) {\n end_row = N;\n }\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Count the number of connected components\n int count = 0;\n for (int i = start_row; i < end_row; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n\n // Reduce the count across all ranks\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "&", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A represents a directed graph.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) { count += A(i, j); });\n });\n return count;\n}\n", + "\n\n int edge_count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { edge_count += A(i, j); }\n });\n return edge_count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(\"edgeCount\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) {\n count += A(i, j);\n }\n });\n return count;\n}\n", + "\n\n Kokkos::View edgeCount(\"edgeCount\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n edgeCount(i) = 0;\n for (size_t j = 0; j < N; ++j) { edgeCount(i) += A(i, j); }\n });\n int totalEdgeCount = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &total) { total += edgeCount(i); }, totalEdgeCount);\n return totalEdgeCount;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, int &edgeCount) {\n for (size_t j = 0; j < N; ++j) {\n edgeCount += A(i, j);\n }\n },\n edgeCount);\n return edgeCount;\n}\n", + "\n\n int edge_count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n if (A(i, j) > 0) {\n ++edge_count;\n }\n }\n });\n return edge_count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n 
Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) { count += A(i, j); });\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n count += A(i, j);\n }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA (const int i) {\n for (int j = 0; j < N; j++) {\n if (A(i, j) != 0) {\n count++;\n }\n }\n });\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) { count += A[i * N + j] > 0; }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int edge_count = 0;\n#pragma omp parallel for reduction(+ : edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n edge_count += A[i * N + j];\n }\n }\n return edge_count;\n}\n", + "\n\n int count = 0;\n#pragma opm parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n 
for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i][j] > 0) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "serial", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n 
for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "hip", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n Store the result in numEdges. A represents a directed graph.\n A is an NxN adjacency matrix stored in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void edgeCount(const int *A, size_t N, int *numEdges) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int count = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n hipAtomicAdd(numEdges, count);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[i * N + j];\n }\n }\n __syncthreads();\n atomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint numEdgesLocal = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tnumEdgesLocal += A[tid * N + i];\n\t}\n\thipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[i * N + j] > 0;\n }\n }\n atomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n int tid = threadIdx.x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += (A[tid * N + i] != 0);\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n for (int j = 0; j < N; j++) {\n if (A[tid * N + j] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[tid * N + j];\n }\n hipAtomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n }\n}\n", + "\n\n int tid = 
hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int row = tid;\n int col = tid;\n int count = 0;\n while (row < N) {\n count += A[row * N + col];\n row += hipBlockDim_x;\n col += hipBlockDim_x;\n }\n hipAtomicAdd(numEdges, count);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) 
{\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + ", \u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2 Saxe\u51a2 Saxe\u51a2\u51a2\u51a2 Saxe\u51a2\u51a2\u51a2\u51a2", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] != 0) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + 
"\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "cuda", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n Store the result in numEdges. A represents a directed graph.\n A is an NxN adjacency matrix stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void edgeCount(const int *A, size_t N, int *numEdges) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x;\n int count = 0;\n for (int col = 0; col < N; col++) {\n count += A[row * N + col];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int rowStart = blockIdx.x * blockDim.x + threadIdx.x;\n if (rowStart >= N) {\n return;\n }\n int rowEnd = rowStart + blockDim.x;\n if (rowEnd > N) {\n rowEnd = N;\n }\n int count = 0;\n for (int i = rowStart; i < rowEnd; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] != 0) {\n count++;\n }\n }\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int j = 0; j < N; j++) {\n if (A[tid * N + j] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\nint rowStart = blockIdx.x * blockDim.x + threadIdx.x;\nif (rowStart >= N) {\n return;\n}\nint rowEnd = rowStart + blockDim.x;\nif (rowEnd > N) {\n rowEnd = N;\n}\nint count = 0;\nfor (int i = rowStart; i < rowEnd; ++i) {\n for (int j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n}\natomicAdd(numEdges, count);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; ++col) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x 
+ threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row < N && col < N) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] == 1) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row >= N) {\n return;\n }\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; ++col) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0030040246, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.002563382, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016317566, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013421014, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013539518, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015088762, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018955072, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021287628, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0029960958, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025562804, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016186702, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013629884, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013440328, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014317314, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001876702, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020971976, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n #pragma parallel for reduction (+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for 
(size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Allgather(&local_count, 1, MPI_INT, &global_count, 1, MPI_INT, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0029808186, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0055442712, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0055442206, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005544712, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0055409648, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0055624256, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0055561778, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0055495206, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0055713382, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n int nthreads;\n omp_set_num_threads(nthreads);\n int local_count = 0;\n int global_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (myrank == 0) {\n return global_count;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0025648606, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025809932, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025484882, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025781258, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025636844, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025868118, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025639614, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.0025815942, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.003007303, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025773564, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016132592, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013764306, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013469114, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014219822, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001757554, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021643794, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0030042314, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025711604, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001629201, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014092246, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013602524, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015580916, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001873822, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021178516, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, 
&global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0029911142, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.002549592, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016229772, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001388778, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001338626, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015393004, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017791624, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021389294, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0029938628, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025599878, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016066366, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014547682, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013471596, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015477374, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018621092, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021494756, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0029899932, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.002558845, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001625513, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.001372583, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013511414, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014744942, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018327238, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002124094, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n int global_count;\n\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0029939108, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025598672, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001618025, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013417754, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013962378, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015217774, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017769462, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020741056, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int local_edge_count = 0;\n int global_edge_count;\n #pragma omp parallel for reduction(+: local_edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_edge_count += A[i * N + j];\n }\n }\n MPI_Reduce(&local_edge_count, &global_edge_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_edge_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0029988226, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025520404, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016118584, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014524054, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013429436, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015326074, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017494024, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002115954, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + 
"is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.003010348, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.002571665, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016150182, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013525076, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001356505, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001512676, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001816426, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002170805, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0030121918, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025682928, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016253778, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013517944, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013994778, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014904162, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018820654, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002101461, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int local_edge_count = 0;\n #pragma omp parallel for reduction(+: local_edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_edge_count += A[i * N + j];\n }\n }\n int global_edge_count;\n MPI_Reduce(&local_edge_count, &global_edge_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_edge_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + 
"are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0030128872, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025613098, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001621156, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013489852, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013730828, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014788948, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018246314, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00215112, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.002997256, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025621252, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016101052, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013269354, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00133986, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00142324, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001867572, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021176272, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0029919936, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025600072, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016424758, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013567214, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014215174, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.0015252178, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018885896, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021439928, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0030083948, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.002589308, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016286882, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013536216, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001305325, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015052442, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018924788, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021527612, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n int global_count = 0;\n #pragma omp parallel for reduction(+: local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0030027018, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025689982, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001605294, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013833916, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013641356, + "num_procs": 1, + "num_threads": 8 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.0015371242, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018662662, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021147134, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max = 0;\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; ++i) {\n // Iterate over the columns of the adjacency matrix\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between node i and node j, increment the degree of node i\n if (A(i, j) == 1) {\n ++max;\n }\n }\n }\n // Return the maximum degree\n return max;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n Kokkos::View maxDegree(\"maxDegree\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n maxDegree(i) = degree;\n });\n return *Kokkos::max_element(maxDegree);\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &max_degree) {\n int degree = 0;\n for (int j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree 
= 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A(i, j);\n }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &maxDegree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n maxDegree = Kokkos::max(maxDegree, degree);\n });\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::View;\n\n int max_degree = 0;\n Kokkos::parallel_reduce(\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_degree) {\n int degree = 0;\n for (int j = 0; j < N; j++) {\n degree += A(i, j);\n }\n max_degree = std::max(max_degree, degree);\n },\n max_degree);\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n if (degree > max_degree) { max_degree = degree; }\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n Kokkos::View maxDegree(\"maxDegree\", N);\n Kokkos::parallel_for(\"maxDegree\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A(i, j);\n }\n maxDegree(i) = degree;\n });\n int max = 0;\n for (size_t i = 0; i < N; i++) {\n if (maxDegree(i) > max) {\n max = maxDegree(i);\n }\n }\n return max;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "_t>", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n 
max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "serial", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_deg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n max_deg = std::max(max_deg, deg);\n }\n return max_deg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) { deg += A[i * N + j]; }\n maxDeg = std::max(maxDeg, deg);\n }\n return maxDeg;\n}\n", + "\n\n int max_deg = 0;\n for (int i = 0; i < N; i++) {\n int deg = 0;\n for (int j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n max_deg = std::max(max_deg, deg);\n }\n return max_deg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i][j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int 
max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) { deg += A[i * N + j]; }\n maxDeg = std::max(maxDeg, deg);\n }\n return maxDeg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "hip", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Store the result in maxDegree.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void maxDegree(const int *A, size_t N, int *maxDegree) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n localMax = max(localMax, *maxDegree);\n if (threadIdx.x == 0) {\n *maxDegree = localMax;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n atomicMax(&(*maxDegree), max);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n hipAtomicMax(&(*maxDegree), degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n hipAtomicMax(maxDegree, degree);\n}\n", + "\n\n int myDegree = 0;\n for (int i = blockIdx.x * blockDim.x + 
threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; ++i) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint row = bid * hipBlockDim_x + tid;\n\tint degree = 0;\n\tif (row < N) {\n\t\tfor (int col = 0; col < N; col++) {\n\t\t\tdegree += A[row * N + col];\n\t\t}\n\t}\n\thipAtomicMax(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n *maxDegree = hipAtomicMax(*maxDegree, degree);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n }\n int max_degree;\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "\n\n int my_max = 0;\n for (size_t i = 0; i < N; i++) {\n int row_sum = 0;\n for (size_t j = 0; j < N; j++) {\n row_sum += A[i * N + j];\n }\n my_max = std::max(my_max, row_sum);\n }\n\n int global_max;\n MPI_Allreduce(&my_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int rowSum = 0;\n for (size_t j = 0; j < N; j++) { rowSum += A[i * N + j]; }\n myDegree = std::max(myDegree, rowSum);\n }\n\n int globalDegree;\n MPI_Allreduce(&myDegree, &globalDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] != 0) {\n myDegree++;\n }\n }\n }\n int maxDegree = 0;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n maxDeg = std::max(maxDeg, deg);\n }\n int globalMaxDeg;\n MPI_Allreduce(&maxDeg, &globalMaxDeg, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDeg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n myDegree++;\n }\n }\n }\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int local_max = 0;\n for (size_t i = 0; i < N; i++) {\n int local_degree = 0;\n for (size_t j = 0; j < N; j++) {\n local_degree += A[i * N + j];\n }\n local_max = std::max(local_max, local_degree);\n }\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n if (degree > maxDegree) {\n maxDegree = degree;\n 
}\n }\n return maxDegree;\n}\n", + "\n\n int local_max = 0;\n for (size_t i = 0; i < N; i++) {\n int row_sum = 0;\n for (size_t j = 0; j < N; j++) {\n row_sum += A[i * N + j];\n }\n local_max = std::max(local_max, row_sum);\n }\n\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int my_max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n my_max_degree = std::max(my_max_degree, degree);\n }\n\n int max_degree;\n MPI_Allreduce(&my_max_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "\n\n int my_degree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n\n int max_degree;\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "Degree)", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n myDegree += A[i * N + j];\n }\n }\n\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n\n int globalMaxDegree;\n MPI_Allreduce(&myDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n myDegree += A[i * N + j];\n }\n }\n\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n degree++;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, °ree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n }\n return maxDegree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "cuda", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph. \n Store the result in maxDegree.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void maxDegree(const int *A, size_t N, int *maxDegree) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n if (localMax > *maxDegree) {\n *maxDegree = localMax;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n if (tid == 0) {\n *maxDegree = max;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, myDegree);\n}\n", + "\n\n int tid = threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n maxDegree[tid] = max;\n}\n", + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = max(*maxDegree, localMax);\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint max = 0;\n\tif (tid < N) {\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tmax += A[tid * N + i];\n\t\t}\n\t}\n\tmaxDegree[tid] = max;\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tint max_degree = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tmax_degree = max(max_degree, maxDegree[i]);\n\t\t}\n\t\t*maxDegree = max_degree;\n\t}\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, 
myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = myDegree;\n for (size_t i = 1; i < gridDim.x; i++) {\n int otherDegree;\n cudaMemcpy(&otherDegree, &A[i * blockDim.x * N], sizeof(int), cudaMemcpyDeviceToHost);\n *maxDegree = max(*maxDegree, otherDegree);\n }\n }\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = max(*maxDegree, myDegree);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "Degree,", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(&*maxDegree, myDegree);\n}\n", + "\n\n\tint myMaxDegree = 0;\n\tint rowStart = blockIdx.x * blockDim.x + threadIdx.x;\n\tfor (int i = rowStart; i < N; i += blockDim.x * gridDim.x) {\n\t\tint degree = 0;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tdegree += A[i * N + j];\n\t\t}\n\t\tmyMaxDegree = max(myMaxDegree, degree);\n\t}\n\tatomicAdd(maxDegree, myMaxDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n myDegree++;\n }\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int max_degree = 0;\n int my_degree = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk_size = N / nprocs;\n int start_row = rank * chunk_size;\n int end_row = start_row + chunk_size;\n if (rank == nprocs - 1) {\n end_row = N;\n }\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_degree;\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n int my_max_degree = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel for reduction(max : my_max_degree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_max_degree += A[i * N + j];\n }\n }\n\n MPI_Allreduce(&my_max_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return max_degree;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int maxDegree = 0;\n int localMaxDegree = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel for reduction(max : localMaxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n localMaxDegree += A[i * N + j];\n }\n }\n\n MPI_Allreduce(&localMaxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0030524108, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0056682288, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": 
null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int local_max_degree = 0;\n int global_max_degree = 0;\n\n // Compute the local maximum degree\n #pragma omp parallel for reduction(max : local_max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n local_max_degree = std::max(local_max_degree, degree);\n }\n\n // Reduce the local maximum degrees to find the global maximum degree\n MPI_Allreduce(&local_max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0031368996, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026570916, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018627766, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014449172, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014857302, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017924568, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018177868, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018821334, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001872324, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00187395455, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n int my_degree = 0;\n\n #pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, my_degree);\n my_degree = 0;\n }\n\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0030520828, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0056731886, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0031207634, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028859588, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019676434, + "num_procs": 1, + "num_threads": 2 + }, 
+ { + "did_run": true, + "is_valid": true, + "runtime": 0.0013875096, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014306804, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019331492, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018842494, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018933562, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017301366, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016330754, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int maxDegree = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int localMaxDegree = 0;\n #pragma omp parallel for reduction(max:localMaxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n localMaxDegree = std::max(localMaxDegree, degree);\n }\n MPI_Reduce(&localMaxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0030567852, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025693348, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019539924, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014245028, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014949548, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018718344, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018176584, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001893094, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017181494, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00163908755, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0025357722, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.005563785, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int myDegree = 0;\n #pragma omp parallel for reduction(max:myDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + 
j];\n }\n myDegree = std::max(myDegree, degree);\n }\n\n int globalDegree;\n MPI_Reduce(&myDegree, &globalDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return globalDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002990778, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025547154, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001702953, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001445887, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014430338, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018789152, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001835562, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019032648, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017070054, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016322243, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002980731, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026129366, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016149722, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015377512, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001333855, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001953129, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018127994, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018898378, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017101493, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00162552775, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n #pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0029905094, + "runs": [ + { + "did_run": true, + "is_valid": 
true, + "runtime": 0.0026223302, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018278576, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013981936, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013939854, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018555016, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00180366, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002065866, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017156347, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016299266, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int local_max = 0;\n for (int i = 0; i < N; i++) {\n int local_count = 0;\n for (int j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n local_max = std::max(local_max, local_count);\n }\n\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0025152836, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027370578, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026291552, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026567108, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028503138, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026098346, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029408236, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027428082, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030779722, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00264253795, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int maxDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0030297326, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025606566, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017391808, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001430514, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014594042, + "num_procs": 1, + "num_threads": 
8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018620718, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001830843, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001873471, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017107886, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001626856, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int maxDegree = 0;\n #pragma omp parallel for reduction(max: maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0029891744, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.002682311, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019436288, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001595682, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013763366, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019421, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018847104, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018892268, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017144918, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016866603, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n int my_degree = 0;\n #pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, my_degree);\n my_degree = 0;\n }\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0030277952, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0056512418, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 
0.0030502234, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0056737196, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int maxDegree = 0;\n int myDegree = 0;\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n MPI_Reduce(&maxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n int max_degree = 0;\n #pragma opm parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0025864888, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029268196, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030546832, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025573404, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029967066, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030680024, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002983903, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027502864, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0024990794, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0024962016, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n int my_degree = 0;\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int num_threads;\n omp_set_num_threads(4);\n #pragma omp parallel\n {\n #pragma omp for reduction(+:my_degree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n }\n MPI_Reduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + 
"parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(Kokkos::View &A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n visited[i] = 0;\n });\n\n dist[source] = 0;\n visited[source] = 1;\n\n int minDist = INT_MAX;\n\n while (visited[dest] == 0) {\n int minDistNode = -1;\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0 && (minDistNode == -1 || dist[i] < dist[minDistNode])) {\n minDistNode = i;\n }\n }\n\n if (minDistNode == -1) {\n break;\n }\n\n visited[minDistNode] = 1;\n\n for (int i = 0; i < N; i++) {\n if (A(minDistNode, i) == 1 && visited[i] == 0) {\n if (dist[i] == -1 || dist[i] > dist[minDistNode] + 1) {\n dist[i] = dist[minDistNode] + 1;\n }\n }\n }\n }\n\n if (dist[dest] == -1) {\n return -1;\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n Kokkos::View visited(\"visited\", N);\n Kokkos::parallel_for(\"bfs\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n visited[i] = false;\n });\n visited[source] = true;\n Kokkos::View queue(\"queue\", N);\n int head = 0;\n int tail = 0;\n queue[tail++] = source;\n while (head < tail) {\n int u = queue[head++];\n for (int v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n queue[tail++] = v;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { distances[i] = std::numeric_limits::max(); });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int current = q.front();\n q.pop();\n\n // Iterate over all neighbors of the current node\n for (size_t i = 0; i < N; i++) {\n if (A(current, i) == 1 && distances[i] == std::numeric_limits::max()) {\n distances[i] = distances[current] + 1;\n q.push(i);\n }\n }\n }\n\n // Return the distance to the destination\n return distances[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n dist() = -1;\n dist(source) = 0;\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n visited(source) = true;\n Kokkos::View queue(\"queue\", N);\n queue() = -1;\n int head = 0;\n int tail = 0;\n queue(tail++) = source;\n while (head != tail) {\n int u = queue(head++);\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited(v)) {\n dist(v) = dist(u) + 1;\n 
visited(v) = true;\n queue(tail++) = v;\n }\n }\n }\n return dist(dest);\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) { distances[i] = INT_MAX; });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Queue of nodes to visit\n std::queue q;\n q.push(source);\n\n // While there are nodes to visit\n while (!q.empty()) {\n // Get the current node\n int current = q.front();\n q.pop();\n\n // If the current node is the destination, return the distance\n if (current == dest) {\n return distances[current];\n }\n\n // Visit all neighbors of the current node\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A(current, i) == 1) {\n // If the distance to the neighbor is greater than the distance to the current node plus 1\n if (distances[i] > distances[current] + 1) {\n // Update the distance to the neighbor\n distances[i] = distances[current] + 1;\n // Add the neighbor to the queue\n q.push(i);\n }\n }\n }\n }\n\n // If there is no path from the source to the destination, return -1\n return -1;\n}\n", + "\n\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = Kokkos::TeamMember;\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::View distance(\"distance\", N);\n Kokkos::View queue(\"queue\", N);\n\n visited[source] = 1;\n distance[source] = 0;\n queue[0] = source;\n\n int head = 0;\n int tail = 1;\n\n while (head < tail) {\n int u = queue[head++];\n for (int v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n visited[v] = 1;\n distance[v] = distance[u] + 1;\n queue[tail++] = v;\n }\n }\n }\n\n return distance[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) { distances[i] = std::numeric_limits::max(); });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Initialize the queue with the source\n std::queue q;\n q.push(source);\n\n // While the queue is not empty\n while (!q.empty()) {\n // Get the current vertex\n int current = q.front();\n q.pop();\n\n // If the current vertex is the destination, return the distance\n if (current == dest) {\n return distances[current];\n }\n\n // For each neighbor of the current vertex\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current vertex and the neighbor\n if (A(current, i) == 1) {\n // If the distance to the neighbor is greater than the distance to the current vertex plus 1\n if (distances[i] > distances[current] + 1) {\n // Update the distance to the neighbor\n distances[i] = distances[current] + 1;\n\n // Add the neighbor to the queue\n q.push(i);\n }\n }\n }\n }\n\n // If the destination is not reachable, return -1\n return -1;\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n size_t head = 0;\n size_t tail = 0;\n queue[tail++] = source;\n\n while (head != tail) {\n int u = queue[head++];\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n queue[tail++] = v;\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n\n 
Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist(i) = -1;\n visited(i) = false;\n queue(i) = false;\n });\n\n dist(source) = 0;\n queue(source) = true;\n\n while (true) {\n int min_dist = -1;\n int min_idx = -1;\n for (int i = 0; i < N; i++) {\n if (queue(i) && (min_dist == -1 || dist(i) < min_dist)) {\n min_dist = dist(i);\n min_idx = i;\n }\n }\n\n if (min_idx == -1) {\n break;\n }\n\n queue(min_idx) = false;\n visited(min_idx) = true;\n\n for (int i = 0; i < N; i++) {\n if (A(min_idx, i) > 0 && !visited(i)) {\n if (dist(i) == -1 || dist(i) > dist(min_idx) + 1) {\n dist(i) = dist(min_idx) + 1;\n queue(i) = true;\n }\n }\n }\n }\n\n return dist(dest);\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t i = 0; i < N; i++) {\n if (A(u, i) && dist[i] == -1) {\n dist[i] = dist[u] + 1;\n q.push(i);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View distance = Kokkos::View(\"distance\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) { distance[i] = -1; });\n distance[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && distance[v] == -1) {\n distance[v] = distance[u] + 1;\n q.push(v);\n }\n }\n }\n return distance[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = INT_MAX;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n q.push(v);\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n Kokkos::View queue_head(\"queue_head\", 1);\n Kokkos::View queue_tail(\"queue_tail\", 1);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n queue_head[0] = 0;\n queue_tail[0] = 1;\n queue[0] = source;\n\n while (queue_head[0] < queue_tail[0]) {\n int u = queue[queue_head[0]];\n queue_head[0]++;\n for (int v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n visited[v] = true;\n dist[v] = dist[u] + 1;\n queue[queue_tail[0]] = v;\n queue_tail[0]++;\n }\n }\n }\n\n return dist[dest];\n}\n", + 
"\n\n Kokkos::View dist(\"dist\", N);\n dist() = -1;\n dist(source) = 0;\n\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n visited(source) = true;\n\n Kokkos::View queue(\"queue\", N);\n queue() = -1;\n int head = 0;\n int tail = 0;\n queue(tail++) = source;\n\n while (head != tail) {\n int u = queue(head++);\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && !visited(v)) {\n dist(v) = dist(u) + 1;\n visited(v) = true;\n queue(tail++) = v;\n }\n }\n }\n\n return dist(dest);\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n distances() = std::numeric_limits::max();\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n distances(source) = 0;\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t i = 0; i < N; i++) {\n if (A(u, i) == 1 && distances(i) == std::numeric_limits::max()) {\n q.push(i);\n distances(i) = distances(u) + 1;\n }\n }\n }\n\n // Return distance to destination\n return distances(dest);\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n distances() = std::numeric_limits::max();\n\n // Initialize queue with source node\n std::queue queue;\n queue.push(source);\n distances(source) = 0;\n\n // Perform BFS\n while (!queue.empty()) {\n int current = queue.front();\n queue.pop();\n\n // Iterate over neighbors of current node\n for (size_t i = 0; i < N; i++) {\n if (A(current, i) == 1 && distances(i) == std::numeric_limits::max()) {\n distances(i) = distances(current) + 1;\n queue.push(i);\n }\n }\n }\n\n // Return distance to destination\n return distances(dest);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector dist(N, -1);\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[u] + 1 < dist[v]) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n 
// Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] > dist[u] + 1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize the distance to all nodes to infinity\n std::vector distance(N, INT_MAX);\n\n // Set the distance to the source node to 0\n distance[source] = 0;\n\n // Create a queue to store the nodes to be visited\n std::queue queue;\n\n // Enqueue the source node\n queue.push(source);\n\n // While there are still nodes to be visited\n while (!queue.empty()) {\n // Dequeue the next node\n int current = queue.front();\n queue.pop();\n\n // Visit all the neighbors of the current node\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A[current][i] > 0) {\n // If the distance to the neighbor is greater than the distance to the current node plus the weight of the edge\n if (distance[i] > distance[current] + A[current][i]) {\n // Update the distance to the neighbor\n distance[i] = distance[current] + A[current][i];\n\n // Enqueue the neighbor\n queue.push(i);\n }\n }\n }\n }\n\n // Return the distance to the destination node\n return distance[dest];\n}\n", + "\n\n int dist[N];\n std::fill(dist, 
dist + N, -1);\n dist[source] = 0;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n if (u == dest) {\n return dist[u];\n }\n\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int min_dist = INT_MAX;\n std::vector dist(N, INT_MAX);\n std::vector visited(N, false);\n\n dist[source] = 0;\n visited[source] = true;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n if (A[source][i] == 1) {\n dist[i] = 1;\n visited[i] = true;\n }\n }\n\n #pragma omp for\n for (size_t k = 0; k < N; k++) {\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i][k] == 1 && A[k][j] == 1 && dist[i] != INT_MAX && dist[j] > dist[i] + 1) {\n dist[j] = dist[i] + 1;\n visited[j] = true;\n }\n }\n }\n }\n }\n\n min_dist = dist[dest];\n return min_dist == INT_MAX ? -1 : min_dist;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "serial", + "prompt": "/* 
Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, 0);\n std::queue> q;\n q.push({source, 0});\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n if (visited[node] == 1) {\n continue;\n }\n visited[node] = 1;\n for (size_t i = 0; i < N; i++) {\n if (A[node][i] == 1) {\n q.push({i, dist + 1});\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n visited[source] = true;\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n for (int i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n visited[i] = true;\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n visited[source] = 
true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n dist[v] = dist[u] + 1;\n visited[v] = true;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) return dist;\n if (visited[node]) continue;\n visited[node] = true;\n for (size_t i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n if (v == dest) {\n return dist[v];\n }\n }\n }\n }\n return -1;\n}\n", + "=", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n q.push(v);\n dist[v] = dist[u] + 1;\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == -1) {\n q.push(v);\n dist[v] = dist[u] + 1;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push(std::make_pair(source, 0));\n while (!q.empty()) {\n std::pair p = q.front();\n q.pop();\n if (p.first == dest) {\n return p.second;\n }\n if (visited[p.first]) {\n continue;\n }\n visited[p.first] = true;\n for (size_t i = 0; i < N; ++i) {\n if (A[p.first][i] == 1) {\n q.push(std::make_pair(i, p.second + 1));\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n visited[source] = true;\n\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n for (int i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n visited[i] = true;\n }\n }\n }\n\n return -1;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "hip", + "prompt": "/* Compute the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Store the result in pathLength.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\n__global__ void shortestPathLength(const int *A, size_t N, int source, int dest, int *pathLength) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\n if (tid == source) {\n pathLength[bid] = 0;\n } else {\n pathLength[bid] = -1;\n }\n\n __syncthreads();\n\n for (int i = 0; i < N; i++) {\n if (pathLength[bid] >= 0 && A[tid * N + i] == 1) {\n if (i == dest) {\n pathLength[bid] = pathLength[bid] + 1;\n break;\n } else if (pathLength[i * N + dest] == -1 ||\n pathLength[i * N + dest] > pathLength[bid] + 1) {\n pathLength[i * N + dest] = pathLength[bid] + 1;\n }\n }\n }\n}\n", + "\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint distance = INT_MAX;\n\tif (tid == source) {\n\t\tdistance = 0;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[tid * N + k] == 1) {\n\t\t\tif (distance + 1 < pathLength[k]) {\n\t\t\t\tpathLength[k] = distance + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n\tif (tid == dest) {\n\t\t*pathLength = distance;\n\t}\n}\n", + "\n\n int n = hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x;\n int m = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x;\n if (n == source) {\n pathLength[n] = 0;\n __syncthreads();\n for (int i = 0; i < N; ++i) {\n if (A[n * N + i] == 1) {\n pathLength[i] = 1;\n }\n }\n } else {\n __syncthreads();\n for (int i = 0; i < N; ++i) {\n if (A[n * N + i] == 1) {\n if (pathLength[i] != -1) {\n pathLength[n] = pathLength[i] + 1;\n }\n }\n }\n }\n __syncthreads();\n if (n == dest) {\n *pathLength = pathLength[n];\n }\n}\n", + "\n\n int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int k = 0; k < N; k++) {\n if (A[tid * N + k] == 1 && tid != k) {\n if (pathLength[tid] == -1 || pathLength[k] + 1 < pathLength[tid]) {\n pathLength[tid] = pathLength[k] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n if (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n pathLength[i] = pathLength[tid] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n if (pathLength[i] == -1 || pathLength[i] > 
pathLength[tid] + 1) {\n pathLength[i] = pathLength[tid] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == source) {\n pathLength[bid] = 0;\n } else {\n pathLength[bid] = -1;\n }\n __syncthreads();\n int minPath = pathLength[bid];\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] && pathLength[i] != -1) {\n minPath = min(minPath, pathLength[i] + 1);\n }\n }\n pathLength[bid] = minPath;\n __syncthreads();\n if (tid == dest) {\n *pathLength = minPath;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tif (tid == source && bid == 0) {\n\t\t*pathLength = 0;\n\t} else if (tid == dest && bid == N - 1) {\n\t\t*pathLength = -1;\n\t} else {\n\t\t*pathLength = INT_MAX;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tif (bid + 1 < N) {\n\t\t\t\tint next = i + (bid + 1) * N;\n\t\t\t\tatomicMin(&pathLength[next], *pathLength + 1);\n\t\t\t}\n\t\t\tif (bid - 1 >= 0) {\n\t\t\t\tint next = i + (bid - 1) * N;\n\t\t\t\tatomicMin(&pathLength[next], *pathLength + 1);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] == 1 && tid != i) {\n if (pathLength[tid] == -1) {\n pathLength[tid] = pathLength[i] + 1;\n } else {\n pathLength[tid] = min(pathLength[tid], pathLength[i] + 1);\n }\n }\n }\n __syncthreads();\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int i = tid / N;\n int j = tid % N;\n if (i == source) {\n pathLength[j] = 1;\n } else {\n pathLength[j] = -1;\n }\n __syncthreads();\n for (int k = 0; k < N; ++k) {\n if (pathLength[k] == -1) {\n continue;\n }\n if (A[i * N + k] == 1) {\n if (pathLength[j] == -1 || pathLength[j] > pathLength[k] + 1) {\n pathLength[j] = pathLength[k] + 1;\n }\n }\n }\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tridy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tint i = tidx;\n\tint j = tridy;\n\tif (i == source) {\n\t\tpathLength[i] = 0;\n\t} else {\n\t\tpathLength[i] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (tid == 
source && bid == dest) {\n *pathLength = 0;\n } else if (tid == dest && bid == source) {\n *pathLength = 0;\n } else if (A[bid * N + tid] == 1) {\n *pathLength = 1;\n } else {\n *pathLength = -1;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int distance = INT_MAX;\n if (tid == source) {\n distance = 0;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] && distance + 1 < pathLength[i]) {\n pathLength[i] = distance + 1;\n }\n }\n __syncthreads();\n if (tid == dest) {\n *pathLength = distance;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int distance = INT_MAX;\n if (tid == source) {\n distance = 0;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] == 1 && tid != i) {\n if (distance + 1 < distance) {\n distance = distance + 1;\n }\n }\n __syncthreads();\n }\n if (tid == dest) {\n *pathLength = distance;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "= Synagoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesogues Synag Synag Synag Synagoguesoguesoguesoguesogues", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The shortest path length is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform relaxation\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector gathered_dist(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, gathered_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, gathered_dist[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return shortest path length on rank 0\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 
1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length from source to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distance to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // If we have not reached the destination, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // If u is the destination, return the distance\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // If no path exists, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int dist[N];\n for (int i = 0; i < N; i++) {\n dist[i] = INT_MAX;\n }\n dist[source] = 0;\n\n int next[N];\n for (int i = 0; i < N; i++) {\n next[i] = -1;\n }\n\n int visited[N];\n for (int i = 0; i < N; i++) {\n visited[i] = false;\n }\n\n int u;\n while (true) {\n u = -1;\n for (int i = 0; i < N; i++) {\n if (!visited[i] && (u == -1 || dist[i] < dist[u])) {\n u = i;\n }\n }\n if (u == -1 || dist[u] == INT_MAX) {\n break;\n }\n visited[u] = true;\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] != 0 && dist[u] + 1 < dist[v]) {\n dist[v] = dist[u] + 1;\n next[v] = u;\n }\n }\n }\n\n if (rank == 0) {\n return dist[dest];\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n 
while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == std::numeric_limits::max()) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * num_ranks);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (my_rank == 0) {\n for (int i = 0; i < N * num_ranks; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector dist(N, -1);\n dist[source] = 0;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int curr = q.front();\n q.pop();\n\n if (curr == dest) {\n return dist[curr];\n }\n\n for (int i = 0; i < N; i++) {\n if (A[curr][i] == 1 && dist[i] == -1) {\n dist[i] = dist[curr] + 1;\n q.push(i);\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) 
{\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return shortest path length if destination is reachable\n if (dist[dest] != INT_MAX) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Iterate over all vertices\n for (int i = 0; i < N; ++i) {\n // Each rank computes the shortest path from source to all other vertices\n for (int j = 0; j < N; ++j) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n\n // Gather the distances from all ranks\n std::vector all_dist(N * num_ranks);\n MPI_Allgather(dist.data(), N, MPI_INT, all_dist.data(), N, MPI_INT, MPI_COMM_WORLD);\n\n // Update the distances on each rank\n for (int j = 0; j < N; ++j) {\n for (int k = 0; k < num_ranks; ++k) {\n if (all_dist[k * N + j] < dist[j]) {\n dist[j] = all_dist[k * N + j];\n }\n }\n }\n }\n\n // Return the shortest path length on rank 0\n if (my_rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == std::numeric_limits::max()) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "cuda", + "prompt": "/* Compute the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Store the result in pathLength.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\n__global__ void shortestPathLength(const int *A, size_t N, int source, int dest, int *pathLength) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\n\tint count = 0;\n\twhile (pathLength[dest] == -1) {\n\t\t__syncthreads();\n\t\tif (pathLength[tid] == count) {\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tif (A[tid * N + i] && pathLength[i] == -1) {\n\t\t\t\t\tpathLength[i] = count + 1;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tcount++;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tif (pathLength[j] == -1) {\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] && pathLength[k] != -1) {\n\t\t\t\tpathLength[j] = pathLength[k] + 1;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "_t = 0, 1, 2, 3", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\n\tif (i == source && j == dest) {\n\t\t*pathLength = 0;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 1;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = (A[i * N + j] == 1) ? 
1 : INT_MAX;\n\t}\n\telse {\n\t\tpathLength[j] = INT_MAX;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t}\n\telse if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t}\n\telse {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] != 0) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid_row == source) {\n\t\tpathLength[tid_col] = 1;\n\t} else {\n\t\tpathLength[tid_col] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid_row] > 0 && A[tid_row * N + i] > 0) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid_row] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_row * N + i] == 1 && tid_row != tid_col) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid_row] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1) {\n\t\t\tif (A[tid * N + i] == 1 && pathLength[i] == -1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = (A[i * N + j] == 1) ? 
1 : INT_MAX;\n\t} else {\n\t\tpathLength[j] = INT_MAX;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] != INT_MAX && A[k * N + j] == 1) {\n\t\t\tpathLength[j] = min(pathLength[j], pathLength[k] + 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\n\t__syncthreads();\n\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_row * N + i] == 1 && pathLength[tid_row] != -1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid_row] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\n\t__syncthreads();\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint distance = 0;\n\tif (tid == source) {\n\t\tdistance = 0;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tif (distance + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = distance + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == dest) {\n\t\t*pathLength = distance;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] == -1) {\n\t\t\tcontinue;\n\t\t}\n\t\tif (A[k * N + j] == 1) {\n\t\t\tif (pathLength[j] == -1 || pathLength[j] > pathLength[k] + 1) {\n\t\t\t\tpathLength[j] = pathLength[k] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid_row == source) {\n\t\tpathLength[tid_col] = 1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[tid_row * N + k] && pathLength[tid_row] > 0) {\n\t\t\tpathLength[tid_col] = min(pathLength[tid_col], pathLength[tid_row] + 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tidx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tidy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (tidx == source && tidx == tidy) {\n\t\tpathLength[tidx] = 0;\n\t} else if (tidx == tidy) {\n\t\tpathLength[tidx] = -1;\n\t} else if (A[tidx * N + tidy] == 1) {\n\t\tpathLength[tidx] = 1;\n\t} else {\n\t\tpathLength[tidx] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[tidx] == -1 || pathLength[tidy] == -1) {\n\t\t\tcontinue;\n\t\t}\n\t\tif (pathLength[tidx] + pathLength[tidy] + 1 < pathLength[k]) {\n\t\t\tpathLength[k] = pathLength[tidx] + pathLength[tidy] + 1;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = A[i * N + j];\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tint minPath = pathLength[j];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] >= 0 && A[j * N + k] > 0) {\n\t\t\tint newPath = pathLength[k] + A[j * N + k];\n\t\t\tif (newPath < minPath) {\n\t\t\t\tminPath = newPath;\n\t\t\t}\n\t\t}\n\t}\n\tpathLength[j] = minPath;\n\t__syncthreads();\n\tif (j == dest) {\n\t\t*pathLength = pathLength[j];\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in 
row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The shortest path length is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize the distance array to all infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize the queue with the source node\n std::queue q;\n q.push(source);\n\n // Perform a breadth-first search to find the shortest path\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to the destination\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Use OpenMP for parallel BFS\n #pragma omp parallel\n {\n while (true) {\n bool updated = false;\n\n // Relax all edges in parallel\n #pragma omp for\n for (size_t i = 0; i < N; ++i) 
{\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n updated = true;\n }\n }\n }\n\n // Check if any rank has updated distances\n int global_updated;\n MPI_Allreduce(&updated, &global_updated, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n // If no rank has updated distances, break\n if (!global_updated) {\n break;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length on rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] 
+ 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector recv_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &recv_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return recv_dist[dest];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Return the shortest path length\n return shortest_path_length;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n for (int i = tid; i < N; i += num_threads) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n 
std::vector dist_gathered(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Use OpenMP for parallelization within each rank\n #pragma omp parallel\n {\n // Perform BFS using a queue\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // For each neighbor of u\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n }\n\n // Gather the distances from all ranks to rank 0\n std::vector all_dist(N * MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Allgather(&dist[0], N, MPI::INT, &all_dist[0], N, MPI::INT);\n\n // Find the minimum distance to dest\n int min_dist = INT_MAX;\n for (int i = 0; i < N * MPI::COMM_WORLD.Get_size(); ++i) {\n if (all_dist[i] != INT_MAX && i / N == dest) {\n min_dist = std::min(min_dist, all_dist[i]);\n }\n }\n\n // Return the minimum distance\n return min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize the distance to the source to 0 and all other distances to infinity\n std::vector distance(N, std::numeric_limits::max());\n distance[source] = 0;\n\n // Initialize the queue with the source\n std::queue queue;\n queue.push(source);\n\n // While the queue is not empty\n while (!queue.empty()) {\n // Dequeue the next node\n int current = queue.front();\n queue.pop();\n\n // If the current node is the destination, return the distance\n if (current == dest) {\n return distance[current];\n }\n\n // For each neighbor of the current node\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A[current * N + i] == 1) {\n // If the distance to the neighbor is greater than the distance to the current node plus 1\n if (distance[i] > distance[current] + 1) {\n // Set the distance to the neighbor to the distance to the current node plus 1\n distance[i] = distance[current] + 1;\n // Enqueue the neighbor\n queue.push(i);\n }\n }\n }\n }\n\n // If the destination is not reachable, return -1\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0072769296, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163189934, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.0163286978, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163323704, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163207606, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163108742, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163213598, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163087202, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163303044, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01634012095, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i++) {\n if (i % N == dest) {\n shortest_path_length = std::min(shortest_path_length, global_dist[i]);\n }\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n for (int i = tid; i < N; i += num_threads) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances from all ranks to rank 0\n std::vector global_dist(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length on rank 0\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, global_dist[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + 
"are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int curr = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (curr == dest) {\n return dist[curr];\n }\n\n // Relax neighbors\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n if (A[curr * N + i] && dist[i] > dist[curr] + 1) {\n dist[i] = dist[curr] + 1;\n q.push(i);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0072923206, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0195302448, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Initialize the distance array.\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation.\n for (int i = 0; i < N - 1; i++) {\n#pragma omp parallel for\n for (int j = 0; j < N; j++) {\n for (int k = 0; k < N; k++) {\n if (A[j * N + k] != 0 && dist[j] + A[j * N + k] < dist[k]) {\n dist[k] = dist[j] + A[j * N + k];\n }\n }\n }\n }\n\n // Check for negative-weight cycles.\n for (int i = 0; i < N - 1; i++) {\n#pragma omp parallel for\n for (int j = 0; j < N; j++) {\n for (int k = 0; k < N; k++) {\n if (A[j * N + k] != 0 && dist[j] + A[j * N + k] < dist[k]) {\n return -1;\n }\n }\n }\n }\n\n // Return the shortest path length.\n return dist[dest];\n}\n", + 
"source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n if (A[source][i] == 1) {\n dist[i] = 1;\n }\n }\n\n #pragma omp barrier\n\n for (int j = 1; j < N; j++) {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n if (dist[i] == j) {\n #pragma omp for\n for (int k = 0; k < N; k++) {\n if (A[i][k] == 1 && dist[k] > j + 1) {\n dist[k] = j + 1;\n }\n }\n }\n }\n\n #pragma omp barrier\n }\n }\n\n // Gather distances on rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length on rank 0\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n int min_odd = INT_MAX;\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n\n return min_odd;\n}\n", + "&", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n },\n min_odd);\n return min_odd;\n}\n", + "\n\n int result = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& result) {\n if (x(i) % 2 == 1 && x(i) < result) {\n result = x(i);\n }\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n Kokkos::View y(\"y\", x.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n y(i) = (x(i) % 2 == 1) ? 
x(i) : std::numeric_limits::max();\n });\n int min = std::numeric_limits::max();\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, int& min) {\n min = std::min(min, y(i));\n });\n return min;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int n = x.extent(0);\n View> y(\"y\", n);\n\n parallel_for(\"smallestOdd\", n, KOKKOS_LAMBDA(const int i) {\n if (i % 2 == 1) {\n y(i) = x(i);\n } else {\n y(i) = std::numeric_limits::max();\n }\n });\n\n int min_odd = std::numeric_limits::max();\n for (int i = 0; i < n; ++i) {\n min_odd = std::min(min_odd, y(i));\n }\n\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n\n int min_odd = INT_MAX;\n\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n\n return min_odd;\n}\n", + "\n\n int min_odd = Kokkos::parallel_reduce(Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1) {\n min_odd = std::min(min_odd, x(i));\n }\n }, min_odd);\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto min_odd = INT_MAX;\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int smallest = Kokkos::Max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& smallest) {\n if (x(i) % 2 == 1 && x(i) < smallest) {\n smallest = x(i);\n }\n });\n return smallest;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Policy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) { min_odd = x(i); }\n }, min_odd);\n return min_odd;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Policy = Kokkos::RangePolicy;\n\n int smallest = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(Policy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& smallest) {\n if (x(i) % 2 == 1 && x(i) < smallest) {\n smallest = x(i);\n }\n });\n\n return smallest;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n int min_odd = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(\n policy_t(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n },\n min_odd);\n\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + 
"do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min:min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n min_odd = std::min(min_odd, x[i]);\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n#pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int minOdd = INT_MAX;\n #pragma omp parallel for reduction(min:minOdd)\n for (int i = 0; i < static_cast(x.size()); ++i) {\n if (x[i] % 2 != 0 && x[i] < minOdd) {\n minOdd = x[i];\n }\n }\n return minOdd;\n}\n", + ": 1", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n min_odd = std::min(min_odd, x[i]);\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n#pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel for reduction(min:smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = 
INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min:min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "serial", + "prompt": "/* Return the value of the smallest odd number in the vector x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 != 0) {\n return x[i];\n }\n }\n return -1;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest_odd) {\n smallest_odd = x[i];\n }\n }\n return smallest_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest 
= INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "smallest&", + "\n\n int smallest = -1;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && (smallest == -1 || x[i] < smallest)) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "hip", + "prompt": "/* Find the value of the smallest odd number in the vector x. Store it in smallest.\n Use AMD HIP to compute in parallel. The kernel is launched with the same number of threads as elements in x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\n__global__ void smallestOdd(const int *x, size_t N, int *smallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < 
smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "(", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n __syncthreads();\n if (smallest_local % 2 == 0) {\n return;\n }\n if (smallest_local < *smallest) {\n atomicMin(smallest, smallest_local);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n\n // Find the smallest odd number in the local workgroup\n __shared__ int smallest_shared;\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n // Reduce the local smallest values to a single global smallest value\n __syncthreads();\n if (tid == 0) {\n smallest_shared = smallest_local;\n for (int i = 1; i < hipBlockDim_x; i++) {\n if (smallest_shared > smallest_local) {\n smallest_shared = smallest_local;\n }\n }\n *smallest = smallest_shared;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < 
smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = std::numeric_limits::max();\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < my_smallest) {\n my_smallest = x[i];\n }\n }\n\n int smallest;\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n for (int 
i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (auto v : x) {\n if (v % 2 == 1) {\n my_smallest = std::min(my_smallest, v);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int min_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int n = x.size();\n int smallest = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "cuda", + "prompt": "/* Find the value of the smallest odd number in the vector x. Store it in smallest.\n Use CUDA to compute in parallel. 
The kernel is launched with the same number of threads as elements in x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\n__global__ void smallestOdd(const int *x, size_t N, int *smallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (threadIdx.x == 0) {\n *smallest = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "(", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (threadIdx.x == 0) {\n *smallest = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n atomicMin(smallest, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = start + block_size;\n int min = INT_MAX;\n for (int i = start; i < end && i < N; i++) {\n if (x[i] % 2 == 1 && x[i] < min) {\n min = x[i];\n }\n }\n smallest[tid] = min;\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < block_size; i++) {\n if (smallest[i] < smallest[0]) {\n smallest[0] = smallest[i];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] < smallest_local && x[i] % 2) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n if (tid == 0 || x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = block.thread.x;\n if (tid >= N) {\n return;\n }\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 
2 == 1) {\n atomicMin(smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1 && (x[tid] < *smallest || *smallest == -1)) {\n *smallest = x[tid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0723239658, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0748030652, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0375691434, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0189174848, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0096177778, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0051056576, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027945012, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020484456, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021374813, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002092426, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n 
}\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0722928792, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0747463128, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0375680496, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.018886631, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0096001568, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0050859064, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027123148, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021443634, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021211345, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021219308, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int my_smallest = INT_MAX;\n #pragma omp parallel for reduction(min: my_smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n\n int smallest;\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.072262276, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0773866892, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.038889277, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.019561633, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0099072348, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0052903662, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027929484, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021219392, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022630245, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022243096, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0723109528, + 
"runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0747347248, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0375997612, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.018879966, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0095909572, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0051971188, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028806366, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020628506, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020767815, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021415726, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on this rank\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0722554542, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0766585264, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.038562097, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.019415631, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0098400888, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0052006578, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029259556, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002125939, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021032047, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022226073, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int result = INT_MAX;\n #pragma omp parallel for reduction(min:result)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < result) {\n result = x[i];\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0724079412, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0747607396, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0375908146, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0189027246, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.0096027914, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0051653516, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028473102, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020732822, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020296943, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00201315145, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int smallest = INT_MAX;\n #pragma omp parallel\n {\n int local_smallest = INT_MAX;\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_smallest) {\n local_smallest = x[i];\n }\n }\n #pragma omp critical\n if (local_smallest < smallest) {\n smallest = local_smallest;\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.072771026, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0747668448, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0375838952, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.018989271, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0096075382, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0050537702, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028986906, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022039492, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020806286, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00205597125, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on this rank\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Reduce to find the global minimum\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0719519874, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0759994194, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0760780332, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0760163552, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.076036669, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0760235308, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0765405668, + 
"num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0760461594, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0761119775, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0761593094, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Find the smallest odd number on each rank\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0723075324, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0748161056, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.037884365, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0189391712, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0096090372, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0050818728, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027386602, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021121068, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020764454, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00209817795, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int smallest_odd = INT_MAX;\n #pragma omp parallel for reduction(min: smallest_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest_odd) {\n smallest_odd = x[i];\n }\n }\n return smallest_odd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0724686992, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0748347866, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0375772548, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0189198128, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0096104898, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0051103746, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027684804, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020962188, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020281844, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.00201422025, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int local_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on each rank\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.07228797, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0766789988, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0385006156, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.019410215, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0098217082, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0051825918, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028538284, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002084559, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021056684, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00235775605, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int my_min = INT_MAX;\n #pragma omp parallel for reduction(min:my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.072309444, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0767324028, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0385159748, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0193956816, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0098361514, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0052306138, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028763228, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002140743, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021700633, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00213038705, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int 
local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0723387192, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0747993178, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0376092634, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0189530552, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0096295336, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005127741, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027708648, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020796358, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020626567, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00216285695, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0723091988, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0748083938, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0378174112, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0189359638, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0096056902, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0050674718, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028612884, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002151184, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020790938, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00330938345, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int local_min = std::numeric_limits::max();\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 != 0) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n 
return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0719443822, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0647499704, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.064765287, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0648057548, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0647351266, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0648152086, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0647533588, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0647256506, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0648871057, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06481684285, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on this rank\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.072319442, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0767069418, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0385144952, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0194247768, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0098196938, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0051848218, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028786694, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002175474, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002123114, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00227017095, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min : local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + 
"did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0723076216, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0748057454, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0375766358, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0189257086, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0096128878, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0050867918, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028302042, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020926904, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021304849, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00210680485, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int my_min = INT_MAX;\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0722205306, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0766832472, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0385525364, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0193984052, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0098186498, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005278129, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028127034, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021041596, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020881046, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00224245135, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int my_smallest = INT_MAX;\n #pragma omp parallel for reduction(min: my_smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0722511562, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0748352202, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0376210048, + "num_procs": 1, + 
"num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0189425484, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0096006566, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0051851362, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027095254, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020905624, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002057927, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00212521735, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int smallest = INT_MAX;\n int my_smallest = INT_MAX;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n #pragma omp parallel for reduction(min:my_smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.072267306, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.074807791, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0375934372, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0189250894, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0096433028, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0050350368, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028074134, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021627098, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022004051, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037294526, + "num_procs": 4, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use Kokkos to compute product in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n double product = 1.0;\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (i % 2 == 0) {\n product *= x(i);\n } else {\n product /= x(i);\n }\n });\n return product;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n\n double product = 1.0;\n\n parallel_reduce(\n RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n\n return product;\n}\n", + "\n\n // Create a Kokkos view for the output\n Kokkos::View product = Kokkos::View(1);\n\n // Initialize the product to 1\n Kokkos::parallel_for(\"initialize_product\", Kokkos::RangePolicy(0, 1), KOKKOS_LAMBDA(const int& i) {\n product[i] = 1.0;\n });\n\n // Compute the product in parallel\n Kokkos::parallel_for(\"compute_product\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n if (i % 2 == 1) {\n product[0] *= 1.0 / x[i];\n } else {\n product[0] *= x[i];\n }\n });\n\n // Return the product\n return product[0];\n}\n", + "\n\n using namespace Kokkos;\n\n double product = 1.0;\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double product = 1.0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& result) {\n if (i % 2 == 1) {\n result *= 1.0 / x[i];\n } else {\n result *= x[i];\n }\n });\n\n return result;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, 
double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n },\n product);\n return product;\n}\n", + "\n\n double result = 1.0;\n for (int i = 0; i < x.extent(0); i += 2) {\n result *= x(i);\n }\n for (int i = 1; i < x.extent(0); i += 2) {\n result /= x(i);\n }\n return result;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x(i);\n } else {\n product *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n return product;\n}\n", + "_", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n },\n product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x(i);\n } else {\n product *= x(i);\n }\n }, product);\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. 
x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use OpenMP to compute product in parallel.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n int n = x.size();\n #pragma omp parallel for reduction(*: product)\n for (int i = 0; i < n; i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*: product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n int n = x.size();\n #pragma omp parallel for reduction(*:product)\n for (int i = 0; i < n; i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1.0 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for 
reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i += 2) {\n product *= x[i] / x[i + 1];\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "serial", + "prompt": "/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); 
i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i += 2) {\n product = product * x[i] / (x[i + 1] + 0.000000001);\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n for (size_t i = 1; i < x.size(); i += 2) {\n product /= x[i];\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "&", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "hip", + "prompt": "/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Store the result in product.\n Use AMD HIP to compute product in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\n__global__ void productWithInverses(const double *x, size_t N, double *product) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 1) {\n product[0] *= 1.0 / x[tid];\n } else {\n product[0] *= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (idx % 2 == 0) {\n product[0] *= x[idx];\n } else {\n product[0] /= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n atomicAdd(product, x[gid]);\n } else {\n atomicAdd(product, 1.0 / x[gid]);\n }\n }\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n product[tid] = x[tid];\n } else {\n product[tid] = 1 / x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 0) {\n product[0] *= x[tid];\n } else {\n product[0] /= x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double prod = 1.0;\n if (gid % 2 == 0) {\n prod *= x[gid];\n } else {\n prod /= x[gid];\n }\n\n hipAtomicAdd(product, prod);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n 
product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 1) {\n product[0] *= 1.0 / x[tid];\n } else {\n product[0] *= x[tid];\n }\n }\n}\n", + ".", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 0) {\n atomicAdd(product, x[tid]);\n } else {\n atomicAdd(product, 1.0 / x[tid]);\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use MPI to compute product in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the product on all ranks.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a new vector to store the product\n std::vector product(n);\n\n // Initialize the product to 1\n product[0] = 1;\n\n // Compute the product in parallel\n for (int i = 1; i < n; i++) {\n if (i % 2 == 1) {\n product[i] = product[i - 1] * (1 / x[i]);\n } else {\n product[i] = product[i - 1] * x[i];\n }\n }\n\n // Return the product\n return product[n - 1];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD,\n MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, 
&global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Reduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 0, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "cuda", + "prompt": "/* Compute the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Store the result in product.\n Use CUDA to compute product in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\n__global__ void productWithInverses(const double *x, size_t N, double *product) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n *product *= x[gid];\n } else {\n *product /= x[gid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n product[0] *= x[gid];\n } else {\n product[0] /= x[gid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n product[0] *= x[gid];\n } else {\n product[0] /= x[gid];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n product[idx] = 1.0;\n for (int i = 0; i < N; i += 2) {\n product[idx] *= x[i] / x[i + 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n product[0] *= x[idx];\n } else {\n product[0] /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n if (i % 2 == 1) {\n product[0] *= 1.0 / x[i];\n } else {\n product[0] *= x[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int 
tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n *product *= x[gid];\n } else {\n *product /= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use MPI and OpenMP to compute product in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the product on all ranks.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*: local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.024828716, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081448576, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042380478, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022854482, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014296402, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015285038, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017343058, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020054668, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0246844612, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081460938, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": 
true, + "runtime": 0.0041938508, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022734352, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015377856, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001530311, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017117596, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020379154, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n#pragma omp parallel for reduction(* : local_product)\n for (int i = start_index; i < end_index; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Reduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 0, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.023638559, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0082702428, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0043123792, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002255051, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001394312, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015109026, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016935554, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020138606, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008896059, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00043336405, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0245341512, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081396936, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042104334, + 
"num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023018306, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014611016, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015134796, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017169874, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020206448, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.024067257, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0080441624, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0080479436, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0080477972, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0080524542, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0080406302, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0080329152, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008044986, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0040410158, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020243407, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*: local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0248149162, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.008180304, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042556402, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002241688, + "num_procs": 1, 
+ "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014960348, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001549884, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001688863, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025453756, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0245071482, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081792048, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0043281306, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022516588, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013767034, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015106042, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017540372, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020302274, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0243449896, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0223353172, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.022067699, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0220797758, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.022089622, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0221002424, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.022152951, + "num_procs": 1, + 
"num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0221195004, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0172181873, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00872645465, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size();\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < local_size; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0245392144, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081663642, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004213225, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022951542, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015899672, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014940956, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017040174, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019738674, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n #pragma omp parallel for reduction(* : local_product)\n for (int i = start_index; i < end_index; i++) {\n if ((i + 1) % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0240890836, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0207676518, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0207672888, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0207694234, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0207711926, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.020771453, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.020882303, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0207841306, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0104228428, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00520311695, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.024682046, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081496288, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042411372, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022924432, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014828822, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014996576, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017447036, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019887214, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + 
"are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02464674, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0207796086, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0207825096, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0207950634, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0207760372, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0207890442, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0207680042, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.020811921, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0104327529, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00521020445, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0248527848, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081330158, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042585922, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023265154, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015507356, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014899316, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017096558, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020192164, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the product of the even indexed elements\n #pragma omp parallel for reduction(*: product)\n for (int i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n\n // Compute the product of the odd indexed elements\n double oddProduct = 1.0;\n #pragma omp parallel for reduction(*: oddProduct)\n for (int i = 1; i < x.size(); i += 2) {\n oddProduct *= 1.0 / x[i];\n }\n\n // Reduce the odd product across all ranks\n double globalOddProduct;\n MPI_Allreduce(&oddProduct, &globalOddProduct, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n // Multiply the even and odd products to get the final result\n product *= globalOddProduct;\n\n 
return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0237893336, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.010334759, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005387254, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030930772, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0024973396, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021978576, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019820972, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021691242, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021185968, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00214150725, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for reduction(*:local_product)\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0234048638, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00828773, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042644312, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023097616, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014237392, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014908074, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017114532, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020387034, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009668375, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00049944325, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, 
&global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0240106808, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.008148589, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004240444, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022500898, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014672268, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015171704, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017697224, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002027346, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001942116, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019993672, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size();\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < local_size; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0249119678, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.008149224, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042851664, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022244796, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014632556, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014857816, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017249404, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020019582, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the product of the even indexed elements\n #pragma omp parallel for reduction(*:product)\n for (int i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n\n // Compute the product of the odd indexed elements\n double oddProduct = 1.0;\n #pragma omp parallel for reduction(*:oddProduct)\n for (int i = 1; i < x.size(); i += 2) {\n oddProduct *= 1.0 / x[i];\n }\n\n // Reduce the odd 
product across all ranks\n double globalOddProduct;\n MPI_Allreduce(&oddProduct, &globalOddProduct, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n // Multiply the even and odd products\n product *= globalOddProduct;\n\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0242055262, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.010340121, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0053860436, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031084644, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002427056, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002161578, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019812096, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021386948, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021215784, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002139914, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for reduction(*:local_product)\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0231589886, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0082631182, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00427775, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002311641, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014963596, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014638046, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001658165, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020047654, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009835768, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005002481, + "num_procs": 4, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index 
of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use Kokkos to sum in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(Kokkos::View const& x, Kokkos::View const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Create a Kokkos view to store the minimum values at each index\n Kokkos::View min_values(\"min_values\", x.extent(0));\n\n // Compute the minimum values at each index in parallel\n Kokkos::parallel_for(\"min_values_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n min_values[i] = std::min(x[i], y[i]);\n });\n\n // Reduce the minimum values in parallel to get the sum\n double sum = Kokkos::parallel_reduce(\"sum_min_values_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += min_values[i];\n }, 0.0);\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Min;\n\n auto min_values = Kokkos::parallel_reduce(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, Reducer& red) {\n red.init(std::min(x(i), y(i)));\n });\n\n return min_values.get_value();\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMemberFunction = Kokkos::TeamMemberFunction;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::RangePolicy>(0, x.extent(0))),\n TeamMemberFunction([&](const int& i) -> double { return std::min(x(i), y(i)); }),\n 0.0);\n return sum;\n}\n", + "\n\n using execution_space = Kokkos::DefaultExecutionSpace;\n using l_double = Kokkos::LayoutRight;\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n\n Kokkos::View min_values(\"min_values\", x.extent(0));\n\n Kokkos::parallel_scan(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update = std::min(x(i), y(i));\n }, min_values);\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += min_values(i);\n }, sum);\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_left;\n using Reducer = Kokkos::Min;\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, Reducer& sum) {\n sum += std::min(x(i), y(i));\n });\n\n return sum.template get<0>();\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_left;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = typename TeamPolicy::member_type;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::DefaultExecutionSpace(), x.extent(0)), KOKKOS_LAMBDA(const TeamMember& team, double& sum) {\n const auto i = team.league_rank();\n sum += std::min(x(i), y(i));\n });\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = Kokkos::TeamMember;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::RangePolicy>(0, x.extent(0))),\n KOKKOS_LAMBDA(TeamMember, sum, x, y) {\n sum += std::min(x(TeamMember::team_rank()), y(TeamMember::team_rank()));\n });\n return sum;\n}\n", + "\n\n using 
Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n const auto min_xy = parallel_reduce(\n RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& sum) {\n sum += std::min(x(i), y(i));\n },\n 0.0);\n\n return min_xy;\n}\n", + "\n\n Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::make_span(x.data(), x.size()), 0.0,\n [](double a, double b) { return std::min(a, b); },\n [](double a, double b) { return a + b; });\n}\n", + "\n\n using namespace Kokkos;\n double sum = 0;\n auto min_lambda = KOKKOS_LAMBDA(const double& a, const double& b) { return std::min(a, b); };\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += min_lambda(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n // Create a Kokkos view of the sum\n Kokkos::View sum(\"sum\", x.extent(0));\n\n // Parallel loop to compute the sum of the minimum values\n Kokkos::parallel_for(\"sum_of_minimum_elements\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n sum(i) = std::min(x(i), y(i));\n });\n\n // Reduce the sum in parallel\n double sum_value = Kokkos::parallel_reduce(\n \"sum_of_minimum_elements_reduce\", Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& sum) { sum += sum(i); }, 0.0);\n\n return sum_value;\n}\n", + "Kokkoss", + "\n\n const int numElements = x.extent(0);\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::Range(0, numElements), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n auto min_xy = Kokkos::min(x, y);\n auto sum = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, min_xy.extent(0)),\n KOKKOS_LAMBDA(const int& i) -> double { return min_xy(i); },\n (double)0);\n return sum;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_lambda = KOKKOS_LAMBDA(const double& x, const double& y) { return std::min(x, y); };\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += min_lambda(x(i), y(i));\n }, sum);\n\n return sum;\n}\n", + "\n\n const int N = x.extent(0);\n double sum = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n auto min_values = min(x, y);\n double sum = 0;\n Kokkos::parallel_reduce(policy_type(0, min_values.extent(0)), KOKKOS_LAMBDA(int i, double& sum) {\n sum += min_values(i);\n }, sum);\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. 
sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use OpenMP to sum in parallel.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n double sum = 0;\n\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + ".", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma openmp parallel for reduction (+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n" + ] + }, + { + 
"problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "serial", + "prompt": "/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "hip", + "prompt": "/* Compute the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Store the result in sum.\n Use AMD HIP to sum in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\n__global__ void sumOfMinimumElements(const double *x, const double *y, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipGetGlobalIndex();\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n sum[tid] = min(x[gid], y[gid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n sum[gid] = min(x[gid], y[gid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n sum[tid] = min(x[tid], y[tid]);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use MPI to sum in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Return the sum on all ranks.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; 
i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double sum = 0;\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n 
MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "cuda", + "prompt": "/* Compute the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Store the result in sum.\n Use CUDA to sum in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\n__global__ void sumOfMinimumElements(const double *x, const double *y, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ", Synagogues_", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[0] += fmin(x[tid], y[tid]);\n }\n}\n", + "{", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n sum[tid] = fmin(x[tid], y[tid]);\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n sum[tid] = fmin(x[i], y[i]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use MPI and OpenMP to sum in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Return the sum on all ranks.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+ : local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.031552036, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163222126, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0087319176, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0076385798, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077527644, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0073999262, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0068249894, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061641418, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0315288908, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.016324904, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0088335158, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0075429368, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077669568, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0073104964, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006850724, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061651396, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n double sum = 0.0;\n int n = x.size();\n #pragma opm parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += std::min(x[i], 
y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.032834954, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0162727782, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0162980576, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0162844222, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0162883314, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0162868528, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.016310527, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0162696004, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163710667, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01627647805, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.031651826, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163070864, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0088092426, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0075428764, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077455066, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0073147426, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0067141176, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005994806, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int chunk = n / nprocs;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == nprocs - 1) {\n end = n;\n }\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+ : local_sum)\n for (int i = start; i < end; i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + 
"are_all_valid": true, + "best_sequential_runtime": 0.0313948254, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163251188, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0087590386, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077725014, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077461462, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0073507178, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0064704824, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0058632948, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033478042, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016711328, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0315359126, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.016294794, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008798915, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077368126, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077718288, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0073643642, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0068286804, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0059684124, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0316217756, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163390066, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0087496168, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.007563876, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00775355, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007377138, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0067342574, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005910462, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0317970262, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0162711894, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0087134428, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007704624, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077490152, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007339098, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0066650174, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0059361924, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0064992236, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0063610332, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0316758512, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163415134, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0087923468, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077118454, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007755135, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0073510288, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0068212386, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, 
+ "runtime": 0.0059627378, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0314777234, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163124668, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0088275808, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007662641, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007774363, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0074020044, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006516054, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0059543038, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0064235857, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00637355445, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0315166748, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0162835268, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0087852016, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0076136234, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077614206, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0072679104, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0067907618, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0059546304, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 
MPI_COMM_WORLD);\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0314467914, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163433028, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0087712956, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0075805174, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077682554, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007190347, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0067943418, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0059056686, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n int n = x.size();\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < n; i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0313714158, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163201032, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0087211916, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0075881256, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007792489, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00738483, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0068029926, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0060303768, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int num_elements = x.size();\n int elements_per_rank = num_elements / num_ranks;\n int start_index = my_rank * elements_per_rank;\n int end_index = start_index + elements_per_rank;\n\n for (int i = start_index; i < end_index; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0329109224, + "runs": [ + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.1145447436, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.114490003, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1146669086, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1145260766, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1145037946, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.114527942, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1144277122, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0571543761, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0286371387, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.031445968, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163383978, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0087851014, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007562888, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077458836, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0074232982, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0065224362, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061246398, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006447044, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0064194571, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double sum;\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.031676136, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163232834, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0087191358, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0075433738, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": 
true, + "runtime": 0.007732084, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008315422, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0067459636, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005982822, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0315754534, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163160678, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0087750066, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0075927618, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077656464, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007310519, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0067203032, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0059647756, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.031603051, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.016290009, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0087448362, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0075726404, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077315328, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0073473602, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0068348966, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0059417246, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": 
null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0311539502, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.016358812, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0087212362, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0075793774, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077494964, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007240263, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0066749218, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061163702, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0315274086, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163735174, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0087629178, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007601619, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077815506, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007276691, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006582208, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061458954, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::sum;\n\n const int N = x.extent(0);\n\n double sum_x = 0.0;\n parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) { sum += x(i); }, sum_x);\n\n return sum_x / N;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n double sum = 0;\n parallel_reduce(RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); }, sum);\n\n double count = x.extent(0);\n return sum / count;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n using namespace Kokkos;\n\n double sum = 0;\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n auto sum = Kokkos::parallel_reduce(Policy(0, x.extent(0)), Reducer(0.0),\n KOKKOS_LAMBDA(const int& i, Reducer& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n using namespace Kokkos;\n double sum = 0;\n auto policy = Kokkos::RangePolicy<>(0, x.size());\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.size();\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n using l_double = Kokkos::LDDouble;\n\n auto sum = Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, l_double& sum) {\n sum += x(i);\n },\n l_double(0.0));\n\n return static_cast(sum) / x.extent(0);\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n policy_type policy(x.extent(0));\n\n double sum = 0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n const auto sum = Kokkos::parallel_reduce(Policy(0, x.size()), Reducer(0.0),\n KOKKOS_LAMBDA(const int& i, const Reducer& sum) {\n return sum + x(i);\n });\n\n return sum / x.size();\n}\n", + "\n\n typedef Kokkos::RangePolicy policy_type;\n policy_type policy(x.extent(0));\n\n double sum = 0.0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n return Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::Kok\u30b3\u30af_EXEC_SPACE, x, 0.0, Kok\u30b3\u30af::plus(), Kok\u30b3\u30af::divide(x.size()));\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n auto sum = 
parallel_reduce(\n RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); }, 0.0);\n\n return sum / x.size();\n}\n", + "\n\n using execution_space = typename Kokkos::View::execution_space;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto sum = Kokkos::parallel_reduce(\n policy_type(x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n },\n (double)0.0, tag_type());\n\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.size();\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n double sum = 0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(int i, double& sum) { sum += x(i); }, sum);\n return sum / x.size();\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using reduction_type = Kokkos::Sum;\n\n auto sum = Kokkos::parallel_reduce(policy_type(0, x.size()), reduction_type(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); });\n\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use OpenMP to compute in parallel.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) 
{\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "serial", + "prompt": "/* Return the average of the vector x.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (double n : x) {\n sum += n;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double num : x) {\n sum += num;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n 
double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "hip", + "prompt": "/* Compute the average of the vector x. Store the result in average.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Examples:\n \n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\n__global__ void average(const double *x, size_t N, double *average) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (size_t i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double shared_sum;\n shared_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = shared_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (int i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double block_sum;\n block_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = block_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n atomicAdd(average, x[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n double sum = 0;\n size_t idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum += x[idx];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (int i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double sharedSum;\n sharedSum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = sharedSum / N;\n }\n}\n", + "\n\n int tid = 
hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n average[0] += x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = block.x;\n if (tid == 0) {\n *x = 0;\n for (int i = 0; i < N; i++) {\n *x += x[i];\n }\n *x = *x / N;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n double sum = 0;\n for (int i = tid; i < N; i += blockDim.x) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the average on all ranks.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg = sum / x.size();\n\n double global_avg;\n MPI_Allreduce(&avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n\n return global_avg;\n}\n", + "\n\n double sum = 0.0;\n for (double xi : x) {\n sum += xi;\n }\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0.0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / size;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (auto xi : x) {\n sum += xi;\n }\n double 
avg = sum / x.size();\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (auto v : x) {\n sum += v;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg = sum / x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / x.size();\n double global_avg;\n MPI_Allreduce(&avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n return global_avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double local_sum = 0;\n for (double xi : x) {\n local_sum += xi;\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "cuda", + "prompt": "/* Compute the average of the vector x. Store the result in average.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\n__global__ void average(const double *x, size_t N, double *average) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n average[0] += x[idx];\n }\n}\n", + "\n\n double sum = 0;\n for (size_t i = block_idx.x * block_dim.x + thread_idx.x; i < N; i += block_dim.x * grid_dim.x) {\n sum += x[i];\n }\n * average = sum / N;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n __shared__ double sum;\n sum = 0;\n if (i < N) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n\n double sum = 0;\n for (int i = tid; i < N; i += bid_size * grid_size) {\n sum += x[i];\n }\n\n __shared__ double shared_sum;\n shared_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = shared_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n double sum = 0;\n for (int i = tid + bid * bid_size; i < N; i += grid_size * bid_size) {\n sum += x[i];\n }\n double avg = sum / N;\n *average = avg;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n __shared__ double sum;\n if (i < N) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n double sum = 0;\n if (tid < N) {\n sum = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n double sum_all = 0;\n for (int i = 0; i < gridDim.x; i++) {\n sum_all += sum;\n }\n *average = sum_all / N;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + 
"parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the average of the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the average on all ranks.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / size;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_avg = 0.0;\n #pragma omp parallel for reduction(+:local_avg)\n for (int i = 0; i < x.size(); ++i) {\n local_avg += x[i];\n }\n local_avg /= x.size();\n\n double global_avg;\n MPI_Allreduce(&local_avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n\n return global_avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0054160804, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0158729682, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081112094, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0043655448, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032218012, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033266838, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032146656, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037442476, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0036984257, + 
"num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0036688862, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0054290528, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0158994922, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081455232, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0043411652, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032674288, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033209328, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033431526, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037467336, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0036943921, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0036373419, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0054222398, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0158836182, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081363902, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0043119668, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033173882, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033084778, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032434496, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037375168, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037161178, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00369622905, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0054043462, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0158790796, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081157658, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042897108, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0034374538, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033729508, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032468706, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037096224, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a sum variable to store the sum of the elements in the vector\n double sum = 0.0;\n\n // Use OpenMP to parallelize the loop over the elements in the vector\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n // Add the current element to the sum\n sum += x[i];\n }\n\n // Get the average by dividing the sum by the number of elements\n double avg = sum / n;\n\n // Return the average\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0054266924, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0158586574, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081334866, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0043176712, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032659598, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033055344, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, 
+ "is_valid": true, + "runtime": 0.0033044256, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037676692, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033945916, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00347367235, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0054204764, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0158502094, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081481966, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0043316956, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003337458, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033951242, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032633052, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037040398, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0034053656, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00323370665, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double local_avg = 0;\n int local_size = x.size();\n double sum = 0;\n\n#pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < local_size; i++) {\n sum += x[i];\n }\n\n local_avg = sum / local_size;\n\n double global_avg = 0;\n MPI_Allreduce(&local_avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= MPI_Comm_size(MPI_COMM_WORLD);\n\n return global_avg;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < 
x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00543113, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0158474626, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008109538, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042366234, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032768244, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033087826, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033512602, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037203366, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0054358254, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0158576536, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081097804, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004244127, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032976972, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033056, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032931888, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037774278, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "&", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + 
"did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0054253396, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0158515144, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081044928, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042495672, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0034414716, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003297221, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032206528, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037622734, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033974219, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00323358835, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n double global_sum = 0;\n\n #pragma omp parallel for reduction(+: local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n return sum / n;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0054214364, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0158510634, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081173446, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004234932, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033203542, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032939024, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032903796, + "num_procs": 1, + "num_threads": 32 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.0037346816, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033759886, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00324862815, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0054160724, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0158495904, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0080965694, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0043346154, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033368548, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003341323, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033375824, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037489744, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0034073907, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00325005025, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use Kokkos to reduce in parallel. 
Assume Kokkos is already initialized.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n auto xor_reduce = reduce(x, (bool)false, [](bool a, bool b) { return a ^ b; });\n return xor_reduce;\n}\n", + "\n\n // Create a Kokkos::TeamPolicy for the reduction.\n Kokkos::TeamPolicy policy(x.extent(0));\n\n // Define the reduction functor.\n struct XORFunctor {\n bool result = false;\n\n KOKKOS_INLINE_FUNCTION\n void operator()(const bool& x) const { result ^= x; }\n };\n\n // Create a Kokkos::ParallelReduce object to perform the reduction.\n Kokkos::ParallelReduce reduce(result);\n\n // Invoke the reduction.\n Kokkos::parallel_for(policy, reduce);\n\n // Return the result.\n return reduce.result;\n}\n", + "\n\n bool result = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& result) { result ^= x(i); }, result);\n return result;\n}\n", + "\n\n using namespace Kokkos;\n\n bool result = false;\n auto policy = Kokkos::RangePolicy(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n }, result);\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy>;\n using TeamMemberType = typename Policy::member_type;\n\n bool result = false;\n\n Kokkos::parallel_reduce(Policy(1), KOKKOS_LAMBDA(const TeamMemberType& team) {\n bool local_result = false;\n const bool* x_data = x.data();\n for (int i = team.league_rank(); i < x.size(); i += team.league_size()) {\n local_result ^= x_data[i];\n }\n team.team_reduce(KOKKOS_LAMBDA(const bool& a, const bool& b) { return a ^ b; }, local_result);\n }, result);\n\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x[i];\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x[i];\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy;\n using TeamMemberType = typename Policy::member_type;\n\n auto reduce = KOKKOS_LAMBDA(TeamMemberType team) {\n bool local_result = false;\n for (int i = team.league_rank(); i < x.extent(0); i += team.league_size()) {\n local_result ^= x(i);\n }\n return local_result;\n };\n\n bool global_result = Kokkos::parallel_reduce(Policy(x.extent(0)), reduce);\n return global_result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(x), Kokkos::Scan);\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i) {\n result ^= x(i);\n });\n return result;\n}\n", + "\n\n 
bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(false), x);\n}\n", + "\n\n using namespace Kokkos;\n bool result = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& result) {\n if (x(i)) {\n result = !result;\n }\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy;\n using Reducer = Kokkos::LogicalXor;\n\n return parallel_reduce(Policy(x.extent(0)), Reducer(false), x);\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(false),\n KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n}\n", + "\n\n using namespace Kokkos;\n\n auto reduce_functor = KOKKOS_LAMBDA(const bool& a, const bool& b) {\n return a ^ b;\n };\n\n auto reduce_result = reduce(x, reduce_functor, false);\n\n return reduce_result;\n}\n", + "\n\n bool result = x(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(1, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x(i);\n });\n return result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use OpenMP to reduce in parallel.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n {\n for (auto const& xi : x) {\n result ^= xi;\n }\n }\n return result;\n}\n", + ")", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(+:result)\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel 
reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel for reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "serial", + "prompt": "/* Return the logical XOR reduction of the vector of bools x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "(", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + ")", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return 
result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "hip", + "prompt": "/* Compute the logical XOR reduction of the vector of bools x. Store the result in output.\n Use AMD HIP to reduce in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\n__global__ void reduceLogicalXOR(const bool *x, size_t N, bool *output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n bool local_result = x[gid];\n __syncthreads();\n while (blockDim.x <= N) {\n if (tid == 0) {\n local_result ^= x[gid + blockDim.x];\n }\n __syncthreads();\n blockDim.x *= 2;\n __syncthreads();\n }\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int size = blockDim.x;\n bool local_result = x[bid * size + tid];\n __syncthreads();\n for (int i = size / 2; i > 0; i /= 2) {\n if (tid < i) {\n local_result ^= x[bid * size + tid + i];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int warp_size = warpSize;\n int num_warps = bid_size / warp_size;\n int warp_id = tid / warp_size;\n int lane_id = tid % warp_size;\n bool warp_result = x[bid * bid_size + tid];\n for (int i = 1; i < warp_size; i++) {\n warp_result ^= x[bid * bid_size + tid + i];\n }\n __shared__ bool shared_result[num_warps];\n shared_result[warp_id] = warp_result;\n __syncthreads();\n if (warp_id == 0) {\n bool result = shared_result[0];\n for (int i = 1; i < num_warps; i++) {\n result ^= shared_result[i];\n }\n *output = result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n bool local_result = x[tid];\n __syncthreads();\n while (blockDim.x <= N) {\n if (tid < N) { local_result ^= x[tid + blockDim.x]; }\n __syncthreads();\n blockDim.x *= 2;\n __syncthreads();\n }\n if (tid == 0) { *output = local_result; }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n bool local_result = x[gid];\n\n __syncthreads();\n\n while (bid > 0) {\n if (tid == 0) {\n local_result ^= x[gid + hipBlockDim_x];\n }\n bid >>= 1;\n gid >>= 1;\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n bool local_result = x[gid];\n\n while (gid < N) {\n local_result ^= x[gid];\n gid += hipBlockDim_x;\n }\n\n output[tid] = local_result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid 
< N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n bool result = false;\n while (gid < N) {\n result ^= x[gid];\n gid += hipBlockDim_x * hipGridDim_x;\n }\n output[tid] = result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n output[0] ^= output[i];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use MPI to reduce in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = false;\n for (bool b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int size, 
rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool result = x[rank];\n MPI_Allreduce(&result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n MPI_Allxor(&result, &result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = x[0];\n for (size_t i = 1; i < x.size(); i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "cuda", + "prompt": "/* Compute the logical XOR reduction of the vector of bools x. Store the result in output.\n Use CUDA to reduce in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\n__global__ void reduceLogicalXOR(const bool *x, size_t N, bool *output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Each thread reduces a single element of the output.\n int tid = threadIdx.x;\n output[tid] = x[tid];\n __syncthreads();\n\n // Reduce the output in place.\n while (tid < N) {\n output[tid] ^= output[tid + blockDim.x];\n tid += blockDim.x * 2;\n __syncthreads();\n }\n}\n", + "\n\n // Each thread reduces a single element of the output\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the output in place\n for (int stride = 1; stride < N; stride *= 2) {\n for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < N; tid += 2 * stride) {\n output[tid] ^= output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n // Each thread reduces a subset of the input.\n int tid = threadIdx.x;\n int stride = blockDim.x;\n bool local_result = false;\n for (int i = tid; i < N; i += stride) {\n local_result ^= x[i];\n }\n\n // Reduce the local results using a shared memory reduction.\n __shared__ bool shared_result[blockDim.x];\n shared_result[tid] = local_result;\n __syncthreads();\n\n // The first thread writes the final result to output.\n if (tid == 0) {\n bool final_result = false;\n for (int i = 0; i < blockDim.x; ++i) {\n final_result ^= shared_result[i];\n }\n *output = final_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n // Each thread reduces a single element of the output\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the output in place\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid] ^= output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncbarriers();\n while (N > 1) {\n if (tid < N / 2) {\n output[tid] ^= output[tid + N / 2];\n }\n __syncbarriers();\n N /= 2;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n bool result = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n result ^= x[i];\n }\n output[bid] = result;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n bool result = x[bid * stride + tid];\n __syncthreads();\n while (stride > 1) {\n if (tid < stride) {\n result ^= x[bid * stride + tid + stride];\n }\n stride /= 2;\n __syncthreads();\n }\n if (tid == 0) {\n *output = result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int num_elements = N;\n int num_blocks = (num_elements + bid_size - 1) / bid_size;\n\n bool local_result = x[tid];\n __syncthreads();\n while (bid_size > 1) {\n if (tid < bid_size / 2) {\n local_result ^= x[tid + bid_size / 2];\n }\n bid_size /= 2;\n __syncthreads();\n }\n if (bid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n 
__syncthreads();\n}\n", + "\n\n // Each thread reduces a single element of the vector\n int tid = threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the vector in parallel\n while (N > 1) {\n if (tid < N / 2) {\n output[tid] ^= output[tid + N / 2];\n }\n N /= 2;\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = blockDim.x; i < N; i += blockDim.x) {\n output[0] ^= output[i];\n }\n }\n}\n", + "\n\n // Each thread reduces a single element of the output.\n int idx = threadIdx.x;\n output[idx] = x[idx];\n\n // Each thread reduces a pair of elements of the output.\n while (idx < N) {\n output[idx] ^= x[idx + 1];\n idx += 2;\n }\n\n // Reduce the remaining elements of the output.\n for (int i = 1; i < N; i *= 2) {\n for (int j = 0; j < N; j += 2 * i) {\n output[j] ^= output[j + i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n bool local_result = x[tid];\n\n for (int i = tid + stride; i < N; i += stride) {\n local_result ^= x[i];\n }\n\n output[bid] = local_result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int num_blocks = gridDim.x;\n\n bool local_result = x[tid];\n\n __syncthreads();\n\n while (bid_size > 1) {\n if (tid < bid_size / 2) {\n local_result = local_result ^ x[tid + bid_size / 2];\n }\n bid_size /= 2;\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n bool result = x[bid * stride + tid];\n\n __syncthreads();\n\n for (int offset = stride; offset < N; offset *= 2) {\n if (tid < offset) {\n result ^= x[bid * stride + tid + offset];\n }\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x * gridDim.x;\n\n bool local_result = x[tid + bid * stride];\n\n for (int i = 2 * stride; i <= N; i *= 2) {\n if (tid < N / i) {\n local_result ^= x[tid + bid * i];\n }\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use MPI and OpenMP to reduce in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on all ranks.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(^: local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int nranks;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n\n bool local_result = false;\n for (auto b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + 
"generated_output": "\n\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Reduce on each rank\n bool local_result = false;\n #pragma omp parallel for reduction(^:local_result)\n for (size_t i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n // Reduce across all ranks\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int nranks;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n\n bool local_result = false;\n for (bool b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR,\n MPI_COMM_WORLD);\n return global_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n }\n\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int nranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(+:local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n bool result = x[0];\n #pragma omp parallel for reduction(^:result)\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": 
true, + "best_sequential_runtime": 0.0130463668, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0103776836, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0053822084, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027518034, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001534521, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001140668, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000592169, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003723988, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003805253, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00039771945, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool local_result = x[my_rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": ".", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool local_result = x[my_rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n\n bool result;\n MPI_Reduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, 0, MPI_COMM_WORLD);\n\n return result;\n}\n", + "source_write_success": 
true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool result = x[0];\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (int i = 1; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.012788902, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0104189412, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0054542098, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028654646, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001630878, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010776138, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006679092, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003956904, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004284012, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004028589, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = false;\n #pragma omp parallel for reduction(^:local_result)\n for (size_t i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Reduce the vector of bools using MPI_Reduce.\n bool result;\n MPI_Allreduce(MPI_IN_PLACE, &x[0], x.size(), MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n // Return the result.\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the array x into 
output.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(Kokkos::View const& x, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA (const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int i) {\n output(i) = 0;\n for (int j = i; j >= 0; j--) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n output(i) = 0;\n for (int j = i; j < n; ++j) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::View temp = output.slice(0, output.size() - 1);\n Kok\u30b3\u30af::View temp2 = output.slice(1, output.size());\n Kok\u30b3\u30af::View x_rev = x.reversed();\n Kok\u30b3\u30af::View temp3 = x_rev.slice(1, x_rev.size());\n Kok\u30b3\u30af::View temp4 = x_rev.slice(0, x_rev.size() - 1);\n Kok\u30b3\u30af::View temp5 = temp4 + temp3;\n temp = temp5;\n temp2 = temp5 + temp;\n\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n output(i) = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, i),\n KOKKOS_LAMBDA(const int& j, const int& val) { return val + x(i - j - 1); },\n 0);\n });\n}\n", + "(", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(\"reversePrefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i, int& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = x(i) + output(i - 1);\n }\n },\n output(n - 1));\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i) { output(i) = output(n - 1) - output(i); });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i, int& sum) {\n sum = (i == 0) ? 
x(i) : sum + x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n // Compute the reverse prefix sum\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n Kokkos::ScanSum(Kokkos::Reverse));\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n output(N - 1));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n output(i) = x(x.extent(0) - 1 - i);\n if (i > 0) output(i) += output(i - 1);\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA (const int i) {\n output(i) = 0;\n for (int j = n - 1; j >= i; j--) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n output(i) = (i == 0) ? x(i) : x(i) + output(i - 1);\n });\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Tag = Kokkos::TagType;\n\n Kokkos::parallel_for(Policy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n output(i) = x(x.extent(0) - 1 - i);\n if (i > 0) output(i) += output(i - 1);\n }, Tag());\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n 0);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use OpenMP to compute in parallel.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n 
for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n output[i] = x[i];\n } else {\n output[i] = output[i + 1] + x[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n output[i] = 0;\n for (int j = n - 1; j >= i; --j) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n output[i] = 0;\n for (int j = i; j < n; j++) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "serial", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 
1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output.back() = x.back();\n for (int i = x.size() - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n for (int i = x.size() - 1; i >= 0; --i) {\n if (i + 1 < x.size()) {\n output[i] = x[i] + output[i + 1];\n } else {\n output[i] = x[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output.back() = x.back();\n for (int i = x.size() - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\n__global__ void reversePrefixSum(const int *x, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n if (tid > 0) {\n output[tid] += output[tid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n output[gid] = x[N - 1 - gid];\n if (gid > 0) { output[gid] += output[gid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[N - 1 - gid] = x[gid];\n }\n __syncthreads();\n for (int i = N - 2; i >= 0; i--) {\n if (gid < N) {\n output[i] = output[i + 1] + x[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n int i = tid + bid * blockSize;\n if (i < N) {\n output[i] = 0;\n for (int j = N - 1; j >= i; j--) { output[i] += x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = blockDim.x / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int offset = bid * hipBlockDim_x + tid;\n if (offset < N) {\n output[offset] = x[N - 1 - offset];\n if (offset > 0) {\n output[offset] += output[offset - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n 
__syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum in reverse order\n std::vector prefix_sum(x.size());\n prefix_sum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n prefix_sum[i] = prefix_sum[i + 1] + x[i];\n }\n\n // Gather the prefix sums from all ranks\n std::vector gathered_prefix_sums(x.size() * size);\n MPI_Gather(prefix_sum.data(), x.size(), MPI_INT, gathered_prefix_sums.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gathered_prefix_sums[i * size + size - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_sum = 0;\n for (int i = n - 1; i >= 0; i--) {\n local_sum += x[i];\n if (rank == 0) {\n output[i] = local_sum;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n, 0);\n std::vector global_sums(n, 0);\n\n // Compute the partial sums on each rank\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n if (i > 0) {\n partial_sums[i] += partial_sums[i - 1];\n }\n }\n\n // Gather the partial sums from all ranks\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n global_sums[i] += global_sums[i + 1];\n }\n output[i] = global_sums[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.reserve(n);\n std::iota(std::execution::par, output.begin(), output.end(), 0);\n std::transform(std::execution::par, x.rbegin(), x.rend(), output.rbegin(),\n [](int a, int b) { return a + b; });\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n 
}\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n std::vector prefix_sum(n);\n MPI_Allreduce(x.data(), prefix_sum.data(), n, MPI_INT, MPI_SUM, comm);\n\n if (rank == 0) {\n output.resize(n);\n for (int i = 0; i < n; i++) {\n output[i] = prefix_sum[n - 1 - i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n local_output[i] = x[i];\n } else {\n local_output[i] = x[i] + local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n for (int i = 0; i < n; i++) {\n if (i == 0) {\n output[i] = global_output[i];\n } else {\n output[i] = global_output[i] - global_output[i - 1];\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n for (int i = n - 1; i >= 0; i--) {\n if (rank == 0) {\n output[i] = local_x[i];\n }\n MPI_Allreduce(MPI_IN_PLACE, &local_x[i], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n local_output[i] = output[i];\n }\n }\n if (rank == 0) {\n output = local_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_sum = 0;\n for (int i = x.size() - 1; i >= 0; i--) {\n local_sum += x[i];\n if (rank == 0) {\n output[i] = local_sum;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector global_sums(n);\n\n // Compute the partial sums on each rank\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n if (i > 0) {\n partial_sums[i] += partial_sums[i - 1];\n }\n }\n\n // Gather the partial sums from all ranks\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n global_sums[i] += global_sums[i + 1];\n }\n output[i] = global_sums[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector partial_sums(n);\n std::partial_sum(x.rbegin(), x.rend(), partial_sums.rbegin());\n\n std::vector global_sums(n);\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n local_output[i] = x[i];\n } else {\n local_output[i] = x[i] + local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n 
MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Reverse the global prefix sum\n for (int i = 0; i < n / 2; i++) {\n std::swap(global_output[i], global_output[n - i - 1]);\n }\n\n // Store the result in output on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the reverse prefix sum\n std::vector reverse_prefix_sum(x.size());\n reverse_prefix_sum[x.size() - 1] = prefix_sum[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] - x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n output = reverse_prefix_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefix_sum(n);\n std::vector reverse_prefix_sum(n);\n\n // Compute the prefix sum\n prefix_sum[0] = x[0];\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the reverse prefix sum\n reverse_prefix_sum[n - 1] = prefix_sum[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] + prefix_sum[i];\n }\n\n // Gather the reverse prefix sum on rank 0\n MPI_Gather(&reverse_prefix_sum[0], n, MPI_INT, &output[0], n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n local_output[i] = x[i];\n if (i + 1 < n) {\n local_output[i] += local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i + 1 < n) {\n global_output[i] -= global_output[i + 1];\n }\n }\n\n // Store the result on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector global_sums(n);\n\n // Compute the partial sums on each rank\n partial_sums[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n partial_sums[i] = partial_sums[i + 1] + x[i];\n }\n\n // Gather the partial sums into the global_sums vector on rank 0\n MPI_Gather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n output[0] = global_sums[0];\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + global_sums[n - i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x in reverse order\n std::vector reversePrefixSum(x.size());\n reversePrefixSum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reversePrefixSum[i] = reversePrefixSum[i + 1] + x[i];\n }\n\n // Gather the reverse prefix sums from all ranks\n std::vector 
gatheredReversePrefixSum(x.size() * size);\n MPI_Gather(reversePrefixSum.data(), x.size(), MPI_INT, gatheredReversePrefixSum.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the final reverse prefix sum\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gatheredReversePrefixSum[i * size];\n }\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Compute the prefix sum of the vector x in reverse order.\n std::vector reverse_prefix_sum(x.size());\n reverse_prefix_sum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] + x[i];\n }\n\n // Gather the reverse prefix sums from all ranks.\n std::vector gathered_reverse_prefix_sums(x.size() * num_ranks);\n MPI_Gather(reverse_prefix_sum.data(), x.size(), MPI_INT, gathered_reverse_prefix_sums.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the output vector.\n if (my_rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gathered_reverse_prefix_sums[i * num_ranks + my_rank];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n for (int i = 0; i < n; i++) {\n local_output[i] = x[n - i - 1];\n }\n\n std::vector global_output(n);\n MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector recvbuf(n);\n std::vector sendbuf(n);\n for (int i = 0; i < n; i++) {\n sendbuf[i] = x[n - i - 1];\n }\n MPI_Allgather(sendbuf.data(), n, MPI_INT, recvbuf.data(), n, MPI_INT, MPI_COMM_WORLD);\n int sum = 0;\n for (int i = 0; i < n; i++) {\n sum += recvbuf[i];\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\n__global__ void reversePrefixSum(const int *x, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (int j = block_size / 2; j > 0; j /= 2) {\n if (tid < j) {\n output[tid] += output[tid + j];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n for (int i = tid + 1; i < N; i++) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n int offset = bid * stride;\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i + offset];\n output[i + offset] = sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N - 1; i >= 0; i--) {\n if (tid == i) {\n output[i] += output[i + 1];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid == 0) {\n output[0] = x[0];\n } else {\n output[tid] = output[tid - 1] + x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int i = blockIdx.x * blockDim.x + tid;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (int j = i + blockDim.x; j < N; j += blockDim.x) {\n output[j] = output[j - blockDim.x] + x[j];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int sum = 0;\n for (int i = tid; i < N; i += stride) {\n sum += x[i];\n output[N - 1 - i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n output[i] = 0;\n for (int j = i + 1; j < N; j++) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n int offset = bid * stride;\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i + offset];\n output[i + offset] = sum;\n }\n}\n", + "\n\n 
int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n if (i > 0) { output[i] += output[i - 1]; }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n int offset = block * stride + tid;\n\n if (offset < N) {\n output[offset] = x[offset];\n }\n\n __syncthreads();\n\n for (int i = offset + stride; i < N; i += stride) {\n output[i] = output[i - stride] + x[i];\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n if (tid == 0) {\n output[0] = x[0];\n } else {\n output[tid] = output[tid - 1] + x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_size * block_id + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int stride = blockDim * gridDim;\n int start = tid + blockId * blockDim;\n int end = min(start + stride, N);\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector reverse_partial_sums(n);\n\n // Compute the partial sums in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n partial_sums[i] = x[i];\n for (int j = i + 1; j < n; ++j) {\n partial_sums[j] += partial_sums[j - 1];\n }\n }\n\n // Compute the reverse partial sums in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; --i) {\n reverse_partial_sums[i] = partial_sums[i];\n if (i > 0) {\n reverse_partial_sums[i] += reverse_partial_sums[i - 1];\n }\n }\n\n // Gather the reverse partial sums on rank 0\n MPI_Gather(&reverse_partial_sums[0], n, MPI_INT, &output[0], n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local prefix sum\n std::vector local_prefix_sum(n);\n local_prefix_sum[0] = x[0];\n for (int i = 1; i < n; i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n\n // Compute the global prefix sum\n std::vector global_prefix_sum(n);\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), n, MPI_INT, MPI_SUM, 
MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n if (rank == 0) {\n output.resize(n);\n output[n - 1] = global_prefix_sum[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = global_prefix_sum[i] - global_prefix_sum[i + 1];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_output[i] = local_x[n - 1 - i];\n }\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n std::vector local_prefix_sum(x.size());\n local_prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n\n // Compute the global prefix sum\n std::vector global_prefix_sum(x.size());\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), x.size(), MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = global_prefix_sum[x.size() - 1] - global_prefix_sum[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int n = x.size();\n std::vector partial_sums(n);\n\n // Compute the partial sums in parallel\n #pragma omp 
parallel for\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n partial_sums[j] += partial_sums[j - 1];\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sums(n);\n MPI_Reduce(partial_sums.data(), global_sums.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum on rank 0\n if (my_rank == 0) {\n output.resize(n);\n output[n - 1] = global_sums[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = global_sums[i] - global_sums[i + 1];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local reverse prefix sum\n std::vector local_output(n);\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (my_rank == 0) {\n output = global_output;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n std::vector global_output(n);\n MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the prefix sum in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_output[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n local_output[i] += x[j];\n }\n }\n\n // Reduce the local prefix sums to the global prefix sum on rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Reverse the prefix sum on rank 0\n if (rank == 0) {\n for (int i = 0; i < n / 2; i++) {\n std::swap(output[i], output[n - i - 1]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": 
true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n prefix_sum[i] += prefix_sum[i + 1];\n }\n }\n\n // Gather the results on rank 0\n if (my_rank == 0) {\n output = prefix_sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &n_procs);\n int local_n = n / n_procs;\n int remainder = n % n_procs;\n if (my_rank == 0) {\n output.resize(n);\n }\n std::vector local_x(local_n + (my_rank < remainder ? 1 : 0));\n MPI_Scatter(x.data(), local_n, MPI_INT, local_x.data(), local_n, MPI_INT, 0,\n MPI_COMM_WORLD);\n std::vector local_output(local_n + (my_rank < remainder ? 1 : 0));\n local_output[local_n - 1] = local_x[local_n - 1];\n #pragma omp parallel for\n for (int i = local_n - 2; i >= 0; --i) {\n local_output[i] = local_x[i] + local_output[i + 1];\n }\n MPI_Gather(local_output.data(), local_n, MPI_INT, output.data(), local_n, MPI_INT,\n 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) 
{\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Reverse the prefix sum in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n output[i] = prefix_sum[n - 1 - i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nranks; i++) {\n std::vector recv_buf(n);\n MPI_Recv(recv_buf.data(), n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n output[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n prefix_sum[i] -= prefix_sum[i + 1];\n }\n output[i] = prefix_sum[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = x.size() - 1; i >= 0; i--) {\n if (i < x.size() - 1) {\n prefix_sum[i] -= prefix_sum[i + 1];\n }\n output[i] = prefix_sum[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n local_output[i] = local_x[i];\n if (i + 1 < n) {\n local_output[i] += local_output[i + 1];\n }\n }\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of the vector x\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum of the vector x\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n output[i] = prefix_sum[x.size() - 1] - prefix_sum[i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(&output[0], x.size(), MPI_INT, &output[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + 
"num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n using namespace Kokkos;\n\n const size_t n = x.extent(0);\n View y(\"y\", n);\n\n parallel_for(n, KOKKOS_LAMBDA(const size_t i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return sum(y);\n}\n", + "\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n using l_d_type = Kokkos::LayoutLeft;\n using l_d_view_type = Kokkos::View;\n\n const size_t n = x.extent(0);\n l_d_view_type x_copy(\"x_copy\", n);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::parallel_scan(policy_type(0, n), KOKKOS_LAMBDA(const size_t& i, l_d_view_type& x_copy) {\n if (i > 0) { x_copy(i) += x_copy(i - 1); }\n }, tag_type());\n double sum = 0;\n for (size_t i = 0; i < n; ++i) { sum += x_copy(i); }\n return sum;\n}\n", + "\n using namespace Kokkos;\n\n const int N = x.extent(0);\n View y(\"y\", N);\n\n parallel_for(N, KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return reduce(N, KOKKOS_LAMBDA(int i, double sum) { return sum + y(i); }, 0.0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_type;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = typename TeamPolicy::member_type;\n\n double sum = 0.0;\n Kokkos::parallel_reduce(TeamPolicy(Kokkos::Experimental::partition_by_coordinates(x)),\n KOKKOS_LAMBDA(TeamMember& team) {\n double local_sum = 0.0;\n for (auto i = team.league_rank(); i < x.extent(0);\n i += team.league_size()) {\n local_sum += x(i);\n }\n team.team_reduce(KOKKOS_LAMBDA(double& tsum,\n double& lsum) { tsum += lsum; });\n if (team.team_rank() == 0) { sum = team.team_reduce(); }\n });\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using execution_space = Kokkos::DefaultExecutionSpace;\n using policy_type = Kokkos::RangePolicy;\n\n // Compute the prefix sum array\n View prefix_sum(\"prefix_sum\", x.size());\n parallel_for(policy_type(0, x.size()), KOKKOS_LAMBDA(const int i) {\n prefix_sum(i) = (i == 0) ? 
x(i) : x(i) + prefix_sum(i - 1);\n });\n\n // Compute the sum of the prefix sum array\n double sum = 0;\n parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int i, double& sum) {\n sum += prefix_sum(i);\n }, sum);\n\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n using execution_space = Kokkos::DefaultExecutionSpace;\n\n const int n = x.extent(0);\n auto y = Kokkos::View(\"y\", n);\n\n Kokkos::parallel_for(\n \"prefix_sum\", policy_t(0, n), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return Kokkos::parallel_reduce(\n \"sum_prefix_sum\", policy_t(0, n), KOKKOS_LAMBDA(const double& a,\n const double& b) {\n return a + b;\n },\n y);\n}\n", + "\n using namespace Kokkos;\n\n using policy_type = Kokkos::TeamPolicy;\n policy_type policy(x.extent(0));\n\n double sum = 0.0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n double prefix_sum = 0.0;\n for (int j = 0; j <= i; ++j) { prefix_sum += x(j); }\n sum += prefix_sum;\n });\n\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n auto prefix_sum = Kokkos::scan(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n },\n Reducer(0));\n\n return Kokkos::parallel_reduce(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += prefix_sum(i);\n },\n Reducer(0));\n}\n", + "\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::TeamPolicy;\n\n double sum = 0;\n Kokkos::parallel_reduce(\n TeamPolicy<>(x.extent(0), Kokkos::AUTO),\n KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n sum += update;\n },\n sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a prefix sum view\n View prefix_sum = create_mirror_view(x);\n\n // Compute the prefix sum\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = prefix_sum(i - 1) + x(i);\n }\n prefix_sum(i) = sum;\n });\n\n // Compute the sum of the prefix sum array\n double sum_of_prefix_sum = 0.0;\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += prefix_sum(i);\n }, sum_of_prefix_sum);\n\n return sum_of_prefix_sum;\n}\n", + "\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using ViewType = Kokkos::View;\n\n const size_t n = x.extent(0);\n ViewType prefix_sum(\"prefix_sum\", n);\n\n parallel_for(\"prefix_sum\", Policy(0, n), KOKKOS_LAMBDA(const size_t& i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n\n return prefix_sum(n - 1);\n}\n", + "\n Kokkos::View prefixSum = Kokkos::View(x.extent(0));\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = prefixSum(i - 1) + x(i);\n }\n }, prefixSum);\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n update += prefixSum(i);\n }, sum);\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.extent(0));\n Kokkos::parallel_for(\"prefix_sum_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n return Kokkos::parallel_reduce(\"sum_prefix_sum\", 
Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double& sum) {\n sum += prefix_sum(i);\n }, 0.0);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto prefix_sum = Kokkos::scan(policy_type(0, x.size()), tag_type(), x,\n [=](const double& a, const double& b) { return a + b; });\n\n return Kokkos::parallel_reduce(policy_type(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += prefix_sum(i); },\n 0.0);\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n prefix_sum(i) = (i == 0) ? x(i) : x(i) + prefix_sum(i - 1);\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n double sum = 0;\n Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n sum += update;\n },\n tag_type());\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = prefix_sum(i - 1) + x(i);\n }\n prefix_sum(i) = sum;\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reduction = Kokkos::Sum;\n\n // Compute the prefix sum array\n View y = Kokkos::scan(Policy(0, x.size()), Reduction(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n // Compute the sum of the prefix sum array\n double sum = Kokkos::reduce(Policy(0, y.size()), Reduction(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += y(i);\n });\n\n return sum;\n}\n", + "\n using Kokkos::parallel_reduce;\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n }, sum);\n\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n std::vector prefixSum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefixSum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for 
reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = (i > 0 ? prefix_sum[i - 1] : 0) + x[i];\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n, 0);\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n sum += x[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) prefix_sum[i] += prefix_sum[i - 1];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n }\n for (double v : x) {\n sum += v;\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n }\n for (double v : x) {\n sum += v;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n #pragma omp parallel for reduction(+: sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n std::vector prefixSum(x.size(), 0.0);\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) { prefixSum[i] += prefixSum[i - 1]; }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 
1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n // Compute the prefix sum array\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sum array\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n return sum;\n}\n", + "\n\n // Create a vector to store the prefix sum\n std::vector prefix_sum(x.size());\n\n // Compute the prefix sum in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sum\n double sum = 0;\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n return sum;\n}\n", + "\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = (i > 0 ? prefix_sum[i - 1] : 0) + x[i];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "serial", + "prompt": "/* Compute the prefix sum array of the vector x and return its sum.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (auto xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "(", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double 
sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "hip", + "prompt": "/* Compute the prefix sum array of the vector x and compute its sum. Store the result in sum.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\n__global__ void sumOfPrefixSum(const double *x, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n sum[tid + stride] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n\n double sum_local = 0;\n for (int i = gid; i < N; i += bid_size) {\n sum_local += x[i];\n }\n\n double sum_global;\n hipReduce(&sum_global, &sum_local, 1, hipReduceSum, 0);\n *sum = sum_global;\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid < N) {\n sum[gid] = x[gid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < hipBlockDim_x; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n double prefix_sum = 0;\n for (int i = 0; i <= gid; i++) {\n prefix_sum += x[i];\n }\n sum[gid] = prefix_sum;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n sum[tid] += sum[tid + i];\n }\n __syncthreads();\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum_local = x[tid];\n __syncthreads();\n for (int i = 1; i < blockDim.x; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n sum_local += x[tid + i];\n }\n __syncthreads();\n }\n sum[tid] = sum_local;\n }\n __syncthreads();\n if (tid == 0) {\n double sum_global = 0;\n for (int i = 0; i < N; i++) {\n sum_global += sum[i];\n }\n *sum = sum_global;\n }\n}\n", + "\n\n int tid = blockIdx.x 
* blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n sum[bid] += x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n size_t stride = blockDim.x * gridDim.x;\n double localSum = 0;\n for (size_t i = tid; i < N; i += stride) {\n localSum += x[i];\n }\n __syncthreads();\n double globalSum = localSum;\n for (size_t i = 1; i < blockDim.x; i++) {\n globalSum += localSum;\n }\n __syncthreads();\n *sum = globalSum;\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n sum[tid + stride] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n if (tid > 0) { sum[tid] += sum[tid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n __syncthreads();\n if (tid > 0) { sum[tid] += sum[tid - 1]; }\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 0; i < N; i++) { sum[0] += sum[i]; }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "1\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sum\n double sum = 0;\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), 
prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = std::accumulate(prefix_sum.begin(), prefix_sum.end(), 0.0);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n\n double prefix_sum = 0;\n MPI_Allreduce(&sum, &prefix_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return prefix_sum;\n } else {\n return 0;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefixSum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum array\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sum array\n double sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n MPI_Allreduce(&sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n }\n return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "cuda", + "prompt": "/* Compute the prefix sum array of the vector x and compute its sum. Store the result in sum.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\n__global__ void sumOfPrefixSum(const double *x, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n double sum_local = 0;\n for (int i = tid; i < N; i += stride) {\n sum_local += x[i];\n }\n __shared__ double sum_shared[blockDim.x];\n sum_shared[tid] = sum_local;\n __syncthreads();\n if (tid == 0) {\n double sum_block = 0;\n for (int i = 0; i < blockDim.x; i++) {\n sum_block += sum_shared[i];\n }\n atomicAdd(sum, sum_block);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n const int bid = blockIdx.x;\n const int bid_size = blockDim.x;\n const int gid = bid * bid_size + tid;\n\n if (gid < N) {\n double partial_sum = x[gid];\n if (gid > 0) {\n partial_sum += x[gid - 1];\n }\n sum[gid] = partial_sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = tid + bid * bid_size;\n if (i < N) {\n sum[i] = x[i];\n }\n __syncthreads();\n for (int s = bid_size / 2; s > 0; s >>= 1) {\n if (i < N && tid < s) {\n sum[i] += sum[i + s];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum_prefix = 0;\n for (int i = 0; i <= tid; i++) {\n sum_prefix += x[i];\n }\n double sum_suffix = 0;\n for (int i = tid + 1; i < N; i++) {\n sum_suffix += x[i];\n }\n sum[tid] = sum_prefix + sum_suffix;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * 
blockDim.x + threadIdx.x;\n if (tid < N) {\n double partial_sum = 0;\n for (int i = 0; i <= tid; i++) { partial_sum += x[i]; }\n sum[tid] = partial_sum;\n }\n __syncthreads();\n if (tid == 0) {\n double total_sum = 0;\n for (int i = 0; i < N; i++) { total_sum += sum[i]; }\n *sum = total_sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = 1; i < N; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double prefixSum = 0;\n for (int i = 0; i <= tid; i++) {\n prefixSum += x[i];\n }\n sum[tid] = prefixSum;\n }\n __syncthreads();\n if (tid == 0) {\n double totalSum = 0;\n for (int i = 0; i < N; i++) {\n totalSum += sum[i];\n }\n *sum = totalSum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = bid * bid_size + tid;\n\n double prefix_sum = 0;\n while (i < N) {\n prefix_sum += x[i];\n x[i] = prefix_sum;\n i += grid_size * bid_size;\n }\n\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) { *sum += x[i]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n double s = 0;\n for (int i = 0; i <= gid; i++) {\n s += x[i];\n }\n sum[gid] = s;\n if (gid > 0) {\n sum[gid] += sum[gid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(sum, x[tid]);\n }\n __syncthreads();\n if (tid == 0) {\n double sum_temp = *sum;\n atomicAdd(sum, sum_temp);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(prefixSum.data(), &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) return sum;\n else return 0.0;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[0], &sum, x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result 
on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefix_sum[x.size() - 1], 1, MPI_DOUBLE, &sum, 1, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n double global_sum = 0.0;\n\n // Compute the prefix sum of the local vector\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 1; i < x.size(); i++) {\n x[i] += x[i - 1];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum of the global prefix sum on rank 0\n if (rank == 0) {\n return global_sum;\n } else {\n return 0.0;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double sum = 0.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); i++) {\n double local_sum = 0.0;\n for (int j = 0; j <= i; j++) {\n local_sum += x[j];\n }\n prefix_sum[i] = local_sum;\n }\n\n MPI_Reduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; 
i < prefixSum.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0106755742, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.022669491, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) prefix_sum[i] += prefix_sum[i - 1];\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) return sum;\n else return 0;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.010625196, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0223945492, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0100370204, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0102755762, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0084190346, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0073982716, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0075236768, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0086067208, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0104047906, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.011396782, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0112706589, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0110966566, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = 
x.size();\n std::vector prefixSum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < n; ++i) {\n sum += prefixSum[i];\n }\n\n if (rank == 0) return sum;\n return 0;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0101143376, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0218788654, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sumOfPrefixSum = 0.0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sumOfPrefixSum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sumOfPrefixSum;\n } else {\n return 0.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(prefix_sum.data(), &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": 
true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < n; ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.010654338, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0224760962, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of each rank's portion of x\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Reduce the prefix sums across all ranks\n std::vector global_prefix_sum(x.size());\n MPI_Allreduce(prefix_sum.data(), global_prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the sum of the prefix sum array\n double sum = 0.0;\n for (int i = 0; i < x.size(); i++) {\n sum += global_prefix_sum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0102129094, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0277838876, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefixSum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n double sum = 0;\n MPI_Reduce(&prefixSum[n - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n 
double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0106347954, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0224274572, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < prefixSum.size(); i++) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0100779682, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0108853192, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0090294788, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0074535702, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0075351348, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091711148, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01044557, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.011430602, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0112983597, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01113546395, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector prefix_sum(n);\n std::vector global_prefix_sum(n);\n\n // Compute the prefix sum on each rank\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Reduce the prefix sums from all ranks\n MPI_Allreduce(prefix_sum.data(), global_prefix_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum of the prefix sum on rank 0\n if (rank == 0) {\n return global_prefix_sum[n - 1];\n } else {\n return 0.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "scan", 
+ "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace the i-th element of the array x with the minimum value from indices 0 through i.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 0; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n\n Kokkos::parallel_for(\n Kokkos::RangePolicy>(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n x(i) = x(i);\n } else {\n x(i) = Kokkos::min(x(i), x(i - 1));\n }\n },\n Layout());\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) 
{\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = Kokkos::min(x(i), Kokkos::min(x, i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n float min = x[0];\n for (int j = 1; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "", + "\n\n int n = x.size();\n float min = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for shared(min)\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n std::vector mins(n);\n mins[0] = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n mins[i] = std::min(mins[i - 1], x[i]);\n }\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = mins[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 
1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n std::vector min(n);\n min[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min[i] = std::min(min[i - 1], x[i]);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = min[i];\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "serial", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], *std::min_element(x.begin(), x.begin() + i));\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "<, <, \u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:>\u51a2\u51a2\u51a28\u51a2\u51a2:>\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:>:>", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + 
"s\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2cernecernecerne,\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "hip", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\n__global__ void partialMinimums(float *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n float min = x[0];\n for (int j = 1; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n if (x[i] < min) { min = x[i]; }\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int 
bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n x[i] = min;\n }\n if (rank == 0) {\n MPI_Reduce(&x[0], &x[0], x.size(), MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(&x[0], nullptr, x.size(), MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n } else {\n MPI_Send(&x[rank], 1, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = x[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < size; i++) {\n MPI_Send(&x[i - 1], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n float min;\n MPI_Recv(&min, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x[i - 1] = std::min(x[i - 1], min);\n }\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Bcast(&x[i - 1], 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks onto rank 0\n std::vector min_values(x.size());\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &min_values[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Update the values on rank 0\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], min_values[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = 
x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = x[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < size; i++) {\n MPI_Send(&x[i - 1], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n float min;\n MPI_Recv(&min, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x[i - 1] = std::min(x[i - 1], min);\n }\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Bcast(&x[i - 1], 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "cuda", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\n__global__ void partialMinimums(float *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n float min = x[0];\n for (size_t j = 1; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n __shared__ float min;\n\n if (tid == 0) {\n min = x[bid * bid_size];\n }\n\n __syncthreads();\n\n for (int i = bid * bid_size + tid; i < N; i += bid_size) {\n min = fminf(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < size) {\n float min = x[i];\n for (int j = i + 1; j < size; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, 
x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n if (tid == 0) {\n x[bid * bid_size] = x[bid * bid_size];\n } else {\n x[bid * bid_size + tid] = fminf(x[bid * bid_size + tid], x[bid * bid_size + tid - 1]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank], 1, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00065228, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029254094, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks to rank 0\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &x[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct partial minimums\n if (rank != 0) {\n x.clear();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0006500864, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029236458, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + 
"is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks to rank 0\n MPI_Gather(&x[0], n, MPI_FLOAT, &x[0], n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete list of partial minimums\n if (rank == 0) {\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0006490958, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033250108, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021744842, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014421422, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010177116, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009115004, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008096996, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008494398, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float *x_arr = &x[0];\n float *min_arr = new float[n];\n\n // Compute the minimums for each rank\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n float min = x_arr[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x_arr[j]);\n }\n min_arr[i] = min;\n }\n\n // Gather the minimums from all ranks\n MPI_Gather(min_arr, n, MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n x_arr[i] = min_arr[i];\n }\n }\n\n delete[] min_arr;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector min_values(n);\n#pragma omp parallel for\n for (int i = 0; i < n; i++) {\n min_values[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n min_values[i] = std::min(min_values[i], x[j]);\n }\n }\n MPI_Allgather(&min_values[0], n, MPI_FLOAT, &x[0], n, MPI_FLOAT, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + 
"is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min = x[0];\n\n #pragma omp parallel for schedule(static)\n for (int i = 1; i < n; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_FLOAT, x.data(), n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(x.data(), n, MPI_FLOAT, NULL, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0004406484, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004608352, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min_val = x[0];\n\n #pragma omp parallel for reduction(min : min_val)\n for (int i = 1; i < n; i++) {\n min_val = std::min(min_val, x[i]);\n }\n\n MPI_Bcast(&min_val, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::min(x[i], min_val);\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_FLOAT, x.data(), n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(x.data(), n, MPI_FLOAT, NULL, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n std::vector globalMinimums(x.size());\n MPI_Allgather(x.data(), x.size(), MPI_FLOAT, globalMinimums.data(), x.size(), MPI_FLOAT, MPI_COMM_WORLD);\n\n // Update the minimums on rank 0\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], globalMinimums[i]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0004315718, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0036094646, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n std::vector global_min(n);\n\n // Compute local minimums\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n 
// Reduce local minimums to global minimums\n MPI_Allreduce(local_min.data(), global_min.data(), n, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n // Store global minimums on rank 0\n if (rank == 0) {\n x = global_min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n MPI_Reduce(local_min.data(), x.data(), n, MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector localMin(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n localMin[i] = x[i];\n for (int j = 0; j < i; j++) {\n localMin[i] = std::min(localMin[i], x[j]);\n }\n }\n\n MPI_Allgather(localMin.data(), n, MPI_FLOAT, x.data(), n, MPI_FLOAT, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n MPI_Reduce(local_min.data(), x.data(), n, MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &x[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank != 0) {\n x.clear();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + 
"are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.000650811, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029299254, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float localMin = x[0];\n float globalMin;\n\n #pragma omp parallel for reduction(min : localMin)\n for (int i = 1; i < n; i++) {\n localMin = std::min(localMin, x[i]);\n }\n\n MPI_Allreduce(&localMin, &globalMin, 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n for (int i = 1; i < n; i++) {\n x[i] = globalMin;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00044033, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004634396, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and number of ranks\n int rank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the partial minimums in parallel\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n x[i] = min;\n }\n\n // Gather the partial minimums from all ranks\n std::vector partialMins(x.size() * numRanks);\n MPI_Gather(x.data(), x.size(), MPI_FLOAT, partialMins.data(), x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // If rank 0, store the result in x\n if (rank == 0) {\n x.clear();\n for (int i = 0; i < partialMins.size(); i += numRanks) {\n x.push_back(partialMins[i]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "<, Synag", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + 
"generated_output": "\n\n int n = x.size();\n int nthreads = omp_get_num_threads();\n int tid = omp_get_thread_num();\n int chunk = n / nthreads;\n int start = tid * chunk;\n int end = start + chunk;\n if (tid == nthreads - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n x[i] = std::min(x[i], *std::min_element(x.begin(), x.begin() + i));\n }\n omp_barrier();\n if (tid == 0) {\n for (int i = chunk; i < n; i += chunk) {\n for (int j = 0; j < chunk; j++) {\n x[i + j] = std::min(x[i + j], x[i + j - chunk]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector localMin(n);\n std::vector globalMin(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n localMin[i] = x[i];\n for (int j = 0; j <= i; j++) {\n localMin[i] = std::min(localMin[i], x[j]);\n }\n }\n\n MPI_Allreduce(localMin.data(), globalMin.data(), n, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n x = globalMin;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n std::vector min_values(x.size(), 0.0);\n MPI_Allgather(&x[0], x.size(), MPI_FLOAT, &min_values[0], x.size(), MPI_FLOAT, MPI_COMM_WORLD);\n\n // Update the values on rank 0\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::min(x[i], min_values[i]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0004351772, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0036245238, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n const int n = x.extent(0);\n View max_so_far = \"max_so_far\";\n View max_ending_here = \"max_ending_here\";\n Kokkos::parallel_for(\"max_subarray\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i) {\n if (i == 0) {\n max_ending_here[i] = x[i];\n max_so_far[i] = x[i];\n } else {\n max_ending_here[i] = std::max(x[i], max_ending_here[i - 1] + x[i]);\n max_so_far[i] = std::max(max_so_far[i - 1], max_ending_here[i]);\n }\n });\n return max_so_far[n - 1];\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::View dp = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n dp(i) = x(i);\n } else {\n dp(i) = std::max(dp(i - 1) + x(i), x(i));\n }\n });\n return *Kokkos::max_element(dp);\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n for (int i = 1; i < x.extent(0); ++i) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int max_sum = 0;\n int current_sum = 0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, int& max_sum) {\n current_sum = std::max(x[i], current_sum + x[i]);\n max_sum = std::max(max_sum, current_sum);\n }, max_sum, tag_type());\n return max_sum;\n}\n", + "\n\n using namespace Kokkos;\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n Kokkos::parallel_scan(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far, int& max_ending_here) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n });\n\n return max_so_far;\n}\n", + "\n\n // Initialize the maximum sum to the first element of the array.\n int max_sum = x(0);\n\n // Iterate over the remaining elements of the array.\n for (int i = 1; i < x.extent(0); i++) {\n // If the current element is greater than the sum of the previous element and\n // the maximum sum, then set the maximum sum to the current element.\n if (x(i) > max_sum + x(i - 1)) {\n max_sum = x(i);\n } else {\n // Otherwise, add the current element to the maximum sum.\n max_sum += x(i);\n }\n }\n\n // Return the maximum sum.\n return max_sum;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_for(\n \"maximumSubarray\", Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_so_far);\n\n return max_so_far;\n}\n", + "\n\n Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::Kok\u30b3\u30af_VectorRange(x.extent(0)),\n Kok\u30b3\u30af::plus(),\n Kok\u30b3\u30af::make_iota_view(x.extent(0)),\n [&](int i) {\n return Kok\u30b3\u30af::parallel_reduce(\n Kok\u30b3\u30af::Kok\u30b3\u30af_VectorRange(i, 
x.extent(0)),\n Kok\u30b3\u30af::plus(),\n x[i],\n [&](int j) { return x[j]; });\n });\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Max;\n using reduction_functor_type = Kokkos::Max;\n using reduction_policy_type = Kokkos::Scan;\n\n int max_so_far = 0;\n int max_ending_here = 0;\n\n Kokkos::parallel_reduce(\n reduction_policy_type(x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_so_far) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_so_far);\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_ending_here, int& max_so_far) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_ending_here, max_so_far);\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(int i, int& max_so_far, int& curr_max) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n },\n max_so_far, curr_max);\n\n return max_so_far;\n}\n", + "\n\n // Initialize the maximum sum to the first element of the array\n int max_sum = x(0);\n\n // Initialize the current sum to the first element of the array\n int current_sum = x(0);\n\n // Loop through the remaining elements of the array\n for (int i = 1; i < x.extent(0); i++) {\n // Update the current sum by adding the current element\n current_sum = std::max(x(i), current_sum + 
x(i));\n\n // Update the maximum sum if the current sum is greater\n max_sum = std::max(max_sum, current_sum);\n }\n\n // Return the maximum sum\n return max_sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_sum = INT_MIN;\n int sum = 0;\n #pragma omp parallel for reduction(max:max_sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n max_sum = std::max(max_sum, sum);\n if (sum < 0) {\n sum = 0;\n }\n }\n return max_sum;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n max_ending_here = std::max(x[i], max_ending_here + x[i]);\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n int n = x.size();\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n max_ending_here = std::max(x[i], max_ending_here + x[i]);\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n 
max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (size_t i = 0; i < x.size(); ++i) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0, max_ending_here = 0;\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (size_t i = 0; i < x.size(); ++i) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_sum = INT_MIN;\n int current_sum = 0;\n #pragma omp parallel for reduction(max:max_sum)\n for (int i = 0; i < x.size(); ++i) {\n current_sum = std::max(x[i], current_sum + x[i]);\n max_sum = std::max(max_sum, current_sum);\n }\n return max_sum;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n int n = x.size();\n #pragma parallel for reduction(+:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if 
(max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "serial", + "prompt": "/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = 
x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN;\n int max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "ontiguous, Synag", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Store the result in sum.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\n__global__ void maximumSubarray(const int *x, size_t N, int *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = hipMax(x[i], max_ending_here + x[i]);\n max_so_far = hipMax(max_so_far, max_ending_here);\n }\n hipAtomicMax(sum, &max_so_far);\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid >= N) return;\n\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = block.thread.x;\n int stride = block.thread.x + block.thread.y * block.dim.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int stride = hipBlockDim_x * hipGridDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int s = 0;\n int max = 0;\n for (size_t i = tid; i < N; i += blockDim.x * gridDim.x) {\n s = max(x[i], s + x[i]);\n max = max(max, s);\n }\n atomicAdd(sum, max);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int stride = bid_size * hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = start; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int tid_offset = tid + bid * bid_size;\n int sum_local = 0;\n int max_sum = 0;\n if (tid_offset < N) {\n sum_local = x[tid_offset];\n }\n __syncthreads();\n for (int i = tid_offset + bid_size; i < N; i += bid_size) {\n sum_local = max(sum_local + x[i], x[i]);\n max_sum = max(max_sum, 
sum_local);\n }\n if (tid == 0) {\n *sum = max_sum;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n __syncthreads();\n if (tid == 0) *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (i < N) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n i += hipBlockDim_x * hipGridDim_x;\n }\n *sum = max_so_far;\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int stride = blockDim.x * gridDim.x;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = tid; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bsize = hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int stride = bsize * hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N && i < start + stride; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n if (tid == 0) *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipBlockDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n hipAtomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int 
max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n hipAtomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int stride = bid_size * hipGridDim_x;\n\n int local_max = 0;\n int global_max = 0;\n\n for (int i = tid; i < N; i += stride) {\n local_max = max(x[i], local_max + x[i]);\n global_max = max(global_max, local_max);\n }\n\n if (tid == 0) {\n *sum = global_max;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n int max_so_far = 0, max_ending_here = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_max;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_sum = x[0];\n int sum = x[0];\n for (int i = 1; i < n; i++) {\n sum = std::max(x[i], sum + x[i]);\n max_sum = std::max(max_sum, sum);\n }\n return max_sum;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n int global_max;\n MPI_Allreduce(&max_so_far, &global_max, 
1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "(4, -1, 2, 1) = 6.", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n int maxSum = INT_MIN;\n int sum = 0;\n for (int i = myRank; i < x.size(); i += numRanks) {\n sum += x[i];\n maxSum = std::max(maxSum, sum);\n if (sum < 0) {\n sum = 0;\n }\n }\n\n MPI_Allreduce(&maxSum, &maxSum, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxSum;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", 
+ "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n\n return max_so_far;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Store the result in sum.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\n__global__ void maximumSubarray(const int *x, size_t N, int *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Each thread computes the sum of its subarray\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int partialSum = 0;\n for (int i = tid; i < N; i += blockDim.x * gridDim.x) {\n partialSum += x[i];\n }\n\n // Each thread writes its partial sum to global memory\n __shared__ int partialSums[blockDim.x];\n partialSums[threadIdx.x] = partialSum;\n __syncthreads();\n\n // Each thread computes the maximum of its partial sum and the partial sums of the previous threads\n for (int i = blockDim.x / 2; i > 0; i >>= 1) {\n if (threadIdx.x < i) {\n partialSums[threadIdx.x] = max(partialSums[threadIdx.x], partialSums[threadIdx.x + i]);\n }\n __syncthreads();\n }\n\n // The first thread writes the maximum sum to global memory\n if (threadIdx.x == 0) {\n *sum = partialSums[0];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n\n int maxSoFar = x[tid];\n int maxEndingHere = x[tid];\n\n for (int i = tid + blockDim; i < N; i += blockDim * gridDim) {\n maxEndingHere = maxEndingHere + x[i];\n if (maxSoFar < maxEndingHere) {\n maxSoFar = maxEndingHere;\n }\n if (maxEndingHere < 0) {\n maxEndingHere = 0;\n }\n }\n\n int maxOverall = maxSoFar;\n\n if (blockId == 0) {\n for (int i = 1; i < gridDim; i++) {\n int temp = maxOverall;\n maxOverall = max(maxOverall, maxSoFar);\n maxSoFar = max(temp, maxSoFar);\n }\n }\n\n if (blockId == 0) {\n *sum = maxOverall;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n int max_sum = INT_MIN;\n int current_sum = 0;\n for (int i = start; i < end; i++) {\n current_sum = max(x[i], current_sum + x[i]);\n max_sum = max(max_sum, current_sum);\n }\n sum[tid] = max_sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int block_start = block_id * block_size;\n int block_end = min(block_start + block_size, N);\n int max_so_far = x[block_start];\n int max_ending_here = x[block_start];\n for (int i = block_start + 1; i < block_end; i++) {\n max_ending_here = max(x[i], 
max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n __shared__ int shared_max[block_size];\n shared_max[tid] = max_so_far;\n __syncthreads();\n if (tid == 0) {\n int max_overall = shared_max[0];\n for (int i = 1; i < block_size; i++) {\n max_overall = max(max_overall, shared_max[i]);\n }\n *sum = max_overall;\n }\n}\n", + "\n\n int max_ending_here = x[0];\n int max_so_far = x[0];\n for (size_t i = 1; i < N; ++i) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n int start = tid + block * stride;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (start < N) {\n max_ending_here = max_ending_here + x[start];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n start += stride;\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int sum = 0;\n int maxSum = -2147483648;\n for (int i = tid; i < N; i += blockIdx.x * blockDim.x) {\n sum += x[i];\n maxSum = max(maxSum, sum);\n if (sum < 0) sum = 0;\n }\n sum = maxSum;\n *sum = maxSum;\n}\n", + "\n\n int my_sum = 0;\n int max_sum = 0;\n int max_sum_so_far = 0;\n int start = 0;\n int end = 0;\n int s = threadIdx.x;\n int e = threadIdx.x + blockDim.x - 1;\n if (e >= N) e = N - 1;\n for (int i = s; i <= e; i++) {\n my_sum += x[i];\n if (my_sum > max_sum_so_far) {\n max_sum_so_far = my_sum;\n start = s;\n end = i;\n }\n if (my_sum < 0) {\n my_sum = 0;\n }\n }\n if (max_sum_so_far > max_sum) {\n max_sum = max_sum_so_far;\n *sum = max_sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = tid; i < N; i += blockDim.x) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; ++i) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int stride = blockDim * gridDim;\n int start = tid + blockId * blockDim;\n int max_so_far = 0;\n int max_ending_here = 0;\n while 
(start < N) {\n max_ending_here = max_ending_here + x[start];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n start += stride;\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n }\n *sum = max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n for (size_t i = 1; i < N; ++i) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int sum_so_far = 0;\n int max_sum = 0;\n if (tid < N) {\n sum_so_far = x[tid];\n max_sum = x[tid];\n }\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n sum_so_far = fmax(x[i], sum_so_far + x[i]);\n max_sum = fmax(max_sum, sum_so_far);\n __syncthreads();\n }\n if (tid == 0) { *sum = max_sum; }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int max_so_far = 0;\n int max_ending_here = 0;\n\n for (int i = tid; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n sum[bid] = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n int chunk_size = n / num_ranks;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == num_ranks - 1) {\n end = n;\n }\n#pragma omp parallel for reduction(max : max_ending_here)\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n MPI_Allreduce(&max_so_far, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_so_far;\n }\n return 0;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk = n / nprocs;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == nprocs - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int max_global;\n MPI_Reduce(&max_so_far, &max_global, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_global;\n }\n return 0;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = 0, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk_size = n / nprocs;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == nprocs - 1) {\n end = n;\n }\n#pragma omp parallel\n {\n int max_so_far_local = 0, max_ending_here_local = 0;\n#pragma omp for\n for (int i = start; i < end; i++) {\n max_ending_here_local = max_ending_here_local + x[i];\n if (max_so_far_local < max_ending_here_local) {\n max_so_far_local = max_ending_here_local;\n }\n if (max_ending_here_local < 0) {\n max_ending_here_local = 0;\n }\n }\n#pragma omp critical\n {\n if (max_so_far < max_so_far_local) {\n max_so_far = max_so_far_local;\n }\n }\n }\n MPI_Reduce(&max_so_far, &max_ending_here, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_ending_here;\n }\n return 0;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallel for\n#pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // MPI reduction\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel\n {\n int local_max_so_far = INT_MIN, local_max_ending_here = 0;\n #pragma omp for\n for (int i = rank; i < n; i += nprocs) {\n local_max_ending_here = local_max_ending_here + x[i];\n if (local_max_so_far < local_max_ending_here)\n local_max_so_far = local_max_ending_here;\n if (local_max_ending_here < 0)\n local_max_ending_here = 0;\n }\n #pragma omp critical\n {\n if (max_so_far < local_max_so_far)\n max_so_far = local_max_so_far;\n }\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int max_local = 0;\n int max_global = 0;\n\n #pragma omp parallel for reduction(max : max_local)\n for (int i = 0; i < n; i++) {\n max_local = std::max(x[i], max_local + x[i]);\n }\n\n MPI_Allreduce(&max_local, &max_global, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return max_global;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + 
"are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel num_threads(4)\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n\n int global_max = max_so_far;\n MPI_Allreduce(&global_max, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallelization\n #pragma omp parallel\n {\n // Each thread computes the maximum subarray sum for its portion of the array\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n\n // MPI reduction to find the maximum subarray sum across all ranks\n int global_max;\n MPI_Allreduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n int chunk_size = n / num_threads;\n int start = tid * chunk_size;\n int end = start + chunk_size;\n if (tid == num_threads - 1) {\n end = n;\n }\n\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n int chunk_size = n / num_ranks;\n int start = my_rank * chunk_size;\n int end = start + chunk_size;\n if (my_rank == num_ranks - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n 
if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // Reduce the maximum subarray sum across all ranks\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallel for loop\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // MPI reduce to get the maximum subarray sum\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_max;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = std::min(start + chunk_size, n);\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + 
"is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int nthreads = omp_get_num_threads();\n int chunk = n / nthreads;\n int start = tid * chunk;\n int end = (tid + 1) * chunk;\n if (tid == nthreads - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_max;\n }\n return 0;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(Kokkos::View const& x, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n const int N = x.extent(0);\n parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = sum + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output);\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = output(i - 1) + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i, double& update) {\n update += x(i);\n output(i) = update;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = sum + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output(0));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, 
double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n Kokkos::parallel_for(\"prefix_sum\", Kokkos::RangePolicy(0, output.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = x(i);\n } else {\n output(i) = output(i - 1) + x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output(0));\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n const size_t n = x.extent(0);\n output(0) = x(0);\n parallel_for(policy_type(1, n), KOKKOS_LAMBDA(const size_t i) {\n output(i) = output(i - 1) + x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"prefixSum\", output.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "Kok", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefix_sum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n const size_t size = x.extent(0);\n output(0) = x(0);\n parallel_for(policy_type(1, size), KOKKOS_LAMBDA(const size_t i) {\n output(i) = output(i - 1) + x(i);\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::Range(0, N), KOKKOS_LAMBDA (const int i) {\n if (i == 0) {\n output(i) = x(i);\n } else {\n output(i) = output(i - 1) + x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use OpenMP to compute in parallel.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n 
}\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "<", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n double sum = 0;\n for (int j = 0; j <= i; j++) {\n sum += x[j];\n }\n output[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "serial", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (std::size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "<", + "<", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n 
output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\n__global__ void prefixSum(const double* x, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n if (tid > 0) { output[tid] += output[tid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n \n if (gid < N) {\n output[gid] = x[gid];\n }\n \n __syncthreads();\n \n for (int stride = 1; stride < N; stride *= 2) {\n if (gid < N && gid + stride < N) {\n output[gid + stride] += 
output[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bsize = blockDim.x;\n int stride = bsize * bid;\n int i = stride + tid;\n if (i < N) {\n output[i] = x[i];\n if (i > 0) { output[i] += output[i - 1]; }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (gid % (2 * i) == 0 && gid + i < N) {\n output[gid + i] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n output[idx] = x[idx];\n if (idx > 0) {\n output[idx] += output[idx - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the prefix sum of the 
vector x into output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n double global_sum = 0;\n\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n output.push_back(global_sum);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n MPI_Allreduce(&local_sum, &output[i], 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); 
i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partialSum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n partialSum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partialSums(size);\n MPI_Gather(&partialSum, 1, MPI_DOUBLE, partialSums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partialSums[0];\n for (size_t i = 1; i < size; ++i) {\n output[i] = output[i - 1] + partialSums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + 
"\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum on each rank\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local sums to rank 0\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n for (int i = 1; i < size; ++i) {\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allgather(&local_sum, 1, MPI_DOUBLE, &global_sum, 1, MPI_DOUBLE, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector partial_sum(n);\n MPI_Allreduce(x.data(), partial_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n output = partial_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] - x[i - 1];\n }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\n__global__ void prefixSum(const double *x, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (gid % (2 * i) == 0 && gid + i < N) {\n output[gid] += output[gid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n output[idx] = 0;\n for (int i = 0; i <= idx; i++) {\n output[idx] += x[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 2; stride <= N; stride *= 2) {\n if (tid % stride == 0 && tid + stride / 2 < N) {\n output[tid] += output[tid + stride / 2];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n // Compute the prefix sum of the first bid_size elements in x\n double sum = 0;\n for (int i = tid; i < bid_size; i++) {\n sum += x[i];\n }\n\n // Store the prefix sum in the output array\n output[tid] = sum;\n\n // Synchronize the threads in the block\n __syncthreads();\n\n // Compute the prefix sum of the remaining elements in x\n for (int i = bid_size; i < N; i += bid_size) {\n // Load the prefix sum of the previous block\n sum = output[i - bid_size];\n\n // Compute the prefix sum of the current block\n for (int j = tid; j < bid_size; j++) {\n sum += x[i + j];\n }\n\n // Store the prefix sum in the output array\n output[i + tid] = sum;\n\n // Synchronize the threads in the block\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n // Compute the prefix sum\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n if (gid < N) {\n output[gid] = x[gid];\n }\n\n __syncthreads();\n\n for (int offset = 1; offset < N; offset *= 2) {\n if (gid + offset < N) {\n output[gid + offset] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n 
__syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n // Each thread computes the prefix sum of a subset of the elements in x.\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n // Compute the prefix sum of the current block\n double block_sum = 0;\n for (int i = tid; i < N; i += bid_size) {\n block_sum += x[i];\n output[i] = block_sum;\n }\n\n // Synchronize the threads in the block\n __syncthreads();\n\n // Compute the prefix sum of the previous blocks\n if (tid == 0) {\n for (int i = bid + 1; i < N; i += bid_size) {\n output[i] += output[i - 1];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "(", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int size = blockDim.x;\n int stride = size * gridDim.x;\n int start = tid + bid * stride;\n double sum = 0;\n for (int i = start; i < N; i += stride) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "_t", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the prefix sum of the vector x into output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Store the result in output on rank 0.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+: local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Gather the local prefix sums\n std::vector global_sum(x.size(), 0);\n MPI_Allreduce(output.data(), global_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM);\n\n // Store the global prefix sum on rank 0\n if (rank == 0) {\n output = global_sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0081196846, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0080786018, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n // Compute the local prefix sum\n std::vector local_prefix_sum(n);\n local_prefix_sum[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n // Compute the global prefix sum\n std::vector global_prefix_sum(n);\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n // Store the result on rank 0\n if (rank == 0) {\n output = global_prefix_sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.008131389, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.053869668, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), 
output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0081136212, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0080849916, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Reduce(partial_sum.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Bcast(output.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n #pragma omp parallel for\n for (int j = 1; j < n; j++) {\n output[j] += output[j - 1];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes the sum of its local data\n double local_sum = 0;\n #pragma omp parallel for reduction(+: local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n 
}\n\n // Gather the local sums into the output vector on rank 0\n MPI_Gather(&local_sum, 1, MPI_DOUBLE, output.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n \n // Rank 0 computes the prefix sum\n if (rank == 0) {\n for (size_t i = 1; i < size; ++i) {\n output[i] += output[i - 1];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the partial sum on each rank\n double partialSum = 0;\n #pragma omp parallel for reduction(+:partialSum)\n for (int i = 0; i < x.size(); i++) {\n partialSum += x[i];\n }\n\n // Gather the partial sums from all ranks\n std::vector partialSums(numRanks);\n MPI_Gather(&partialSum, 1, MPI_DOUBLE, partialSums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (myRank == 0) {\n output[0] = 0;\n for (int i = 1; i < numRanks; i++) {\n output[i] = output[i - 1] + partialSums[i - 1];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce local prefix sums to get global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce local prefix sums to global prefix sum\n MPI_Reduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum 
= 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0081298336, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.008086707, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n, 0);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n if (i > 0) {\n partial_sum[i] += partial_sum[i - 1];\n }\n }\n\n std::vector global_sum(n, 0);\n MPI_Allreduce(partial_sum.data(), global_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0080992424, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0539413404, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0081099926, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0080764432, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + 
"generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0.0;\n #pragma omp parallel for reduction(+:partial_sum)\n for (int i = 0; i < x.size(); ++i) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; ++i) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n if (i > 0) {\n partial_sum[i] += partial_sum[i - 1];\n }\n }\n\n MPI_Reduce(partial_sum.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0080997032, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0434836668, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0081409102, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0080940624, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0081230288, + "runs": [ + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.0080785514, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0081223976, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0080841702, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) *= -1;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n int &xi = x(i);\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n int &xi = x(i);\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n 
Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n int val = x(i);\n if (val % 2 == 0) {\n x(i) = val / 2;\n } else {\n x(i) = -val;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) = x(i) / 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kok koste::parallel_for(0, x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "omp", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + ":", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for 
(int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma\u00a0omp\u00a0parallel\u00a0for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] *= -1;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "serial", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "negate, Synag", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (auto &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "negate, Synag", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n 
}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "hip", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\n__global__ void negateOddsAndHalveEvens(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipGetGlobalThreadIdx();\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0)\n x[tid] /= 2;\n else\n x[tid] = -x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 
== 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "E", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0,\n MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0,\n MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[0], n / size, MPI_INT, &x[0], n / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Negate odd values\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n }\n }\n\n // Halve even values\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n }\n }\n\n // Gather results on rank 0\n if (rank == 0) {\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_INT, result.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n x = result;\n } else {\n MPI_Gather(nullptr, 0, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = 
-x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n 
if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its portion of x.\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(x.data(), x.size(), MPI_INT, x.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data(), x.size(), MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its portion of the vector.\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "cuda", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\n__global__ void negateOddsAndHalveEvens(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0)\n x[tid] /= 2;\n else\n x[tid] = -x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int n_local = n / nprocs;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], n_local, MPI_INT, &x[0], n_local, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000973951, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009899254, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006156112, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000330326, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003100554, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003440192, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004682028, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000537268, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010616005, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009723254, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its part of the vector\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0009766784, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009953332, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000611281, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003266648, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003140474, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003441774, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004575244, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005518456, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + 
"runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009753582, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010662786, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000615645, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003480614, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003073848, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003515822, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004665632, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005558892, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011194305, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013487821, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], end - start, MPI_INT, &x[0], end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009752778, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009935242, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000594327, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003263724, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003129854, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00035369, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004571038, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005584018, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001077024, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00097274755, + "num_procs": 4, + 
"num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009749452, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009925082, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006168676, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000331532, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003070402, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003536278, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004645594, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005930928, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010724157, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009689201, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009755742, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000996648, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005877926, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000337918, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003128188, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003549038, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004648482, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042421712, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010793125, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009702755, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009754938, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00099119, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006131386, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000326266, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003100478, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000349652, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004724922, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000535679, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010582538, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00098137295, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009754844, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009890538, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000614922, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003334996, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003150548, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000345985, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00049879, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005384448, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010583642, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00228425565, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int local_size = x.size() / size;\n 
int remain = x.size() % size;\n int local_start = rank * local_size;\n int local_end = local_start + local_size;\n if (rank < remain) {\n local_end++;\n }\n\n#pragma omp parallel for\n for (int i = local_start; i < local_end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(MPI_IN_PLACE, local_end - local_start, MPI_INT, x.data(), local_end - local_start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0009765624, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000990232, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006154086, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003313536, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000311064, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003510746, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004588066, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005466756, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int myRank, nProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &nProcs);\n\n int n = x.size();\n int chunkSize = (n + nProcs - 1) / nProcs;\n int begin = myRank * chunkSize;\n int end = std::min(begin + chunkSize, n);\n\n #pragma omp parallel for\n for (int i = begin; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[begin], end - begin, MPI_INT, &x[0], end - begin, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009762074, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009884688, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005905922, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003294762, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003076594, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000359328, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004673126, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005397308, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010432918, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00097085185, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for 
(int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0009762656, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009897328, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005839038, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003310208, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003121694, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000345412, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004650766, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005356612, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] = x[i] / 2;\n }\n }\n MPI_Gather(&x[start], chunk, MPI_INT, &x[0], chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000975416, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010475192, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010397366, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010442432, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010404764, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010421212, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010423838, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010419672, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012894394, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010838141, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its part of the vector\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], 
x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.000975302, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010512204, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006462826, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003377736, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000312214, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003624318, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004596042, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000533958, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int local_n = n / nprocs;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009746144, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009903504, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005851438, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000329436, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003077456, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000346576, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004685352, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005354446, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010345102, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002478601, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, n_local, MPI_INT, x.data(), n_local, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + 
"are_all_valid": true, + "best_sequential_runtime": 0.0009756306, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009930454, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005995588, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003263184, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003049062, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003524494, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004687036, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005555842, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001026959, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009909835, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000975783, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009900494, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006206084, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003301134, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003049922, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000349019, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004590156, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005342966, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010233624, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00099945625, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 
0.000975871, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009967522, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006125474, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003352268, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003127912, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003543164, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004672606, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005520156, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010398954, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009798411, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009751798, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009850788, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006131226, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003218298, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003065894, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003516418, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000464668, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005393782, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010428973, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009799193, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009746668, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009895166, + 
"num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006205744, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003308248, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003131518, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003528264, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004646578, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000537326, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010394846, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009725644, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], chunk, MPI_INT, &x[0], chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0009762798, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009913722, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006050072, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003322894, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003129574, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003515218, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004474958, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000533896, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010465698, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00096610565, + "num_procs": 4, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "kokkos", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(Kokkos::View const& x, Kokkos::View &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKok\u96f9::ParallelFor(Kok\u96f9::Range(x.size()), [&](int i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t});\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const int& i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t});\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", mask.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + ",", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "omp", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t#pragma omp parallel 
for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "serial", + "prompt": "bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t mask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tmask.reserve(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + 
"\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.resize(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tmask.reserve(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t mask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "hip", + "prompt": "__device__ bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\n__global__ void mapPowersOfTwo(const int *x, bool *mask, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = get_global_id(0);\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + ", 0, 9, 7, 15, 64, 3]", + ", 0, 9, 7, 15, 64, 3]", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "mpi", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
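Editorial note on the map-function records above: in this rendering of the diff the contents of angle brackets appear to have been stripped from the prompts and outputs (e.g. "#include " with no header named, "std::vector const& x" and "Kokkos::View const& x" with no element types), so the exact signatures are not recoverable from the text. Purely as an illustrative reconstruction under assumed element types (int input, one byte per flag), the OpenMP variant presumably corresponds to something like the sketch below. If the mask really is a std::vector<bool>, note that its elements are bit-packed, so concurrent writes to distinct elements from different threads are not guaranteed race-free; a byte-per-flag buffer avoids that.

    #include <vector>

    bool isPowerOfTwo(int x) {
        return (x > 0) && !(x & (x - 1));
    }

    // Sketch of the OpenMP map with assumed element types; not a recorded output.
    void mapPowersOfTwo(std::vector<int> const &x, std::vector<unsigned char> &mask) {
        mask.resize(x.size());
        // Each iteration writes its own byte, so the parallel loop is race-free.
        #pragma omp parallel for
        for (long long i = 0; i < static_cast<long long>(x.size()); ++i) {
            mask[i] = isPowerOfTwo(x[i]) ? 1 : 0;
        }
    }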
The final result is stored in mask on rank 0.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint chunk_size = (n + size - 1) / size;\n\tint start = rank * chunk_size;\n\tint end = std::min(start + chunk_size, n);\n\n\tstd::vector local_mask(end - start);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), end - start, MPI_BOOL, mask.data(), end - start, MPI_BOOL, 0);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tmask.resize(n);\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), n / size, MPI_BOOL, mask.data(), n / size, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_C_BOOL, mask.data(), local_n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tmask.resize(x.size());\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), mask.size(), MPI_C_BOOL, nullptr, 0, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_C_BOOL, mask.data(), local_n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tstd::vector localMask(x.size());\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tif (rank == 0) {\n\t\tmask = localMask;\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tstd::vector recvMask(x.size());\n\t\t\tMPI_Recv(&recvMask[0], x.size(), MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t\tfor (size_t j = 0; j < x.size(); j++) {\n\t\t\t\tmask[j] = mask[j] || recvMask[j];\n\t\t\t}\n\t\t}\n\t} else {\n\t\tMPI_Send(&localMask[0], x.size(), MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the result\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), mask.size(), MPI_BOOL, mask.data(), mask.size(), MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the mask\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t}\n\telse {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the mask\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint 
rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_BOOL, mask.data(), local_n, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "cuda", + "prompt": "__device__ bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\n__global__ void mapPowersOfTwo(const int *x, bool *mask, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\twhile (i < N) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t\ti += blockDim.x * gridDim.x;\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored in mask on rank 0.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own mask\n\tstd::vector local_mask(x.size());\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the local masks into the final mask on rank 0\n\tif (rank == 0) {\n\t\tmask.resize(x.size());\n\t}\n\tMPI_Gather(local_mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < static_cast(x.size()); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0033451844, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0034618514, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018430778, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001051685, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005274234, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003764646, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + 
"generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its 
own mask\n\tstd::vector localMask(x.size());\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Rank 0 gathers all the local masks\n\tif (rank == 0) {\n\t\tmask.resize(x.size());\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tstd::vector recvMask(x.size());\n\t\t\tMPI_Recv(recvMask.data(), x.size(), MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t\tfor (int j = 0; j < x.size(); j++) {\n\t\t\t\tmask[j] |= recvMask[j];\n\t\t\t}\n\t\t}\n\t} else {\n\t\tMPI_Send(localMask.data(), x.size(), MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint n = x.size();\n\tmask.resize(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), n, MPI_BOOL, mask.data(), n, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its local part of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i 
+= size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the result\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the result\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * mask.size() / size], mask.size() / size, MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * mask.size() / size], mask.size() / size, MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = 
isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [2, 4, 1, 12, -2]\n 
output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "serial", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &element : x) {\n element = 1 - 1 / element;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &i : x) {\n i = 1 - 1 / i;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 
0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &val : x) {\n val = 1 - 1 / val;\n }\n}\n", + "\n\n for (double &i : x) {\n i = 1 - 1 / i;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "hip", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\n__global__ void oneMinusInverse(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n x[gid] = 1 - 1 / x[gid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + 
"parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, gathered_x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], x.size() / size, MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n 
MPI_Send(&x[rank * n / size], n / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = 1 - 1 / x[i];\n }\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE, NULL, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 has the final result\n if (rank == 0) {\n x.resize(x.size() * size);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n 
int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], x.size() / size,\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the inverse of its elements\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], x.size() / size,\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its part of the result\n for (size_t i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "cuda", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\n__global__ void oneMinusInverse(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) { x[tid] = 1.0 - 1.0 / x[tid]; }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int local_size = x.size() / num_procs;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n if (rank == num_procs - 1) {\n end = x.size();\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014612796, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014774186, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008866734, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004658484, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003097412, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000354014, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004810606, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005464212, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010244434, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00096263935, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014620152, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014744064, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008996624, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00046703, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003158426, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003544992, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004620732, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000551331, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.0010364841, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00097414275, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n MPI_Reduce(x.data(), x.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014609814, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014734786, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008592664, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000465211, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003092622, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00034761, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004621872, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005556246, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010365533, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00097482615, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014603382, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014732222, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009008544, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004633112, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003172454, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.0003560864, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000456577, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005392198, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001031498, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00097223635, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014610396, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001470978, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000878286, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004664176, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003154498, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003500428, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004651204, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000562682, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013064961, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009852933, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001461983, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014685092, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000876841, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004649626, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003122338, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000357104, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": 
true, + "runtime": 0.0004606142, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007206142, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010280243, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000973397, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank == size - 1) ? n : start + chunk_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * chunk_size;\n int end = (i == size - 1) ? n : start + chunk_size;\n MPI_Recv(&x[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014624036, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014696692, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009011206, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000465848, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000307653, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003584246, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004862682, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005314052, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010147122, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00094656155, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001461851, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001469938, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009064788, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004638666, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003065816, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, 
+ "runtime": 0.0003547896, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004575282, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005332624, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010348279, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00096819955, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014602434, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014755726, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009001088, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004652528, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003108294, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003553768, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004591556, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005365166, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010319563, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000978253, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0014606566, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037086924, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030291014, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025151528, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0024492874, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026572898, + "num_procs": 1, + 
"num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030362828, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032044904, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = global_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.001461083, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0036033406, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029616384, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026056248, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0024039206, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028197346, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030270856, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032103434, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014608852, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014728614, + "num_procs": 1, + "num_threads": 1 + }, + 
{ + "did_run": true, + "is_valid": true, + "runtime": 0.0008717358, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004585762, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003121756, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003470128, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000454583, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005418204, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010450671, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009828165, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014606284, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001471533, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008908034, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004634818, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003175938, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003575084, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004602536, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005421292, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011975972, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00153057405, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001460033, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014703206, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008860264, + "num_procs": 1, + "num_threads": 2 + }, + 
{ + "did_run": true, + "is_valid": true, + "runtime": 0.0004634616, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003156062, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003695752, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004260037, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005937498, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010348471, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00097003685, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014615722, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014734644, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009004454, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000464257, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003105224, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003604942, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000462071, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005443832, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010309846, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009750882, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n const int n = x.size();\n const int chunk = (n + size - 1) / size;\n const int start = rank * chunk;\n const int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; ++i) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014608472, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014751658, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008904426, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000466239, + "num_procs": 1, + "num_threads": 4 + }, 
+ { + "did_run": true, + "is_valid": true, + "runtime": 0.0003157642, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003537436, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004758466, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005806952, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010309186, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00097532115, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014603418, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014715248, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008908994, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004656452, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000314548, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003569576, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004647862, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005982964, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010260866, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00137573, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0014615358, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014674454, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008903024, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004645376, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003191428, + "num_procs": 1, + "num_threads": 8 + }, 
+ { + "did_run": true, + "is_valid": true, + "runtime": 0.0003490146, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004548432, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005437178, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010531684, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009727137, + "num_procs": 4, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n 
Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "serial", + "prompt": "/* Compute the ReLU function on every element of x. 
Elements less than zero become zero,\n while elements greater than zero stay the same.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "hip", + "prompt": "/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\n__global__ void relu(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = fmax(0.0, x[idx]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? 
x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + ": [-1.8, 24.0, 1.2, 0.0, 5.1, 0.2, 4.5]", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n std::vector recv_buf(x.size());\n for (int i = 1; i < MPI_COMM_WORLD.size(); i++) {\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector temp(x.size());\n MPI_Recv(temp.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += temp[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + 
"\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks to rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / 
size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk_size 
= n / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "cuda", + "prompt": "/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\n__global__ void relu(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = fmax(x[tid], 0.0);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = fmax(0, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? 
x[tid] : 0;\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = fmax(0, x[idx]);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = fmax(0.0, x[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] < 0 ? 0 : x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = fmax(0, x[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int index = blockIdx.x * blockDim.x + threadIdx.x;\n if (index < N) {\n x[index] = fmax(0, x[index]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int tid = threadId.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = fmax(0.0, x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013259818, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0206293456, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0105143718, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0053882936, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002787141, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017790922, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011376562, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010651004, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002021685, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00189817965, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma opm parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Reduce(&x[0], &x[0], x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.000891709, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0199585816, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0199250644, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0199255516, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0199299876, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.019965885, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0199298814, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0200171474, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int chunk_size = n / nprocs;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == nprocs - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013224332, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02063564, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0104966522, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0053485956, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002808896, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001693287, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010757522, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010869552, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022629173, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019110938, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, local_size, MPI_DOUBLE, x.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013211306, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0206268992, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0105104842, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0053380414, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027231774, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016167956, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011292864, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011501296, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020497107, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019045957, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int 
local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n #pragma omp parallel for\n for (int i = start_index; i < end_index; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start_index], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013220626, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0206182486, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0104830288, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0053629884, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002793397, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016511758, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011385202, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010806354, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022441832, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018900744, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n if (rank == size - 1) {\n end = x.size();\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013256754, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0206263442, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010422774, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0053554042, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027228166, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015928988, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001174648, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011466616, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020302852, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00191012495, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start 
+ local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013260962, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0206190866, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0104746992, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0053788482, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002783909, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016708088, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010868652, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001111165, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021944832, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00191233805, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0008778794, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0199589002, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0101635814, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005267329, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027210034, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016716424, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010752534, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011489694, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma 
omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013194332, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0206167742, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0104367138, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0053800564, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027758104, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016950924, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011368088, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011055846, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022324161, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019503805, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.000878929, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0199474646, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010168152, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0051766066, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026532382, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016464114, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001104402, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010732838, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n 
#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n MPI_Gather(&x[start], chunk_size, MPI_DOUBLE, &x[0], chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013241126, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0206233626, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0105170924, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005344169, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027681798, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015490826, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011109582, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011296952, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021559538, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019186062, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013230784, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.020631574, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010573701, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0053194808, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027819254, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016840754, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011155896, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011952844, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020049466, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019071971, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, 
MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001325579, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.020704653, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0105248972, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0053732516, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027019678, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016284912, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010936138, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011081592, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021818646, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018846382, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the ReLU function on its portion of x\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.000882474, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0206145222, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0104505878, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005356408, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027076222, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015571758, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001153925, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011324908, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the ReLU function on its own copy of x\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks onto rank 0\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 has the final result\n if (rank == 0) {\n 
x.resize(x.size() * size);\n MPI_Gatherv(nullptr, 0, MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013189166, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0206269296, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0105090678, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0053662086, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027258462, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017265676, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011318656, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011689572, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020064215, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001922612, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf;\n MPI_Recv(&recv_buf, x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv_buf.begin(), recv_buf.end());\n }\n } else {\n MPI_Send(&x, x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0008978342, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0199486626, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010142929, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0051423624, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002693596, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017159594, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.001106416, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011299122, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013192454, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0206156622, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0104849584, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005382188, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028025386, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016989558, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001109612, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011734338, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020404083, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00191177645, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Get the size of the vector\n int n = x.size();\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the ReLU function on every element of x\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < MPI_COMM_WORLD; i++) {\n std::vector recv_buf(n);\n MPI_Recv(recv_buf.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + 
"are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013229502, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0206213046, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01042622, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0053521976, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002797515, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015970364, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00110752, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001097972, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020062945, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019008436, + "num_procs": 4, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), 
KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, x.extent(0)), KOK_LAMBDA(int i) {\n x[i] = x[i] * x[i];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n #pragma simd\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "serial", + "prompt": "/* Replace every element of x with the square of its value.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + ":", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n 
}\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &xi : x) {\n xi = xi * xi;\n }\n}\n", + "\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "hip", + "prompt": "/* Replace every element of x with the square of its value.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\n__global__ void squareEach(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use MPI to 
compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data(), n / size, MPI_INT, x.data(), n / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_Comm_world, &rank);\n\n // Get the number of processes\n int size;\n MPI_Comm_size(MPI_Comm_world, &size);\n\n // Each process squares its own elements\n for (int i = rank; i < x.size(); i += size) {\n x[i] = x[i] * x[i];\n }\n\n // Gather all the squared elements on rank 0\n MPI_Allgather(MPI_IN_PLACE, 0, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_Comm_world);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank squares its elements\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the squared elements\n std::vector squared;\n if (rank == 0) {\n squared.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, squared.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Rank 0 stores the final result\n if (rank == 0) {\n x = squared;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Status status;\n MPI_Request request;\n MPI_Irecv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (int i = rank; i < x.size(); i += size) {\n x[i] *= x[i];\n }\n MPI_Gather(x.data(), x.size() / size, MPI_INT, x.data(), x.size() / size,\n MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Square each element of x\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Gather all the 
squared elements on rank 0\n std::vector squared_x;\n if (rank == 0) {\n squared_x.resize(x.size() * MPI::COMM_WORLD.Get_size());\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, squared_x.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Copy the squared elements back to x on rank 0\n if (rank == 0) {\n x = squared_x;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector temp(x.size());\n MPI_Recv(&temp[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += temp[j];\n }\n }\n }\n}\n", + "(", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n MPI_Gather(x.data(), n / size, MPI_INT, x.data(), n / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector tmp(n);\n MPI_Recv(tmp.data(), n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[0], n / size, MPI_INT, &x[0], n / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "cuda", + "prompt": "/* Replace every element of x with the square of its value.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\n__global__ void squareEach(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = thread_idx_1;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n 
x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace every element of x with the square of its value.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank squares its own elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.000656047, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006678032, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004258888, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003141556, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003110856, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003514414, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004699018, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005335672, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(n);\n MPI_Recv(&recv[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0006567062, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006698226, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004208678, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.000318259, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003042928, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003468108, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004615402, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005334284, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000658249, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006699788, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004260314, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003114982, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003053008, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003490068, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004619428, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005273032, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010334576, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00097778765, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006575458, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006704196, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004249874, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000313444, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003091202, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003548638, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.0004652632, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005338198, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013353533, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009766972, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, chunk, MPI_INT, x.data(), chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006558568, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006733536, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004090276, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003116128, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003054992, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003496838, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004436584, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005562356, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010308394, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00097282285, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006569064, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006794308, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004233342, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003100556, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003069926, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003454464, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004600052, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": 
true, + "runtime": 0.0005422072, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010331767, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009676812, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0006577376, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000671281, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004076006, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003148386, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003088294, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003529142, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004522246, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000587598, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006577176, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006727744, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000422712, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003181048, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003071342, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003442416, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004670282, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005478738, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010596516, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000976348, + 
"num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0006573112, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006684062, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000422739, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003107592, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003089718, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003564786, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004629122, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005369616, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n int recv_buf[n];\n MPI_Recv(&recv_buf[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv_buf[j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.000657586, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006692914, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003977372, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003187122, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003059296, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003563886, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004584782, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005186072, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank squares its own elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all 
the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0006578104, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006751788, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00040836, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003151734, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003024698, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000351223, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004629524, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005288422, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector temp(n);\n MPI_Recv(&temp[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += temp[j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.000655594, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006676366, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004214136, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003103382, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003071282, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00034236, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004623496, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000546367, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Each rank squares its elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < MPI_COMM_WORLD; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n 
}\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(n);\n MPI_Recv(&recv[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0006556622, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006645668, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00042216, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003088532, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003070098, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003549198, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004670806, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005551096, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Square each element of x in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Gather the squared elements from all ranks to rank 0\n std::vector squared_x(x.size());\n MPI_Gather(x.data(), x.size(), MPI_INT, squared_x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = squared_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0006566404, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028300578, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025783436, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0024137388, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002470966, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026964594, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030104862, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032096342, + "num_procs": 1, + "num_threads": 64 + }, + { + 
"did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0006571124, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000671257, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004154586, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003182694, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003062628, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003428914, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004587786, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005375462, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006586694, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006628992, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004259262, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000320649, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003070322, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003455004, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004672846, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005431834, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010977551, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000981705, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n 
int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_size, MPI_INT, &x[0], local_size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006561794, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006682962, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004200362, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003181752, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003023832, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003550684, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004532706, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054779, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010411, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00098509365, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_size = x.size() / size;\n int start = my_rank * local_size;\n int end = start + local_size;\n if (my_rank == size - 1) {\n end = x.size();\n }\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n MPI_Gather(&x[start], local_size, MPI_INT, &x[0], local_size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00065678, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006796752, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004201924, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003191144, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003088254, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003486564, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004676034, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005408766, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010454961, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009809745, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, 
local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0006571406, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006691552, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000412083, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003115326, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003035854, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003540064, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004606762, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005341322, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00103018, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00097258485, + "num_procs": 4, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use Kokkos to parallelize. Assume Kokkos has already been initialized.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, Kokkos::View const& A, Kokkos::View const& x, double beta, Kokkos::View &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(i); j < A.extent(i+1); ++j) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(i); j > 0; --j) {\n y(i) += alpha * A(i, j - 1).value * x(A(i, j - 1).column);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 
0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i, double &y_i) {\n y_i = beta * y_i;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n y_i += alpha * A(j).value * x(A(j).column);\n }\n }\n }, y);\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n auto spvmv_lambda = KOKKOS_LAMBDA (const COOElement *a, double *y) {\n y[a->row] += a->value * alpha * x[a->column];\n };\n\n KOKKOS_PARALLEL_EXEC(KOKKOS_RANGE_POLICY(0, A.size), spvmv_lambda, A.data, y);\n\n KOKKOS_PARALLEL_FOR(KOKKOS_RANGE_POLICY(0, M), KOKKOS_LAMBDA(size_t i) {\n y[i] = y[i] * alpha + x[i] * alpha;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", M, KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i, double &y_i) {\n y_i = beta * y_i;\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n y_i += alpha * A(j).value * x(A(j).column);\n }\n }\n }, y);\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.row_map(i); j < A.row_map(i + 1); 
++j) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, M), [&](size_t i) {\n y[i] = alpha * Kok\u30b3\u30af::reduce(Kok\u30b3\u30af::Range(0, A.size()), 0.0, [&](size_t j, double sum) {\n if (A[j].row == i) {\n return sum + A[j].value * x[A[j].column];\n }\n return sum;\n }) + uyi;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use OpenMP to parallelize.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for schedule(static)\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n if (e.row < M && e.column < N) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] = y[i] * alpha + y[i] *beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; 
i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n if (e.row < M && e.column < N) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format with sizeA elements.\n A has dimensions MxN, x has N values, and y has M values.\n Use AMD HIP to parallelize. 
The kernel will be launched with at least sizeA threads.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\n__global__ void spmv(double alpha, const COOElement *A, size_t sizeA, const double *x, double beta, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n size_t tid = hipThreadIdx_x;\n if (tid < M) {\n atomicMul(&y[tid], beta);\n }\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < M) {\n double dot = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == tid) {\n dot += A[i].value * x[A[i].column];\n }\n }\n y[tid] = alpha * dot + \u3079\u305f * y[tid];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[blockIdx.x], beta);\n}\n", + "\n\n size_t i = hipThreadIdx_x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n atomicAdd(&y[row], alpha * val * x[col]);\n }\n atomicMul(&y[hipThreadIdx_x], beta);\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = beta * y[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += alpha * A[i].value * x[A[i].column];\n }\n }\n y[row] = sum;\n }\n}\n", + "\nfor (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {\n double dot = 0.0;\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == i) {\n dot += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * dot + \u3079\u305f * y[i];\n }\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + 
"\n\n int i = hipThreadIdx_x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[i], beta);\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double dot = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n dot += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * dot + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t block_size = hipBlockDim_x;\n size_t block_id = hipBlockIdx_x;\n size_t start = block_id * block_size + tid;\n size_t end = min(start + block_size, sizeA);\n\n for (size_t i = start; i < end; i++) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n\n y[tid] *= beta;\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use MPI to parallelize. Assume MPI has already been initialized.\n Every rank has a complete copy of A, x, and y. 
Store the result in y on rank 0.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the result\n for (size_t i = rank; i < M; i += size) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[i * M], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[rank * M], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per rank\n size_t elements_per_rank = A.size() / size;\n\n // Compute the starting index for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = beta * y[i];\n }\n\n // Compute the local portion of A*x\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local contribution to y\n for (size_t i = 0; i < A.size(); ++i) {\n if (A[i].row == rank) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(MPI_IN_PLACE, y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n for (size_t i = rank; i < M; i += size) {\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to the result on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; i++) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is 
responsible for\n size_t rows_per_rank = (M + size - 1) / size;\n\n // Compute the start and end rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, M);\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (auto const& element : A) {\n if (element.row >= start_row && element.row < end_row) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + ":", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the size of the local part of y\n size_t local_M = M / MPI::COMM_WORLD.Get_size();\n size_t start_row = local_M * MPI::COMM_WORLD.Get_rank();\n size_t end_row = start_row + local_M;\n\n // Compute the local part of y\n for (size_t i = start_row; i < end_row; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Gather the local parts of y to rank 0\n MPI::COMM_WORLD.Gather(&y[start_row], local_M, MPI::DOUBLE, &y[0], local_M, MPI::DOUBLE, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own partial sum\n std::vector partial_y(M, 0.0);\n for (auto const& element : A) {\n if (element.row < M / size * rank || element.row >= M / size * (rank + 1)) {\n continue;\n }\n partial_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce partial sums to rank 0\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 scales the result by beta\n if (rank == 0) {\n for (auto& value : y) {\n value *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (COOElement const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n}\n", + "\n\n // Compute the local portion of the result\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the results from all ranks to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Scale the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (auto& value : y) {\n value *= beta;\n }\n }\n}\n", + "\n\n // COO format:\n // A[i] = {row, column, value}\n // where A[i] is the i-th non-zero element in the matrix\n\n // y = alpha*A*x + y\n std::iota(y.begin(), y.end(), 0);\n std::transform(y.begin(), y.end(), x.begin(), y.begin(),\n [alpha, &A](double y, double x) {\n double sum = 0;\n for (auto const& a : A) {\n sum += a.value * (alpha * x);\n }\n return y + sum;\n });\n\n // y = y + (beta*y)\n std::transform(y.begin(), y.end(), y.begin(),\n [beta](double y) {\n return y + (beta * y);\n });\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local contribution to y\n std::vector local_y(M, 0.0);\n for (auto const& element : A) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Add beta*y to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] += ampi * y[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements each rank is responsible for\n size_t elements_per_rank = A.size() / size;\n\n // Compute the starting index of the elements for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index of the elements for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n for (size_t i = rank; i < M; i += size) {\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to the result on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Compute the number of elements per rank\n size_t num_elements_per_rank = A.size() / MPI::COMM_WORLD.Get_size();\n\n // Compute the starting index for this rank\n size_t start_index = num_elements_per_rank * MPI::COMM_WORLD.Get_rank();\n\n // Compute the ending index for this rank\n size_t end_index = start_index + num_elements_per_rank;\n\n // Initialize the local portion of y to beta*y\n std::vector local_y(M, beta * y[MPI::COMM_WORLD.Get_rank()]);\n\n // Perform the sparse matrix-vector multiplication for this rank\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI::COMM_WORLD.Reduce(local_y.data(), y.data(), M, MPI::DOUBLE, MPI::SUM, 0);\n}\n", + "\n\n // Broadcast alpha, beta, M, and N to all ranks\n MPI_Bcast(&alpha, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&beta, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&M, 1, MPI_INT, 0, MPI_COMM_WORLD);\n MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the number of elements per rank\n int num_elements_per_rank = A.size() / MPI_COMM_SIZE;\n\n // Scatter A to all ranks\n std::vector local_A(num_elements_per_rank);\n MPI_Scatter(A.data(), num_elements_per_rank, MPI_BYTE, local_A.data(), num_elements_per_rank, MPI_BYTE, 0, MPI_COMM_WORLD);\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (const auto& element : local_A) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather the local 
contributions to y\n std::vector global_y(M);\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, global_y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Apply beta to y\n if (beta != 0.0) {\n for (auto& element : global_y) {\n element += beta * element;\n }\n }\n\n // Replace y with the global result\n y = std::move(global_y);\n}\n", + "\n\n // Initialize y on all ranks to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute the local contribution of each rank to the result\n for (auto const& element : A) {\n if (element.row < M) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n }\n\n // Reduce the local contributions to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format with sizeA elements.\n A has dimensions MxN, x has N values, and y has M values.\n Use CUDA to parallelize. The kernel will be launched with at least sizeA threads.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\n__global__ void spmv(double alpha, const COOElement *A, size_t sizeA, const double *x, double beta, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n y[blockIdx.x] *= beta;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n y[tid] *= beta;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (gid < sizeA) {\n size_t row = A[gid].row;\n size_t column = A[gid].column;\n double value = A[gid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n\n if (gid < M) {\n atomicAdd(&y[gid], beta * y[gid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n const COOElement &e = A[tid];\n y[e.row] += alpha * e.value * x[e.column];\n }\n y[threadIdx.x] *= beta;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + 
threadIdx.x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n atomicMul(&y[i], beta);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[threadIdx.x], beta);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[blockIdx.x], beta);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (gid < sizeA) {\n const COOElement &a = A[gid];\n y[a.row] += alpha * a.value * x[a.column];\n }\n\n if (gid < M) {\n y[gid] *= beta;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n y[row] += alpha * value * x[column];\n }\n y[threadIdx.x] *= beta;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n y[row] += alpha * value * x[column];\n }\n y[blockIdx.x] *= beta;\n}\n", + "{" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x 
has N values, and y has M values.\n Use MPI and OpenMP to parallelize. Assume MPI has already been initialized.\n Every rank has a complete copy of A, x, and y. Store the result in y on rank 0.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its own partial sum\n std::vector partial_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n partial_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the partial sums to rank 0\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 scales the result by beta\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0015128986, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015069654, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009259594, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004789866, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002476826, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 16 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the result\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (auto const& element : A) {\n if (element.row == i) {\n sum += element.value * x[element.column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&y[0], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[0], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015161644, + "runs": [ + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.5334186566, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2679231586, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1342277178, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0671773854, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.034654962, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0192301076, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.020177377, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0205150155, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01992577485, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the sparse matrix that each rank is responsible for\n size_t num_elements_per_rank = A.size() / size;\n\n // Compute the starting index of the elements in the sparse matrix that each rank is responsible for\n size_t start_index = rank * num_elements_per_rank;\n\n // Compute the ending index of the elements in the sparse matrix that each rank is responsible for\n size_t end_index = (rank + 1) * num_elements_per_rank;\n\n // Compute the local portion of y that each rank is responsible for\n std::vector local_y(M);\n\n // Compute the local portion of y that each rank is responsible for\n #pragma omp parallel for\n for (size_t i = start_index; i < end_index; i++) {\n const COOElement& element = A[i];\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the local portions of y into the global y vector on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.001513101, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015062282, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008940914, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004814776, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000251037, + "num_procs": 1, + 
"num_threads": 8 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 16 + } + ] + }, + { + "generated_output": "\n\n // Initialize y on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::fill(y.begin(), y.end(), 0.0);\n }\n\n // Parallelize over rows of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n COOElement const& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n\n // Parallelize over elements of y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n\n // Gather y from all ranks to rank 0\n MPI::COMM_WORLD.Gather(y.data(), M, MPI::DOUBLE, y.data(), M, MPI::DOUBLE, 0);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of elements each rank is responsible for\n size_t elements_per_rank = A.size() / nprocs;\n\n // Compute the starting index for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local portion of y\n std::vector local_y(M, 0.0);\n\n // Perform the sparse matrix-vector multiplication in parallel\n #pragma omp parallel for\n for (size_t i = start_index; i < end_index; ++i) {\n COOElement element = A[i];\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Compute the local portion of the result\n std::vector local_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * 
x[A[i].column];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.001512428, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015055528, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009099292, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004783936, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000245795, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 16 + } + ] + }, + { + "generated_output": "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.001513097, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001505158, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009198858, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004697876, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002509308, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 16 + } + ] + }, + { + "generated_output": "\n\n // Get the number of ranks and the rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Calculate the number of rows per rank\n size_t num_rows_per_rank = (M + size - 1) / size;\n\n // Calculate the starting and ending rows for the current rank\n size_t start_row = rank * num_rows_per_rank;\n size_t end_row = std::min((rank + 1) * num_rows_per_rank, M);\n\n // Initialize the local portion of y to beta*y\n std::vector local_y(num_rows_per_rank, beta * y[rank]);\n\n // OpenMP parallelization\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n local_y[i - start_row] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n\n // Gather the local portions of y into the global y on rank 0\n MPI_Gather(local_y.data(), num_rows_per_rank, MPI_DOUBLE, y.data(), num_rows_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector 
multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Compute the number of rows per rank\n size_t num_rows_per_rank = (M + MPI_Comm_size(MPI_COMM_WORLD) - 1) / MPI_Comm_size(MPI_COMM_WORLD);\n\n // Compute the starting row for this rank\n size_t start_row = num_rows_per_rank * MPI_Comm_rank(MPI_COMM_WORLD);\n\n // Compute the ending row for this rank\n size_t end_row = std::min(num_rows_per_rank * (MPI_Comm_rank(MPI_COMM_WORLD) + 1), M);\n\n // Initialize the local portion of y to beta*y\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n y[i] = beta * y[i];\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n\n if (row >= start_row && row < end_row) {\n y[row] += alpha * value * x[column];\n }\n }\n\n // Reduce the results from all ranks to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(MPI_IN_PLACE, y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0015156696, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.5329858346, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2671885922, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1352828332, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0672367284, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0342538772, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0188110278, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0198954782, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } 
+ ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / nprocs; i < (rank + 1) * M / nprocs; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local contribution to y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double local_y = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n local_y += alpha * element.value * x[element.column];\n }\n }\n y[i] = local_y;\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&y[0], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[0], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015136478, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.5342531884, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2676058656, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1345357938, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.067330895, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.034420971, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0191582972, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.020286694, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0196729188, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01941814495, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& element = A[i];\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather y from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0015158578, + "runs": [ + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.0015061058, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009299046, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004809284, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002467452, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000236207, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_M = M / size;\n size_t start_row = rank * local_M;\n size_t end_row = start_row + local_M;\n\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[i * local_M], local_M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[start_row], local_M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015157984, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.7991563412, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.401814797, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2005337684, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1010129492, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0520313546, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0270550462, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.022530631, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.013278993, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0082082035, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) 
{\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0015126848, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015059164, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009184994, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004730486, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002509082, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002423682, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 32 + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(Kokkos::View &A, Kokkos::View &X, Kokkos::View &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; k++) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::Range(0, M), KOKKOS_LAMBDA (const size_t i) {\n for 
(size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.extent(0); ++l) {\n if (A(l).row == i && A(l).column == k) {\n for (size_t m = 0; m < X.extent(0); ++m) {\n if (X(m).row == k && X(m).column == j) {\n sum += A(l).value * X(m).value;\n }\n }\n }\n }\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n // Create a parallel reduction to sum up the sparse matrix multiplication.\n auto policy = KOKKOS_RANGE_POLICY(0, M);\n auto sum_lambda = KOKKOS_LAMBDA(const size_t i, double** sum) {\n for (size_t j = 0; j < N; ++j) {\n sum[i][j] = 0.0;\n }\n };\n auto reduce_lambda = KOKKOS_LAMBDA(const size_t i, const size_t j, const COOElement a, const COOElement b, double** sum) {\n sum[a.row][j] += a.value * b.value;\n };\n auto view = KokKOS::parallel_reduce(\n \"Sparse matrix multiplication\", policy, sum_lambda, reduce_lambda, A, X);\n view.wait();\n\n // Copy the result to the output matrix.\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = view(i, j);\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int &i) {\n for (size_t j = 0; j < N; j++) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; k++) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + ".", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, 
k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, M), [&](size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i][j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i][j] += A[i][k].value * X[k][j].value;\n }\n }\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; 
++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; k++) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + ",", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n sum += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n sum += A[l].value * X[m].value;\n break;\n }\n }\n break;\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (auto const& a : A) {\n if (a.row == i && a.column == k) {\n for (auto const& x : X) {\n if (x.row == k && x.column == j) {\n sum += a.value * x.value;\n }\n }\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; k++) {\n for (size_t l = 0; l < A.size(); l++) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); m++) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Y.resize(M * N, 0);\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n for (auto const& e : X) {\n if (A_map.count(e.row) > 0) {\n for (auto const& [column, value] : A_map[e.row]) {\n Y[e.column * M + column] += e.value * value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> row_map;\n for (auto const& element : A) {\n row_map[element.row].push_back(element.value);\n }\n for (auto const& element : X) {\n if (row_map.count(element.column)) {\n for (size_t i = 0; i < row_map[element.column].size(); ++i) {\n Y[element.row * N + i] += row_map[element.column][i] * element.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto& e : A) {\n for (auto& f : X) {\n if (e.column == f.row) {\n Y[e.row * N + f.column] += e.value * f.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M*N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n Y.resize(M * N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot = 0;\n for (size_t k = 0; k < K; ++k) {\n dot += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = dot;\n }\n }\n}\n", + "\n\n Y.resize(M*N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.assign(M * N, 0);\n for (COOElement e : A) {\n for (COOElement f : X) {\n if (e.column == f.row) {\n Y[e.row * N + f.column] += e.value * f.value;\n 
}\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n std::unordered_map> A_row_major;\n for (auto const& e : A) {\n A_row_major[e.row][e.column] = e.value;\n }\n\n std::unordered_map> X_col_major;\n for (auto const& e : X) {\n X_col_major[e.column][e.row] = e.value;\n }\n\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot_product = 0;\n for (size_t k = 0; k < K; ++k) {\n dot_product += A_row_major[i][k] * X_col_major[j][k];\n }\n Y[i * N + j] = dot_product;\n }\n }\n}\n", + "\n\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format with sizeA elements.\n X is a sparse KxN matrix in COO format with sizeX elements. Y is a dense MxN matrix in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\n__global__ void spmm(const COOElement *A, size_t sizeA, const COOElement *X, size_t sizeX, double *Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t k = 0; k < K; k++) {\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == k) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n int i = tid + bid * bdim;\n if (i < sizeA) {\n size_t rowA = A[i].row;\n size_t colA = A[i].column;\n double valA = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == colA) {\n Y[rowA * N + X[j].row] += valA * X[j].value;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n int i = tid + bid * bdim;\n if (i < sizeA) {\n size_t rowA = A[i].row;\n size_t colA = A[i].column;\n double valA = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == colA) {\n Y[rowA * N + X[j].row] += valA * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipGridDim_x;\n int tid_per_block = hipBlockDim_x;\n\n for (int i = tid; i < sizeA; i += tid_per_block * bid_size) {\n int row = A[i].row;\n int col = A[i].column;\n double val = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column < K) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x * bid_size;\n\n for (size_t i = tid; i < sizeA; i += num_threads) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = 
blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n Y[row * N + col] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t a = 0; a < sizeA; ++a) {\n if (A[a].row == row && A[a].column == k) {\n for (size_t x = 0; x < sizeX; ++x) {\n if (X[x].row == k && X[x].column == col) {\n Y[row * N + col] += A[a].value * X[x].value;\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].column) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n for (int i = tid + bid * bdim; i < sizeA; i += gdim * bdim) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n int rowA = A[i].row;\n int colA = A[i].column;\n double valA = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n int rowX = X[j].row;\n int colX = X[j].column;\n double valX = X[j].value;\n if (colA == rowX) {\n atomicAdd(&Y[rowA * N + colX], valA * valX);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x;\n\n for (size_t i = tid; i < sizeA; i += num_threads * bid_size) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == column) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; 
j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and X. Store the result in Y on rank 0.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recv;\n MPI::COMM_WORLD.Recv(recv, M * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < M * N; ++j) {\n Y[j] += recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(Y.data(), M * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0.\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its local contribution to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the local contributions to Y on rank 0.\n if (rank == 0) {\n Y.resize(M * N);\n MPI_Gather(localY.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(localY.data(), M * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions 
to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ",", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the local portion of Y\n std::vector localY(M*N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n \n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M*N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double dot_product = 0;\n for (size_t k = 0; k < K; k++) {\n dot_product += A[i * K + k].value * X[k * N + j].value;\n }\n Y[i * N + j] = dot_product;\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buffer(M * N);\n MPI_Recv(recv_buffer.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n Y[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(Y.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in each row of A and X\n std::vector A_row_counts(M, 0);\n std::vector X_row_counts(K, 0);\n for 
(auto const& element : A) {\n A_row_counts[element.row]++;\n }\n for (auto const& element : X) {\n X_row_counts[element.row]++;\n }\n\n // Compute the offsets for each row of A and X\n std::vector A_offsets(M, 0);\n std::vector X_offsets(K, 0);\n for (size_t i = 1; i < M; i++) {\n A_offsets[i] = A_offsets[i - 1] + A_row_counts[i - 1];\n }\n for (size_t i = 1; i < K; i++) {\n X_offsets[i] = X_offsets[i - 1] + X_row_counts[i - 1];\n }\n\n // Create a buffer to store the local results\n std::vector local_Y(M * N, 0.0);\n\n // Compute the local results\n for (auto const& element : A) {\n for (size_t j = X_offsets[element.column]; j < X_offsets[element.column] + X_row_counts[element.column]; j++) {\n local_Y[element.row * N + X[j].column] += element.value * X[j].value;\n }\n }\n\n // Reduce the local results to the global result\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of elements in the resulting matrix\n size_t num_elements = M * N;\n\n // Initialize the result matrix to zero\n Y.resize(num_elements, 0.0);\n\n // Scatter the result matrix to all ranks\n MPI_Scatter(Y.data(), num_elements, MPI_DOUBLE, Y.data(), num_elements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local part of the result matrix\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local results to the result matrix on rank 0\n MPI_Reduce(Y.data(), num_elements, MPI_DOUBLE, Y.data(), num_elements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double dot_product = 0;\n for (size_t k = 0; k < K; k++) {\n dot_product += A[i * K + k].value * X[k * N + j].value;\n }\n Y[i * N + j] = dot_product;\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buffer(M * N);\n MPI_Recv(recv_buffer.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n Y[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(Y.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.assign(M * N, 0);\n\n // Compute the partial sum on each rank\n std::vector partial_sum(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n partial_sum[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sum(M * N, 0);\n MPI_Allreduce(partial_sum.data(), global_sum.data(), M * N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n Y = global_sum;\n }\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n 
// Compute the matrix multiplication in parallel\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement& a = A[i];\n for (size_t j = 0; j < X.size(); j++) {\n const COOElement& x = X[j];\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0.\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format with sizeA elements.\n X is a sparse KxN matrix in COO format with sizeX elements. Y is a dense MxN matrix in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\n__global__ void spmm(const COOElement *A, size_t sizeA, const COOElement *X, size_t sizeX, double *Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n atomicAdd(&Y[row * N + X[j].column], A[i].value * X[j].value);\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= M) return;\n for (int i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (int j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < K; k++) {\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == row && A[j].column == k) {\n for (size_t l = 0; l < sizeX; l++) {\n if (X[l].row == k && X[l].column == i) {\n Y[row * N + i] += A[j].value * X[l].value;\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int tx = threadIdx.x;\n int bx = blockIdx.x;\n int tid = tx + bx * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t i = 0; i < sizeX; i++) {\n if (X[i].column == column) {\n Y[row * N + X[i].row] += value * X[i].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == A[k].column) {\n Y[row * N + X[j].row] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) 
{\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t k = 0; k < K; ++k) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (X[j].column == k) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t k = 0; k < K; ++k) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row && A[i].column == k) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (X[j].row == k && X[j].column == j) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + k] = sum;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n size_t rowA = A[blockIdx.x].row;\n size_t columnA = A[blockIdx.x].column;\n double valueA = A[blockIdx.x].value;\n\n for (size_t i = 0; i < N; i++) {\n if (columnA == X[threadIdx.x].row) {\n Y[rowA * N + i] += valueA * X[threadIdx.x].value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n atomicAdd(&Y[row * N + X[j].column], A[i].value * X[j].value);\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and X. Store the result in Y on rank 0.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n for (size_t j = 0; j < X.size(); ++j) {\n if (X[j].row == column) {\n localY[row * N + X[j].column] += value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n if (rank == 0) {\n Y.resize(M * N);\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(localY.data(), nullptr, M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0754669092, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0196126984, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.011949713, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0120075138, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0157810172, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.025263282, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0233481312, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0230773714, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": ",", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0754111098, + "runs": [ + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.0197537026, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0119234252, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0121135802, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0159377748, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0241093528, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.024850046, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0209515738, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localY[i * N + j] += A[i * K + k].value * X[k * N + j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0754141458, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0197418978, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0119225496, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0120272076, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0160372436, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0241691666, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0225878398, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0205100492, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "_t,", + "source_write_success": true, + "did_build": false, + 
"is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0754860028, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0197148892, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0118267152, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116388434, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0157848978, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0265519854, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.023459455, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0213067562, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local row range for this rank\n size_t local_m = M / nprocs;\n size_t start_row = rank * local_m;\n size_t end_row = (rank == nprocs - 1) ? 
M : start_row + local_m;\n\n // Initialize the local part of Y to zero\n std::vector local_Y(local_m * N, 0.0);\n\n // Perform the matrix multiplication in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_Y[i * N + j] += A[i * K + k].value * X[k * N + j].value;\n }\n }\n }\n\n // Gather the local results into Y on rank 0\n MPI_Gather(local_Y.data(), local_m * N, MPI_DOUBLE, Y.data(), local_m * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0754168526, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0198650732, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0119029528, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0115837558, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0167162176, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.022075262, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0249097882, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0205455578, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector local_Y(M * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n local_Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0754228954, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0197751884, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": 
true, + "runtime": 0.0119025158, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116899088, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.016411125, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.023424265, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0247431134, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0217890592, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0754139852, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.019752522, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0119327384, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116480742, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0168186676, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0240469296, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0249022444, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0205277704, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of rows of A and X owned by each rank\n size_t nrows_A = A.size() / nprocs;\n size_t nrows_X = X.size() / nprocs;\n\n // Compute the starting row index of A and X owned by each rank\n size_t start_row_A = rank * nrows_A;\n size_t start_row_X = rank * nrows_X;\n\n // Compute the local portion of Y owned by each rank\n std::vector local_Y(M * N, 0.0);\n\n // Compute the local portion of Y owned by each rank\n #pragma omp parallel for\n for (size_t i = 0; i < nrows_A; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n if (A[start_row_A + i].column == X[start_row_X + k].row) {\n local_Y[A[start_row_A + i].row * N + j] += A[start_row_A + i].value * X[start_row_X + k].value;\n }\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + 
"did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0754151412, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0197568922, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0119887252, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116630804, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0147130648, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.024890891, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0249075282, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0207664846, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix-matrix product in parallel\n #pragma mpi_for_schedule(static)\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& a = A[i];\n for (size_t j = 0; j < X.size(); ++j) {\n const COOElement& x = X[j];\n if (a.column == x.row) {\n #pragma openmp atomic\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(MPI_IN_PLACE, Y.size(), MPI_DOUBLE, Y.data(), Y.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0754554852, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0196858346, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0196428044, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0196346712, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0196339176, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0196125456, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.019640584, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0197983644, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": 
false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the partial sum for each row of Y on each rank\n std::vector partial_Y(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n partial_Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_Y(M * N, 0.0);\n MPI_Reduce(partial_Y.data(), global_Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Store the result in Y on rank 0\n if (rank == 0) {\n Y = global_Y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1086743632, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0200444078, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0120158968, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116981422, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0180850414, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0258340046, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.024850639, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0210861844, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0754466766, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0198344458, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0120601456, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0115630512, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0166958952, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0241894286, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0225028944, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0218028148, + "num_procs": 1, + "num_threads": 64 + }, + { + 
"did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0754279614, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0197574914, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0118916482, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0121034794, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.016825793, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0212797312, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0223711308, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0216546986, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t k = 0; k < K; ++k) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n\n // Reduce the result to rank 0\n MPI_Reduce(MPI_IN_PLACE, Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1086965896, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 5.867898119, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 3.0327469778, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.6906453972, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.9797970456, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.6346846402, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.462479764, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.4252653518, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, 
nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n for (size_t j = 0; j < X.size(); j++) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0754479446, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0198479692, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0119297476, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.011725226, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0165852294, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0249940058, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.024762403, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0209064388, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(Kokkos::View &A, Kokkos::View &b, Kokkos::View &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View r(\"r\", N);\n Kokkos::View z(\"z\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n r[i] = b[i];\n z[i] = 0.0;\n });\n\n for (size_t iter = 0; iter < N; ++iter) {\n for (size_t i = 0; i < A.extent(0); ++i) {\n const auto &e = A[i];\n r[e.row] -= e.value * z[e.column];\n }\n for (size_t i = 0; i < A.extent(0); ++i) {\n const auto &e = A[i];\n z[e.column] += e.value * r[e.row];\n }\n }\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) { x[i] = r[i]; });\n}\n", + "\n\n Kokkos::parallel_for(\"solve_linear_system\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::View r(\"r\", N);\n Kokkos::View z(\"z\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n r[i] = b[i];\n z[i] = 0.0;\n });\n for (size_t k = 0; k < A.extent(0); ++k) {\n const COOElement &e = A[k];\n r[e.column] -= e.value * z[e.row];\n }\n for (size_t k = 0; k < A.extent(0); ++k) {\n const COOElement &e = A[k];\n z[e.row] += e.value * r[e.column];\n }\n Kokkos::parallel_for(\"copy\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = z[i];\n });\n}\n", + "\n\n Kok\uf09e::View A_h(A.data(), A.extent(0));\n Kok\uf09e::View b_h(b.data(), b.extent(0));\n Kok\uf09e::View x_h(x.data(), x.extent(0));\n\n Kok\uf09e::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n x_h[i] = b_h[i];\n for (size_t j = A_h(i, 0); j < A_h(i, 1); ++j) {\n x_h[i] -= A_h(i, 2 + j) * x_h[A_h(i, 2 + j)];\n }\n x_h[i] /= A_h(i, 1);\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_coo\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::View x_host(\"x_host\", N);\n Kokkos::parallel_for(\"solve\", N, KOKKOS_LAMBDA(size_t i) {\n x_host(i) = 0.0;\n });\n Kokkos::deep_copy(x, x_host);\n Kokkos::sparse_gmres(A_sparse, x, b, 1e-6, 100);\n\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View A_sparse(\"A_sparse\", N, N);\n Kokkos::parallel_for(\"init_A\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n A_sparse(i, j) = 0.0;\n }\n });\n Kokkos::parallel_for(\"fill_A\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n A_sparse(A(i).row, A(i).column) = A(i).value;\n });\n\n // Solve the linear system\n Kokkos::View x_temp(\"x_temp\", N);\n Kokkos::sparse::cg(A_sparse, b, x_temp, 1e-6, 100);\n\n // Copy the solution to x\n Kokkos::parallel_for(\"copy_x\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x(i) = x_temp(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t 
j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * b(A(j).column);\n }\n }\n x(i) = sum;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_COO\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::View A_sparse(\"A_sparse\", N, N);\n Kokkos::parallel_for(\"initialize_A\", N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; j++) {\n A_sparse(i, j) = 0.0;\n }\n });\n Kokkos::parallel_for(\"populate_A\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n A_sparse(row_indices(i), column_indices(i)) = values(i);\n });\n\n // Solve the linear system\n Kokkos::View x_temp(\"x_temp\", N);\n Kokkos::sparse::CrsMatrix A_crs(N, N, A_sparse);\n Kokkos::sparse::solve(\"CrsMatrix\", A_crs, b, x_temp);\n\n // Copy the result to x\n Kokkos::parallel_for(\"copy_result\", N, KOKKOS_LAMBDA(size_t i) {\n x(i) = x_temp(i);\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values = Kokkos::View(A.data(), A.extent(0));\n Kokkos::View row_indices = Kokkos::View(A.data() + 1, A.extent(0));\n Kokkos::View column_indices = Kokkos::View(A.data() + 2, A.extent(0));\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::SparseSolveCG(A_sparse, x, b, 1e-6, 1000);\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x(i) = b(i);\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x(A(j).column) -= A(j).value * x(i);\n }\n }\n x(i) /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View col_indices(\"col_indices\", A.extent(0));\n Kokkos::parallel_for(A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values[i] = A(i).value;\n row_indices[i] = A(i).row;\n col_indices[i] = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, col_indices);\n\n // Solve the linear system\n Kokkos::View x_copy(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::SparseSolveCG>(A_sparse, x_copy, b, 1e-6, 1000);\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(j).row == i) {\n x[A(j).column] -= A(j).value * x[i];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n 
Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n using team_member = KokKOS::TeamMember;\n using team_view = KokKOS::TeamView;\n using team_scheduler = KokKOS::TeamScheduler;\n\n team_scheduler team(N);\n\n team.parallel_for(\n \"solve_linear_system\",\n KOKKOS_LAMBDA(const team_member member) {\n const size_t row = member.league_rank();\n double sum = 0;\n for (size_t i = 0; i < A.size(); ++i) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_coo\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::View x_copy(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::sparse_solve(A_sparse, x_copy, b);\n\n // Copy the solution back to x\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. 
x and b are dense vectors with N elements.\n Use OpenMP to compute in parallel.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + ".", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n COOElement const& e = A[i];\n x[e.column] -= e.value * x[e.row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t k = 0; k < A.size(); ++k) {\n const COOElement& e = A[k];\n x[e.column] -= e.value * x[e.row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + 
"\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] -= element.value * x[element.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] -= element.value * x[element.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n // Initialize x to 0\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = 0;\n }\n\n // Iterate over the elements of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n // Add A[i].value * b[A[i].column] to x[A[i].row]\n x[A[i].row] += A[i].value * b[A[i].column];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. 
x and b are dense vectors with N elements.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector y(N, 0);\n for (auto const& e : A) {\n y[e.column] += e.value * b[e.row] / row_sums[e.row];\n }\n x = y;\n}\n", + "\n\n std::vector r(N, 0);\n std::vector row_start(N + 1, 0);\n for (auto const& e : A) {\n row_start[e.row + 1]++;\n }\n for (size_t i = 1; i <= N; i++) {\n row_start[i] += row_start[i - 1];\n }\n for (auto const& e : A) {\n r[row_start[e.row]++] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = row_start[i]; j < row_start[i + 1]; j++) {\n sum += r[j] * x[j];\n }\n x[i] = (b[i] - sum) / r[row_start[i]];\n }\n}\n", + "\n\n std::vector r(N, 0);\n for (auto const& e : A) {\n r[e.row] += e.value * x[e.column];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = (b[i] - r[i]) / A[i].value;\n }\n}\n", + "\n\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / rowSum[i];\n }\n}\n", + "\n\n std::vector r(N, 0);\n for (auto const& e : A) {\n r[e.row] += e.value * b[e.column];\n }\n x.resize(N);\n for (size_t i = 0; i < N; ++i) {\n x[i] = r[i] / A[i].value;\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (const COOElement& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector y(N, 0);\n for (const COOElement& e : A) {\n y[e.row] += e.value * b[e.column] / row_sum[e.row];\n }\n x = y;\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector diag(N, 0);\n std::vector offdiag(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] += e.value;\n } else {\n offdiag[e.row] += e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / (diag[i] - offdiag[i]);\n }\n for (auto const& e : A) {\n if (e.row != e.column) {\n x[e.column] -= e.value * x[e.row] / (diag[e.column] - offdiag[e.column]);\n }\n }\n}\n", + "\n\n std::vector> A_dense(N, std::vector(N, 0));\n for (auto const& e : A) {\n A_dense[e.row][e.column] = e.value;\n }\n std::vector L(N, 0), U(N, 0);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n L[i] += A_dense[i][j] * U[j];\n }\n U[i] = b[i] - L[i];\n }\n x[N - 1] = U[N - 1] / A_dense[N - 1][N - 1];\n for (size_t i = N - 2; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_dense[i][j] * x[j];\n }\n x[i] = (U[i] - sum) / A_dense[i][i];\n }\n}\n", + "\n\n std::vector diag(N, 0);\n std::vector lower(N, 0);\n std::vector upper(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n } else if (e.row < e.column) {\n lower[e.row] += e.value;\n } else {\n upper[e.row] += e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n double d = diag[i];\n if (d == 0) {\n throw std::runtime_error(\"Matrix is singular\");\n }\n x[i] = (b[i] - lower[i] - upper[i]) / d;\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sum[i];\n }\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector diag(N, 0);\n for 
(auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / diag[i];\n for (auto const& e : A) {\n if (e.row == i && e.column != i) {\n x[e.column] -= e.value * x[i] / row_sums[e.column];\n }\n }\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& element : A) {\n row_sum[element.row] += element.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sum[i];\n }\n}\n", + "\n\n std::vector diag(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n }\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n for (size_t j = e.column + 1; j < N; j++) {\n x[j] -= e.value * x[e.column];\n }\n }\n }\n x[i] /= diag[i];\n }\n for (size_t i = N - 1; i > 0; i--) {\n for (auto const& e : A) {\n if (e.column == i) {\n for (size_t j = 0; j < e.row; j++) {\n x[j] -= e.value * x[e.row];\n }\n }\n }\n }\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sums[i];\n }\n}\n", + "\n\n std::vector rowSum(N, 0.0);\n for (COOElement const& e : A) {\n rowSum[e.row] += e.value;\n }\n std::vector y(N, 0.0);\n for (COOElement const& e : A) {\n y[e.row] += e.value * b[e.column] / rowSum[e.row];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] - y[i];\n }\n}\n", + "\n\n std::vector> A_dense(N, std::vector(N, 0));\n for (auto const& e : A) {\n A_dense[e.row][e.column] = e.value;\n }\n std::vector x_dense(N, 0);\n solveLinearSystemDense(A_dense, b, x_dense);\n x = x_dense;\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n\n std::vector y(N, 0);\n for (auto const& e : A) {\n y[e.column] += e.value * b[e.row] / row_sums[e.row];\n }\n\n x = y;\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector diag_inv(N, 0);\n for (size_t i = 0; i < N; ++i) {\n diag_inv[i] = 1.0 / row_sums[i];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] * diag_inv[i];\n for (auto const& e : A) {\n if (e.row == i) continue;\n x[e.row] -= e.value * x[i] * diag_inv[e.row];\n }\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector x_tmp(N, 0);\n for (auto const& e : A) {\n x_tmp[e.column] += e.value * b[e.row] / row_sum[e.row];\n }\n x = x_tmp;\n}\n", + "\n\n std::vector diag(N, 0);\n std::vector offdiag(N, 0);\n std::vector rowptr(N + 1, 0);\n for (auto const& e : A) {\n rowptr[e.row + 1]++;\n if (e.row == e.column)\n diag[e.row] += e.value;\n else\n offdiag[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n rowptr[i + 1] += rowptr[i];\n }\n std::vector L(rowptr[N]);\n std::vector U(rowptr[N]);\n for (auto const& e : A) {\n if (e.row == e.column)\n continue;\n if (e.row < e.column)\n L[rowptr[e.row]++] = {e.row, e.column, e.value};\n else\n U[rowptr[e.column]++] = {e.row, e.column, e.value};\n }\n std::vector y(N, 0);\n for (size_t i = 0; i < N; i++) {\n y[i] = b[i];\n for (size_t j = rowptr[i]; j < rowptr[i + 1]; j++) {\n y[i] -= L[j].value * x[L[j].column];\n }\n y[i] /= diag[i];\n }\n for (size_t i = N - 1; i < N; i--) {\n x[i] = y[i];\n for (size_t j = rowptr[i + 1]; j < rowptr[i + 2]; j++) {\n x[i] -= U[j].value * x[U[j].column];\n }\n x[i] /= diag[i];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", 
+ "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format with sizeA elements. x and b are dense vectors with N elements.\n Use AMD HIP to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\n__global__ void solveLinearSystem(const COOElement *A, size_t sizeA, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t col = A[tid].column;\n double val = A[tid].value;\n atomicAdd(&x[col], val * b[row]);\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n x[i] = b[i];\n }\n __syncthreads();\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].column == i) {\n x[A[j].row] -= A[j].value * x[i];\n }\n }\n __syncthreads();\n if (i < N) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = b[i];\n }\n __syncthreads();\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == i) {\n x[A[k].column] -= A[k].value * x[i];\n }\n }\n __syncthreads();\n if (i < N) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int tid = 
threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&x[column], value * b[row]);\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + ".", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = 0;\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == i) {\n x[i] += A[j].value * b[A[j].column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n \n // Create a CSR matrix from the COO format\n std::vector row_offsets(N + 1, 0);\n std::vector column_indices;\n std::vector values;\n for (auto const& element : A) {\n row_offsets[element.row + 1]++;\n column_indices.push_back(element.column);\n values.push_back(element.value);\n }\n for (size_t i = 1; i < N + 1; i++) {\n row_offsets[i] += row_offsets[i - 1];\n }\n \n // Create a dense vector for the solution\n x.resize(N);\n \n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(values[row_offsets[j] + i]) > std::abs(values[row_offsets[pivot_row] + i])) {\n pivot_row = j;\n }\n }\n \n // Swap the pivot row with the current row\n if (pivot_row != i) {\n std::swap(row_offsets[pivot_row], row_offsets[i]);\n std::swap(column_indices[row_offsets[pivot_row] - 1], column_indices[row_offsets[i] - 1]);\n std::swap(values[row_offsets[pivot_row] - 1], values[row_offsets[i] - 1]);\n }\n \n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = values[row_offsets[j] + i] / values[row_offsets[i] + i];\n for (size_t k = i; k < N; k++) {\n values[row_offsets[j] + k] -= factor * values[row_offsets[i] + k];\n }\n b[j] -= factor * b[i];\n }\n }\n \n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= values[row_offsets[i] + j] * x[j];\n }\n x[i] /= values[row_offsets[i] + i];\n }\n \n // Gather the solution from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, comm);\n }\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a vector object from the dense vector\n Eigen::VectorXd vector_b(N);\n for (size_t i = 0; i < N; ++i) {\n vector_b(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd vector_x = matrix.ldlt().solve(vector_b);\n\n // Copy the result to the output vector\n for (size_t i = 0; i < N; ++i) {\n x[i] = vector_x(i);\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a dense vector from b\n Eigen::VectorXd b_vector(N);\n for (size_t i = 0; i < N; ++i) {\n b_vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_vector = matrix.ldlt().solve(b_vector);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_vector(i);\n }\n }\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n\n // Create a vector object 
from the dense vector\n Eigen::VectorXd vector_b(N);\n for (size_t i = 0; i < N; ++i) {\n vector_b(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd vector_x = matrix.ldlt().solve(vector_b);\n\n // Copy the result to the output vector\n for (size_t i = 0; i < N; ++i) {\n x[i] = vector_x(i);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a sparse matrix object\n MPI_Sparse_matrix A_mpi;\n MPI_Sparse_matrix_create_coo(MPI_COMM_WORLD, N, N, A.size(), &A[0], &A_mpi);\n\n // Create a dense vector object for b\n MPI_Vector b_mpi;\n MPI_Vector_create(MPI_COMM_WORLD, N, &b_mpi);\n MPI_Vector_set_values(b_mpi, N, &b[0]);\n\n // Create a dense vector object for x\n MPI_Vector x_mpi;\n MPI_Vector_create(MPI_COMM_WORLD, N, &x_mpi);\n\n // Solve the linear system\n MPI_Sparse_solve(A_mpi, b_mpi, x_mpi);\n\n // Get the result from rank 0\n if (rank == 0) {\n MPI_Vector_get_values(x_mpi, N, &x[0]);\n }\n\n // Free the MPI objects\n MPI_Sparse_matrix_free(&A_mpi);\n MPI_Vector_free(&b_mpi);\n MPI_Vector_free(&x_mpi);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Scatter b to all ranks\n std::vector local_b(b.begin() + rank * b.size() / size, b.begin() + (rank + 1) * b.size() / size);\n\n // Solve the local system Ax=b for x\n std::vector local_x(N, 0.0);\n for (auto const& e : A) {\n local_x[e.row] += e.value * local_b[e.column];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_x(N, 0.0);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n \n int *rows = new int[A.size()];\n int *cols = new int[A.size()];\n double *vals = new double[A.size()];\n for (size_t i = 0; i < A.size(); i++) {\n rows[i] = A[i].row;\n cols[i] = A[i].column;\n vals[i] = A[i].value;\n }\n \n // Create a sparse matrix object\n int nrows = N;\n int ncols = N;\n int nnz = A.size();\n int *ia = new int[nrows + 1];\n int *ja = new int[nnz];\n double *a = new double[nnz];\n \n // Convert the COO format to CSR format\n ia[0] = 0;\n for (int i = 0; i < nnz; i++) {\n ja[i] = cols[i];\n a[i] = vals[i];\n }\n for (int i = 1; i < nrows + 1; i++) {\n ia[i] = ia[i - 1];\n for (int j = ia[i - 1]; j < nnz; j++) {\n if (ja[j] >= i) {\n ia[i] = j;\n break;\n }\n }\n }\n \n // Create a parallel sparse matrix object\n Mat A_csr(nrows, ncols, nnz, ia, ja, a);\n \n // Create a parallel dense vector for b\n Vec b_vec(nrows);\n for (int i = 0; i < nrows; i++) {\n b_vec[i] = b[i];\n }\n \n // Create a parallel dense vector for x\n Vec x_vec(nrows);\n \n // Solve the linear system\n A_csr.solve(b_vec, x_vec);\n \n // Gather the solution from all ranks to rank 0\n if (rank == 0) {\n for (int i = 0; i < nrows; i++) {\n x[i] = x_vec[i];\n }\n }\n \n // Clean up\n delete[] rows;\n delete[] cols;\n delete[] vals;\n delete[] ia;\n delete[] ja;\n 
delete[] a;\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the number of elements in each row of A\n std::vector rowCounts(N, 0);\n for (auto const& e : A) {\n rowCounts[e.row]++;\n }\n\n // Compute the offsets of each row in the CSR format\n std::vector rowOffsets(N, 0);\n for (size_t i = 1; i < N; i++) {\n rowOffsets[i] = rowOffsets[i - 1] + rowCounts[i - 1];\n }\n\n // Convert A to CSR format\n std::vector colIndices(A.size());\n std::vector values(A.size());\n for (auto const& e : A) {\n colIndices[rowOffsets[e.row] + rowCounts[e.row] - 1] = e.column;\n values[rowOffsets[e.row] + rowCounts[e.row] - 1] = e.value;\n rowCounts[e.row]--;\n }\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivotRow = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(values[rowOffsets[j] + rowCounts[j] - 1]) > std::abs(values[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1])) {\n pivotRow = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivotRow != i) {\n std::swap(rowOffsets[i], rowOffsets[pivotRow]);\n std::swap(rowCounts[i], rowCounts[pivotRow]);\n std::swap(values[rowOffsets[i] + rowCounts[i] - 1], values[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1]);\n std::swap(colIndices[rowOffsets[i] + rowCounts[i] - 1], colIndices[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = values[rowOffsets[j] + rowCounts[j] - 1] / values[rowOffsets[i] + rowCounts[i] - 1];\n for (size_t k = rowOffsets[j] + rowCounts[j] - 1; k >= rowOffsets[j]; k--) {\n values[k] -= factor * values[rowOffsets[i] + rowCounts[i] - 1 + k - rowOffsets[j]];\n colIndices[k] = colIndices[rowOffsets[i] + rowCounts[i] - 1 + k - rowOffsets[j]];\n }\n rowCounts[j] = rowCounts[i] - 1;\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = rowOffsets[i] + rowCounts[i] - 1; j < rowOffsets[i] + rowCounts[i]; j++) {\n x[i] -= values[j] * x[colIndices[j]];\n }\n x[i] /= values[rowOffsets[i] + rowCounts[i] - 1];\n }\n\n // Gather the solution on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N / size], N / size, MPI_DOUBLE, i, 0, comm, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N / size], N / size, MPI_DOUBLE, 0, 0, comm);\n }\n}\n", + "\n\n // Create a new MPI datatype for COOElement\n MPI_Datatype coo_element_type;\n MPI_Type_contiguous(3, MPI_DOUBLE, &coo_element_type);\n MPI_Type_commit(&coo_element_type);\n\n // Scatter the COO matrix and b vector to all ranks\n std::vector local_A(A.size() / MPI::COMM_WORLD.Get_size());\n std::vector local_b(b.size() / MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Scatter(A.data(), A.size(), coo_element_type, local_A.data(), local_A.size(), coo_element_type, 0);\n MPI::COMM_WORLD.Scatter(b.data(), b.size(), MPI_DOUBLE, local_b.data(), local_b.size(), MPI_DOUBLE, 0);\n\n // Solve the local linear system\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (auto& element : local_A) {\n if (element.row == i) {\n sum += element.value * local_b[element.column];\n }\n }\n local_x[i] = sum;\n }\n\n // Gather the local solutions to rank 0\n MPI::COMM_WORLD.Gather(local_x.data(), local_x.size(), MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, 
size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the number of elements per rank\n size_t num_elements_per_rank = A.size() / size;\n // Compute the starting index of the elements for this rank\n size_t start_index = rank * num_elements_per_rank;\n // Compute the ending index of the elements for this rank\n size_t end_index = (rank + 1) * num_elements_per_rank;\n\n // Create a local copy of the elements for this rank\n std::vector local_A(A.begin() + start_index, A.begin() + end_index);\n\n // Create a local copy of the vector b for this rank\n std::vector local_b(b.begin() + start_index, b.begin() + end_index);\n\n // Create a local copy of the vector x for this rank\n std::vector local_x(N);\n\n // Solve the local linear system\n for (size_t i = 0; i < local_A.size(); i++) {\n const COOElement& element = local_A[i];\n local_x[element.row] += element.value * local_b[element.column];\n }\n\n // Reduce the local solutions to the global solution on rank 0\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local row indices and values for this rank\n std::vector local_rows;\n std::vector local_values;\n for (auto const& element : A) {\n if (element.row / N == rank) {\n local_rows.push_back(element.row % N);\n local_values.push_back(element.value);\n }\n }\n\n // Create a sparse matrix in CSR format\n std::vector csr_row_offsets(size + 1);\n std::vector csr_column_indices(A.size());\n std::vector csr_values(A.size());\n MPI_Allgather(&local_rows[0], local_rows.size(), MPI_UNSIGNED_LONG, &csr_column_indices[0], local_rows.size(), MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n MPI_Allgather(&local_values[0], local_values.size(), MPI_DOUBLE, &csr_values[0], local_values.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n for (size_t i = 0; i < size; ++i) {\n csr_row_offsets[i + 1] = csr_row_offsets[i] + local_rows.size();\n }\n\n // Solve the linear system using the CSR matrix\n std::vector local_x(N);\n if (rank == 0) {\n x.resize(N);\n }\n sparse_solve(csr_row_offsets, csr_column_indices, csr_values, b, local_x);\n\n // Gather the solution from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the local number of rows and the local row indices\n size_t local_N = N / size;\n size_t start_row = rank * local_N;\n size_t end_row = (rank + 1) * local_N;\n\n // Compute the local contribution to the solution\n std::vector local_x(local_N);\n for (size_t i = start_row; i < end_row; ++i) {\n local_x[i - start_row] = b[i];\n for (COOElement const& e : A) {\n if (e.row == i) {\n local_x[i - start_row] -= e.value * x[e.column];\n }\n }\n local_x[i - start_row] /= A[i].value;\n }\n\n // Gather the local solutions into the global solution\n MPI_Gather(local_x.data(), local_N, MPI_DOUBLE, x.data(), local_N, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a vector object from the dense vector\n Eigen::VectorXd vector(N);\n for (size_t i = 0; i < N; ++i) {\n vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd result = 
matrix.ldlt().solve(vector);\n\n // Copy the result to the output vector\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = result(i);\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n int nrows = N / size + (rank < N % size);\n std::vector localA(A.begin() + rank * (N / size), A.begin() + rank * (N / size) + nrows);\n std::vector localb(b.begin() + rank * (N / size), b.begin() + rank * (N / size) + nrows);\n std::vector localx(nrows);\n \n // Solve the local system\n for (size_t i = 0; i < nrows; i++) {\n double sum = 0;\n for (size_t j = 0; j < localA.size(); j++) {\n if (localA[j].row == i) {\n sum += localA[j].value * localx[localA[j].column];\n }\n }\n localx[i] = (localb[i] - sum) / localA[i].value;\n }\n \n // Gather the local solutions\n MPI_Gather(localx.data(), nrows, MPI_DOUBLE, x.data(), nrows, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the local part of the matrix\n size_t local_size = A.size() / size;\n\n // Compute the local part of the matrix\n std::vector local_A(A.begin() + rank * local_size, A.begin() + (rank + 1) * local_size);\n\n // Compute the local part of the vector b\n std::vector local_b(b.begin() + rank * local_size, b.begin() + (rank + 1) * local_size);\n\n // Compute the local part of the solution vector\n std::vector local_x(N);\n\n // Solve the local linear system\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (auto& element : local_A) {\n if (element.column == i) {\n sum += element.value * local_x[element.row];\n }\n }\n local_x[i] = (local_b[i] - sum) / local_A[i].value;\n }\n\n // Gather the local solutions into the global solution vector\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the number of elements per rank\n size_t elementsPerRank = A.size() / size;\n \n // Compute the starting index of the elements for this rank\n size_t startIndex = rank * elementsPerRank;\n \n // Compute the ending index of the elements for this rank\n size_t endIndex = (rank + 1) * elementsPerRank;\n \n // Create a local copy of the elements for this rank\n std::vector localA(A.begin() + startIndex, A.begin() + endIndex);\n \n // Create a local copy of the vector b for this rank\n std::vector localb(b.begin() + startIndex, b.begin() + endIndex);\n \n // Create a local copy of the vector x for this rank\n std::vector localx(N);\n \n // Solve the local linear system\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < localA.size(); j++) {\n if (localA[j].column == i) {\n sum += localA[j].value * localb[j];\n }\n }\n localx[i] = sum;\n }\n \n // Gather the local solutions into the global solution\n MPI_Gather(localx.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_x(N, 0.0);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_x.data(), x.data(), N, 
MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the local part of the matrix\n size_t local_size = A.size() / size;\n size_t start_index = rank * local_size;\n size_t end_index = (rank + 1) * local_size;\n\n // Compute the local part of the matrix-vector product\n std::vector local_Ax(N, 0.0);\n for (size_t i = start_index; i < end_index; ++i) {\n local_Ax[A[i].column] += A[i].value * b[A[i].row];\n }\n\n // Reduce the local results to the global result\n MPI_Allreduce(local_Ax.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Rank 0 stores the final result\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Each rank computes its part of the result\n std::vector local_x(N);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n \n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format with sizeA elements. x and b are dense vectors with N elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\n__global__ void solveLinearSystem(const COOElement *A, size_t sizeA, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = tid / N;\n int col = tid % N;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column == col) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[tid].value;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n // Each thread computes one element of x.\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = A[row].column[col]; k < A[row].column[col + 1]; ++k) {\n sum += A[row].value[k] * x[A[row].column[k]];\n }\n b[row] = (b[row] - sum) / A[row].value[A[row].column[col + 1] - 1];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n 
x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n double sum = 0;\n for (int k = 0; k < size; ++k) {\n if (A[k].row == row && A[k].column == col) {\n sum += A[k].value * x[A[k].column];\n }\n }\n x[row] = (b[row] - sum) / A[size - 1].value;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int tid_offset = tid + bid * blockDim.x;\n if (tid_offset < sizeA) {\n size_t row = A[tid_offset].row;\n size_t column = A[tid_offset].column;\n double value = A[tid_offset].value;\n atomicAdd(&x[column], value * b[row]);\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = tid + bid * blockDim.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "&", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the products of A and b for each row\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value * b[A[i].column];\n }\n\n // Gather the row sums to rank 0\n std::vector globalRowSums(N);\n MPI_Allgather(rowSums.data(), N, MPI_DOUBLE, globalRowSums.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = globalRowSums[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a dense vector for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; ++i) {\n b_eigen(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_eigen = matrix.ldlt().solve(b_eigen);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_eigen(i);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix A_sparse(N, N);\n for (auto const& e : A) {\n A_sparse.insert(e.row, e.column) = e.value;\n }\n A_sparse.makeCompressed();\n\n // Create dense vectors for x and b\n Eigen::VectorXd x_dense(N), b_dense(N);\n for (size_t i = 0; i < N; ++i) {\n x_dense[i] = x[i];\n b_dense[i] = b[i];\n }\n\n // Solve the linear system using Eigen\n x_dense = A_sparse.colPivHouseholderQr().solve(b_dense);\n\n // Copy the result back to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_dense[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get the number of processes and the rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per process\n int elementsPerProcess = N / size;\n\n // 
Create a buffer to store the local part of A and b\n std::vector localA(elementsPerProcess);\n std::vector localb(elementsPerProcess);\n\n // Scatter A and b to all processes\n MPI_Scatter(A.data(), elementsPerProcess, MPI_DOUBLE, localA.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(b.data(), elementsPerProcess, MPI_DOUBLE, localb.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Create a local copy of x\n std::vector localx(N, 0.0);\n\n // Perform the Gaussian elimination in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n // Find the pivot element\n int pivotRow = i;\n double maxPivot = std::abs(localA[i].value);\n for (int j = i + 1; j < elementsPerProcess; j++) {\n if (std::abs(localA[j].value) > maxPivot) {\n pivotRow = j;\n maxPivot = std::abs(localA[j].value);\n }\n }\n\n // Swap the pivot row with the first row\n std::swap(localA[i], localA[pivotRow]);\n std::swap(localb[i], localb[pivotRow]);\n\n // Eliminate the non-zero elements below the pivot\n for (int j = i + 1; j < elementsPerProcess; j++) {\n double factor = localA[j].value / localA[i].value;\n localA[j].value -= factor * localA[i].value;\n localb[j] -= factor * localb[i];\n }\n }\n\n // Gather the local solutions into a global solution\n MPI_Gather(localx.data(), elementsPerProcess, MPI_DOUBLE, x.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform the back substitution to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (int j = i + 1; j < N; j++) {\n sum += localA[j].value * x[j];\n }\n x[i] = (localb[i] - sum) / localA[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1510900326, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012278872, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013624582, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017889242, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0024969908, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003717132, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00442184, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0044818826, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the local contribution of each rank to the solution\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column];\n }\n\n // Reduce the local contributions to the global solution on rank 0\n MPI_Reduce(x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + 
"generated_output": "\n\n // Initialize x to 0\n x.assign(N, 0);\n\n // Compute the sum of the products of A and x for each row\n std::vector rowSums(N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value * x[A[i].column];\n }\n\n // Gather the row sums to rank 0\n std::vector globalRowSums(N);\n MPI_Allgather(rowSums.data(), N, MPI_DOUBLE, globalRowSums.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Solve the system of equations on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = (b[i] - globalRowSums[i]) / A[i].value;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Iterate over the elements of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement& element = A[i];\n x[element.row] += element.value * b[element.column];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(&rowSums[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n\n // Gather x from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "<}", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Step 1: Distribute the rows of A and b to each rank\n std::vector localA;\n std::vector localb;\n distributeRows(A, b, localA, localb, rank, size, N);\n\n // Step 2: Solve the local system on each rank\n std::vector localx(N, 0.0);\n for (auto& element : localA) {\n localx[element.row] += element.value * localb[element.column];\n }\n\n // Step 3: Gather the local solutions to rank 0\n MPI_Gather(localx.data(), N, 
MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(rowSums.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize x to 0\n x.resize(N);\n std::fill(x.begin(), x.end(), 0.0);\n\n // Perform Gauss-Seidel iteration\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n x[i] = (b[i] - sum) / A[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize x to 0 on all ranks\n x.resize(N);\n std::fill(x.begin(), x.end(), 0.0);\n\n // Perform Gauss-Jordan elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i].value) > std::abs(A[pivot_row * N + i].value)) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the ith row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the ith column\n double pivot = A[i * N + i].value;\n for (size_t j = 0; j < N; ++j) {\n if (j != i) {\n double factor = A[j * N + i].value / pivot;\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k].value -= factor * A[i * N + k].value;\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j].value * x[j];\n }\n x[i] /= A[i * N + i].value;\n }\n\n // Gather the solution on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n MPI::COMM_WORLD.Gather(&x[0], N, MPI::DOUBLE, &x[0], N, MPI::DOUBLE, 0);\n } else {\n MPI::COMM_WORLD.Gather(&x[0], N, MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n\n // Create a dense vector for b\n Eigen::VectorXd b_vector(N);\n for (size_t i = 0; i < N; ++i) {\n b_vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_vector = matrix.ldlt().solve(b_vector);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_vector(i);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0);\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] += element.value * b[element.column];\n }\n }\n }\n\n // Gather the results to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the local contribution of each rank to the solution\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n x[e.row] += e.value * b[e.column];\n }\n\n // Reduce the local contributions to the global solution on rank 0\n MPI_Reduce(x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix A_sparse(N, N);\n for (auto const& element : A) {\n A_sparse.insert(element.row, element.column) = element.value;\n }\n\n // Create dense vectors for x and b\n Eigen::VectorXd x_dense(N), b_dense(N);\n for (size_t i = 0; i < N; ++i) {\n x_dense[i] = x[i];\n b_dense[i] = b[i];\n }\n\n // Solve the linear system\n Eigen::SimplicialLDLT> solver;\n solver.compute(A_sparse);\n x_dense = solver.solve(b_dense);\n\n // Copy the result back to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_dense[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 
0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(rowSums.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n\n // Gather x from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Perform Gauss-Jordan elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i].value) > std::abs(A[pivot_row * N + i].value)) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j].value, A[pivot_row * N + j].value);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n double pivot = A[i * N + i].value;\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j].value /= pivot;\n }\n b[i] /= pivot;\n for (size_t j = 0; j < N; ++j) {\n if (j != i) {\n double factor = A[j * N + i].value;\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k].value -= factor * A[i * N + k].value;\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(Kokkos::View &A, Kokkos::View &L, Kokkos::View &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Iterate over the rows of A\n Kokkos::parallel_for(A.extent(0), KOKKOS_LAMBDA(size_t i) {\n // Get the row of A\n const COOElement* row = &A(i);\n\n // Iterate over the columns of the row\n for (size_t j = 0; j < row->column; ++j) {\n // Get the value of the element\n double value = row->value;\n\n // Update L and U\n for (size_t k = 0; k < i; ++k) {\n L(i, k) -= value * U(k, j);\n }\n U(i, j) = value;\n }\n });\n}\n", + "\n\n // Create a parallel for loop to iterate over the rows of A\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n // Initialize the ith row of L and U to the identity matrix\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n\n // Iterate over the non-zero elements in the ith row of A\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n // Update the ith row of L and U\n L(i, A(j).column) = A(j).value;\n U(i, A(j).column) = A(j).value;\n\n // Iterate over the rows below the ith row\n for (size_t k = i + 1; k < N; ++k) {\n // Update the kth row of L and U\n L(k, A(j).column) -= A(j).value * L(k, i);\n U(k, A(j).column) -= A(j).value * U(k, i);\n }\n }\n }\n });\n}\n", + "\n\n // Initialize L and U\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = A(i, i).value;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n L(i, k) = A(i, k).value / U(k, k);\n for (size_t j = k + 1; j < N; ++j) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Iterate over the rows of A\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n // Iterate over the columns of A\n for (size_t j = 0; j < N; j++) {\n // If A(i, j) is non-zero, then update L and U\n if (A(i, j).value != 0.0) {\n // Update L\n L(i, j) = A(i, j).value / U(j, j);\n\n // Update U\n for (size_t k = j + 1; k < N; k++) {\n U(j, k) -= L(i, j) * A(i, k).value;\n }\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 
0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy>(0, N), KOKKOS_LAMBDA(const int &i, const int &j) {\n if (i == j) {\n L(i, j) = 1.0;\n U(i, j) = A(i, j);\n } else if (i > j) {\n L(i, j) = A(i, j) / U(j, j);\n U(i, j) = 0.0;\n } else {\n U(i, j) = A(i, j) / L(i, i);\n L(i, j) = 0.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n L(j, i) -= L(j, k) * U(k, i);\n }\n L(j, i) /= U(i, i);\n }\n });\n}\n", + "\n\n // Create a copy of A\n auto Acopy = KokKOS::make_view(A);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n Acopy(j, i) -= Acopy(j, i) * Acopy(i, i);\n }\n }\n\n // Store L and U\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n L(i, j) = 1.0;\n } else if (i < j) {\n L(i, j) = Acopy(i, j);\n } else {\n U(i, j) = Acopy(i, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0.0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n L(i, j) -= L(i, k) * U(k, j);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n U(i, j) /= L(i, i);\n }\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n U(i, j) += A(i, j).value;\n } else {\n L(i, j) += A(i, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0;\n U(i, j) = 0;\n }\n L(i, i) = 1;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), 
KOKKOS_LAMBDA(const Kokkos::TeamPolicy::member_type &team) {\n const size_t i = team.league_rank();\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < i; k++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n if (i == j) {\n L(i, i) = 1 / U(i, i);\n } else {\n L(i, j) = -U(i, j) * L(j, j);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L(i, k) = A(i, k) / U(k, k);\n for (size_t j = k + 1; j < N; j++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA(const size_t k) {\n const auto &e = A(k);\n const double a_kk = U(e.row, e.row);\n const double l_ik = L(e.column, e.row);\n U(e.row, e.column) = e.value - l_ik * a_kk;\n for (size_t i = e.row + 1; i < N; i++) {\n const double l_ik = L(e.column, e.row);\n const double a_ki = U(e.row, i);\n L(i, e.column) = (e.value - a_ki * a_kk) / l_ik;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t row) {\n double sumL = 0.0;\n double sumU = 0.0;\n for (size_t i = 0; i < row; i++) {\n sumL += L(row, i) * U(i, row);\n sumU += L(i, row) * U(row, i);\n }\n L(row, row) = 1.0;\n U(row, row) = A(row, row) - sumU;\n for (size_t col = row + 1; col < N; col++) {\n L(row, col) = (A(row, col) - sumL) / U(row, row);\n U(row, col) = A(row, col) - sumU;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA(const int& k) {\n const COOElement& element = A(k);\n const size_t i = element.row;\n const size_t j = element.column;\n const double value = element.value;\n U(i, j) = value;\n for (size_t p = 0; p < i; p++) {\n L(i, p) += value * L(j, p);\n U(j, p) -= value * U(i, p);\n }\n L(i, i) -= value * L(j, i);\n U(j, j) += value * U(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; ++j) {\n double sum = 0;\n for (size_t k = 0; k < j; ++k) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n 
Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U using the k-th row of A\n for (size_t i = k + 1; i < N; i++) {\n L(i, k) = A(i, k) / U(k, k);\n for (size_t j = k + 1; j < N; j++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sumL = 0.0, sumU = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization in parallel\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k].value / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * U[i * N + k];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n U[i * N + j] = A[i * N + j] - L[i * N + k] * U[k 
* N + j];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n #pragma opm parallel for\n for (size_t k = 0; k < A.size(); ++k) {\n size_t row = A[k].row;\n size_t column = A[k].column;\n double value = A[k].value;\n #pragma opm atomic\n L[row * N + column] += value;\n #pragma opm atomic\n U[column * N + row] += value;\n }\n #pragma opm parallel for\n for (size_t i = 0; i < N; ++i) {\n #pragma opm atomic\n L[i * N + i] = 1;\n }\n}\n", + "<=====,", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n if (row == column)\n L[row * N + column] = 1;\n if (row < column)\n U[row * N + column] = value;\n if (row > column)\n L[row * N + column] = value;\n }\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = 0; i < k; ++i) {\n U[k * N + j] -= L[k * N + i] * U[i * N + j];\n }\n }\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = 0; j < k; ++j) {\n L[i * N + k] -= U[i * N + j] * L[j * N + k];\n }\n L[i * N + k] /= U[k * N + k];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement &e = A[i];\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1.0;\n U[e.row * N + e.column] = e.value;\n } else if (e.row < e.column) {\n L[e.row * N + e.column] = e.value / U[e.row * N + e.row];\n U[e.column * N + e.row] = 0.0;\n } else {\n U[e.row * N + e.column] = e.value;\n L[e.column * N + e.row] = 0.0;\n }\n }\n\n #pragma omp parallel for\n for (size_t k = 1; k < N; k++) {\n for (size_t i = k; i < N; i++) {\n for (size_t j = 0; j < k; j++) {\n U[i * N + k] -= L[i * N + j] * U[j * N + k];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n L[k * N + j] -= L[k * N + k] * U[k * N + j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sumL = 0.0, sumU = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n U[i * N + j] = A[i * N + j] - L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L[i * N + k] * U[k * N + j];\n }\n if (i == j) {\n L[i * N + i] = 1;\n U[i * N + i] = A[i].value - sum;\n } else {\n L[i * N + j] = (A[i].value - sum) / U[j * N + j];\n U[i * N + j] = 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel 
for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform the factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n }\n\n // Update U\n for (size_t j = k + 1; j < N; j++) {\n for (size_t i = k + 1; i < N; i++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n A[i * N + j] -= U[k * N + j] * A[i * N + k];\n L[i * N + j] = A[i * N + j] / L[k * N + k];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n double sum = 0;\n for (size_t k = 0; k < j; ++k) {\n sum += L[i][k] * U[k][j];\n }\n L[i][j] = (A[i][j] - sum) / U[j][j];\n }\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[i][k] * U[k][i];\n }\n U[i][i] = A[i][i] - sum;\n for (size_t j = i + 1; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[j][k] * U[k][j];\n }\n U[i][j] = (A[i][j] - sum) / L[i][i];\n }\n }\n\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n A[i * N + j] -= U[k * N + j] * A[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + 
"do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.emplace_back(e.row, e.column, 1);\n U.emplace_back(e.row, e.column, e.value);\n } else {\n L.emplace_back(e.row, e.column, e.value / rowSum[e.row]);\n U.emplace_back(e.row, e.column, 0);\n }\n }\n for (size_t i = 1; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n for (size_t k = 0; k < N; k++) {\n U[i * N + k] -= L[i * N + j] * U[j * N + k];\n }\n }\n }\n}\n", + "\n\n std::vector row(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1;\n U[e.row * N + e.column] = e.value;\n } else {\n row[e.column] = e.value / U[e.row * N + e.row];\n for (size_t i = e.column; i < N; ++i) {\n L[e.row * N + i] = row[i];\n U[e.row * N + i] -= row[i] * U[e.column * N + i];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& e : A) {\n L[e.row * N + e.column] = (e.row == e.column) ? 1 : 0;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = 0; i < k; ++i) {\n U[k * N + j] -= L[k * N + i] * U[i * N + j];\n }\n L[j * N + k] = U[j * N + k] / U[k * N + k];\n for (size_t i = k + 1; i < N; ++i) {\n U[j * N + i] -= L[j * N + k] * U[k * N + i];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& element : A) {\n L[element.row * N + element.column] = element.value;\n U[element.row * N + element.column] = element.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (L[i * N + j] != 0) {\n double factor = L[i * N + j] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= factor * L[i * N + k];\n U[i * N + k] -= factor * U[j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n L[i * N + j] = 1;\n } else if (i > j) {\n L[i * N + j] = L_temp[i][j] / L_temp[j][j];\n for (size_t k = 0; k < N; k++) {\n U_temp[i][k] -= L[i * N + j] * U_temp[j][k];\n }\n } else {\n U[i * N + j] = U_temp[i][j];\n }\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.push_back(1);\n U.push_back(e.value / rowSum[e.row]);\n } else if (e.row < e.column) {\n L.push_back(0);\n U.push_back(e.value / rowSum[e.row]);\n } else {\n L.push_back(e.value / rowSum[e.column]);\n U.push_back(0);\n }\n }\n}\n", + "\n\n // Initialize L and U to be identity matrices\n L.resize(N * N);\n U.resize(N * N);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n U[i * N + i] = 1;\n }\n\n // Iterate over the elements of A\n for (auto const& element : A) {\n // Update L and U\n for (size_t i = 0; i < element.row; i++) {\n L[element.row * N + i] -= L[element.column * N + i] * element.value;\n }\n for (size_t j = element.column + 1; j < N; j++) {\n U[element.row * N + j] -= U[element.column * N + j] * element.value;\n }\n }\n}\n", + "\n\n std::vector> row_ptr(N + 1);\n std::vector col_idx(A.size());\n 
std::vector values(A.size());\n for (size_t i = 0; i < A.size(); i++) {\n row_ptr[A[i].row + 1]++;\n col_idx[i] = A[i].column;\n values[i] = A[i].value;\n }\n for (size_t i = 1; i <= N; i++) {\n row_ptr[i] += row_ptr[i - 1];\n }\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < A.size(); i++) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double value = A[i].value;\n U[row_ptr[row] + col - 1] = value;\n for (size_t j = row_ptr[row]; j < row_ptr[row + 1]; j++) {\n if (col_idx[j] < col) {\n L[row_ptr[col] + col_idx[j] - 1] += value * U[row_ptr[row] + col_idx[j] - 1];\n } else if (col_idx[j] > col) {\n U[row_ptr[row] + col_idx[j] - 1] -= value * L[row_ptr[col] + col_idx[j] - 1];\n }\n }\n }\n for (size_t i = 0; i < N; i++) {\n L[row_ptr[i] + i - 1] = 1;\n }\n}\n", + "\n\n std::vector L_row, L_col, U_row, U_col;\n std::vector L_val, U_val;\n L_row.reserve(A.size());\n L_col.reserve(A.size());\n L_val.reserve(A.size());\n U_row.reserve(A.size());\n U_col.reserve(A.size());\n U_val.reserve(A.size());\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < A.size(); k++) {\n if (A[k].row == i && A[k].column == j) {\n sumU = A[k].value;\n U_row.push_back(i);\n U_col.push_back(j);\n U_val.push_back(sumU);\n }\n if (A[k].row == i && A[k].column < j) {\n sumL += A[k].value;\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(sumL);\n }\n if (A[k].row < i && A[k].column == j) {\n sumL += A[k].value;\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(sumL);\n }\n }\n if (sumU == 0) {\n U_row.push_back(i);\n U_col.push_back(j);\n U_val.push_back(0);\n }\n if (sumL == 0) {\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(0);\n }\n }\n }\n L.resize(L_row.size());\n U.resize(U_row.size());\n for (size_t i = 0; i < L_row.size(); i++) {\n L[i].row = L_row[i];\n L[i].column = L_col[i];\n L[i].value = L_val[i];\n }\n for (size_t i = 0; i < U_row.size(); i++) {\n U[i].row = U_row[i];\n U[i].column = U_col[i];\n U[i].value = U_val[i];\n }\n}\n", + "\n\n std::vector> L_matrix(N, std::vector(N, 0.0));\n std::vector> U_matrix(N, std::vector(N, 0.0));\n\n for (auto const& element : A) {\n L_matrix[element.row][element.column] = element.value;\n U_matrix[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n L_matrix[i][j] = 1.0;\n } else if (i > j) {\n L_matrix[i][j] = 0.0;\n } else {\n U_matrix[i][j] = 0.0;\n }\n }\n }\n\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n double factor = L_matrix[i][k] / L_matrix[k][k];\n for (size_t j = k; j < N; ++j) {\n L_matrix[i][j] -= factor * U_matrix[k][j];\n U_matrix[i][j] -= factor * L_matrix[k][j];\n }\n }\n }\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n L[i * N + j] = L_matrix[i][j];\n U[i * N + j] = U_matrix[i][j];\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n\n for (size_t i = 0; i < A.size(); i++) {\n L[A[i].row * N + A[i].column] = A[i].value;\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = L[i * N + k] / L[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] = U[i * N + j] - L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector row(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) 
{\n L.emplace_back(COOElement{e.row, e.column, 1});\n row[e.column] = e.value;\n } else {\n U.emplace_back(COOElement{e.row, e.column, e.value});\n }\n }\n for (auto const& e : A) {\n if (e.row != e.column) {\n L.emplace_back(COOElement{e.row, e.column, e.value / row[e.column]});\n for (auto& u : U) {\n if (u.row == e.row && u.column == e.column) {\n u.value -= e.value * L.back().value;\n }\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n U[row * N + column] = value;\n for (size_t j = 0; j < row; ++j) {\n L[row * N + j] -= L[i * N + j] * value / U[j * N + j];\n }\n for (size_t j = row + 1; j < N; ++j) {\n U[row * N + j] -= L[i * N + j] * value / U[j * N + j];\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n for (size_t i = 0; i < N; ++i) {\n L_temp[i][i] = 1;\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n L_temp[i][k] = L_temp[i][k] / L_temp[k][k];\n for (size_t j = k + 1; j < N; ++j) {\n U_temp[k][j] = U_temp[k][j] - L_temp[i][k] * U_temp[i][j];\n }\n }\n }\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n L.push_back(L_temp[i][j]);\n U.push_back(U_temp[i][j]);\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n rowSum[e.row] += e.value;\n }\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.push_back({e.row, e.column, 1});\n U.push_back({e.row, e.column, e.value / rowSum[e.row]});\n } else if (e.row < e.column) {\n L.push_back({e.row, e.column, e.value / rowSum[e.row]});\n U.push_back({e.row, e.column, 0});\n } else {\n L.push_back({e.row, e.column, 0});\n U.push_back({e.row, e.column, e.value});\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n L_temp[i][j] = 1;\n } else if (i > j) {\n L_temp[i][j] = 0;\n }\n }\n }\n\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n double factor = U_temp[i][k] / U_temp[k][k];\n L_temp[i][k] = factor;\n for (size_t j = k; j < N; j++) {\n U_temp[i][j] -= factor * U_temp[k][j];\n }\n }\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n L.push_back(L_temp[i][j]);\n U.push_back(U_temp[i][j]);\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto &e : A) {\n L[e.row * N + e.column] = e.value;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (L[j * N + i] != 0) {\n double factor = L[j * N + i] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= factor * L[i * N + k];\n U[i * N + k] -= factor * U[j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n std::vector rowSum(N, 0);\n for (auto const& element : A) {\n rowSum[element.row] += element.value;\n }\n\n for (auto const& element : A) {\n if (element.row == 
element.column) {\n L.push_back(1);\n U.push_back(element.value / rowSum[element.row]);\n } else if (element.row < element.column) {\n L.push_back(0);\n U.push_back(element.value / rowSum[element.row]);\n } else {\n L.push_back(element.value / rowSum[element.column]);\n U.push_back(0);\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& element : A) {\n L[element.row * N + element.column] = element.value;\n U[element.row * N + element.column] = element.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = L[j * N + i] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= multiplier * L[i * N + k];\n U[i * N + k] -= multiplier * U[j * N + k];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (auto const& e : A) {\n L[e.row * N + e.column] = e.value;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n L[j * N + i] = U[j * N + i] / U[i * N + i];\n for (size_t k = i; k < N; k++) {\n U[j * N + k] -= L[j * N + i] * U[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format with sizeA elements.\n Use AMD HIP to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\n__global__ void luFactorize(const COOElement *A, size_t sizeA, double *L, double *U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[row * N + col];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n COOElement element = A[tid];\n int row = element.row;\n int col = element.column;\n double val = element.value;\n\n if (row == col) {\n L[row * N + row] = 1.0;\n U[row * N + col] = val;\n } else if (row < col) {\n L[row * N + col] = val / U[col * N + col];\n U[row * N + col] = 0.0;\n } else {\n U[row * N + col] -= L[row * N + row] * val;\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x;\n int start = tid + bid * num_threads;\n for (int i = start; i < sizeA; i += bid_size * num_threads) {\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = A[i].value / U[A[i].column];\n U[i] = 0.0;\n }\n }\n __syncthreads();\n for (int i = start; i < sizeA; i += bid_size * num_threads) {\n if (A[i].row > A[i].column) {\n U[i] -= L[i] * U[A[i].column];\n } else {\n L[i] -= L[i] * U[A[i].column];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n COOElement e = A[i];\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1.0;\n U[e.row * N + e.column] = e.value;\n for (int k = e.row + 1; k < N; k++) {\n L[k * N + e.column] = A[i].value / 
e.value;\n U[e.row * N + k] -= L[k * N + e.column] * e.value;\n }\n }\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1;\n } else {\n U[row * N + column] = value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n for (size_t i = tid; i < sizeA; i += stride) {\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = 0.0;\n U[i] = 0.0;\n }\n }\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n for (size_t j = tid; j < sizeA; j += stride) {\n if (A[j].row == i && A[j].column == k) {\n L[j] = A[j].value / U[k];\n U[j] = 0.0;\n }\n }\n for (size_t j = tid; j < sizeA; j += stride) {\n if (A[j].row == i && A[j].column > k) {\n U[j] -= L[j] * U[A[j].column];\n }\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else {\n L[tid] = 0.0;\n U[tid] = 0.0;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t i = A[tid].row;\n size_t j = A[tid].column;\n double v = A[tid].value;\n\n if (i == j) {\n L[i * N + i] = 1.0;\n U[i * N + j] = v;\n } else if (i < j) {\n L[i * N + j] = v;\n } else {\n U[i * N + j] = v;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[col * N + row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n\n if (row == column) {\n L[row * N + row] = 1.0;\n U[row * N + column] = value;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n U[row * N + column] = 0.0;\n } else {\n U[row * N + column] -= L[row * N + row] * value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n if (A[tid].row == A[tid].column) {\n L[tid] = 1;\n U[tid] = A[tid].value;\n } else if (A[tid].row < A[tid].column) {\n L[tid] = 0;\n U[tid] = A[tid].value;\n } else {\n L[tid] = A[tid].value / U[A[tid].column];\n U[tid] = 0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n const COOElement &a = A[tid];\n if (a.row == a.column) {\n L[a.row * N + a.column] = 1.0;\n U[a.row * N + a.column] = a.value;\n } else if (a.row < a.column) {\n L[a.row * N + a.column] = 0.0;\n U[a.row * N + a.column] = a.value;\n } else {\n L[a.row * N + a.column] = a.value / U[a.column * N + a.column];\n U[a.row * N + a.column] = 0.0;\n for (int k = a.column + 1; k < a.row; k++) {\n U[a.row * N + k] -= L[a.row * N + a.column] * U[a.column * N + k];\n }\n }\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n size_t col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row].value;\n } else if (row < col) {\n L[row * N + col] = 0;\n U[row * N + col] = A[row].value;\n } else {\n L[row * N + col] = A[row].value / U[col * N + col];\n U[row * N + col] = 0;\n }\n }\n}\n", + "\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= sizeA) return;\n if (A[i].row == A[i].column) {\n L[i] 
= 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = 0.0;\n U[i] = 0.0;\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n U[row * N + col] = 0.0;\n }\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column == col) {\n U[row * N + col] = A[i].value;\n } else if (A[i].row == row) {\n L[row * N + col] -= A[i].value * U[A[i].column * N + col];\n } else if (A[i].column == col) {\n U[row * N + col] -= A[i].value * L[row * N + A[i].row];\n }\n }\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n }\n U[row * N + col] = A[row * N + col].value;\n }\n __syncthreads();\n for (int k = 0; k < row; k++) {\n if (U[k * N + col] != 0.0) {\n L[row * N + k] = A[row * N + k].value / U[k * N + col];\n for (int j = col; j < N; j++) {\n U[row * N + j] -= L[row * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n if (A[tid].row == A[tid].column) {\n L[tid * N + A[tid].column] = 1;\n U[tid * N + A[tid].column] = A[tid].value;\n } else {\n L[tid * N + A[tid].column] = 0;\n U[tid * N + A[tid].column] = A[tid].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int i = A[tid].row;\n int j = A[tid].column;\n double v = A[tid].value;\n if (i == j) {\n L[i * N + i] = 1;\n U[i * N + i] = v;\n } else if (i < j) {\n L[i * N + j] = v / U[j * N + j];\n U[i * N + j] = 0;\n } else {\n U[i * N + j] -= L[i * N + i] * v;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) {\n return;\n }\n const COOElement &a = A[tid];\n if (a.row == a.column) {\n L[a.row * N + a.column] = 1.0;\n U[a.row * N + a.column] = a.value;\n } else if (a.row > a.column) {\n L[a.row * N + a.column] = a.value / U[a.column * N + a.column];\n U[a.row * N + a.column] = 0.0;\n } else {\n U[a.row * N + a.column] = a.value;\n L[a.row * N + a.column] = 0.0;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else if (row < column) {\n L[tid] = 0.0;\n U[tid] = value;\n } else {\n L[tid] = value / U[column * N + column];\n U[tid] = 0.0;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
Store the result in L and U on rank 0.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Find the pivot element in column k\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k * N + k].value);\n for (size_t i = k + 1; i < N; ++i) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows k and pivot_row in L and U\n for (size_t j = 0; j < N; ++j) {\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n std::swap(U[k * N + j], U[pivot_row * N + j]);\n }\n\n // Update the remaining rows in column k\n for (size_t i = k + 1; i < N; ++i) {\n double multiplier = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j].value -= multiplier * A[k * N + j].value;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector L_recv(N * N, 0.0);\n std::vector U_recv(N * N, 0.0);\n MPI::COMM_WORLD.Recv(L_recv.data(), N * N, MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), N * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; ++j) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), N * N, MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector L_recv, U_recv;\n MPI::COMM_WORLD.Recv(L_recv.data(), L_recv.size(), MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), U_recv.size(), MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; j++) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), L.size(), MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), U.size(), MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Compute L and U in parallel\n for (size_t i = 0; i < N; ++i) {\n // Compute L(i, :)\n for (size_t j = 0; j < i; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < j; ++k) {\n sum += L[i * N + k] * U[k * N + j];\n }\n L[i * N + j] = (A[i].value - sum) / U[j * N + j];\n }\n\n // Compute U(i, :)\n for (size_t j = i; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[i * N + k] * U[k * N + j];\n }\n U[i * N + j] = A[i].value - sum;\n }\n }\n\n // Gather L and U on rank 0\n if (MPI::COMM_WORLD.Get_rank() 
== 0) {\n MPI::COMM_WORLD.Gather(&L[0], N * N, MPI::DOUBLE, &L[0], N * N, MPI::DOUBLE, 0);\n MPI::COMM_WORLD.Gather(&U[0], N * N, MPI::DOUBLE, &U[0], N * N, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j].value, A[pivot_row * N + j].value);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Eliminate the elements below the pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = factor;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= factor * A[k * N + j].value;\n }\n }\n }\n\n // Copy the upper triangular matrix to U\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i; j < N; j++) {\n U[i * N + j] = A[i * N + j].value;\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A and store the result in L\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k; j < N; ++j) {\n L[i * N + k] -= U[i * N + j] * U[k * N + j];\n }\n }\n\n // For each column j > k, subtract the k-th column of L from the j-th column of A and store the result in U\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = k; i < N; ++i) {\n U[k * N + j] -= L[i * N + k] * U[i * N + j];\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A\n for (size_t i = k + 1; i < N; i++) {\n double factor = U[i * N + k] / U[k * N + k];\n for (size_t j = k; j < N; j++) {\n U[i * N + j] -= factor * U[k * N + j];\n }\n L[i * N + k] = factor;\n }\n }\n\n // Gather the L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n std::vector L(A);\n std::vector U(A);\n\n // Perform Gaussian elimination\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the pivot row\n MPI_Bcast(&A[k], sizeof(COOElement) * A[k].size(), MPI_BYTE, k, 
MPI_COMM_WORLD);\n\n // Eliminate the pivot column in the remaining rows\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n double factor = A[i][k].value / A[k][k].value;\n for (size_t j = k; j < N; ++j) {\n A[i][j].value -= factor * A[k][j].value;\n }\n }\n\n // Update L and U\n L[k].value = 1.0;\n for (size_t j = k + 1; j < N; ++j) {\n L[j].value = A[j][k].value;\n U[k].value = A[k][j].value;\n }\n }\n\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A\n for (size_t i = k + 1; i < N; ++i) {\n double factor = U[i * N + k] / U[k * N + k];\n for (size_t j = k; j < N; ++j) {\n U[i * N + j] -= factor * U[k * N + j];\n }\n L[i * N + k] = factor;\n }\n }\n\n // Gather the L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot row\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (A[i * N + k] > A[pivot_row * N + k]) {\n pivot_row = i;\n }\n }\n\n // Swap the pivot row with the k-th row in L and U\n for (size_t j = 0; j < N; j++) {\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n std::swap(U[k * N + j], U[pivot_row * N + j]);\n }\n\n // Eliminate the k-th column in the remaining rows\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n L[i * N + k] = factor;\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(A.size());\n U.resize(A.size());\n for (size_t i = 0; i < A.size(); ++i) {\n L[i].row = A[i].row;\n L[i].column = A[i].column;\n L[i].value = 0;\n U[i].row = A[i].row;\n U[i].column = A[i].column;\n U[i].value = 
0;\n }\n\n // Perform LU factorization\n for (size_t i = 0; i < N; ++i) {\n // Update L\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i && A[j].column < i) {\n L[j].value = A[j].value / U[j].value;\n }\n }\n\n // Update U\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row < i && A[j].column == i) {\n U[j].value = A[j].value / L[j].value;\n }\n }\n\n // Update A\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row >= i && A[j].column >= i) {\n A[j].value -= L[j].value * U[j].value;\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recvL(A.size());\n std::vector recvU(A.size());\n MPI::COMM_WORLD.Recv(recvL.data(), A.size(), MPI_DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(recvU.data(), A.size(), MPI_DOUBLE, i, 0);\n for (size_t j = 0; j < A.size(); ++j) {\n L[j].value += recvL[j].value;\n U[j].value += recvU[j].value;\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), A.size(), MPI_DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), A.size(), MPI_DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < A.size(); i++) {\n if (std::abs(A[i].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i].value);\n }\n }\n\n // Swap the pivot row with the k-th row in A\n std::swap(A[k], A[pivot_row]);\n\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n double multiplier = A[i].value / A[k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; j++) {\n U[k * N + j] -= multiplier * A[i].value;\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n\n // Initialize the diagonal of L to 1\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Iterate over the rows of A\n for (size_t k = 0; k < A.size(); k++) {\n // Get the row, column, and value of the current element\n size_t row = A[k].row;\n size_t column = A[k].column;\n double value = A[k].value;\n\n // Update the elements of L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = value / U[k * N + k];\n U[i * N + column] -= L[i * N + k] * U[k * N + column];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i > k\n for (size_t i = k + 1; i < N; i++) {\n // Calculate the multiplier\n double multiplier = A[i].value / U[k * N + k];\n\n // Update the i-th row of L\n for (size_t j = 0; j < N; j++) {\n L[i * N + j] += multiplier * U[k * N + j];\n }\n\n // Update the i-th row of U\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= multiplier * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform 
LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = A[k].value;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > std::abs(pivot_value)) {\n pivot_row = i;\n pivot_value = A[i * N + k].value;\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j].value, A[pivot_row * N + j].value);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Eliminate the elements below the pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k].value / pivot_value;\n L[i * N + k] = factor;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= factor * A[k * N + j].value;\n U[i * N + j] = A[i * N + j].value;\n }\n }\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector L_recv(N * N, 0.0);\n std::vector U_recv(N * N, 0.0);\n MPI::COMM_WORLD.Recv(L_recv.data(), N * N, MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), N * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; j++) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), N * N, MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each rank, update the k-th row of L and the remaining rows of U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n\n // Gather the updated L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each rank, update the k-th row of L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "<,", + "_t* 0." + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format with sizeA elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\n__global__ void luFactorize(const COOElement *A, size_t sizeA, double *L, double *U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[row * N + col].value;\n for (int k = 0; k < row; k++) {\n U[row * N + col] -= L[row * N + k] * U[k * N + col];\n }\n L[row * N + col] = U[row * N + col] / U[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < i; j++) {\n sum += L[row * N + j] * U[j * N + A[i].column];\n }\n U[row * N + A[i].column] = A[i].value - sum;\n }\n if (A[i].column == row) {\n L[row * N + A[i].row] = A[i].value;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else if (row < column) {\n L[tid] = 0.0;\n U[tid] = value;\n } else {\n L[tid] = value / U[column * N + column];\n U[tid] = 0.0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + column] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) {\n return;\n }\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n }\n }\n}\n", + "\n\n int row = block.y * block.x + thread.y * N + thread.x;\n if (row < sizeA) {\n if (A[row].row == A[row].column) {\n L[A[row].row * N + A[row].column] = 1.0;\n U[A[row].row * N + A[row].column] = A[row].value;\n } else {\n L[A[row].row * N + A[row].column] = A[row].value / U[A[row].column * N + A[row].column];\n U[A[row].row * N + A[row].column] = 0.0;\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n double sum = 0.0;\n for (int k = col; k < N; k++) {\n sum += L[row * N + k] * U[k * N + col];\n }\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < 
sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row < col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n for (size_t k = col + 1; k < N; k++) {\n U[row * N + k] -= L[row * N + col] * U[col * N + k];\n }\n }\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n double sum = 0.0;\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row && A[k].column == col) {\n sum += A[k].value * U[col * N + A[k].column];\n }\n }\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row >= N || col >= N) return;\n\n double sum = 0;\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row && A[k].column < col) {\n sum += L[row * N + A[k].column] * U[A[k].column * N + col];\n }\n }\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row * N + col] - sum;\n } else {\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n U[row * N + col] = 0;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n }\n U[row * N + col] = A[row * N + col].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n } else if (row < column) {\n L[tid] = 0.0;\n } else {\n U[tid] = value / L[column * N + column];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0;\n U[row * N + col] = A[row * N + 
col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n U[row * N + column] = value;\n } else if (row < column) {\n L[row * N + column] = value;\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < sizeA; ++i) {\n const auto row = A[i].row;\n const auto column = A[i].column;\n const auto value = A[i].value;\n if (row == column) {\n L[row * N + column] = 1;\n } else if (row < column) {\n U[row * N + column] = value;\n } else {\n L[row * N + column] = value;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. Store the result in L and U on rank 0.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, 
MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Initialize the diagonal elements of L to 1\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update the k-th row of L and U\n #pragma omp parallel for\n for (size_t j = k + 1; j < N; j++) {\n L[k * N + j] = A[k].value / U[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n U[i * N + j] -= L[k * N + j] * A[i].value;\n }\n }\n\n // Update the remaining rows of L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n for (size_t j = k + 1; j < N; j++) {\n L[i * N + j] -= L[i * N + k] * L[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Factorize A\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= 
L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n A[i].value -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Update U\n #pragma omp parallel for\n for (size_t j = k + 1; j < N; ++j) {\n U[k * N + j] = A[k].value / L[k * N + k];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": 
false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; ++j) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform Gauss elimination\n for (size_t k = 0; k < N; ++k) {\n // Rank 0 broadcasts the k-th row of U to all other ranks\n 
MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 updates the k-th row of L\n for (size_t j = k + 1; j < N; ++j) {\n L[k * N + j] = A[j].value / U[k * N + k];\n }\n\n // All ranks update the remaining rows of U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[k * N + j] * U[k * N + i];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k * N + k].value);\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n double multiplier = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= multiplier * A[k * N + j].value;\n U[k * N + j].value += multiplier * A[i * N + j].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < N; ++i) {\n if (std::abs(A[i].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i].value);\n }\n }\n\n // Swap the pivot row with the k-th row in A, L, and U\n if (pivot_row 
!= k) {\n std::swap(A[k], A[pivot_row]);\n for (size_t i = 0; i < N; ++i) {\n std::swap(L[k * N + i], L[pivot_row * N + i]);\n std::swap(U[k * N + i], U[pivot_row * N + i]);\n }\n }\n\n // Eliminate the elements below the pivot element\n for (size_t i = k + 1; i < N; ++i) {\n double multiplier = A[i].value / A[k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; ++j) {\n A[i].value -= multiplier * A[k].value;\n U[i * N + j] -= multiplier * U[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": 
"\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, Kokkos::View &x, Kokkos::View &y, Kokkos::View &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x[j].index == i) sum += alpha * x[j].value;\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y[j].index == i) sum += y[j].value;\n }\n z[i] = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA (const int i) {\n z[i] = alpha * x[i].value + y[i].value;\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(\"sparse_py\", 0, z.extent(0), KOKKOK_LAMBDA(size_t i) {\n z[i] = alpha * x[i].value + y[i].value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n sum += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n sum += y[j].value;\n }\n }\n z[i] = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) sum += alpha * x(j).value;\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) sum += y(j).value;\n }\n z(i) = sum;\n });\n}\n", + "\n\n 
Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n // Create a Kokkos kernel to perform the sparse axpy operation\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x(j).index == i) sum += alpha * x(j).value;\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y(j).index == i) sum += y(j).value;\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy<>(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i, double &sum) {\n sum += alpha * x(i).value + y(i).value;\n }, z(0));\n}\n", + "\n\n const size_t num_elements_x = x.extent(0);\n const size_t num_elements_y = y.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, num_elements_x + num_elements_y),\n KOKKOS_LAMBDA(const size_t i) {\n if (i < num_elements_x) {\n const size_t index = x(i).index;\n z(index) += alpha * x(i).value;\n } else {\n const size_t index = y(i - num_elements_x).index;\n z(index) += y(i - num_elements_x).value;\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i, double &sum) {\n sum += alpha * x(i).value + 
y(i).value;\n }, z(0));\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t n = x.size();\n size_t m = y.size();\n z.resize(std::max(n, m));\n #pragma omp parallel for\n for (size_t i = 0; i < std::max(n, m); i++) {\n z[i] = 0;\n }\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n z[x[i].index] += alpha * x[i].value;\n }\n #pragma omp parallel for\n for (size_t i = 0; i < m; i++) {\n z[y[i].index] += y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 
0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "serial", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. 
Store the result in z.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0, j = 0; i < x.size() || j < y.size();) {\n if (j == y.size() || (i < x.size() && x[i].index < y[j].index)) {\n z[x[i].index] += alpha * x[i].value;\n i++;\n } else if (i == x.size() || (j < y.size() && y[j].index < x[i].index)) {\n z[y[j].index] += y[j].value;\n j++;\n } else {\n z[x[i].index] += alpha * x[i].value + y[j].value;\n i++;\n j++;\n }\n }\n}\n", + "\n\n z.clear();\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z.push_back(alpha * x[i].value);\n i++;\n } else {\n z.push_back(y[j].value);\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(alpha * x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n z.clear();\n z.resize(x.size() + y.size());\n auto itx = x.begin();\n auto ity = y.begin();\n auto itz = z.begin();\n while (itx != x.end() && ity != y.end()) {\n if (itx->index < ity->index) {\n *itz = itx->value;\n ++itx;\n } else if (itx->index > ity->index) {\n *itz = ity->value;\n ++ity;\n } else {\n *itz = alpha * itx->value + ity->value;\n ++itx;\n ++ity;\n }\n ++itz;\n }\n while (itx != x.end()) {\n *itz = itx->value;\n ++itx;\n ++itz;\n }\n while (ity != y.end()) {\n *itz = ity->value;\n ++ity;\n ++itz;\n }\n}\n", + ".", + "\n\n // Initialize z with zeros\n z.assign(z.size(), 0);\n\n // Iterate over x and y\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n // If the indices match, add the values and increment both i and j\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n }\n // If the index of x is smaller, add the value to z and increment i\n else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n // If the index of y is smaller, add the value to z and increment j\n else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n\n // Add the remaining elements of x\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n\n // Add the remaining elements of y\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n z.clear();\n z.resize(x.size() + y.size());\n size_t i = 0, j = 0, k = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z[k].index = x[i].index;\n z[k].value = alpha * x[i].value;\n ++i;\n } else if (x[i].index > y[j].index) {\n z[k].index = y[j].index;\n z[k].value = y[j].value;\n ++j;\n } else {\n z[k].index = x[i].index;\n z[k].value = alpha * x[i].value + y[j].value;\n ++i;\n ++j;\n }\n ++k;\n }\n while (i < x.size()) {\n z[k].index = x[i].index;\n 
z[k].value = alpha * x[i].value;\n ++i;\n ++k;\n }\n while (j < y.size()) {\n z[k].index = y[j].index;\n z[k].value = y[j].value;\n ++j;\n ++k;\n }\n z.resize(k);\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n size_t m = z.size();\n for (size_t i = 0, j = 0, k = 0; i < m; ++i) {\n if (j < x.size() && x[j].index == i) {\n z[i] += alpha * x[j].value;\n ++j;\n }\n if (k < y.size() && y[k].index == i) {\n z[i] += y[k].value;\n ++k;\n }\n }\n}\n", + "\n\n for (size_t i = 0, j = 0; i < x.size() || j < y.size();) {\n if (j == y.size() || (i < x.size() && x[i].index < y[j].index)) {\n z[x[i].index] += alpha * x[i].value;\n ++i;\n } else if (i == x.size() || (j < y.size() && y[j].index < x[i].index)) {\n z[y[j].index] += y[j].value;\n ++j;\n } else {\n z[x[i].index] += alpha * x[i].value + y[j].value;\n ++i;\n ++j;\n }\n }\n}\n", + "\n\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n } else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n i++;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n j++;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n z.clear();\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z.push_back({x[i].index, alpha * x[i].value + y[j].value});\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z.push_back({x[i].index, alpha * x[i].value});\n i++;\n } else {\n z.push_back({y[j].index, y[j].value});\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back({x[i].index, alpha * x[i].value});\n i++;\n }\n while (j < y.size()) {\n z.push_back({y[j].index, y[j].value});\n j++;\n }\n}\n", + "\n\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n } else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n i++;\n } else if (x[i].index > y[j].index) {\n 
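For the 48_sparse_la_sparse_axpy serial prompt, a minimal reference sketch, assuming x and y are sorted by index and z is returned as {index, value} pairs; the prompt's example implies entries whose combined value is zero are dropped (index 8, where 1*3 + (-3) = 0, is absent from the expected z). The name sparseAxpySerial and the zero-dropping rule are assumptions, not taken from the benchmark's own reference. On the example input with alpha=1 it produces z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}].

#include <cstddef>
#include <vector>

struct Element {
    size_t index;
    double value;
};

// Two-pointer merge of the index-sorted inputs; entries whose combined
// value is exactly zero are skipped, matching the prompt's example.
void sparseAxpySerial(double alpha, const std::vector<Element> &x,
                      const std::vector<Element> &y, std::vector<Element> &z) {
    z.clear();
    auto emit = [&z](size_t idx, double v) {
        if (v != 0.0) z.push_back({idx, v});
    };
    size_t i = 0, j = 0;
    while (i < x.size() && j < y.size()) {
        if (x[i].index < y[j].index) {
            emit(x[i].index, alpha * x[i].value); ++i;
        } else if (y[j].index < x[i].index) {
            emit(y[j].index, y[j].value); ++j;
        } else {
            emit(x[i].index, alpha * x[i].value + y[j].value); ++i; ++j;
        }
    }
    for (; i < x.size(); ++i) emit(x[i].index, alpha * x[i].value);
    for (; j < y.size(); ++j) emit(y[j].index, y[j].value);
}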
z.push_back(y[j].value);\n j++;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n size_t x_size = x.size();\n size_t y_size = y.size();\n size_t z_size = 0;\n size_t x_index = 0;\n size_t y_index = 0;\n while (x_index < x_size && y_index < y_size) {\n if (x[x_index].index == y[y_index].index) {\n z.push_back(alpha * x[x_index].value + y[y_index].value);\n z_size++;\n x_index++;\n y_index++;\n } else if (x[x_index].index < y[y_index].index) {\n z.push_back(alpha * x[x_index].value);\n z_size++;\n x_index++;\n } else {\n z.push_back(y[y_index].value);\n z_size++;\n y_index++;\n }\n }\n while (x_index < x_size) {\n z.push_back(alpha * x[x_index].value);\n z_size++;\n x_index++;\n }\n while (y_index < y_size) {\n z.push_back(y[y_index].value);\n z_size++;\n y_index++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n auto itx = x.begin();\n auto ity = y.begin();\n while (itx != x.end() && ity != y.end()) {\n if (itx->index == ity->index) {\n z.push_back(alpha * itx->value + ity->value);\n ++itx;\n ++ity;\n } else if (itx->index < ity->index) {\n z.push_back(alpha * itx->value);\n ++itx;\n } else {\n z.push_back(ity->value);\n ++ity;\n }\n }\n while (itx != x.end()) {\n z.push_back(alpha * itx->value);\n ++itx;\n }\n while (ity != y.end()) {\n z.push_back(ity->value);\n ++ity;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "hip", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors of size Nx and Ny. Store the result in z.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x or y.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\n__global__ void sparseAxpy(double alpha, const Element *x, const Element *y, double *z, size_t Nx, size_t Ny, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < Nx) {\n size_t index = x[tid].index;\n z[index] = alpha * x[tid].value + y[index];\n } else if (tid < Nx + Ny) {\n size_t index = y[tid - Nx].index;\n z[index] = alpha * 0 + y[tid - Nx].value;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = 0;\n\t\tfor (size_t j = 0; j < Nx; j++) {\n\t\t\tif (x[j].index == i) {\n\t\t\t\tz[i] += alpha * x[j].value;\n\t\t\t}\n\t\t}\n\t\tfor (size_t j = 0; j < Ny; j++) {\n\t\t\tif (y[j].index == i) {\n\t\t\t\tz[i] += y[j].value;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n int i;\n for (i = tid; i < N; i += blockDim.x * gridDim.x) {\n if (i < Nx) {\n z[x[i].index] += alpha * x[i].value;\n }\n if (i < Ny) {\n z[y[i].index] += y[i].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double val = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) val += alpha * x[i].value;\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) val += y[i].value;\n }\n z[tid] = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double val = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n val += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n val += y[i].value;\n }\n }\n z[tid] = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = alpha * x[i].value + y[i].value;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < Nx) {\n\t\tz[x[tid].index] += alpha * x[tid].value;\n\t}\n\telse if (tid < Nx + Ny) {\n\t\tz[y[tid - Nx].index] += y[tid - Nx].value;\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = 
alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) z[i] += alpha * x[j].value;\n }\n for (size_t j = 0; j < N; j++) {\n if (y[j].index == i) z[i] += y[j].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n\tsize_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tdouble sum = 0;\n\t\tfor (size_t i = 0; i < Nx; i++) {\n\t\t\tif (x[i].index == tid) {\n\t\t\t\tsum += alpha * x[i].value;\n\t\t\t}\n\t\t}\n\t\tfor (size_t i = 0; i < Ny; i++) {\n\t\t\tif (y[i].index == tid) {\n\t\t\t\tsum += y[i].value;\n\t\t\t}\n\t\t}\n\t\tz[tid] = sum;\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n \n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& e : x) {\n z_size = std::max(z_size, e.index + 1);\n }\n for (auto const& e : y) {\n z_size = std::max(z_size, e.index + 1);\n }\n \n // Allocate memory for the output vector\n z.resize(z_size);\n \n // Compute the output vector on rank 0\n if (rank == 0) {\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n }\n \n // Broadcast the output vector to all other ranks\n MPI_Bcast(&z[0], z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the partial sums\n std::vector partial_sums(z_size);\n for (auto const& element : x) {\n partial_sums[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_sums[element.index] += element.value;\n }\n\n // Reduce the partial sums\n std::vector global_sums(z_size);\n MPI_Allreduce(partial_sums.data(), global_sums.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (my_rank == 0) {\n z = global_sums;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the size of the result vector\n size_t zSize = 0;\n for (auto const& element : x) {\n zSize = std::max(zSize, element.index + 1);\n }\n for (auto const& element : y) {\n zSize = std::max(zSize, element.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(zSize);\n\n // Compute the result vector\n for (auto const& element : x) {\n z[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n z[element.index] += element.value;\n }\n\n // Gather the result vector on rank 0\n if (myRank == 0) {\n for (int i = 1; i < numRanks; ++i) {\n std::vector recvBuffer;\n MPI_Recv(&recvBuffer[0], recvBuffer.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recvBuffer.size(); ++j) {\n z[j] += recvBuffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n std::vector local_x, local_y;\n \n // Scatter x and y to all ranks\n MPI_Scatter(&x[0], x.size(), MPI_DOUBLE, &local_x[0], local_x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(&y[0], y.size(), MPI_DOUBLE, &local_y[0], local_y.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n \n // Compute local z\n std::vector local_z(z.size(), 0);\n for (size_t i = 0; i < local_x.size(); ++i) {\n local_z[local_x[i].index] += alpha * 
local_x[i].value;\n }\n for (size_t i = 0; i < local_y.size(); ++i) {\n local_z[local_y[i].index] += local_y[i].value;\n }\n \n // Gather local z to rank 0\n MPI_Gather(&local_z[0], local_z.size(), MPI_DOUBLE, &z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of elements in the sparse vector\n size_t num_elements = x.size();\n\n // Create a vector of zeros to store the result\n z.resize(num_elements);\n\n // Loop over the elements in the sparse vector\n for (size_t i = 0; i < num_elements; i++) {\n // Get the index and value of the element\n size_t index = x[i].index;\n double value = x[i].value;\n\n // Add the value of the element to the result\n z[index] += alpha * value;\n }\n\n // Loop over the elements in the sparse vector\n for (size_t i = 0; i < num_elements; i++) {\n // Get the index and value of the element\n size_t index = y[i].index;\n double value = y[i].value;\n\n // Add the value of the element to the result\n z[index] += value;\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements = x.size();\n\n // Create a buffer to store the result on each rank\n std::vector local_z(num_elements, 0.0);\n\n // Compute the local result\n for (size_t i = 0; i < num_elements; ++i) {\n local_z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_z.data(), z.data(), num_elements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n std::vector result;\n\n // Iterate over the elements in x and y\n for (size_t i = 0; i < num_elements_x; i++) {\n // Get the index and value of the current element in x\n size_t index_x = x[i].index;\n double value_x = x[i].value;\n\n // Iterate over the elements in y\n for (size_t j = 0; j < num_elements_y; j++) {\n // Get the index and value of the current element in y\n size_t index_y = y[j].index;\n double value_y = y[j].value;\n\n // If the indices of the current elements in x and y are equal, then add the values of the elements to the result\n if (index_x == index_y) {\n result.push_back({index_x, alpha * value_x + value_y});\n }\n }\n }\n\n // Sort the result vector by index\n std::sort(result.begin(), result.end(), [](Element const& a, Element const& b) { return a.index < b.index; });\n\n // Store the result in z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(result.size());\n for (size_t i = 0; i < result.size(); i++) {\n z[i] = result[i].value;\n }\n }\n}\n", + "\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the output vector\n z.resize(z_size, 0.0);\n\n // Compute the partial sums\n std::vector partial_sums(z_size, 0.0);\n for (auto const& element : x) {\n partial_sums[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_sums[element.index] += element.value;\n }\n\n // Reduce the partial sums\n std::vector global_sums(z_size, 0.0);\n MPI_Allreduce(partial_sums.data(), global_sums.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result in z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t resultSize = 0;\n for (auto const& element : x) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n for (auto const& element : y) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n\n // Create a vector to store the result on rank 0\n std::vector result(resultSize, 0);\n\n // Compute the result on each rank\n for (auto const& elementX : x) {\n result[elementX.index] += alpha * elementX.value;\n }\n for (auto const& elementY : y) {\n result[elementY.index] += elementY.value;\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recvBuffer(resultSize);\n MPI_Recv(recvBuffer.data(), resultSize, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < resultSize; j++) {\n result[j] += recvBuffer[j];\n }\n }\n z = result;\n } else {\n MPI_Send(result.data(), resultSize, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t result_size = 0;\n for (auto const& element : x) {\n result_size = std::max(result_size, element.index + 1);\n }\n for (auto const& element : y) {\n result_size = std::max(result_size, element.index + 1);\n }\n\n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(result_size, 0);\n }\n\n // Compute the partial result on each rank\n std::vector partial_result(result_size, 0);\n for (auto const& element : x) {\n partial_result[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_result[element.index] += element.value;\n }\n\n // Reduce the partial results to rank 0\n MPI_Reduce(partial_result.data(), z.data(), result_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the partial sum on each rank\n for (auto const& elem : x) {\n z[elem.index] += alpha * elem.value;\n }\n for (auto const& elem : y) {\n z[elem.index] += elem.value;\n }\n\n // Reduce the partial sums to rank 0\n MPI_Reduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the number of elements in the result vector\n size_t num_elements = x.size() + y.size();\n \n // Allocate memory for the result vector\n z.resize(num_elements);\n \n // Compute the local indices of the elements in the result vector\n std::vector local_indices(num_elements);\n for (size_t i = 0; i < num_elements; ++i) {\n local_indices[i] = i;\n }\n \n // Compute the local values of the elements in the result vector\n std::vector local_values(num_elements);\n for (size_t i = 0; i < num_elements; ++i) {\n local_values[i] = 0.0;\n }\n \n // Compute the global indices of the elements in the result vector\n std::vector global_indices(num_elements);\n MPI_Allgatherv(&local_indices[0], local_indices.size(), MPI_UNSIGNED_LONG, &global_indices[0], nullptr, nullptr, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n 
\n // Compute the global values of the elements in the result vector\n std::vector global_values(num_elements);\n MPI_Allgatherv(&local_values[0], local_values.size(), MPI_DOUBLE, &global_values[0], nullptr, nullptr, MPI_DOUBLE, MPI_COMM_WORLD);\n \n // Compute the final values of the elements in the result vector\n for (size_t i = 0; i < num_elements; ++i) {\n z[global_indices[i]] = alpha * x[global_indices[i]].value + y[global_indices[i]].value;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n \n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(z_size, 0);\n }\n \n // Compute the partial sums on each rank\n for (auto const& element : x) {\n if (rank == 0) {\n z[element.index] += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (rank == 0) {\n z[element.index] += element.value;\n }\n }\n \n // Reduce the partial sums to rank 0\n MPI_Reduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t result_size = 0;\n for (auto const& element : x) {\n result_size = std::max(result_size, element.index + 1);\n }\n for (auto const& element : y) {\n result_size = std::max(result_size, element.index + 1);\n }\n\n // Create a vector to store the result on rank 0\n std::vector result(result_size, 0.0);\n\n // Compute the result on each rank\n for (auto const& element : x) {\n result[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n result[element.index] += element.value;\n }\n\n // Gather the results from all ranks on rank 0\n MPI_Gather(result.data(), result_size, MPI_DOUBLE, z.data(), result_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n int num_elements = x.size();\n\n // Create a buffer to store the results\n std::vector z(num_elements);\n\n // Compute the results in parallel\n #pragma omp parallel for\n for (int i = 0; i < num_elements; i++) {\n z[i] = alpha * x[i].value * y[i].value;\n }\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = x.size() + y.size();\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the local size of the output vector\n size_t local_z_size = z_size / size;\n\n // Compute the starting index of the local output vector\n size_t local_z_start = rank * local_z_size;\n\n // Compute the local output vector\n for (size_t i = local_z_start; i < local_z_start + local_z_size; ++i) {\n z[i] = 0;\n }\n\n // Compute the local contribution to the output vector\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value;\n }\n\n for (size_t i = 0; i < y.size(); ++i) {\n z[y[i].index] += y[i].value;\n }\n\n // Reduce the local output vectors to the global output vector\n MPI_Allreduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 stores the result\n if (rank == 0) {\n // Sort the output vector by index\n std::sort(z.begin(), z.end(), [](const double& a, const 
double& b) { return a.index < b.index; });\n\n // Remove duplicate elements from the output vector\n z.erase(std::unique(z.begin(), z.end()), z.end());\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the size of the result vector\n size_t resultSize = 0;\n for (auto const& element : x) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n for (auto const& element : y) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n\n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(resultSize, 0);\n }\n\n // Compute the partial sums on each rank\n for (auto const& element : x) {\n if (rank == 0) {\n z[element.index] += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (rank == 0) {\n z[element.index] += element.value;\n }\n }\n\n // Reduce the partial sums to rank 0\n MPI_Allreduce(MPI_IN_PLACE, z.data(), resultSize, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a new vector z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(&z[0], z.size(), MPI::DOUBLE, &z[0], z.size(), MPI::DOUBLE, 0);\n}\n", + "\n\n // Create a new vector z with the same size as x\n z.resize(x.size());\n\n // Loop over the elements of x and y and perform the operation z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[i].index = x[i].index;\n z[i].value = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks into rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a new vector z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(&z[0], z.size(), MPI::DOUBLE, &z[0], z.size(), MPI::DOUBLE, 0);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "cuda", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors of size Nx and Ny. Store the result in z.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x or y.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\n__global__ void sparseAxpy(double alpha, const Element *x, const Element *y, double *z, size_t Nx, size_t Ny, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) sum += alpha * x[j].value;\n }\n for (size_t j = 0; j < Ny; j++) {\n if (y[j].index == i) sum += y[j].value;\n }\n z[i] = sum;\n }\n}\n", + "sparseSparseArray(1,", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) { sum += alpha * x[i].value; }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) { sum += y[i].value; }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = alpha * x[i].value + y[i].value;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n\tsize_t i = threadIdx.x;\n\tif (i < Nx) {\n\t\tz[x[i].index] += alpha * x[i].value;\n\t}\n\tif (i < Ny) {\n\t\tz[y[i].index] += y[i].value;\n\t}\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tz[tid].value = alpha * x[tid].value + y[tid].value;\n\t}\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n size_t i = tid;\n double sum = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) {\n sum += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < Ny; j++) {\n if (y[j].index == i) {\n sum += y[j].value;\n }\n }\n z[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + 
"\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) sum += alpha * x[i].value;\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) sum += y[i].value;\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == idx) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == idx) {\n sum += y[i].value;\n }\n }\n z[idx] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < Nx) {\n atomicAdd(&z[x[tid].index], alpha * x[tid].value);\n } else if (tid < Nx + Ny) {\n atomicAdd(&z[y[tid - Nx].index], y[tid - Nx].value);\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Store the result in z on rank 0.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the output vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the corresponding element in x\n for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n break;\n }\n }\n\n // Find the corresponding element in y\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n break;\n }\n }\n\n // Compute the output value\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the output vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buffer;\n MPI_Recv(&recv_buffer[0], recv_buffer.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_buffer.size(); ++j) {\n z[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto 
const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n }\n }\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n }\n }\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the result vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buffer;\n MPI_Recv(&recv_buffer[0], z_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < z_size; ++j) {\n z[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n z.resize(num_elements_x + num_elements_y);\n\n // Loop over the elements in x and y and add them to the result\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x + num_elements_y; i++) {\n if (i < num_elements_x) {\n z[i] += alpha * x[i].value;\n }\n if (i < num_elements_y) {\n z[i] += y[i].value;\n }\n }\n\n // Reduce the result to rank 0\n MPI_Reduce(&z[0], z.size(), MPI_DOUBLE, &z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize z on rank 0\n if (rank == 0) {\n z.resize(x.size());\n }\n\n // Compute the result in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (rank == 0) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n }\n\n // Gather the results to rank 0\n MPI_Gather(z.data(), x.size(), MPI_DOUBLE, z.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n z.resize(num_elements_x + num_elements_y);\n\n // Perform the sparse axpy operation in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x + num_elements_y; ++i) {\n // Initialize the result to 0\n z[i] = 0.0;\n\n // Add the corresponding 
elements from x and y\n for (size_t j = 0; j < num_elements_x; ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < num_elements_y; ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Initialize the output vector\n z.resize(z_size, 0.0);\n\n // Compute the partial sums\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value;\n }\n\n #pragma omp parallel for\n for (size_t i = 0; i < y.size(); ++i) {\n z[y[i].index] += y[i].value;\n }\n\n // Reduce the partial sums\n MPI_Allreduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Create a temporary vector to store the partial results on each rank\n std::vector partial_z(z.size(), 0.0);\n\n // Compute the partial results in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n for (auto& element : x) {\n if (element.index == i) {\n partial_z[i] += alpha * element.value;\n }\n }\n for (auto& element : y) {\n if (element.index == i) {\n partial_z[i] += element.value;\n }\n }\n }\n\n // Reduce the partial results to rank 0 using MPI\n MPI_Reduce(partial_z.data(), z.data(), z.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = x.size() + y.size();\n\n // Allocate the output vector on rank 0\n if (rank == 0) {\n z.resize(z_size);\n }\n\n // Compute the local size of the output vector\n size_t local_z_size = (z_size + size - 1) / size;\n\n // Allocate the local output vector\n std::vector local_z(local_z_size);\n\n // Compute the local indices of the output vector\n std::vector local_z_indices(local_z_size);\n for (size_t i = 0; i < local_z_size; i++) {\n local_z_indices[i] = i + rank * local_z_size;\n }\n\n // Compute the local contributions to the output vector\n #pragma omp parallel for\n for (size_t i = 0; i < local_z_size; i++) {\n size_t 
index = local_z_indices[i];\n double value = 0.0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == index) {\n value += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == index) {\n value += y[j].value;\n }\n }\n local_z[i] = value;\n }\n\n // Gather the local contributions to the output vector on rank 0\n MPI_Gather(local_z.data(), local_z_size, MPI_DOUBLE, z.data(), local_z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements = x.size() + y.size();\n\n // Create a vector to store the result\n std::vector result(num_elements);\n\n // Merge the two sparse vectors into the result vector\n size_t i = 0, j = 0, k = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n result[k++] = x[i++];\n } else if (x[i].index > y[j].index) {\n result[k++] = y[j++];\n } else {\n result[k++] = {x[i].index, x[i++].value + y[j++].value};\n }\n }\n while (i < x.size()) {\n result[k++] = x[i++];\n }\n while (j < y.size()) {\n result[k++] = y[j++];\n }\n\n // Reduce the result vector to rank 0\n MPI_Reduce(result.data(), z.data(), num_elements, MPI_DOUBLE_INT, MPI_SUM, 0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize z to 0\n z.assign(x.size(), 0);\n\n // Compute z = alpha*x+y in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t z_size = x.size() + y.size();\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma openmp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n // Find the index of the element in x that matches the index of the current element in z\n auto it_x = std::find_if(x.begin(), x.end(), [&](Element const& element) { return element.index == i; });\n\n // Find the index of the element in y that matches the index of the current element in z\n auto it_y = std::find_if(y.begin(), y.end(), [&](Element const& element) { return element.index == i; });\n\n // Compute the value of the current element in z\n z[i] = (it_x != x.end() ? alpha * it_x->value : 0.0) + (it_y != y.end() ? 
it_y->value : 0.0);\n }\n\n // Gather the result vector on rank 0\n MPI_Gather(&z[0], z_size, MPI_DOUBLE, &z[0], z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the size of the vectors\n size_t n = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(n);\n\n // Compute the results in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results on rank 0\n MPI_Gather(temp.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z on all ranks\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Check if the rank is 0\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n if (rank == 0) {\n // Initialize z with zeros\n z.resize(x.size());\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n }\n }\n\n // Perform sparse axpy operation in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (rank == 0) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the number of elements in the sparse vectors\n size_t numElements = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(numElements);\n\n // Compute the result in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < numElements; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(temp.data(), numElements, MPI_DOUBLE, z.data(), numElements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the number of elements in the sparse vectors\n size_t numElements = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(numElements);\n\n // Compute the result in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < numElements; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results on rank 0\n MPI_Gather(&temp[0], numElements, MPI_DOUBLE, &z[0], numElements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Broadcast alpha to all ranks\n MPI_Bcast(&alpha, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the output vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(z_size);\n }\n\n // Compute the output vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double value = 0.0;\n for (auto const& element : x) {\n if (element.index == i) {\n value += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (element.index == i) {\n value += element.value;\n }\n }\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z[i] = value;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the value of x at index i\n for (auto const& elem : x) {\n if (elem.index == i) {\n x_value = elem.value;\n break;\n }\n }\n\n // Find the value of y at index i\n for (auto const& elem : y) {\n if (elem.index == i) {\n y_value = elem.value;\n break;\n }\n }\n\n // Compute the value of z at index i\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the result vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::vector recv_buf;\n MPI_Recv(&recv_buf[0], recv_buf.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_buf.size(); ++j) {\n z[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the result vector\n z.resize(z_size, 0.0);\n\n // Compute the partial result on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the value of x at index i\n for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n break;\n }\n }\n\n // Find the value of y at index i\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n break;\n }\n }\n\n // Compute the partial result\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the partial results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::vector partial_result(z_size);\n MPI_Recv(partial_result.data(), z_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < z_size; ++j) {\n z[j] += partial_result[j];\n }\n }\n } else {\n MPI_Send(z.data(), z_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n std::vector result(num_elements_x);\n\n // Compute z = alpha*x+y in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x; i++) {\n result[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(result.data(), num_elements_x, MPI_DOUBLE, z.data(), num_elements_x, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(Kokkos::View const& s, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < s.extent(0); ++j) {\n if (s(j)[0] == 'a' + i) {\n ++bins[i];\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < s.extent(0); ++j) {\n if (s(j)[0] == 'a' + i) {\n ++bins[i];\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(0, s.extent(0), [&](size_t i) {\n char first = s(i)[0];\n size_t index = first - 'a';\n ++output(index);\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[s[i][0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins[s[i][0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "Counts, 0 0, 0,0,0, 0,0, ~:~:", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n int idx = s(i)[0] - 'a';\n if (idx >= 0 && idx < 26) {\n ++bins[idx];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s[i][0] - 'a']++;\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": 
"23_histogram_first_letter_counts", + "parallelism_model": "omp", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < 26; ++i) {\n char c = 'a' + i;\n for (auto const& str : s) {\n if (str[0] == c) {\n ++counts[i];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n size_t index = static_cast(s[i][0] - 'a');\n #pragma omp atomic\n bins[index]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n size_t index = s[i][0] - 'a';\n ++ didik[index];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "serial", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. 
Store the output in `bins` array.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n ++bins[str[0] - 'a'];\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n bins[str[0] - 'a']++;\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n ++bins[str[0] - 'a'];\n }\n}\n", + "\n\n for (std::string const& str : s) {\n if (str.empty()) {\n continue;\n }\n char first = str[0];\n if (first >= 'a' && first <= 'z') {\n bins[first - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n bins[str[0] - 'a']++;\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "hip", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\n__global__ void firstLetterCounts(const char **s, size_t N, size_t bins[26]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n char first = s[tid][0];\n atomicAdd(&bins[first - 'a'], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid < N) {\n char c = s[gid][0];\n atomicAdd(&bins[c - 'a'], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int index = s[gid][0] - 'a';\n atomicAdd(&bins[index], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n char first = s[idx][0];\n atomicAdd(&bins[first - 'a'], 1);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int bin = s[gid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n if (idx >= 0 && idx < 26) {\n atomicAdd(&bins[idx], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (size_t i = tid; i < N; i += stride) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n int bin = s[tid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n bins[s[gid][0] - 'a']++;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int index = s[gid][0] - 'a';\n atomicAdd(&bins[index], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) return;\n char first = s[idx][0];\n int offset = first - 'a';\n atomicAdd(&firstCounts[offset], 1);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int bin = s[gid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = 
bid * hipBlockDim_x + tid;\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of s. The result is stored in bins on rank 0.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ": \u51a2, Synag\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2 Saxe\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2zernezerne.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (const auto& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, 
MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet.\n std::array local_bins;\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n local_bins.fill(0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t n = s.size();\n std::array local_bins;\n local_bins.fill(0);\n for (size_t i = 0; i < n; i++) {\n if (rank == i % size) {\n local_bins[s[i][0] - 'a']++;\n }\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 
0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n std::fill(local_bins.begin(), local_bins.end(), 0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings starting with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) continue;\n ++local_bins[str[0] - 'a'];\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = s.size() / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = s.size();\n }\n std::array local_bins = {};\n for (int i = start; i < end; i++) {\n local_bins[s[i][0] - 
'a']++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "cuda", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\n__global__ void firstLetterCounts(const char **s, size_t N, size_t bins[26]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int firstLetter = s[idx][0] - 'a';\n atomicAdd(&bins[firstLetter], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int start = tid * 26 / N;\n int end = (tid + 1) * 26 / N;\n for (int i = start; i < end; i++) {\n bins[i] = 0;\n }\n for (int i = 0; i < N; i++) {\n int idx = s[i][0] - 'a';\n if (idx >= 0 && idx < 26) {\n atomicAdd(&bins[idx], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int firstLetter = s[idx][0] - 'a';\n atomicAdd(&bins[firstLetter], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n char firstLetter = s[idx][0];\n atomicAdd(&bins[firstLetter - 'a'], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int bin = s[tid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n 
}\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx >= N) {\n return;\n }\n bins[s[idx][0] - 'a']++;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int idx = s[tid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of s. The result is stored in bins on rank 0.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n local_bins.fill(0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n 
#pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the first letter of its own strings\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n local_bins[s[i][0] - 'a']++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0057888146, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0057924014, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk = s.size() / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = s.size();\n }\n std::array local_bins;\n local_bins.fill(0);\n for (int i = start; i < end; i++) {\n local_bins[s[i][0] - 'a']++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins;\n local_bins.fill(0);\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n local_bins[s[i][0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n 
MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank counts the first letter of its own strings\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = my_rank; i < s.size(); i += num_ranks) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the counts for a subset of the alphabet\n int start = rank * 26 / size;\n int end = (rank + 1) * 26 / size;\n\n std::array local_bins;\n local_bins.fill(0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n int index = s[i][0] - 'a';\n if (index >= start && index < end) {\n local_bins[index]++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0057688112, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0059319832, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the first letters in its portion of the vector\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = rank; i < s.size(); i += size) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n 
continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize the bins to 0\n std::fill(std::begin( bins ), std::end( bins ), 0);\n\n // Count the number of words starting with each letter in parallel\n #pragma openmp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n ++ bins[s[i][0] - 'a'];\n }\n\n // Gather the results on rank 0\n MPI_Gather(MPI_IN_PLACE, 26, MPI_UNSIGNED_LONG, bins.data(), 26, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0057692018, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0057796414, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0057787014, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0057662502, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0057896422, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0058081906, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0058015842, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0057768142, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": 
"\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the first letter of the strings in its portion of the vector\n std::array local_bins = {};\n #pragma omp parallel for\n for (size_t i = rank * s.size() / nprocs; i < (rank + 1) * s.size() / nprocs; ++i) {\n local_bins[s[i][0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.005841742, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0057929884, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings in s that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins = {};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(Kokkos::View const& x, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(i / 10) += (x(i) < 100) ? 1 : 0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[x(i) / 10]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA(const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[x(i) / 10]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i / 10] += (x(i) >= 10 * i / 10 && x(i) < 10 * (i / 10 + 1));\n });\n}\n", + "\n\n Kokkos::parallel_for(bins.size(), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j] >= 10 * i && x[j] < 10 * (i + 1)) { bins[i] += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n const double val = x(i);\n update += (val >= 0 && val < 10) + (val >= 10 && val < 20) + (val >= 20 && val < 30) +\n (val >= 30 && val < 40) + (val >= 40 && val < 50) + (val >= 50 && val < 60) +\n (val >= 60 && val < 70) + (val >= 70 && val < 80) + (val >= 80 && val < 90) +\n (val >= 90 && val < 100);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n const double val = x(i);\n bins(val / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n const double val = x(i);\n bins(val / 10) += 1;\n });\n}\n", + "\n\n 
Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i / 10] += 1;\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n size_t idx = static_cast(x[i] / 10);\n ++counts[idx];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n size_t bin = static_cast(x[i] / 10);\n bins[bin]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)] += 1;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "serial", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. 
Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n std::fill(\u5803.begin(), \u5803.end(), 0);\n for (double const val : x) {\n \u5803[static_cast(val / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double val : x) {\n bins[static_cast(val / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n int i = std::min((int)(v - bins.front()), (int)bins.size() - 1);\n bins[i]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "hip", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use AMD HIP to compute in parallel. 
The kernel is initialized with at least as many threads as values in x.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\n__global__ void binsBy10Count(const double *x, size_t N, size_t bins[10]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipGridDim_x;\n for (size_t i = idx; i < N; i += stride) {\n bins[x[i] / 10] += 1;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipGridDim_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n int bin = (int)(x[i] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n int bin = (int)(x[i] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n int bin = x[i] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n int bin = (int)x[tid] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n bins[(int)(x[i] / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use MPI to compute in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. The result is stored in bins on rank 0.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n bins[bin]++;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv;\n MPI_Recv(recv.data(), recv.size(), MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < bins.size(); j++) {\n bins[j] += recv[j];\n }\n }\n } else {\n MPI_Send(bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of values in each bin\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n\n // Reduce the bins across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local range.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n bins[bin]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of values in each bin on each rank\n std::array local_count = {};\n for (auto v : x) {\n local_count[static_cast(v / 10)]++;\n }\n\n // Gather the local counts into bins on rank 0\n MPI_Gather(local_count.data(), 10, MPI_UNSIGNED_LONG, bins.data(), 10, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its subset of x\n std::array local_bins = {};\n for (double xi : x) {\n local_bins[static_cast(xi / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double xi : x) {\n int bin = static_cast(xi / 10);\n bins[bin]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), 
MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n for (double val : x) {\n int bin = static_cast(val / 10);\n if (bin < 10) {\n bins[bin]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array localBins{};\n for (double v : x) {\n localBins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(localBins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (auto v : x) {\n bins[v / 10]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] >= 0 && x[i] < 10) {\n bins[0]++;\n } else if (x[i] >= 10 && x[i] < 20) {\n bins[1]++;\n } else if (x[i] >= 20 && x[i] < 30) {\n bins[2]++;\n } else if (x[i] >= 30 && x[i] < 40) {\n bins[3]++;\n } else if (x[i] >= 40 && x[i] < 50) {\n bins[4]++;\n } else if (x[i] >= 50 && x[i] < 60) {\n bins[5]++;\n } else if (x[i] >= 60 && x[i] < 70) {\n bins[6]++;\n } else if (x[i] >= 70 && x[i] < 80) {\n bins[7]++;\n } else if (x[i] >= 80 && x[i] < 90) {\n bins[8]++;\n } else if (x[i] >= 90 && x[i] < 100) {\n bins[9]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local part of x.\n std::array localBins = {};\n for (double v : x) {\n localBins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts.\n MPI_Reduce(localBins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its subset of x\n std::array local_bins = {};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), recv_bins.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < bins.size(); j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), local_bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local range.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int 
rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n if (bin < 10) {\n bins[bin]++;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array tmp;\n MPI_Recv(tmp.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; j++) {\n bins[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its own bins.\n std::array local_bins = {};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), recv_bins.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_bins.size(); j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), local_bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "cuda", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use CUDA to compute in parallel. 
The kernel is initialized with at least as many threads as values in x.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\n__global__ void binsBy10Count(const double *x, size_t N, size_t bins[10]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x;\nif (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = (int)floor(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of values in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of values in its local portion of x\n std::array local_bins;\n std::fill(local_bins.begin(), local_bins.end(), 0);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce local bins to global bins\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins = {};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the bin size\n const double bin_size = (x.back() - x.front()) / bins.size();\n\n // Compute the local histogram\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n const double value = x[i];\n const int bin_index = std::min(static_cast(bins.size() - 1),\n static_cast(std::floor((value - x.front()) / 
bin_size)));\n ++local_bins[bin_index];\n }\n\n // Reduce the local histograms to the global histogram\n MPI_Reduce(MPI_IN_PLACE, local_bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": ", 10]", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double value : x) {\n local_bins[static_cast(value / 10)]++;\n }\n\n // Reduce the local counts into the global bins array on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.006132766, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061102958, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061035694, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061111752, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0060987458, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0060972612, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.0061062462, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061033024, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n int bin = static_cast(x[i] / 10);\n if (bin < 10) {\n #pragma omp atomic\n ++bins[bin];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::array local_bins = {};\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n int bin = static_cast(x[i] / 10);\n local_bins[bin]++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.006101333, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061466078, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the counts for a subset of the bins.\n size_t start_bin = rank * bins.size() / nprocs;\n size_t end_bin = (rank + 1) * bins.size() / nprocs;\n\n std::fill(bins.begin() + start_bin, bins.begin() + end_bin, 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n size_t bin = static_cast(x[i] / 10);\n if (bin >= start_bin && bin < end_bin) {\n ++bins[bin];\n }\n }\n\n // Gather the counts from all ranks into bins on rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n int bin = static_cast(x[i]) / 10;\n #pragma omp atomic\n ++bins[bin];\n }\n\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::array recv;\n MPI_Recv(recv.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; ++j) {\n bins[j] += recv[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0061218854, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0118804496, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.102486177, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.128599611, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1076299148, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0785331638, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1506016152, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0751868598, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its portion of x\n std::array local_bins = {0};\n #pragma omp parallel for\n for (size_t i = rank * x.size() / size; i < (rank + 1) * x.size() / size; ++i) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the local counts into the global bins on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; ++i) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0061168898, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061300446, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of values in its local portion of x\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins = {};\n for (size_t i = rank; i < x.size(); i += nprocs) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0061357654, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061218414, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061004352, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006097131, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061109988, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0060998138, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006133375, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006100928, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x\n std::array local_bins = {0};\n #pragma omp parallel for\n for (size_t i = rank; i < x.size(); i += nprocs) {\n int bin = static_cast(x[i] / 10);\n local_bins[bin]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n 
MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use Kokkos to count in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(Kokkos::View const& points, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_for;\n using Kokkos::Functor;\n\n struct CountQuadrants {\n Kokkos::View bins;\n Kokkos::View points;\n\n KOKKOS_FUNCTION\n void operator()(const int i) const {\n const auto& point = points[i];\n if (point.x > 0) {\n if (point.y > 0) {\n ++bins[0];\n } else {\n ++bins[2];\n }\n } else {\n if (point.y > 0) {\n ++bins[1];\n } else {\n ++bins[3];\n }\n }\n }\n };\n\n parallel_for(Kokkos::RangePolicy(0, points.size()), CountQuadrants{bins, points});\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA (const size_t& i, size_t& update) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++update[0];\n } else {\n ++update[3];\n }\n } else {\n if (points(i)->y > 0) {\n ++update[1];\n } else {\n ++update[2];\n }\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.size(), KOKKOS_LAMBDA (const size_t i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x >= 0 && y >= 0) {\n ++bins(0);\n } else if (x < 0 && y >= 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "{x=1.5,y=1.1}", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double x = points(i)->x;\n double y = 
points(i)->y;\n if (x >= 0) {\n if (y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n const Point& p = points(i);\n if (p.x > 0) {\n if (p.y > 0) {\n ++update[0];\n } else {\n ++update[3];\n }\n } else {\n if (p.y > 0) {\n ++update[1];\n } else {\n ++update[2];\n }\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n double x = points[i]->x;\n double y = points[i]->y;\n if (x >= 0 && y >= 0) {\n ++update;\n } else if (x < 0 && y >= 0) {\n ++update;\n } else if (x < 0 && y < 0) {\n ++update;\n } else {\n ++update;\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = points(i);\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", Kokkos::RangePolicy<>(0, points.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x > 0 && y > 0) {\n ++bins(0);\n } else if (x < 0 && y > 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else if (x > 0 && y < 0) {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy<>(0, points.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x > 0 && y > 0) {\n ++bins(0);\n } else if (x < 0 && y > 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else if (x > 0 && y < 0) {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n size_t idx = 0;\n if (x >= 0) {\n idx += 1;\n }\n if (y >= 0) {\n idx += 2;\n }\n ++bins(idx);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n size_t bin = 0;\n if (x > 0) {\n bin += 1;\n }\n if (y > 0) {\n bin += 2;\n }\n Kokkos::atomic_fetch_add(&bins(bin), 1);\n });\n}\n", + "\n\n using exec_space = KokKos::DefaultExecutionSpace;\n using league_space = KokKos::TeamVectorRange;\n\n league_space range(exec_space{}, points.size());\n\n auto policy = KokKos::TeamVectorRange\u653f\u7b56(range);\n\n auto lambda = KOKKOS_LAMBDA (const Point point, size_t* count) {\n if (point.x > 0) {\n if (point.y > 0) {\n ++count[0];\n } else {\n ++count[3];\n }\n } else {\n if (point.y > 0) {\n ++count[1];\n } else {\n ++count[2];\n }\n }\n };\n\n exec_space exec;\n exec.parallel_reduce(\n \"count_quadrants\", policy, lambda, Rcpp::as(count));\n}\n", + 
"\n\n Kokkos::parallel_for(Kokkos::RangePolicy>(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = *points(i);\n if (p.x > 0) {\n if (p.y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use OpenMP to count in parallel.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const Point& p = points[i];\n if (p.x >= 0 && p.y >= 0) {\n ++bins[0];\n } else if (p.x < 0 && p.y >= 0) {\n ++bins[1];\n } else if (p.x < 0 && p.y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); i++) {\n const auto& p = points[i];\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins[0];\n } else 
{\n ++bins[3];\n }\n } else {\n if (p.y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++oient[0];\n } else {\n ++oient[3];\n }\n } else {\n if (points[i].y > 0) {\n ++oient[1];\n } else {\n ++oient[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n double x = points[i].x;\n double y = points[i].y;\n if (x > 0) {\n if (y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "serial", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. 
The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (const Point& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (const auto& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (point.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (Point const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (Point const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else if (p.x < 0) {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else if 
(p.y < 0) {\n bins[3]++;\n }\n } else if (p.x < 0) {\n if (p.y > 0) {\n bins[1]++;\n } else if (p.y < 0) {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& pt : points) {\n if (pt.x > 0) {\n if (pt.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (pt.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "(", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "hip", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use AMD HIP to count in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\n__global__ void countQuadrants(const Point *points, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n while (idx < N) {\n if (points[idx].x >= 0) {\n if (points[idx].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[idx].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n idx += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int quadrant = 0;\n if (points[gid].x > 0) {\n quadrant += 1;\n }\n if (points[gid].y > 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int quadrant = 0;\n if (points[tid].x > 0) {\n quadrant += 1;\n }\n if (points[tid].y > 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid >= N) return;\n const Point &p = points[tid];\n int quadrant = 0;\n if (p.x > 0) quadrant |= 1;\n if (p.y > 0) quadrant |= 2;\n atomicAdd(&bins[quadrant], 1);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n const Point &p = points[gid];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n Point p = points[tid];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if 
(points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "(4)", + "(4)", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n if (idx >= N) {\n return;\n }\n\n const Point &p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int blockId = hipBlockIdx_x;\n int blockDim = hipBlockDim_x;\n\n for (int i = tid; i < N; i += blockDim) {\n if (points[i].x > 0 && points[i].y > 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y > 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[i].x > 0 && points[i].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n const Point p = points[gid];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 
1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[i].x >= 0 && points[i].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n Point p = points[tid];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = tid + bid * hipBlockDim_x;\n if (idx >= N) {\n return;\n }\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use MPI to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The result is stored in bins on rank 0.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its own quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (const auto& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : 
points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0 && point.y >= 0) {\n local_bins[0]++;\n } else if (point.x < 0 && point.y >= 0) {\n local_bins[1]++;\n } else if (point.x >= 0 && point.y < 0) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (auto const& p : points) {\n int quadrant = 0;\n if (p.x > 0) quadrant += 1;\n if (p.y > 0) quadrant += 2;\n bins[quadrant]++;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array tmp;\n MPI_Recv(tmp.data(), tmp.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(bins.data(), bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_bins[4] = {0};\n for (Point const& p : points) {\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for 
(size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y > 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y > 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 
0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant.\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array localBins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n localBins[0]++;\n } else {\n localBins[3]++;\n }\n } else {\n if (point.y > 0) {\n localBins[1]++;\n } else {\n localBins[2]++;\n }\n }\n }\n\n MPI_Reduce(localBins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "cuda", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use CUDA to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\n__global__ void countQuadrants(const Point *points, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int bin = 0;\n if (points[tid].x > 0) bin += 1;\n if (points[tid].y > 0) bin += 2;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int bin = 0;\n if (points[tid].x >= 0) {\n bin += 1;\n }\n if (points[tid].y >= 0) {\n bin += 2;\n }\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += gridDim.x * blockDim.x;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n while (idx < N) {\n if (points[idx].x >= 0 && points[idx].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[idx].x < 0 && points[idx].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[idx].x < 0 && points[idx].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[idx].x >= 0 && points[idx].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n idx += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n int quadrant = 0;\n if (p.x > 0) {\n quadrant |= 1;\n }\n if (p.y > 0) {\n quadrant |= 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n Point p = points[idx];\n int quadrant = 0;\n if (p.x >= 0) {\n quadrant += 2;\n }\n if (p.y >= 0) {\n quadrant += 1;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else 
{\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx >= N) {\n return;\n }\n\n const Point &p = points[idx];\n int quadrant = 0;\n if (p.x >= 0) {\n quadrant += 1;\n }\n if (p.y >= 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant - 1], 1);\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[1], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n const Point p = points[tid];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n if (idx >= N) {\n return;\n }\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n while (i < N) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[i].y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += gridDim.x * blockDim.x;\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int count = 0;\n for (int i = idx; i < N; i += blockDim.x) {\n if (points[i].x > 0 && points[i].y > 0) {\n count++;\n } else if (points[i].x < 0 && points[i].y > 0) {\n count++;\n } else if (points[i].x < 0 && points[i].y < 0) {\n count++;\n } else if (points[i].x > 0 && points[i].y < 0) {\n count++;\n }\n }\n bins[idx] = count;\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n 
atomicAdd(&bins[3], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use MPI and OpenMP to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. The result is stored in bins on rank 0.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); i++) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.042454047, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.037510522, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Initialize bins to zero\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n ++bins[0];\n } else if (points[i].x < 0 && points[i].y >= 0) {\n ++bins[1];\n } else if (points[i].x < 0 && points[i].y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0417874462, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0414319832, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + 
"did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0425411486, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0374829824, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the vector\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n 
++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& point = points[i];\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+:local_bins)\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) 
{\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n ++local_bins[0];\n } else if (points[i].x < 0 && points[i].y >= 0) {\n ++local_bins[1];\n } else if (points[i].x < 0 && points[i].y < 0) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0422993808, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0413629598, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n Point p = points[i];\n if (p.x >= 0 && p.y >= 0) {\n local_bins[0]++;\n } else if (p.x < 0 && p.y >= 0) {\n local_bins[1]++;\n } else if (p.x < 0 && p.y < 0) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize bins to zero\n std::fill(bins.begin(), 
bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const Point& point = points[i];\n if (point.x >= 0 && point.y >= 0) {\n ++bins[0];\n } else if (point.x < 0 && point.y >= 0) {\n ++bins[1];\n } else if (point.x < 0 && point.y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0426100998, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.041432765, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local subset of points\n std::array local_bins = {0, 0, 0, 0};\n #pragma opm parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n if (points[i].x > 0 && points[i].y > 0) {\n ++local_bins[0];\n } else if (points[i].x < 0 && points[i].y > 0) {\n ++local_bins[1];\n } else if (points[i].x < 0 && points[i].y < 0) {\n ++local_bins[2];\n } else if (points[i].x > 0 && points[i].y < 0) {\n ++local_bins[3];\n }\n }\n\n // Gather the local counts into bins on rank 0\n MPI_Gather(local_bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for reduction(+: bins)\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the points in its portion of the image\n #pragma omp parallel for reduction(+: bins)\n for (size_t i = rank; i < points.size(); i += nprocs) {\n const Point& point = points[i];\n if (point.x >= 0) {\n if (point.y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) 
{\n for (int i = 1; i < nprocs; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the array\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use Kokkos to compute in parallel. 
Assume that Kokkos has already been initialized.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(Kokkos::View const& x, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n const size_t num_elements = x.extent(0);\n Kokkos::parallel_for(\"countQuartiles\", Kokkos::RangePolicy(0, num_elements), KOKKOS_LAMBDA(const size_t i) {\n const double frac = x(i) - floor(x(i));\n if (frac < 0.25) {\n ++quart[0];\n } else if (frac < 0.5) {\n ++quart[1];\n } else if (frac < 0.75) {\n ++quart[2];\n } else {\n ++quart[3];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n\n parallel_for(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i) {\n double frac = x(i) - floor(x(i));\n if (frac < 0.25) {\n ++quart[0];\n } else if (frac < 0.5) {\n ++quart[1];\n } else if (frac < 0.75) {\n ++quart[2];\n } else {\n ++quart[3];\n }\n });\n}\n", + "\n\n const size_t n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n auto exec_space = Kokkos::DefaultExecutionSpace::instance();\n auto policy = Kokkos::RangePolicy<>(exec_space, 0, x.extent(0));\n\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const size_t& i) {\n double frac = 
std::modf(x(i), &x(i));\n if (frac < 0.25)\n ++bins(0);\n else if (frac < 0.5)\n ++bins(1);\n else if (frac < 0.75)\n ++bins(2);\n else\n ++bins(3);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double fract = std::fmod(x[i], 1.0);\n if (fract < 0.25) {\n ++bins[0];\n } else if (fract < 0.5) {\n ++bins[1];\n } else if (fract < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "<,\u51a2~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma simd\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = std::fmod(x[i], 1.0);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++(std::get<0>(std::ref(x)));\n } else if (frac < 0.5) {\n ++(std::get<1>(std::ref(x)));\n } else if (frac < 0.75) {\n ++(std::get<2>(std::ref(x)));\n } else {\n ++(std::get<3>(std::ref(x)));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n #pragma omp parallel for\n for (size_t 
i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = std::fmod(x[i], 1.0);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "serial", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double const& xi : x) {\n if (xi < 0.25) {\n ++\u0648\u064a\u0627\u062a[0];\n } else if (xi < 0.5) {\n ++\u0648\u064a\u0627\u062a[1];\n } else if (xi < 0.75) {\n ++\u0648\u064a\u0627\u062a[2];\n } else {\n ++\u0648\u064a\u0627\u062a[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (auto v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n bins[0]++;\n else if (frac < 0.5)\n bins[1]++;\n else if (frac < 0.75)\n bins[2]++;\n else\n bins[3]++;\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n bins[0]++;\n else if (frac < 0.5)\n bins[1]++;\n else if (frac < 0.75)\n 
bins[2]++;\n else\n bins[3]++;\n }\n}\n", + "\n\n bins.fill(0);\n for (const auto &xi : x) {\n const auto frac = std::modf(xi, nullptr);\n if (frac >= 0.25 && frac < 0.5) {\n ++bins[0];\n } else if (frac >= 0.5 && frac < 0.75) {\n ++bins[1];\n } else if (frac >= 0.75 && frac < 1.0) {\n ++bins[2];\n } else if (frac == 0.0) {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (auto const& v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "hip", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\n__global__ void countQuartiles(const double *x, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = hipGetGlobalIndex();\n if (idx >= N) return;\n\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&quartiles[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&quartiles[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&quartiles[2], 1);\n } else {\n atomicAdd(&quartiles[3], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n double frac = fmod(x[i], 1);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n 
} else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n 
atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n double f = fmod(x[idx], 1.0);\n if (f < 0.25)\n atomicAdd(&bins[0], 1);\n else if (f < 0.5)\n atomicAdd(&bins[1], 1);\n else if (f < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid >= N) {\n return;\n }\n\n double frac = x[gid] - floor(x[gid]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use MPI to compute in parallel. Assume that MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double d : x) {\n double frac = fmod(d, 1.0);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Count the number of doubles in each quartile.\n for (double v : x) {\n if (v < 0.25) {\n ++bins[0];\n } else if (v < 0.5) {\n ++bins[1];\n } else if (v < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Gather the counts from all ranks on rank 0.\n MPI_Gather(bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each quartile.\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n for (auto v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of the vector\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n 
bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(&recv_bins, 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(&local_bins, 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each quartile\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the local bins\n std::array local_bins = {0, 0, 0, 0};\n\n // Count the number of doubles in each quartile\n for (double d : x) {\n double frac = fmod(d, 1.0);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local counts\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_bins[4] = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 
0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n MPI_Reduce(local_bins, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n size_t n = x.size();\n std::array local_bins = {0, 0, 0, 0};\n\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n\n for (double val : x) {\n double frac = val - std::floor(val);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, 
MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (auto xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "cuda", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\n__global__ void countQuartiles(const double *x, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n double frac = x[idx] - floor(x[idx]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double frac = x[tid] - floor(x[tid]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n 
atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + ", 4]", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n for (size_t i = tid + bid * stride; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double f = fmod(x[idx], 1.0);\n if (f < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (f < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (f < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use MPI and OpenMP to compute in parallel. Assume that MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each quartile\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.063731151, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.077187419, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t const rank = ::rank();\n size_t const size = x.size();\n std::fill(counts.begin(), counts.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = rank; i < size; i += ::size()) {\n double const value = x[i];\n if (value < 0.25) {\n ++counts[0];\n } else if (value < 0.5) {\n ++counts[1];\n } else if (value < 0.75) {\n ++counts[2];\n } else {\n ++counts[3];\n }\n }\n\n #pragma omp parallel sections\n {\n #pragma omp section\n {\n if (rank == 0) {\n std::fill(counts.begin(), \n counts.end(), \n 0);\n }\n }\n #pragma omp section\n {\n if (rank == 0) {\n std::transform(counts.begin(), \n counts.end(), \n counts.begin(), \n [](size_t const a, size_t const b) {\n return a + b;\n });\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0636956988, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0771581218, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of doubles in its portion of x that 
have a fractional part in each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < x.size(); i += size) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Rank 0 gathers the local counts from all ranks and stores the result in bins\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0637602978, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0773481382, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Initialize the bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + 
"did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each bin\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each quartile\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the counts to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize the bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of elements in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0631534674, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0746406486, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + 
"best_sequential_runtime": 0.0637299652, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0773027028, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its local vector that fall into each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its local copy of x that fall into each quartile.\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+:local_bins)\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Rank 0 gathers the local counts from all ranks and stores the result in bins.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; ++i) {\n std::array recv_bins;\n MPI_Recv(&recv_bins, 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(&local_bins, 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0637357458, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0772086206, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + 
"num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0636364074, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.077166666, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of elements in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the counts from all ranks to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n\n // Each rank computes its local counts\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Gather the local counts into bins on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0635663638, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.07719769, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and number of ranks\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < nranks; ++i) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use Kokkos to count in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(Kokkos::View const& image, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.size(), KOKKOS_LAMBDA (const size_t i) {\n ++bins(image(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const size_t& i, size_t& count) {\n count += Kokkos::count(image, i);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n bins[image[i]]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256),\n KOKKOS_LAMBDA(const int& i, size_t& update) {\n update += Kokkos::count(Kokkos::View(image.data(), image.size()), i);\n },\n bins);\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA (const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA(const int& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA(const int& i) {\n ++bins[image(i)];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy<>(0, 256), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy<>(0, image.extent(0)), KOKKOS_LAMBDA (const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = Kokkos::atomic_fetch_add(&bins[image[i]], 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixel_count\", Kokkos::RangePolicy<>(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixel_count\", Kokkos::RangePolicy<>(0, image.size()), KOKKOS_LAMBDA(const int& 
i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i, size_t& sum) {\n sum += Kokkos::count(image, i);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA (const int& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA(const int& i) {\n ++bins(image(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n bins[image[i]]++;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const size_t& i, size_t& count) {\n count += Kokkos::count(Kokkos::ViewConstRange(image, image.extent(0)), i);\n }, bins);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use OpenMP to count in parallel.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); 
i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "serial", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int pixel : image) {\n ++Counts[pixel];\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (auto const& pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "hip", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use AMD HIP to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n__global__ void pixelCounts(const int *image, size_t N, size_t bins[256]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx.x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = image[idx];\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx >= N) return;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n ++bins[image[idx]];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = image[idx];\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) { bins[image[idx]]++; }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (size_t i = idx; i < N; i += stride) {\n atomicAdd(&bins[image[i]], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use MPI to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. 
The result is stored in bins on rank 0.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array localBins{};\n for (int pixel : image) {\n localBins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = localBins;\n for (int i = 1; i < size; i++) {\n std::array recvBins;\n MPI_Recv(recvBins.data(), 256, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recvBins[j];\n }\n }\n } else {\n MPI_Send(localBins.data(), 256, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n for (int pixel : image) {\n bins[pixel]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array localBins;\n for (size_t i = 0; i < image.size(); ++i) {\n localBins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(localBins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = rank; i < image.size(); i += size) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t chunk_size = image.size() / size;\n size_t start = rank * chunk_size;\n size_t end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = image.size();\n }\n std::array local_bins;\n local_bins.fill(0);\n for (size_t i = start; i < end; i++) {\n local_bins[image[i]]++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n size_t n = image.size();\n size_t local_bins[256] = {0};\n for (size_t i = rank; i < n; i += size) {\n local_bins[image[i]]++;\n }\n MPI_Reduce(local_bins, bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = rank; i < image.size(); i += size) {\n bins[image[i]]++;\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "cuda", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use CUDA to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n__global__ void pixelCounts(const int *image, size_t N, size_t bins[256]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n ++bins[image[idx]];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use MPI and OpenMP to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. 
The result is stored in bins on rank 0.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0027196622, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018732506, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.001849532, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027184582, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the results from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0018541666, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027184036, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (size_t i = 
rank; i < image.size(); i += nprocs) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0018408738, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019336414, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019329666, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019319804, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001929995, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001933786, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019340386, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019359218, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010836491, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00082105415, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t const n = image.size();\n size_t const chunk = (n + size - 1) / size;\n size_t const start = rank * chunk;\n size_t const end = std::min(start + chunk, n);\n std::array localBins = {};\n for (size_t i = start; i < end; ++i) {\n ++localBins[image[i]];\n }\n MPI_Reduce(localBins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Count the number of pixels with each intensity in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.002725149, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018712446, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int i = rank; i < image.size(); i += size) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + 
"did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n size_t local_bins[256] = {0};\n #pragma omp parallel for\n for (size_t i = rank * image.size() / nprocs; i < (rank + 1) * image.size() / nprocs; ++i) {\n ++local_bins[image[i]];\n }\n\n // Gather the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins, bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n\n // Reduce the bins from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize bins to all zeros\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of pixels in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the bins across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Count the number of pixels in each bin on each rank.\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++local_bins[image[i]];\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n 
MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0027232572, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018706558, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": ".", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Count the number of pixels in image with each grayscale intensity.\n // Use OpenMP to count in parallel.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0027209126, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018699282, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int i = rank; i < image.size(); i += nprocs) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the results from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0018591678, + "runs": [ + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.0027217624, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image\n std::array local_bins;\n local_bins.fill(0);\n #pragma omp parallel for\n for (size_t i = rank * image.size() / nprocs; i < (rank + 1) * image.size() / nprocs; ++i) {\n ++local_bins[image[i]];\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use Kokkos to parallelize the search. Assume Kokkos has already been initialized.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n const size_t n = x.extent(0);\n size_t result = n;\n Kokkos::parallel_reduce(\n RangePolicy<>(0, n),\n KOKKOS_LAMBDA(const size_t i) {\n if (x(i) % 2 == 0) {\n result = std::min(result, i);\n }\n },\n result);\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag, int>;\n auto result = Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, int& result) {\n if (x(i) % 2 == 0) {\n result = i;\n }\n }, result, tag_type());\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even(i) = x(i) % 2 == 0;\n });\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even(i)) {\n firstEven = i;\n }\n }, firstEven);\n return firstEven;\n}\n", + "_", + "\n\n size_t result = x.size();\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(size_t i, Kokkos::Experimental::Automic& atom) {\n if (x(i) % 2 == 0) {\n atom.fetch_min(result, i);\n }\n });\n return result;\n}\n", + "\n\n size_t result = -1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n Kokkos::fence();\n }\n },\n result);\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even(i) = (x(i) % 2 == 0);\n });\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even(i)) {\n firstEven = i;\n }\n }, firstEven);\n return firstEven;\n}\n", + "\n\n size_t result = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n 
Kokkos::parallel_scan(Kokkos::RangePolicy(i + 1, x.size()), KOKKOS_LAMBDA(const size_t& j, size_t& result) {\n result = 0;\n });\n }\n });\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t result = n;\n Kokkos::parallel_for(\"findFirstEven\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) { result = i; }\n });\n return result;\n}\n", + "\n\n Kokkos::parallel_for(\"findFirstEven\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n Kokkos::atomic_fetch_add(&findFirstEven_result, 1);\n Kokkos::fence();\n if (findFirstEven_result == 1) {\n findFirstEven_index = i;\n Kokkos::fence();\n Kokkos::parallel_scan(\"findFirstEven_break\", x.extent(0),\n KOKKOS_LAMBDA(const size_t& i, const size_t& my_result) {\n if (findFirstEven_index == i) {\n return my_result + 1;\n }\n return my_result;\n },\n 0);\n Kokkos::fence();\n Kokkos::parallel_for(\"findFirstEven_break\", x.extent(0),\n KOKKOS_LAMBDA(const size_t& i) {\n if (findFirstEven_index == i) {\n Kokkos::fence();\n Kokkos::break_();\n }\n });\n }\n }\n });\n return findFirstEven_index;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t result = n;\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n\n return result;\n}\n", + "&", + "\n\n size_t N = x.extent(0);\n size_t result = N;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n size_t result = Kokkos::parallel_reduce(\n policy_type(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n },\n x.size());\n return result;\n}\n", + "\n\n size_t N = x.size();\n size_t result = N;\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even[i] = (x[i] % 2 == 0);\n });\n\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even[i] && i < firstEven) {\n firstEven = i;\n }\n }, firstEven);\n\n return firstEven;\n}\n", + "\n\n size_t result = Kokkos::parallel_reduce(\n x.extent(0), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n },\n result);\n return result;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n result = Kokkos::atomic_fetch_add(&result, 1);\n }\n },\n result);\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t firstEven = n;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n firstEven = i;\n return;\n }\n });\n return firstEven;\n}\n", + "\n\n size_t N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n Kokkos::atomic_fetch_max(&findFirstEven_result, i);\n }\n });\n return findFirstEven_result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use OpenMP to parallelize the search.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 
9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n#pragma omp cancel for\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t result = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0 && i < result) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t first_even = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n return first_even;\n}\n", + "\n\n size_t firstEven = -1;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (firstEven == -1) {\n firstEven = i;\n }\n }\n }\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t firstEven = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0 && i < firstEven) {\n#pragma omp critical\n firstEven = i;\n }\n }\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t firstEven = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < firstEven) {\n firstEven = i;\n }\n }\n }\n return firstEven;\n}\n", + "\n\n size_t n = x.size();\n size_t first_even = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < 
first_even) {\n first_even = i;\n }\n }\n }\n return first_even;\n}\n", + "\n\n size_t index = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < index) {\n index = i;\n }\n }\n }\n return index;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t result = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "serial", + "prompt": "/* Return the index of the first even number in the vector x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n" + ] + }, + { + "problem_type": 
"search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "hip", + "prompt": "/* Find the index of the first even number in the vector x. Store it in firstEvenIndex.\n Use AMD HIP to parallelize the search. The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\n__global__ void findFirstEven(const int *x, size_t N, size_t *firstEvenIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int myIndex = hipGetGlobalIndex();\n if (myIndex < N && x[myIndex] % 2 == 0) {\n *firstEvenIndex = myIndex;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N && x[i] % 2 == 0) {\n *firstEvenIndex = i;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int blockId = hipBlockIdx_x;\n int blockDim = hipBlockDim_x;\n int gridDim = hipGridDim_x;\n\n *firstEvenIndex = N;\n\n if (x[tid] % 2 == 0) {\n atomicMin(firstEvenIndex, tid);\n }\n\n __syncthreads();\n\n if (tid == 0) {\n if (*firstEvenIndex == N) {\n *firstEvenIndex = -1;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = block.thread.x;\n if (tid == 0) {\n *firstEvenIndex = N;\n }\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = min(*firstEvenIndex, tid);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the index of the first even number in the 
vector x.\n Use MPI to parallelize the search. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t n = x.size();\n size_t result = n;\n for (size_t i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, &result, 1, MPI::INT, MPI::MINLOC);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = -1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result = -1;\n MPI_Allgather(&result, 1, MPI_INT, &global_result, 1, MPI_INT, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n for (size_t i = 0; i < x.size(); ++i) {\n if (global_result != -1 && global_result <= i) {\n return global_result;\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return n;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t n = x.size();\n size_t firstEven = n;\n for (size_t i = rank; i < n; i += size) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &firstEven, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t result = -1;\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n size_t firstEven = x.size();\n for (size_t i = myRank; i < x.size(); i += numRanks) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = -1;\n for (size_t i = my_rank; i < x.size(); i += my_size) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t firstEven = -1;\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_result;\n } else {\n return x.size();\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_first_even = -1;\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n local_first_even = i;\n break;\n }\n }\n int global_first_even = -1;\n MPI_Allreduce(&local_first_even, &global_first_even, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_first_even;\n}\n", + "\n\n size_t result = x.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n size_t n = x.size();\n size_t firstEven = n;\n for (size_t i = myRank; i < n; i += MPI::procs()) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_MINLOC, MPI_INT);\n return firstEven;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank searches for the first even number in its portion of the vector.\n size_t my_first_even = x.size();\n for (size_t i = my_rank; i < x.size(); i += num_ranks) {\n if (x[i] % 2 == 0) {\n my_first_even = i;\n break;\n }\n }\n\n // Gather the results from all ranks.\n std::vector first_evens(num_ranks);\n MPI_Gather(&my_first_even, 1, MPI_UNSIGNED_LONG, first_evens.data(), 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Find the smallest index among the gathered results.\n size_t first_even = x.size();\n if (my_rank == 0) {\n for (size_t i = 0; i < num_ranks; ++i) {\n if (first_evens[i] < first_even) {\n first_even = first_evens[i];\n }\n }\n }\n\n return first_even;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t n = x.size();\n size_t local_first_even = n;\n for (size_t i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n local_first_even = i;\n break;\n }\n }\n size_t first_even;\n MPI_Allreduce(&local_first_even, &first_even, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return first_even;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_result;\n } else {\n return x.size();\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n 
MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = x.size();\n for (size_t i = my_rank; i < x.size(); i += my_size) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t local_min = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n local_min = i;\n break;\n }\n }\n size_t global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_UNSIGNED_LONG, MPI_MIN,\n MPI_COMM_WORLD);\n return global_min;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "cuda", + "prompt": "/* Find the index of the first even number in the vector x. Store it in firstEvenIndex.\n Use CUDA to parallelize the search. The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\n__global__ void findFirstEven(const int *x, size_t N, size_t *firstEvenIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "(", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n" + ] + }, + { + 
"problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the index of the first even number in the vector x.\n Use MPI and OpenMP to parallelize the search. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n #pragma openmp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma openmp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002557798, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012989972, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012993858, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013001352, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012997584, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001298011, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012991816, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012977748, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013078227, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00131154335, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int result = -1;\n #pragma omp parallel for reduction(min:result)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (i < result) {\n result = i;\n }\n }\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002558122, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010661542, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000682348, + "num_procs": 1, + "num_threads": 2 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.0003569956, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000201326, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001859148, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000254656, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002973516, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002639647, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00024351645, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0002560864, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010666632, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_INT, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.000256471, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009897308, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0002557518, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010657216, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_result = -1;\n int global_result = -1;\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (my_result == -1) {\n my_result = i;\n }\n }\n }\n 
}\n\n MPI_Allreduce(&my_result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0002557502, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010607538, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int my_rank, n_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);\n\n size_t first_even = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n }\n\n if (my_rank == 0) {\n MPI_Reduce(&first_even, &first_even, 1, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(&first_even, nullptr, 1, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n\n return first_even;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int my_result = -1;\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n my_result = i;\n break;\n }\n }\n if (my_result != -1) {\n #pragma omp critical\n {\n if (result == -1 || my_result < result) {\n result = my_result;\n }\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t result = -1;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (result == -1) {\n result = i;\n }\n }\n break;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "_", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n #pragma omp cancel for\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + 
"did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n size_t first_even = x.size();\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n first_even = i;\n break;\n }\n }\n }\n\n MPI_Allreduce(&first_even, &first_even, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return first_even;\n } else {\n return x.size();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t firstEven = -1;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (firstEven == -1) {\n firstEven = i;\n }\n }\n }\n\n if (rank == 0) {\n MPI_Bcast(&firstEven, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n\n return firstEven;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0002575994, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010741516, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t result = x.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n size_t 
result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002558422, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010704704, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006754034, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003388014, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001895876, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001804486, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002657946, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003092742, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002612378, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000241707, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t result = x.size();\n #pragma omp parallel\n {\n size_t local_result = x.size();\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n local_result = i;\n break;\n }\n }\n #pragma omp critical\n result = std::min(result, local_result);\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = x.size();\n #pragma omp parallel for reduction(min : result)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n\n if (my_rank == 0) {\n return result;\n } else {\n return x.size();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use Kokkos to search in parallel. 
Assume Kokkos has already been initialized.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(Kokkos::View const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto contains_lambda = KOKKOS_LAMBDA(const int& v) { return v == target; };\n return any(policy_t(x.extent(0)), contains_lambda, x);\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n auto contains = Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) {\n contains |= (x(i) == target);\n },\n false);\n return contains;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto contains_target = Kokkos::scan(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, const bool& contains_target) {\n return contains_target || (x(i) == target);\n },\n false, tag_type());\n\n return contains_target;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, false);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n Kokkos::Scan(policy_type(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& result) {\n if (x(i) == target) { result = 1; }\n }, tag_type());\n\n return Kokkos::Max(Kokkos::View(1)) != 0;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n auto contains_lambda = KOKKOS_LAMBDA(const int& value) { return value == target; };\n return Kokkos::any(policy_type(0, x.extent(0)), contains_lambda, x);\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto result = Kokkos::parallel_reduce(\n policy_type(0, x.size()), KOKKOS_LAMBDA(const int& 
i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n },\n false, tag_type());\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto contains = Kokkos::scan(policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, const bool& contains_so_far) {\n return contains_so_far || (x(i) == target);\n }, false, tag_type());\n\n return contains;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(\n x.extent(0), KOKKOS_LAMBDA(const int& i, bool& found) {\n if (x(i) == target) { found = true; }\n },\n found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use OpenMP to search in parallel.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for shared(found)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n #pragma omp critical\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel 
for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "serial", + "prompt": "/* Return true if the vector x contains the value `target`. Return false otherwise.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "<", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "&", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n 
for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "hip", + "prompt": "/* Set `found` to true if the vector x contains the value `target`. Set it to false otherwise.\n Use AMD HIP to search in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\n__global__ void contains(const int *x, size_t N, int target, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + ", 8, 2, 6, 4, 6", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int offset = bid * hipBlockDim_x + tid;\n if (offset < N && x[offset] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": 
"36_search_check_if_array_contains_value", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n bool found = false;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (found) {\n return true;\n }\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n found = true;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (found) {\n return true;\n }\n }\n }\n return false;\n}\n", + "\n\n int result = 0;\n for (int i = 0; i < x.size(); i++) {\n result |= (x[i] == target);\n }\n int global_result;\n MPI_Allreduce(&result, 1, MPI_INT, &global_result, 1, MPI_INT, MPI_BOR, MPI_COMM_WORLD);\n if (global_result) {\n return true;\n }\n return false;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int has_target = (std::find(x.begin(), x.end(), target) != x.end());\n int global_has_target;\n MPI_Allreduce(&has_target, &global_has_target, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n return global_has_target;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, 
MPI_LOR, MPI_COMM_WORLD);\n return global_found;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_contains = 0;\n for (int i = rank; i < n; i += size) {\n if (x[i] == target) {\n local_contains = 1;\n break;\n }\n }\n\n int global_contains;\n MPI_Allreduce(&local_contains, &global_contains, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_contains;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n bool found = false;\n for (int i = rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n MPI::COMM_WORLD.Allreduce(MPI::IN_PLACE, &found, 1, MPI::BOOL, MPI::LOR);\n return found;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n bool local_contains = false;\n for (int i = start; i < end; i++) {\n if (x[i] == target) {\n local_contains = true;\n break;\n }\n }\n\n bool global_contains;\n MPI_Allreduce(&local_contains, &global_contains, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_contains;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n bool found = false;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool found = std::find(x.begin(), x.end(), target) != x.end();\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "cuda", + "prompt": "/* Set `found` to true if the vector x contains the value `target`. Set it to false otherwise.\n Use CUDA to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\n__global__ void contains(const int *x, size_t N, int target, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n found[0] = true;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n int local_result = 0;\n #pragma omp parallel for reduction(+:local_result)\n for (int i = rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n local_result = 1;\n }\n }\n int global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_result > 0;\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00024401, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003399118, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000208517, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001603546, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001463202, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000181368, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023559, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00028873, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002990899, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00029164065, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002923746, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009830662, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000612597, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003098814, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016532, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001774164, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002440458, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003041668, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000300249, + 
"num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00029407615, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002921544, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009820504, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005777162, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003122054, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001675204, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001018587, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002447532, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003125106, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002938609, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002984124, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002918714, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009829984, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006081008, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003108834, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016298, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001807874, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002467208, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002861004, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003082226, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007824926, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool result = false;\n #pragma omp parallel shared(result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n result = true;\n break;\n }\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n MPI_Reduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return result;\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002919994, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001382105, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008685572, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004363064, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002233852, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001862738, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002739498, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002908618, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002994363, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.0003083491, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (rank == 0) {\n return result;\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002442062, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006835602, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004187032, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002156486, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001540274, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001804344, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000237714, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002963802, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002974106, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00030347675, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002676178, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009828238, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006071512, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000309, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001666708, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001761906, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002416714, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002882428, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003012087, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00030469875, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n int found = 0;\n #pragma omp parallel for reduction(+:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = 1;\n break;\n }\n }\n int result;\n MPI_Allreduce(&found, &result, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int found = 0;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = 1;\n }\n }\n int result;\n MPI_Allreduce(&found, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return result;\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002460256, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003471674, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000212196, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001580926, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001448218, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000174926, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002416872, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002915092, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002824445, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00028529425, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n bool found = false;\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n }\n }\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002446852, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00149047, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010175432, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001432586, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002658221, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0046507554, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0062914392, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077682682, + "num_procs": 1, + "num_threads": 64 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.0079497817, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0080973635, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000292096, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009819282, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005843684, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003115666, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001621846, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001809776, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002436452, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003005318, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002885648, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00032126475, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_result = 0;\n #pragma omp parallel for reduction(||:local_result)\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n local_result = 1;\n }\n }\n int result;\n MPI_Reduce(&local_result, &result, 1, MPI_INT, MPI_LOR, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002430562, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000339665, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002114286, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001564154, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001447338, + 
"num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000169666, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002476126, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002882946, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002539399, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002334761, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002447874, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006770144, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004225326, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002158066, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001533118, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000199462, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002328286, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002790452, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002992891, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002889707, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_size = n / MPI_COMM_SIZE;\n int start = my_rank * local_size;\n int end = start + local_size;\n bool found = false;\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use Kokkos to search in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(Kokkos::View const& x, Kokkos::View const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View found_x(\"found_x\", x.extent(0));\n Kokkos::View found_y(\"found_y\", y.extent(0));\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n found_x(i) = (x(i) == val);\n });\n\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(const int& i) {\n found_y(i) = (y(i) == val);\n });\n\n Kokkos::View found_both(\"found_both\", 1);\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& found_both) {\n found_both |= (found_x(i) && found_y(i));\n });\n\n return !found_both(0);\n}\n", + "\n\n return Kok \u86cb\u7cd5::parallel_reduce(Kok\u86cb\u7cd5::make_range(x.size()),\n Kok\u86cb\u7cd5::plus(),\n [x, y, val](int i) { return (x[i] == val) ^ (y[i] == val); },\n 0);\n}\n", + "\n\n // Create a Kokkos::View of the union of the two input vectors\n Kokkos::View union_xy(\"union_xy\", x.extent(0) + y.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(int i) {\n union_xy(i) = x(i);\n });\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(int i) {\n union_xy(x.extent(0) + i) = y(i);\n });\n\n // Create a Kokkos::View of the intersection of the two input vectors\n Kokkos::View intersection_xy(\"intersection_xy\", x.extent(0) + y.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(int i) {\n for (int j = 0; j < y.extent(0); j++) {\n if (x(i) == y(j)) {\n intersection_xy(i) = x(i);\n break;\n }\n }\n });\n\n // Check if the value is in the union but not in the intersection\n bool found_in_union = false;\n bool found_in_intersection = false;\n Kokkos::parallel_for(union_xy.extent(0), KOKKOS_LAMBDA(int i) {\n if (union_xy(i) == val) {\n found_in_union = true;\n }\n if (intersection_xy(i) == val) {\n found_in_intersection = true;\n }\n });\n\n return found_in_union && !found_in_intersection;\n}\n", + "\n\n using namespace Kokkos;\n const int N = x.extent(0);\n const int M = y.extent(0);\n\n auto lambda = KOKKOS_LAMBDA(const int i) {\n bool found_x = false;\n bool found_y = false;\n for (int j = 0; j < N; ++j) {\n if (x(j) == val) { found_x = true; }\n }\n for (int j = 0; j < M; ++j) {\n if (y(j) == val) { found_y = true; }\n }\n return found_x != found_y;\n };\n\n bool result = parallel_reduce(N, lambda, (bool)false);\n return result;\n}\n", + "\n\n using namespace Kokkos;\n\n auto lambda = KOKKOS_LAMBDA(int i) {\n bool found_x = false;\n bool found_y = false;\n for (int j = 0; j < x.extent(0); ++j) {\n if (x(j) == val) {\n found_x = true;\n }\n }\n for (int j = 0; j < y.extent(0); ++j) {\n if (y(j) == val) {\n found_y = true;\n }\n }\n return found_x != found_y;\n };\n\n return parallel_reduce(x.extent(0), lambda, (bool)false);\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = find(x, val) != x.end();\n auto y_contains = find(y, val) != 
y.end();\n return x_contains ^ y_contains;\n}\n", + "\n\n // Create a Kokkos::View of the result\n Kokkos::View result(\"result\", x.extent(0));\n\n // Use Kokkos to search for `val` in `x` and `y` in parallel\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n result(i) = (x(i) == val) ^ (y(i) == val);\n });\n\n // Reduce the result to a single boolean value\n bool xorContains = Kokkos::reduce(result, Kokkos::And(), true);\n\n return xorContains;\n}\n", + "\n\n Kokkos::View result(\"result\", x.size());\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int i) {\n result[i] = (x[i] == val) ^ (y[i] == val);\n });\n return Kokkos::any(result);\n}\n", + "\n bool found_x = false;\n bool found_y = false;\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (x(i) == val) {\n found_x = true;\n }\n });\n\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (y(i) == val) {\n found_y = true;\n }\n });\n\n return found_x != found_y;\n}\n", + "\n\n Kokkos::View result(\"result\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(int i) {\n result[i] = (x[i] == val) ^ (y[i] == val);\n });\n return Kokkos::any(result);\n}\n", + "\n\n Kokkos::View found_x(\"found_x\", x.extent(0));\n Kokkos::View found_y(\"found_y\", y.extent(0));\n\n Kokkos::parallel_for(\"xorContains_x\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n found_x(i) = (x(i) == val);\n });\n\n Kokkos::parallel_for(\"xorContains_y\", y.extent(0), KOKKOS_LAMBDA(const int i) {\n found_y(i) = (y(i) == val);\n });\n\n bool found_in_x = false;\n bool found_in_y = false;\n\n Kokkos::parallel_reduce(\"xorContains_reduce\", x.extent(0), KOKKOS_LAMBDA(const int i, bool& update) {\n update = update || found_x(i);\n }, found_in_x);\n\n Kokkos::parallel_reduce(\"xorContains_reduce\", y.extent(0), KOKKOS_LAMBDA(const int i, bool& update) {\n update = update || found_y(i);\n }, found_in_y);\n\n return (found_in_x != found_in_y);\n}\n", + "\n\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = Kokkos::parallel_reduce(\n policy_t(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) { contains |= (x(i) == val); },\n false);\n auto y_contains = Kokkos::parallel_reduce(\n policy_t(0, y.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) { contains |= (y(i) == val); },\n false);\n return x_contains ^ y_contains;\n}\n", + "\n\n using namespace Kokkos;\n\n auto lambda = KOKKOS_LAMBDA(const int& x_i, const int& y_i) {\n return (x_i == val) ^ (y_i == val);\n };\n\n auto result = reduce(x.extent(0), lambda, (bool)false);\n\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n auto x_contains = find(x, val, policy_type(0, x.extent(0)));\n auto y_contains = find(y, val, policy_type(0, y.extent(0)));\n return (x_contains != x.extent(0)) ^ (y_contains != y.extent(0));\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& contains) {\n contains |= (x(i) == val);\n }, (int)0);\n\n auto y_contains = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, y.extent(0)), KOKKOS_LAMBDA(int i, int& contains) {\n contains |= (y(i) == val);\n }, (int)0);\n\n return (x_contains && !y_contains) || (!x_contains && y_contains);\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int sum) {\n return sum + (x(i) == val);\n }, Kokkos::Sum(0));\n\n auto y_contains = Kokkos::parallel_reduce(y.extent(0), KOKKOS_LAMBDA(int i, 
int sum) {\n return sum + (y(i) == val);\n }, Kokkos::Sum(0));\n\n return x_contains != y_contains;\n}\n", + "\n\n // Create a Kokkos::View to store the results of the search\n Kokkos::View result(\"result\", x.extent(0));\n\n // Perform the search in parallel\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i) {\n result(i) = (x(i) == val) ^ (y(i) == val);\n });\n\n // Reduce the results to a single boolean value\n bool found = false;\n Kokkos::parallel_reduce(\"xorContains_reduce\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i, bool& found) {\n found |= result(i);\n }, found);\n\n return found;\n}\n", + "\n\n // Create a Kokkos view for the result\n Kokkos::View result(\"result\", x.extent(0));\n\n // Parallel for loop to check if val is in only one of x or y\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n bool found_in_x = false;\n bool found_in_y = false;\n for (int j = 0; j < x.extent(0); ++j) {\n if (x(j) == val) {\n found_in_x = true;\n }\n if (y(j) == val) {\n found_in_y = true;\n }\n }\n result(i) = (found_in_x != found_in_y);\n });\n\n // Reduce the result to a single boolean value\n bool final_result = Kokkos::reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(bool a, bool b) { return a || b; }, result);\n\n return final_result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = find(x, val, policy_t(0, x.extent(0)));\n auto y_contains = find(y, val, policy_t(0, y.extent(0)));\n return (x_contains != x.extent(0)) ^ (y_contains != y.extent(0));\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& contains) {\n if (x(i) == val) { contains = 1; }\n }, Kokkos::Scan::plus());\n\n auto y_contains = Kokkos::parallel_reduce(y.extent(0), KOKKOS_LAMBDA(int i, int& contains) {\n if (y(i) == val) { contains = 1; }\n }, Kokkos::Scan::plus());\n\n return (x_contains == 1) ^ (y_contains == 1);\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use OpenMP to search in parallel.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n #pragma omp critical\n {\n found_x = true;\n }\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n #pragma omp critical\n {\n found_y = true;\n }\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) 
{\n #pragma omp critical\n found_x = true;\n }\n }\n\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n #pragma omp critical\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n #pragma omp critical\n {\n found_x = true;\n }\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n #pragma omp critical\n {\n found_y = true;\n }\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n return x_count != y_count;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return (x_count + y_count) == 1;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool in_x = false;\n bool in_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n in_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < 
y.size(); i++) {\n if (y[i] == val) {\n in_y = true;\n }\n }\n }\n\n return in_x ^ in_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return x_count != y_count;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n } else {\n found = false;\n #pragma omp cancel for\n }\n }\n }\n }\n #pragma omp parallel for\n for (size_t i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n } else {\n found = false;\n #pragma omp cancel for\n }\n }\n }\n }\n return found;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (size_t i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return x_count != y_count;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n return x_count != y_count;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "serial", + "prompt": "/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) !=\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), 
val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) !=\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n bool in_x = false;\n bool in_y = false;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n in_x = true;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n in_y = true;\n }\n }\n return in_x != in_y;\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "hip", + "prompt": "/* Set `found` to true if `val` is only in one of vectors x or y.\n Set it to false if it is in both or neither.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\n__global__ void xorContains(const int *x, const int *y, size_t N, int val, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = tid + block * blockDim;\n int end = min(start + blockDim, N);\n bool found_x = false;\n bool found_y = false;\n for (int i = start; i < end; i++) {\n found_x |= (x[i] == val);\n found_y |= (y[i] == val);\n }\n found[tid] = found_x ^ found_y;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n *found = found_x ^ found_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipGridSize_x) {\n if (x[i] == val) found_x = true;\n if (y[i] == val) found_y = true;\n }\n found[tid] = found_x ^ found_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool x_contains = x[tid] == val;\n bool y_contains = y[tid] == val;\n found[tid] = x_contains ^ y_contains;\n }\n}\n", + ".", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool x_has = (x[tid] == val);\n bool y_has = (y[tid] == val);\n found[tid] = (x_has ^ y_has);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int block_size = hipBlockDim_x;\n int block_id = hipBlockIdx_x;\n int i = block_id * block_size + tid;\n\n if (i < N) {\n bool x_contains = x[i] == val;\n bool y_contains = y[i] == val;\n found[i] = x_contains ^ y_contains;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n *found = inX ^ inY;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n bool found_x = (x[i] == val);\n bool found_y = (y[i] == val);\n found[i] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool found_x = (x[tid] == val);\n bool found_y = (y[tid] == val);\n found[tid] = (found_x ^ found_y);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int idx = tid + N;\n if (idx < 2 * N) {\n *found = (x[tid] == val) || (y[idx - N] == val);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n found[tid] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n *found = (x[gid] == val) ^ (y[gid] == val);\n }\n}\n", 
+ "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = (x_has_val != y_has_val);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool found_x = (x[tid] == val);\n bool found_y = (y[tid] == val);\n found[tid] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = (in_x != in_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n *found = (x[gid] == val) ^ (y[gid] == val);\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Return the result on rank 0.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int myRank, nRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &nRanks);\n\n bool xContains = std::find(x.begin(), x.end(), val) != x.end();\n bool yContains = std::find(y.begin(), y.end(), val) != y.end();\n\n int xContainsInt = xContains ? 1 : 0;\n int yContainsInt = yContains ? 
1 : 0;\n\n int xorContainsInt = xContainsInt ^ yContainsInt;\n\n int globalXorContainsInt;\n MPI_Reduce(&xorContainsInt, &globalXorContainsInt, 1, MPI_INT, MPI_LOR, 0, MPI_COMM_WORLD);\n\n bool globalXorContains = globalXorContainsInt != 0;\n\n if (myRank == 0) {\n return globalXorContains;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int sum;\n MPI_Allreduce(&x_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return sum == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int total_count;\n MPI_Allreduce(&x_count, &total_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &total_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return total_count == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_contains = std::find(x.begin(), x.end(), val) != x.end();\n int y_contains = std::find(y.begin(), y.end(), val) != y.end();\n\n int result = x_contains ^ y_contains;\n 
MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int sum;\n MPI_Allreduce(&x_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return sum == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n\n int global_x_count;\n int global_y_count;\n\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) x_count++;\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) y_count++;\n }\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (global_x_count == 0 && global_y_count > 0) return true;\n if (global_x_count > 0 && global_y_count == 0) return true;\n return false;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) x_count++;\n if (y[i] == val) y_count++;\n }\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n\n int x_count_sum;\n int y_count_sum;\n\n MPI_Allreduce(&x_count, &x_count_sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &y_count_sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (x_count_sum + y_count_sum) % 2 == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int global_x_count;\n int global_y_count;\n\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (global_x_count == 1 || global_y_count == 1);\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, 
&global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "cuda", + "prompt": "/* Set `found` to true if `val` is only in one of vectors x or y.\n Set it to false if it is in both or neither.\n Use CUDA to search in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\n__global__ void xorContains(const int *x, const int *y, size_t N, int val, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool x_has = false;\n bool y_has = false;\n for (int i = 0; i < N; i++) {\n if (x[i] == val) x_has = true;\n if (y[i] == val) y_has = true;\n }\n found[tid] = x_has != y_has;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *found = (*found) ^ (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n found[tid] = (inX ^ inY);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n *found = *found || (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *found = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = x_has_val ^ y_has_val;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + 
"=cernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecerne,cernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecerne)cerne)cerne)cerne)", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = x_has_val ^ y_has_val;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = (in_x ^ in_y);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = (x_has_val != y_has_val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = 0; i < N; i++) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n found[tid] = (found_x != found_y);\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Return the result on rank 0.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int xor_count = x_count + y_count;\n\n int result;\n MPI_Allreduce(&xor_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 9.7184e-06, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003718498, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003276044, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000317165, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002992836, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003444424, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004491104, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005522058, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005430357, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005435019, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (global_x_count == 1 || global_y_count == 1) {\n return true;\n } else {\n return false;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 9.716e-06, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003703524, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003234788, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003185236, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003047216, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000347566, + "num_procs": 1, + "num_threads": 16 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.0004555604, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005428288, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005642459, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00130672505, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count;\n int global_y_count;\n MPI_Reduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_x_count != global_y_count;\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int result;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.0303e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003731722, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003310532, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003144418, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003021672, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003448668, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004452492, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00055104, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005414191, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005961638, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < n; i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for 
reduction(+:y_count)\n for (int j = 0; j < m; j++) {\n if (y[j] == val) {\n y_count++;\n }\n }\n }\n\n int result = x_count + y_count;\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return result == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.0105e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003720442, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003278292, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003168966, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003019084, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000344643, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004380556, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005448904, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005564009, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00055020785, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel for reduction(+:x_count)\n for (int i = 0; i < n; i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = 0; i < m; i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result = x_count + y_count;\n bool ans = (result == 1);\n\n MPI_Reduce(&ans, &ans, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n return ans;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int local_x_count = 0;\n int local_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n local_x_count++;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n local_y_count++;\n }\n }\n #pragma omp critical\n {\n x_count += local_x_count;\n y_count += local_y_count;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(MPI_IN_PLACE, &y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return (x_count == 1 || y_count == 1);\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.05916e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003670768, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003247772, + "num_procs": 1, + "num_threads": 2 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.0003217956, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003035914, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003491412, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004694508, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006111142, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006428222, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00165645045, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n#pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int nt = omp_get_num_threads();\n for (int i = tid; i < n; i += nt) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = tid; i < m; i += nt) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int x_count_global;\n int y_count_global;\n MPI_Allreduce(&x_count, &x_count_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &y_count_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (x_count_global == 1 || y_count_global == 1) {\n return true;\n } else {\n return false;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.0329e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026022622, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015434416, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011685422, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018569316, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022429418, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0024296944, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015481204, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015603822, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017320926, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int n_threads = omp_get_num_threads();\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel for\n for (int i = rank; i < n; i += n_threads) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found_x, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n MPI_Allreduce(&found_y, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = 
x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank searches for `val` in its portion of x and y.\n int my_start = my_rank * n / nprocs;\n int my_end = (my_rank + 1) * n / nprocs;\n bool found_in_x = false;\n bool found_in_y = false;\n #pragma omp parallel for\n for (int i = my_start; i < my_end; i++) {\n if (x[i] == val) {\n found_in_x = true;\n }\n if (y[i] == val) {\n found_in_y = true;\n }\n }\n\n // Reduce the results from all ranks to rank 0.\n bool result;\n MPI_Reduce(&found_in_x, &result, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n MPI_Reduce(&found_in_y, &result, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n\n // Rank 0 returns the final result.\n if (my_rank == 0) {\n return result != (found_in_x && found_in_y);\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 9.7302e-06, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019510944, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011296334, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006122124, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003267712, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003393986, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000440498, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000569751, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n ++x_count;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n ++y_count;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.02674e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003635602, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003248014, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003143434, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003043788, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000331474, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000451263, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005462108, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.000545121, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054178425, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.02772e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003913082, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000328765, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003206016, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003001014, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003385414, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004639844, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005424738, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018431711, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006081877, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int count_x = 0;\n int count_y = 0;\n\n #pragma omp parallel for reduction(+:count_x)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n count_x++;\n }\n }\n\n #pragma omp parallel for reduction(+:count_y)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n count_y++;\n }\n }\n\n int result = count_x + count_y;\n bool ans = (result == 1);\n\n MPI_Allreduce(&ans, &ans, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n return ans;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 9.6966e-06, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003713866, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003277452, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003167322, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003034252, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003366936, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000455683, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000555897, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.0005591602, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00056212025, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int x_count = 0, y_count = 0;\n #pragma omp parallel\n {\n int my_x_count = 0, my_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) my_x_count++;\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) my_y_count++;\n }\n #pragma omp critical\n {\n x_count += my_x_count;\n y_count += my_y_count;\n }\n }\n int result;\n MPI_Reduce(&x_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (result == 1) return true;\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.0137e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003721982, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000320055, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000317858, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00030482, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000352387, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004697934, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006084696, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005496714, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00051857555, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int x_count = 0;\n int y_count = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result = x_count + y_count;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result == 1;\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.02472e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006718002, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004222446, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003229696, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003045054, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003340104, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004627478, + "num_procs": 1, + "num_threads": 
32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005527328, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005521252, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054380985, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int x_count = 0;\n int y_count = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel reduction(+:x_count, y_count)\n {\n int tid = omp_get_thread_num();\n int nthreads = omp_get_num_threads();\n for (int i = tid; i < x.size(); i += nthreads) {\n if (x[i] == val) {\n x_count++;\n }\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count, global_y_count;\n MPI_Reduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return (global_x_count == 1 || global_y_count == 1);\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.01054e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003505536, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011337936, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012264526, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018101242, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023126428, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023041088, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001661313, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015038351, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00149715795, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n bool result = false;\n MPI_Allreduce(&found_x, &result, 1, MPI_C_BOOL, MPI_BXOR, MPI_COMM_WORLD);\n MPI_Allreduce(&found_y, &result, 1, MPI_C_BOOL, MPI_BXOR, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return result;\n } else {\n return false;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int x_count = 0;\n int y_count = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&nprocs);\n\n #pragma omp parallel for reduction(+:x_count)\n for (int i = rank; i < x.size(); i += nprocs) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = rank; i < y.size(); i += nprocs) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result;\n MPI_Reduce(&x_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result == 1;\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.01768e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006697826, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004247166, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003263822, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003040906, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003354214, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004686954, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005466378, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004919824, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00045297305, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int x_contains = 0;\n int y_contains = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_contains)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_contains = 1;\n }\n }\n\n #pragma omp for reduction(+:y_contains)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_contains = 1;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_contains, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n MPI_Allreduce(&y_contains, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int local_x_count = 0;\n int local_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n local_x_count++;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n local_y_count++;\n }\n }\n #pragma omp critical\n {\n x_count += local_x_count;\n y_count += local_y_count;\n }\n }\n if (my_rank == 0) {\n return x_count != y_count;\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + 
"num_procs": 1, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Book {\n const char *title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use Kokkos to search in parallel. Assume Kokkos is already initialized.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(Kokkos::View const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using policy_t = Kokkos::RangePolicy;\n using range_t = Kokkos::Range;\n\n size_t result = books.size();\n Kokkos::parallel_reduce(range_t(0, books.size()), KOKKOS_LAMBDA(size_t i, size_t& result) {\n if (books(i)->pages < 100) {\n result = i;\n }\n });\n return result;\n}\n", + "\n\n size_t last_short_book_index = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_short_book_index = books.size() - 1;\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& last_short_book_index) {\n if (books[i]->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_for(books.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t result = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& result) {\n if (books(i)->pages < 100) {\n result = i;\n }\n });\n return result;\n}\n", + "\n\n size_t last_short_book = books.size() - 1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book) {\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n });\n return last_short_book;\n}\n", + "\n\n Kokkos::View lastShortBook(\"lastShortBook\", books.extent(0));\n Kokkos::parallel_for(\"findLastShortBook\", books.extent(0), KOKKOS_LAMBDA(const size_t i) {\n if (books(i).pages < 100) {\n lastShortBook(i) = i;\n }\n });\n size_t result = books.extent(0);\n for (size_t i = books.extent(0) - 1; i >= 0; i--) {\n if (lastShortBook(i) != 0) {\n result = lastShortBook(i);\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t last_short_book = books.size();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& last) {\n if (books[i].pages < 100) {\n last = i;\n }\n }, last_short_book);\n return last_short_book;\n}\n", + "\n\n size_t last_idx = books.size();\n Kokkos::parallel_for(books.size(), KOKKOS_LAMBDA(const size_t& i) {\n if (books[i]->pages < 100) {\n last_idx = i;\n }\n });\n return last_idx;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const int i, const size_t& last) {\n if (books(i).pages < 100) {\n return i;\n }\n return last;\n }, 0);\n}\n", + "\n\n size_t last_index = books.size() - 1;\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books(i).pages >= 100) {\n last_index = i - 1;\n }\n }\n 
return last_index;\n}\n", + "\n\n size_t last_short_book = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last) {\n if (books(i)->pages < 100) {\n last = i;\n }\n }, last_short_book);\n return last_short_book;\n}\n", + "\n\n size_t last_short_book = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& last_short_book) {\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n });\n return last_short_book;\n}\n", + "\n\n size_t last_short_book = books.size();\n\n Kokkos::parallel_reduce(books.extent(0), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book) {\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n }, last_short_book);\n\n return last_short_book;\n}\n", + "\n\n size_t lastShortBook = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& lastShortBook) {\n if (books[i]->pages < 100) {\n lastShortBook = i;\n }\n });\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBook = books.size() - 1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& lastShortBook) {\n if (books(i)->pages < 100) {\n lastShortBook = i;\n }\n });\n return lastShortBook;\n}\n", + "\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_index) {\n if (books(i)->pages < 100) {\n last_index = i;\n }\n });\n return last_index;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n using l_type = long;\n\n auto last_short_book = Kokkos::parallel_scan(\n policy_type(0, books.size()),\n KOKKOS_LAMBDA(const size_t& i, l_type& last) {\n if (books[i]->pages < 100) {\n last = i;\n }\n },\n last_short_book);\n\n return last_short_book;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_scan(policy_type(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n }, tag_type());\n\n return last_short_book_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use OpenMP to search in parallel.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for 
(size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t last_short_book_index = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n last_short_book_index = i;\n }\n }\n return last_short_book_index;\n}\n", + "\n\n size_t last_short = books.size();\n #pragma omp parallel for reduction(max:last_short)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n last_short = i;\n }\n }\n return last_short;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n#pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "_", + "\n\n size_t last_idx = books.size() - 1;\n#pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n last_idx = i - 1;\n break;\n }\n }\n return last_idx;\n}\n", + "\n\n size_t last_short_book_index = books.size() - 1;\n#pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages >= 100) {\n#pragma omp critical\n last_short_book_index = i - 1;\n }\n }\n return last_short_book_index;\n}\n", + "\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n#pragma omp parallel for reduction(max : 
lastShortBookIndex)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "serial", + "prompt": "struct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + 
"\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "hip", + "prompt": "struct Book {\n const char* title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Store the result in lastShortBookIndex.\n Use AMD HIP to search in parallel. The kernel is launched with one thread for every book element.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\n__global__ void findLastShortBook(const Book *books, size_t N, size_t *lastShortBookIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct 
Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use MPI to search in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of books. Return the result on rank 0.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return lastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "_", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = -1;\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return lastShortBook;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n lastShortBook = i - 1;\n break;\n }\n }\n\n size_t globalLastShortBook;\n MPI_Allreduce(&lastShortBook, &globalLastShortBook, 1, MPI_UNSIGNED_LONG, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalLastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n for (size_t i = result + 1; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n result = i - 1;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = 0;\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n int globalResult;\n MPI_Allreduce(&result, &globalResult, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalResult;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, 
MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n if (result == books.size()) {\n result = -1;\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "cuda", + "prompt": "struct Book {\n const char* title;\n int pages;\n};\n\n/* Find the index of the last Book item in the vector books where Book.pages is less than 100.\n Store the result in lastShortBookIndex.\n Use CUDA to search in parallel. The kernel is launched with one thread for every book element.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\n__global__ void findLastShortBook(const Book *books, size_t N, size_t *lastShortBookIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n if (books[idx].pages < 100) {\n *lastShortBookIndex = idx;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = 
tid;\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid >= N) return;\n\tif (books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx >= N) {\n return;\n }\n if (books[idx].pages < 100) {\n *lastShortBookIndex = idx;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use MPI and OpenMP to search in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of books. Return the result on rank 0.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n size_t result = 0;\n #pragma omp parallel for reduction(max:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0030244462, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031141442, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021335688, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021539754, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022869924, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002126117, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021098802, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002344793, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023658739, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00238234415, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t result = 0;\n #pragma omp parallel for reduction(max:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "source_write_success": true, + 
"did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0028017864, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.003110013, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021138136, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021440122, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022882272, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002162313, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020852682, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023586752, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023519567, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00224113045, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0030973286, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.003119302, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t result = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0030441694, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031477778, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n if (rank == 0) {\n return lastShortBook;\n } else {\n return books.size();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0030313596, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031315896, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n 
for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0032262208, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031337532, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002983313, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.003128289, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002121939, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021121088, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00230811, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021936836, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020882458, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002349656, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023401317, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023585918, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.003059137, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031479456, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size() - 1;\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n result = i - 1;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n 
MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_UNSIGNED_LONG, MPI_MAX, MPI_COMM_WORLD);\n return lastShortBook;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.003136825, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031580666, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0030595014, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031487994, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size();\n size_t local_result = books.size();\n\n #pragma omp parallel reduction(min: local_result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n local_result = i;\n }\n }\n\n MPI_Allreduce(&local_result, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0023584624, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031270046, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003739828, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045826494, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0057256576, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006819648, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0101237166, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0182649598, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0183580763, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0190891357, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min: result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.003062928, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031231592, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + 
"num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0030462952, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031212536, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n size_t result = books.size();\n #pragma omp parallel for reduction(min: result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n if (myRank == 0) {\n return result;\n }\n return books.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00299018, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031261688, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0030039396, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.003089704, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size();\n size_t rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n\n if (rank == 0) {\n return result;\n } else {\n return books.size();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0030521702, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031329198, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + 
] + }, + { + "generated_output": "_", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t result = books.size() - 1;\n size_t myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n #pragma openmp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n MPI_Allgather(&result, 1, MPI_UNSIGNED_LONG, &result, 1, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n if (myRank == 0) {\n result = books.size() - 1;\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use Kokkos to search in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n auto min_distance = Kokkos::Max();\n size_t min_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n },\n min_distance, min_index);\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n size_t N = x.extent(0);\n size_t closest_idx = 0;\n double closest_diff = std::abs(x(0) - M_PI);\n Kokkos::parallel_for(\n policy_type(0, N), KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n });\n return closest_idx;\n}\n", + "\n\n const double pi = M_PI;\n Kokkos::View closest_distance = Kokkos::View(1);\n Kokkos::View closest_index = Kokkos::View(1);\n closest_distance[0] = std::numeric_limits::max();\n closest_index[0] = 0;\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double distance = std::abs(x[i] - pi);\n if (distance < closest_distance[0]) {\n closest_distance[0] = distance;\n closest_index[0] = i;\n }\n });\n\n return closest_index[0];\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n size_t closest_index = 0;\n double closest_distance = std::abs(x(0) - M_PI);\n Kokkos::parallel_reduce(policy_t(0, x.size()), KOKKOS_LAMBDA(size_t i, double& closest_distance, size_t& closest_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }, closest_distance, closest_index);\n return closest_index;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n const double pi = M_PI;\n size_t N = x.extent(0);\n size_t closest_idx = 0;\n double min_dist = std::abs(x(0) - pi);\n\n Kokkos::parallel_for(\n policy_t(0, N), KOKKOS_LAMBDA(const 
size_t& i) {\n double dist = std::abs(x(i) - pi);\n if (dist < min_dist) {\n closest_idx = i;\n min_dist = dist;\n }\n });\n\n return closest_idx;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::Functor;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n struct findToPiFunctor {\n Kokkos::View const x;\n double min_diff;\n int min_index;\n\n findToPiFunctor(Kokkos::View const x) : x(x), min_diff(std::numeric_limits::max()), min_index(-1) {}\n\n KOKKOS_INLINE_FUNCTION\n void operator()(const Kokkos::Work& work) const {\n const int i = work.league_rank();\n const double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n };\n\n findToPiFunctor f(x);\n\n Kokkos::parallel_for(RangePolicy<>(0, x.size()), f);\n\n return f.min_index;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View min_diff = Kokkos::View(1);\n Kokkos::View min_index = Kokkos::View(1);\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff(0) || (diff == min_diff(0) && x(i) > x(min_index(0)))) {\n min_diff(0) = diff;\n min_index(0) = i;\n }\n });\n\n return min_index(0);\n}\n", + "\n\n using namespace Kokkos;\n\n auto closest_idx = 0;\n auto min_diff = std::numeric_limits::max();\n auto pi = M_PI;\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& closest_idx) {\n auto diff = std::abs(x(i) - pi);\n if (diff < min_diff) {\n min_diff = diff;\n closest_idx = i;\n }\n }, min_diff, closest_idx);\n\n return closest_idx;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Policy = Kokkos::RangePolicy;\n\n auto min_distance = Kokkos::Max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(Policy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, double& min_distance_local) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < min_distance_local) {\n min_distance_local = distance;\n min_index = i;\n }\n }, min_distance);\n\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n using namespace std;\n\n const double pi = M_PI;\n const size_t n = x.extent(0);\n double min_diff = numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(const size_t& i, double& min_diff_local, size_t& min_idx_local) {\n double diff = abs(x(i) - pi);\n if (diff < min_diff_local) {\n min_diff_local = diff;\n min_idx_local = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n\n size_t N = x.extent(0);\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& min_idx) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n\n size_t N = x.extent(0);\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& min_idx) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x(0) - M_PI);\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < closest_diff) {\n closest_idx = i;\n closest_diff = diff;\n }\n });\n\n return 
closest_idx;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n auto min_distance = std::numeric_limits::max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(policy_t(0, x.size()), KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n }, min_distance, min_index);\n\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_distance = std::numeric_limits::max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n },\n min_distance, min_index);\n\n return min_index;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::value_type;\n\n size_t N = x.size();\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const size_t& i, value_type& min_diff_local, value_type& min_idx_local) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff_local) {\n min_diff_local = diff;\n min_idx_local = i;\n }\n },\n min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n size_t n = x.extent(0);\n size_t closest_index = 0;\n double closest_distance = std::abs(x(0) - M_PI);\n\n Kokkos::parallel_for(\n policy_type(0, n), KOKKOS_LAMBDA(const size_t& i) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n });\n\n return closest_index;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Min;\n\n auto min_dist = Reducer(x.data(), x.size());\n double min_val = min_dist.template reduce();\n\n auto dist_from_pi = Reducer(x.data(), x.size(),\n [min_val](double& a, const double& b) {\n a = std::min(a, std::abs(b - M_PI));\n });\n double closest_dist = dist_from_pi.template reduce();\n\n auto closest_idx = Reducer(x.data(), x.size(),\n [closest_dist, min_val](size_t& a, const double& b) {\n if (std::abs(b - M_PI) == closest_dist &&\n std::abs(b - M_PI) < std::abs(min_val - M_PI)) {\n a = a + 1;\n }\n });\n size_t idx = closest_idx.template reduce();\n\n return idx - 1;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_dist = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(size_t i, size_t& min_idx, double& min_dist) {\n double dist = std::abs(x(i) - M_PI);\n if (dist < min_dist) {\n min_dist = dist;\n min_idx = i;\n }\n }, min_idx, min_dist);\n\n return min_idx;\n}\n", + "\n\n auto closest = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(size_t i, size_t closest) {\n auto diff = std::abs(x(i) - M_PI);\n if (closest == -1 || diff < std::abs(x(closest) - M_PI)) {\n closest = i;\n }\n return closest;\n },\n -1);\n return closest;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use OpenMP to search in parallel.\n Example:\n\n input: [9.18, 3.05, 7.24, 
11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n#pragma omp critical\n {\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n #pragma omp parallel for reduction(min:min_diff) reduction(min:min_idx)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n\n return min_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::numeric_limits::max();\n\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n#pragma omp critical\n {\n if (diff < closest_diff) {\n closest_idx = i;\n closest_diff = diff;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_dist = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double dist = std::abs(x[i] - M_PI);\n if (dist < closest_dist) {\n #pragma omp critical\n {\n if (dist < closest_dist) {\n closest_dist = dist;\n closest_idx = i;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma for reduction(min:closest_diff)\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n // Find the index of the value in the vector x that is closest to the math constant PI\n // Use M_PI for the value of PI\n // Use OpenMP to search in parallel\n\n // Get the number of threads to use\n int num_threads = omp_get_num_threads();\n\n // Create a reduction variable to store the index of the closest value\n int closest_index = 0;\n\n // Create a reduction variable to store the smallest difference between a value in x and PI\n double smallest_difference = std::numeric_limits::max();\n\n // Parallel for loop to search for the closest value\n #pragma omp parallel for reduction(min : 
smallest_difference) reduction(min : closest_index)\n for (int i = 0; i < static_cast(x.size()); i++) {\n // Calculate the difference between the current value and PI\n double difference = std::abs(x[i] - M_PI);\n\n // If the difference is smaller than the smallest difference found so far, update the closest index and smallest difference\n if (difference < smallest_difference) {\n closest_index = i;\n smallest_difference = difference;\n }\n }\n\n // Return the index of the closest value\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n\n#pragma omp parallel for reduction(min : min_diff) reduction(min : min_index)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n#pragma omp parallel for reduction(min : min_diff) reduction(min : min_idx)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n #pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double min_diff = std::abs(x[0] - M_PI);\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n min_diff = diff;\n closest_index = i;\n }\n }\n }\n }\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n }\n }\n\n return min_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) 
{\n #pragma omp critical\n {\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n }\n }\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double min_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n#pragma omp critical\n {\n if (diff < min_diff) {\n closest_index = i;\n min_diff = diff;\n }\n }\n }\n }\n\n return closest_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "serial", + "prompt": "/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double minDiff = std::numeric_limits::max();\n size_t minDiffIndex = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n return minDiffIndex;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + ": 2", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_dist = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double dist = std::abs(x[i] - M_PI);\n if (dist < min_dist) {\n min_dist = dist;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t 
min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double closest = std::abs(x[0] - M_PI);\n size_t closest_index = 0;\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n return closest_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "hip", + "prompt": "/* Find the index of the value in the vector x that is closest to the math constant PI. Store the index in closestToPiIndex.\n Use M_PI for the value of PI.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\n__global__ void findClosestToPi(const double *x, size_t N, size_t *closestToPiIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n double minDiff = abs(x[tid] - M_PI);\n int minDiffIndex = tid;\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n double diff = abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n\n if (tid == 0) {\n *closestToPiIndex = minDiffIndex;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n double diff = fabs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < minDiff) {\n minDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "losest", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double diff = abs(x[gid] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = gid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n 
if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n double diff = abs(x[gid] - M_PI);\n if (gid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = gid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double diff = fabs(x[gid] - M_PI);\n if (gid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = gid;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n double closest = std::numeric_limits::max();\n size_t closest_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n\n // Reduce the closest values to rank 0\n MPI_Reduce(&closest_index, &closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return closest_index;\n } else {\n return 0;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n size_t closest_index = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_index = i;\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(size);\n MPI_Gather(&closest_index, 1, MPI_UNSIGNED_LONG, closest_indices.data(), 1, MPI_UNSIGNED_LONG, 0);\n\n // Find the overall closest index\n size_t overall_closest_index = 0;\n double overall_closest_diff = std::abs(x[closest_indices[0]] - M_PI);\n for (size_t i = 1; i < size; ++i) {\n double diff = std::abs(x[closest_indices[i]] - M_PI);\n if (diff < overall_closest_diff) {\n overall_closest_diff = diff;\n overall_closest_index = closest_indices[i];\n }\n }\n\n // Return the overall closest index\n return overall_closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n\n int global_min_diff_index;\n MPI_Allreduce(&min_diff_index, &global_min_diff_index, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n\n return global_min_diff_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - 
M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n size_t n = x.size();\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n for (size_t i = 1; i < n; ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t n = x.size();\n size_t closest = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < n; i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest = i;\n }\n }\n return closest;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n int result;\n MPI_Reduce(&min_index, &result, 1, MPI_INT, MPI_MINLOC, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Find the index of the value in the vector that is closest to PI on each rank\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n // Reduce the closest indices to find the global closest index\n size_t globalClosestIndex;\n MPI_Reduce(&closestIndex, &globalClosestIndex, 1, MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the global closest index on rank 0\n if (rank == 0) {\n return globalClosestIndex;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n return result;\n}\n", + "\n\n double closest = std::numeric_limits::max();\n size_t closest_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_idx = i;\n }\n }\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::numeric_limits::max();\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t closest = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest = i;\n }\n }\n return closest;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < 
x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n int global_min_idx;\n MPI_Allreduce(&min_idx, &global_min_idx, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return global_min_idx;\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n double closest = std::numeric_limits::max();\n size_t closest_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n\n // Reduce the closest values to rank 0\n MPI_Reduce(&closest_index, &closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return closest_index;\n } else {\n return 0;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "cuda", + "prompt": "/* Find the index of the value in the vector x that is closest to the math constant PI. Store the index in closestToPiIndex.\n Use M_PI for the value of PI.\n Use CUDA to search in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\n__global__ void findClosestToPi(const double *x, size_t N, size_t *closestToPiIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + 
"\n\n int tid = threadIdx.x;\n double minDiff = abs(x[tid] - M_PI);\n int minDiffIndex = tid;\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n double diff = abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n if (tid == 0) {\n *closestToPiIndex = minDiffIndex;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = fabs(x[i] - M_PI);\n if (i == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < minDiff) {\n minDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double diff = abs(x[idx] - M_PI);\n if (idx == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = idx;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestDistance = distance;\n closestIndex = i;\n }\n }\n\n int globalClosestIndex;\n MPI_Allreduce(&closestIndex, &globalClosestIndex, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return globalClosestIndex;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Find the closest value to PI in parallel\n size_t closest_index = 0;\n double min_diff = std::numeric_limits::max();\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest_index = i;\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Allgather(&closest_index, 1, MPI::UNSIGNED_LONG, &closest_indices[0], 1, MPI::UNSIGNED_LONG);\n\n // Find the global closest index\n size_t global_closest_index = 0;\n min_diff = std::numeric_limits::max();\n for (size_t i = 0; i < closest_indices.size(); ++i) {\n double diff = std::abs(x[closest_indices[i]] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n global_closest_index = closest_indices[i];\n }\n }\n\n return global_closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.001300265, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013120054, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Find the closest value to PI on each rank\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_index = i;\n closest_distance = distance;\n }\n }\n\n // Reduce the closest index to rank 0\n size_t global_closest_index;\n MPI_Reduce(&closest_index, &global_closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the global closest index on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n size_t closest_idx = 0;\n double 
closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0013468744, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013108852, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.001347382, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019603136, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double min_diff = std::numeric_limits::max();\n int min_diff_index = -1;\n\n #pragma omp parallel for reduction(min : min_diff) reduction(min : min_diff_index)\n for (int i = 0; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n\n return min_diff_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_distance)\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.001349598, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001956769, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n return closestIndex;\n}\n", + "source_write_success": 
true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0013469866, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001304203, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t result = 0;\n double min_diff = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0013502166, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019634816, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0013700602, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013164336, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the index of the value in the vector x that is closest to the math constant PI.\n size_t index = 0;\n double min_diff = std::numeric_limits::max();\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n index = i;\n }\n }\n\n // Return the result on rank 0.\n if (rank == 0) {\n return index;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0013426828, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019584564, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + 
"are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0013456164, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013079058, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Get the number of ranks\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Find the closest value to PI on this rank\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_index = i;\n closest_distance = distance;\n }\n }\n\n // Reduce the closest indices and distances to find the overall closest value\n size_t global_closest_index;\n double global_closest_distance;\n MPI_Reduce(&closest_index, &global_closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n MPI_Reduce(&closest_distance, &global_closest_distance, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_distance)\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00134323, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019628986, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t closestIndex = 0;\n double minDiff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < minDiff) {\n #pragma omp critical\n if (diff < minDiff) {\n closestIndex = i;\n minDiff = diff;\n }\n }\n }\n\n return closestIndex;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001347508, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018242626, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010487156, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005898726, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003473098, + "num_procs": 1, + "num_threads": 8 + }, + { 
+ "did_run": true, + "is_valid": true, + "runtime": 0.000345278, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004815556, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005865502, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004952523, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004545522, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t closest_idx = 0;\n double min_diff = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.001361412, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019621454, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Get the number of ranks and the rank of the current process\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Initialize the index of the closest value to -1\n size_t closest_index = -1;\n\n // Initialize the minimum distance to a large value\n double min_distance = std::numeric_limits::max();\n\n // Loop over the elements of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n // Calculate the distance between the current element and PI\n double distance = std::abs(x[i] - M_PI);\n\n // Update the closest index and minimum distance if the current distance is smaller\n #pragma omp critical\n {\n if (distance < min_distance) {\n closest_index = i;\n min_distance = distance;\n }\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(num_ranks);\n MPI_Gather(&closest_index, 1, MPI_UNSIGNED_LONG, closest_indices.data(), 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Find the index with the smallest distance\n if (rank == 0) {\n size_t global_closest_index = -1;\n double global_min_distance = std::numeric_limits::max();\n for (size_t i = 0; i < num_ranks; ++i) {\n if (closest_indices[i] != -1 && x[closest_indices[i]] < global_min_distance) {\n global_closest_index = closest_indices[i];\n global_min_distance = x[closest_indices[i]];\n }\n }\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001334011, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0130076434, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0781834988, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1856394356, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3270979792, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.5984381662, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.8577474278, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.0448565984, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.043038219, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.0507943991, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0013493556, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013151212, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n return closestIndex;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0013485842, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019672868, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t closest_idx = 0;\n double min_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n closest_idx = i;\n min_diff = diff;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0012986826, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001794324, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011034822, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005821978, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003362552, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003398174, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004820644, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005538752, + "num_procs": 1, + "num_threads": 64 + }, + { 
+ "did_run": true, + "is_valid": true, + "runtime": 0.0004789509, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004810599, + "num_procs": 4, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(Kokkos::View*> const& x, Kokkos::View &r, Kokkos::View &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n // Compute the Fourier transform of x\n auto y = fft_transform(x);\n\n // Extract the real and imaginary parts of the result\n r = real(y);\n i = imag(y);\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& n) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double angle = 2 * M_PI * k * n / x.extent(0);\n real += x(k).real() * cos(angle) + x(k).imag() * sin(angle);\n imag += x(k).imag() * cos(angle) - x(k).real() * sin(angle);\n }\n r(n) = real;\n i(n) = imag;\n });\n}\n", + "\n\n // Compute the FFT of x\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2.0 * M_PI * j * i / x.extent(0);\n sum += x(j) * std::exp(-std::complex(0.0, angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n real += x(j).real() * cos(angle) - x(j).imag() * sin(angle);\n imag += x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA (const int& n) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double angle = 2 * M_PI * k * n / x.extent(0);\n real += x(k).real() * cos(angle) - x(k).imag() * sin(angle);\n imag += x(k).real() * sin(angle) + x(k).imag() * cos(angle);\n }\n r(n) = real;\n i(n) = imag;\n });\n}\n", + "\n\n // Compute the DFT of x\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& n) {\n double re = 0.0;\n double im = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double 
theta = 2 * M_PI * k * n / x.extent(0);\n re += x(k).real() * cos(theta) + x(k).imag() * sin(theta);\n im += x(k).real() * sin(theta) - x(k).imag() * cos(theta);\n }\n r(n) = re;\n i(n) = im;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double re = 0.0;\n double im = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n re += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n im += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = re;\n i(i) = im;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += -x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n\n const int n = x.extent(0);\n const int n_half = n / 2;\n\n // Create a plan for the FFT\n FFTPlan plan(n);\n\n // Create a view of the input and output arrays\n View x_real = create_view(x, 0);\n View x_imag = create_view(x, 1);\n View r_view = create_view(r);\n View i_view = create_view(i);\n\n // Execute the FFT\n plan.execute(x_real, x_imag, r_view, i_view);\n\n // Normalize the output\n r_view /= n;\n i_view /= n;\n}\n", + "\nKokkos::parallel_for(Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA (const int i) {\n r(i) = 0;\n i(i) = 0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n r(i) += x(j) * cos(angle);\n i(i) -= x(j) * sin(angle);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += -x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n\n // Create a plan for the FFT\n auto plan = FFTPlan(x.extent(0), true);\n\n // Create a workspace for the FFT\n auto 
workspace = plan.get_workspace();\n\n // Execute the FFT\n plan.execute(x, workspace);\n\n // Extract the real and imaginary parts of the results\n r = plan.get_output_real();\n i = plan.get_output_imag();\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n // Create a team policy for parallel execution\n TeamPolicy policy(x.extent(0));\n\n // Define the kernel lambda to compute the FFT\n KernelFunctor kernel;\n kernel.template run(policy, [=](const int& i) {\n std::complex sum = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sum += x(j) * std::exp(-std::complex(0, angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) - x[t].imag() * sin(angle);\n sumi += x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int N = x.size();\n r.resize(N);\n i.resize(N);\n\n #pragma omp parallel for\n for (int k = 0; k < N; ++k) {\n std::complex sum = 0;\n for (int n = 0; n < N; ++n) {\n double angle = 2 * M_PI * n * k / N;\n sum += x[n] * std::polar(1.0, -angle);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const size_t n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (size_t j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum 
= 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const size_t n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute the fourier transform of x.\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (size_t j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::exp(-std::complex(0, 2 * M_PI * k * t / n));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n int num_threads = omp_get_max_threads();\n int chunk_size = n / num_threads;\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int start = tid * chunk_size;\n int end = start + chunk_size;\n if (tid == num_threads - 1) {\n end = n;\n }\n\n for (int j = start; j < end; j++) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < n; k++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[k].real() * cos(angle) + x[k].imag() * sin(angle);\n imag += -x[k].real() * sin(angle) + x[k].imag() * cos(angle);\n }\n r[j] = real;\n i[j] = imag;\n }\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n std::complex omega = std::polar(1.0, -2 * M_PI * t * k / n);\n sum += x[t] * omega;\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, -angle);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumi += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute the DFT of x\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::exp(-2 * M_PI * std::complex(0, 1) * k * t / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex 
sum = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::exp(std::complex(0, -angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x. 
Store real part of results in r and imaginary in i.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumi += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sumr += x[j].real() * cos(angle) + x[j].imag() * sin(angle);\n sumi += -x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = sumr;\n 
i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int k = 0; k < n / 2; k++) {\n even[k] = x[2 * k];\n odd[k] = x[2 * k + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int k = 0; k < n / 2; k++) {\n std::complex t = w * odd[k];\n r[k] = r[k] + t.real();\n i[k] = i[k] + t.imag();\n r[k + n / 2] = r[k] - t.real();\n i[k + n / 2] = i[k] - t.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 0) {\n return;\n }\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2);\n std::vector> odd(n / 2);\n for (int k = 0; k < n / 2; k++) {\n even[k] = x[2 * k];\n odd[k] = x[2 * k + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double pi = std::acos(-1);\n for (int k = 0; k < n / 2; k++) {\n double t = r[k];\n r[k] = t + std::cos(2 * pi * k / n) * r[k + n / 2] - std::sin(2 * pi * k / n) * i[k + n / 2];\n i[k] = t - std::cos(2 * pi * k / n) * r[k + n / 2] + std::sin(2 * pi * k / n) * i[k + n / 2];\n }\n}\n", + "\n\n int n = x.size();\n if (n == 0) {\n return;\n }\n r.resize(n);\n i.resize(n);\n for (int k = 0; k < n; k++) {\n double real = 0;\n double imag = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[j].real() * cos(angle) - x[j].imag() * sin(angle);\n imag += x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = real;\n i[k] = imag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + 
n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "fft, 4,", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double real = 0;\n double imag = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[j].real() * cos(angle) - x[j].imag() * sin(angle);\n imag += x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r.push_back(real);\n i.push_back(imag);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sumr += x[j].real() * cos(angle) + x[j].imag() * sin(angle);\n sumi += -x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n std::vector> X(n);\n for (int k = 0; k < n; ++k) {\n for (int j = 0; j < n; ++j) {\n X[k] += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n }\n for (int i = 0; i < n; ++i) {\n r[i] = X[i].real();\n i[i] = X[i].imag();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\n__global__ void fft(const hipDoubleComplex *x, double *r, double *i, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n hipDoubleComplex sum = {0, 0};\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n if (i < N) {\n hipDoubleComplex sum = hipCreateDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex rot = hipCreateDoubleComplex(cos(angle), sin(angle));\n sum = hipCadd(sum, hipCmul(x[j], rot));\n }\n r[i] = hipCreal(sum);\n i[i] = hipCimag(sum);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; k++) {\n double arg = -2 * M_PI * k * gid / N;\n hipDoubleComplex w = {cos(arg), sin(arg)};\n sum += x[k] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n}\n", + "fft:", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[i] = sum.x;\n i[i] = sum.y;\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n size_t N2 = N / 2;\n if (tid < N2) {\n r[tid] = x[tid].x + x[tid + N2].x;\n i[tid] = x[tid].y + x[tid + N2].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * gid / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[i] = sum.x;\n i[i] = sum.y;\n}\n", + "\n\n int id = blockIdx.x * blockDim.x + threadIdx.x;\n if (id < N) {\n r[id] = x[id].real();\n i[id] = x[id].imag();\n }\n __syncbarriers();\n for (size_t n = 2; n <= N; n *= 2) {\n double a = 2 * M_PI / n;\n for (size_t k = 0; k < N; k += n) {\n double c = cos(a * k);\n double s = sin(a * k);\n for (size_t j = 0; j < n / 2; j++) {\n size_t p = k + j;\n size_t q = p + n / 2;\n double t = r[q] * c - i[q] * s;\n i[q] = r[q] * s + i[q] * c;\n r[q] = r[p] - t;\n i[p] = i[p] - t;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * i / N;\n hipDoubleComplex w = hipMakeDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = 
hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n if (i < N) {\n r[i] = x[i].x;\n i[i] = x[i].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n r[gid] = x[gid].x;\n i[gid] = x[gid].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * gid * j / N});\n sum += x[j] * w;\n }\n r[gid] = hipCreal(sum);\n i[gid] = hipCimag(sum);\n }\n}\n", + "\n\n size_t idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx >= N) return;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * idx * k / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[k] * w;\n }\n r[idx] = sum.x;\n i[idx] = sum.y;\n}\n", + "\n\n size_t k = blockIdx.x * blockDim.x + threadIdx.x;\n if (k < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0, -2 * M_PI * j * k / N));\n sum += x[j] * w;\n }\n r[k] = hipCreal(sum);\n i[k] = hipCimag(sum);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx >= N) return;\n hipDoubleComplex sum = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * idx * k / N;\n hipDoubleComplex w = hipMakeDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[idx] = creal(sum);\n i[idx] = cimag(sum);\n}\n", + "\n\n // Compute the fourier transform of x.\n hipDoubleComplex *X = hipMalloc(N);\n hipMemcpy(X, x, N * sizeof(hipDoubleComplex), hipMemcpyHostToDevice);\n hipfftPlan plan;\n hipfftPlanCreate(&plan, N, HIPFFT_C2C, HIPFFT_FORWARD);\n hipfftExecC2C(plan, X, X, HIPFFT_FORWARD);\n hipfftDestroyPlan(plan);\n hipMemcpy(r, X, N * sizeof(double), hipMemcpyDeviceToHost);\n hipMemcpy(i, X + 1, N * sizeof(double), hipMemcpyDeviceToHost);\n hipFree(X);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n int stride = hipBlockDim_x * hipGridDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * k * gid / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[k] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n}\n", + "fft," + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y);\n\n // Gather the results on rank 0\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT of z on rank 0\n if (rank == 0) {\n fft_local(z, y);\n }\n\n // Scatter the results to all ranks\n MPI_Scatter(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the results\n for (int j = 0; j < n; j++) {\n r[j] = z[j].real();\n i[j] = z[j].imag();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n y[k] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Send the DFT of x to rank 0\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse DFT of z\n if (rank == 0) {\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n x[k] += z[j] * std::exp(2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n x[k] /= n;\n r[k] = x[k].real();\n i[k] = x[k].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the global result\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n 
i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector r_local(n), i_local(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y, r_local, i_local);\n\n // Gather the results on rank 0\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < n; i++) {\n r[i] = r_local[i];\n i[i] = i_local[i];\n }\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n), i_recv(n);\n MPI_Recv(r_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n r[j] += r_recv[j];\n i[j] += i_recv[j];\n }\n }\n } else {\n MPI_Send(r_local.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i_local.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_fft(n);\n fft_local(x, local_fft);\n\n // Gather local FFTs\n std::vector> global_fft(n * size);\n MPI_Gather(local_fft.data(), n, MPI_DOUBLE, global_fft.data(), n, MPI_DOUBLE, 0);\n\n // Compute global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter global FFT\n MPI_Scatter(global_fft.data(), n, MPI_DOUBLE, local_fft.data(), n, MPI_DOUBLE, 0);\n\n // Extract real and imaginary parts\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = local_fft[j].real();\n i[j] = local_fft[j].imag();\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n);\n std::vector> local_r(local_n), local_i(local_n);\n fft_recursive(local_x, local_r, local_i);\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].real();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n std::vector> Z(n);\n\n for (int j = 0; j < n; j++) {\n X[j] = x[j];\n }\n\n MPI_Alltoall(X.data(), 1, MPI_DOUBLE_INT, Y.data(), 1, MPI_DOUBLE_INT, MPI_COMM_WORLD);\n\n for (int j = 0; j < n; 
j++) {\n Z[j] = Y[j] * std::exp(-2.0 * M_PI * std::complex(0.0, 1.0) * j / n);\n }\n\n MPI_Alltoall(Z.data(), 1, MPI_DOUBLE_INT, Y.data(), 1, MPI_DOUBLE_INT, MPI_COMM_WORLD);\n\n for (int j = 0; j < n; j++) {\n std::complex sum = std::complex(0.0, 0.0);\n for (int k = 0; k < n; k++) {\n sum += Y[k] * std::exp(2.0 * M_PI * std::complex(0.0, 1.0) * j * k / n);\n }\n X[j] = sum;\n }\n\n for (int j = 0; j < n; j++) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n if (rank == 0) {\n for (int j = 1; j < size; j++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Status status;\n MPI_Request request;\n MPI_Irecv(r_recv.data(), n, MPI_DOUBLE, j, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n MPI_Irecv(i_recv.data(), n, MPI_DOUBLE, j, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_impl(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> r_all(x.size());\n std::vector> i_all(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_all.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_all.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r = std::vector(r_all.size());\n i = std::vector(i_all.size());\n for (size_t j = 0; j < r_all.size(); ++j) {\n r[j] = r_all[j].real();\n i[j] = r_all[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y);\n\n // Gather the results on rank 0\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < size; i++) {\n std::vector> y_i(n);\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &y_i[0], n, MPI_DOUBLE_COMPLEX, i, MPI_COMM_WORLD);\n for (int j = 0; j < n; j++) {\n r[j] += y_i[j].real();\n i[j] += y_i[j].imag();\n }\n }\n } else {\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); j++) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n if (n != (1 << (int)log2(n))) {\n throw std::invalid_argument(\"x must have a power of 2 length\");\n }\n\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = x[i];\n }\n\n for (int s = 1; s <= (int)log2(n); s++) {\n int m = 1 << s;\n int m2 = m / 2;\n double theta = 2 * M_PI / m;\n std::complex w(cos(theta), sin(theta));\n std::complex wn(1, 0);\n for (int j = 0; j < m2; j++) {\n for (int k = j; k < n; k += m) {\n std::complex t = wn * y[k + m2];\n y[k + m2] = y[k] - t;\n y[k] = y[k] + t;\n }\n wn = wn * w;\n }\n }\n\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n r[i] = y[i].real();\n i[i] = y[i].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_impl(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_r(n_local);\n std::vector> local_i(n_local);\n\n // Compute local FFT\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(n);\n std::vector> global_i(n);\n MPI_Gather(local_r.data(), n_local, MPI_DOUBLE, global_r.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n_local, MPI_DOUBLE, global_i.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n 
// Compute the local part of the transform\n std::vector> local_r(n), local_i(n);\n for (int j = 0; j < n; ++j) {\n std::complex sum_r(0, 0), sum_i(0, 0);\n for (int k = 0; k < n; ++k) {\n double angle = -2 * M_PI * j * k / n;\n sum_r += std::exp(std::complex(0, angle)) * x[k];\n sum_i += std::exp(std::complex(0, angle)) * x[k] * std::complex(0, 1);\n }\n local_r[j] = sum_r;\n local_i[j] = sum_i;\n }\n\n // Gather the local transforms into a global array\n std::vector> global_r(n), global_i(n);\n MPI_Allgather(local_r.data(), n, MPI_DOUBLE_COMPLEX, global_r.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n MPI_Allgather(local_i.data(), n, MPI_DOUBLE_COMPLEX, global_i.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the global transform\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * n / size, x.begin() + (rank + 1) * n / size);\n std::vector> local_r(n / size), local_i(n / size);\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(n), i_gathered(n);\n MPI_Gather(local_r.data(), n / size, MPI_DOUBLE, r_gathered.data(), n / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n / size, MPI_DOUBLE, i_gathered.data(), n / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(n);\n i = std::vector(n);\n for (int j = 0; j < n; j++) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n\n // Compute the local Fourier transform\n std::vector> local_x(n);\n std::vector> local_r(n);\n std::vector> local_i(n);\n for (int j = 0; j < n; ++j) {\n local_x[j] = x[j];\n }\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(n * size);\n std::vector> i_gathered(n * size);\n MPI_Gather(local_r.data(), n, MPI_DOUBLE_COMPLEX, r_gathered.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n, MPI_DOUBLE_COMPLEX, i_gathered.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n for (int j = 0; j < n; ++j) {\n r[j] = r_gathered[j].real();\n i[j] = i_gathered[j].real();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Allgather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), local_r.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n MPI_Allgather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), local_i.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].real();\n }\n }\n}\n" + ] + }, + { + 
"problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\n__global__ void fft(const cuDoubleComplex *x, double *r, double *i, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) {\n return;\n }\n double sum_r = 0.0;\n double sum_i = 0.0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * idx / N;\n sum_r += x[k].x * cos(angle) + x[k].y * sin(angle);\n sum_i += -x[k].x * sin(angle) + x[k].y * cos(angle);\n }\n r[idx] = sum_r;\n i[idx] = sum_i;\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t k = blockIdx.x * blockDim.x + threadIdx.x;\n if (k < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * k * j / N;\n cuDoubleComplex rot = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * rot;\n }\n r[k] = creal(sum);\n i[k] = cimag(sum);\n }\n}\n", + "\n\n // Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n // Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n // Example:\n\n // input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n // output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[gid] = creal(sum);\n i[gid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = block.thread.x + block.block.x * block.grid.x;\n if (tid >= N) {\n return;\n }\n double re = 0, im = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * tid * k / N;\n re += x[k] * cos(angle);\n im += x[k] * sin(angle);\n }\n r[tid] = re;\n i[tid] = im;\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i0 = tid + bid * blockDim.x;\n if (i0 < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i0 * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i0] = creal(sum);\n i[i0] = cimag(sum);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n int stride = blockDim.x * gridDim.x;\n for (int k = tid; k < N; k += stride) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * k / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(x[j], term));\n }\n r[k] = sum.x;\n i[k] = sum.y;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * k * tid / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx >= N) return;\n\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * idx * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[idx] = creal(sum);\n i[idx] = cimag(sum);\n}\n", + "\n\n size_t tid = 
threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = sum.x;\n i[tid] = sum.y;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n size_t offset = tid % N;\n double sumr = 0, sumi = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * offset / N;\n sumr += real(x[k]) * cos(angle) + imag(x[k]) * sin(angle);\n sumi += -real(x[k]) * sin(angle) + imag(x[k]) * cos(angle);\n }\n r[tid] = sumr;\n i[tid] = sumi;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid >= N) return;\n\n double re = 0, im = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n re += x[k].x * cos(angle) - x[k].y * sin(angle);\n im += x[k].x * sin(angle) + x[k].y * cos(angle);\n }\n r[tid] = re;\n i[tid] = im;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i0 = tid + bid * blockDim.x;\n if (i0 < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * i0 / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[i0] = creal(sum);\n i[i0] = cimag(sum);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> x_local(x.begin() + start, x.begin() + end);\n std::vector> y_local(n_local);\n std::vector> y(n);\n\n // Compute the DFT of the local data\n fft_local(x_local, y_local);\n\n // Gather the results from all ranks\n MPI_Allgather(y_local.data(), n_local, MPI_DOUBLE_COMPLEX, y.data(), n_local, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Compute the global DFT\n fft_global(y, n);\n\n // Extract the real and imaginary parts of the result\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = y[j].real();\n i[j] = y[j].imag();\n }\n\n // Send the result to rank 0\n if (rank != 0) {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n // Receive the results from all ranks\n for (int i = 1; i < size; i++) {\n MPI_Recv(r.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n 
MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_fft(n);\n fftw_plan plan = fftw_plan_dft_1d(n, reinterpret_cast(&x[0]), reinterpret_cast(&local_fft[0]), FFTW_FORWARD, FFTW_ESTIMATE);\n fftw_execute(plan);\n fftw_destroy_plan(plan);\n\n // Gather local FFT results\n std::vector> global_fft(n * size);\n MPI_Gather(local_fft.data(), n, MPI_DOUBLE, global_fft.data(), n, MPI_DOUBLE, 0);\n\n // Compute global FFT\n fftw_plan plan2 = fftw_plan_dft_1d(n * size, reinterpret_cast(&global_fft[0]), reinterpret_cast(&global_fft[0]), FFTW_BACKWARD, FFTW_ESTIMATE);\n fftw_execute(plan2);\n fftw_destroy_plan(plan2);\n\n // Extract real and imaginary parts\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = real(global_fft[j] / n);\n i[j] = imag(global_fft[j] / n);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(local_n);\n std::vector> local_r(local_n);\n std::vector> local_i(local_n);\n std::copy(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n, local_x.begin());\n fft_serial(local_x, local_r, local_i);\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int n_local_padded = n_local * 2;\n\n std::vector> x_local(n_local);\n std::vector> x_local_padded(n_local_padded);\n std::vector> y_local(n_local_padded);\n std::vector> y(n);\n\n MPI_Scatter(x.data(), n_local, MPI_DOUBLE_COMPLEX, x_local.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Pad with zeros\n for (int i = 0; i < n_local; i++) {\n x_local_padded[i] = x_local[i];\n x_local_padded[i + n_local] = 0;\n }\n\n // Compute FFT in parallel\n fft_recursive(x_local_padded, y_local);\n\n // Gather results\n MPI_Gather(y_local.data(), n_local_padded, MPI_DOUBLE_COMPLEX, y.data(), n_local_padded, 
MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < n; i++) {\n r[i] = y[i].real();\n i[i] = y[i].imag();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Compute the FFT of x.\n std::vector> X(x);\n fft_mpi(X);\n\n // Store the real and imaginary parts of the FFT in r and i.\n r.resize(X.size());\n i.resize(X.size());\n for (size_t j = 0; j < X.size(); ++j) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n // Gather the results on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector r_all(X.size() * MPI::COMM_WORLD.Get_size());\n std::vector i_all(X.size() * MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Gather(&r[0], X.size(), MPI::DOUBLE, &r_all[0], X.size(), MPI::DOUBLE, 0);\n MPI::COMM_WORLD.Gather(&i[0], X.size(), MPI::DOUBLE, &i_all[0], X.size(), MPI::DOUBLE, 0);\n r = r_all;\n i = i_all;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_all(x.size());\n std::vector> i_all(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_all.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_all.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); j++) {\n r[j] = r_all[j].real();\n i[j] = r_all[j].imag();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n std::complex sum = std::complex(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n X[k] = sum;\n }\n\n // Compute the inverse DFT of X\n for (int k = 0; k < n; k++) {\n std::complex sum = std::complex(0, 0);\n for (int j = 0; j < n; j++) {\n sum += X[j] * std::exp(2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n Y[k] = sum / n;\n }\n\n // Store the real and imaginary parts of Y in r and i\n for (int k = 0; k < n; k++) {\n r[k] = Y[k].real();\n i[k] = Y[k].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Recv(&r_recv[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, 
MPI_STATUS_IGNORE);\n MPI_Recv(&i_recv[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(&r[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(&i[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n y[k] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j] += y[k] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Store the real and imaginary parts of the DFT in r and i\n for (int j = 0; j < n; j++) {\n r[j] = z[j].real();\n i[j] = z[j].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_i(n);\n std::vector i_i(n);\n MPI_Recv(r_i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n r[j] += r_i[j];\n i[j] += i_i[j];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute local FFTs\n std::vector> local_r(n);\n std::vector> local_i(n);\n fft_local(x, local_r, local_i);\n\n // Gather local FFTs\n std::vector> global_r(n * size);\n std::vector> global_i(n * size);\n MPI_Gather(local_r.data(), n, MPI_DOUBLE_COMPLEX, global_r.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n, MPI_DOUBLE_COMPLEX, global_i.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute global FFT\n if (rank == 0) {\n fft_global(global_r, global_i);\n }\n\n // Scatter global FFT\n MPI_Scatter(global_r.data(), n, MPI_DOUBLE_COMPLEX, r.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Scatter(global_i.data(), n, MPI_DOUBLE_COMPLEX, i.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // 
Gather the local results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local partial sums\n std::vector> local_r(n, 0.0);\n std::vector> local_i(n, 0.0);\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n double angle = -2 * M_PI * j * k / n;\n local_r[k] += x[j] * std::cos(angle);\n local_i[k] += x[j] * std::sin(angle);\n }\n }\n\n // Reduce partial sums to rank 0\n std::vector> global_r(n, 0.0);\n std::vector> global_i(n, 0.0);\n MPI_Reduce(local_r.data(), n, MPI_DOUBLE, global_r.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Reduce(local_i.data(), n, MPI_DOUBLE, global_i.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts on rank 0\n if (rank == 0) {\n for (int k = 0; k < n; k++) {\n r[k] = global_r[k].real();\n i[k] = global_i[k].imag();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n\n // Compute the FFT of x\n #pragma opm parallel for\n for (int j = 0; j < n; j++) {\n X[j] = x[j];\n }\n fft(X);\n\n // Compute the real and imaginary parts of the FFT\n #pragma opm parallel for\n for (int j = 0; j < n; j++) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int j = 1; j < size; j++) {\n std::vector r_j(n);\n std::vector i_j(n);\n MPI_Gather(&r[0], n, MPI_DOUBLE, &r_j[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&i[0], n, MPI_DOUBLE, &i_j[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (int k = 0; k < n; k++) {\n r[k] += r_j[k];\n i[k] += i_j[k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n 
MPI_Allgather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n MPI_Allgather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_r(n_local);\n std::vector> local_i(n_local);\n\n // Compute the local FFT\n fft_local(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> r_gathered(n);\n std::vector> i_gathered(n);\n MPI_Gather(local_r.data(), n_local, MPI_DOUBLE, r_gathered.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n_local, MPI_DOUBLE, i_gathered.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r = std::vector(n);\n i = std::vector(n);\n for (int j = 0; j < n; j++) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(local_n);\n std::vector> local_r(local_n);\n std::vector> local_i(local_n);\n\n std::copy(x.begin() + local_start, x.begin() + local_end, local_x.begin());\n\n #pragma omp parallel for\n for (int 
j = 0; j < local_n; j++) {\n double sum_r = 0.0;\n double sum_i = 0.0;\n for (int k = 0; k < n; k++) {\n double theta = 2 * M_PI * k / n;\n sum_r += x[k] * cos(theta * j);\n sum_i += x[k] * sin(theta * j);\n }\n local_r[j] = sum_r;\n local_i[j] = sum_i;\n }\n\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n y[k] = sum;\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j * n + k] = y[k * n + j];\n }\n }\n\n // Compute the DFT of z\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += z[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n y[k] = sum;\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j * n + k] = y[k * n + j];\n }\n }\n\n // Store the real and imaginary parts of the DFT in r and i\n for (int k = 0; k < n; k++) {\n r[k] = z[k].real();\n i[k] = z[k].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Recv(r_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const size_t n = x.extent(0);\n const size_t n_half = n / 2;\n\n // Compute the FFT of the first half of x.\n auto x_half = x.slice(0, n_half);\n fft(x_half);\n\n // Compute the FFT of the second half of x.\n auto x_half_plus = x.slice(n_half, n_half);\n fft(x_half_plus);\n\n // Compute the imaginary conjugate of the second half of x.\n for (size_t i = 0; i < n_half; ++i) {\n x_half_plus(i) = conj(x_half_plus(i));\n }\n\n // Swap the first and second halves of x.\n auto temp = x_half;\n x_half = x_half_plus;\n x_half_plus = temp;\n\n // Compute the inverse FFT of x.\n fft(x, Inverse);\n\n // Normalize the FFT by dividing by n.\n for (size_t i = 0; i < n; ++i) {\n x(i) /= n;\n }\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), 
KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const size_t n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n std::vector> w(n);\n std::vector> u(n);\n std::vector> t(n);\n\n#pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n y[i] = x[i];\n }\n\n for (size_t i = 0; i < n; i++) {\n for (size_t j = 0; j < n; j++) {\n w[j] = std::exp(std::complex(0, -2 * M_PI * i * j / n));\n }\n for (size_t j = 0; j < n; j++) {\n z[j] = y[j] * w[j];\n }\n for (size_t j = 0; j < n; j++) {\n u[j] = z[j];\n }\n for (size_t j = 0; j < n / 2; j++) {\n t[j] = u[j] + std::conj(u[n - j - 1]);\n t[n - j - 1] = std::complex(0, 1) * (u[j] - std::conj(u[n - j - 1]));\n }\n for (size_t j = 0; j < n; j++) {\n y[j] = t[j];\n }\n }\n\n#pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n x[i] = y[i];\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // Compute the fourier transform of x in-place.\n fft(x);\n\n // Compute the imaginary conjugate of each value.\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n int size = 1 << log_n;\n std::vector> y(size);\n for (int i = 0; i < n; i++) {\n y[i] = x[i];\n }\n for (int i = n; i < size; i++) {\n y[i] = 0;\n }\n std::vector> roots(size / 2);\n for (int i = 0; i < size / 2; i++) {\n double angle = 2 * M_PI * i / size;\n roots[i] = std::complex(cos(angle), sin(angle));\n }\n for (int i = 0; i < log_n; i++) {\n int step = 1 << i;\n int half_step = step / 2;\n for (int j = 0; j < size; j += step) {\n for (int k = 0; k < half_step; k++) {\n std::complex u = y[j + k];\n std::complex v = y[j + k + half_step] * roots[size / (2 * step) * k];\n y[j + k] = u + v;\n y[j + k + half_step] = u - v;\n }\n }\n }\n for (int i = 0; i < size; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int numThreads = omp_get_max_threads();\n const int chunkSize = n / numThreads;\n\n #pragma omp parallel\n {\n #pragma omp for schedule(static, chunkSize)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n }\n}\n", + "\n\n const int n = x.size();\n const int lg = __lg(n);\n int i0 = 0;\n for (int l = 2; l <= n; l <<= 1) {\n for (int i = 0; i < n; i += l) {\n for (int j = 0; j < l / 2; j++) {\n std::complex 
u = x[i0 + j];\n std::complex v = x[i0 + j + l / 2];\n x[i0 + j] = u + v;\n x[i0 + j + l / 2] = std::complex(u.real() - v.real(), -v.imag());\n }\n i0 += l;\n }\n i0 = 0;\n }\n for (int i = 0; i < n; i++) {\n int j = __builtin_reverse(i, lg);\n if (i < j) {\n std::swap(x[i], x[j]);\n }\n }\n}\n", + "\n\n int n = x.size();\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n fft(y);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n // Compute the Fourier transform of x in-place.\n fft(x);\n\n // Compute the imaginary conjugate of each value.\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x\n fft(x, y);\n\n // Compute the conjugate of each value in y\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n\n // Copy the conjugate values back into x\n x = z;\n}\n", + "\n\n const int n = x.size();\n if (n == 1) return;\n std::vector> even(n / 2), odd(n / 2);\n#pragma omp parallel for\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int n_threads = omp_get_max_threads();\n const int n_threads_per_group = 4;\n const int n_groups = n_threads / n_threads_per_group;\n\n std::vector> y(n);\n std::vector> z(n);\n\n #pragma omp parallel num_threads(n_threads)\n {\n const int tid = omp_get_thread_num();\n const int group_id = tid / n_threads_per_group;\n const int local_id = tid % n_threads_per_group;\n\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n y[rev] = x[i];\n }\n\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const double omega = 2 * M_PI / m;\n const std::complex w(cos(omega), sin(omega));\n\n #pragma omp for schedule(static)\n for (int k = 0; k < n; k += m) {\n std::complex wn(1, 0);\n for (int j = 0; j < m / 2; j++) {\n const std::complex t1 = y[k + j];\n const std::complex t2 = wn * y[k + j + m / 2];\n y[k + j] = t1 + t2;\n y[k + j + m / 2] = t1 - t2;\n wn *= w;\n }\n }\n }\n\n #pragma omp for schedule(static)\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n }\n\n x = z;\n}\n", + "\n\n const int n = x.size();\n const int n_half = n / 2;\n const double pi = 3.14159265358979323846;\n\n // Reverse-bit permutation\n for (int i = 0; i < n; i++) {\n int j = 0;\n for (int k = 0; k < n_half; k++) {\n if (i & (1 << k)) {\n j |= (1 << (n_half - k - 1));\n }\n }\n if (i < j) {\n std::swap(x[i], x[j]);\n }\n }\n\n // Compute the FFT in-place\n for (int s = 1; s <= n_half; s *= 2) {\n const double theta = pi / s;\n const std::complex wn(std::cos(theta), std::sin(theta));\n for (int i = 0; i < n; i += 2 * s) {\n std::complex w(1, 0);\n for (int j = 0; j < s; j++) {\n const std::complex u = x[i + j];\n const std::complex v = x[i + j + s] * w;\n x[i + j] = u + v;\n x[i + j + s] = u - v;\n w *= wn;\n }\n }\n }\n\n // Conjugate the imaginary part of each value\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n std::vector> y(n);\n 
std::vector> z(n);\n\n // Compute the FFT of x\n fft(x, y);\n\n // Compute the imaginary conjugate of each value in y\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n\n // Copy the values of z back into x\n x = z;\n}\n", + "\n\n const int n = x.size();\n const int num_threads = omp_get_max_threads();\n const int chunk_size = n / num_threads;\n\n #pragma omp parallel for schedule(static, chunk_size)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int nthreads = omp_get_max_threads();\n const int chunksize = n / nthreads;\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n if (i < rev) {\n std::swap(x[i], x[rev]);\n }\n }\n\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const double wn = std::polar(1.0, -2.0 * M_PI / m);\n const double w = 1.0;\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int j = 0; j < n; j += m) {\n double wr = w;\n double wi = 0.0;\n for (int k = 0; k < m / 2; k++) {\n const std::complex t = x[j + k + m / 2] * std::complex(wr, wi);\n x[j + k + m / 2] = x[j + k] - t;\n x[j + k] += t;\n const double newwr = wr * w.real() - wi * w.imag();\n const double newwi = wr * w.imag() + wi * w.real();\n wr = newwr;\n wi = newwi;\n }\n }\n }\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n std::vector> y(n);\n std::vector> w(n);\n for (int i = 0; i < n; i++) {\n w[i] = std::polar(1.0, -2 * M_PI * i / n);\n }\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * w[j * k % n];\n }\n y[k] = sum;\n }\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Take the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int n_threads = omp_get_max_threads();\n\n // Reverse bits\n #pragma omp parallel for schedule(static)\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n if (i < rev) {\n std::swap(x[i], x[rev]);\n }\n }\n\n // Compute the FFT\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const int m2 = m >> 1;\n const double theta = 2 * M_PI / m;\n std::complex w(1, 0);\n std::complex wn(std::cos(theta), std::sin(theta));\n\n #pragma omp parallel for schedule(static)\n for (int k = 0; k < n; k += m) {\n for (int j = 0; j < m2; j++) {\n std::complex t = w * x[k + j + m2];\n x[k + j + m2] = x[k + j] - t;\n x[k + j] += t;\n }\n w *= wn;\n }\n }\n\n // Conjugate the imaginary part\n #pragma omp parallel for schedule(static)\n for (int i = 0; i < n; i++) {\n x[i].imag(-x[i].imag());\n }\n}\n", + "\n\n // Compute the FFT in-place\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int num_threads = omp_get_max_threads();\n const int chunk_size = n / num_threads;\n\n #pragma omp parallel for schedule(static, chunk_size)\n for (int i = 0; i < n; ++i) {\n x[i] = 
std::conj(x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) return;\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::polar(1.0, ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n x[i][1] = -x[i][1];\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex tmp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(tmp);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n return;\n }\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n fft(y);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) return;\n std::vector> even = x;\n std::vector> odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const size_t n = 
x.size();\n if (n <= 1) return;\n\n std::vector> even = x[std::slice(0, n / 2, 2)];\n std::vector> odd = x[std::slice(1, n / 2, 2)];\n\n fftConjugate(even);\n fftConjugate(odd);\n\n for (size_t i = 0; i < n / 2; ++i) {\n std::complex t = std::exp(std::complex(0, -2 * M_PI * i / n)) * odd[i];\n x[i] = even[i] + t;\n x[i + n / 2] = even[i] - t;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n for (auto &v : x) {\n v = std::conj(v);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fftConjugate(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[N - 1 - i].x;\n x[i].y = -x[N - 1 - i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = hipThreadIdx.x;\n if (i < size) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[N - 1 - i].x;\n 
x[i].y = -x[N - 1 - i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); ++i) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector> y_recv(x.size());\n MPI_Recv(y_recv.data(), y_recv.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < y_recv.size(); ++j) {\n y[j] += y_recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT of x in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int log2n = 0;\n while ((1 << log2n) < n) log2n++;\n\n std::vector> y(n);\n for (int i = 0; i < n; i++) y[i] = x[i];\n\n for (int s = 1; s <= log2n; s++) {\n int m = 1 << s;\n int m2 = m / 2;\n double theta = 2 * M_PI / m;\n std::complex w(cos(theta), sin(theta));\n std::complex wn(1, 0);\n for (int j = 0; j < m2; j++) {\n for (int k = j; k < n; k += m) {\n std::complex t = wn * y[k + m2];\n y[k + m2] = y[k] - t;\n y[k] = y[k] + t;\n }\n wn = wn * w;\n }\n }\n\n if (rank == 0) {\n for (int i = 
0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n }\n}\n", + ".", + "\n\n size_t n = x.size();\n std::transform(x.begin(), x.end(), x.begin(), [](std::compelex z) { return std::compelex(z.re, -z.im); });\n std::sort(x.begin(), x.end(), [](std::compelex a, std::compelex b) { return std::im(a) < std::im(b); });\n std::transform(x.begin(), x.end(), x.begin(), [](std::compelex z) { return std::compelex(z.re, -z.im); });\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the global FFT\n if (rank == 0) {\n fft(global_fft, x);\n }\n\n // Scatter the global FFT\n MPI_Scatter(x.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the conjugate of the local FFT\n for (size_t i = 0; i < local_fft.size(); i++) {\n local_fft[i] = std::conj(local_fft[i]);\n }\n\n // Gather the conjugate FFTs\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the inverse global FFT\n if (rank == 0) {\n ifft(global_fft, x);\n }\n\n // Scatter the inverse global FFT\n MPI_Scatter(x.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_fft = fft(x);\n\n // Gather the local Fourier transforms on rank 0\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (auto &value : global_fft) {\n value = std::conj(value);\n }\n }\n\n // Scatter the imaginary conjugate Fourier transforms to all ranks\n MPI_Scatter(global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse Fourier transform\n x = ifft(local_fft);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv(x.size());\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n y[j] += recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the 
imaginary part of the result\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results from all ranks\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place on each rank\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x, y);\n\n // Conjugate the local FFT\n for (int i = 0; i < n_local; i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the local FFTs\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the result to the input vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place on each rank\n std::fft(x.begin(), x.end());\n\n // Conjugate the imaginary part of each value\n for (auto &v : x) {\n v.imag() = -v.imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, y.data(), x.size(), MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n x = std::move(y);\n } else {\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, nullptr, 0, MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft(x);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (auto &value : global_fft) {\n value = std::conj(value);\n }\n }\n\n // Scatter the global FFT back to each rank\n MPI_Scatter(global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the local FFT on rank 0\n if (rank == 0) {\n x = local_fft;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> y_recv(x.size());\n MPI_Recv(y_recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < y.size(); j++) {\n y[j] += y_recv[j];\n }\n 
}\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n MPI_FFT(x.data(), 1, x.size(), MPI_COMPLEX, MPI_FORWARD, MPI_COMM_WORLD, y.data(), &y.size());\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n x = y;\n for (int i = 1; i < size; i++) {\n std::vector> tmp(x.size());\n MPI_Recv(tmp.data(), tmp.size(), MPI_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(y.data(), y.size(), MPI_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv(x.size());\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n y[j] += recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT of x in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of the local data\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the local data to rank 0\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the gathered data to the output vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft(x);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (int i = 0; i < global_fft.size(); i++) {\n global_fft[i] = 
std::conj(global_fft[i]);\n }\n }\n\n // Scatter the global FFT back to each rank\n MPI_Scatter(global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = local_fft;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fftConjugate(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[N - i - 1].x;\n x[i].y = -x[N - i - 1].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n // Compute the FFT\n fft(x, N);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < N; i++) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "fft{4,0}", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid].x = x[tid].x;\n x[tid].y = -x[tid].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = cuConjf(x[i]);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[N - i - 1].x;\n x[i].y = -x[N - i - 1].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = make_cuDoubleComplex(creal(x[i]), -cimag(x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the fourier 
transform of x in-place. Return the imaginary conjugate of each value.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data in parallel\n fft(x, start, end);\n\n // Conjugate the imaginary part of the local data\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i].imag() = -x[i].imag();\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[start], n_local, MPI_DOUBLE_COMPLEX, &x[0], n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": ".4.", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x.begin() + rank * n_local, x.begin() + rank * n_local + n_local, y.begin());\n\n // Conjugate the local FFT\n for (int i = 0; i < n_local; i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the local FFTs\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the result to rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n\n // Compute the FFT in parallel using MPI and OpenMP\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag() = -x[i].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform local FFT on each rank\n std::vector> localX = x;\n fft(localX);\n\n // Gather all the local FFT results on rank 0\n std::vector> globalX(x.size());\n MPI_Gather(localX.data(), localX.size(), MPI_DOUBLE_COMPLEX, globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value on rank 0\n if (rank == 0) {\n for (auto &value : globalX) {\n value = std::conj(value);\n }\n }\n\n // Scatter the imaginary conjugate results back to each rank\n MPI_Scatter(globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform local FFT on each rank\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather all the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (int i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the imaginary conjugate FFTs back to each rank\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Perform inverse FFT on each rank\n ifft(local_fft, x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of the global FFT\n if (rank == 0) {\n for (size_t i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the 
imaginary conjugate of the global FFT to all ranks\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT\n ifft(local_fft, x);\n\n // Gather the local FFTs on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results on rank 0\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the results to the input vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n 
std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n MPI_Gather(y.data(), n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n x = y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> localX = x;\n fft(localX);\n\n // Conjugate the local FFT\n for (auto &v : localX) {\n v = std::conj(v);\n }\n\n // Gather the local FFTs on rank 0\n std::vector> globalX(x.size());\n MPI_Gather(localX.data(), localX.size(), MPI_DOUBLE_COMPLEX, globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, store the global FFT in x\n if (rank == 0) {\n x = globalX;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Compute the FFT in parallel using MPI and OpenMP\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> 
recv;\n MPI_Recv(&recv, x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x, x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT on each rank\n std::vector> y(x.size());\n fft(x, y);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < y.size(); i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n x.resize(y.size());\n MPI_Gather(y.data(), y.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the conjugate of the global FFT\n if (rank == 0) {\n for (size_t i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the conjugate global FFT\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT\n ifft(local_fft, x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary 
parts\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(Kokkos::View*> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x[i] = x[i] / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x[i] = std::conj(x[i]);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / static_cast(x.extent(0));\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / 
x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n // compute fourier transform\n fft(x);\n\n // compute inverse fourier transform\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use OpenMP to compute in parallel.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n fft(x);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugated input\n fft(x);\n\n // compute the conjugate of the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "-", + "\n\n // Compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated input\n fft(x);\n\n // Conjugate the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Divide the output by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n // conjugate the complex numbers\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute forward FFT\n fft(x);\n\n // conjugate the complex numbers again\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide by the size of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n fft(x);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; 
i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n #pragma openmp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n fft(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n fft(x);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // divide by n to normalize\n const double n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] /= n;\n }\n}\n", + "\n\n // compute the forward FFT\n fft(x);\n\n // compute the inverse FFT\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute the forward FFT again to get the inverse FFT\n fft(x);\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n // compute the conjugate of each element\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugated vector\n fft(x);\n\n // divide each element by the size of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // compute the conjugate of each element\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n const size_t n = x.size();\n std::vector> y(n);\n std::copy(x.begin(), x.end(), y.begin());\n fft(y);\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n y[i] = std::conj(y[i]) / n;\n }\n std::reverse(y.begin(), y.end());\n fft(y);\n std::copy(y.begin(), y.end(), x.begin());\n}\n", + "\n\n fft(x);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute forward fft\n fft(x);\n\n // compute inverse fft\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute forward fft again to get inverse fft\n fft(x);\n}\n", + "\n\n // compute the conjugate of x\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugate of x\n fft(x);\n\n // compute the conjugate of x again\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide each element of x by the size of x\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n fft(x);\n const double n = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n 
x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // Compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the forward FFT\n fft(x);\n\n // Divide each element by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Compute the conjugate of the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "serial", + "prompt": "/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n for (auto &xi : x) {\n xi /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v) / x.size();\n }\n std::reverse(x.begin(), x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi) / x.size();\n }\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto 
&v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v = std::conj(v) / x.size();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "hip", + "prompt": "/* Compute the inverse fourier transform of x in-place.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\n__global__ void ifft(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i >= N) return;\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipDoubleComplex(cos(2 * M_PI * i * j / N),\n -sin(2 * M_PI * i * j / N));\n sum += w * x[j];\n }\n x[i] = sum / N;\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n hipDoubleComplex c = {cos(angle), sin(angle)};\n sum = hipCadd(sum, hipCmul(x[j], c));\n }\n x[i] = hipCdiv(sum, N);\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (int j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0.0, 0.0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex c = {cos(angle), sin(angle)};\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i >= N) return;\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex t = {cos(angle), sin(angle)};\n sum += x[j] * t;\n }\n x[i] = sum / N;\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n hipDoubleComplex w = hipDoubleComplex(cos(2 * M_PI * i * j / N),\n sin(2 * M_PI * i * j 
/ N));\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0.0, 0.0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0.0, -2.0 * M_PI * i * j / N));\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex t = {cos(angle), sin(angle)};\n sum = hipCadd(sum, hipCmul(x[j], t));\n }\n x[i] = hipCdiv(sum, N);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex term = hipExp(hipCreateDoubleComplex(0, -2 * M_PI * i * j / N));\n sum = hipAdd(sum, hipMul(x[j], term));\n }\n x[i] = hipDiv(sum, hipCreateDoubleComplex(N, 0));\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0, -2 * M_PI * i * j / N));\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex c = hipDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n size_t j = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (i < N && j < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double theta = 2 * M_PI * (i * k / N);\n sum.x += x[k].x * cos(theta) + x[k].y * sin(theta);\n sum.y += -x[k].x * sin(theta) + x[k].y * cos(theta);\n }\n sum.x /= N;\n sum.y /= N;\n x[i * N + j] = sum;\n }\n}\n", + "\n\n size_t i = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + 
"parallelism_model": "mpi", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(recv_buffer.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // 
normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, y.data(), x.size(),\n MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, nullptr, 0,\n MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the FFT\n fft(x);\n\n // compute the inverse FFT\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n\n // normalize the result\n int n = x.size();\n for (auto &xi : x) {\n xi /= n;\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(n);\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, y.data(), n, MPI::DOUBLE, 0);\n x = y;\n } else {\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, y.data(), x.size(),\n MPI::DOUBLE, 0);\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= num_ranks;\n }\n\n // gather the results on rank 0\n if (my_rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n\n // normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> result(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, result.data(),\n x.size(), MPI_DOUBLE_COMPLEX, 0);\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] /= size;\n }\n\n // 
Gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&recv_buffer[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute the inverse fft in parallel\n fft(x);\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fourier transform\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fourier transform\n fft(x);\n\n // normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(n);\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, y.data(), n, MPI::DOUBLE, 0);\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "cuda", + "prompt": "/* Compute the inverse fourier transform of x in-place.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\n__global__ void ifft(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n sum += x[j] * cuCexp(make_cuDoubleComplex(0, -2 * M_PI * i * j / N));\n }\n x[i] = sum / make_cuDoubleComplex(N, 0);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = cuCdiv(x[i], N);\n x[i] = cuCmulf(x[i], cuCexp(cuCmulf(cuCmul(cuCimagf(x[i]), -2 * M_PI), i / N)));\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= size) {\n return;\n }\n cuDoubleComplex result = cuCexp(cuCmul(cuDoubleComplex(0, -2 * M_PI * idx / size), x[idx]));\n x[idx] = result;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n cuDoubleComplex term = x[j] * cexp(-2 * M_PI * i * j / N);\n sum = cuCadd(sum, term);\n }\n x[i] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= N) return;\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double re = x[i].x;\n double im = x[i].y;\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n x[i].x += re * cos(angle) - im * sin(angle);\n x[i].y += re * 
sin(angle) + im * cos(angle);\n }\n x[i].x /= N;\n x[i].y /= N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = x[i] / N;\n x[i] = cuCmul(x[i], make_cuDoubleComplex(1.0, 0.0));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * term;\n }\n x[i] = sum / N;\n }\n}\n", + "_t_ = 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= N) return;\n cuDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = {cos(angle), sin(angle)};\n sum += c * x[j];\n }\n x[i] = sum / N;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform FFT on each rank\n fft(x);\n\n // Gather results from all ranks to rank 0\n std::vector> gathered_x;\n if (rank == 0) {\n gathered_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Compute the conjugate of x\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated x\n fft(x);\n\n // Normalize the result\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double factor = 1.0 / size;\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= factor;\n }\n\n // If rank 0, gather the results from all ranks\n if (rank == 0) {\n std::vector> recv_buffer(x.size() * size);\n MPI_Allgather(x.data(), x.size(), MPI_COMPLEX, recv_buffer.data(), x.size(), MPI_COMPLEX, MPI_COMM_WORLD);\n x = recv_buffer;\n }\n\n // Compute the inverse FFT of the normalized result\n fft(x);\n\n // Conjugate the result again to get the original values\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // compute the fourier transform\n fft(x);\n\n // compute the inverse fourier transform\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, y.data(), x.size(), MPI::DOUBLE, 0);\n x = y;\n } else {\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n 
fft(x);\n\n // divide the result among the ranks\n std::vector> recv(x.size() / size);\n MPI_Allgather(&x[rank * x.size() / size], x.size() / size, MPI_COMPLEX, &recv[0], x.size() / size, MPI_COMPLEX, MPI_COMM_WORLD);\n\n // only rank 0 has the complete result\n if (rank == 0) {\n x = recv;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide by the number of ranks to normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buffer[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Compute the inverse fourier transform of x in-place.\n // Use MPI and OpenMP to compute in parallel.\n // Assume MPI has already been initialized.\n // Every rank has a complete copy of x.\n // The final result is stored on rank 0.\n\n // Get the number of ranks and the rank of the current process.\n int num_procs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local size of the vector.\n int local_size = x.size() / num_procs;\n\n // Compute the local inverse fourier transform.\n fft(x);\n\n // Reduce the local inverse fourier transforms to the root process.\n std::vector> reduced_x(x.size());\n MPI_Reduce(x.data(), reduced_x.data(), x.size(), MPI_COMPLEX, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // If the current process is the root process, store the reduced inverse fourier transform in the output vector.\n if (rank == 0) {\n x = reduced_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform\n fft(x);\n\n // divide by the number of ranks to normalize\n for (auto &xi : x) {\n xi /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, 
MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // normalize the result\n std::transform(x.begin(), x.end(), x.begin(), [size](std::complex c) { return c / size; });\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the 
fft in-place\n fft(x);\n\n // compute the inverse fft in-place\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide the result by the number of ranks\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results from all ranks to rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += y[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Compute the conjugate of x\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated x\n fft(x);\n\n // Normalize the result\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double factor = 1.0 / size;\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= factor;\n }\n\n // If rank 0, gather the results from all ranks\n if (rank == 0) {\n std::vector> 
recv_buffer(x.size() * size);\n MPI_Allgather(x.data(), x.size(), MPI_COMPLEX, recv_buffer.data(), x.size(), MPI_COMPLEX, MPI_COMM_WORLD);\n x = recv_buffer;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv(x.size() * size);\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, recv.data(), x.size(), MPI_DOUBLE_COMPLEX, 0);\n x = recv;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft of x\n fft(x);\n\n // compute the inverse fft of x\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Compute the FFT of x\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT of the conjugated result\n fft(x);\n\n // Normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv(n);\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n MPI::COMM_WORLD.Recv(&recv[0], n, MPI::DOUBLE_COMPLEX, i, 0);\n for (int j = 0; j < n; ++j) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], n, MPI::DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // 
Compute the inverse Fourier transform in-place\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fourier transform\n fft(x);\n\n // compute the inverse fourier transform\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size() * size);\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = recv_buf;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(Kokkos::View &x, Kokkos::View*> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * std::exp(Kokkos::complex(0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * Kokkos::Math::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(std::cos(angle), std::sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const double PI = 3.14159265358979323846;\n\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n sum += x(j) * std::exp(-2 * PI * std::complex(0, 1) * i * j / N);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = -2 * Kokkos::Math::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(std::cos(angle), std::sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const double PI = 3.14159265358979323846;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * PI * j * i / N;\n sum += x[j] * Kokkos::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::Range(0, N), [=](const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * j * i / N;\n sum += x(j) * std::exp(std::complex(0.0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n const int n = x.size();\n Kokkos::parallel_for(\"DFT\", Kokkos::RangePolicy<>(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 
0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < N; j++) {\n sum += x(j) * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / N);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n const int N = x.extent(0);\n Kokkos::parallel_for(\"DFT\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const auto n = x.size();\n const auto pi = 3.14159265358979323846;\n Kok\u30b3\u30af::Range range(0, n);\n Kok\u30b3\u30af::ParallelFor(\"dft\", range, KOKKOK_LAMBDA(const int i) {\n Kok\u30b3\u30af::\u8907 sum(0.0, 0.0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2.0 * pi * i * j / n);\n }\n output[i] = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n const double pi = 3.14159265358979323846;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x(j) * Kokkos::complex(cos(-2 * pi * i * j / n), sin(-2 * pi * i * j / n));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * j * i / N;\n sum += x(j) * std::polar(1.0, angle);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "(121:1:1:1:1:1:1:", + "\n\n const int n = x.extent(0);\n const double two_pi = 2.0 * Kokkos::Pi;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < n; ++j) {\n sum += x(j) * std::exp(-two_pi * Kokkos::complex(0.0, 1.0) * i * j / n);\n }\n output(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. 
Store the result in output.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n output.resize(n);\n #pragma opm parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (size_t j = 0; j < n; ++j) {\n const double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp 
parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (size_t j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + ")", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "serial", + "prompt": "/* Compute the discrete fourier transform of x. 
Store the result in output.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "<", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "<", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::polar(1.0, -2 * M_PI * k * t / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, -angle);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::polar(1.0, -2 * M_PI * k * t / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int i = 0; i < n; ++i) {\n for (int j = 0; j < n; ++j) {\n output[i] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, angle);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n 
output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "hip", + "prompt": "/* Compute the discrete fourier transform of x. Store the result in output.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\n__global__ void dft(const double *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "(x,y)", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum_real = 0;\n double sum_imag = 0;\n\n for (size_t i = 0; i < N; i++) {\n double angle = 2 * M_PI * gid * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n\n output[gid] = make_cuDoubleComplex(sum_real, sum_imag);\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCadd(0, 0);\n for (int k = 0; k < N; k++) 
{\n double angle = -2 * M_PI * i * k / N;\n output[i] = cuCadd(output[i], cuCmulf(x[k], cuCexp(cuCmulf(0, angle))));\n }\n }\n}\n", + "\n\n int id = blockIdx.x * blockDim.x + threadIdx.x;\n if (id < N) {\n double sum_real = 0, sum_imag = 0;\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * id * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n output[id] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double real = 0;\n double imag = 0;\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n real += x[i] * cos(angle);\n imag += x[i] * sin(angle);\n }\n output[tid] = make_cuDoubleComplex(real, imag);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleReal sum_real = 0;\n cuDoubleReal sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplexf(sum_real, sum_imag);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid >= N) {\n return;\n }\n\n double real = 0;\n double imag = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * gid / N;\n real += x[k] * cos(angle);\n imag += x[k] * sin(angle);\n }\n\n output[gid] = make_cuDoubleComplex(real, imag);\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum_real = 0;\n double sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double real = 0;\n double imag = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * gid / 
N;\n real += x[k] * cos(angle);\n imag += x[k] * sin(angle);\n }\n output[gid] = make_cuDoubleComplex(real, imag);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n double angle = 2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[i] * term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n local_output[i] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / x.size());\n }\n }\n\n // Gather the local DFTs\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes a part of the output\n int start = rank * n / size;\n int end = (rank + 1) * n / size;\n\n for (int i = start; i < end; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * i * j / n;\n sum += std::complex(x[j], 0) * std::exp(std::complex(0, angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT for each rank\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n\n // Gather the results from all ranks\n std::vector> all_output(n * size);\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, all_output.data(), 
n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the gathered results to output\n if (rank == 0) {\n output = all_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / 
n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local contribution of each process\n for (int i = rank; i < n; i += size) {\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n output[j] += x[i] * std::complex(std::cos(angle), std::sin(angle));\n }\n }\n\n // Reduce the local contributions to the global result\n MPI_Allreduce(MPI_IN_PLACE, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local DFT\n for (int i = rank; i < n; i += size) {\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n output[i] += x[j] * std::complex(cos(angle), sin(angle));\n }\n }\n\n // Reduce the local DFTs to the global DFT\n MPI_Allreduce(MPI_IN_PLACE, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "cuda", + "prompt": "/* Compute the discrete fourier transform of x. Store the result in output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\n__global__ void dft(const double *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[i];\n sum += term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[i];\n sum += term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int k = 0; k < N; ++k) {\n double angle = -2 * M_PI * i * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t kBlockSize = 
256;\n const size_t kGridSize = (N + kBlockSize - 1) / kBlockSize;\n const size_t kSharedMemorySize = 2 * kBlockSize;\n\n dim3 blockDim(kBlockSize, 1, 1);\n dim3 gridDim(kGridSize, 1, 1);\n shared double sdata[2 * kBlockSize];\n\n for (size_t i = 0; i < N; ++i) {\n output[i] = make_cuDoubleComplex(0, 0);\n }\n\n for (size_t i = 0; i < N; ++i) {\n sdata[2 * threadIdx.x + 0] = x[i];\n sdata[2 * threadIdx.x + 1] = 0;\n }\n\n __syncthreads();\n\n for (size_t k = 0; k < N; ++k) {\n double e = -2 * M_PI * k / N;\n double c = cos(e);\n double s = sin(e);\n double t0 = 1;\n double t1 = 0;\n for (size_t i = 0; i < N / 2; ++i) {\n double a = sdata[2 * i + 0];\n double b = sdata[2 * i + 1];\n double ta = t0 * a - t1 * b;\n double tb = t0 * b + t1 * a;\n sdata[2 * i + 0] = ta;\n sdata[2 * i + 1] = tb;\n t0 = c;\n t1 = s;\n c = t0 * c - t1 * s;\n s = t0 * s + t1 * c;\n }\n __syncthreads();\n output[k] = make_cuDoubleComplex(sdata[0], sdata[1]);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = threadIdx.x;\n double sum_real = 0;\n double sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplex(sum_real, sum_imag);\n}\n", + "\n const int tid = threadIdx.x + blockIdx.x * blockDim.x;\n const int stride = blockDim.x * gridDim.x;\n for (size_t i = tid; i < N; i += stride) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n const double angle = 2 * M_PI * j * i / N;\n const cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(x[j], term));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n if (gid < N) {\n double sum_real = 0;\n double sum_imag = 0;\n for (size_t i = 0; i < N; i++) {\n double angle = -2 * M_PI * gid * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n output[gid] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += w * x[j];\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(c, make_cuDoubleComplex(x[j], 0)));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n 
cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(local_n);\n std::vector> local_output(local_n);\n std::copy(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n, local_x.begin());\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n sum += local_x[j] * std::exp(-2 * M_PI * i * j / local_n);\n }\n local_output[i] = sum;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_REAL, output.data(), local_n, MPI_DOUBLE_REAL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT for each frequency\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = rank; i < n; i += nprocs) {\n for (int j = 0; j < n; j++) {\n std::complex sum(0, 0);\n for (int k = 0; k < n; k++) {\n sum += x[k] * std::exp(-2 * M_PI * std::complex(0, 1) * i * k / n);\n }\n local_output[i] = sum;\n }\n }\n\n // Gather the local results into the output vector\n MPI_Allgather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = 
x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the DFT of each element of x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": 
false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial DFT for this rank\n std::vector> partial_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n partial_output[i] = sum;\n }\n\n // Gather the partial results into the output vector\n MPI_Gather(partial_output.data(), n, MPI_DOUBLE_COMPLEX,\n output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Allgather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the DFT of each element in x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + 
"did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n MPI_Allgather(MPI_IN_PLACE, n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element of x in parallel\n #pragma omp 
parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[0], n, MPI_DOUBLE_COMPLEX, &output[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n std::vector> global_output(n * size);\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is the root process, copy the results to the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX,\n output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "kokkos", + "prompt": 
"#include \n\n/* Compute the fourier transform of x. Store the result in output.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(Kokkos::View*> &x, Kokkos::View*> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n using namespace Kokkos::Impl;\n\n const int n = x.extent(0);\n const int n_half = n / 2;\n const int n_quarter = n / 4;\n const int n_eighth = n / 8;\n\n // Create a plan for the FFT\n auto plan = FFTPlan::create(n);\n\n // Execute the FFT\n plan->execute(x, output);\n\n // Reorder the output to be in the correct order\n parallel_for(n, KOKKOS_LAMBDA(int i) {\n if (i < n_half) {\n output(i) = output(i);\n } else {\n output(i) = output(n - i);\n }\n });\n\n // Compute the magnitude of the output\n parallel_for(n, KOKKOS_LAMBDA(int i) { output(i) = abs(output(i)); });\n\n // Normalize the output\n parallel_for(n, KOKKOS_LAMBDA(int i) { output(i) /= n; });\n}\n", + "\n\n int n = x.extent(0);\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n int m = 1 << log_n;\n Kokkos::View*> w(m);\n for (int i = 0; i < m; i++) {\n w(i) = Kokkos::complex(cos(2 * M_PI * i / m), sin(2 * M_PI * i / m));\n }\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int &i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < m; j++) {\n sum += x((i + j) % m) * w(j);\n }\n output(i) = sum;\n });\n}\n", + "\n using namespace Kokkos;\n using namespace std;\n const int N = x.extent(0);\n const int M = output.extent(0);\n const double pi = 3.14159265358979323846;\n parallel_for(\"Fourier Transform\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const int i) {\n output(i) = {0.0, 0.0};\n for (int j = 0; j < N; j++) {\n output(i) += x(j) * exp(complex(0, -2 * pi * i * j / N));\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n\n Kokkos::View*> temp(\"temp\", n);\n\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log_n; j++) {\n rev |= ((i >> j) & 1) << (log_n - j - 1);\n }\n temp(rev) = x(i);\n }\n\n for (int s = 1; s <= log_n; s++) {\n int m = 1 << s;\n int m2 = m >> 1;\n Kokkos::complex w(cos(2 * Kokkos::Pi / m), sin(2 * Kokkos::Pi / m));\n for (int k = 0; k < n; k += m) {\n Kokkos::complex wm(1, 0);\n for (int j = 0; j < m2; j++) {\n Kokkos::complex t = temp(k + j);\n Kokkos::complex u = temp(k + j + m2) * wm;\n temp(k + j) = t + u;\n temp(k + j + m2) = t - u;\n wm *= w;\n }\n }\n }\n\n output = temp;\n}\n", + "\n\n const int N = x.extent(0);\n const int logN = std::log2(N);\n const int halfN = N / 2;\n\n // Reverse bits\n Kokkos::View reversed = Kokkos::View(\"reversed\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) {\n int rev = 0;\n int bit = 1;\n for (int j = 0; j < logN; j++) {\n if (i & bit) {\n rev |= 1 << (logN - j - 1);\n }\n bit <<= 1;\n }\n reversed(i) = rev;\n });\n\n // Compute the FFT\n Kokkos::View*> y = Kokkos::View*>(\"y\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { y(i) = x(reversed(i)); });\n\n for (int s = 1; s <= logN; s++) {\n const int m = 1 << s;\n const int m2 = m / 2;\n const double theta = 2 * M_PI / m;\n Kokkos::complex w(std::cos(theta), std::sin(theta));\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) {\n 
if (i % m < m2) {\n const Kokkos::complex t = w * y(i + m2);\n y(i + m2) = y(i) - t;\n y(i) = y(i) + t;\n }\n });\n }\n\n // Copy the result to the output\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { output(i) = y(i); });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const int logN = std::log2(N);\n\n // Reverse bits\n Kokkos::View reversed(N);\n Kokkos::parallel_for(\"fft_reverse_bits\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int reversed_i = 0;\n int j = i;\n for (int k = 0; k < logN; ++k) {\n reversed_i |= ((j & 1) << (logN - k - 1));\n j >>= 1;\n }\n reversed(i) = reversed_i;\n });\n\n // Compute the FFT\n Kokkos::parallel_for(\"fft_compute\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int j = reversed(i);\n output(i) = x(j);\n });\n\n // Compute the twiddle factors\n Kokkos::View*> twiddle(N);\n Kokkos::parallel_for(\"fft_twiddle\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n double angle = -2 * M_PI * i / N;\n twiddle(i) = Kokkos::complex(std::cos(angle), std::sin(angle));\n });\n\n // Compute the FFT\n for (int s = 1; s <= logN; ++s) {\n const int m = 1 << s;\n const int m2 = m / 2;\n for (int k = 0; k < m2; ++k) {\n Kokkos::parallel_for(\"fft_compute_s\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int j = i & (m - 1);\n if (j < m2) {\n const Kokkos::complex t = twiddle(k * N / m) * output(i + m2);\n output(i + m2) = output(i) - t;\n output(i) += t;\n }\n });\n }\n }\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int N = x.extent(0);\n const int logN = Kokkos::ilog2(N);\n\n auto bit_reverse = [&](int i) {\n int r = 0;\n for (int j = 0; j < logN; ++j) {\n r |= ((i >> j) & 1) << (logN - 1 - j);\n }\n return r;\n };\n\n auto twiddle = [&](int k) {\n return Kokkos::complex(cos(2 * M_PI * k / N), sin(2 * M_PI * k / N));\n };\n\n auto fft_rec = [&](View*> x, int n) {\n if (n == 1) {\n return;\n }\n auto x_even = View*>(x, Range(0, n / 2));\n auto x_odd = View*>(x, Range(n / 2, n));\n fft_rec(x_even, n / 2);\n fft_rec(x_odd, n / 2);\n for (int k = 0; k < n / 2; ++k) {\n auto t = x_odd(k) * twiddle(k);\n x_odd(k) = x_even(k) - t;\n x_even(k) += t;\n }\n };\n\n auto x_bit_reversed = View*>(x.extent(0));\n for (int i = 0; i < N; ++i) {\n x_bit_reversed(bit_reverse(i)) = x(i);\n }\n fft_rec(x_bit_reversed, N);\n output = x_bit_reversed;\n}\n", + "\n\n int n = x.extent(0);\n output.resize(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n if (n == 1) {\n output(0) = x(0);\n return;\n }\n Kokkos::View*> even = Kokkos::subview(output, Kokkos::ALL(), Kokkos::Slice(0, n / 2));\n Kokkos::View*> odd = Kokkos::subview(output, Kokkos::ALL(), Kokkos::Slice(n / 2, n));\n Kokkos::parallel_for(n / 2, KOKKOS_LAMBDA(int i) {\n even(i) = x(2 * i);\n odd(i) = x(2 * i + 1);\n });\n fft(even, even);\n fft(odd, odd);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex w = Kokkos::complex(cos(2 * M_PI * i / n), sin(2 * M_PI * i / n));\n output(i) = even(i % (n / 2)) + w * odd(i % (n / 2));\n });\n}\n", 
+ "\n\n // Compute the size of the input\n const int n = x.extent(0);\n\n // Create a scratchpad for the intermediate results\n Kokkos::View*> scratchpad(\"scratchpad\", n);\n\n // Compute the FFT in parallel\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n // Compute the FFT of the i-th element of x\n Kokkos::complex result = std::complex(0.0, 0.0);\n for (int j = 0; j < n; ++j) {\n result += x(j) * std::exp(-2.0 * M_PI * std::complex(0.0, 1.0) * i * j / n);\n }\n\n // Store the result in the scratchpad\n scratchpad(i) = result;\n });\n\n // Copy the results from the scratchpad to the output\n Kokkos::deep_copy(output, scratchpad);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n output(i) = Kokkos::complex(real, imag);\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n output[i] = Kokkos::complex(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n output[i] += x[j] * Kokkos::exp(Kokkos::complex(0.0, -2.0 * Kokkos::Pi * i * j / n));\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n output(0) = x(0);\n for (int k = 1; k < n; ++k) {\n Kokkos::complex w = std::polar(1.0, -2 * Kokkos::Math::pi * k / n);\n Kokkos::complex t = x(0);\n for (int j = 1; j < n; ++j) {\n t += w * x(j);\n w *= w;\n output(j) = t;\n }\n }\n}\n", + "\n\n int n = x.extent(0);\n output.resize(n);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * Kokkos::Impl::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * std::exp(Kokkos::complex(0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n // Get the size of the input\n int n = x.extent(0);\n\n // Create a Kokkos view for the output\n Kokkos::View*> output_view(\"output\", n);\n\n // Compute the FFT in parallel\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) {\n output_view(i) = fft_helper(x, i, n);\n });\n\n // Copy the output view to the output array\n Kokkos::deep_copy(output, output_view);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x(j) * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy<>(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n output.resize(n);\n\n Kokkos::parallel_for(\"fft\", n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": 
"09_fft_fft_out_of_place", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store the result in output.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n output.resize(n);\n #pragma openmp parallel for\n for (int i = 0; i < n; ++i) {\n double real = 0, imag = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n real += x[j].first * cos(angle) + x[j].second * sin(angle);\n imag += x[j].first * sin(angle) - x[j].second * cos(angle);\n }\n output[i] = {real, imag};\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex arg(2 * M_PI * i * j / n, 0);\n sum += x[j] * std::exp(-arg);\n }\n output[i] = sum;\n }\n}\n", + "<", + "\n\n const int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::polar(1.0, angle);\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n const double pi = std::acos(-1);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = {0, 0};\n for (size_t t = 0; t < n; ++t) {\n double angle = 2 * pi * t * k / n;\n sum += x[t] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex omega = std::polar(1.0, -2 * M_PI * i * j / n);\n sum += x[j] * omega;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n std::complex arg(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * arg;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of x\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n 
int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex omega = std::polar(1.0, -2 * M_PI * i * j / n);\n sum += x[j] * omega;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x. 
Store the result in output.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double pi = std::acos(-1);\n std::complex w(1), wn(std::cos(2 * pi / n), std::sin(2 * pi / n));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(cos(angle), sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n 
int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2);\n std::vector> odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n 
double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int i = 0; i < n; ++i) {\n for (int j = 0; j < n; ++j) {\n output[i] += x[j] * std::exp(-2 * std::complex(0, 1) * M_PI * i * j / n);\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x. Store the result in output.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fft(const hipDoubleComplex *x, hipDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid >= N) {\n return;\n }\n double arg = -2 * M_PI * gid / N;\n hipDoubleComplex w = hipDoubleComplex(cos(arg), sin(arg));\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n sum += x[i] * pow(w, i * gid);\n }\n output[gid] = sum;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(omega), sin(omega)};\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(theta), sin(theta));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex wp = hipDoubleComplex(1, 0);\n for (size_t k = j; k < j + m; k++) {\n hipDoubleComplex t = wp * output[k + m];\n output[k + m] = output[k] - t;\n output[k] += t;\n wp *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = make_hipDoubleComplex(cos(theta), sin(theta));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex t = make_hipDoubleComplex(1.0, 0.0);\n for 
(size_t k = 0; k < m; k++) {\n hipDoubleComplex u = output[j + k];\n hipDoubleComplex v = output[j + k + m] * t;\n output[j + k] = u + v;\n output[j + k + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / s), sin(M_PI / s));\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex(1.0, 0.0);\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / m), sin(M_PI / m));\n hipDoubleComplex wm = hipDoubleComplex(1.0, 0.0);\n for (size_t j = 0; j < m; j++) {\n for (size_t k = j; k < N; k += s) {\n hipDoubleComplex t = wm * output[k + m];\n output[k + m] = output[k] - t;\n output[k] += t;\n }\n wm *= w;\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n hipDoubleComplex wm = hipDoubleComplex(1, 0);\n for (size_t k = 0; k < m; k++) {\n for (size_t j = 0; j < N / s; j++) {\n size_t index = j * s + k;\n hipDoubleComplex t = wm * output[index + m];\n output[index + m] = output[index] - t;\n output[index] += t;\n }\n wm *= w;\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t half_s = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / half_s), sin(M_PI / half_s));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t k = 0; k < half_s; k++) {\n hipDoubleComplex u = output[j + k];\n hipDoubleComplex v = output[j + k + half_s] * t;\n output[j + k] = u + v;\n output[j + k + half_s] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t tid = hipGetGlobalThreadIdx();\n const size_t N_2 = N / 2;\n const size_t N_4 = N / 4;\n const double arg = -2 * M_PI / N;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n hipDoubleComplex w = {cos(arg * k), sin(arg * k)};\n hipDoubleComplex term = hipCMul(x[k], w);\n sum = hipCAdd(sum, term);\n }\n\n const size_t index = tid;\n if (tid >= N_2) {\n index -= N_2;\n index = N_4 - index - 1;\n index += N_2;\n }\n\n output[index] = sum;\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n double omega = M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(omega), sin(omega)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n 
hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + s] * t;\n output[k + j] = u + v;\n output[k + j + s] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(theta), sin(theta)};\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 1; s <= N; s *= 2) {\n for (size_t k = 0; k < N; k += 2 * s) {\n for (size_t j = k; j < k + s; j++) {\n hipDoubleComplex twiddle = hipDoubleComplex{cos(M_PI * j / s),\n -sin(M_PI * j / s)};\n hipDoubleComplex temp = output[j + s] * twiddle;\n output[j + s] = output[j] - temp;\n output[j] += temp;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / m), sin(M_PI / m));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the 
fourier transform of x. Store the result in output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFT results\n std::vector> global_output(n);\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, global_output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the result in output\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_output(n_local);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), n_local, MPI_DOUBLE_COMPLEX, output.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(n / size);\n fft_local(x, local_fft);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(n);\n MPI_Gather(local_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, global_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter the global FFT to the local FFTs\n MPI_Scatter(global_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, local_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Convert the local FFTs to the output format\n output.resize(n);\n for (int i = 0; i < n; i++) {\n output[i] = local_fft[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = (rank + 1) * local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> 
local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i / x); });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the output vector\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFT results\n std::vector> global_output(n);\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, global_output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the global FFT result to the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(),\n [](std::complex const& x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) {\n return std::exp(std::complex(0, -2 * M_PI * x.real()));\n });\n\n // Gather the local FFTs into 
the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft_local(x, rank, size);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(x.size());\n MPI_Allgather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the global FFT in the output vector\n output = global_fft;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) {\n return std::exp(std::complex(0, -2 * M_PI * x.real()));\n });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform of x.\n std::vector> local_fft = fft(x, rank, size);\n\n // Gather the local Fourier transforms into output.\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX,\n output.data(), output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int n_padded = n_local + 2;\n std::vector> padded_x(n_padded, 0.0);\n std::copy(x.begin() + rank * n_local, x.begin() + (rank + 1) * n_local, padded_x.begin() + 1);\n\n std::vector> padded_output(n_padded, 0.0);\n fft(padded_x, padded_output);\n\n std::vector> output_local(n_local, 0.0);\n std::copy(padded_output.begin() + 1, padded_output.begin() + n_local + 1, output_local.begin());\n\n MPI_Allgather(output_local.data(), n_local, MPI_DOUBLE_PRECISION, output.data(), n_local, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft(local_x, local_output);\n\n // Gather the results\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, 
MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If we are not rank 0, we can return now\n if (rank != 0) {\n return;\n }\n\n // If we are rank 0, we need to combine the results\n for (int i = 1; i < size; i++) {\n int start = i * local_n;\n int end = start + local_n;\n for (int j = start; j < end; j++) {\n output[j] += output[j - start];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local FFT\n std::vector> local_fft(n / size);\n MPI_Scatter(x.data(), n / size, MPI_DOUBLE_COMPLEX, local_fft.data(), n / size, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n fft_local(local_fft);\n\n // Gather the local FFTs\n MPI_Gather(local_fft.data(), n / size, MPI_DOUBLE_COMPLEX, output.data(), n / size, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(output);\n }\n\n // Broadcast the global FFT\n MPI_Bcast(output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft_local(x, rank, size);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(x.size());\n MPI_Allgather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the global FFT in the output vector\n output = global_fft;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x. Store the result in output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fft(const cuDoubleComplex *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmul(2 * M_PI * i / N, cuCmulf(I, x[i])), -1));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(-2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmulf(2 * M_PI * i / N, cuCmulf(0, 1)), x[i]));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuDoubleComplex(0, -2 * M_PI * i / N), x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuDoubleComplex{0, -2 * M_PI * i / N}, x[i]));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmulf(2 * M_PI * i / N, cuDoubleComplex(0, 1)), x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(-2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex y = cuCexp(cuCmul(cuCbrt(-1.0), 2.0 * M_PI * i / N));\n output[i] = cuCmulf(x[i], y);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCmul(x[i], cuCexp(cuCmulf(-2 * M_PI * i / N, cuDoubleComplex{0, 1})));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + ":", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "mpi+omp", + "prompt": 
"#include \n#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local FFT results\n std::vector> global_output(x.size());\n MPI_Allgather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n global_output.data(), global_output.size(), MPI_DOUBLE_COMPLEX,\n MPI_COMM_WORLD);\n\n // Store the result in output\n output = global_output;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_C_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the global DFT into the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output.\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT of the gathered local DFTs.\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "source_write_success": true, + 
"did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local results\n std::vector> global_output(x.size());\n MPI_Allgather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, global_output.data(), global_output.size(), MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the result in output\n output = global_output;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(n * size);\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is the root process, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get the size of the input vector.\n int n = x.size();\n\n // Create a MPI communicator for the FFT.\n MPI_Comm fft_comm;\n MPI_Comm_split(MPI_COMM_WORLD, 0, MPI_THREAD_SINGLE, &fft_comm);\n\n // Create an OpenMP parallel region for the FFT.\n#pragma openmp parallel\n {\n 
// Get the rank of the current thread.\n int tid = openmp_get_thread_num();\n\n // Compute the local FFT of the input vector.\n std::vector> local_fft(n);\n fft(x, local_fft);\n\n // Gather the local FFTs into the output vector.\n#pragma openmp critical\n {\n for (int i = 0; i < n; ++i) {\n output[i] += local_fft[i];\n }\n }\n }\n\n // Free the MPI communicator.\n MPI_Comm_free(&fft_comm);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_output(n_local);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < n_local; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n_local; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), n_local, MPI_DOUBLE_COMPLEX, output.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = (rank + 1) * local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n std::complex w(cos(2 * M_PI * i * j / local_n), sin(2 * M_PI * i * j / local_n));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local FFT results\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output.\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output\n MPI_Allgather(&local_output[0], n, MPI_DOUBLE_COMPLEX, &output[0], n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n double angle = 2 * M_PI * i * j / local_n;\n std::complex w(cos(angle), sin(angle));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX,\n output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft;\n fft_local(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX,\n global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, 0,\n MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter the global FFT\n MPI_ Scatter(global_fft.data(), global_fft.size() / size, MPI_DOUBLE_COMPLEX,\n local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0,\n MPI_COMM_WORLD);\n\n // Convert the local FFT to the output format\n output.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n output[i] = {local_fft[i].real(), local_fft[i].imag()};\n }\n}\n", + "source_write_success": true, + 
"did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n local_output[i] = std::complex(0, 0);\n for (int j = 0; j < local_n; j++) {\n double angle = -2 * M_PI * i * j / local_n;\n local_output[i] += local_x[j] * std::exp(std::complex(0, angle));\n }\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(n * size);\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is rank 0, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, store the global DFT in output\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "source_write_success": true, 
+ "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, Kokkos::View &x, Kokkos::View &y, Kokkos::View &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", Kokkos::RangePolicy<>(0, z.extent(0)), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.size(), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int &i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z[i] = alpha * x[i] + y[i];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int &i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"axpy\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. 
Store the result in z.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma simd\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "serial", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. 
Store the result in z.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (std::size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (std::size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "hip", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\n__global__ void axpy(double alpha, const double *x, const double *y, double *z, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "(x,y,z,t)", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n z[i] = alpha * x[i] + y[i];\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of z\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n z.resize(x.size());\n for (size_t i = 0; i < x.size(); i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Barrier(MPI_COMM_WORLD);\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n\n // Compute the local portion of z\n std::vector local_z(n);\n for (int i = 0; i < n; ++i) {\n local_z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the local portions of z into the global vector z on 
rank 0\n if (rank == 0) {\n z.resize(n);\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its part of the result\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of the vectors\n int local_size = x.size() / size;\n\n // Allocate memory for the local portion of the vectors\n std::vector local_x(local_size);\n std::vector local_y(local_size);\n std::vector local_z(local_size);\n\n // Scatter the vectors to the ranks\n MPI_Scatter(&x[0], local_size, MPI_DOUBLE, &local_x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(&y[0], local_size, MPI_DOUBLE, &local_y[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local portion of the result\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * local_x[i] + local_y[i];\n }\n\n // Gather the results from the ranks\n MPI_Gather(&local_z[0], local_size, MPI_DOUBLE, &z[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n int n = x.size();\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n \n if (rank != 0) {\n MPI_Send(&z[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector tmp(n);\n MPI_Recv(&tmp[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n z[j] += tmp[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of z\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n 
MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "cuda", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\n__global__ void axpy(double alpha, const double *x, const double *y, double *z, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "{,", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, n_threads;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n #pragma omp parallel\n {\n n_threads = omp_get_num_threads();\n }\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0134055542, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0160217926, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0143657096, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0146771234, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0149634732, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0139647552, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0124263486, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0103510134, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0136111838, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0157914492, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.014225787, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0145809654, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0148724162, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0138663588, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0125065238, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0102193924, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + 
"num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector local_z(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n local_z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0139717964, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0388521162, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0372755108, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0378536636, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0376732092, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.037603234, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.039009604, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0402874616, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n z.resize(n);\n\n#pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0132582458, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.015497, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0140613768, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.014455418, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0147597732, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0139323488, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0124075016, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0102155894, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the vector\n int local_size = x.size() / size;\n\n // Compute the local start index\n int local_start = rank * local_size;\n\n // Compute the local end index\n int local_end = (rank + 1) * local_size;\n\n // Compute the local part of the result\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n 
local_z[i] = alpha * x[local_start + i] + y[local_start + i];\n }\n\n // Gather the local results into the global result\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank == 0) {\n z.resize(x.size());\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.013788611, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0390272156, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0371882434, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0378730282, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0379065008, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0374132574, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.038617224, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0402309938, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0246680408, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01824149905, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0132984712, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.015447551, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0141094326, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.014466679, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0147430998, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0139293832, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0124365524, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0104075016, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the local portion of x and y\n int local_size = x.size() / nprocs;\n\n // Compute the starting index of the local portion of x and y\n int start_index = rank * local_size;\n\n // Compute the local portion of z\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_z[i] = alpha * x[start_index 
+ i] + y[start_index + i];\n }\n\n // Gather the local portions of z into the global vector z on rank 0\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0143221358, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0392221778, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0374579732, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0379164112, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0375955314, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0370839974, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0392615712, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0403793304, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0246163057, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0167370893, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0134258746, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0159333024, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0141584306, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0145051774, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01479025, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0138781526, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0122544446, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0101819682, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.013891508, + 
"runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0156628514, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.014170888, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0145256754, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0148133716, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.013892261, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0123803448, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0098510498, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0134027696, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0159499766, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0141252332, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0145315874, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0148088316, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0138807706, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.012318758, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0102900504, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of the vector\n int local_size = x.size() / size;\n\n // Compute the starting index of the local portion of the vector\n int start_index = rank * local_size;\n\n // Compute the local portion of the vector\n std::vector local_x(x.begin() + start_index, x.begin() + start_index + local_size);\n std::vector local_y(y.begin() + start_index, y.begin() + start_index + local_size);\n\n // Compute the local portion of the result\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * local_x[i] + local_y[i];\n }\n\n // Gather the local portions of the result into the global result\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + 
"best_sequential_runtime": 0.0157294828, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0699693084, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0688107484, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0692827524, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0694507672, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0726430018, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0806526944, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.090206287, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0525885571, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03249697805, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0134326158, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0156414516, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0140959424, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0144194992, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0147135974, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0138859142, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.012274267, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0101512818, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.013457479, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0156067236, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0141233184, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0145588434, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.014798154, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0139331216, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.012448819, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0105285482, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of x and y\n int local_size = x.size() / size;\n\n // Compute the starting index of the local portion of x and y\n int start_index = rank * local_size;\n\n // Compute the local portion of z\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * x[start_index + i] + y[start_index + i];\n }\n\n // Gather the local portions of z into the global vector z on rank 0\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0152018698, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.038896118, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0375324936, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.037940836, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0380179246, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0374262582, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0389128594, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0408180458, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0245494447, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01663811585, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of each block\n int block_size = x.size() / size;\n\n // Compute the starting index of each block\n int start_index = rank * block_size;\n\n // Compute the ending index of each block\n int end_index = start_index + block_size;\n\n // If the rank is the last rank, then the ending index is the size of the vector\n if (rank == size - 1) {\n end_index = x.size();\n }\n\n // Compute the local result\n std::vector local_z(block_size);\n for (int i = start_index; i < end_index; i++) {\n local_z[i - start_index] = alpha * x[i] + y[i];\n }\n\n // Reduce the local results to the global result\n MPI_Allreduce(local_z.data(), z.data(), block_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.015580354, + "runs": [ + 
{ + "did_run": true, + "is_valid": true, + "runtime": 0.038919696, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0388524402, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0390945684, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0388755574, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0387889246, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.038854029, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.038896599, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n z.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0133954168, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0155800832, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.014153353, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0144938254, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0147610146, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.013832535, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.012443407, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0102371036, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of each chunk of data that each rank will process\n int chunk_size = x.size() / size;\n\n // Compute the starting index of the data that each rank will process\n int start_index = rank * chunk_size;\n\n // Compute the ending index of the data that each rank will process\n int end_index = (rank + 1) * chunk_size;\n\n // If the rank is the last rank, then the ending index is the size of the data\n if (rank == size - 1) {\n end_index = x.size();\n }\n\n // Compute the local result for this rank\n std::vector local_result(chunk_size);\n #pragma omp parallel for\n for (int i = start_index; i < end_index; i++) {\n local_result[i - start_index] = alpha * x[i] + y[i];\n }\n\n // Gather the local results from all ranks into the global result on rank 0\n MPI_Gather(local_result.data(), chunk_size, MPI_DOUBLE, z.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + 
"are_all_valid": true, + "best_sequential_runtime": 0.0148310622, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0390131856, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.037766633, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03788972, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0375832874, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0375881286, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0390424016, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0406246076, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0245808556, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01700101785, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0134824104, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0156130398, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0141731404, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0145300598, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0148117362, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.013978439, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0124507566, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0100957144, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0135192012, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0155220814, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0141273602, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.014523722, + 
"num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0147847378, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0141768594, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.012337304, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010073078, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix. x and b have N elements.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(Kokkos::View &A, Kokkos::View &b, Kokkos::View &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::Range(0, N), [=](const int i) {\n double sum = 0;\n for (int j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n KokConfigService config;\n config.set_num_threads(KokConfigService::num_hardware_threads());\n KokConfigService::set_configuration(config);\n KokConfigService::print_configuration();\n\n KokConfigService::set_configuration(config);\n KokConfigService::print_configuration();\n\n KokKok::parallel_for(N, KOK_LAMBDA(size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A[i][j] * x[j]; }\n x[i] = (b[i] - sum) / A[i][i];\n });\n}\n", + "\n\n // Create a Kokkos view for the matrix A.\n Kokkos::View A_view = Kokkos::View(\"A\", N, N);\n Kokkos::deep_copy(A_view, A);\n\n // Create a Kokkos view for the vector b.\n Kokkos::View b_view = Kokkos::View(\"b\", N);\n Kokkos::deep_copy(b_view, b);\n\n // Create a Kokkos view for the vector x.\n Kokkos::View x_view = Kokkos::View(\"x\", N);\n\n // Solve the linear system using the Kokkos parallel backend.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A_view(i, j) * x_view(j);\n }\n x_view(i) = (b_view(i) - sum) / A_view(i, i);\n });\n\n // Copy the solution vector x back to the output view.\n Kokkos::deep_copy(x, x_view);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution.\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define a Kokkos kernel functor to perform the computation.\n Kokkos::KernelFunctor kernelFunctor([&](const int i) {\n // Initialize the solution vector x.\n x(i) = 0.0;\n\n // Perform forward substitution to solve the linear system.\n for (int j = 0; j < i; ++j) {\n x(i) -= A(i, j) * x(j);\n }\n\n // Perform backward substitution to solve the linear system.\n for (int j = i + 1; j < N; ++j) {\n x(i) -= A(i, j) * x(j);\n }\n\n // Divide by the diagonal element to get the final solution.\n x(i) /= A(i, i);\n });\n\n // Execute the kernel functor using the Kokkos team policy.\n Kokkos::parallel_for(policy, kernelFunctor);\n}\n", + "\n\n // 
Create a Kokkos team policy for parallel execution.\n Kokkos::TeamPolicy policy(N);\n\n // Create a Kokkos parallel loop to solve the linear system.\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Solve the ith equation of the linear system.\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int& i) {\n // Initialize the solution vector x\n x[i] = 0.0;\n\n // Compute the dot product of the ith row of A and x\n double dot_product = 0.0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A(i, j) * x[j];\n }\n\n // Update the solution vector x\n x[i] = (b[i] - dot_product) / A(i, i);\n });\n}\n", + "\n\n Kokkos::View A_copy = Kokkos::View(\"A_copy\", N, N);\n Kokkos::deep_copy(A_copy, A);\n Kokkos::View b_copy = Kokkos::View(\"b_copy\", N);\n Kokkos::deep_copy(b_copy, b);\n Kokkos::View x_copy = Kokkos::View(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n\n for (size_t i = 0; i < N; i++) {\n double max_val = std::abs(A_copy(i, i));\n size_t max_idx = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A_copy(j, i)) > max_val) {\n max_val = std::abs(A_copy(j, i));\n max_idx = j;\n }\n }\n if (max_val == 0) {\n std::cout << \"The matrix is singular and cannot be inverted.\" << std::endl;\n return;\n }\n for (size_t j = 0; j < N; j++) {\n std::swap(A_copy(i, j), A_copy(max_idx, j));\n }\n std::swap(b_copy(i), b_copy(max_idx));\n for (size_t j = 0; j < N; j++) {\n if (i != j) {\n double factor = A_copy(j, i) / A_copy(i, i);\n for (size_t k = 0; k < N; k++) {\n A_copy(j, k) -= factor * A_copy(i, k);\n }\n b_copy(j) -= factor * b_copy(i);\n }\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A_copy(i, j) * x_copy(j);\n }\n x_copy(i) = (b_copy(i) - sum) / A_copy(i, i);\n }\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int& i) {\n // Initialize the solution vector x\n x[i] = 0.0;\n\n // Iterate over the rows of the matrix A\n for (size_t j = 0; j < N; ++j) {\n // Compute the dot product of the ith row of A and the solution vector x\n double dot_product = 0.0;\n for (size_t k = 0; k < N; ++k) {\n dot_product += A(i, k) * x[k];\n }\n\n // Update the ith element of the solution vector x\n x[i] = (b[i] - dot_product) / A(i, i);\n }\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to be executed in parallel\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Initialize the solution vector x\n x(i) = 0.0;\n\n // Compute the 
dot product of the ith row of A and x\n for (int j = 0; j < N; ++j) {\n x(i) += A(i, j) * x(j);\n }\n\n // Subtract the dot product from b(i) to get the solution\n x(i) = b(i) - x(i);\n });\n}\n", + "\n\n // Create a copy of A for in-place LU decomposition\n auto A_copy = Kokkos::subview(A, Kokkos::ALL(), Kokkos::ALL(), Kokkos::NO_OFFSET, Kokkos::NO_OFFSET);\n\n // Perform LU decomposition\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n for (size_t j = 0; j < N; ++j) {\n A_copy(i, j) /= A_copy(i, i);\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = 0; j < N; ++j) {\n if (k > i) {\n A_copy(k, j) -= A_copy(k, i) * A_copy(i, j);\n } else if (k < i) {\n A_copy(k, j) -= A_copy(k, i) * A_copy(i, j);\n }\n }\n }\n });\n\n // Solve the system\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n x(i) = b(i) / A_copy(i, i);\n for (size_t j = 0; j < N; ++j) {\n if (j > i) {\n x(j) -= x(i) * A_copy(j, i);\n }\n }\n });\n\n // Backward substitution\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n for (size_t j = 0; j < N; ++j) {\n if (j < i) {\n x(j) -= x(i) * A_copy(i, j);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Initialize the solution vector x\n x(i) = 0.0;\n\n // Compute the dot product of the ith row of A and x\n double dot_product = 0.0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A(i, j) * x(j);\n }\n\n // Update the solution vector x\n x(i) = (b(i) - dot_product) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "_t", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. 
x and b have N elements.\n Use OpenMP to compute in parallel.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n #pragma omp parallel for\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Forward elimination\n for (size_t j = i + 1; j < N; j++) {\n double factor = A_copy[j][i] / A_copy[i][i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j][k] -= factor * A_copy[i][k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Back substitution\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A_copy[i][j] * x[j];\n }\n x[i] /= A_copy[i][i];\n }\n}\n", + "\n\n // Solve the system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n x[N - 1] = b[N - 1] / A[(N - 1) * N + (N - 1)];\n for (int i = N - 2; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Solve the system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double inv_aii = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] *= inv_aii;\n }\n b[i] *= inv_aii;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n x[N - 1] = b[N - 1];\n for (int i = N - 2; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = b[i] - sum;\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Perform Gaussian elimination in parallel\n 
#pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A_local[i * N + i];\n for (size_t j = i; j < N; ++j) {\n A_local[i * N + j] *= factor;\n }\n b_local[i] *= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A_local[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A_local[j * N + k] -= multiplier * A_local[i * N + k];\n }\n b_local[j] -= multiplier * b_local[i];\n }\n }\n\n // Perform back substitution in parallel\n #pragma omp parallel for\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_local[i * N + j] * x[j];\n }\n x[i] = (b_local[i] - sum) / A_local[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = i; j < N; j++) {\n A_copy[i * N + j] *= factor;\n }\n b_copy[i] *= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A_copy[j * N + i];\n for (size_t k = i; k < N; k++) {\n A_copy[j * N + k] -= multiplier * A_copy[i * N + k];\n }\n b_copy[j] -= multiplier * b_copy[i];\n }\n }\n\n // Perform back substitution to solve for x\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (int j = i + 1; j < N; j++) {\n x[i] -= A_copy[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an NxN matrix in row-major. x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a copy of A and b for each thread.\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Solve the linear system for each thread.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Forward elimination.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_local[j][i] / A_local[i][i];\n for (size_t k = 0; k < N; ++k) {\n A_local[j][k] -= factor * A_local[i][k];\n }\n b_local[j] -= factor * b_local[i];\n }\n\n // Back substitution.\n x[i] = b_local[i] / A_local[i][i];\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A_local[i * N + j] * x[j];\n }\n x[i] = (b_local[i] - sum) / A_local[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b 
for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A_copy[i][i];\n for (size_t j = i; j < N; ++j) {\n A_copy[i][j] /= factor;\n }\n b_copy[i] /= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A_copy[j][i];\n for (size_t k = i; k < N; ++k) {\n A_copy[j][k] -= multiplier * A_copy[i][k];\n }\n b_copy[j] -= multiplier * b_copy[i];\n }\n }\n\n // Perform back substitution in parallel\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_copy[i][j] * x[j];\n }\n x[i] = (b_copy[i] - sum) / A_copy[i][i];\n }\n}\n", + "\n\n // Create a copy of A and b to avoid modifying the input\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A_copy[i * N + j] *= factor;\n }\n b_copy[i] *= factor;\n\n // Subtract the ith row from all other rows\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_copy[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A_copy[j * N + k] -= factor * A_copy[i * N + k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Perform back substitution\n x.resize(N);\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_copy[i * N + j] * x[j];\n }\n x[i] = (b_copy[i] - sum) / A_copy[i * N + i];\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an N*N matrix in row-major. 
x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a vector to store the solution.\n x.resize(N);\n\n // Create a vector to store the residual.\n std::vector r(N);\n\n // Initialize the residual.\n for (size_t i = 0; i < N; i++) {\n r[i] = b[i];\n }\n\n // Solve the linear system using the conjugate gradient method.\n for (size_t k = 0; k < N; k++) {\n // Compute the dot product of the residual and the residual.\n double r_dot_r = 0.0;\n #pragma omp parallel for reduction(+:r_dot_r)\n for (size_t i = 0; i < N; i++) {\n r_dot_r += r[i] * r[i];\n }\n\n // Check if the residual is small enough.\n if (r_dot_r < 1e-10) {\n break;\n }\n\n // Compute the dot product of the residual and the matrix-vector product.\n double r_dot_Ax = 0.0;\n #pragma omp parallel for reduction(+:r_dot_Ax)\n for (size_t i = 0; i < N; i++) {\n double tmp = 0.0;\n for (size_t j = 0; j < N; j++) {\n tmp += A[i * N + j] * x[j];\n }\n r_dot_Ax += r[i] * tmp;\n }\n\n // Compute the step size.\n double alpha = r_dot_r / r_dot_Ax;\n\n // Update the solution.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] += alpha * r[i];\n }\n\n // Update the residual.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n double tmp = 0.0;\n for (size_t j = 0; j < N; j++) {\n tmp += A[i * N + j] * x[j];\n }\n r[i] -= alpha * tmp;\n }\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Forward elimination\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_copy[j][i] / A_copy[i][i];\n for (size_t k = 0; k < N; ++k) {\n A_copy[j][k] -= factor * A_copy[i][k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Back substitution\n x.resize(N);\n for (int i = N - 1; i >= 0; --i) {\n x[i] = b_copy[i];\n for (int j = i + 1; j < N; ++j) {\n x[i] -= A_copy[i][j] * x[j];\n }\n x[i] /= A_copy[i][i];\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n #pragma omp parallel for\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel\n {\n // Each thread works on a different row of A\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A_local[i][i];\n for (size_t j = i; j < N; ++j) {\n A_local[i][j] /= factor;\n }\n b_local[i] /= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_local[j][i];\n for (size_t k = i; k < N; ++k) {\n A_local[j][k] -= factor * A_local[i][k];\n }\n b_local[j] -= factor * b_local[i];\n }\n }\n }\n\n // Back substitution to solve for x\n for (int 
i = N - 1; i >= 0; --i) {\n x[i] = b_local[i];\n for (int j = i + 1; j < N; ++j) {\n x[i] -= A_local[i][j] * x[j];\n }\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an NxN matrix in row-major. x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a vector to store the solution.\n x.resize(N);\n\n // Create a temporary vector to store the intermediate results.\n std::vector tmp(N);\n\n // Solve the linear system in parallel.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Compute the ith row of the solution.\n tmp[i] = b[i];\n for (size_t j = 0; j < i; ++j) {\n tmp[i] -= A[i * N + j] * x[j];\n }\n tmp[i] /= A[i * N + i];\n }\n\n // Copy the solution to the output vector.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = tmp[i];\n }\n}\n", + "\n\n // Create OpenMP parallel region\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Initialize x to 0.\n x.assign(N, 0);\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Forward substitution\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < i; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Backward substitution\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] -= sum;\n }\n}\n", + "\n\n // Create a copy of A and b for each thread\n std::vector> A_local(omp_get_num_threads(), std::vector(N * N));\n std::vector> b_local(omp_get_num_threads(), std::vector(N));\n\n // Copy A and b to the local arrays\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n A_local[tid][i * N + j] = A[i * N + j];\n }\n b_local[tid][i] = b[i];\n }\n }\n\n // Solve the linear system for each thread\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n std::vector x_local(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < i; j++) {\n sum += A_local[tid][i * N + j] * x_local[j];\n }\n x_local[i] = (b_local[tid][i] - sum) / A_local[tid][i * N + i];\n }\n\n // Copy the local solution to the global solution\n #pragma omp critical\n {\n for (size_t i = 0; i < N; i++) {\n x[i] += x_local[i];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "serial", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. 
x and b have N elements.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n x.resize(N);\n std::vector y(N);\n for (size_t i = 0; i < N; i++) {\n y[i] = b[i];\n }\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] -= sum;\n x[i] = y[i] / A[i * N + i];\n }\n for (size_t i = N - 1; i > 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] -= sum;\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(Ab[j * (N + 1) + i]) > std::abs(Ab[pivot_row * (N + 1) + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N + 1; ++j) {\n std::swap(Ab[i * (N + 1) + j], Ab[pivot_row * (N + 1) + j]);\n }\n }\n\n // Eliminate the current column below the pivot row\n for (size_t j = i + 1; j < N; ++j) {\n double factor = Ab[j * (N + 1) + i] / Ab[i * (N + 1) + i];\n for (size_t k = 0; k < N + 1; ++k) {\n Ab[j * (N + 1) + k] -= factor * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Check if the system is consistent\n for (size_t i = 0; i < N; ++i) {\n if (std::abs(Ab[i * (N + 1) + N]) < 1e-9) {\n 
throw std::runtime_error(\"The system is inconsistent.\");\n }\n }\n\n // Solve the system by back substitution\n x.resize(N);\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = Ab[i * (N + 1) + N];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= Ab[i * (N + 1) + j] * x[j];\n }\n x[i] /= Ab[i * (N + 1) + i];\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n std::vector a = A;\n std::vector bb = b;\n for (size_t i = 0; i < N; i++) {\n size_t pivot = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(a[j * N + i]) > std::abs(a[pivot * N + i])) {\n pivot = j;\n }\n }\n if (pivot != i) {\n std::swap(a[i * N + i], a[pivot * N + i]);\n std::swap(bb[i], bb[pivot]);\n }\n for (size_t j = i + 1; j < N; j++) {\n double factor = a[j * N + i] / a[i * N + i];\n for (size_t k = i; k < N; k++) {\n a[j * N + k] -= factor * a[i * N + k];\n }\n bb[j] -= factor * bb[i];\n }\n }\n for (size_t i = N - 1; i < N; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += a[i * N + j] * x[j];\n }\n x[i] = (bb[i] - sum) / a[i * N + i];\n }\n}\n", + "\n\n std::vector c(N);\n for (size_t i = 0; i < N; i++) {\n c[i] = b[i];\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double t = A[j][i] / A[i][i];\n for (size_t k = 0; k < N; k++) {\n A[j][k] -= t * A[i][k];\n }\n c[j] -= t * c[i];\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n double t = c[i];\n for (size_t j = i + 1; j < N; j++) {\n t -= A[i][j] * x[j];\n }\n x[i] = t / A[i][i];\n }\n}\n", + "\n\n // Create a copy of A\n std::vector A_copy(A);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A_copy[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A_copy[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j * N + k] -= factor * A_copy[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Perform back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A_copy[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A_copy[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find pivot\n size_t pivot = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot * N + i])) {\n pivot = j;\n }\n }\n // Swap rows\n if (pivot != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot * N + j]);\n }\n std::swap(b[i], b[pivot]);\n }\n // Eliminate\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] 
* x[j];\n }\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector> aug(N, std::vector(N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n aug[i][j] = A[i * N + j];\n }\n aug[i][N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double div = aug[i][i];\n for (size_t j = 0; j <= N; ++j) {\n aug[i][j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows\n for (size_t j = i + 1; j < N; ++j) {\n double mult = aug[j][i];\n for (size_t k = 0; k <= N; ++k) {\n aug[j][k] -= mult * aug[i][k];\n }\n }\n }\n\n // Extract the solution from the augmented matrix\n for (size_t i = 0; i < N; ++i) {\n x[i] = aug[i][N];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gauss-Jordan elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows to zero out the ith column\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; i++) {\n x[i] = Ab[i * (N + 1) + N];\n }\n}\n", + "\n\n std::vector A_copy = A;\n std::vector b_copy = b;\n for (size_t i = 0; i < N; i++) {\n double factor = A_copy[i][i];\n for (size_t j = 0; j < N; j++) {\n A_copy[i][j] /= factor;\n }\n b_copy[i] /= factor;\n for (size_t j = i + 1; j < N; j++) {\n double factor2 = A_copy[j][i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j][k] -= factor2 * A_copy[i][k];\n }\n b_copy[j] -= factor2 * b_copy[i];\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A_copy[i][j] * x[j];\n }\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = b[i] - sum;\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; i++) {\n x[i] = 
Ab[i * (N + 1) + N];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap pivot row with current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector augmentedMatrix(N * (N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n augmentedMatrix[i * (N + 1) + j] = A[i * N + j];\n }\n augmentedMatrix[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = augmentedMatrix[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; ++j) {\n augmentedMatrix[i * (N + 1) + j] /= factor;\n }\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = 0; j < N; ++j) {\n if (i == j) continue;\n double factor = augmentedMatrix[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; ++k) {\n augmentedMatrix[j * (N + 1) + k] -= factor * augmentedMatrix[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; ++i) {\n x[i] = augmentedMatrix[i * (N + 1) + N];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i 
* N + i];\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from all other rows\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution from the augmented matrix\n for (size_t i = 0; i < N; i++) {\n x[i] = Ab[i * (N + 1) + N];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "hip", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\n__global__ void solveLinearSystem(const double *A, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] -= sum * A[row * N + col] / A[col * N + col];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x + hipGridDim_x * hipBlockIdx_y;\n if (tid < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[bid * N + i] * x[i];\n }\n x[tid] = (b[bid] - sum) / A[bid * N + tid];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n x[i] += A[i * N + j] * b[j];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n x[row] = b[row] / A[row * N + col];\n } else {\n x[row] -= A[row * N + col] * x[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n int col = hipBlockIdx_x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n 
sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n }\n __syncthreads();\n for (int k = row + 1; k < N; k++) {\n if (A[k * N + col] != 0) {\n double factor = A[k * N + col] / A[row * N + col];\n for (int j = 0; j < N; j++) {\n A[k * N + j] -= factor * A[row * N + j];\n }\n b[k] -= factor * b[row];\n }\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n }\n __syncthreads();\n for (int i = row + 1; i < N; i++) {\n double factor = A[i * N + col] / A[row * N + col];\n for (int k = 0; k < N; k++) {\n A[i * N + k] -= factor * A[row * N + k];\n }\n b[i] -= factor * b[row];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; ++k) {\n sum += A[row * N + k] * x[k];\n }\n A[row * N + col] -= A[col * N + col] * sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[col] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n x[i] += A[i * N + j] * b[j];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row 
= blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. Store the result in x on rank 0.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the linear system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solutions from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n // Create a matrix object\n Eigen::MatrixXd A_eigen(N, N);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n A_eigen(i, j) = A[i * N + j];\n }\n }\n\n // Create a vector object for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; i++) {\n b_eigen(i) = b[i];\n }\n\n // Solve the system\n Eigen::VectorXd x_eigen = A_eigen.colPivHouseholderQr().solve(b_eigen);\n\n // Copy the solution to x\n for (size_t i = 0; i < N; i++) {\n x[i] = x_eigen(i);\n }\n }\n\n // Broadcast x to all ranks\n MPI_Bcast(x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "_t*", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector local_x(N, 0);\n std::vector local_b(N, 0);\n for (size_t i = 0; i < N; i++) {\n local_b[i] = b[i];\n }\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_x[i] += A[i * N + j] * local_b[j];\n }\n }\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct solution\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves for a portion of x\n size_t chunk_size = (N + size - 
1) / size;\n size_t start_idx = rank * chunk_size;\n size_t end_idx = std::min(start_idx + chunk_size, N);\n\n std::vector local_x(chunk_size);\n for (size_t i = start_idx; i < end_idx; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n local_x[i - start_idx] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solutions from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Rank 0 gathers the solution\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n x.resize(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n }\n\n // Gather the solution from rank 0 to all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Solve the linear system on rank 0\n if (rank == 0) {\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i > 0; --i) {\n for (size_t j = i - 1; j >= 0; --j) {\n b[j] -= A[j * N + i] * b[i];\n }\n }\n\n // Store the solution in x\n x = b;\n }\n\n // Broadcast the solution to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system locally on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the solution\n std::vector local_x(N);\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on rank 0\n if (rank == 0) {\n // Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (int j = i + 1; j < N; j++) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n }\n\n // Broadcast the solution to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Each rank solves a part of the system\n for (size_t i = rank; i < N; i += size) {\n double factor = A[i * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Broadcast the updated row\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, rank, comm);\n MPI_Bcast(&b[i], 1, MPI_DOUBLE, rank, comm);\n\n // Update the remaining rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Gather the solution from all ranks\n MPI_Gather(&b[rank], 1, MPI_DOUBLE, &x[0], 1, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system locally\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions to rank 0\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system on rank 0\n if (rank == 0) {\n // Create a matrix object\n Eigen::MatrixXd A_eigen(N, N);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n 
A_eigen(i, j) = A[i * N + j];\n }\n }\n\n // Create a vector for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; i++) {\n b_eigen(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_eigen = A_eigen.colPivHouseholderQr().solve(b_eigen);\n\n // Copy the result to x\n for (size_t i = 0; i < N; i++) {\n x[i] = x_eigen(i);\n }\n }\n\n // Broadcast the result to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Check if the input is valid\n if (A.size() != N * N || b.size() != N || x.size() != N) {\n throw std::invalid_argument(\"Invalid input\");\n }\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (int j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "cuda", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\n__global__ void solveLinearSystem(const double *A, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + ":", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * 
N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n for (int k = 0; k < N; k++) {\n x[row] -= A[row * N + k] * b[k];\n }\n x[row] /= A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int i = 0; i < col; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n } else if (row > col) {\n double sum = 0;\n for (int i = 0; i < col; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n for (int i = col + 1; i < row; i++) {\n A[row * N + i] -= A[row * N + col] * A[i * N + col] / A[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n } else {\n x[row] -= A[row * N + col] * x[col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n x[row] = b[row] / A[row * N + col];\n }\n for (int i = col + 1; i < N; i++) {\n x[row] -= A[row * N + i] * x[i] / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Each rank solves for a portion of x\n #pragma omp parallel for\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data() + rank * N / size, N / size, MPI_DOUBLE, x.data(), N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Compute the inverse of A on rank 0.\n std::vector invA;\n if (my_rank == 0) {\n invA.resize(N * N);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n invA[i * N + j] = (i == j) ? 
1.0 : 0.0;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n double inv = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n invA[i * N + j] *= inv;\n A[i * N + j] *= inv;\n }\n b[i] *= inv;\n }\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n invA[j * N + k] -= factor * invA[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n for (size_t i = N - 1; i > 0; --i) {\n for (size_t j = i - 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n invA[j * N + k] -= factor * invA[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Broadcast the inverse of A to all ranks.\n MPI_Bcast(invA.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on all ranks.\n x.resize(N);\n for (size_t i = 0; i < N; ++i) {\n x[i] = 0.0;\n for (size_t j = 0; j < N; ++j) {\n x[i] += invA[i * N + j] * b[j];\n }\n }\n\n // Gather the results to rank 0.\n if (my_rank == 0) {\n for (size_t i = 1; i < num_ranks; ++i) {\n std::vector recv_x(N);\n MPI_Recv(recv_x.data(), N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N; ++j) {\n x[j] += recv_x[j];\n }\n }\n } else {\n MPI_Send(x.data(), N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Rank 0 gathers the solution from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + 
"generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the solution\n std::vector local_x(N);\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_x[i] += A[i * N + j] * b[j];\n }\n }\n\n // Gather the local solutions into the global solution\n MPI_Allgather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete solution\n if (rank == 0) {\n for (size_t i = 1; i < size; i++) {\n for (size_t j = 0; j < N; j++) {\n x[j] += local_x[i * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves the system for a subset of the rows\n size_t rowsPerRank = (N + size - 1) / size;\n size_t startRow = rank * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n // Solve the system for the subset of rows\n std::vector localX(endRow - startRow);\n for (size_t i = startRow; i < endRow; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n localX[i - startRow] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(localX.data(), endRow - startRow, MPI_DOUBLE, x.data(), endRow - startRow, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Each rank computes its part of the solution\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n \n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system on each rank\n std::vector local_x(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct result\n if (rank == 
0) {\n x = local_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row.\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row.\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n\n // Eliminate the current column.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n std::vector local_x(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Check if the input is valid\n if (A.size() != N * N || b.size() != N || x.size() != N) {\n throw std::invalid_argument(\"Invalid input\");\n }\n\n // Initialize x to 0\n std::fill(x.begin(), x.end(), 0.0);\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + 
"is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(x.data(), N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the local portion of the solution\n std::vector local_x(N, 0.0);\n for (size_t i = my_rank; i < N; i += num_procs) {\n for (size_t j = 0; j < N; ++j) {\n local_x[i] += A[i * N + j] * b[j];\n }\n }\n\n // Reduce the local solutions to get the global solution\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves for a subset of the rows of x.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Solve for the subset of rows of x on this rank.\n for (size_t i = start_row; i < end_row; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all 
ranks.\n MPI_Gather(x.data() + start_row, end_row - start_row, MPI_DOUBLE,\n x.data(), end_row - start_row, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the matrix\n size_t local_N = (N + size - 1) / size;\n\n // Allocate memory for the local matrix and vector\n std::vector local_A(local_N * N);\n std::vector local_b(local_N);\n\n // Scatter the matrix and vector to the local arrays\n MPI_Scatter(A.data(), N * N, MPI_DOUBLE, local_A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(b.data(), N, MPI_DOUBLE, local_b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the local linear system\n std::vector local_x(local_N);\n for (size_t i = 0; i < local_N; i++) {\n double sum = 0.0;\n for (size_t j = 0; j < N; j++) {\n sum += local_A[i * N + j] * local_x[j];\n }\n local_x[i] = (local_b[i] - sum) / local_A[i * N + i];\n }\n\n // Gather the local solutions to the global vector\n MPI_Gather(local_x.data(), local_N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Normalize the global solution\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n x[i] /= size;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n local_x[i] = b[i];\n for (size_t j = 0; j < i; ++j) {\n local_x[i] -= A[i * N + j] * x[j];\n }\n }\n for (size_t i = N - 1; i >= 0; --i) {\n for (size_t j = i + 1; j < N; ++j) {\n local_x[i] -= A[i * N + j] * local_x[j];\n }\n local_x[i] /= A[i * N + i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the linear system in parallel\n #pragma omp parallel\n {\n // Each thread solves a part of the system\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + 
"did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row.\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row.\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n\n // Eliminate the current column.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(Kokkos::View &A, Kokkos::View &B, Kokkos::View &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; ++j) {\n double sum = 0.0;\n for (int k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", 
Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k) * B(k, j);\n }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "omp", + "prompt": 
"#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma opm parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (int i = 0; i < 
M; ++i) {\n for (int j = 0; j < N; ++j) {\n C[i * N + j] = 0;\n for (int k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n C.resize(M * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n C.resize(M * N);\n omp_set_num_threads(omp_get_max_threads());\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "serial", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. 
The matrices are stored in row-major.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n C[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A[i * K + k] * B[k * N + j]; }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 
0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "hip", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an MxN grid of threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\n__global__ void gemm(const double *A, const double *B, double *C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) { sum += A[row * K + k] * B[k * N + col]; }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n size_t row = blockIdx.x;\n size_t col = blockIdx.y;\n C[row * N + col] = 0;\n for (size_t k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = 
blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n C[row * N + col] = 0;\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0.0;\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n size_t col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and B. 
Store the result in C on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the local portion of C.\n std::vector local_C(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to the global C on rank 0.\n MPI_Reduce(local_C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of processes and the rank of the current process\n int num_procs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the number of rows each process will compute\n int rows_per_proc = M / num_procs;\n\n // Compute the starting row for each process\n int start_row = rank * rows_per_proc;\n\n // Compute the ending row for each process\n int end_row = start_row + rows_per_proc;\n\n // Initialize the result matrix on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n\n // Compute the partial result on each process\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n double dot_product = 0;\n for (int k = 0; k < K; ++k) {\n dot_product += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = dot_product;\n }\n }\n\n // Gather the partial results on rank 0\n if (rank != 0) {\n MPI_Gather(&C[start_row * N], rows_per_proc * N, MPI_DOUBLE, &C[0], rows_per_proc * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast M, K, and N to all ranks.\n MPI_Bcast(&M, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n MPI_Bcast(&K, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n MPI_Bcast(&N, 1, MPI_UNSIGNED_LONG, 0, 
MPI_COMM_WORLD);\n\n // Compute the local portion of C.\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Reduce the local portions of C to rank 0.\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast A and B to all ranks\n MPI_Bcast(&A[0], M * K, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&B[0], K * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute C on each rank\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Gather C from all ranks to rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local part of C\n std::vector local_C(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n MPI_Gather(local_C.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_C.data(), M * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and the number of processes\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the local size of the matrix C\n size_t local_M = M / num_procs;\n if (rank == num_procs - 1) {\n local_M += M % num_procs;\n }\n\n // Compute the local part of the matrix C\n std::vector local_C(local_M * N);\n for (size_t i = 0; i < local_M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N + j] = sum;\n }\n }\n\n // Gather the local parts of the matrix C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 0; i < num_procs; ++i) {\n size_t local_M = M / num_procs;\n if (i == num_procs - 1) {\n local_M += M % num_procs;\n }\n MPI_Recv(&C[i * local_M * N], local_M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_C[0], local_M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the local portion of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to the root rank.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N, 0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * 
B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(M * N);\n MPI_Recv(recv_buf.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n C[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(C.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C\n std::vector localC(M * N, 0);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Each rank has a complete copy of A and B.\n // Store the result in C on rank 0.\n if (rank == 0) {\n C.clear();\n C.reserve(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C.push_back(sum);\n }\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result matrix\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local results into the global result matrix\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the matrix C\n size_t local_M = M / size;\n size_t local_N = N;\n\n // Compute the local part of the matrix C\n std::vector local_C(local_M * local_N);\n for (size_t i = 0; i < local_M; ++i) {\n for (size_t j = 0; j < local_N; ++j) 
{\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * local_N + j] = sum;\n }\n }\n\n // Gather the local parts of the matrix C into the global matrix C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(local_M * local_N);\n MPI_Recv(recv_buf.data(), local_M * local_N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < local_M * local_N; ++j) {\n C[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(local_C.data(), local_M * local_N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and the number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N / size);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N / size + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "cuda", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an MxN grid of threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\n__global__ void gemm(const double *A, const double *B, double *C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t row = blockIdx.x;\n size_t col = blockIdx.y;\n size_t i = threadIdx.x;\n size_t j = threadIdx.y;\n C[row * N + col] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * 
N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n double sum = 0;\n for (k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "(", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.y * blockDim.y + threadIdx.y;\n int j = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M && j < N) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and B. 
Store the result in C on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C to rank 0\n MPI_Gather(localC.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0056534664, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0356383306, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0180334884, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0092007134, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004547121, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0024188116, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012544152, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007132846, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Initialize C to zero\n C.assign(M * N, 0.0);\n\n // Compute the product of A and B in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(C.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0035905882, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0304796922, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0153212342, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008173993, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0040559954, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023097092, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011017388, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006227962, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + 
"runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the product of A and B\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0035561276, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0306755302, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0180238182, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077254936, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038941198, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002155163, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011619458, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005813148, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.005645373, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0355790032, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.017869977, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0089746292, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045693602, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0024088852, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001393677, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006936156, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n // Initialize C to zero\n C.assign(M * N, 0.0);\n\n // Compute C using MPI and OpenMP\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; 
++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce C to rank 0\n MPI_Reduce(C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the local part of C\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0056511636, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0292107592, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0171334334, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0073762464, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0041669018, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019291632, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009481744, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005065846, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.003585555, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0355179434, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0178094192, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.009131978, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004767091, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023915608, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012085132, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.0007132844, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the partial sum for each element of C\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the partial sums from all ranks to rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0035526588, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0316276768, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0178099604, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077344842, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038882544, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021592846, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010971122, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006362016, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Initialize C to 0\n C.resize(M * N);\n std::fill(C.begin(), C.end(), 0);\n\n // Compute the local part of C\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0056456892, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0293085944, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.014826335, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0073636786, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037101324, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019337618, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009666314, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005776596, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0056444192, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0314655186, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0160832358, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0083468678, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0040799402, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020562598, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010956814, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007026124, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0056420728, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0356759636, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0356439374, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0355874576, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0355155706, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0356881206, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0355979472, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0357274426, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 
0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0056408128, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.035649488, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0356408402, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.035860474, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.035709192, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0356632634, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0356056598, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0360362618, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0056433976, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.035589279, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0357564366, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0356177404, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0355565054, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0355830414, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0355763746, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0359507408, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n 
localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; i++) {\n std::vector recvC(M * N / size);\n MPI_Recv(recvC.data(), M * N / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N / size; j++) {\n C[j] += recvC[j];\n }\n }\n } else {\n MPI_Send(localC.data(), M * N / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; ++i) {\n std::vector recvC(M * N / size);\n MPI_Recv(recvC.data(), M * N / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n std::copy(recvC.begin(), recvC.end(), C.begin() + i * M * N / size);\n }\n } else {\n MPI_Send(localC.data(), M * N / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n // Initialize C to 0\n C.assign(M * N, 0.0);\n\n // Perform the matrix multiplication in parallel\n #pragma opm parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035465676, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0307214144, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0307343944, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0323795334, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0359806534, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0305745046, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0309287254, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0317177838, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0312654026, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0314874641, 
+ "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result matrix\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local results into the global result matrix\n MPI_Gather(&local_C[0], M * N / size, MPI_DOUBLE, &C[0], M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0056454514, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.035593523, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0178462092, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0089864656, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00455425, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023900458, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012099742, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006900504, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot_product = 0.0;\n for (size_t k = 0; k < K; ++k) {\n dot_product += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = dot_product;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0056436882, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0321726156, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0161701042, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0079483014, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0040093758, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022678744, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001230306, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006983942, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + 
"is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Compute the local portion of C\n std::vector local_C(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to rank 0\n MPI_Reduce(local_C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0035529274, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.035494341, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0181503734, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0090850604, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045381496, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023796284, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012087798, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000696729, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0056418102, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0337332852, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0158472664, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007918233, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0040194528, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020478278, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010752312, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006233234, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "kokkos", + 
"prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix, x has N elements, and y has M elements.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(Kokkos::View &A, Kokkos::View &x, Kokkos::View &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) { y[i] += A(i, j) * x[j]; }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M),\n KOKKOS_LAMBDA(const size_t i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "mv", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n double dot = 0.0;\n for (size_t j = 0; j < N; ++j) { dot += A(i, j) * x(j); }\n y(i) = dot;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA (const int i) {\n y(i) = 0;\n for (int j = 0; j < N; ++j) {\n y(i) += A(i, j) * x(j);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), 
KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma openmp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n // Initialize y to 0\n y.assign(M, 0);\n\n // Compute the matrix-vector product in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double 
dot_product = 0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A[i * N + j] * x[j];\n }\n y[i] = dot_product;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0.0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "serial", + "prompt": "/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) { dot += A[i * N + j] * x[j]; }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) {\n dot += A[i * N + j] * x[j];\n }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < 
N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) {\n dot += A[i * N + j] * x[j];\n }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "hip", + "prompt": "/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use AMD HIP to compute in parallel. The kernel is launched with at least M threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\n__global__ void gemv(const double *A, const double *x, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int block_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = bid * block_size + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int i = tid / N;\n int j = tid % N;\n if (i < M && j < N) {\n y[i] += A[i * N + j] * x[j];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = bid * blockDim.x + tid;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; ++col) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) {\n y[tid] += A[tid * N + j] * x[j];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = 
blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; ++col) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0.0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n int i = tid + bid * blockSize;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) { sum += A[i * N + j] * x[j]; }\n y[i] = sum;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n\n double sum = 0;\n for (int i = tid; i < M; i += blockSize) {\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n }\n y[tid] = sum;\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n\n y[tid] = 0;\n for (int j = 0; j < N; j++) {\n y[tid] += A[tid + bid * blockSize + j * M] * x[j];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and x. 
Store the result in y on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector multiplication\n std::vector local_y(M / size);\n for (size_t i = rank * (M / size); i < (rank + 1) * (M / size); ++i) {\n local_y[i - rank * (M / size)] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i - rank * (M / size)] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the local results into the global vector y on rank 0\n MPI_Gather(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial dot product on each rank\n std::vector partial_y(M, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n partial_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the partial dot products to get the final result\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank != 0) {\n y.clear();\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product.\n std::vector local_y(M);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the local results into the global vector y on rank 0.\n MPI_Gather(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M / size);\n for (size_t i = 0; i < M / size; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for 
(size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Each rank computes a partial sum of the dot product\n std::vector partial_sums(M, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n partial_sums[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sums(M, 0.0);\n MPI_Reduce(partial_sums.data(), global_sums.data(), M, MPI_DOUBLE, MPI_SUM, 0, comm);\n\n // Store the result in y on rank 0\n if (rank == 0) {\n y = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n y = local_y;\n for (int i = 1; i < size; ++i) {\n std::vector recv_y(M);\n MPI_Recv(recv_y.data(), M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M; ++j) {\n y[j] += recv_y[j];\n }\n }\n } else {\n MPI_Send(local_y.data(), M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, 
y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) { local_y[i] += A[i * N + j] * x[j]; }\n }\n\n MPI_Reduce(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the dot product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local dot products into the global vector y on rank 0\n MPI_Reduce(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "cuda", + "prompt": "/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least M threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\n__global__ void gemv(const double *A, const double *x, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n int tid = block.thread.x;\n int row = block.block.x;\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) {\n y[row] += A[row * N + col] * x[col];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int i = 0; i < N; i++) {\n y[tid] += A[tid * N + i] * x[i];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n for (int i = bid * bid_size + tid; i < M; i += bid_size) {\n double sum = 0;\n for (int j = 0; j < N; j++) { sum += A[i * N + j] * x[j]; }\n y[i] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n y[row] = 0;\n for (int k = 0; k < N; k++) {\n y[row] += A[row * N + k] * x[k];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n y[row] = 0;\n for (int i = 0; i < N; i++) {\n y[row] += A[row * N + i] * x[i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n\n double sum = 0;\n\n for (int i = bid * bid_size + tid; i < M; i += grid_size * bid_size) {\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n }\n\n y[tid] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * 
x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = bid * blockDim.x + tid;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and x. Store the result in y on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0319749228, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319235322, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0161240958, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085139328, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0072284728, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0069299968, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0071749444, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061344262, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0319055722, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0320002784, + "num_procs": 1, + "num_threads": 
1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319928222, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0320023726, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.032019933, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0320256016, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0320104972, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0320436998, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n\n // Gather the results on rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0318947124, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0318310728, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0160660268, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008428218, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0070407388, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0069453964, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0071403936, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0063183496, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0319739566, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0318925678, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0161360182, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008448137, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007065229, + "num_procs": 1, + "num_threads": 8 
+ }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006982187, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0070147042, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0062187668, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M, 0.0);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0318798924, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0320417984, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0320339332, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0320370174, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319997592, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.032010936, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.031978293, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.032027303, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0160268841, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0080552883, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0319837252, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0318830582, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319398932, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.031925045, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0318777962, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0318695808, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319641408, + 
"num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319484408, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0319658014, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319735904, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0161532426, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0084450394, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00705726, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0068997062, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0070067212, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0060534786, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root rank\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0320125936, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319445736, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0320188408, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0318767144, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.031884333, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.031874877, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0318799742, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.031941618, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + 
{ + "generated_output": "\n\n // Compute local dot products\n std::vector local_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce local dot products to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0318712982, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319261072, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0161249212, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008515349, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0071863802, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0070039154, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007027592, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0064258914, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0319798198, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319614956, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.016120619, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0084584424, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0070457842, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0071292448, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0071833644, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0062294928, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", 
+ "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.032018043, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0318809884, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.031890522, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0318853904, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0318890212, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319169374, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319250552, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319020238, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0320155028, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319470438, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0318745802, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.031891672, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0318837492, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0318840514, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0318755058, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319136436, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0319914436, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0318801226, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.0318793388, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319476794, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319104432, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.031900571, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319083636, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0318936982, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0319689972, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319367086, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0162796532, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0084310308, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007078796, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0069989424, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0070378614, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0059429212, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0319771684, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319160878, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.016139937, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0084900618, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0072346846, + "num_procs": 1, + 
"num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006961536, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0071206848, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061701434, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n\n // Gather the results on rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0318828056, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.031817144, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0160759168, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0083707342, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007064421, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0068928032, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0070676012, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0059802354, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0320123688, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0318846866, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319250206, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319008692, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.031894884, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0318743854, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319098406, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0318828434, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + 
] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0318902214, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0320354698, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.032028701, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319697588, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0320107196, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0320349748, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0321656358, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319980374, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0160296041, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0080334244, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Initialize y to 0 on all ranks\n y.assign(M, 0.0);\n\n // Compute the local portion of y on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0318997014, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319027112, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0160873906, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085093036, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0070938764, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006942302, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0071051918, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0064857092, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n 
local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0319753104, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319193518, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0161861982, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0083871572, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0071039996, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0069330274, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0070117186, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0060977342, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A(k, j) -= A(k, i) * A(i, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) -= A(j, i) * A(i, i);\n }\n });\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n // Iterate over the columns of the matrix\n for (size_t j = 0; j < N; ++j) {\n // Divide the elements of the current column by the diagonal element\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n A(i, j) /= A(j, j);\n });\n\n // Subtract the current column from the remaining columns\n for (size_t k = j + 1; k < N; ++k) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n A(i, k) -= A(i, j) * A(j, k);\n });\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A(i, j) /= A(i, i);\n } else if (i > j) {\n A(i, j) -= A(i, j) * A(j, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) 
{\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = 0; k < i; k++) {\n A(j, k) -= multiplier * A(i, k);\n }\n A(j, i) = multiplier;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; ++k) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j][i] /= A[i][i];\n }\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::RangePolicy(i + 1, N),\n KOK_LAMBDA(const Kok\u30b3\u30af::IndexType &j) {\n for (size_t k = 0; k < i; k++) {\n A[j][i] -= A[j][k] * A[i][k];\n }\n });\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) = A(j, i) / A(i, i);\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A(k, j) = A(k, j) - A(k, i) * A(i, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n double m = A(j, i) / A(i, i);\n for (size_t k = i; k < N; k++) {\n A(j, k) -= m * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; ++k) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k 
= i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += A(i, k) * A(k, j);\n }\n A(i, j) -= sum;\n } else if (i > j) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A(i, k) * A(k, j);\n }\n A(i, j) = (A(i, j) - sum) / A(j, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; k++) {\n // Normalize the kth row\n double norm = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k; j < N; j++) {\n A[k * N + j] /= norm;\n }\n\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k];\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; ++k) {\n // Normalize the k-th row\n double norm = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k; j < N; ++j) {\n A[k * N + j] /= norm;\n }\n\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n double factor = A[i * N + k];\n for (size_t j = k; j < N; ++j) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n double factor = A[i * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n for (size_t j = i + 1; j 
< N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= A[i * N + k] * factor;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n // Divide the current row by the diagonal element\n double factor = A[i * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n\n // Subtract multiples of the current row from all the rows below it\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + ".", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n#pragma omp parallel for\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = i; j < N; j++) {\n#pragma omp parallel for\n for (size_t k = 0; k < i; k++) {\n A[j * N + i] -= A[j * N + k] * A[k * N + i];\n }\n A[j * N + i] /= A[i * N + i];\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; ++k) {\n // Compute the multiplier for row k\n double multiplier = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k + 1; j < N; ++j) {\n A[j * N + k] /= multiplier;\n }\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n 
}\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "serial", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + i] * A[j * N + j];\n } else {\n A[i * N + j] /= A[j * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; ++k) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] = A[j * N + i] / A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] = A[j * N + k] - A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + j];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + j] * A[j * N + j];\n } else {\n A[i * N + j] /= A[j * N + 
j];\n A[i * N + j] *= -1;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i * N + k] * A[k * N + j];\n }\n A[i * N + j] = (A[i * N + j] - sum) / A[j * N + j];\n }\n for (size_t j = i; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += A[i * N + k] * A[k * N + j];\n }\n A[i * N + j] = A[i * N + j] - sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A[j][i] / A[i][i];\n for (size_t k = i; k < N; k++) {\n A[j][k] -= multiplier * A[i][k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1;\n } else if (i > j) {\n A[i * N + j] = 0;\n }\n }\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "hip", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\n__global__ void luFactorize(double *A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x + blockIdx.y * gridDim.x;\n int col = threadIdx.x + threadIdx.y * blockDim.x;\n if (row < N && col < N) {\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double temp = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= temp * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = A[row * N + col] - sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x + blockIdx.y * gridDim.x;\n int col = threadIdx.x + threadIdx.y * blockDim.x;\n if (row < col) {\n A[row * N + col] = 0.0;\n } else if (row == col) {\n A[row * N + col] = 1.0;\n } else {\n A[row * N + col] = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= A[row * N + col] * A[col * N + k];\n }\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * gridDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.x;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double temp = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= temp * A[j * N + k];\n }\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double temp = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= temp * 
A[j * N + k];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[col * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[col * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int tid = blockIdx.x + blockIdx.y * gridDim.x;\n int i = tid / N;\n int j = tid % N;\n if (i < j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; ++k) { A[k * N + j] -= A[k * N + i] * A[i * N + j]; }\n } else if (i == j) {\n A[i * N + j] = 1;\n for (int k = i + 1; k < N; ++k) { A[k * N + j] /= A[i * N + i]; }\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n\n if (row == col) {\n double pivot = A[row * N + col];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] /= pivot;\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col + 1; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int k = row + 1; k < N; k++) {\n A[k * N + col] -= A[k * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; 
k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double factor = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= factor * A[j * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. Store the result in A on rank 0.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Forward elimination\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row to all other ranks\n MPI_Bcast(&A[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row below the k-th row\n for (size_t i = k + 1; i < N; i++) {\n // Compute the multiplier\n double multiplier = A[i * N + k] / A[k * N + k];\n\n // Subtract the k-th row from the i-th row\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= multiplier * A[k * N + j];\n }\n }\n }\n\n // Gather the result to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // For each column j in row i\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in the lower triangular matrix L\n if (i > j) {\n // Subtract the product of the ith row of L and the jth column of U from the ith row of A\n for (size_t k = 0; k < j; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n // If i <= j, then A[i][j] is in the upper triangular matrix U\n else {\n // Divide the ith row of A by the jth column of L\n for (size_t k = 0; k < j; k++) {\n A[i * N + j] /= A[j * N + k];\n }\n\n // Subtract the product of the ith row of L and the jth column of U from the ith row of A\n for (size_t k = j + 1; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations to eliminate elements below the diagonal\n for (size_t j = i + 1; j < N; j++) {\n double factor 
= A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // For each column j\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in the lower triangular matrix L\n if (i > j) {\n // Compute A[i][j] as A[i][j] - A[i][0] * A[0][j]\n A[i * N + j] -= A[i * N] * A[j];\n }\n // Otherwise, A[i][j] is in the upper triangular matrix U\n else {\n // Compute A[i][j] as A[i][j] - A[i][0] * A[0][j] / A[0][0]\n A[i * N + j] -= A[i * N] * A[j] / A[0];\n }\n }\n\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N + i], 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // For each column j\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in L\n if (i > j) {\n // Compute A[i][j]\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i][k] * A[k][j];\n }\n A[i][j] = (A[i][j] - sum) / A[j][j];\n }\n // If i <= j, then A[i][j] is in U\n else {\n // Compute A[i][j]\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i][k] * A[k][j];\n }\n A[i][j] = A[i][j] - sum;\n }\n }\n\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * 
N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization on rank 0\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n\n // Gather the result from rank 0 to all other ranks\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Normalize the k-th row of A\n double pivot = A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[k * N + j] /= pivot;\n }\n\n // Update the remaining rows of A\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all ranks\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix A from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Forward elimination\n for (size_t i = 0; i < N; i++) {\n // Broadcast the pivot row to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all other ranks\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 
0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // Rank 0 broadcasts the ith row of L to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each column j\n for (size_t j = i; j < N; j++) {\n // Compute A[i][j] -= A[i][k] * A[k][j] for k < i\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n\n // Divide A[i][j] by A[i][i]\n A[i * N + j] /= A[i * N + i];\n }\n }\n\n // Gather the result from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + j];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + j] * A[j * N + j];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i in A\n for (size_t i = 1; i < N; i++) {\n // If rank 0, compute the multiplier for row i\n double multiplier = 0;\n if (rank == 0) {\n multiplier = A[i * N] / A[0];\n }\n\n // Broadcast the multiplier to all ranks\n MPI_Bcast(&multiplier, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each element in row i, subtract the corresponding element in row 0 multiplied by the multiplier\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] -= multiplier * A[j];\n }\n }\n\n // If rank 0, divide each element in row 0 by A[0][0]\n if (rank == 0) {\n for (size_t j = 0; j < N; j++) {\n A[j] /= A[0];\n }\n }\n\n // For each row i in A\n for (size_t i = 1; i < N; i++) {\n // For each element in row i, divide by A[i][i]\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n\n // For each row k after row i, subtract the corresponding element in row i multiplied by A[k][i]\n for (size_t k = i + 1; k < N; k++) {\n double multiplier = A[k * N + i];\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= multiplier * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(A.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Each rank computes its own row of L and U\n for (size_t i = 1; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A[i * N + j] /= A[j * N + j];\n }\n for (size_t j = i; j < N; j++) {\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks into rank 0\n MPI_Gather(A.data(), N * N, MPI_DOUBLE, A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "cuda", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. 
\n A is an NxN matrix stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\n__global__ void luFactorize(double *A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double temp = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= temp * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row < col) {\n A[row * N + col] = A[row * N + col] / A[row * N + row];\n } else if (row == col) {\n for (int k = 0; k < row; k++) {\n A[row * N + col] -= A[row * N + k] * A[k * N + col];\n }\n } else {\n for (int k = 0; k < col; k++) {\n A[row * N + col] -= A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] /= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i == j) {\n A[i * N + i] /= A[i * N + j];\n for (int k = i + 
1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n for (int k = j; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "(", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) return;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else if (row > col) {\n double factor = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= factor * A[col * N + k];\n }\n A[row * N + col] = factor;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else {\n double factor = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= factor * A[col * N + k];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == 
col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col + 1; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
Store the result in A on rank 0.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "(5.", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n // Divide the ith row by A[i][i]\n double div = A[i * N + i];\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= div;\n }\n\n // Subtract the ith row from all rows below it\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n }\n\n // Broadcast the result to all ranks\n MPI_Bcast(A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization using Gaussian elimination\n for (size_t k = 0; k < N; k++) {\n // Pivot if necessary\n size_t max_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > std::abs(A[max_row * N + k])) {\n max_row = i;\n }\n }\n if (max_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[max_row * N + j]);\n }\n }\n\n // Eliminate the k-th column below the k-th row\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&A[i * N * N], N * N, MPI::DOUBLE, i, 0);\n }\n } else {\n MPI::COMM_WORLD.Send(&A[0], N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // MPI rank\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // MPI size\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Number of rows per rank\n int rows_per_rank = N / size;\n\n // Local matrix\n std::vector local_A(rows_per_rank * N);\n\n // Scatter the matrix to all ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Local lu factorization\n for (size_t i = 0; i < rows_per_rank; i++) {\n for (size_t j = i + 1; j < N; j++) {\n local_A[i * N + j] /= local_A[i * N + i];\n 
}\n for (size_t k = i + 1; k < rows_per_rank; k++) {\n for (size_t j = i + 1; j < N; j++) {\n local_A[k * N + j] -= local_A[k * N + i] * local_A[i * N + j];\n }\n }\n }\n\n // Gather the results back to rank 0\n MPI_Gather(&local_A[0], rows_per_rank * N, MPI_DOUBLE, &A[0], rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= multiplier * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the rank and number of threads\n int rank = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n // Compute the block size for each thread\n size_t block_size = N / num_threads;\n\n // Each thread factorizes its own block of rows\n for (size_t i = rank * block_size; i < (rank + 1) * block_size; i++) {\n // Pivot 
row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap pivot row with current row\n if (pivot_row != i) {\n for (size_t k = 0; k < N; k++) {\n std::swap(A[i * N + k], A[pivot_row * N + k]);\n }\n }\n\n // Eliminate lower triangular elements\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n }\n\n // Gather the factorized matrix from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix A from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Divide the current row by the pivot element\n double pivot = A[i * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= pivot;\n }\n\n // Subtract multiples of the current row from the remaining rows\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (MPI_Comm_rank(MPI_COMM_WORLD) == 0) {\n for (int i = 1; i < MPI_Comm_size(MPI_COMM_WORLD); i++) {\n MPI_Recv(&A[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the rank and number of threads\n int rank = 
omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n // Compute the block size for each thread\n size_t block_size = N / num_threads;\n\n // Each thread factorizes its own block of the matrix\n for (size_t i = rank * block_size; i < (rank + 1) * block_size; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n }\n\n // Gather the results from all ranks and store them in A on rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform Gaussian elimination\n for (size_t k = 0; k < N; k++) {\n // Pivot row\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > std::abs(A[pivot_row * N + k])) {\n pivot_row = i;\n }\n }\n\n // Swap pivot row with row k\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n }\n }\n\n // Eliminate non-zero elements below pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Get the number of processes\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes a part of the matrix\n size_t chunk_size = N / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = start_row + chunk_size;\n\n // Perform LU factorization on the assigned part of the matrix\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < i; j++) {\n double multiplier = A[i][j] / A[j][j];\n for (size_t k = j; k < N; k++) {\n A[i][k] -= multiplier * A[j][k];\n }\n }\n for (size_t j = i; j < N; j++) {\n A[i][j] /= A[i][i];\n }\n }\n\n // Gather the results from all processes\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * chunk_size * N], chunk_size * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[start_row * N], chunk_size * 
N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows per process\n int rowsPerProcess = N / size;\n\n // Compute the starting row index for this process\n int startRow = rank * rowsPerProcess;\n\n // Compute the ending row index for this process\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Factorize the submatrix of A owned by this process\n for (int i = startRow; i < endRow; ++i) {\n // Compute the pivot element\n double pivot = A[i][i];\n\n // Broadcast the pivot element to all other processes\n MPI_Bcast(&pivot, 1, MPI_DOUBLE, rank, MPI_COMM_WORLD);\n\n // Eliminate the elements below the pivot element\n for (int j = i + 1; j < N; ++j) {\n A[j][i] -= pivot * A[j][i] / A[i][i];\n }\n }\n\n // Reduce the matrix A to upper triangular form\n for (int k = 0; k < N - 1; ++k) {\n // Compute the pivot element\n double pivot = A[k][k];\n\n // Broadcast the pivot element to all other processes\n MPI_Bcast(&pivot, 1, MPI_DOUBLE, rank, MPI_COMM_WORLD);\n\n // Eliminate the elements above the pivot element\n for (int i = 0; i < k; ++i) {\n A[i][k] -= pivot * A[i][k] / A[k][k];\n }\n }\n\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Broadcast the pivot row to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all ranks\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 performs the row reduction on the ith row\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank != 0) {\n 
MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0985731416, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1244887836, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1245816196, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1248068562, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.124199469, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1247219968, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1247814626, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.12472356, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int myRank, numProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numProcs);\n\n // Each rank computes its own part of the matrix\n #pragma omp parallel for\n for (size_t i = myRank; i < N; i += numProcs) {\n for (size_t j = 0; j < i; j++) {\n A[i * N + j] /= A[j * N + j];\n for (size_t k = j + 1; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n }\n\n // Gather the results to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1005771754, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1217693996, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize L and U\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU 
factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n double max = std::abs(A[k * N + k]);\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > max) {\n max = std::abs(A[i * N + k]);\n pivot_row = i;\n }\n }\n\n // Swap the pivot row with the k-th row\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n }\n }\n\n // Divide the pivot row by the pivot element\n double pivot = A[k * N + k];\n for (size_t j = 0; j < N; j++) {\n A[k * N + j] /= pivot;\n }\n\n // Subtract multiples of the pivot row from the other rows\n for (size_t i = 0; i < N; i++) {\n if (i != k) {\n double factor = A[i * N + k];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n }\n\n // Gather the result to rank 0\n if (rank != 0) {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(N * N);\n MPI_Recv(&recv_buf[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n A[j] += recv_buf[j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Copy the matrix A to rank 0\n MPI_Allgather(MPI_IN_PLACE, 0, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_WORLD);\n\n // Perform LU factorization on rank 0\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] -= A[i * N + i] * A[j * N + i];\n }\n\n for (size_t k = i + 1; k < N; k++) {\n for (size_t l = i + 1; l < N; l++) {\n A[k * N + l] -= A[k * N + i] * A[l * N + i];\n }\n }\n }\n\n // Scatter the factorized matrix A from rank 0 to all other processes\n MPI_Scatter(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n for (size_t k = 0; k < N; ++k) {\n // MPI_Bcast the k-th row of A to all ranks.\n MPI_Bcast(&A[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Update the k-th row of A on all ranks.\n for (size_t j = k + 1; j < N; ++j) {\n A[k * N + j] /= A[k * N + k];\n }\n\n // Update the remaining rows of A on all ranks.\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n\n // MPI_Reduce the updated A from all ranks to rank 0.\n MPI_Reduce(&A[0], &A[0], N * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* For each value in the array x compute 
its index in the sorted vector.\n Store the results in `ranks`.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(Kokkos::View const& x, Kokkos::View &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n auto rank = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n auto sorted_x = x;\n Kokkos::sort(sorted_x.data(), sorted_x.extent(0));\n ranks(i) = Kokkos::find(sorted_x, x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::find(sorted_x, x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n using policy_t = Kokkos::RangePolicy;\n Kokkos::parallel_for(policy_t(x.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n auto rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using rank_functor = Kokkos::Functor>;\n\n Kokkos::parallel_for(\n \"ranks\", Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = Kokkos::rank(x, x(i), Kokkos::greater());\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if 
(x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) > x(j)) { ranks(i) += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) { ranks(i) += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) {\n ranks(i) += 1;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x);\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n std::vector sorted_x(x.data(), x.data() + x.extent(0));\n std::sort(sorted_x.begin(), sorted_x.end());\n ranks(i) = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x(i)));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = x;\n std::sort(sorted_x.data(), sorted_x.data() + sorted_x.extent(0));\n ranks(i) = std::distance(sorted_x.data(),\n std::lower_bound(sorted_x.data(), sorted_x.data() + sorted_x.extent(0), x(i)));\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "omp", + "prompt": "#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs;\n pairs.reserve(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs.push_back(std::make_pair(x[i], i));\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j] < x[i]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n for (size_t j = 0; j < n; ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> y(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n y[i] = std::make_pair(x[i], i);\n }\n std::sort(y.begin(), y.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[y[i].second] = i;\n }\n}\n", + "\n\n // Create a copy of the input vector\n std::vector sorted_x = x;\n\n // Sort the copy\n std::sort(sorted_x.begin(), 
sorted_x.end());\n\n // Compute the ranks in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "(", + "\n\n size_t n = x.size();\n std::vector> pairs(n);\n for (size_t i = 0; i < n; ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n ranks.resize(n);\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "ranks", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n std::vector idx(n);\n std::iota(idx.begin(), idx.end(), 0);\n std::sort(idx.begin(), idx.end(), [&x](size_t i1, size_t i2) { return x[i1] < x[i2]; });\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n ranks[i] = idx[i];\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] == x[j]) {\n ranks[i] = j;\n break;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector indices(x.size());\n std::iota(indices.begin(), indices.end(), 0);\n std::sort(indices.begin(), indices.end(), [&x](size_t i1, size_t i2) { return x[i1] < x[i2]; });\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = indices[i];\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "serial", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n // Sort the vector x\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Create a 
map from the sorted values to their original indices\n std::map sorted_index_map;\n for (size_t i = 0; i < sorted_x.size(); ++i) {\n sorted_index_map[sorted_x[i]] = i;\n }\n\n // Iterate over the original vector and store the rank of each value\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = sorted_index_map[x[i]];\n }\n}\n", + "\n\n std::map value_to_rank;\n for (size_t i = 0; i < x.size(); ++i) {\n auto it = value_to_rank.find(x[i]);\n if (it == value_to_rank.end()) {\n value_to_rank[x[i]] = i;\n }\n }\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = value_to_rank[x[i]] + 1;\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks.push_back(std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i])));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(), std::find(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(), std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector x_sorted(x);\n std::sort(x_sorted.begin(), x_sorted.end());\n for (size_t i = 0; i < x.size(); 
++i) {\n ranks[i] = std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]) - x_sorted.begin();\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n std::iota(ranks.begin(), ranks.end(), 0);\n std::sort(ranks.begin(), ranks.end(), [&](size_t i, size_t j) { return x[i] < x[j]; });\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "hip", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use AMD HIP to compute in parallel. The kernel will be launched with at least as many threads as elements in x.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\n__global__ void ranks(const float *x, size_t *ranks, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t tid = hipThreadIdx_x;\n if (tid < N) {\n float v = x[tid];\n size_t r = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < v) r++;\n }\n ranks[tid] = r;\n }\n}\n", + "\n\n // Get the index of the current thread\n const size_t tid = hipThreadIdx_x;\n\n // Compute the rank of the current element\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n float *ptr = x;\n size_t rank = 0;\n while (ptr < x + N && *ptr <= val) {\n if (*ptr == val) {\n rank++;\n }\n ptr++;\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float v = x[tid];\n int r = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < v) {\n ++r;\n }\n }\n ranks[tid] = r;\n }\n}\n", + "\n\n int idx = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (idx >= N) {\n return;\n }\n\n float my_value = 
x[idx];\n int my_rank = 1;\n for (int i = 0; i < N; i++) {\n if (x[i] < my_value) {\n my_rank++;\n }\n }\n ranks[idx] = my_rank;\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] < v) r++;\n }\n ranks[i] = r;\n }\n}\n", + "\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] < v) r++;\n }\n ranks[i] = r;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n float *sorted_x = (float *)hipMalloc(N * sizeof(float));\n hipMemcpy(sorted_x, x, N * sizeof(float), hipMemcpyHostToDevice);\n hipSort(sorted_x, sorted_x + N, hipSortAscending);\n for (int i = 0; i < N; i++) {\n if (sorted_x[i] == val) {\n ranks[gid] = i;\n break;\n }\n }\n hipFree(sorted_x);\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = hipThreadIdx_x;\n\n // Compute the rank of the current element\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n size_t idx = hipThreadIdx_x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n ranks[tid] = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < x[tid]) {\n ranks[tid]++;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n float val = x[idx];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every process has a complete copy of x. 
Store the result in ranks on process 0.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), ranks.size(), MPI_SIZE_T, MPI_MAX, 0,\n MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, ranks.size(), MPI_SIZE_T, MPI_MAX, 0,\n MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; i++) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Create a vector of ranks\n ranks.resize(n);\n\n // Get the rank of each value in x\n for (size_t i = 0; i < n; i++) {\n ranks[i] = std::find(indices.begin(), indices.end(), i) - indices.begin();\n }\n\n // Gather the ranks on process 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector ranks_i;\n MPI::COMM_WORLD.Recv(ranks_i.data(), ranks_i.size(), MPI::UNSIGNED_LONG, i, 0);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI::COMM_WORLD.Send(ranks.data(), ranks.size(), MPI::UNSIGNED_LONG, 0, 0);\n }\n}\n", + "\n\n // sort a copy of x on each process\n auto x_sorted = std::vector(x);\n std::sort(x_sorted.begin(), x_sorted.end());\n\n // find the rank of each value in x on each process\n std::vector> local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n auto it = std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]);\n local_ranks[i].push_back(std:: distance(x_sorted.begin(), it));\n }\n\n // gather the local_ranks on process 0\n std::gather(local_ranks, std::back_inserter(ranks), 0);\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; ++i) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Get the rank of each value in x\n ranks.resize(n);\n for (size_t i = 0; i < n; ++i) {\n ranks[i] = std::find(indices.begin(), indices.end(), i) - indices.begin();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, ranks.size(), MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n local_ranks[i]++;\n }\n }\n }\n\n std::vector global_ranks(x.size());\n MPI_Allgather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG,\n global_ranks.data(), x.size(), MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n ranks = global_ranks;\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; ++i) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Get the rank of each value in x\n ranks.resize(n);\n for (size_t i = 0; i < n; ++i) {\n ranks[indices[i]] = i;\n }\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n ranks = local_ranks;\n }\n\n MPI_Gather(&local_ranks[0], local_ranks.size(), MPI_UNSIGNED_LONG, &ranks[0],\n ranks.size(), MPI_UNSIGNED_LONG, 0, comm);\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the rank of each element in the sorted vector\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; ++i) {\n std::vector ranks_i;\n MPI_Recv(&ranks_i[0], ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Recv(recv_ranks.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n ranks[j] += recv_ranks[j];\n }\n }\n } else {\n MPI_Send(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = i;\n }\n\n std::sort(local_ranks.begin(), local_ranks.end(),\n [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n if (rank == 0) {\n ranks.resize(x.size());\n }\n\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(),\n MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector ranks_i;\n MPI_Recv(ranks_i.data(), ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // compute the ranks on each process\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // gather the ranks from all processes on process 0\n if (my_rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(),\n MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n 
MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG,\n 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the rank of each element in the sorted vector\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; ++i) {\n std::vector ranks_i;\n MPI_Recv(&ranks_i[0], ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector local_x = x;\n std::sort(local_x.begin(), local_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(local_x.begin(),\n std::lower_bound(local_x.begin(), local_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Recv(recv_ranks.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n ranks[j] += recv_ranks[j];\n }\n }\n } else {\n MPI_Send(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n size_t n = x.size();\n std::vector local_ranks(n);\n std::sort(x.begin(), x.end());\n for (size_t i = 0; i < n; ++i) {\n local_ranks[i] = std::lower_bound(x.begin(), x.end(), x[i]) - x.begin();\n }\n MPI_Gather(local_ranks.data(), n, MPI_UNSIGNED_LONG, ranks.data(), n, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "cuda", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use CUDA to compute in parallel. 
The kernel will be launched with at least as many threads as elements in x.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\n__global__ void ranks(const float *x, size_t *ranks, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n float v = x[i];\n int r = 0;\n for (size_t j = 0; j < N; ++j) {\n if (x[j] < v) {\n ++r;\n }\n }\n ranks[i] = r;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n ranks[i] = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n ranks[tid] = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ranks[tid]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float value = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n float *d_x = (float *)x;\n for (size_t i = 0; i < N; i++) {\n if (d_x[i] < val) {\n ranks[idx]++;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) rank++;\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t i = block.thread_id.x + block.block_id.x * block.block_dim.x;\n if (i < N) {\n float val = x[i];\n std::vector y(x, x + N);\n std::sort(y.begin(), y.end());\n ranks[i] = std::lower_bound(y.begin(), y.end(), val) - y.begin();\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] <= v) {\n r++;\n }\n }\n ranks[i] = r - 1;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n float *sorted_x = new float[N];\n thrust::copy(x, x + N, sorted_x);\n thrust::sort(sorted_x, sorted_x + N);\n ranks[idx] = thrust::lower_bound(sorted_x, sorted_x + N, val) - sorted_x;\n delete[] sorted_x;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n float *d_x = (float *)x;\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (d_x[i] < val) {\n 
++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = threadIdx.x;\n\n // Compute the rank of the current element\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n ranks[idx] = 0;\n for (int i = 0; i < N; ++i) {\n if (x[i] < x[idx]) {\n ranks[idx]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = threadIdx.x;\n\n // Compute the rank of the current element\n float current_element = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < current_element) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every process has a complete copy of x. 
Store the result in ranks on process 0.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Get the number of processes\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Create a vector of ranks on each process\n std::vector local_ranks(x.size());\n\n // Compute the ranks on each process\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n // Sort the local copy of x\n std::vector local_x = x;\n std::sort(local_x.begin(), local_x.end());\n\n // Find the rank of x[i] in the sorted vector\n auto it = std::lower_bound(local_x.begin(), local_x.end(), x[i]);\n local_ranks[i] = std::distance(local_x.begin(), it);\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG,\n ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]) - sorted_x.begin();\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0127971044, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0274398484, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0191757342, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0150128476, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0128182682, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0120775784, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116639164, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0112655332, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Get the size of the vector\n size_t n = 
x.size();\n\n // Create a vector to store the ranks\n ranks.resize(n);\n\n // Sort the vector\n std::sort(x.begin(), x.end());\n\n // Find the rank of each element in the sorted vector\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n ranks[i] = std::distance(x.begin(), std::find(x.begin(), x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (ranks.size() > 0) {\n MPI_Gather(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG,\n &ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0127166494, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0278425684, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0194071318, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0150955366, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0129365118, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0122193668, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0115721578, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0112725142, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int world_size;\n MPI_Comm_size(MPI_COMM_WORLD, &world_size);\n\n std::vector local_x(x.begin() + my_rank * x.size() / world_size,\n x.begin() + (my_rank + 1) * x.size() / world_size);\n std::vector local_ranks(local_x.size());\n\n #pragma omp parallel for\n for (size_t i = 0; i < local_x.size(); ++i) {\n for (size_t j = 0; j < local_x.size(); ++j) {\n if (local_x[i] < local_x[j]) {\n ++local_ranks[i];\n }\n }\n }\n\n MPI_Reduce(local_ranks.data(), ranks.data(), local_ranks.size(), MPI_SIZE_T, MPI_SUM, 0,\n MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n 
std::vector x_sorted = x;\n std::sort(x_sorted.begin(), x_sorted.end());\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n ranks[i] = std::distance(x_sorted.begin(), std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]));\n }\n if (my_rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), n, MPI_SIZE_T, MPI_MAX, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, n, MPI_SIZE_T, MPI_MAX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the ranks of each element in the sorted vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_ranks;\n MPI_Recv(&recv_ranks[0], x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), recv_ranks.begin(), recv_ranks.end());\n }\n } else {\n MPI_Send(&ranks[0], x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.012491603, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 5.3288411132, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 2.6733798676, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.3444334516, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.681183554, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3473687632, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1805499316, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1082724522, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::sort(local_x.begin(), local_x.end());\n\n std::vector local_ranks(local_x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < local_x.size(); ++i) {\n local_ranks[i] = std::distance(x.begin(), std::lower_bound(x.begin(), x.end(), local_x[i]));\n }\n\n MPI_Gather(local_ranks.data(), local_ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), local_ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + 
"runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a copy of x on each process\n std::vector local_x = x;\n\n // Sort the local copy of x\n std::sort(local_x.begin(), local_x.end());\n\n // Create a vector to store the ranks\n std::vector local_ranks(x.size());\n\n // Compute the ranks of the elements in the local copy of x\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(local_x.begin(), std::find(local_x.begin(), local_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Only process 0 has the complete list of ranks\n if (rank == 0) {\n ranks = std::move(ranks);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]) - sorted_x.begin();\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0127895526, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0274459774, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0193432216, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0150450996, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.012912529, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0119978448, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.011681267, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.011302795, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if 
(rank == 0) {\n MPI_Gather(MPI_IN_PLACE, ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0127137778, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0278752256, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0193596582, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0150538122, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0129718684, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.012139525, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116616506, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01139306, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n ranks.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n if (my_rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), n, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, n, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0128099568, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.027568882, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0194972526, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0149239548, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.012779131, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0120616348, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0115604386, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.011311137, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n local_ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0127906442, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0274625046, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0191003492, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0148802428, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0137538774, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0121965578, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0115870426, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.011335817, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector local_ranks(x.size());\n #pragma opm parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(x.begin(), std::min_element(x.begin(), x.end()));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Status status;\n MPI_Probe(i, 0, MPI_COMM_WORLD, &status);\n MPI_Get_count(&status, MPI_BYTE, &ranks.size());\n MPI_Send(&local_ranks, ranks.size(), MPI_BYTE, i, 0, MPI_COMM_WORLD);\n }\n } else {\n MPI_Send(&local_ranks, local_ranks.size(), MPI_BYTE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t n = 
x.size();\n std::vector local_ranks(n);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n local_ranks[i] = std::distance(x.begin(), std::min_element(x.begin(), x.end()));\n }\n\n MPI_Gather(local_ranks.data(), n, MPI_UNSIGNED_LONG, ranks.data(), n, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int world_size;\n MPI_Comm_size(MPI_COMM_WORLD, &world_size);\n int local_size = x.size() / world_size;\n int start = my_rank * local_size;\n int end = start + local_size;\n if (my_rank == world_size - 1) {\n end = x.size();\n }\n std::vector local_ranks(local_size);\n std::vector local_x(local_size);\n std::copy(x.begin() + start, x.begin() + end, local_x.begin());\n std::sort(local_x.begin(), local_x.end());\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_ranks[i] = std::distance(local_x.begin(), std::lower_bound(local_x.begin(), local_x.end(), x[start + i]));\n }\n if (my_rank == 0) {\n ranks.resize(x.size());\n }\n MPI_Gather(local_ranks.data(), local_size, MPI_UNSIGNED_LONG, ranks.data(), local_size, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0125172698, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0279973298, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0193644996, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0149777714, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0128311908, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01210821, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0114784264, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0111797402, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), x.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0126007944, + "runs": 
[ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0275294978, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0193204814, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0149671902, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01278293, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0118992924, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.011351839, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0111039938, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n // Each process sorts its own copy of x\n std::vector x_sorted = x;\n std::sort(x_sorted.begin(), x_sorted.end());\n\n // Each process computes the ranks of its own elements\n std::vector ranks_local(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks_local[i] = std::distance(x_sorted.begin(), std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]));\n }\n\n // Process 0 gathers the ranks from all processes\n if (my_rank == 0) {\n ranks = ranks_local;\n for (int i = 1; i < MPI_COMM_WORLD->size; ++i) {\n std::vector ranks_recv(x.size());\n MPI_Recv(ranks_recv.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_recv.begin(), ranks_recv.end());\n }\n } else {\n MPI_Send(ranks_local.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use Kokkos to sort in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Create a view of the input array that ignores zero valued elements\n auto x_nz = view(\"x_nz\", x, LayoutRight(), {x.extent(0), 0});\n // Sort the non-zero elements in ascending order\n parallel_sort(x_nz.extent(0), KOKKOS_LAMBDA(const int i) {\n x_nz(i) = x(i);\n });\n // Copy the sorted non-zero elements back to the input array\n parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x_nz(i);\n });\n}\n", + "\n\n auto exec_space = x.execution_space();\n auto view_size = x.extent(0);\n auto lambda = KOKKOS_LAMBDA(const int& i) {\n return x(i) != 0;\n };\n auto mask = Kokkos::create_mask(exec_space, view_size, lambda);\n auto sorted = Kokkos::sort(exec_space, mask, x);\n Kokkos::deep_copy(x, sorted);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = std::numeric_limits::max();\n }\n });\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i == std::numeric_limits::max()) {\n x_i = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sortIgnoreZero\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n // Create a mask to identify non-zero elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"init_mask\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the non-zero elements\n Kokkos::sort(mask, x);\n\n // Shift the non-zero elements to the beginning of the array\n int non_zero_count = 0;\n Kokkos::parallel_for(\"shift_non_zero\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (mask(i)) {\n x(non_zero_count++) = x(i);\n }\n });\n\n // Fill the remaining elements with zeros\n Kokkos::parallel_for(\"fill_zeros\", Kokkos::RangePolicy(non_zero_count, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = 0;\n });\n}\n", + "\n\n // Create a mask to identify non-zero elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"mask_init\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the non-zero elements\n Kokkos::sort(mask, x);\n\n // Shift the non-zero elements to the beginning of the array\n Kokkos::parallel_for(\"shift_non_zero\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (mask(i)) {\n Kokkos::atomic_fetch_add(&x(0), x(i));\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i != 0) {\n x_i = std::numeric_limits::max();\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i == 
std::numeric_limits::max()) {\n x_i = 0;\n }\n });\n}\n", + "\n\n using namespace Kokkos;\n\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n auto lambda = KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n x(i) = x(i) < 0 ? -x(i) : x(i);\n }\n };\n parallel_for(\"sortIgnoreZero\", policy, lambda);\n parallel_sort(policy, x.data(), x.data() + x.extent(0));\n lambda = KOKKOS_LAMBDA(const int &i) {\n if (x(i) < 0) {\n x(i) = -x(i);\n }\n };\n parallel_for(\"sortIgnoreZero\", policy, lambda);\n}\n", + "\n\n Kokkos::parallel_for(\"sortIgnoreZero\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::View y = x.filter([](int x) { return x != 0; });\n Kok\u30b3\u30af::sort(y);\n Kok\u30b3\u30af::View z = x.map([](int x) { return x == 0 ? 0 : y[0]; });\n y.copy_to(z);\n}\n", + "\n\n // Create a mask to ignore zero valued elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"mask\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the array using the mask\n Kokkos::sort(mask, x);\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i, int &count) {\n if (x(i) != 0) { count++; }\n });\n\n // Create a temporary array to store the non-zero elements\n Kokkos::View y(\"y\", count);\n\n // Copy the non-zero elements to the temporary array\n int j = 0;\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) { y(j++) = x(i); }\n });\n\n // Sort the temporary array\n Kokkos::sort(y);\n\n // Copy the sorted non-zero elements back to the original array\n j = 0;\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) { x(i) = y(j++); }\n });\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n using exec_space = Kokkos::DefaultExecutionSpace;\n using view_t = Kokkos::View;\n\n auto x_nonzero = view_t(\"x_nonzero\", x.extent(0));\n auto x_nonzero_count = 0;\n\n Kokkos::parallel_for(\n \"sortIgnoreZero\", policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) {\n x_nonzero(x_nonzero_count++) = x(i);\n }\n });\n\n Kokkos::sort(x_nonzero, x_nonzero_count);\n\n x_nonzero_count = 0;\n Kokkos::parallel_for(\n \"sortIgnoreZero\", policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) {\n x(i) = x_nonzero(x_nonzero_count++);\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i == 0) {\n x_i = x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), 
KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i == 0) {\n x_i = x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use OpenMP to sort x in parallel. \n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n ++count;\n }\n }\n\n // Create a vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j];\n j++;\n }\n }\n}\n", + "\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(non_zero.begin(), non_zero.end());\n }\n }\n\n // Insert the non-zero elements back into x\n int j = 0;\n for (int i = 
0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + ".", + "\n\n std::vector temp;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n temp.push_back(x[i]);\n }\n }\n std::sort(temp.begin(), temp.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = temp[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int n = 0;\n #pragma omp parallel for reduction(+:n)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n n++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(n);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[index++] = x[i];\n }\n }\n\n // Sort the new vector\n 
std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[index++];\n }\n }\n}\n", + "\n\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n std::sort(non_zero.begin(), non_zero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count non-zero elements\n int n = 0;\n #pragma omp parallel for reduction(+:n)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n n++;\n }\n }\n\n // Create a vector to store non-zero elements\n std::vector y(n);\n\n // Copy non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the non-zero elements\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n ++count;\n }\n }\n\n // Create a new vector to store the non-zero 
elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Create a vector to store the non-zero elements\n std::vector non_zero;\n\n // Iterate over the input vector and copy non-zero elements to the new vector\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Copy the sorted non-zero elements back to the input vector\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j++];\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "serial", + "prompt": "/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place.\n Example:\n\n\t input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a == 0 && b != 0) || a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return a != 0 && (b == 0 || a < b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a != 0 && b != 0) ? a < b : a == 0;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::stable_partition(x.begin(), x.end(),\n [](int a) { return a != 0; });\n std::sort(x.begin(), x.end());\n\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a == 0 && b != 0) || (a != 0 && b == 0) ? 
a > b : a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(),\n [](int a, int b) { return a != 0 && (a < b || b == 0); });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) {\n return false;\n } else if (b == 0) {\n return true;\n } else {\n return a < b;\n }\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::vector y;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) { y.push_back(x[i]); }\n }\n std::sort(y.begin(), y.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector sorted;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted.push_back(x[i]);\n }\n }\n std::sort(sorted.begin(), sorted.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sorted[j];\n j++;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "hip", + "prompt": "/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use AMD HIP to compute in parallel. The kernel will be launched with 1 thread per element.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\n__global__ void sortIgnoreZero(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int temp = x[tid];\n int i = tid - 1;\n while (i >= 0 && x[i] > temp) {\n x[i + 1] = x[i];\n i--;\n }\n x[i + 1] = temp;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n // sort non-zero elements\n __shared__ int sdata[256];\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n int stride = blockDim.x * gridDim.x;\n int offset = bid * stride;\n int count = 0;\n int val = x[gid];\n if (val != 0) {\n sdata[tid] = val;\n count++;\n }\n __syncthreads();\n int n = count;\n int i = tid;\n while (i < n) {\n sdata[i] = sdata[tid];\n i += stride;\n }\n __syncthreads();\n i = tid;\n while (i < n) {\n int j = i + 1;\n while (j < n) {\n if (sdata[i] > sdata[j]) {\n int tmp = sdata[i];\n sdata[i] = sdata[j];\n sdata[j] = tmp;\n }\n j += stride;\n }\n i += stride;\n }\n __syncthreads();\n i = tid;\n while (i < n) {\n x[offset + i] = sdata[i];\n i += stride;\n }\n}\n", + "\n\n // Find the number of non-zero elements\n int num_non_zero = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n num_non_zero++;\n }\n }\n\n // Create a temporary array to store the non-zero elements\n int *temp = new int[num_non_zero];\n\n // Copy the non-zero elements to the temporary array\n int j = 0;\n for (size_t i = 
0; i < N; i++) {\n if (x[i] != 0) {\n temp[j] = x[i];\n j++;\n }\n }\n\n // Sort the temporary array\n std::sort(temp, temp + num_non_zero);\n\n // Copy the sorted non-zero elements back to the original array\n j = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n x[i] = temp[j];\n j++;\n }\n }\n\n // Delete the temporary array\n delete[] temp;\n}\n", + "\n\n\t// Find the number of non-zero elements\n\tint count = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tcount++;\n\t\t}\n\t}\n\n\t// Create an array to store the non-zero elements\n\tint *y = new int[count];\n\n\t// Copy the non-zero elements to the new array\n\tint j = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\ty[j] = x[i];\n\t\t\tj++;\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tstd::sort(y, y + count);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tj = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = y[j];\n\t\t\tj++;\n\t\t}\n\t}\n\n\t// Delete the temporary array\n\tdelete[] y;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i - 1] > x[i]) {\n int temp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = temp;\n i--;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\tint j = tid + 1;\n\t\twhile (j < N && x[j] == 0) {\n\t\t\tj++;\n\t\t}\n\t\tif (j < N) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[j];\n\t\t\tx[j] = tmp;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N && x[i] != 0) {\n int j = i + 1;\n while (j < N && x[j] == 0) {\n j++;\n }\n if (j < N) {\n int temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i - 1] > x[i]) {\n int tmp = x[i - 1];\n x[i - 1] = x[i];\n x[i] = tmp;\n i--;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = tmp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint temp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n for (int i = tid + 1; i < N; i++) {\n if (x[i] != 0 && x[i] < x[tid]) {\n int tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n 
i--;\n }\n }\n }\n}\n", + "\n\n // Sort the non-zero elements\n thrust::sort(thrust::device_ptr(x), thrust::device_ptr(x + N), thrust::greater());\n\n // Count the number of non-zero elements\n int num_non_zero = thrust::count(thrust::device_ptr(x), thrust::device_ptr(x + N), thrust::not_equal_to(0));\n\n // Move the non-zero elements to the beginning of the array\n thrust::copy(thrust::device_ptr(x), thrust::device_ptr(x + num_non_zero), thrust::device_ptr(x));\n\n // Fill the remaining elements with zeros\n thrust::fill(thrust::device_ptr(x + num_non_zero), thrust::device_ptr(x + N), 0);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tif (x[tid] != 0) {\n\t\t\tint i = tid;\n\t\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\t\tint temp = x[i];\n\t\t\t\tx[i] = x[i - 1];\n\t\t\t\tx[i - 1] = temp;\n\t\t\t\ti--;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Sort the array in ascending order ignoring elements with value 0.\n\t// Leave zero valued elements in-place.\n\tint my_rank = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (my_rank < N && x[my_rank] != 0) {\n\t\tfor (int i = my_rank + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[my_rank]) {\n\t\t\t\tint temp = x[my_rank];\n\t\t\t\tx[my_rank] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use MPI to sort x in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Store the result in x on rank 0.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n for (int i = count; i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n // Gather all non-zero elements to rank 0\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(nonZero.data(), nonZero.size(), MPI_INT, x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Get the size of the communicator\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the sorted elements\n std::vector sorted_x;\n\n // Gather all the non-zero elements from all the ranks\n std::vector non_zero_elements;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_elements.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n\n // Scatter the sorted non-zero elements to all the ranks\n MPI_Scatter(non_zero_elements.data(), non_zero_elements.size() / size, MPI_INT,\n sorted_x.data(), non_zero_elements.size() / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into the original vector\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sorted_x[index++];\n }\n }\n}\n", + "\n\n // Get the size and rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size() * size);\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n if (rank != 0) {\n non_zero.resize(sorted_non_zero.size() / size);\n }\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into the original vector\n int index = 0;\n for (int i = 0; i < 
x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[index++];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size());\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, x.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with zeros\n for (int i = non_zero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int n = x.size();\n std::vector y(n);\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n y[count] = x[i];\n count++;\n }\n }\n std::sort(y.begin(), y.begin() + count);\n int index = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = y[index];\n index++;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_elements[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(non_zero_elements.data(), count / size, MPI_INT, x.data(), count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n for (int i = count / size; i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector x_non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x_non_zero.push_back(x[i]);\n }\n }\n std::sort(x_non_zero.begin(), x_non_zero.end());\n int count = x_non_zero.size();\n MPI_Gather(&count, 1, MPI_INT, NULL, 0, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n int total_count = 0;\n for (int i = 0; i < size; i++) {\n int count_i;\n 
MPI_Gather(&count_i, 1, MPI_INT, &count_i, 1, MPI_INT, i, MPI_COMM_WORLD);\n total_count += count_i;\n }\n x.resize(total_count);\n }\n MPI_Gatherv(x_non_zero.data(), x_non_zero.size(), MPI_INT, x.data(), NULL, NULL, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sortedNonZero;\n if (rank == 0) {\n sortedNonZero.resize(nonZero.size() * size);\n }\n MPI_Gather(nonZero.data(), nonZero.size(), MPI_INT, sortedNonZero.data(), nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements to all ranks\n if (rank != 0) {\n sortedNonZero.resize(nonZero.size());\n }\n MPI_Scatter(sortedNonZero.data(), nonZero.size(), MPI_INT, sortedNonZero.data(), nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into x\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sortedNonZero[index++];\n }\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Insert the non-zero elements back into the original vector\n int j = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Gather the non-zero elements from all ranks\n MPI_Gather(&x[0], x.size(), MPI_INT, &y[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements\n if (rank == 0) {\n std::sort(y.begin(), y.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&y[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); 
i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size());\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, x.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with zeros\n for (int i = non_zero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sortedNonZero;\n MPI_Gather(&nonZero[0], nonZero.size(), MPI_INT, &sortedNonZero[0], nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements to all ranks\n MPI_Scatter(&sortedNonZero[0], nonZero.size(), MPI_INT, &x[0], nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n for (int i = nonZero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int myrank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (myrank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // 
Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero_x.begin(), non_zero_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(non_zero_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n MPI_Gather(&non_zero[0], non_zero.size(), MPI_INT, &sorted_non_zero[0], non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&sorted_non_zero[0], non_zero.size(), MPI_INT, &x[0], non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the zero elements back into the vector\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = sorted_non_zero[index++];\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "cuda", + "prompt": "/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use CUDA to compute in parallel. 
The kernel will be launched with 1 thread per element.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\n__global__ void sortIgnoreZero(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the number of non-zero elements\n\tint num_non_zero = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnum_non_zero++;\n\t\t}\n\t}\n\n\t// Allocate memory for the non-zero elements\n\tint *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n\t// Copy the non-zero elements to the new array\n\tint non_zero_index = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnon_zero_elements[non_zero_index] = x[i];\n\t\t\tnon_zero_index++;\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tthrust::sort(non_zero_elements, non_zero_elements + num_non_zero);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tnon_zero_index = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = non_zero_elements[non_zero_index];\n\t\t\tnon_zero_index++;\n\t\t}\n\t}\n\n\t// Free the memory allocated for the non-zero elements\n\tfree(non_zero_elements);\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid >= N) return;\n\tif (x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[tid] > x[i]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = tmp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + ".", + "\n\n\tint tid = threadIdx.x;\n\tif (x[tid] != 0) {\n\t\tx[tid] = x[tid] > 0 ? 
x[tid] : -x[tid];\n\t}\n}\n", + "\n\n\t// Each thread sorts its own element\n\tif (x[threadIdx.x] != 0) {\n\t\tfor (int i = threadIdx.x + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[threadIdx.x] > x[i]) {\n\t\t\t\tint temp = x[threadIdx.x];\n\t\t\t\tx[threadIdx.x] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Find the number of non-zero elements\n\tint num_non_zero = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnum_non_zero++;\n\t\t}\n\t}\n\n\t// Allocate memory for the non-zero elements\n\tint *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n\t// Copy the non-zero elements to the new array\n\tint idx = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnon_zero_elements[idx++] = x[i];\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tthrust::sort(non_zero_elements, non_zero_elements + num_non_zero);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tidx = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = non_zero_elements[idx++];\n\t\t}\n\t}\n\n\t// Free the memory allocated for the non-zero elements\n\tfree(non_zero_elements);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = bid * blockDim.x + tid;\n\n\tif (i < N && x[i] != 0) {\n\t\tint j = i + 1;\n\t\twhile (j < N && x[j] == 0) {\n\t\t\tj++;\n\t\t}\n\t\tif (j < N) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[j];\n\t\t\tx[j] = tmp;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i - 1] > x[i]) {\n\t\t\tint temp = x[i - 1];\n\t\t\tx[i - 1] = x[i];\n\t\t\tx[i] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "Ignore the zero value in the list.", + "\n\n\t// Get the thread index\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\n\t// If the element is not zero, sort it\n\tif (x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint start = bid * bid_size + tid;\n\tint end = start + bid_size;\n\tif (start >= N) return;\n\tif (end > N) end = N;\n\tint count = 0;\n\tfor (int i = start; i < end; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[count++] = x[i];\n\t\t}\n\t}\n\tfor (int i = count; i < end; i++) {\n\t\tx[i] = 0;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tswap(x[i], x[i - 1]);\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint temp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\t// Sort the non-zero elements\n\tint *d_sorted = (int *)malloc(N * sizeof(int));\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\td_sorted[count] = x[i];\n\t\t\tcount++;\n\t\t}\n\t}\n\tthrust::sort(d_sorted, d_sorted + count);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tcount = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = d_sorted[count];\n\t\t\tcount++;\n\t\t}\n\t}\n\tfree(d_sorted);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint val = x[tid];\n\tif (val == 0) {\n\t\treturn;\n\t}\n\tint i = tid;\n\twhile (i > 0 && x[i - 1] > val) 
{\n\t\tx[i] = x[i - 1];\n\t\ti--;\n\t}\n\tx[i] = val;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tif (x[tid] != 0) {\n\t\t\tint i = tid;\n\t\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\t\tswap(x[i], x[i - 1]);\n\t\t\t\ti--;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n // Find the number of non-zero elements\n int num_non_zero = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n num_non_zero++;\n }\n }\n\n // Create an array to store the non-zero elements\n int *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n // Copy the non-zero elements to the new array\n int index = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n non_zero_elements[index] = x[i];\n index++;\n }\n }\n\n // Sort the non-zero elements\n qsort(non_zero_elements, num_non_zero, sizeof(int), compare);\n\n // Copy the sorted non-zero elements back to the original array\n index = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n x[i] = non_zero_elements[index];\n index++;\n }\n }\n\n // Free the memory allocated for the non-zero elements array\n free(non_zero_elements);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use MPI and OpenMP to sort x in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Get the number of processes\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Create a vector to store the sorted values\n std::vector sorted(x.size());\n\n // Gather the sorted values from each rank\n MPI_Gather(x.data(), x.size(), MPI_INT, sorted.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // If the current rank is 0, store the sorted values in x\n if (rank == 0) {\n x = sorted;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the 
sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort x on each rank\n std::sort(x.begin(), x.end());\n\n // Gather the sorted x from all ranks to rank 0\n std::vector gathered_x;\n if (rank == 0) {\n gathered_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, gathered_x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the gathered x on rank 0\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end());\n x = gathered_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Scatter the sorted non-zero elements to all ranks\n MPI_Scatter(non_zero.data(), non_zero.size() / nprocs, MPI_INT,\n x.data(), non_zero.size() / nprocs, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Gather the sorted non-zero elements from all ranks to rank 0\n MPI_Gather(x.data(), non_zero.size() / nprocs, MPI_INT,\n non_zero.data(), non_zero.size() / nprocs, MPI_INT, 0, MPI_COMM_WORLD);\n\n // If rank is 0, insert the sorted non-zero elements into x\n if (rank == 0) {\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n continue;\n }\n x[i] = non_zero[index++];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector x_nonzero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x_nonzero.push_back(x[i]);\n }\n }\n std::sort(x_nonzero.begin(), x_nonzero.end());\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = x_nonzero[count];\n count++;\n }\n }\n if (my_rank == 0) {\n x.resize(n);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001762659, + 
"runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018305484, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018216036, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018317402, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00182185, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001838539, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018210644, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00181878, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018315715, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001822846, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n std::vector x_local(n);\n std::vector x_sorted(n);\n std::copy(x.begin(), x.end(), x_local.begin());\n std::sort(x_local.begin(), x_local.end(), [](int a, int b) { return a != 0 && (a < b); });\n MPI_Gather(x_local.data(), n, MPI_INT, x_sorted.data(), n, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n x = x_sorted;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector non_zero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n std::sort(non_zero.begin(), non_zero.end());\n int non_zero_size = non_zero.size();\n int recv_count[MPI_COMM_SIZE];\n int recv_displs[MPI_COMM_SIZE];\n MPI_Gather(&non_zero_size, 1, MPI_INT, recv_count, 1, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n recv_displs[0] = 0;\n for (int i = 1; i < MPI_COMM_SIZE; i++) {\n recv_displs[i] = recv_displs[i - 1] + recv_count[i - 1];\n }\n }\n MPI_Gatherv(non_zero.data(), non_zero_size, MPI_INT, x.data(), recv_count, recv_displs, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n int j = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] == 0) {\n continue;\n }\n x[j] = x[i];\n j++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Allocate memory for the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to y\n int index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[index++] = x[i];\n }\n }\n\n // Sort y\n 
std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to x on rank 0\n if (rank == 0) {\n index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[index++];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0017801058, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018254266, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n int n = x.size();\n std::vector x_nonzero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x_nonzero.push_back(x[i]);\n }\n }\n std::sort(x_nonzero.begin(), x_nonzero.end());\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = x_nonzero[count];\n count++;\n }\n }\n if (myrank == 0) {\n x.resize(n);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0017633302, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018357982, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018317726, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018217678, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018346116, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018167122, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018167444, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018262442, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018332475, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018257486, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the gathered non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": 
true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector nonZero;\n if (rank == 0) {\n nonZero.resize(count);\n }\n MPI_Gather(&x[0], x.size(), MPI_INT, &nonZero[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(nonZero.begin(), nonZero.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&nonZero[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted.begin(), sorted.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // 
Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Allocate memory for the sorted array\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n MPI_Gather(&x[0], x.size(), MPI_INT, &sorted_x[0], global_count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the gathered array on rank 0\n if (my_rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end());\n }\n\n // Scatter the sorted array back to all ranks\n MPI_Scatter(&sorted_x[0], global_count / nprocs, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 fills in the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\n // Create a vector of size x.size() to store the sorted values\n std::vector sorted_x(x.size());\n\n // Create a vector of size x.size() to store the indices of the sorted values\n std::vector sorted_indices(x.size());\n\n // Sort the indices of x in ascending order\n std::iota(sorted_indices.begin(), sorted_indices.end(), 0);\n std::sort(sorted_indices.begin(), sorted_indices.end(),\n [&x](int i, int j) { return x[i] < x[j]; });\n\n // Create a vector of size x.size() to store the sorted values\n std::vector sorted_x(x.size());\n\n // Copy the sorted values from x into sorted_x\n for (int i = 0; i < x.size(); i++) {\n sorted_x[i] = x[sorted_indices[i]];\n }\n\n // Broadcast the sorted values to all ranks\n MPI_Bcast(&sorted_x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Copy the sorted values into x on rank 0\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to their original positions\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = i;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": 
false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted.begin(), sorted.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) count++;\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) std::sort(non_zero.begin(), non_zero.end());\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) x[i] = -1;\n }\n\n // Sort the elements in-place on each rank\n std::sort(x.begin(), x.end());\n\n // Replace -1 with 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == -1) x[i] = 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(Kokkos::View const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View sorted_x(\"sorted_x\", x.size());\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i) {\n sorted_x(i) = x(i);\n });\n Kokkos::sort(sorted_x.data(), sorted_x.data() + sorted_x.size());\n return sorted_x(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a Kokkos view of the input vector\n auto x_view = Kokkos::create_mirror_view(x);\n\n // Create a Kokkos view of the output vector\n auto y_view = Kokkos::create_mirror_view(1);\n\n // Create a Kokkos kernel to find the k-th smallest element\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x_view.extent(0)),\n KOKKOS_LAMBDA(const int i, int& y) {\n if (i < k) {\n y = std::min(y, x_view(i));\n }\n },\n y_view);\n\n // Copy the output vector back to the host\n int y;\n Kokkos::deep_copy(y, y_view);\n\n return y;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n\n // Find the k-th smallest element using a parallel reduction.\n int kth_smallest = parallel_reduce(\n Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i < k) {\n kth_smallest = std::min(kth_smallest, x(i));\n }\n },\n k);\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n auto x_copy = Kokkos::View(n);\n Kokkos::deep_copy(x_copy, x);\n\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& result) {\n if (i == k - 1) {\n result = x_copy[i];\n }\n }, tag_type());\n\n return x_copy[k - 1];\n}\n", + "\n\n Kokkos::View y(\"y\", x.size());\n Kokkos::parallel_for(\"copy\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i) {\n y(i) = x(i);\n });\n Kokkos::sort(y);\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n using l_d_type = Kokkos::LayoutLeft;\n using l_d_view_type = Kokkos::View;\n\n const int n = x.extent(0);\n l_d_view_type y(\"y\", n);\n\n parallel_for(policy_type(0, n), KOKKOS_LAMBDA(const int& i) { y(i) = x(i); }, tag_type());\n\n Kokkos::sort(y);\n\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n auto x_copy = Kokkos::View(n);\n Kokkos::deep_copy(x_copy, x);\n\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& result) {\n if (i == k - 1) {\n result = x_copy(i);\n }\n }, tag_type());\n\n return x_copy(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n int n = x.extent(0);\n if (k > n) {\n throw std::invalid_argument(\"k cannot be greater than the size of the vector\");\n }\n\n auto kth_smallest = parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(const int& i, int& kth) {\n if (i < k) {\n kth = std::min(kth, x(i));\n }\n },\n std::numeric_limits::max());\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Tag = Kokkos::Tag;\n\n int n = x.extent(0);\n if (k > n) {\n throw std::invalid_argument(\"k must be less than or equal to the size of the vector\");\n }\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(Policy(0, n), x, 
Tag());\n\n // Return the k-th smallest element\n return x(k - 1);\n}\n", + "\n\n Kok\u30b3\u30af::View y = Kok\u30b3\u30af::sort(x);\n return y[k - 1];\n}\n", + "\n\n // Create a Kokkos view for the input vector x\n Kokkos::View x_host = Kokkos::create_mirror_view(x);\n\n // Create a Kokkos view for the output vector y\n Kokkos::View y_host = Kokkos::create_mirror_view(Kokkos::View(1));\n\n // Create a Kokkos parallel for loop to find the k-th smallest element\n Kokkos::parallel_for(\"findKthSmallest\", Kokkos::RangePolicy(0, x_host.size()), KOKKOS_LAMBDA(int i) {\n // Sort the input vector x\n std::sort(x_host.begin(), x_host.end());\n\n // Get the k-th smallest element\n y_host(0) = x_host(k - 1);\n });\n\n // Copy the output vector y back to the device\n Kokkos::deep_copy(x, y_host);\n\n // Return the k-th smallest element\n return y_host(0);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n int kth_smallest = 0;\n Kokkos::parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i == k - 1) {\n kth_smallest = x(i);\n }\n },\n kth_smallest, tag_type());\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a Kokkos view of the input vector\n View y = Kokkos::create_mirror_view(x);\n\n // Sort the vector in parallel\n parallel_sort(y.extent(0), KOKKOS_LAMBDA(int i, int j) { return y(i) < y(j); });\n\n // Return the k-th smallest element\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n\n int n = x.extent(0);\n int kth_smallest = 0;\n Kokkos::parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(const int& i, int& kth_smallest) {\n if (i < k) {\n kth_smallest = std::max(kth_smallest, x(i));\n }\n },\n kth_smallest);\n return kth_smallest;\n}\n", + "\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(int i, int j) { return x(i) < x(j); });\n\n // Return the k-th smallest element\n return x(k - 1);\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::Work;\n\n const int n = static_cast(x.extent(0));\n Kokkos::View y(\"y\", n);\n\n parallel_for(Work(n), KOKKOS_LAMBDA(int i) {\n y[i] = x[i];\n });\n\n std::sort(y.data(), y.data() + n);\n\n return y[k - 1];\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n int n = x.extent(0);\n int kth = 0;\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(policy_t(0, n), x);\n\n // Get the k-th smallest element\n kth = x(k - 1);\n\n return kth;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n int kth_smallest = -1;\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i == k - 1) {\n kth_smallest = x(i);\n }\n }, tag_type());\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a parallel reduction to find the k-th smallest element\n auto kth_smallest = reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& min) {\n if (i < k) {\n min = std::min(min, x(i));\n }\n }, min);\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using view_type = Kokkos::View;\n\n // Create a copy of the input vector\n view_type y = Kokkos::create_mirror_view(x);\n\n // Sort the copy in parallel\n Kokkos::parallel_sort(y.extent(0), KOKKOS_LAMBDA(int i, int j) { return 
y(i) < y(j); });\n\n // Return the k-th smallest element\n return y(k - 1);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n int kth = 0;\n #pragma omp parallel for reduction(min:kth)\n for (int i = 0; i < n; i++) {\n if (x[i] < kth || kth == 0) {\n kth = x[i];\n }\n }\n return kth;\n}\n", + "\n\n // Sort the vector in parallel\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n for (int j = i + 1; j < x.size(); ++j) {\n if (x[i] > x[j]) {\n std::swap(x[i], x[j]);\n }\n }\n }\n }\n\n // Return the k-th smallest element\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n // Create a shared variable to store the k-th smallest element\n int kth_smallest;\n\n // Create a parallel region\n #pragma omp parallel\n {\n // Each thread computes the k-th smallest element of its local subarray\n int local_kth_smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n\n // Update the shared variable with the minimum of the local k-th smallest elements\n #pragma omp critical\n {\n if (local_kth_smallest < kth_smallest) {\n kth_smallest = local_kth_smallest;\n }\n }\n }\n\n // Return the k-th smallest element\n return kth_smallest;\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), 
sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(sorted_x.begin(), sorted_x.end());\n }\n }\n return sorted_x[k - 1];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "serial", + "prompt": "/* Find the k-th smallest element of the vector x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th element of the sorted vector.\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th smallest element.\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n if (k < 1 || k > n) {\n throw std::invalid_argument(\"k must be between 1 and n\");\n }\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n if (k < 1 || k > n) {\n throw std::invalid_argument(\"k must be between 1 and the size of the vector.\");\n }\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "=4", + "\n\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n", + "\n\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n", + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th smallest element.\n return x[k - 1];\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": 
"41_sort_k-th_smallest_element", + "parallelism_model": "hip", + "prompt": "/* Find the k-th smallest element of the vector x.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\n__global__ void findKthSmallest(const int *x, size_t N, int k, int *kthSmallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int local_min = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n local_min = min(local_min, x[i]);\n }\n atomicMin(kthSmallest, local_min);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int stride = blockDim.x;\n while (stride < N) {\n if (tid < N / stride) {\n if (kthSmallest[tid] > kthSmallest[tid + stride]) {\n int tmp = kthSmallest[tid];\n kthSmallest[tid] = kthSmallest[tid + stride];\n kthSmallest[tid + stride] = tmp;\n }\n }\n __syncthreads();\n stride *= 2;\n }\n if (tid == 0) {\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n hipSort(kthSmallest, kthSmallest + N, hipSortAscending);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int local_min = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n local_min = min(local_min, x[i]);\n }\n atomicMin(kthSmallest, local_min);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid == 0) {\n *kthSmallest = x[tid];\n } else {\n if (x[tid] < *kthSmallest) {\n *kthSmallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int numBlocks = hipGridDim_x;\n int numThreads = hipBlockDim_x;\n int numElements = numBlocks * numThreads;\n int numLevels = 0;\n while (numElements > 1) {\n numLevels++;\n numElements = (numElements + numThreads - 1) / numThreads;\n }\n for (int level = 0; level < numLevels; level++) {\n int stride = 1 << 
level;\n int offset = tid & (stride - 1);\n int other = tid ^ offset;\n if (offset < other && other < numElements) {\n if (kthSmallest[tid] > kthSmallest[other]) {\n int tmp = kthSmallest[tid];\n kthSmallest[tid] = kthSmallest[other];\n kthSmallest[other] = tmp;\n }\n }\n __syncthreads();\n }\n if (tid == 0) {\n *kthSmallest = kthSmallest[0];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = blockId * blockDim + tid;\n int end = min(start + blockDim, N);\n int local_kthSmallest = x[start];\n for (int i = start + 1; i < end; i++) {\n if (x[i] < local_kthSmallest) {\n local_kthSmallest = x[i];\n }\n }\n atomicMin(kthSmallest, local_kthSmallest);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n __shared__ int s[hipBlockDim_x];\n s[tid] = x[tid];\n __syncthreads();\n if (tid == 0) {\n hipSort(s, hipBlockDim_x);\n *kthSmallest = s[k - 1];\n }\n }\n}\n", + "K", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int *d_temp = new int[N];\n for (int i = 0; i < N; i += 2) {\n if (i + 1 < N) {\n if (kthSmallest[i] > kthSmallest[i + 1]) {\n d_temp[i / 2] = kthSmallest[i + 1];\n d_temp[i / 2 + N / 2] = kthSmallest[i];\n } else {\n d_temp[i / 2] = kthSmallest[i];\n d_temp[i / 2 + N / 2] = kthSmallest[i + 1];\n }\n } else {\n d_temp[i / 2] = kthSmallest[i];\n }\n }\n kthSmallest = d_temp;\n __syncthreads();\n if (N % 2 == 1) {\n N = N / 2 + 1;\n } else {\n N = N / 2;\n }\n while (N > 1) {\n for (int i = 0; i < N; i += 2) {\n if (i + 1 < N) {\n if (kthSmallest[i] > kthSmallest[i + 1]) {\n d_temp[i / 2] = kthSmallest[i + 1];\n d_temp[i / 2 + N / 2] = kthSmallest[i];\n } else {\n d_temp[i / 2] = kthSmallest[i];\n d_temp[i / 2 + N / 2] = kthSmallest[i + 1];\n }\n } else {\n d_temp[i / 2] = kthSmallest[i];\n }\n }\n kthSmallest = d_temp;\n __syncthreads();\n if (N % 2 == 1) {\n N = N / 2 + 1;\n } else {\n N = N / 2;\n }\n }\n *kthSmallest = kthSmallest[0];\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n int local_kthSmallest = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < local_kthSmallest) {\n local_kthSmallest = x[i];\n }\n }\n __shared__ int shared_kthSmallest;\n shared_kthSmallest = local_kthSmallest;\n for (int i = hipBlockDim_x / 2; i > 0; i /= 2) {\n __syncthreads();\n if (hipThreadIdx_x < i) {\n if (shared_kthSmallest > x[tid + i]) {\n shared_kthSmallest = x[tid + i];\n }\n }\n }\n __syncthreads();\n if (hipThreadIdx_x == 0) {\n if (k == 0) {\n *kthSmallest = shared_kthSmallest;\n } else {\n k--;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n 
kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_ranks);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_ranks);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int local_kth = x[k - 1];\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n MPI_Allreduce(&local_kth, &kth, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return kth;\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth_smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n\n int global_kth_smallest;\n 
MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements to rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among all the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the k-th smallest elements from all ranks\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int n = x.size();\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kthSmallest = x[k - 1];\n MPI_Gather(&kthSmallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (myRank == 0) {\n std::sort(kthSmallest, kthSmallest + numRanks);\n return kthSmallest[k - 1];\n }\n 
return -1;\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + ".", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(nprocs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1; // Return -1 on all other ranks\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int n = x.size();\n int local_k = k / num_procs;\n int local_start = my_rank * local_k;\n int local_end = (my_rank + 1) * local_k;\n if (my_rank == num_procs - 1) {\n local_end = n;\n }\n\n std::nth_element(x.begin() + local_start, x.begin() + local_start + local_k - 1,\n x.begin() + local_end);\n int local_kth_smallest = x[local_start + local_k - 1];\n\n int global_kth_smallest;\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN,\n MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(comm_size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (comm_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the k-th smallest element on each rank\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n int local_kth = x[k - 1];\n\n // Gather the k-th smallest elements from all ranks\n std::vector gathered_kth(size);\n MPI_Gather(&local_kth, 1, MPI_INT, gathered_kth.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element\n if (rank == 0) {\n std::nth_element(gathered_kth.begin(), gathered_kth.begin() + k - 1, gathered_kth.end());\n return gathered_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int n = x.size();\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes its local k-th smallest element\n std::sort(x.begin(), x.end());\n int local_kth_smallest = x[k - 1];\n\n // Gather all local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "cuda", + "prompt": "/* Find the k-th smallest element of the vector x.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\n__global__ void findKthSmallest(const int *x, size_t N, int k, int *kthSmallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n int val = x[gid];\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < val) {\n count++;\n }\n }\n if (count == k - 1) {\n *kthSmallest = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[0] = min(kthSmallest[0], x[tid]);\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n for (int i = 1; i < blockDim.x; i++) {\n kthSmallest[0] = min(kthSmallest[0], kthSmallest[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = tid + blockId * blockDim;\n int end = min(start + blockDim, N);\n int localKthSmallest = x[start];\n for (int i = start + 1; i < end; i++) {\n if (x[i] < localKthSmallest) {\n localKthSmallest = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n int globalKthSmallest = localKthSmallest;\n for (int i = 1; i < gridDim; i++) {\n int otherKthSmallest;\n cudaMemcpy(&otherKthSmallest, &globalKthSmallest, sizeof(int), cudaMemcpyDeviceToDevice);\n if (otherKthSmallest < globalKthSmallest) {\n globalKthSmallest = otherKthSmallest;\n }\n }\n *kthSmallest = globalKthSmallest;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockDim = blockDim.x;\n int blockIdx = blockIdx.x;\n int i = blockIdx * blockDim + tid;\n if (i < N) {\n kthSmallest[tid] = x[i];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + blockDim);\n if (blockIdx == 0) {\n *kthSmallest = kthSmallest[k - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n __shared__ int shared[blockDim.x];\n shared[threadIdx.x] = x[tid];\n __syncthreads();\n for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {\n if (threadIdx.x < stride) {\n shared[threadIdx.x] = min(shared[threadIdx.x], shared[threadIdx.x + stride]);\n }\n __syncthreads();\n }\n if (threadIdx.x == 0) {\n if (k == blockIdx.x) {\n *kthSmallest = shared[0];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n int temp;\n for (int i = 1; i < blockDim.x; i++) {\n temp = *kthSmallest;\n *kthSmallest = min(*kthSmallest, x[i]);\n x[i] = min(temp, x[i]);\n }\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n temp = *kthSmallest;\n *kthSmallest = min(*kthSmallest, x[i]);\n x[i] = min(temp, x[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int idx = bid * bid_size + tid;\n \n // Find the k-th smallest element in the current block\n int block_kth_smallest = x[idx];\n for (int i = idx + bid_size; i < N; i += grid_size * bid_size) {\n if (x[i] < block_kth_smallest) {\n block_kth_smallest = x[i];\n }\n }\n \n // Find the k-th smallest element among all 
blocks\n if (tid == 0) {\n for (int i = 1; i < grid_size; i++) {\n int other_block_kth_smallest;\n cudaMemcpy(&other_block_kth_smallest, &x[i * bid_size], sizeof(int), cudaMemcpyDeviceToDevice);\n if (other_block_kth_smallest < block_kth_smallest) {\n block_kth_smallest = other_block_kth_smallest;\n }\n }\n *kthSmallest = block_kth_smallest;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[0] = min(kthSmallest[0], x[tid]);\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < blockDim.x; i++) {\n kthSmallest[0] = min(kthSmallest[0], kthSmallest[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockDim = blockDim.x;\n int blockIdx = blockIdx.x;\n int idx = blockIdx * blockDim + tid;\n if (idx < N) {\n if (tid == 0) {\n *kthSmallest = x[idx];\n } else {\n if (x[idx] < *kthSmallest) {\n *kthSmallest = x[idx];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[0] = x[tid];\n }\n __syncthreads();\n int numThreads = blockDim.x * gridDim.x;\n while (numThreads > 1) {\n if (tid < numThreads / 2) {\n kthSmallest[tid] = min(kthSmallest[tid], kthSmallest[tid + numThreads / 2]);\n }\n __syncthreads();\n numThreads /= 2;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int val = x[tid];\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < val) {\n count++;\n }\n }\n if (count == k - 1) {\n *kthSmallest = val;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n __shared__ int shared[blockDim.x];\n shared[threadIdx.x] = x[tid];\n __syncthreads();\n for (int i = 0; i < blockDim.x; i++) {\n if (shared[threadIdx.x] > shared[i]) {\n shared[threadIdx.x] = shared[i];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n if (k == blockIdx.x + 1) {\n *kthSmallest = shared[threadIdx.x];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int my_kth_smallest = x[tid];\n for (int i = tid + 1; i < N; i++) {\n if (x[i] < my_kth_smallest) {\n my_kth_smallest = x[i];\n }\n }\n atomicMin(kthSmallest, my_kth_smallest);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n int tid_warp = tid % 32;\n int warp_id = tid / 32;\n int warp_kthSmallest = *kthSmallest;\n for (int i = 1; i < 32; i *= 2) {\n __syncthreads();\n if (tid_warp < i) {\n if (warp_kthSmallest > x[tid + i * blockDim.x]) {\n warp_kthSmallest = x[tid + i * blockDim.x];\n }\n }\n }\n __syncthreads();\n if (warp_id == 0) {\n *kthSmallest = warp_kthSmallest;\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n int other_kthSmallest;\n cudaMemcpy(&other_kthSmallest, &x[i * blockDim.x], sizeof(int), cudaMemcpyDeviceToDevice);\n if (*kthSmallest > other_kthSmallest) {\n *kthSmallest = other_kthSmallest;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n if (tid == k - 1) {\n *kthSmallest = x[tid];\n }\n }\n}\n", + "\n\n 
int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n int temp;\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (x[i] > x[j]) {\n temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n *kthSmallest = x[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int start_idx = bid * bid_size + tid;\n int end_idx = min(start_idx + bid_size, N);\n int local_kth_smallest = x[start_idx];\n for (int i = start_idx + 1; i < end_idx; i++) {\n if (x[i] < local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n atomicMin(kthSmallest, local_kth_smallest);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Find the k-th smallest element of the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth = x[k - 1];\n int global_kth;\n\n // Find the k-th smallest element on each rank\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < local_kth) {\n local_kth = x[i];\n }\n }\n }\n\n // Reduce the local k-th smallest elements to find the global k-th smallest element\n MPI_Allreduce(&local_kth, &global_kth, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n // Return the global k-th smallest element on rank 0\n if (rank == 0) {\n return global_kth;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector to store the sorted elements\n std::vector sorted_x(n);\n\n // Sort the vector in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n sorted_x[i] = x[i];\n }\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Get the k-th smallest element\n int kth_smallest = sorted_x[k - 1];\n\n // Return the k-th smallest element on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return kth_smallest;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.006589629, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0066811726, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0066549372, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0066603678, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0066705444, 
+ "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0066838136, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0066885346, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006735761, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0067333715, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0067315534, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n return all_kth_smallest[k - 1];\n }\n\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the global k-th smallest element on rank 0\n return global_kth_smallest;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int kth = -1;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_k = k / size;\n int local_start = rank * local_k;\n int local_end = (rank + 1) * local_k;\n if (local_end > n) {\n local_end = n;\n }\n std::nth_element(x.begin() + local_start, x.begin() + local_start + local_k - 1, x.begin() + local_end);\n int local_kth = x[local_start + local_k - 1];\n MPI_Gather(&local_kth, 1, MPI_INT, &kth, 1, MPI_INT, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n std::vector kth_vec(size);\n for (int i = 0; i < size; i++) {\n kth_vec[i] = kth[i];\n }\n std::nth_element(kth_vec.begin(), kth_vec.begin() + k - 1, kth_vec.end());\n kth = kth_vec[k - 1];\n }\n return kth;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kth_smallest;\n MPI_Gather(&x[k - 1], 1, MPI_INT, &kth_smallest, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the local kth smallest element\n std::nth_element(x.begin() + rank * x.size() / size,\n x.begin() + rank * x.size() / size + x.size() / size - 1,\n x.begin() + rank * x.size() / size + k - 1);\n int local_kth = *(x.begin() + rank * x.size() / size + k - 1);\n\n // Gather all local kth smallest elements\n std::vector all_local_kth(size);\n MPI_Gather(&local_kth, 1, MPI_INT, all_local_kth.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global kth smallest element\n if (rank == 0) {\n std::nth_element(all_local_kth.begin(), all_local_kth.begin() + k - 1, all_local_kth.end());\n return all_local_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_k = k / size;\n int local_remainder = k % size;\n int local_start = rank * local_k;\n int local_end = local_start + local_k;\n if (rank < local_remainder) {\n local_start += rank;\n local_end += rank + 1;\n } else {\n local_start += local_remainder;\n local_end += 
local_remainder;\n }\n\n std::vector local_x(x.begin() + local_start, x.begin() + local_end);\n std::sort(local_x.begin(), local_x.end());\n\n int local_kth_smallest = local_x[local_k - 1];\n if (rank == 0) {\n std::vector kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest.begin(), kth_smallest.end());\n return kth_smallest[k - 1];\n } else {\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kth_smallest;\n MPI_Gather(&x[k - 1], 1, MPI_INT, &kth_smallest, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (rank == 0) {\n std::vector received_elements(size);\n MPI_Gather(&x[k - 1], 1, MPI_INT, received_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(received_elements.begin(), received_elements.end());\n return received_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the global k-th smallest element on rank 0\n return global_kth_smallest;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the local k-th smallest element on each rank.\n int local_kth_smallest = *std::max_element(x.begin(), x.end());\n int global_kth_smallest;\n\n // Find the global k-th smallest element.\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN);\n\n // Find the k-th smallest element on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n } else {\n return global_kth_smallest;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + 
"did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the result on rank 0\n return global_kth_smallest;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int local_k = k / nprocs;\n int local_remainder = k % nprocs;\n\n // Find the local k-th smallest element on each rank\n std::nth_element(x.begin(), x.begin() + local_k + (rank == nprocs - 1 ? local_remainder : 0), x.end());\n int local_kth_smallest = x[local_k + (rank == nprocs - 1 ? local_remainder : 0)];\n\n // Gather all local k-th smallest elements on rank 0\n std::vector kth_smallest_elements(nprocs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (rank == 0) {\n std::nth_element(kth_smallest_elements.begin(), kth_smallest_elements.begin() + k - 1, kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth = x[0];\n int global_kth;\n\n // Find the k-th smallest element on each rank\n #pragma omp parallel reduction(min : local_kth)\n {\n #pragma omp for\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < local_kth) {\n local_kth = x[i];\n }\n }\n }\n\n // Gather the local k-th smallest elements from all ranks\n MPI_Gather(&local_kth, 1, MPI_INT, &global_kth, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element\n if (rank == 0) {\n std::vector gathered_kth(size);\n for (int i = 0; i < size; i++) {\n gathered_kth[i] = global_kth[i];\n }\n std::sort(gathered_kth.begin(), gathered_kth.end());\n return gathered_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n 
// Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector gathered_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, gathered_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(gathered_elements.begin(), gathered_elements.end());\n global_kth_smallest = gathered_elements[k - 1];\n }\n\n return global_kth_smallest;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x.\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0.\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element.\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n return all_kth_smallest[k - 1];\n }\n\n return -1; // Dummy return value for non-rank-0 processes.\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int local_k = k / num_ranks;\n int local_start = my_rank * local_k;\n int local_end = (my_rank + 1) * local_k;\n\n std::sort(x.begin() + local_start, x.begin() + local_end);\n\n int local_kth_smallest = x[local_start + local_k - 1];\n\n int global_kth_smallest;\n MPI_Reduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": 
false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the k-th smallest element of its local data\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n int local_kth_smallest = x[k - 1];\n\n // Gather all local k-th smallest elements to rank 0\n std::vector all_local_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_local_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among all local k-th smallest elements\n if (rank == 0) {\n std::nth_element(all_local_kth_smallest.begin(), all_local_kth_smallest.begin() + k - 1, all_local_kth_smallest.end());\n return all_local_kth_smallest[k - 1];\n } else {\n return -1; // Dummy return value for non-rank-0 processes\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use Kokkos to sort in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(Kokkos::View &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA (const int& i, const int& j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n using exec_space = KokKOS::EXECSPACE;\n using device_type = KokKOS::DEVICE_TYPE;\n\n auto policy = KokKOS::parallel_sort_ascending(results,\n [](const Result &a, const Result &b) { return 
a.duration < b.duration; },\n exec_space(),\n device_type());\n auto result = KokKOS::sort_view(results, policy);\n results.copy(result);\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) -> bool { return results(i)->startTime < results(j)->startTime; });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int i, const int j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int& i, const int& j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::sort(results, Kokkos::Experimental::compare_fun(\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; }));\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, results.extent(0)),\n [results](const int &i, const int &j) { return results(i)->startTime < results(j)->startTime; });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use OpenMP to sort in parallel.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + ".", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = 0; j < results.size() - 1; j++) {\n if (results[j].startTime > results[j + 1].startTime) {\n std::swap(results[j], results[j + 1]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < 
results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < static_cast(results.size()); i++) {\n for (int j = i + 1; j < static_cast(results.size()); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size() - 1; i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n#pragma opm parallel\n#pragma opm reduction(merge:results:opm_sort(results, [](const Result& a, const Result& b) { return a.startTime < b.startTime; }))\n{\n#pragma opm sections\n{\n#pragma opm section\n{\n std::sort(results.begin(), results.end(), [](const Result& a, const Result& b) { return a.startTime < b.startTime; });\n}\n}\n}\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size() - 1; i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma opm parallel\n {\n #pragma opm for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if 
(results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "serial", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "<*>", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.start < b.start;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n 
std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "hip", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use AMD HIP to sort in parallel. The kernel is launched with at least as many threads as there are elements.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\n__global__ void sortByStartTime(Result *results, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) {\n return;\n }\n\n Result temp = results[idx];\n int minIdx = idx;\n for (int i = idx + 1; i < N; i++) {\n if (results[i].startTime < results[minIdx].startTime) {\n minIdx = i;\n }\n }\n\n results[idx] = results[minIdx];\n results[minIdx] = temp;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid >= N) return;\n for (int i = gid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = 
results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n __shared__ Result sharedResults[256];\n sharedResults[tid] = results[gid];\n __syncthreads();\n int stride = 256;\n while (stride > 0) {\n if (tid < stride && tid + stride < N && sharedResults[tid].startTime > sharedResults[tid + stride].startTime) {\n Result temp = sharedResults[tid];\n sharedResults[tid] = sharedResults[tid + stride];\n sharedResults[tid + stride] = temp;\n }\n __syncthreads();\n stride /= 2;\n }\n results[gid] = sharedResults[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid >= N) {\n return;\n }\n for (int i = gid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (results[gid].startTime > results[i].startTime) 
{\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use MPI to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of results. Store the output in results on rank 0.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n results = gatheredResults;\n } else {\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if 
(myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.start < b.start; });\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n std::vector gatheredResults;\n for (int i = 1; i < size; i++) {\n int numResults;\n MPI_Recv(&numResults, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n std::vector recvResults(numResults);\n MPI_Recv(recvResults.data(), numResults * sizeof(Result), MPI_BYTE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n gatheredResults.insert(gatheredResults.end(), recvResults.begin(), recvResults.end());\n }\n results = gatheredResults;\n } else {\n // Send the number of results and the results to rank 0\n int numResults = results.size();\n MPI_Send(&numResults, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);\n MPI_Send(results.data(), numResults * sizeof(Result), MPI_BYTE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), 
results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(size * results.size());\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 
0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the sorted results on rank 0\n if (rank == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort locally\n 
std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather all results to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_DOUBLE, gatheredResults.data(), results.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Sort gathered results\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "cuda", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use CUDA to sort in parallel. The kernel is launched with at least as many threads as there are elements.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\n__global__ void sortByStartTime(Result *results, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if 
(results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[i].startTime < results[tid].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + ".", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result tmp = results[tid];\n results[tid] = results[i];\n results[i] = tmp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + 
"parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use MPI and OpenMP to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of results. Store the outputs in results on rank 0.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks on rank 0\n std::vector allResults(results.size() * numRanks);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, allResults.data(), 
results.size(), MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n\n // Sort all results on rank 0\n if (myRank == 0) {\n std::sort(allResults.begin(), allResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = allResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int numRanks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather all results to rank 0\n std::vector allResults;\n if (rank == 0) {\n allResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, allResults.data(), results.size(),\n MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort all results on rank 0\n if (rank == 0) {\n std::sort(allResults.begin(), allResults.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = allResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(size * results.size());\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, gatheredResults.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + 
"did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the gathered results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, gatheredResults.data(), results.size(), MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, nullptr, 0, MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result 
&b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, gatheredResults.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results = gatheredResults;\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the outputs in results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Sort results by start time using MPI_Allgather and OpenMP\n int numRanks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort results on each rank\n #pragma openmp parallel\n {\n #pragma openmp for\n for (int i = 0; i < results.size(); i++) {\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n }\n }\n\n // Gather sorted results on rank 0\n std::vector sortedResults;\n if (rank == 0) {\n int totalSize = results.size() * numRanks;\n MPI_Gather(&results[0], results.size(), MPI_DATATYPE_RESULT, &results[0], results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results.resize(totalSize);\n } else {\n MPI_Gather(&results[0], results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + 
"did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the number of processes\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Get the rank of the process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the results on each process\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather the sorted results from all processes\n std::vector sorted_results(results.size() * num_procs);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT,\n sorted_results.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n\n // Copy the sorted results back to the input vector on rank 0\n if (rank == 0) {\n results = std::move(sorted_results);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the gathered results in results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recvResults;\n MPI_Recv(&recvResults, results.size(), MPI_FLOAT, i, 0, 
MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n results.insert(results.end(), recvResults.begin(), recvResults.end());\n }\n } else {\n MPI_Send(&results, results.size(), MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0043084108, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042862272, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004302033, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042829834, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042863538, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00448376, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042878022, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0044259514, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Sort the array x of complex numbers by their magnitude in ascending order.\n Use Kokkos to sort in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), [&](int i, int j) -> bool {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + 
"\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use OpenMP to sort in parallel.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Sort the vector in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the number of threads\n int num_threads = omp_get_num_threads();\n\n // Get the thread ID\n int thread_id = omp_get_thread_num();\n\n // Sort the portion of the vector assigned to this thread\n std::sort(x.begin() + thread_id, x.begin() + thread_id + num_threads);\n }\n\n // Merge the sorted portions of the vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "Magnitude:1. Moderato Moderato Moderato: 0. 
Moderato: Moderato: Moderato:forte forte forte piano piano piano piano piano piano piano: piano: piano: Moderato: piano: piano: forte forte forte: forte forte forte: forte: forte: piano piano piano piano piano piano: Piano: Piano: Moderato: Moderato: Moderato: piano: piano piano: forte forte forte: forte: forte: forte: Moderato:", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::polar(std::abs(x[i]), std::arg(x[i]));\n }\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n for (size_t j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::complex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "(x)", + "\n\n // Sort the vector x by the magnitude of its elements in ascending order.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::complex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n // Sort the vector x by the magnitude of its elements in ascending order.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp 
parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "serial", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + ":", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return abs(a) < abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "hip", + "prompt": 
"/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use AMD HIP to sort in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\n__global__ void sortComplexByMagnitude(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double mag = hipCrealf(x[tid]) * hipCrealf(x[tid]) + hipCrealf(x[tid]) * hipCrealf(x[tid]);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n double temp = x[tid].x;\n x[tid].x = x[i].x;\n x[i].x = temp;\n hipDoubleComplex temp_c = x[tid];\n x[tid] = x[i];\n x[i] = temp_c;\n }\n }\n }\n}\n", + "\n\n // Compute the magnitude of each complex number\n float *mag = new float[N];\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes\n std::sort(mag, mag + N);\n\n // Sort the complex numbers by their magnitudes\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (mag[i] == sqrt(x[j].x * x[j].x + x[j].y * x[j].y)) {\n x[i] = x[j];\n break;\n }\n }\n }\n\n delete[] mag;\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Get the block index\n int bid = hipBlockIdx_x;\n\n // Calculate the index of the element to be sorted\n int i = bid * hipBlockDim_x + tid;\n\n // Check if the index is within the bounds of the array\n if (i < N) {\n // Get the magnitude of the complex number\n double magnitude = hip_abs(x[i]);\n\n // Sort the complex number by its magnitude\n for (int j = i + 1; j < N; j++) {\n double magnitude2 = hip_abs(x[j]);\n if (magnitude > magnitude2) {\n // Swap the complex numbers\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n\n // Update the magnitude of the complex number\n magnitude = magnitude2;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double mag = hip_abs(x[gid]);\n x[gid] = make_hipDoubleComplex(mag, 0.0);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double magnitude = hip_abs(x[tid]);\n x[tid] = make_hipDoubleComplex(magnitude, 0.0);\n }\n __syncthreads();\n hipSort(x, x + N, hipDoubleComplexGreater);\n __syncthreads();\n if (tid < N) {\n double real = hip_real(x[tid]);\n double imag = hip_imag(x[tid]);\n x[tid] = make_hipDoubleComplex(real, imag);\n }\n}\n", + "\n\n // Get thread and block indices\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n\n // Compute the index of the element to be processed by this thread\n int index = bid * bid_size + tid;\n\n // If the index is out of range, exit\n if (index >= N) {\n return;\n }\n\n // Get the current element\n hipDoubleComplex current_element = x[index];\n\n // Compute the magnitude of the current element\n double magnitude = hip_abs(current_element);\n\n // Find the index of the element with the smallest magnitude so far\n int min_index = index;\n for (int i = index + 1; i < N; i++) {\n double current_magnitude = hip_abs(x[i]);\n if (current_magnitude < magnitude) {\n min_index = i;\n magnitude = current_magnitude;\n }\n }\n\n // Swap the current element with the element 
with the smallest magnitude so far\n hipDoubleComplex temp = x[index];\n x[index] = x[min_index];\n x[min_index] = temp;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = hipMag(x[tid]);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n hipSort(x, x + N, hipDoubleComplexCompare);\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n hipDoubleComplex c = x[tid];\n x[tid] = hipDoubleComplex{sqrt(c.x * c.x + c.y * c.y), 0.0};\n }\n}\n", + "\n\n // Sort the array using the radix sort algorithm\n for (int i = 0; i < N; i++) {\n int key = abs(x[i]);\n int j = i - 1;\n\n while (j >= 0 && abs(x[j]) > key) {\n x[j + 1] = x[j];\n j--;\n }\n x[j + 1] = hipDoubleComplex{key, 0};\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Sort the local part of the array\n std::sort(x + tid, x + tid + hipBlockDim_x);\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted parts of the array\n for (int stride = hipBlockDim_x; stride < N; stride *= 2) {\n for (int i = 0; i < N; i += 2 * stride) {\n if (hipAbs(x[i]) > hipAbs(x[i + stride])) {\n std::swap(x[i], x[i + stride]);\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Get the block index\n int bid = hipBlockIdx_x;\n\n // Calculate the index of the element to be sorted\n int index = bid * hipBlockDim_x + tid;\n\n // Check if the index is within the bounds of the array\n if (index < N) {\n // Get the magnitude of the complex number\n double magnitude = hipMag(x[index]);\n\n // Sort the complex number by its magnitude\n for (int i = index + 1; i < N; i++) {\n double magnitude_i = hipMag(x[i]);\n if (magnitude > magnitude_i) {\n // Swap the complex numbers\n hipDoubleComplex temp = x[index];\n x[index] = x[i];\n x[i] = temp;\n\n // Update the magnitude of the complex number\n magnitude = magnitude_i;\n }\n }\n }\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Sort the local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + hipBlockDim_x; j < N; j += hipBlockDim_x) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n // Compute the magnitude of each complex number\n float *mag = new float[N];\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes in ascending order\n std::sort(mag, mag + N);\n\n // Sort the complex numbers by their magnitude\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (mag[i] == sqrt(x[j].x * x[j].x + x[j].y * x[j].y)) {\n x[i] = x[j];\n break;\n }\n }\n }\n\n delete[] mag;\n}\n", + "\n\n // Calculate the magnitude of each complex number\n float *mag = (float *)malloc(N * sizeof(float));\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrtf(x[i].x * x[i].x + 
x[i].y * x[i].y);\n }\n\n // Sort the magnitudes using the hipSort function\n hipSort(mag, N, hipFloat);\n\n // Sort the complex numbers based on the sorted magnitudes\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (mag[i] > mag[j]) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n free(mag);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double magnitude = hipMagnitude(x[tid]);\n x[tid].x = magnitude;\n }\n __syncthreads();\n if (tid < N) {\n hipSort(x, N, sizeof(hipDoubleComplex), hipDoubleComplexMagnitudeCompare);\n }\n}\n", + "\n\n // Sort the vector x by magnitude in ascending order.\n // Use AMD HIP to sort in parallel. The kernel is launched with at least as many threads as elements in x.\n // Example:\n //\n // input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n // output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n\n // Get the thread index.\n int tid = hipThreadIdx_x;\n\n // Get the block index.\n int bid = hipBlockIdx_x;\n\n // Get the block size.\n int bsize = hipBlockDim_x;\n\n // Get the grid size.\n int gsize = hipGridDim_x;\n\n // Calculate the starting index for this thread.\n int start = tid + bid * bsize;\n\n // Calculate the ending index for this thread.\n int end = min(start + bsize, N);\n\n // Sort the elements in this thread's range.\n sort(x + start, x + end, [](hipDoubleComplex a, hipDoubleComplex b) { return abs(a) < abs(b); });\n\n // Synchronize all threads.\n hipDeviceSynchronize();\n\n // If this is the first thread in the grid, merge the sorted ranges.\n if (tid == 0) {\n for (int i = 1; i < gsize; i++) {\n // Get the starting index for this thread.\n int start = i * bsize;\n\n // Get the ending index for this thread.\n int end = min(start + bsize, N);\n\n // Merge the sorted ranges.\n merge(x, x + start, x + start, x + end, x);\n }\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Get the block ID\n int bid = hipBlockIdx_x;\n\n // Get the block size\n int bsize = hipBlockDim_x;\n\n // Get the number of blocks\n int nblocks = hipGridDim_x;\n\n // Calculate the starting index for this thread\n int start = tid + bid * bsize;\n\n // Calculate the ending index for this thread\n int end = min(start + bsize, N);\n\n // Sort the local portion of the array\n sort(x + start, x + end, [](hipDoubleComplex a, hipDoubleComplex b) {\n return abs(a) < abs(b);\n });\n\n // Synchronize all threads\n hipDeviceSynchronize();\n\n // Merge the sorted local portions of the array\n for (int i = 1; i < nblocks; i *= 2) {\n if (bid % (2 * i) == 0) {\n int partner = bid + i;\n if (partner < nblocks) {\n int start = i * bsize;\n int end = min(start + 2 * bsize, N);\n merge(x + start, x + start + bsize, x + start + bsize, x + end,\n x + start);\n }\n }\n hipDeviceSynchronize();\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Sort the local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted local elements\n for (int s = 1; s < N; s *= 2) {\n for (int i = tid; i < N; i += 2 * s) {\n int j = i + s;\n if (j < N && hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n // Sort the elements in the current block\n 
__shared__ hipDoubleComplex block[1024];\n block[threadIdx.x] = x[threadIdx.x];\n __syncthreads();\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(block[i]) > hipAbs(block[j])) {\n hipDoubleComplex temp = block[i];\n block[i] = block[j];\n block[j] = temp;\n }\n }\n }\n __syncthreads();\n\n // Copy the sorted elements back to the global memory\n x[threadIdx.x] = block[threadIdx.x];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use MPI to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered;\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, store the gathered vector in x\n if (rank == 0) {\n x = gathered;\n }\n}\n", + "\n\n // Get the size and rank of the current process\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the local vector of complex numbers\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), 
MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector to rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vectors on rank 0\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vectors from rank 0 to all ranks\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) 
{\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::sort(x.begin(), x.end(), [&magnitudes](std::complex a, std::complex b) {\n return magnitudes[a] < magnitudes[b];\n });\n\n // Gather the sorted vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector> recv;\n MPI::COMM_WORLD.Recv(recv, i, 0);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI::COMM_WORLD.Send(x, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Collect the sorted vectors on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(&recv, x.size(), MPI_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = std::move(sorted_x);\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), 
MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector to rank 0\n MPI_Scatter(gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Replace the original vector on rank 0 with the sorted vector\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the size and rank of the communicator\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted;\n if (rank == 0) {\n sorted.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, comm);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(sorted.begin(), sorted.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(sorted.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, comm);\n}\n", + "\n\n // Get the size and rank of the communicator\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), 
magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::vector> sorted_x(n);\n for (int i = 0; i < n; i++) {\n for (int j = 0; j < n; j++) {\n if (std::abs(x[j]) == magnitudes[i]) {\n sorted_x[i] = x[j];\n break;\n }\n }\n }\n\n // Gather the sorted vector on rank 0\n MPI_Gather(&sorted_x[0], n, MPI_DOUBLE_COMPLEX, &x[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n // Get the magnitude of the complex number\n double magnitude = std::abs(x[i]);\n // Store the magnitude in the vector\n mag[i] = magnitude;\n }\n\n // Sort the vector of magnitudes\n std::sort(mag.begin(), mag.end());\n\n // Sort the vector of complex numbers by magnitude\n std::sort(x.begin(), x.end(), [&](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::vector> sorted_x(n);\n for (int i = 0; i < n; i++) {\n for (int j = 0; j < n; j++) {\n if (std::abs(x[j]) == magnitudes[i]) {\n sorted_x[i] = x[j];\n break;\n }\n }\n }\n\n // Gather the sorted vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector> recv_x(n);\n MPI::COMM_WORLD.Recv(recv_x.data(), n, MPI::DOUBLE, i, 0);\n sorted_x.insert(sorted_x.end(), recv_x.begin(), recv_x.end());\n }\n } else {\n MPI::COMM_WORLD.Send(sorted_x.data(), n, MPI::DOUBLE, 0, 0);\n }\n\n // Update the vector x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n x = sorted_x;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "cuda", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use CUDA to sort in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\n__global__ void sortComplexByMagnitude(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(mag, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n cuDoubleComplex tmp = x[tid];\n int i = tid;\n while (i > 0 && cuCabs(x[i - 1]) > cuCabs(tmp)) {\n x[i] = x[i - 1];\n i--;\n }\n x[i] = tmp;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid].x = x[tid].y;\n x[tid].y = 0.0;\n }\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ cuDoubleComplex block_data[256];\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int block_size = blockDim.x;\n int start = bid * block_size;\n int end = min(start + block_size, N);\n for (int i = start; i < end; i++) {\n block_data[tid] = x[i];\n __syncthreads();\n for (int j = tid + 1; j < block_size; j++) {\n if (abs(block_data[tid]) > abs(block_data[j])) {\n cuDoubleComplex temp = block_data[tid];\n block_data[tid] = block_data[j];\n block_data[j] = temp;\n }\n }\n __syncthreads();\n }\n\n // Copy the sorted elements back to the global memory\n for (int i = start; i < end; i++) {\n x[i] = block_data[tid];\n __syncthreads();\n }\n}\n", + "\n\n // Get the thread ID\n int tid = threadIdx.x;\n\n // Sort the local part of the array\n thrust::sort(x + tid, x + tid + min(N - tid, blockDim.x),\n [](cuDoubleComplex a, cuDoubleComplex b) { return cuCabs(a) < cuCabs(b); });\n\n // Synchronize threads\n __syncthreads();\n\n // Merge the sorted parts of the array\n for (int s = blockDim.x; s <= N; s *= 2) {\n if (tid < N - s) {\n cuDoubleComplex a = x[tid];\n cuDoubleComplex b = x[tid + s];\n if (cuCabs(a) > cuCabs(b)) {\n x[tid] = b;\n x[tid + s] = a;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = cuCabs(x[tid]);\n x[tid] = cuCmul(x[tid], cuCdiv(1.0, magnitude));\n }\n}\n", + "\n\n // Get the thread index\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n\n // If the thread index is within the bounds of the array, sort the element\n if (tid < N) {\n // Calculate the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n\n // Sort the element by magnitude\n for (int i = tid + 1; i < N; i++) {\n double otherMagnitude = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n if (magnitude > otherMagnitude) {\n // Swap the elements\n cuDoubleComplex temp = x[tid];\n x[tid] = x[i];\n x[i] = temp;\n\n // Update the magnitude of the current element\n magnitude = otherMagnitude;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = mag;\n x[tid].y = 0.0;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n cuDoubleComplex 
tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid].x = creal(x[tid]);\n x[tid].y = cimag(x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n while (i < N) {\n cuDoubleComplex temp;\n for (int j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = magnitude;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n double temp = x[tid].x;\n x[tid].x = x[i].x;\n x[i].x = temp;\n cuDoubleComplex temp2 = x[tid];\n x[tid] = x[i];\n x[i] = temp2;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n for (int i = tid + 1; i < N; i++) {\n double mag_i = cuCabs(x[i]);\n if (mag > mag_i) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n mag = mag_i;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= N) return;\n \n // Compute the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n \n // Store the magnitude in shared memory\n __shared__ double magnitudes[N];\n magnitudes[tid] = magnitude;\n \n // Synchronize threads\n __syncthreads();\n \n // Sort the magnitudes in shared memory\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (magnitudes[i] > magnitudes[j]) {\n double temp = magnitudes[i];\n magnitudes[i] = magnitudes[j];\n magnitudes[j] = temp;\n }\n }\n }\n \n // Synchronize threads\n __syncthreads();\n \n // Find the index of the complex number with the given magnitude\n int index = -1;\n for (int i = 0; i < N; i++) {\n if (magnitudes[i] == magnitude) {\n index = i;\n break;\n }\n }\n \n // Swap the complex numbers\n cuDoubleComplex temp = x[tid];\n x[tid] = x[index];\n x[index] = temp;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= N) return;\n \n double magnitude = cuCabs(x[tid]);\n x[tid] = cuCmul(x[tid], cuCdiv(1.0, magnitude));\n \n __shared__ double shared_magnitude[blockDim.x];\n shared_magnitude[threadIdx.x] = magnitude;\n __syncthreads();\n \n for (int i = 0; i < blockDim.x; i++) {\n if (tid + i + 1 < N && shared_magnitude[tid] > shared_magnitude[tid + i + 1]) {\n double temp = shared_magnitude[tid];\n shared_magnitude[tid] = shared_magnitude[tid + i + 1];\n shared_magnitude[tid + i + 1] = temp;\n \n cuDoubleComplex temp_complex = x[tid];\n x[tid] = x[tid + i + 1];\n x[tid + i + 1] = temp_complex;\n }\n }\n \n}\n", + "\n\n // Each thread sorts a single element of x.\n int tid = threadIdx.x;\n if (tid < N) {\n // Compute the magnitude of the complex number.\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n // Store the magnitude in the complex number.\n x[tid].x = magnitude;\n }\n // Sort the complex numbers by their magnitude.\n thrust::sort(x, x + N, thrust::less());\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ cuDoubleComplex block[32];\n block[threadIdx.x] = x[blockIdx.x * blockDim.x + threadIdx.x];\n __syncthreads();\n for (int i = 0; i < blockDim.x; i++) {\n for (int j = i + 1; j < blockDim.x; j++) {\n if (cuCabsf(block[i]) > cuCabsf(block[j])) {\n cuDoubleComplex temp = block[i];\n block[i] = 
block[j];\n block[j] = temp;\n }\n }\n }\n __syncthreads();\n\n // Copy the sorted elements back to the global memory\n x[blockIdx.x * blockDim.x + threadIdx.x] = block[threadIdx.x];\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n cuDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n // Each thread sorts a single element\n int tid = threadIdx.x;\n if (tid < N) {\n // Compute the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n // Store the magnitude in shared memory\n __shared__ double magnitudes[N];\n magnitudes[tid] = magnitude;\n // Synchronize threads\n __syncthreads();\n // Sort the magnitudes in shared memory\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (magnitudes[i] > magnitudes[j]) {\n double temp = magnitudes[i];\n magnitudes[i] = magnitudes[j];\n magnitudes[j] = temp;\n }\n }\n }\n // Synchronize threads\n __syncthreads();\n // Find the index of the sorted magnitude in the original array\n int index = -1;\n for (int i = 0; i < N; i++) {\n if (magnitudes[tid] == sqrt(x[i].x * x[i].x + x[i].y * x[i].y)) {\n index = i;\n break;\n }\n }\n // Swap the complex number at the current index with the complex number at the sorted index\n if (index != -1) {\n cuDoubleComplex temp = x[tid];\n x[tid] = x[index];\n x[index] = temp;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(magnitude, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n x[tid] = make_cuDoubleComplex(creal(x[tid]), cimag(x[tid]));\n }\n}\n", + "\n\n // Get the index of the current element\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n\n // If the index is out of bounds, return\n if (idx >= N) {\n return;\n }\n\n // Get the magnitude of the current element\n double magnitude = abs(x[idx]);\n\n // Find the index of the element with the smallest magnitude\n int minIdx = idx;\n for (int i = idx + 1; i < N; i++) {\n double otherMagnitude = abs(x[i]);\n if (otherMagnitude < magnitude) {\n minIdx = i;\n magnitude = otherMagnitude;\n }\n }\n\n // Swap the current element with the element with the smallest magnitude\n if (minIdx != idx) {\n double temp = x[idx];\n x[idx] = x[minIdx];\n x[minIdx] = temp;\n }\n}\n", + "\n\n // Sort the complex numbers by their magnitude in ascending order.\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n cuDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(mag, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n int i = tid;\n while (i > 0 && cuCreal(x[i]) < cuCreal(x[i - 1])) {\n swap(x[i], x[i - 1]);\n i--;\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid] = make_cuDoubleComplex(cuCreal(x[tid]), cuCimag(x[tid]));\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use MPI and OpenMP to sort in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0796243568, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801943058, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.080224052, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0802478046, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801866938, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0803134054, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0802668442, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0802174494, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector x on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.07992798, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.080221421, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801379802, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0802078256, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.080518438, + 
"num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801484738, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0802409274, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801556112, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector of complex numbers.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0797311352, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0804271084, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0800860208, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.080058317, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.080075964, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.080145388, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0800916858, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801049886, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(size * x.size());\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0797116744, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1273110686, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.127302295, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.12672416, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1272428654, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1266441862, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1273758608, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.127173313, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0795850986, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.127364229, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1273607886, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1279351162, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1272479148, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.128037793, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.126727559, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.126740497, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n 
MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0797124278, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1266563648, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.127228302, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1272071764, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.12726358, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1273632452, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1272932742, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1272076514, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector x on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector x.\n if (rank != 0) {\n x.clear();\n }\n\n // Copy the sorted vector to x on rank 0.\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0795812776, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0802670266, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0802170264, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0802784578, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801950594, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801990044, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801646998, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801264102, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + 
"num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Sort the vector x on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vector x from all ranks on rank 0\n std::vector> sorted_x(x.size() * nranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_PRECISION, sorted_x.data(), x.size(), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector x on rank 0\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Scatter the sorted vector x from rank 0 to all ranks\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_PRECISION, x.data(), x.size(), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.079633628, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1273871694, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1268666872, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1274924248, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1274090284, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1273590948, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1277552584, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1274530254, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return 
std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0796467106, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0799949936, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.080171138, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801783896, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801584684, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0802544646, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.080183479, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801664472, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0799025542, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0803393898, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801205452, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08019242, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0800702168, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801848238, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801205052, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08022664, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n 
MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0798883238, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801023232, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.080409698, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801413282, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801075608, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0800962996, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801327742, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0802265402, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0796338202, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1273778822, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1274237244, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1275123648, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.127517811, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1274351594, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1273837656, + 
"num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1268697068, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the local vector.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all other ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.07971803, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801019184, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0800721742, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801606588, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0800930818, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801110898, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0800514678, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801286604, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0795466846, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801077102, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0802722118, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.080184317, + "num_procs": 1, + "num_threads": 4 + }, + { 
+ "did_run": true, + "is_valid": true, + "runtime": 0.0802784576, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801474918, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.080233012, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0802102014, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Get the number of ranks and the rank of the current process\n int nprocs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * nprocs);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0793444754, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1274094372, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.127145831, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1268627038, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1274287076, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.127440736, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1274459182, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1271516718, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vector and store it in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + 
"did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0793302726, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1274818676, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1270981374, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1274834982, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1271970008, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1275813582, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1274241412, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1272669418, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Sort the vector x on each rank by magnitude in ascending order.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> gathered_x(x.size() * MPI::world().size());\n MPI::world().all_gather(x.data(), x.size(), MPI::DOUBLE_REAL, gathered_x.data(), x.size(), MPI::DOUBLE_REAL);\n\n // Replace the vector x on rank 0 with the gathered sorted vectors.\n if (MPI::world().rank() == 0) {\n x = gathered_x;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, copy the sorted vector back to x.\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.079561821, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0802458522, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801773236, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0802087026, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.080219575, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801719918, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801811692, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.0801239018, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector x.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0797362668, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.080194075, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801601714, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0800787212, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0801143178, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0800645322, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0803637758, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.080094873, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ii++) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); jj++) {\n if (input(ii, jj) == 1) { num_neighbors++; }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++num_ones;\n }\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ii++) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); jj++) {\n if (ii == i && jj == j) continue;\n num_ones += (input(ii, jj) == 1);\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++num_ones;\n }\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int numNeighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && input(ii, jj) == 1) {\n ++numNeighbors;\n }\n }\n }\n output(i, j) = (numNeighbors == 1) ? 
1 : 0;\n });\n}\n", + "Kokkoss_t_t", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy<>(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++count;\n }\n }\n }\n output(i, j) = (count == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const size_t i, const size_t j) {\n int sum = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input(ii, jj);\n }\n }\n }\n output(i, j) = (sum == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& j) {\n int count = 0;\n if (i > 0) count += input(i - 1, j);\n if (i < N - 1) count += input(i + 1, j);\n if (j > 0) count += input(i, j - 1);\n if (j < N - 1) count += input(i, j + 1);\n output(i, j) = (count == 1);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { sum += input(ii, jj); }\n }\n }\n output(i, j) = (sum == 1);\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (ii == i && jj == j) continue;\n num_neighbors += input(ii, jj);\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && input(ii, jj) == 1) {\n num_neighbors++;\n }\n }\n }\n output(i, j) = (num_neighbors == 1) ? 1 : 0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cells\", Kokkos::RangePolicy>(N, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++count;\n }\n }\n }\n output(i, j) = (count == 1);\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1][j];\n if (i < N - 1) num_ones += input[i + 1][j];\n if (j > 0) num_ones += input[i][j - 1];\n if (j < N - 1) num_ones += input[i][j + 1];\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) {\n count += input[i - 1][j];\n }\n if (i < N - 1) {\n count += input[i + 1][j];\n }\n if (j > 0) {\n count += 
input[i][j - 1];\n }\n if (j < N - 1) {\n count += input[i][j + 1];\n }\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma opm simd\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0 && input[i - 1][j] == 1) count++;\n if (i < N - 1 && input[i + 1][j] == 1) count++;\n if (j > 0 && input[i][j - 1] == 1) count++;\n if (j < N - 1 && input[i][j + 1] == 1) count++;\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n 
#pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1 + j * N];\n if (i < N - 1) num_neighbors += input[i + 1 + j * N];\n if (j > 0) num_neighbors += input[i + (j - 1) * N];\n if (j < N - 1) num_neighbors += input[i + (j + 1) * N];\n output[i + j * N] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j] == 1) ++num_neighbors;\n if (j > 0 && input[i][j - 1] == 1) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1] == 1) ++num_neighbors;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1][j];\n if (i < N - 1) num_ones += input[i + 1][j];\n if (j > 0) num_ones += input[i][j - 1];\n if (j < N - 1) num_ones += input[i][j + 1];\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n size_t num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "serial", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. 
Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0 && input[i - 1][j] == 1) ++count;\n if (i < N - 1 && input[i + 1][j] == 1) ++count;\n if (j > 0 && input[i][j - 1] == 1) ++count;\n if (j < N - 1 && input[i][j + 1] == 1) ++count;\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (i + di >= 0 && i + di < N && j + dj >= 0 && j + dj < N) {\n num_ones += input[(i + di) * N + j + dj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n 
}\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (size_t k = i - 1; k <= i + 1; k++) {\n for (size_t l = j - 1; l <= j + 1; l++) {\n if (k >= 0 && k < N && l >= 0 && l < N && (k != i || l != j)) {\n num_neighbors += input[k * N + l];\n }\n }\n }\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j] == 1) ++num_neighbors;\n if (j > 0 && input[i][j - 1] == 1) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1] == 1) ++num_neighbors;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; 
j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i][j] = (num_neighbors == 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "hip", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\n__global__ void cellsXOR(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) { sum += input[ny * N + nx]; }\n }\n }\n output[y * N + x] = (sum == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = count == 1 ? 
1 : 0;\n }\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n if (x > 0) num_neighbors += input[y * N + x - 1];\n if (x < N - 1) num_neighbors += input[y * N + x + 1];\n if (y > 0) num_neighbors += input[(y - 1) * N + x];\n if (y < N - 1) num_neighbors += input[(y + 1) * N + x];\n output[y * N + x] = (num_neighbors == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int count = 0;\n if (i > 0) count += input[i - 1 + j * N];\n if (i < N - 1) count += input[i + 1 + j * N];\n if (j > 0) count += input[i + (j - 1) * N];\n if (j < N - 1) count += input[i + (j + 1) * N];\n output[i + j * N] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1 + j * N];\n if (i < N - 1) num_ones += input[i + 1 + j * N];\n if (j > 0) num_ones += input[i + (j - 1) * N];\n if (j < N - 1) num_ones += input[i + (j + 1) * N];\n output[i + j * N] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int num_ones = 0;\n if (x - 1 >= 0) num_ones += input[(x - 1) * N + y];\n if (x + 1 < N) num_ones += input[(x + 1) * N + y];\n if (y - 1 >= 0) num_ones += input[x * N + y - 1];\n if (y + 1 < N) num_ones += input[x 
* N + y + 1];\n output[x * N + y] = (num_ones == 1);\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int sum = 0;\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n int idy = blockIdx.y * blockDim.y + threadIdx.y;\n if (idx < N && idy < N) {\n int left = idx > 0 ? input[idx - 1 + idy * N] : 0;\n int right = idx < N - 1 ? input[idx + 1 + idy * N] : 0;\n int up = idy > 0 ? input[idx + (idy - 1) * N] : 0;\n int down = idy < N - 1 ? input[idx + (idy + 1) * N] : 0;\n sum = left + right + up + down;\n }\n output[idx + idy * N] = input[idx + idy * N] ? (sum == 1 ? 1 : 0) : 0;\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0)\n continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n if (x > 0) {\n count += input[y * N + x - 1];\n }\n if (x < N - 1) {\n count += input[y * N + x + 1];\n }\n if (y > 0) {\n count += input[(y - 1) * N + x];\n }\n if (y < N - 1) {\n count += input[(y + 1) * N + x];\n }\n output[y * N + x] = (count == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t num_rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * num_rows_per_rank;\n size_t end_row = std::min((rank + 1) * num_rows_per_rank, N);\n\n // Each rank computes its subset of the output.\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks.\n MPI_Gather(output.data() + start_row * N, (end_row - start_row) * N, MPI_INT,\n output.data(), (end_row - start_row) * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n std::vector rowXOR(N, 0);\n for (size_t j = 0; j < N; ++j) {\n if (input[rank * N + j] == 1) {\n rowXOR[j] = 1;\n }\n }\n\n // Reduce the XORs of all rows to rank 0\n std::vector allRowXORs(N, 0);\n MPI_Reduce(rowXOR.data(), allRowXORs.data(), N, MPI_INT, MPI_BXOR, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the final output\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int numNeighbors = 0;\n if (i > 0 && allRowXORs[i - 1] == 1) {\n ++numNeighbors;\n }\n if (i < N - 1 && allRowXORs[i + 1] == 1) {\n ++numNeighbors;\n }\n if (j > 0 && input[i * N + j - 1] == 1) {\n ++numNeighbors;\n }\n if (j < N - 1 && input[i * N + j + 1] == 1) {\n ++numNeighbors;\n }\n output[i * N + j] = (numNeighbors == 1);\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv(N * N);\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] ^= recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, (rank + 1) * nRowsPerRank);\n std::vector localInput(nRowsPerRank * N);\n std::vector localOutput(nRowsPerRank * N);\n MPI_Scatter(input.data(), N * N, MPI_INT, localInput.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int numNeighbors = 0;\n if (i > 0) numNeighbors += localInput[(i - 1) * N + j];\n if (i < N - 1) numNeighbors += localInput[(i + 1) * N + j];\n if (j > 0) numNeighbors += localInput[i * N + j - 1];\n if (j < N - 1) numNeighbors += localInput[i * N + j + 1];\n localOutput[i * N + j] = (numNeighbors == 1);\n }\n }\n MPI_Gather(localOutput.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Every rank computes the XOR of its own cells\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (my_rank == 0) {\n for (int i = 1; i < num_ranks; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] |= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], 
(end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n // Check the neighbors\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] ^= recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n std::vector local_output(N, 0);\n for (size_t i = 0; i < N; ++i) {\n int count = 0;\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n continue;\n }\n count += input[i * N + j];\n }\n local_output[i] = count == 1 ? 
1 : 0;\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_output.data(), N, MPI_INT, output.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of the output\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Each rank computes its subset of output.\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int rows_per_rank = (N + size - 1) / size;\n int start_row = i * rows_per_rank;\n int end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min((rank + 1) * chunk_size, N);\n\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min((i + 1) * chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n 
int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Gather all the results to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] ^= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own rows\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] ^= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, 
MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of neighbors that are 1.\n std::vector num_neighbors(N * N, 0);\n MPI_Allreduce(MPI_IN_PLACE, num_neighbors.data(), num_neighbors.size(), MPI_INT, MPI_SUM, comm);\n\n // Set the output to 1 if there is exactly one neighbor that is 1.\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n output[i * N + j] = (num_neighbors[i * N + j] == 1);\n }\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes its local part of the output.\n for (size_t i = my_rank; i < N; i += num_procs) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; i++) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] |= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Each rank computes its subset of the output.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks onto rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "cuda", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\n__global__ void cellsXOR(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0 && input[y * N + x - 1]) num_ones++;\n if (x < N - 1 && input[y * N + x + 1]) num_ones++;\n if (y > 0 && input[(y - 1) * N + x]) num_ones++;\n if (y < N - 1 && input[(y + 1) * N + x]) num_ones++;\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int sum = 0;\n if (x > 0) {\n sum += input[y * N + x - 1];\n }\n if (x < N - 1) {\n sum += input[y * N + x + 1];\n }\n if (y > 0) {\n sum += input[(y - 1) * N + x];\n }\n if (y < N - 1) {\n sum += input[(y + 1) * N + x];\n }\n output[y * N + x] = (sum == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[(y * N) + (x - 1)];\n if (x < N - 1) sum += input[(y * N) + (x + 1)];\n if (y > 0) sum += input[((y - 1) * N) + x];\n if (y < N - 1) sum += input[((y + 1) * N) + x];\n output[(y * N) + x] = (sum == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[(y - 1) * N + x - 1];\n if (x < N - 1) sum += input[(y - 1) * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n if (y > 0 && x > 0) sum += input[(y - 1) * N + x - 1];\n if (y > 0 && x < N - 1) sum += input[(y - 1) * 
N + x + 1];\n if (y < N - 1 && x > 0) sum += input[(y + 1) * N + x - 1];\n if (y < N - 1 && x < N - 1) sum += input[(y + 1) * N + x + 1];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n sum += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n int num_ones = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n num_ones += input[r * N + c];\n }\n }\n }\n output[row * N + col] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1] == 1) count++;\n if (x < N - 1 && input[y * N + x + 1] == 1) count++;\n if (y > 0 && input[(y - 1) * N + x] == 1) count++;\n if (y < N - 1 && input[(y + 1) * N + x] == 1) count++;\n output[y * N + x] = (count == 1) ? 
1 : 0;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1]) count++;\n if (x < N - 1 && input[y * N + x + 1]) count++;\n if (y > 0 && input[(y - 1) * N + x]) count++;\n if (y < N - 1 && input[(y + 1) * N + x]) count++;\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1]) count++;\n if (x < N - 1 && input[y * N + x + 1]) count++;\n if (y > 0 && input[(y - 1) * N + x]) count++;\n if (y < N - 1 && input[(y + 1) * N + x]) count++;\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int count = 0;\n if (i > 0) count += input[i - 1 + j * N];\n if (i < N - 1) count += input[i + 1 + j * N];\n if (j > 0) count += input[i + (j - 1) * N];\n if (j < N - 1) count += input[i + (j + 1) * N];\n output[i + j * N] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t start = rank * N / size;\n size_t end = (rank + 1) * N / size;\n\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n // Check the neighbors of input_{i,j}.\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (count == 1);\n }\n }\n\n // Gather the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start = i * N / size;\n size_t end = (i + 1) * N / size;\n for (size_t j = start; j < end; ++j) {\n for (size_t k = 0; k < N; ++k) {\n output[j * N + k] += output[i * N * N + j * N + k];\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n int startRow = rank 
* nRows;\n int endRow = startRow + nRows;\n if (rank == size - 1) {\n endRow = N;\n }\n int nCols = N;\n int nNeighbors = 0;\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < nCols; j++) {\n nNeighbors = 0;\n if (i - 1 >= 0) {\n nNeighbors += input[(i - 1) * N + j];\n }\n if (i + 1 < N) {\n nNeighbors += input[(i + 1) * N + j];\n }\n if (j - 1 >= 0) {\n nNeighbors += input[i * N + j - 1];\n }\n if (j + 1 < N) {\n nNeighbors += input[i * N + j + 1];\n }\n if (nNeighbors == 1) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int nRowsRecv = N / size;\n int startRowRecv = i * nRowsRecv;\n int endRowRecv = startRowRecv + nRowsRecv;\n if (i == size - 1) {\n endRowRecv = N;\n }\n for (int j = startRowRecv; j < endRowRecv; j++) {\n MPI_Recv(&output[j * N], N * sizeof(int), MPI_INT, i, 0, MPI_STATUS_IGNORE);\n }\n }\n } else {\n MPI_Send(&output[startRow * N], (endRow - startRow) * N * sizeof(int), MPI_INT, 0, 0, MPI_STATUS_IGNORE);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nThreads = omp_get_max_threads();\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, startRow + nRowsPerRank);\n std::vector localOutput(N * nRowsPerRank);\n #pragma omp parallel num_threads(nThreads)\n {\n #pragma omp for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int numNeighbors = 0;\n if (i > 0) numNeighbors += input[i - 1][j];\n if (i < N - 1) numNeighbors += input[i + 1][j];\n if (j > 0) numNeighbors += input[i][j - 1];\n if (j < N - 1) numNeighbors += 
input[i][j + 1];\n localOutput[i * N + j] = (numNeighbors == 1);\n }\n }\n }\n MPI_Gather(&localOutput[0], N * nRowsPerRank, MPI_INT, &output[0], N * nRowsPerRank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + num_procs - 1) / num_procs;\n size_t start_row = my_rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of rows in parallel\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check neighbors\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its own part of the output\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += nprocs) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n std::vector recv(N * N);\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Each rank computes its subset of the output in parallel\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors 
= 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the XOR of its own rows\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += nprocs) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n MPI_Recv(&output[i * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different part of the output\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[(i - 1) * N + j];\n if (i < N - 1) num_neighbors += input[(i + 1) * N + j];\n if (j > 0) num_neighbors += input[i * N + j - 1];\n if (j < N - 1) num_neighbors += input[i * N + j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all other ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0882738254, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0249465458, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0101804512, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0051462628, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045784856, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.0040320106, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0039259776, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038966204, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = (N + num_ranks - 1) / num_ranks;\n int start_row = my_rank * rows_per_rank;\n int end_row = std::min(N, (my_rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (my_rank == 0) {\n for (int i = 1; i < num_ranks; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank, num_ranks;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = N / num_ranks;\n int start_row = my_rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j]) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j]) ++num_neighbors;\n if (j > 0 && input[i][j - 1]) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1]) ++num_neighbors;\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all other ranks.\n if (my_rank == 0) {\n for (int rank = 1; rank < num_ranks; ++rank) {\n int rows_per_rank = N / num_ranks;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row][0], rows_per_rank * N, MPI_INT, rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n // Other ranks send their results to rank 0.\n MPI_Send(&output[start_row][0], rows_per_rank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local part of the output.\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Each rank computes its subset of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": 
false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows.\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of the rows in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all the other ranks.\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n std::vector local_input(input.begin() + start_row * N, input.begin() + end_row * N);\n std::vector local_output(rows_per_rank * N, 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < rows_per_rank; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += local_input[(i - 1) * N + j];\n if (i < rows_per_rank - 1) count += local_input[(i + 1) * N + j];\n if (j > 0) count += local_input[i * N + j - 1];\n if (j < N - 1) count += local_input[i * N + j + 1];\n local_output[i * N + j] = (count == 1);\n }\n }\n\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0858785554, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0415137756, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.031018333, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0258091744, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0246167976, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0265166654, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0291064262, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": 
true, + "runtime": 0.031674392, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows in output.\n size_t rows_per_rank = (N + num_procs - 1) / num_procs;\n size_t start_row = my_rank * rows_per_rank;\n size_t end_row = std::min(N, (my_rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of rows in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks into output on rank 0.\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // TODO: Implement cellsXOR\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows of output\n int rows_per_rank = N / num_procs;\n int start_row = my_rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes its subset of the rows of output in parallel\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. 
Otherwise set it to 0.\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) continue;\n num_neighbors += input(ii, jj);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output(i, j) = 0;\n else\n output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { count += input(ii, jj); }\n }\n }\n count -= input(i, j);\n if (input(i, j) == 1) {\n output(i, j) = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output(i, j) = (count == 3) ? 1 : 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int numNeighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n numNeighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (numNeighbors < 2 || numNeighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (numNeighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n count += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (count == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + 
jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (count == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && !(ii == i && jj == j)) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (num_neighbors == 3) { output(i, j) = 1; } else { output(i, j) = 0; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) { num_neighbors += input(x, y); }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) output(i, j) = 0;\n else output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n else output(i, j) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + 
jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (num_neighbors == 3) { output(i, j) = 1; } else { output(i, j) = 0; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n num_neighbors += input(x, y);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output(i, j) = 0;\n else\n output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x < 0 || x >= N || y < 0 || y >= N) continue;\n num_neighbors += input(x, y);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n 
}\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) { count += input(ii, jj); }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (count == 3) { output(i, j) = 1; } else { output(i, j) = 0; }\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. 
Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n count += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n count += input[(i + ii) * N + j + jj];\n }\n }\n }\n count -= input[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (int)i + di;\n int jj = (int)j + dj;\n if (ii < 0 || ii >= (int)N || jj < 0 || jj >= (int)N) continue;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = num_neighbors == 2 || num_neighbors == 3;\n } else {\n output[i * N + j] = num_neighbors == 3;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } 
else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n neighbors += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = (i + dx + N) % N;\n int y = (j + dy + N) % N;\n num_neighbors += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "serial", + "prompt": "/* Simulate one generation of Game of Life on `input`. 
Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (size_t di = -1; di <= 1; di++) {\n for (size_t dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ni = i + di;\n int nj = j + dj;\n if (ni < 0 || ni >= N || nj < 0 || nj >= N) {\n continue;\n }\n num_neighbors += input[ni * N + nj];\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int x = -1; x <= 1; ++x) {\n for (int y = -1; y <= 1; ++y) {\n if (x == 0 && y == 0) continue;\n int nx = i + x;\n int ny = j + y;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[nx * N + ny];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if 
(ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = num_neighbors == 2 || num_neighbors == 3 ? 1 : 0;\n } else {\n output[i * N + j] = num_neighbors == 3 ? 1 : 0;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t x = i - 1; x <= i + 1; ++x) {\n for (size_t y = j - 1; y <= j + 1; ++y) {\n if (x >= 0 
&& x < N && y >= 0 && y < N && (x != i || y != j)) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n neighbors += input[(i + ii) * N + j + jj];\n }\n }\n }\n neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n size_t num_neighbors = 0;\n for (size_t x = i - 1; x <= i + 1; ++x) {\n for (size_t y = j - 1; y <= j + 1; ++y) {\n if (x >= 0 && x < N && y >= 0 && y < N && (x != i || y != j)) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int x = i + di;\n int y = j + dj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "hip", + "prompt": "/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\n__global__ void gameOfLife(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n num_neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && 
neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) return;\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n count -= input[y * N + x];\n if (input[y * N + x] == 1) {\n output[y * 
N + x] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[y * N + x] = (count == 3) ? 1 : 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0)\n continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output[y * N + x] = 0;\n else\n output[y * N + x] = 1;\n } else {\n if (num_neighbors == 3)\n output[y * N + x] = 1;\n else\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipThreadIdx().x;\n int y = hipThreadIdx().y;\n int neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = (x + i + N) % N;\n int neighbor_y = (y + j + N) % N;\n neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n if (input[y * N + x] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny 
< N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n if (x + i >= 0 && x + i < N && y + j >= 0 && y + j < N) {\n count += input[(x + i) * N + y + j];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (count < 2 || count > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (count == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. 
Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n_rows = N / size;\n int n_cols = N;\n int n_elements = n_rows * n_cols;\n std::vector local_input(n_elements);\n std::vector local_output(n_elements);\n MPI_Scatter(input.data(), n_elements, MPI_INT, local_input.data(), n_elements, MPI_INT, 0, MPI_COMM_WORLD);\n for (int i = 0; i < n_rows; i++) {\n for (int j = 0; j < n_cols; j++) {\n int n_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int ii_global = i + ii;\n int jj_global = j + jj;\n if (ii_global >= 0 && ii_global < n_rows && jj_global >= 0 && jj_global < n_cols) {\n n_neighbors += local_input[ii_global * n_cols + jj_global];\n }\n }\n }\n if (local_input[i * n_cols + j] == 1) {\n if (n_neighbors < 2 || n_neighbors > 3) {\n local_output[i * n_cols + j] = 0;\n } else {\n local_output[i * n_cols + j] = 1;\n }\n } else {\n if (n_neighbors == 3) {\n local_output[i * n_cols + j] = 1;\n } else {\n local_output[i * n_cols + j] = 0;\n }\n }\n }\n }\n MPI_Gather(local_output.data(), n_elements, MPI_INT, output.data(), n_elements, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of live neighbors for each cell.\n std::vector counts(N * N, 0);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n counts[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n }\n\n // Update the output based on the counts.\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = counts[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 
1 : 0;\n }\n }\n }\n\n // Gather the results on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recv_buf(N * N);\n MPI::COMM_WORLD.Recv(recv_buf.data(), N * N, MPI::INT, i, 0);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(output.data(), N * N, MPI::INT, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks.\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Compute the next generation for the subset of rows assigned to this rank\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = static_cast(i) + di;\n int jj = static_cast(j) + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = N / size;\n\n // Compute the start and end rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = (rank + 1) * rows_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(N * rows_per_rank);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n\n // Check the neighbors of the current cell\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n\n // Check if the neighbor is within the grid\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), N * rows_per_rank, MPI_INT,\n output.data(), N * rows_per_rank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of rows\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes the next generation for its subset of rows\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N && input[ni * N + nj]) {\n num_neighbors++;\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j]) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n for (size_t i = 
start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_input(input.begin() + rank * N * N, input.begin() + (rank + 1) * N * N);\n std::vector local_output(N * N, 0);\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += local_input[ii * N + jj];\n }\n }\n }\n if (local_input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n local_output[i * N + j] = 1;\n }\n }\n }\n }\n\n MPI_Gather(local_output.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output.\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the 8 neighbors of the cell.\n for 
(int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life.\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks to rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of the rows.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = static_cast(i) + di;\n int neighbor_j = static_cast(j) + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n if (rank != 0) {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n MPI_Recv(&output[start_row * N], rows_per_rank * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * N);\n\n // Compute the local results\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n\n // Check the neighbors of the current cell\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n\n // Check if the neighbor is within the grid\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT, output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (size_t ii = std::max(0, (int)i - 1); ii <= std::min((int)N - 1, (int)i + 1); ii++) {\n for (size_t jj = std::max(0, (int)j - 1); jj <= std::min((int)N - 1, (int)j + 1); jj++) {\n if (input[ii * N + jj] == 1) {\n num_neighbors++;\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of live neighbors for each cell.\n std::vector neighbor_counts(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t di = -1; di <= 1; di++) {\n for (size_t dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n neighbor_counts[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Update the output based on the neighbor counts.\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbor_counts[i * N + j] == 2 || neighbor_counts[i * N + j] == 3);\n } else {\n output[i * N + j] = (neighbor_counts[i * N + j] == 3);\n }\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbors == 2 || neighbors == 3);\n } else {\n output[i * N + j] = (neighbors == 3);\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n std::vector next_gen(N * N);\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j]) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of the output\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, 
MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = N / size;\n\n // Compute the starting and ending rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = (rank + 1) * rows_per_rank;\n\n // Compute the number of columns each rank is responsible for\n int cols_per_rank = N / size;\n\n // Compute the starting and ending columns for this rank\n int start_col = rank * cols_per_rank;\n int end_col = (rank + 1) * cols_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * N, 0);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = start_col; j < end_col; j++) {\n int num_neighbors = 0;\n // Check the neighbors of the current cell\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n\n // Apply the rules of the game of life\n if (input[i][j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT, output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "cuda", + "prompt": "/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\n__global__ void gameOfLife(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if 
(input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n 
}\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = threadIdx.x;\n int y = threadIdx.y;\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = (x + i + N) % N;\n int neighbor_y = (y + j + N) % N;\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "_", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n num_neighbors -= input[x * N + y];\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = 
blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) {\n return;\n }\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Get the rank of the current process.\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the number of rows that each process will handle.\n int rows_per_process = N / MPI_Comm_size(MPI_COMM_WORLD);\n\n // Compute the starting and ending rows for the current process.\n int start_row = rank * rows_per_process;\n int end_row = start_row + rows_per_process;\n\n // Compute the number of columns in the input grid.\n int num_cols = input.size() / N;\n\n // Create a buffer to store the results for the current process.\n std::vector buffer(N * num_cols);\n\n // Iterate over the rows assigned to the current process.\n for (int i = start_row; i < end_row; ++i) {\n // Iterate over the columns in the input grid.\n for (int j = 0; j < num_cols; ++j) {\n // Compute the number of live neighbors for the current cell.\n int num_live_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n // Skip the current cell.\n if (ii == i && jj == j) {\n continue;\n }\n\n // Check if the neighbor cell is alive.\n if (input[ii * num_cols + jj] == 1) {\n ++num_live_neighbors;\n }\n }\n }\n\n // Apply the rules of the Game of Life to the current cell.\n if (input[i * num_cols + j] == 1) {\n if (num_live_neighbors < 2 || num_live_neighbors > 3) {\n buffer[i * num_cols + j] = 0;\n } else {\n buffer[i * num_cols + j] = 1;\n }\n } else {\n if (num_live_neighbors == 3) {\n buffer[i * num_cols + j] = 1;\n } else {\n buffer[i * num_cols + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all processes.\n MPI_Gather(buffer.data(), N * num_cols, MPI_INT, output.data(), N * num_cols, MPI_INT, 0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = (N + size - 1) / size;\n int start = rank * local_size;\n int end = std::min(start + local_size, N);\n\n std::vector local_input(input.begin() + start * N, input.begin() + end * N);\n std::vector local_output(local_input.size());\n\n #pragma omp parallel for\n for (int i = 0; i < local_input.size(); i++) {\n int x = i / N;\n int y = i % N;\n int count = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int nx = x + dx;\n int ny = y + dy;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += local_input[nx * N + ny];\n }\n }\n }\n if (local_input[i] == 1) {\n if (count < 2 || count > 3) {\n local_output[i] = 0;\n } else {\n local_output[i] = 1;\n }\n } else {\n if (count == 3) {\n local_output[i] = 1;\n } else {\n local_output[i] = 0;\n }\n }\n }\n\n MPI_Gather(local_output.data(), local_output.size(), MPI_INT, output.data(), local_output.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": 
false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of the rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n std::vector next_gen(N * N, 0);\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Compute the next generation for the subset of rows\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1028115156, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1845419632, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1845944196, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.174892175, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1844814716, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1750378104, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.174996417, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.185087354, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.095664795, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0498189203, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n // Count the number of live neighbors\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N && input[x * N + y]) count++;\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j]) {\n if (count < 2 || count > 3) output[i * N + j] = 0;\n else output[i * N + j] = 1;\n } else {\n if (count == 3) output[i * N + j] = 1;\n else output[i * N + j] = 0;\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1028885562, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2384282324, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.114049651, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0571475878, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0286289946, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0145005184, + "num_procs": 1, + "num_threads": 16 
+ }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0078869772, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0044061742, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a chunk of the output\n size_t chunk_size = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min((rank + 1) * chunk_size, N);\n\n // Compute the chunk of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t ii = std::max(0, (int)i - 1); ii <= std::min(N - 1, (int)i + 1); ++ii) {\n for (size_t jj = std::max(0, (int)j - 1); jj <= std::min(N - 1, (int)j + 1); ++jj) {\n num_neighbors += input[ii * N + jj];\n }\n }\n num_neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min((i + 1) * chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Compute the number of live neighbors for each cell.\n std::vector neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Update the state of each cell.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (input[i * N + j] == 1) {\n if (neighbors[i * N + j] < 2 || neighbors[i * N + j] > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors[i * N + j] == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = N / nprocs;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + 
rows_per_rank;\n\n // Compute the next generation for the subset of rows\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1028801702, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.205598361, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1069723798, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0512534398, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0257531188, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0131937886, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006748038, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042750484, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005719071, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0049726068, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1027974092, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2120591558, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1032216908, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0515897428, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0259141554, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0137849818, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0070959206, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004505144, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Initialize output to 0\n output.assign(N * N, 0);\n\n // Get rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Divide the grid among the processes\n int rows_per_proc = N / nprocs;\n int start_row = rank * rows_per_proc;\n int end_row = start_row + rows_per_proc;\n\n // Compute the next generation for each row in parallel\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n // Count the number of live neighbors\n int num_neighbors = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n if (k == 0 && l == 0) continue;\n int neighbor_row = i + k;\n int neighbor_col = j + l;\n if (neighbor_row >= 0 && neighbor_row < N && neighbor_col >= 0 && neighbor_col < N) {\n num_neighbors += input[neighbor_row * N + neighbor_col];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n }\n }\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[start_row * N], rows_per_proc * N, MPI_INT,\n &output[0], rows_per_proc * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1028805572, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1430245516, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0680610444, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.038750369, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0198982868, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0119349532, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081165594, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0082406162, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.0092836063, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0090143263, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n #pragma omp parallel for\n for (int i = 0; i < N; ++i) {\n for (int j = 0; j < N; ++j) {\n int count = 0;\n // Count the number of live neighbors\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input[x * N + y];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0986430048, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1316892086, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0660573016, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0331299446, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0167074204, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0096864448, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0051250574, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004554687, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its subgrid's neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbors == 2 || neighbors == 3) ? 
1 : 0;\n } else {\n output[i * N + j] = (neighbors == 3) ? 1 : 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0986569688, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1344784008, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0803223796, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0527435954, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0390719988, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0339701128, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0323553174, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319107674, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n\n std::vector localInput(nRows * N);\n std::vector localOutput(nRows * N);\n\n MPI_Scatter(input.data(), nRows * N, MPI_INT, localInput.data(), nRows * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < nRows; i++) {\n for (int j = 0; j < N; j++) {\n int count = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int ni = i + ii;\n int nj = j + jj;\n if (ni >= 0 && ni < nRows && nj >= 0 && nj < N) {\n count += localInput[ni * N + nj];\n }\n }\n }\n if (localInput[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n localOutput[i * N + j] = 0;\n } else {\n localOutput[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n localOutput[i * N + j] = 1;\n } else {\n localOutput[i * N + j] = 0;\n }\n }\n }\n }\n\n MPI_Gather(localOutput.data(), nRows * N, MPI_INT, output.data(), nRows * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1027796898, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1235382144, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0738401264, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0505341442, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0372998316, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0328108022, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0341489244, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0355773182, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n\n // Each rank computes a subset of the rows of the output\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes the number of live neighbors for each cell in its subset of rows\n std::vector neighbor_counts(N * N, 0);\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n // Count the number of live neighbors for cell (i, j)\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n count += input[ii * N + jj];\n }\n }\n }\n neighbor_counts[i * N + j] = count;\n }\n }\n\n // Each rank updates its subset of rows of the output based on the neighbor counts\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int count = neighbor_counts[i * N + j];\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks into the output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.098626041, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.139990767, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0772183502, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0397447086, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0241589586, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.018782594, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.018674485, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0183455498, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0159862594, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01322349855, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows assigned to this rank\n std::vector next_gen(N * N);\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) 
{\n if (di == 0 && dj == 0) continue;\n int neighbor_i = static_cast(i) + di;\n int neighbor_j = static_cast(j) + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks into the output on rank 0\n MPI_Gather(next_gen.data(), N * rows_per_rank, MPI_INT,\n output.data(), N * rows_per_rank, MPI_INT,\n 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0985761332, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2050515864, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1080652758, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0599673078, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0360728856, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0252524738, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0200423474, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0185048788, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its portion of the grid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Count the number of live neighbors for cell (i, j).\n int neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n neighbors += input[ii * N + jj];\n }\n }\n }\n local_neighbors[i * N + j] = neighbors;\n }\n }\n\n // Each rank sends its portion of the neighbor counts to rank 0.\n std::vector global_neighbors(N * N);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell based on the number of live neighbors.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.103257558, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.228030102, + "num_procs": 1, + "num_threads": 1 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.1585412974, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1306746228, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.114461866, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1024999466, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1047862662, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1061852466, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "_t", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a chunk of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Compute the next generation for the assigned rows\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank != 0) {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0986758648, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2043098268, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1048551694, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0512881828, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0264622294, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0132359656, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006959998, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0046971562, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0057338778, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": 
true, + "runtime": 0.00503787, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the next generation for the subset of rows\n std::vector next_gen(N * N, 0);\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n next_gen[i * N + j] = 0;\n } else {\n next_gen[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n next_gen[i * N + j] = 1;\n } else {\n next_gen[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its local grid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Reduce(local_neighbors.data(), global_neighbors.data(), N * N, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. 
Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(Kokkos::View &input, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n KokConfigService::instance().set_num_teams(1);\n KokConfigService::instance().set_num_nodes(1);\n KokConfigService::instance().set_num_ranks_per_node(1);\n KokConfigService::instance().set_num_devices_per_rank(1);\n KokConfigService::instance().set_device_type(\"CPU\");\n\n Kok\u30b3\u30af::TeamPolicy policy(Kok\u30b3\u30af::TeamThreadLayout::RowMajor);\n Kok\u30b3\u30af::Team team(policy);\n\n team.execute_task([&](Kok\u30b3\u30af::TeamMember rank) {\n const int num_elements = input.extent(0);\n for (int i = 1; i < num_elements - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n });\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n using Kokkos::TeamPolicy;\n using Kokkos::WorkSpace;\n\n const int width = input.extent(0);\n const int height = input.extent(1);\n\n Kokkos::TeamPolicy policy(Kokkos::RangePolicy(0, width * height));\n Kokkos::WorkSpace ws;\n\n parallel_for(policy, KOKKOS_LAMBDA(const int i) {\n const int x = i % width;\n const int y = i / width;\n\n if (x == 0 || x == width - 1 || y == 0 || y == height - 1) {\n output(x, y) = input(x, y);\n } else {\n output(x, y) = (input(x - 1, y) + input(x + 1, y) + input(x, y - 1) + input(x, y + 1)) / 4.0;\n }\n }, ws);\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kok c\u00f3c::parallel_for(\"jacobi1D\", 0, output.extent(0), KOK c\u00f3c::RangePolicy>(KOK c\u00f3c::get_num_threads()),\n KOK c\u00f3c::KOK c\u00f3c_LAMBDA(int64_t i) {\n if (i == 0) {\n output[i] = input[i] / 3;\n } else if (i == output.extent(0) - 1) {\n output[i] = input[i] / 3;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n const auto N = input.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == N - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy<>(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + 
input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n const auto N = input.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = 0;\n } else if (i == N - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", input.size(), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n const auto exec_space = Kokkos::DefaultExecutionSpace();\n const auto exec_policy = Kokkos::RangePolicy(0, input.extent(0));\n Kokkos::parallel_for(exec_policy, KOKKOS_LAMBDA(const int &i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(input.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n double left = i > 0 ? input(i - 1) : 0;\n double center = input(i);\n double right = i < input.extent(0) - 1 ? 
input(i + 1) : 0;\n output(i) = (left + center + right) / 3;\n });\n}\n", + "Kokkoss vector space.", + "\n\n Kokkos::parallel_for(input.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n const int num_elements = input.extent(0);\n Kokkos::parallel_for(num_elements, KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == num_elements - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n });\n}\n", + "\n\n const int num_elements = input.extent(0);\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::Range(0, num_elements), [=](int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == num_elements - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. 
output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < static_cast(input.size()) - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n output[0] = (input[0] + input[1]) / 2;\n output[input.size() - 1] = (input[input.size() - 2] + input[input.size() - 1]) / 2;\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < static_cast(input.size()) - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 
1] + input[i] + input[i + 1]) / 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "serial", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 0; i < input.size(); i++) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i + 1 < input.size() ? input[i + 1] : 0);\n output[i] = output[i] / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n output[0] = (input[0] + input[1]) / 2;\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n output[input.size() - 1] = (input[input.size() - 2] + input[input.size() - 1]) / 2;\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < input.size() - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? 
input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = (i > 0) ? input[i - 1] : 0;\n double right = (i < input.size() - 1) ? input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "hip", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\n__global__ void jacobi1D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n output[idx] = (idx > 0 ? input[idx - 1] : 0) + input[idx] + (idx < N - 1 ? input[idx + 1] : 0);\n output[idx] /= 3;\n }\n}\n", + "\n int i = hipThreadIdx_x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int tid = hipGetGlobalThreadIdx().x;\n if (tid < N) {\n output[tid] = (input[tid - 1] + input[tid] + input[tid + 1]) / 3;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n\n if (i < N) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < N - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] +\n (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3.0;\n }\n}\n", + "\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = (tid > 0 ? input[tid - 1] : 0) + input[tid] +\n (tid < N - 1 ? input[tid + 1] : 0);\n output[tid] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? 
input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\nfor (size_t i = 1; i < N - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_output(local_n);\n\n for (int i = start; i < end; i++) {\n if (i == 0) {\n local_output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == n - 1) {\n local_output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n local_output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n std::vector local_input(input.begin() + start, 
input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * local_size], local_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], local_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n output.resize(n);\n\n MPI_Datatype vec_type;\n 
MPI_Type_vector(n, 1, 1, MPI_DOUBLE, &vec_type);\n MPI_Type_commit(&vec_type);\n\n MPI_Allgather(&input[0], 1, vec_type, &output[0], 1, vec_type, MPI_COMM_WORLD);\n\n for (int i = 1; i < n - 1; i++) {\n output[i] = (output[i - 1] + output[i] + output[i + 1]) / 3;\n }\n\n MPI_Type_free(&vec_type);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int remainder = input.size() % size;\n\n std::vector local_input(local_size + (rank < remainder ? 1 : 0));\n std::vector local_output(local_size);\n\n MPI_Scatter(input.data(), local_size + (rank < remainder ? 1 : 0), MPI_DOUBLE, local_input.data(), local_size + (rank < remainder ? 1 : 0), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n for (int i = 0; i < local_size; i++) {\n local_output[i] = (local_input[std::max(0, i - 1)] + local_input[std::min(i + 1, local_size - 1)] + local_input[i]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * chunk], chunk, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n for (int i = start; i < end; i++) {\n double left = (i > 0) ? input[i - 1] : 0;\n double right = (i < n - 1) ? input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * local_n;\n int end = start + local_n;\n MPI_Recv(&output[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], local_n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = (n + size - 1) / size;\n int start = rank * chunk_size;\n int end = std::min(start + chunk_size, n);\n\n for (int i = start; i < end; i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double right = i < n - 1 ? 
input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * chunk_size;\n int end = std::min(start + chunk_size, n);\n MPI_Recv(&output[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_size = n / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n output.resize(n);\n\n // Compute the local part of the output\n for (int i = 1; i < n - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n\n // Send the local part of the output to rank 0\n MPI_Gather(&output[1], n - 2, MPI_DOUBLE, &output[1], n - 2, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Replace the local part of the output with 0 on all ranks except rank 0\n if (rank != 0) {\n for (int i = 1; i < n - 1; i++) {\n output[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; ++i) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n 
std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n local_output[0] = (local_input[0] + local_input[1]) / 2;\n local_output[local_n - 1] = (local_input[local_n - 2] + local_input[local_n - 1]) / 2;\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "cuda", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\n__global__ void jacobi1D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n } else {\n output[i] = 0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = (tid > 0 ? input[tid - 1] : 0) + input[tid] + (tid < N - 1 ? 
input[tid + 1] : 0);\n output[tid] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n #pragma omp parallel for\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0238110636, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0524749214, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0490550236, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.049111291, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.049531646, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0524181292, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0566850264, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0614814706, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n std::vector local_output(local_n);\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (input[start + i - 1] + input[start + i] + input[start + i + 1]) / 3.0;\n }\n MPI_Gather(&local_output[0], local_n, MPI_DOUBLE, &output[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0238017342, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0381262622, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0330101978, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.033056904, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0333341836, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0348391192, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0355919556, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": 
true, + "runtime": 0.0359135466, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n\n std::vector local_output(chunk_size);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n local_output[i] = (input[i] + input[i + 1]) / 3;\n } else if (i == n - 1) {\n local_output[i] = (input[i - 1] + input[i]) / 3;\n } else {\n local_output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(local_output.data(), chunk_size, MPI_DOUBLE, output.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0237734272, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0478610812, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0357091636, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.033306434, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0337023, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0348035284, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0354018488, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0363560596, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n\n #pragma parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == input.size() - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0238668524, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0239105734, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0238677638, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.023880708, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0239291432, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0238657826, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0238216674, + "num_procs": 1, + "num_threads": 32 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.0239142816, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), chunk_size, MPI_DOUBLE, output.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0238275846, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0242312318, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0126155534, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0104102588, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010917965, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0093813444, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0084659466, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077034722, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per rank\n int elements_per_rank = input.size() / size;\n\n // Compute the start and end indices for this rank\n int start_index = rank * elements_per_rank;\n int end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local output\n std::vector local_output(elements_per_rank);\n #pragma omp parallel for\n for (int i = 0; i < elements_per_rank; i++) {\n int global_index = start_index + i;\n if (global_index == 0 || global_index == input.size() - 1) {\n local_output[i] = 0;\n } else {\n local_output[i] = (input[global_index - 1] + input[global_index] + input[global_index + 1]) / 3;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(input.size());\n for (int i = 0; i < size; i++) {\n MPI_Recv(&output[i * elements_per_rank], elements_per_rank, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], elements_per_rank, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n 
int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n std::vector local_output(local_n);\n\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (input[start + i - 1] + input[start + i] + input[start + i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0238461472, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.036418658, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0330365654, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03304424, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0337628118, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.034166369, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0353998752, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0364216464, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0238367018, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0242304442, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0126030504, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0103905038, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0109144724, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0093246258, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0083863954, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0076667076, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n 
- 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.023799105, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0242808852, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0125669406, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0103777654, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010850745, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0093488034, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008437347, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0073641942, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091999663, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00841394405, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0237860246, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0242051708, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0124678868, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0103908262, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0108362758, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0094755208, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0084178106, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0072872434, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.009202474, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008334608, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int 
i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.023816355, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0245623114, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0128282508, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0099799632, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0107504264, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0092095558, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008379625, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0072388828, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091463909, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00948513685, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0238686816, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0527681274, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0490450226, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0493099576, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.049813817, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0525441512, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0572261532, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.062238318, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / 
size;\n int start = rank * local_n;\n int end = start + local_n;\n\n // Compute the local part of the output\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.023844945, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.024219426, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0126799242, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0104228958, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0108948236, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0094106128, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0083901206, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007804075, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0092275986, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00936629365, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0238015154, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0239936722, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0124849146, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0100146924, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010769238, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0092713338, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0084508384, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077151802, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091793613, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.0082984733, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0237681146, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0242910622, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.012469478, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010447851, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0109121242, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0093239446, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0086525728, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0072474928, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091852201, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00828939575, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n // Compute the local part of the output\n #pragma omp parallel for\n for (int i = start; i < end; ++i) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0238764306, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0242591722, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0127757086, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0106057032, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0109412206, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0093576594, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0086149686, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0074476984, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.0091573402, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0082853642, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n MPI_Gather(output.data() + start, chunk, MPI_DOUBLE, output.data(), chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0238586124, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0239573078, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0124712976, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0100846718, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0106905082, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091292754, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00839199, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0075588628, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0092225983, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00831726355, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = (n + size - 1) / size;\n int start = rank * local_n;\n int end = std::min(start + local_n, n);\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0238394428, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0527062734, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0491985304, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0494767708, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0499606342, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0528540808, + "num_procs": 1, + 
"num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.057351017, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0622106664, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.023720127, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0242409562, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0124748018, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010441231, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010896102, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0093927374, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008443805, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007627863, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0093146761, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00831279665, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.023830572, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0242035762, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0126632976, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0104500514, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01084908, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0093653504, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.0085938348, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007643598, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091994179, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0082802094, + "num_procs": 4, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "kokkos", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images.\n Store the output of the computation in imageOut.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(Kokkos::View &imageIn, Kokkos::View &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(N, N), [=](int i, int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn(ii, jj) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::min(255, std::max(0, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_clamped = std::clamp(i + ii, 0, N - 1);\n int jj_clamped = std::clamp(j + jj, 0, N - 1);\n sum += imageIn(ii_clamped, jj_clamped) * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut(i, j) = std::clamp(sum, -128, 127);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) 
{\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_in = i + ii;\n int jj_in = j + jj;\n if (ii_in >= 0 && ii_in < N && jj_in >= 0 && jj_in < N) {\n sum += imageIn(ii_in, jj_in) * edgeKernel[ii + 1][jj + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n int val = 0;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n val = imageIn(ii, jj);\n }\n sum += edgeKernel[ki + 1][kj + 1] * val;\n }\n }\n imageOut(i, j) = std::max(0, std::min(sum, 255));\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::min(255, std::max(0, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_clamped = std::clamp(i + ii, 0, N - 1);\n int jj_clamped = 
std::clamp(j + jj, 0, N - 1);\n sum += imageIn(ii_clamped, jj_clamped) * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut(i, j) = std::clamp(sum, -128, 127);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n });\n}\n", + "\n\n Kok\uf09e::parallel_for(Kok\uf09e::Range(0, N), [&](const Kok\uf09e::Range &r) {\n for (size_t i = r.begin(); i < r.end(); ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n size_t ii = i + k - 1;\n size_t jj = j + l - 1;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn(ii, jj) * edge[k][l];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(0, N, [&](size_t i) {\n Kok\u30b3\u30af::parallel_for(0, N, [&](size_t j) {\n int sum = 0;\n for (int k = 0; k < 3; k++) {\n for (int l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edge[k][l];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "omp", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. 
Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "_", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n if (i + k < 0 || i + k >= N || j + l < 0 || j + l >= N) {\n sum += 0;\n } else {\n sum += imageIn[(i + k) * N + (j + l)] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x >= 0 && x < static_cast(N) && y >= 0 && y < static_cast(N)) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x >= 0 && x < static_cast(N) && y >= 0 && y < static_cast(N)) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += imageIn[(i + k) * N + (j + l)] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma opm parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + (j + l)] * 
edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "serial", + "prompt": "const int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (int i = 1; i < N - 1; 
i++) {\n for (int j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n sum += imageIn[(i + ii) * N + (j + jj)] * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = (int)i + k - 1;\n int y = (int)j + l - 1;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t x = 0; x < 3; x++) {\n for (size_t y = 0; y < 3; y++) {\n int ii = i + x - 1;\n int jj = j + y - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[x][y];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t y = 0; y < N; ++y) {\n for (size_t x = 0; x < N; ++x) {\n int sum = 0;\n for (int ky = -1; ky <= 1; ++ky) {\n for (int kx = -1; kx <= 1; ++kx) {\n int pixelX = x + kx;\n int pixelY = y + ky;\n if (pixelX >= 0 && pixelX < N && pixelY >= 0 && pixelY < N) {\n sum += imageIn[pixelY * N + pixelX] * edgeKernel[ky + 1][kx + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = (int)i + k - 1;\n int y = (int)j + l - 1;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t y = 0; y < N; ++y) {\n for (size_t x 
= 0; x < N; ++x) {\n int sum = 0;\n for (int ky = -1; ky <= 1; ++ky) {\n for (int kx = -1; kx <= 1; ++kx) {\n int inX = x + kx;\n int inY = y + ky;\n if (inX >= 0 && inX < N && inY >= 0 && inY < N) {\n sum += imageIn[inY * N + inX] * edgeKernel[ky + 1][kx + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii < 0 || ii >= static_cast(N) || jj < 0 || jj >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "hip", + "prompt": "__constant__ int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. 
Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\n__global__ void convolveKernel(const int *imageIn, int *imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\nint sum = 0;\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = blockIdx.x + i;\n int y = blockIdx.y + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n sum = max(0, min(255, sum));\n imageOut[blockIdx.y * N + blockIdx.x] = sum;\n}\n", + "\nint sum = 0;\nint index = hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x;\nint x = index % N;\nint y = index / N;\nfor (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (x + i >= 0 && x + i < N && y + j >= 0 && y + j < N) {\n sum += imageIn[(y + j) * N + x + i] * edgeKernel[i + 1][j + 1];\n }\n }\n}\nimageOut[index] = __clamp(sum, 0, 255);\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 
1; j++) {\n int x2 = x + i;\n int y2 = y + j;\n if (x2 >= 0 && x2 < N && y2 >= 0 && y2 < N) {\n sum += imageIn[y2 * N + x2] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = min(255, max(0, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX + neighborY * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = std::max(0, std::min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ < 0 || x_ >= N || y_ < 0 || y_ >= N)\n sum += 0;\n else\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < 
N && y < N) {\n int dotProduct = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n dotProduct += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = dotProduct > 255 ? 255 : (dotProduct < 0 ? 0 : dotProduct);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = 0; i < 3; i++) {\n for (int j = 0; j < 3; j++) {\n int x_offset = x + i - 1;\n int y_offset = y + j - 1;\n if (x_offset >= 0 && x_offset < N && y_offset >= 0 && y_offset < N) {\n sum += imageIn[y_offset * N + x_offset] * edge[i][j];\n }\n }\n }\n imageOut[y * N + x] = __clamp(sum, 0, 255);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_index = x + i;\n int y_index = y + j;\n if (x_index >= 0 && x_index < N && y_index >= 0 && y_index < N) {\n sum += imageIn[y_index * N + x_index] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[x_idx + y_idx * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = max(0, min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX + neighborY * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = hipMin(hipMax(sum, 0), 255);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "mpi", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of imageIn. 
The result is stored in imageOut on rank 0.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image.\n size_t chunkSize = N / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = (rank + 1) * chunkSize;\n\n // Compute the convolution for the assigned rows.\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = (i + 1) * chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n size_t chunkSize = N / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = (rank + 1) * chunkSize;\n\n // Compute the convolution for the assigned rows\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = (i + 1) * chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows and columns in the image\n size_t numRows = imageIn.size() / N;\n size_t numCols = N;\n\n // Compute the number of rows and columns in the output image\n size_t outNumRows = numRows - 2;\n size_t outNumCols = numCols - 2;\n\n // Compute the number of rows and columns in the output image for each rank\n size_t outNumRowsPerRank = outNumRows / size;\n size_t outNumColsPerRank = outNumCols;\n\n // Compute the starting row and column for each rank\n size_t startRow = rank * outNumRowsPerRank;\n size_t startCol = 0;\n\n // Compute the ending row and column for each rank\n size_t endRow = startRow + outNumRowsPerRank;\n size_t endCol = startCol + outNumColsPerRank;\n\n // Allocate memory for the 
output image\n std::vector outImage(outNumRowsPerRank * outNumColsPerRank);\n\n // Compute the convolution for each pixel in the output image\n for (size_t i = startRow; i < endRow; i++) {\n for (size_t j = startCol; j < endCol; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int row = i + k - 1;\n int col = j + l - 1;\n if (row >= 0 && row < numRows && col >= 0 && col < numCols) {\n sum += imageIn[row * numCols + col] * edgeKernel[k][l];\n }\n }\n }\n outImage[i * outNumColsPerRank + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the output images from all ranks\n MPI_Gather(outImage.data(), outNumRowsPerRank * outNumColsPerRank, MPI_INT,\n imageOut.data(), outNumRowsPerRank * outNumColsPerRank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the output image.\n std::vector localImageOut(N * N / size);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the local results into the output image on rank 0.\n if (rank == 0) {\n imageOut.resize(N * N);\n MPI_Gather(localImageOut.data(), N * N / size, MPI_INT, imageOut.data(), N * N / size, MPI_INT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(localImageOut.data(), N * N / size, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localN = N / size;\n int localStart = rank * localN;\n int localEnd = localStart + localN;\n\n std::vector localImageOut(localN * N);\n\n for (int i = localStart; i < localEnd; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), localN * N, MPI_INT, imageOut.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows in imageOut\n size_t rowsPerRank = N / size;\n size_t startRow = rank * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks and store them in imageOut on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n for (size_t j = startRow; j < endRow; ++j) {\n MPI_Recv(&imageOut[j * N], N, MPI_INT, i, 0, MPI_COMM_WORLD, 
MPI_STATUS_IGNORE);\n }\n }\n } else {\n MPI_Send(&imageOut[startRow * N], N * rowsPerRank, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image\n int rowsPerRank = (N + size - 1) / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min(startRow + rowsPerRank, N);\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = std::min(startRow + rowsPerRank, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the assigned rows\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "{", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n int chunkSize = N / size;\n int startRow = rank * chunkSize;\n int endRow = startRow + chunkSize;\n\n // Pad the input image with zeros to handle edge cases\n std::vector paddedImage(N * N);\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n paddedImage[i * N + j] = imageIn[i * N + j];\n }\n }\n\n // Compute the convolution for the assigned rows\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int neighborRow = i + k;\n int neighborCol = j + l;\n if (neighborRow >= 0 && neighborRow < N && neighborCol >= 0 && neighborCol < N) {\n sum += paddedImage[neighborRow * N + neighborCol] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * chunkSize;\n int endRow = 
startRow + chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel's neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = i + k;\n int neighborY = j + l;\n // Handle pixels outside the image boundaries\n if (neighborX < 0 || neighborX >= N || neighborY < 0 || neighborY >= N) {\n sum += 0;\n } else {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of imageOut\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks into imageOut on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localN = N / size;\n int localStart = rank * localN;\n int localEnd = localStart + localN;\n\n std::vector localImageIn(localN * N);\n std::vector localImageOut(localN * N);\n\n MPI_Scatter(imageIn.data(), N * N, MPI_INT, localImageIn.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n for (int i = localStart; i < localEnd; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += localImageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n 
MPI_Gather(localImageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localSize = N / size;\n int remainder = N % size;\n int startIndex = rank * localSize;\n int endIndex = startIndex + localSize;\n if (rank == size - 1) {\n endIndex += remainder;\n }\n\n std::vector localImageIn(localSize * N);\n std::vector localImageOut(localSize * N);\n\n MPI_Scatter(imageIn.data(), N * N, MPI_INT, localImageIn.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n for (int i = startIndex; i < endIndex; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += localImageIn[(x + y * N)];\n }\n }\n }\n localImageOut[i * N + j] = std::min(255, std::max(0, sum * edgeKernel[k + 1][l + 1]));\n }\n }\n\n MPI_Gather(localImageOut.data(), localSize * N, MPI_INT, imageOut.data(), localSize * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of pixels per process\n int pixelsPerProcess = N * N / size;\n\n // Compute the starting and ending indices for this process\n int startIndex = rank * pixelsPerProcess;\n int stopIndex = (rank + 1) * pixelsPerProcess;\n\n // Allocate memory for the local image\n std::vector localImage(pixelsPerProcess);\n\n // Scatter the image data to each process\n MPI_Scatter(&imageIn[0], pixelsPerProcess, MPI_INT, &localImage[0], pixelsPerProcess, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Convolve the local image with the edge kernel\n for (int i = 1; i < N - 1; i++) {\n for (int j = 1; j < N - 1; j++) {\n int dotProduct = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n dotProduct += localImage[i + k][j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::clip(dotProduct, -128, 127);\n }\n }\n\n // Gather the results from each process\n MPI_Gather(&imageOut[startIndex], pixelsPerProcess, MPI_INT, &imageOut[0], pixelsPerProcess, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n size_t chunkSize = (N + size - 1) / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n\n // Compute the convolution for the subset of the image\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n int localN = N / size;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local portion of the image\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n size_t localN = (N + size - 1) / size;\n size_t startRow = rank * localN;\n size_t endRow = std::min(startRow + localN, N);\n\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * localN;\n size_t endRow = std::min(startRow + localN, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n // Compute the dot product of the pixel and its neighbors with the edge kernel\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Calculate the number of rows per process\n int rowsPerProcess = N / size;\n\n // Calculate the starting row for this process\n int startRow = rank * rowsPerProcess;\n\n // Calculate the ending row for this process\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Loop over the rows assigned to this process\n for (int i = startRow; i < endRow; i++) {\n 
// Loop over the columns in the row\n for (int j = 0; j < N; j++) {\n // Calculate the dot product of the pixel and its neighbors with the edge kernel\n int dotProduct = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n // Check if the pixel is within the image boundaries\n if (i + k >= 0 && i + k < N && j + l >= 0 && j + l < N) {\n // Calculate the dot product\n dotProduct += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::max(0, std::min(255, dotProduct));\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(imageOut.data() + startRow * N, rowsPerProcess * N, MPI_INT,\n imageOut.data(), rowsPerProcess * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "cuda", + "prompt": "__constant__ int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\n__global__ void convolveKernel(const int *imageIn, int *imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x;\n int idy = threadIdx.y + blockIdx.y * blockDim.y;\n if (idx < N && idy < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = idx + i;\n int y = idy + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x + y * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[idx + idy * N] = clamp(sum, 0, 255);\n }\n}\n", + "\nint sum = 0;\n for (int i = 0; i < 3; i++) {\n for (int j = 0; j < 3; j++) {\n int x = blockIdx.x * blockDim.x + i - 1;\n int y = blockIdx.y * blockDim.y + j - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i][j];\n }\n }\n }\n imageOut[blockIdx.y * N + blockIdx.x] = max(0, min(255, sum));\n}\n", + "\nint x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x 
* blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = sum > 255 ? 255 : (sum < 0 ? 0 : sum);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) {\n return;\n }\n\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_offset = x + i;\n int y_offset = y + j;\n if (x_offset < 0 || x_offset >= N || y_offset < 0 || y_offset >= N) {\n sum += 0;\n } else {\n sum += imageIn[y_offset * N + x_offset] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x2 = x + i;\n int y2 = y + j;\n if (x2 < 0 || x2 >= N || y2 < 0 || y2 >= N) {\n sum += 0;\n } else {\n sum += edgeKernel[i + 1][j + 1] * imageIn[y2 * N + x2];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y 
= blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_offset = x + i;\n int y_offset = y + j;\n if (x_offset >= 0 && x_offset < N && y_offset >= 0 && y_offset < N) {\n sum += imageIn[y_offset * N + x_offset] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_coord = x + i;\n int y_coord = y + j;\n if (x_coord >= 0 && x_coord < N && y_coord >= 0 && y_coord < N) {\n sum += imageIn[y_coord * N + x_coord] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x + (threadIdx.y + blockIdx.y * blockDim.y) * N;\n if (idx < N * N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = threadIdx.x + i;\n int y = threadIdx.y + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[idx] = max(0, min(255, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + 
"parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of imageIn. The result is stored in imageOut on rank 0.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n int localN = N / size;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local subset of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.215982233, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1774582466, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0828097182, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0411042772, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0228107148, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0107902594, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0056269326, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0049470028, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0056017596, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0049446755, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, (rank + 1) * nRowsPerRank);\n\n std::vector localImageOut(nRowsPerRank * N);\n\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = 0; k < 3; k++) {\n for (int l = 0; l < 3; l++) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), nRowsPerRank * N, MPI_INT,\n imageOut.data(), nRowsPerRank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize imageOut to 0\n imageOut.assign(N * N, 0);\n\n // Each rank computes its own part of the output\n #pragma opm parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel and its neighbors\n for (int ki = 0; ki < 3; ki++) {\n for (int kj = 0; kj < 3; kj++) {\n int ii = i + ki - 1;\n int jj = j + kj - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki][kj];\n }\n }\n }\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(imageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.2380903836, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2362010672, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2427113188, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.243334492, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2441223222, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2433311158, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2424682872, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2426500398, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n size_t chunkSize = (N + size - 1) / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n\n // Compute the convolution for the assigned rows\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) 
+ kj;\n if (ii >= 0 && ii < static_cast(N) && jj >= 0 && jj < static_cast(N)) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.237609237, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2475958706, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2579430368, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.246749643, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2470307182, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2579677032, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2464507564, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2578510062, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1323300768, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0683312071, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image\n size_t rowsPerRank = (N + size - 1) / size;\n size_t startRow = rank * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n // Each rank computes its subset of the output image in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < static_cast(N) && jj >= 0 && jj < static_cast(N)) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2377885594, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2334389972, + "num_procs": 1, + "num_threads": 1 + }, 
+ { + "did_run": true, + "is_valid": true, + "runtime": 0.1236771268, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.061475012, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0292761646, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0157264946, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0076671704, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0052031436, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0059907244, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00524516205, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a portion of the output image\n int localN = (N + nprocs - 1) / nprocs;\n int startRow = rank * localN;\n int endRow = std::min(startRow + localN, N);\n\n // Compute the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n int startRow = i * localN;\n int endRow = std::min(startRow + localN, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the assigned rows\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel's neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = i + k;\n int neighborY = j + l;\n // Check if the neighbor is within the image boundaries\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * 
N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2236054246, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1675300966, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0841184412, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0421787488, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0229757542, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.011757795, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0057861094, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0050352566, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0071944793, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00633252845, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a portion of the output image\n int localN = N / nprocs;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2160488696, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.161881674, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0894803008, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0450531738, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0208275082, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0117772552, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0055064486, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0060824474, + "num_procs": 1, + 
"num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0054683152, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00698670215, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.2262783112, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1778776316, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.081904299, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0412356434, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0223484214, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.011712145, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0056002426, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0051859874, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.2260442236, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.16496401, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0819564192, + "num_procs": 1, + 
"num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0445286252, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0208002208, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010677404, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0060891136, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005119254, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the image\n int rowsPerRank = (N + size - 1) / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min((rank + 1) * rowsPerRank, N);\n\n // Allocate memory for the rank's portion of the output image\n std::vector rankImageOut(rowsPerRank * N);\n\n // Compute the convolution for the rank's subset of rows\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = 0; k < 3; ++k) {\n for (int l = 0; l < 3; ++l) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n rankImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(rankImageOut.data(), rowsPerRank * N, MPI_INT,\n imageOut.data(), rowsPerRank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Compute the dot product of the kernel and the image for each pixel\n #pragma opm parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int dotProduct = 0;\n for (int ki = 0; ki < 3; ki++) {\n for (int kj = 0; kj < 3; kj++) {\n dotProduct += edgeKernel[ki][kj] * imageIn[(i + ki - 1) * N + (j + kj - 1)];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, dotProduct));\n }\n }\n\n // Set the boundaries to 0\n for (size_t i = 0; i < N; i++) {\n imageOut[i] = 0;\n imageOut[i * N] = 0;\n imageOut[i * N + N - 1] = 0;\n }\n for (size_t j = 0; j < N; j++) {\n imageOut[j * N + N - 1] = 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows\n int rowsPerRank = N / nprocs;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Pad the image with zeros\n std::vector paddedImage(N + 2 * rowsPerRank, 0);\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n paddedImage[i + rowsPerRank + 1][j + rowsPerRank + 1] = imageIn[i * N + j];\n }\n }\n\n // 
Convolve the kernel with the padded image\n #pragma omp parallel for\n for (int i = startRow + 1; i < endRow + 1; i++) {\n for (int j = 1; j < N + 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += edgeKernel[k + 1][l + 1] * paddedImage[i + k][j + l];\n }\n }\n imageOut[(i - rowsPerRank - 1) * N + j - 1] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Rank 0 gathers the results from all other ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.2262367074, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.164990618, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0820423662, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0445607658, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.020766111, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0116644676, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005614465, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00499979, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min((rank + 1) * rowsPerRank, N);\n\n // Each rank computes its subset of the output image\n std::vector localImageOut(N * N);\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int 
k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(localImageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int localN = N / nprocs;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n std::vector localImageIn(localN * N);\n std::vector localImageOut(localN * N);\n\n MPI_Scatter(imageIn.data(), localN * N, MPI_INT, localImageIn.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < localN; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= localN || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += localImageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), localN * N, MPI_INT, imageOut.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.2161470298, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1873607586, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1117680926, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.068186129, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0462571804, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0376182168, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0349304304, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.035676394, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x < 0 || x >= static_cast(N) || y < 0 || y >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edge[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n 
int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image.\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank.\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2159653028, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1771612154, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0902639688, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0449180282, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0209316494, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01177884, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0055442214, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0050539032, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0055310669, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0118666032, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Each rank computes a portion of the output image\n size_t myStartRow = myRank * N / numRanks;\n size_t myEndRow = (myRank + 1) * N / numRanks;\n\n // Compute the convolution for each pixel in the assigned rows\n #pragma omp parallel for\n for (size_t i = myStartRow; i < myEndRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n\n // Apply the kernel to the pixel and its neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = static_cast(i) + k;\n int neighborY = static_cast(j) + l;\n\n // Handle pixels outside the image boundaries\n if (neighborX < 0 || neighborX >= static_cast(N) || neighborY < 0 || neighborY >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the partial results from all ranks on rank 0\n if (myRank == 0) {\n for 
(int i = 1; i < numRanks; ++i) {\n size_t startRow = i * N / numRanks;\n size_t endRow = (i + 1) * N / numRanks;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[myStartRow * N], (myEndRow - myStartRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2161359252, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2581038462, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.125405718, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0653452052, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0318604408, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.016861209, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008588587, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0054852908, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0063119138, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00648371035, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each process computes a subset of the rows\n int rowsPerProcess = N / nprocs;\n int startRow = rank * rowsPerProcess;\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Allocate memory for the local portion of the output image\n std::vector localImageOut(rowsPerProcess * N);\n\n // Perform the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the local results into the global output image\n MPI_Gather(localImageOut.data(), rowsPerProcess * N, MPI_INT,\n imageOut.data(), rowsPerProcess * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.2160478938, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1782149844, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0948776122, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0531990256, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0325027218, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0237773802, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.0190275692, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0190565838, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i, const int &j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (i + dx >= 0 && i + dx < N && j + dy >= 0 && j + dy < N) {\n sum += input(i + dx, j + dy);\n }\n }\n }\n output(i, j) = sum / 5;\n });\n}\n", + "Kokkosspecific", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t& i) {\n Kokkos::parallel_for(\"jacobi2D\", 
Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t& j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n sum += input(i + ii, j + jj);\n }\n }\n }\n output(i, j) = sum / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy>(N, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n using team_member = KokKOS::TeamMember;\n using team_view = KokKOS::TeamView;\n using team_view_const = KokKOS::TeamView;\n using view_type = KokKOS::View;\n using view_type_const = KokKOS::View;\n\n auto policy = KokKOS::TeamStriped<>(team_member(0, N), team_member(0, N));\n\n team_view output_team(\"output_team\", output, policy);\n team_view_const input_team(\"input_team\", input, policy);\n\n auto lambda = KOKKOS_LAMBDA(team_member member) {\n const size_t i = member.team_member_rank(0);\n const size_t j = member.team_member_rank(1);\n if (i == 0 || j == 0 || i == N - 1 || j == N - 1) {\n output_team(i, j) = input_team(i, j);\n } else {\n output_team(i, j) = (input_team(i, j - 1) + input_team(i, j + 1) + input_team(i - 1, j) + input_team(i + 1, j) + input_team(i, j)) / 5.0;\n }\n };\n\n auto result = KokKOS::parallel_reduce(\n output_team, lambda, KokKOS::PerTeamReduce(std::plus(), 0.0));\n output_team.wait_until_done();\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), [=](const int i) {\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), [=](const int j) {\n double left = (i == 0) ? 0 : input(i, j - 1);\n double right = (i == N - 1) ? 0 : input(i, j + 1);\n double up = (j == 0) ? 0 : input(i - 1, j);\n double down = (j == N - 1) ? 0 : input(i + 1, j);\n output(i, j) = (left + right + up + down + input(i, j)) / 5.0;\n });\n });\n}\n", + "\nfor (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy>({0, N}, {0, N}), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n sum += (i == 0 || j == 0) ? 
0 : input(i - 1, j - 1);\n sum += (i == 0) ? 0 : input(i - 1, j);\n sum += (i == 0 || j == N - 1) ? 0 : input(i - 1, j + 1);\n sum += (j == 0) ? 0 : input(i, j - 1);\n sum += input(i, j);\n sum += (j == N - 1) ? 0 : input(i, j + 1);\n sum += (i == N - 1 || j == 0) ? 0 : input(i + 1, j - 1);\n sum += (i == N - 1) ? 0 : input(i + 1, j);\n sum += (i == N - 1 || j == N - 1) ? 0 : input(i + 1, j + 1);\n output(i, j) = sum / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = 0;\n if (i > 0) output(i, j) += input(i - 1, j);\n if (i < N - 1) output(i, j) += input(i + 1, j);\n if (j > 0) output(i, j) += input(i, j - 1);\n if (j < N - 1) output(i, j) += input(i, j + 1);\n output(i, j) += input(i, j);\n output(i, j) /= 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) 
{\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n output.resize(input.size());\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i - 1][j] + input[i + 1][j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 
1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "serial", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < N - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n output[i * N + j] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += 
input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t idx = i * N + j;\n if (i > 0) sum += input[idx - N];\n if (i < N - 1) sum += input[idx + N];\n if (j > 0) sum += input[idx - 1];\n if (j < N - 1) sum += input[idx + 1];\n sum += input[idx];\n output[idx] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t idx = i * N + j;\n if (i > 0) sum += input[idx - N];\n if (i < N - 1) sum += input[idx + N];\n if (j > 0) sum += input[idx - 1];\n if (j < N - 1) sum += input[idx + 1];\n sum += input[idx];\n output[idx] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 
0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "hip", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\n__global__ void jacobi2D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\nint i = blockIdx.x * blockDim.x + threadIdx.x;\nint j = blockIdx.y * blockDim.y + threadIdx.y;\nif (i < N && j < N) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) /\n 5;\n}\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n output[y * N + x] =\n (input[y * N + (x - 1 < 0 ? 0 : x - 1)] +\n input[y * N + (x + 1 >= N ? N - 1 : x + 1)] +\n input[(y - 1 < 0 ? 0 : y - 1) * N + x] +\n input[(y + 1 >= N ? N - 1 : y + 1) * N + x] + input[y * N + x]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = i > 0 ? input[i - 1 + j * N] : 0;\n double right = i < N - 1 ? input[i + 1 + j * N] : 0;\n double up = j > 0 ? input[i + (j - 1) * N] : 0;\n double down = j < N - 1 ? 
input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * blockDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.y;\n\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (i < N && j < N) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n double left = (x > 0) ? input[y * N + x - 1] : 0;\n double right = (x < N - 1) ? input[y * N + x + 1] : 0;\n double up = (y > 0) ? input[(y - 1) * N + x] : 0;\n double down = (y < N - 1) ? 
input[(y + 1) * N + x] : 0;\n double center = input[y * N + x];\n output[y * N + x] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i >= N || j >= N) {\n return;\n }\n double sum = 0;\n if (i > 0) {\n sum += input[i - 1 + j * N];\n }\n if (i < N - 1) {\n sum += input[i + 1 + j * N];\n }\n if (j > 0) {\n sum += input[i + (j - 1) * N];\n }\n if (j < N - 1) {\n sum += input[i + (j + 1) * N];\n }\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i < N && j < N) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i >= N || j >= N) return;\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) /\n 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + ",", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x >= 0 && x < N && y >= 0 && y < N) {\n double left = (x > 0) ? input[y * N + x - 1] : 0;\n double right = (x < N - 1) ? input[y * N + x + 1] : 0;\n double up = (y > 0) ? 
input[(y - 1) * N + x] : 0;\n double down = (y < N - 1) ? input[(y + 1) * N + x] : 0;\n double center = input[y * N + x];\n output[y * N + x] = (left + right + up + down + center) / 5;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t local_N = N / size;\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n if (rank != 0) {\n MPI_Send(&output[start * N], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N * N], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n int nCols = N;\n std::vector localInput(nRows * nCols);\n std::vector localOutput(nRows * nCols);\n MPI_Scatter(input.data(), nRows * nCols, MPI_DOUBLE, localInput.data(), nRows * nCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (int i = 0; i < nRows; i++) {\n for (int j = 0; j < nCols; j++) {\n double left = (j == 0) ? 0 : localInput[i * nCols + j - 1];\n double right = (j == nCols - 1) ? 0 : localInput[i * nCols + j + 1];\n double up = (i == 0) ? 0 : localInput[(i - 1) * nCols + j];\n double down = (i == nRows - 1) ? 
0 : localInput[(i + 1) * nCols + j];\n localOutput[i * nCols + j] = (left + right + up + down + localInput[i * nCols + j]) / 5;\n }\n }\n MPI_Gather(localOutput.data(), nRows * nCols, MPI_DOUBLE, output.data(), nRows * nCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = (rank + 1) * local_N;\n\n // Compute the local output grid\n std::vector local_output(local_N * N, 0);\n\n // Compute the local jacobi stencil\n for (size_t i = 1; i < local_N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n local_output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local output grids into the global output grid\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the average in the local output\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t 
local_N = N / size;\n\n // Compute the offset of the local grid\n size_t offset = rank * local_N;\n\n // Compute the local grid\n std::vector local_input(local_N * N);\n std::vector local_output(local_N * N);\n\n // Copy the relevant part of the input grid to the local grid\n MPI_Scatter(input.data(), local_N * N, MPI_DOUBLE, local_input.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local jacobi iteration\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the output grid on rank 0\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min((rank + 1) * local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N * N], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += 
input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output grid\n std::vector local_output(local_N * N, 0);\n\n // Iterate over the local grid\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output grid\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local output grids into the global output grid\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Iterate over the local grid\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the average in the output grid\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(&output[start][0], local_N * N, MPI_DOUBLE, &output[0][0], local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * 
local_N;\n size_t end = (rank + 1) * local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N, 0);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min(start_row + local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N);\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min((rank + 1) * local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < N - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n\n // Store the average in the output\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n 
// Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows and columns per rank\n int rows_per_rank = N / size;\n int cols_per_rank = N;\n\n // Compute the starting row and column for this rank\n int start_row = rank * rows_per_rank;\n int start_col = 0;\n\n // Compute the ending row and column for this rank\n int end_row = start_row + rows_per_rank;\n int end_col = start_col + cols_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * cols_per_rank);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = start_col; j < end_col; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local results\n local_output[(i - start_row) * cols_per_rank + (j - start_col)] = avg;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), rows_per_rank * cols_per_rank, MPI_DOUBLE, output.data(), rows_per_rank * cols_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_size = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_size;\n size_t end = (rank + 1) * local_size;\n\n // Compute the local output\n std::vector local_output(local_size * N);\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_size * N, MPI_DOUBLE, output.data(), local_size * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the number of rows per process\n int rows_per_proc = N / num_procs;\n\n // Compute the starting and ending rows for this process\n int start_row = rank * rows_per_proc;\n int end_row = (rank + 1) * rows_per_proc;\n\n // Iterate over the rows assigned to this process\n for (int i = start_row; i < end_row; ++i) {\n // Iterate over the columns\n for (int j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the output\n output[i * N + j] = avg;\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), N * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible 
for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "cuda", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\n__global__ void jacobi2D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] +\n input[i * N + j + N] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i == 0) ? 0 : input[i - 1 + j * N];\n double right = (i == N - 1) ? 0 : input[i + 1 + j * N];\n double up = (j == 0) ? 0 : input[i + (j - 1) * N];\n double down = (j == N - 1) ? 0 : input[i + (j + 1) * N];\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i == 0) ? 0 : input[i - 1 + j * N];\n double right = (i == N - 1) ? 0 : input[i + 1 + j * N];\n double up = (j == 0) ? 
0 : input[i + (j - 1) * N];\n double down = (j == N - 1) ? 0 : input[i + (j + 1) * N];\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5.0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i >= N || j >= N) return;\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n}\n", + "\nint i = blockIdx.x + blockIdx.y * blockDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] +\n input[i * N + j - 1] + input[i * N + j + 1] +\n input[i * N + j]) /\n 5;\n }\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i >= N || j >= N) return;\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + (j - 1 + N) % N] + input[i * N + (j + 1) % N] +\n input[(i - 1 + N) % N * N + j] + input[(i + 1) % N * N + j] +\n input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * gridDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.x;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? 
input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n output[y * N + x] = (input[y * N + x - 1] + input[y * N + x + 1] + input[(y - 1) * N + x] + input[(y + 1) * N + x] + input[y * N + x]) / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n double sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n sum += input[y * N + x];\n output[y * N + x] = sum / 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockDim.x * gridDim.x;\n int j = blockIdx.y + blockDim.y * gridDim.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different portion of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the result in the output\n output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local grid\n std::vector local_input(local_N * N);\n std::vector local_output(local_N * N);\n for (size_t i = 0; i < local_N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_input[i * N + j] = input[(start + i) * N + j];\n }\n }\n\n // Compute the local jacobi 
stencil\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0241949478, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0754329708, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0610903186, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.055562458, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0560252174, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0591844304, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0630770958, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06827622, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n_local = N / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n for (int j = 0; j < N; j++) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 
0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * n_local * N], n_local * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start * N], n_local * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0241362072, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0244049658, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0124735372, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0100454622, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0106173892, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091821966, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085442614, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0076641582, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0097299274, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00919979375, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local grid\n std::vector local_input(input.begin() + start * N, input.begin() + end * N);\n std::vector local_output(local_N * N);\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] +\n local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] +\n local_input[i * N + j]) /\n 5.0;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0241287194, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0548812098, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.048976476, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0486432808, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0488088834, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0519654716, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.056057715, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.0625592488, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min(start_row + local_N, N);\n\n // Create a local copy of the input\n std::vector local_input(local_N * N);\n MPI_Scatter(input.data(), N * N, MPI_DOUBLE, local_input.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Create a local copy of the output\n std::vector local_output(local_N * N);\n\n // Compute the local part of the jacobi stencil\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < end_row - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0241845782, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0766389008, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0609031154, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0554162274, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0560610954, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0586718694, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.063004049, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.068401561, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n size_t start_row = rank * local_N;\n size_t end_row = start_row + local_N;\n\n // Compute the local part of the output\n std::vector local_output(local_N * N, 0);\n #pragma omp parallel for\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.026692969, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 
0.0365371738, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0321473574, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0323206902, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0326622674, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0341440252, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0353282644, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.035947883, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local portion of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n // Check if we are on the boundary\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n // If we are on the boundary, set the value to 0\n local_output[i * N + j] = 0;\n } else {\n // Otherwise, compute the average of the neighbors\n sum += input[(i - 1) * N + j];\n sum += input[(i + 1) * N + j];\n sum += input[i * N + (j - 1)];\n sum += input[i * N + (j + 1)];\n sum += input[i * N + j];\n local_output[i * N + j] = sum / 5;\n }\n }\n }\n\n // Gather the local results into the output on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0268012874, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0571525668, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0408178194, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0336762854, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0329964482, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0342165892, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0354585132, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0363023546, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute 
the jacobi stencil for the subset of rows assigned to this rank\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the local subset of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local size of the grid\n size_t local_N = N / nprocs;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n // Check if we are at the boundary\n if (i == 0) {\n sum += 0;\n } else {\n sum += input[i - 1][j];\n }\n if (i == N - 1) {\n sum += 0;\n } else {\n sum += input[i + 1][j];\n }\n if (j == 0) {\n sum += 0;\n } else {\n sum += input[i][j - 1];\n }\n if (j == N - 1) {\n sum += 0;\n } else {\n sum += input[i][j + 1];\n }\n sum += input[i][j];\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[start][0], local_N * N, MPI_DOUBLE, &output[0][0], local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + 
"did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the local subset of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n local_output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output on rank 0\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE, output.data(), output.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0241299356, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0482047324, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0360982834, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0333944808, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0328452788, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0343403014, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0353402534, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03638427, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local size of the grid\n size_t local_size = N / nprocs;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_size;\n size_t end = (rank + 1) * local_size;\n\n // Compute the local grid\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size * local_size);\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = 1; i < local_size - 1; i++) {\n for (size_t j = 1; j < local_size - 1; j++) {\n local_output[i * local_size + j] = (local_input[i * local_size + j - 1] + local_input[i * local_size + j + 1] +\n local_input[(i - 1) * local_size + j] + local_input[(i + 1) * local_size + j] +\n local_input[i * local_size + j]) /\n 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_size * local_size, MPI_DOUBLE, output.data(), local_size * local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local bounds\n size_t local_N = N / size;\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute local jacobi2D\n #pragma omp parallel for\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i][j] = sum / 5;\n }\n }\n\n // Gather results to rank 0\n if (rank != 0) {\n MPI_Send(&output[start][0], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N][0], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_size = (N + size - 1) / size;\n size_t start = rank * local_size;\n size_t end = std::min((rank + 1) * local_size, N);\n\n // Compute the local part of the output\n std::vector local_output(local_size * N);\n #pragma omp parallel for\n for (size_t i = 0; i < local_size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < local_size - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_size * N, MPI_DOUBLE, output.data(), local_size * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.024154069, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0470731972, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0357140548, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.033289989, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.033580682, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0342250508, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0355685798, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0357372284, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N;\n int nCols = 
N;\n int nRowsPerRank = nRows / size;\n int startRow = rank * nRowsPerRank;\n int endRow = (rank + 1) * nRowsPerRank;\n if (rank == size - 1) {\n endRow = nRows;\n }\n #pragma omp parallel for collapse(2)\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < nCols; j++) {\n double left = (j == 0) ? 0 : input[i * nCols + j - 1];\n double right = (j == nCols - 1) ? 0 : input[i * nCols + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * nCols + j];\n double down = (i == nRows - 1) ? 0 : input[(i + 1) * nCols + j];\n output[i * nCols + j] = (left + right + up + down + input[i * nCols + j]) / 5;\n }\n }\n if (rank != 0) {\n MPI_Send(&output[startRow * nCols], (endRow - startRow) * nCols, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * nRowsPerRank;\n int endRow = (i + 1) * nRowsPerRank;\n if (i == size - 1) {\n endRow = nRows;\n }\n MPI_Recv(&output[startRow * nCols], (endRow - startRow) * nCols, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0241580866, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0473273178, + "num_procs": 1, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0238076168, + "num_procs": 1, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0121865728, + "num_procs": 1, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0109396018, + "num_procs": 1, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0093816266, + "num_procs": 1, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008876311, + "num_procs": 1, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0075928328, + "num_procs": 1, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0095879138, + "num_procs": 2, + "num_threads": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0088990445, + "num_procs": 4, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) { sum += input[i - 1][j]; }\n if (i < N - 1) { sum += input[i + 1][j]; }\n if (j > 0) { sum += input[i][j - 1]; }\n if (j < N - 1) { sum += input[i][j + 1]; }\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < 
size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the local portion of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = N / nprocs;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = start_row + rows_per_rank;\n\n // Create a buffer to store the results for this rank\n std::vector local_output(N * rows_per_rank);\n\n // Compute the Jacobi stencil for this rank\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n MPI_Gather(local_output.data(), N * rows_per_rank, MPI_DOUBLE, output.data(), N * rows_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + } +] \ No newline at end of file diff --git a/results/015cff6f/gemini-pro_temp0.2/mpi.json b/results/015cff6f/gemini-pro_temp0.2/mpi.json new file mode 100644 index 0000000..21f802d --- /dev/null +++ b/results/015cff6f/gemini-pro_temp0.2/mpi.json 
@@ -0,0 +1,44261 @@ +[ + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); j++) {\n for (int k = j+1; k < points.extent(0); k++) {\n double area = triangleArea(points(i), points(j), points(k));\n minArea = std::min(minArea, area);\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy>(\n 0, points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const Kokkos::IndexType i, double &minArea) {\n for (Kokkos::IndexType j = i + 1; j < points.extent(0); ++j) {\n for (Kokkos::IndexType k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n },\n minArea);\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "0.0)", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n double area = triangleArea(points(i), points(j), points(k));\n minArea = std::min(minArea, area);\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(\"smallestArea\", points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); j++) {\n for (int k = j+1; k < points.extent(0); k++) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using exec_space = Kokkos::DefaultExecutionSpace;\n using view_type = Kokkos::View;\n\n const int num_points = points.extent(0);\n view_type areas(\"areas\", num_points * num_points * num_points);\n\n 
parallel_for(policy_type(0, num_points * num_points * num_points), KOKKOS_LAMBDA(const int i) {\n const int i1 = i / (num_points * num_points);\n const int i2 = (i / num_points) % num_points;\n const int i3 = i % num_points;\n if (i1 != i2 && i1 != i3 && i2 != i3) {\n areas(i) = triangleArea(points(i1), points(i2), points(i3));\n } else {\n areas(i) = std::numeric_limits::max();\n }\n });\n\n return *min_element(areas.data(), areas.data() + areas.extent(0));\n}\n", + "\n\n double area = 0;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return area;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(\"smallestArea\", policy, KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n \"smallestArea\",\n policy_type(0, points.extent(0) * points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const int& i) {\n const int i1 = i / (points.extent(0) * points.extent(0));\n const int i2 = (i / points.extent(0)) % points.extent(0);\n const int i3 = i % points.extent(0);\n if (i1 != i2 && i2 != i3 && i1 != i3) {\n minArea = std::min(minArea, triangleArea(points(i1), points(i2), points(i3)));\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n policy_type(0, points.extent(0) - 2),\n KOKKOS_LAMBDA(const int i) {\n for (int j = i + 1; j < points.extent(0) - 1; ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = 
std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(\"smallestArea\", points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const int& i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n \"smallestArea\",\n policy_type(0, points.extent(0) * points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const int i) {\n const int i1 = i / (points.extent(0) * points.extent(0));\n const int i2 = (i / points.extent(0)) % points.extent(0);\n const int i3 = i % points.extent(0);\n if (i1 != i2 && i1 != i3 && i2 != i3) {\n minArea = std::min(minArea, triangleArea(points(i1), points(i2), points(i3)));\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, 
triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n#pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 
1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double total_area = 0;\n #pragma omp parallel for reduction(+: total_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n total_area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return total_area;\n}\n", + "\n\n double area = 0;\n#pragma omp parallel for reduction(+:area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return area;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min:minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n 
}\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "<", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < 
points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n if (n < 3) {\n return 0;\n }\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double area = 0;\n for (size_t i = 0; i < points.size(); ++i) {\n area += ::Area(points[i], points[(i + 1) % points.size()], points[(i + 2) % points.size()]);\n }\n return area;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * fabs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Compute the area of the smallest triangle that can be formed by any 3 points.\n Return the result in area.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\n__global__ void smallestArea(const Point *points, size_t N, double *area) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid >= N) return;\n\n double minArea = DBL_MAX;\n\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double area = triangleArea(points[gid], points[i], points[j]);\n if (area < minArea) minArea = area;\n }\n }\n\n if (tid == 0) *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double a = triangleArea(points[i], points[j], points[k]);\n minArea = min(minArea, a);\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = 1e100;\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n for (int k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid >= N) {\n return;\n }\n double minArea = triangleArea(points[tid], points[(tid + 1) % N], points[(tid + 2) % N]);\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n minArea = fmin(minArea, triangleArea(points[tid], points[i], points[j]));\n }\n }\n area[tid] = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for 
(size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = triangleArea(points[0], points[1], points[2]);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n int tid = get_global_id(0);\n if (tid < N) {\n area[tid] = ::Area(points[tid], points[(tid + 1) % N], points[(tid + 2) % N]);\n }\n}\n", + "\n\n Point A = points[threadIdx.x];\n Point B = points[threadIdx.y];\n Point C = points[threadIdx.z];\n double t = triangleArea(A, B, C);\n atomicMin(area, t);\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Each thread computes the area of the triangle formed by itself and the next two points.\n // The smallest area is stored in the shared memory.\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n int tid = threadIdx.x;\n if (tid < N - 2) {\n double area = triangleArea(points[tid], points[tid + 1], points[tid + 2]);\n if (area < minArea) {\n minArea = area;\n }\n }\n\n // Reduce the shared memory to find the smallest area.\n __syncthreads();\n if (tid == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n // Return the result in area.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n __shared__ Point minA, minB, minC;\n\n double localArea = triangleArea(points[threadIdx.x], points[threadIdx.y], points[threadIdx.z]);\n if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {\n minArea = localArea;\n minA = points[threadIdx.x];\n minB = points[threadIdx.y];\n minC = points[threadIdx.z];\n }\n\n __syncthreads();\n\n if (localArea < minArea) {\n minArea = localArea;\n minA = points[threadIdx.x];\n minB = points[threadIdx.y];\n minC = points[threadIdx.z];\n }\n\n __syncthreads();\n\n if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t 
j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Store the result in the output array.\n *area = minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0029717758, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0036429844, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0039193195, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0039855228, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00403543685, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0029588408, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032892442, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017555191, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008945737, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000442565425, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00024655595, + "num_procs": 16 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.00014658163125, + "num_procs": 32 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the areas to find the minimum\n double minArea;\n MPI_Reduce(&areas[0], &minArea, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the minimum area from process 0\n if (rank == 0) {\n return minArea;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n minArea = std::min(minArea, area);\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0029646662, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033172972, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0035577411, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0036042625, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003395996325, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003609104525, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003549534025, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00358964358438, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00367844171562, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00386057794922, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00432118511172, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int myRank, numProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numProcs);\n\n // Each process computes the area of all possible triangles\n std::vector localAreas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n localAreas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all areas to the root process\n std::vector globalAreas;\n MPI_Gather(localAreas.data(), localAreas.size(), MPI_DOUBLE,\n globalAreas.data(), 
localAreas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double smallestArea = std::numeric_limits::max();\n if (myRank == 0) {\n for (double area : globalAreas) {\n smallestArea = std::min(smallestArea, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&smallestArea, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return smallestArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all areas to the root process\n std::vector global_areas;\n MPI_Gather(local_areas.data(), local_areas.size(), MPI_DOUBLE,\n global_areas.data(), local_areas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double smallest_area = std::numeric_limits::max();\n if (rank == 0) {\n for (double area : global_areas) {\n smallest_area = std::min(smallest_area, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&smallest_area, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return smallest_area;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of points per process\n int points_per_process = points.size() / size;\n\n // Compute the starting and ending indices for this process\n int start = rank * points_per_process;\n int end = (rank + 1) * points_per_process;\n\n // Compute the smallest area for this process\n double min_area = std::numeric_limits::max();\n for (int i = start; i < end; ++i) {\n for (int j = i + 1; j < end; ++j) {\n for (int k = j + 1; k < end; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < min_area) {\n min_area = area;\n }\n }\n }\n }\n\n // Reduce the minimum areas from all processes\n double global_min_area;\n MPI_Allreduce(&min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN);\n\n return global_min_area;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "&", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0029651168, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.003616007, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0039369446, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0040553933, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0039522697, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the areas to find the smallest one\n double smallest;\n MPI_Allreduce(areas.data(), &smallest, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the global minimum\n double global_min;\n MPI_Allreduce(local_areas.data(), &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Find the minimum area triangle\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Return the minimum area\n return minArea;\n}\n\n", + "source_write_success": true, + "did_build": 
true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0029624276, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031725724, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003278674, + "num_procs": 2 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 4 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(&local_areas[0], &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area from process 0\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1; // Other processes return -1 to avoid printing multiple times\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the areas to find the smallest one\n double smallestArea;\n MPI_Allreduce(areas.data(), &smallestArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return smallestArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each process computes the area of the smallest triangle formed by any 3 of its points\n double min_area = std::numeric_limits::max();\n for (size_t i = my_rank; i < points.size(); i += num_procs) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n min_area = std::min(min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the minimum area across all processes\n double global_min_area;\n MPI_Allreduce(&min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min_area;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0029666282, + "runs": [ + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.0029503348, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016250278, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007510029, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000403340125, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000233598825, + "num_procs": 16 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all local areas into a single vector on process 0\n std::vector global_areas;\n if (rank == 0) {\n global_areas.resize(local_areas.size() * size);\n }\n MPI_Gather(local_areas.data(), local_areas.size(), MPI_DOUBLE,\n global_areas.data(), local_areas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double min_area = std::numeric_limits::max();\n if (rank == 0) {\n for (double area : global_areas) {\n min_area = std::min(min_area, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&min_area, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return min_area;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.002971807, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0127925662, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0463115909, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10512744335, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.14436508355, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n for (size_t i = rank; i < points.size(); i += size) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0029768048, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.002988777, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015727625, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00079208045, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004111861, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + 
"runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0029628886, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033087308, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017605778, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088391445, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004517345, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002413964, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00013618524375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 8.468447813e-05, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0029591072, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032965478, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017157603, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00087722395, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000459664725, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002457717125, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00014354570625, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 9.700814375e-05, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "&", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "cuda", + "prompt": "struct 
Point {\n\tdouble x, y;\n};\n\n__device__ double triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * fabs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Compute the area of the smallest triangle that can be formed by any 3 points.\n Return the result in area.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\n__global__ void smallestArea(const Point *points, size_t N, double *area) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Each thread computes the area of a triangle formed by itself and the next two points.\n // The minimum area is stored in the shared memory.\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N - 2) {\n double a = triangleArea(points[tid], points[tid + 1], points[tid + 2]);\n if (a < minArea) {\n minArea = a;\n }\n }\n __syncthreads();\n\n // The thread with the minimum area writes the result to the output pointer.\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid >= N) return;\n\n double minArea = DBL_MAX;\n\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double area = triangleArea(points[gid], points[i], points[j]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[gid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n double minArea = triangleArea(points[0], points[1], points[2]);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n 
if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Each thread computes the area of a triangle formed by 3 points.\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[tid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n // Compute the area of the smallest triangle formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Store the result in the output array.\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n __shared__ Point minA, minB, minC;\n\n // Compute the area of all triangles\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n minA = points[i];\n minB = points[j];\n minC = points[k];\n }\n }\n }\n }\n\n // Reduce the minimum area across all threads\n __syncthreads();\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n // Return the result in area.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[tid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < minArea) {\n minArea = a;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Initialize the area to a large value\n *area = DBL_MAX;\n\n // Loop over all possible combinations of 3 points\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < 
N; k++) {\n // Compute the area of the triangle formed by the 3 points\n double triangle_area = triangleArea(points[i], points[j], points[k]);\n\n // Update the area if the triangle area is smaller than the current minimum\n if (triangle_area < *area) {\n *area = triangle_area;\n }\n }\n }\n }\n}\n", + "\n\n *area = DBL_MAX;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < minArea) {\n minArea = a;\n }\n }\n }\n }\n *area = minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int num_points = points.size();\n int points_per_proc = num_points / num_procs;\n int start_index = my_rank * points_per_proc;\n int end_index = (my_rank + 1) * points_per_proc;\n if (my_rank == num_procs - 1) {\n end_index = num_points;\n }\n\n double local_area = 0.0;\n for (int i = start_index; i < end_index; i++) {\n for (int j = i + 1; j < num_points; j++) {\n for (int k = j + 1; k < num_points; k++) {\n local_area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n\n double global_area = 0.0;\n MPI_Reduce(&local_area, &global_area, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_area;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n // Use OpenMP to parallelize the loop over all possible triples of points\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double 
local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(&local_areas[0], &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1;\n }\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Allreduce(local_areas.data(), &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector localAreas;\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n localAreas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double globalMinArea;\n MPI_Allreduce(localAreas.data(), &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n 
MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = points.size();\n double local_min = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = points.size();\n double local_min = std::numeric_limits::max();\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n local_min = std::min(local_min, area);\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = points.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each process calculates the area of all possible triangles\n std::vector local_areas(n * (n - 1) * (n - 2) / 6);\n int count = 0;\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n local_areas[count++] = triangleArea(points[i], points[j], points[k]);\n }\n }\n }\n\n // Reduce the local areas to find the global minimum\n double global_min;\n MPI_Allreduce(local_areas.data(), &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = rank; i < points.size(); i += size) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local 
areas to find the smallest area\n double global_min_area;\n MPI_Reduce(local_areas.data(), &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n int n = points.size();\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n minArea = std::min(minArea, area);\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN);\n return globalMinArea;\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int num_points = points.size();\n double min_area = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: min_area)\n for (int i = 0; i < num_points; i++) {\n for (int j = i + 1; j < num_points; j++) {\n for (int k = j + 1; k < num_points; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n min_area = std::min(min_area, area);\n }\n }\n }\n\n return min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &minArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "kokkos", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Create a team policy for parallel execution\n TeamPolicy policy(x.extent(0));\n\n // Define the kernel lambda\n auto kernel = KOKKOS_LAMBDA(const TeamPolicy<> &policy, int i) {\n double min_dist = std::numeric_limits::max();\n for (int j = i + 1; j < x.extent(0); ++j) {\n min_dist = std::min(min_dist, distance(x(i), x(j)));\n }\n return min_dist;\n };\n\n // Execute the kernel in parallel\n auto min_dists = parallel_reduce(policy, kernel);\n\n // Find the minimum distance among all pairs\n double min_dist = std::numeric_limits::max();\n for (int i = 0; i < min_dists.extent(0); ++i) {\n min_dist = std::min(min_dist, min_dists(i));\n }\n\n return min_dist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist[0] = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist[0]) {\n\t\t\t\tmin_dist[0] = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist[0];\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double &min_dist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist(0);\n}\n", + "\n\n\tconst int n = x.extent(0);\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(\"closestPair\", Kokkos::RangePolicy(0, n - 1), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing policy = Kok\u043a\u043e\u0441::Range\u653f\u7b56;\n\tusing view_type = Kok\u043a\u043e\u0441::View;\n\n\tview_type min_dist(\"min_dist\", std::numeric_limits::max());\n\n\tauto lambda = KOKKOS_LAMBDA(const double x1, const double x2) {\n\t\tdouble dist = ::abs(x1 - x2);\n\t\tif (dist < min_dist()) {\n\t\t\tmin_dist() = dist;\n\t\t}\n\t};\n\n\t// Iterate over all possible un-sorted pair of elements\n\t// and find the minimum distance between them.\n\t// Note that we are iterating over all the elements in the vector\n\t// multiple times. 
This is not efficient.\n\t// A more efficient approach would be to sort the vector first\n\t// and then only consider the minimum distance between each element\n\t// and its immediate neighbor.\n\t// However, sorting is not supported by Kok\u043a\u043e\u0441.\n\t// So, we are using this simple approach for now.\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < x.size(); ++j) {\n\t\t\tlambda(x[i], x[j]);\n\t\t}\n\t}\n\n\treturn min_dist();\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int i, double &min_dist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist(0);\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\tKok\u30b3\u30af::TaskGraph tg;\n\tKok\u30b3\u30af::Task task([&](Kok\u30b3\u30af::TaskContext ctx) -> double {\n\t\tdouble min = std::abs(x[0] - x[1]);\n\t\tfor (size_t i = 0; i < x.size(); i++) {\n\t\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\t\tmin = std::min(min, distance(x[i], x[j]))\n\t\t\t}\n\t\t}\n\t\treturn min;\n\t});\n\ttg.run(task);\n\treturn task.get();\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::RangePolicy;\n\tusing tag_type = Kokkos::pair;\n\n\tauto min_distance = Kokkos::parallel_reduce(\n\t\tpolicy_type(0, x.size() * x.size()),\n\t\tKOKKOS_LAMBDA(const tag_type &idx, double &min_distance) {\n\t\t\tint i = idx.first / x.size();\n\t\t\tint j = idx.first % x.size();\n\t\t\tif (i != j) {\n\t\t\t\tmin_distance = std::min(min_distance, distance(x(i), x(j)));\n\t\t\t}\n\t\t},\n\t\tmin_distance);\n\n\treturn min_distance;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in x.\n\tKokkos::View distances(\"distances\", x.size() * x.size());\n\tKokkos::parallel_for(Kokkos::RangePolicy(0, x.size() * x.size()), KOKKOS_LAMBDA(const int i) {\n\t\tdistances(i) = distance(x(i / x.size()), x(i % x.size()));\n\t});\n\n\t// Find the minimum distance.\n\tdouble minDistance = Kokkos::min_reduce(distances, Kokkos::Min());\n\n\treturn minDistance;\n}\n", + "\n\n\tKokkos::View min_dist(\"min_dist\", 1);\n\tmin_dist[0] = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) 
{\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist[0]) {\n\t\t\t\tmin_dist[0] = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist[0];\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist(0) = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tdouble d = distance(x(i), x(j));\n\t\t\tif (d < min_dist(0)) {\n\t\t\t\tmin_dist(0) = d;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist(0);\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int &i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\n\treturn minDist;\n}\n", + "\n\n\tconst int n = x.extent(0);\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tauto min_distance = Kokkos::parallel_reduce(\n\t\tx.extent(0), KOKKOS_LAMBDA(const int i, double min) {\n\t\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\t\tmin = std::min(min, distance(x(i), x(j)));\n\t\t\t}\n\t\t\treturn min;\n\t\t},\n\t\tstd::numeric_limits::max());\n\n\treturn min_distance;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist(0) = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tdouble dist = distance(x(i), x(j));\n\t\t\tif (dist < min_dist(0)) {\n\t\t\t\tmin_dist(0) = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist(0);\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "omp", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use OpenMP to compute in parallel.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor 
(int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble min_distance = std::numeric_limits::max();\n\tint n = x.size();\n\t#pragma omp parallel for reduction(min: min_distance)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_distance = std::min(min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "&", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_distance = std::abs(x[0] - x[1]);\n#pragma openmp parallel for reduction(min:min_distance)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_distance = std::min(min_distance, distance(x[i], x[j]);\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n#pragma omp parallel for reduction(min: min_dist)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = 
std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n - 1; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "serial", + "prompt": "double distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 1; i < x.size(); ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i - 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn -1;\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = distance(x[0], x[1]);\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn 
std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tminDist = std::min(minDist, distance(x[i], x[i + 1]));\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = distance(x[0], x[1]);\n\tfor (size_t i = 1; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn -1;\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble minDist = distance(x[0], x[1]);\n\tfor (int i = 1; i < x.size() - 1; ++i) {\n\t\tminDist = std::min(minDist, distance(x[i], x[i + 1]));\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 1; i < x.size(); ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i - 1]));\n\t}\n\treturn min_dist;\n}\n", + "(", + "\n\n double min_dist = std::numeric_limits::max();\n for (size_t i = 0; i < 
x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n min_dist = std::min(min_dist, distance(x[i], x[j]));\n }\n }\n return min_dist;\n}\n", + "\n\n double min_distance = std::numeric_limits::max();\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n min_distance = std::min(min_distance, distance(x[i], x[j]));\n }\n }\n return min_distance;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "hip", + "prompt": "__device__ double distanceBetweenPoints(double x1, double x2) {\n\treturn fabs(x1 - x2);\n}\n\n/* Compute the distance between the closest two elements in the vector x.\n Store the result in distance.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\n__global__ void closestPair(const double *x, size_t N, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tdouble minDist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tfor (int i = tid + 2; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\tdistance[tid] = minDist;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= N) {\n\t\treturn;\n\t}\n\n\tdouble minDist = distanceBetweenPoints(x[gid], x[gid + 1]);\n\tfor (int i = gid + 2; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[gid], x[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "losest", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tdouble minDistance = DBL_MAX;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDistance) {\n\t\t\t\tminDistance = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDistance;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint grid_size = gridDim.x;\n\tint i = bid * bid_size + tid;\n\tif (i >= N) {\n\t\treturn;\n\t}\n\tdouble min_dist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t}\n\tdistance[tid] = 
min_dist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\nfor (size_t i = 0; i < N; ++i) {\n\t\tdistance[i] = distance(x[i], x[i + 1]);\n\t}\n}\n", + "\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint bid_size = hipBlockDim_x;\n\n\tdouble min_dist = DBL_MAX;\n\n\tfor (int i = bid * bid_size + tid; i < N; i += bid_size) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tif (tid == 0 || dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble min_dist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = min_dist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "losest", + "\n\n\tint tid = threadIdx.x;\n\tint blockDim = blockDim.x;\n\tint blockIdx = blockIdx.x;\n\tint i = blockIdx * blockDim + tid;\n\n\tdouble min_dist = DBL_MAX;\n\twhile (i < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[i + 1]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t\ti += blockDim * gridDim.x;\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble min_dist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = min_dist;\n}\n" + ] + 
}, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "mpi", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process finds the closest pair in its local part of the vector.\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j += size) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0624480396, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0880291534, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between its elements\n\tfor (int i = my_rank; i < n; i += num_procs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.083717864, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0834881588, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0444831406, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.022673923, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01556412405, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0055705966625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00562102226875, + "num_procs": 32 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 64 + } + ] + }, + { + "generated_output": "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = n / size;\n\tint start = rank * chunk_size;\n\tint end = start + chunk_size;\n\tif (rank == size - 1) {\n\t\tend = 
n;\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0624142852, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0837000408, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn globalMinDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0835341956, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0836531906, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0441102481, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0311212381, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.024575686125, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0055901308625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00848711438125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00160097109375, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04704139556406, + "num_procs": 128 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 256 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0832377746, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0834614268, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0441881837, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02222158205, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0215221394, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0057453834, + "num_procs": 16 + 
}, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0621596348, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.084345917, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0440206066, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0219805747, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01126383015, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005777579275, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033232853, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00203420200312, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00093304712031, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00796584752188, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00038824376641, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0622301998, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0834721428, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = (n + size - 1) / size;\n\tint start = rank * chunk_size;\n\tint end = std::min(start + chunk_size, n);\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; 
i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0619917132, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.083666289, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tstd::vector local_x(x.begin() + start, x.begin() + end);\n\tstd::sort(local_x.begin(), local_x.end());\n\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < local_n - 1; i++) {\n\t\tlocal_min_dist = std::min(local_min_dist, distance(local_x[i], local_x[i + 1]));\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0628903128, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010946272, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0621829822, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0844935998, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04460835, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04123356695, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tif (rank == size - 1) {\n\t\tend = x.size();\n\t}\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local 
minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0834046958, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0846692978, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\tint n = x.size();\n\tint local_n = n / num_procs;\n\tint start = my_rank * local_n;\n\tint end = (my_rank + 1) * local_n;\n\tif (my_rank == num_procs - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0632350514, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.083669002, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between its elements\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0835422968, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0835742494, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0437369348, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03043984895, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0110838093, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061451062625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030882321375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00308154306875, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00089744709063, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02596959632422, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00039845168789, + "num_procs": 512 + } + ] + }, + { + 
"generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn globalMinDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.083865032, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1257823348, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.065984476, + "num_procs": 2 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 4 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the distance between its elements and the elements of the next process.\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0622445912, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0838319174, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0439699622, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02271456395, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010733031675, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.013691154975, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00778206758125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00232517865, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = (rank + 1) * local_size;\n\tif (rank == size - 1) {\n\t\tend = x.size();\n\t}\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0841053402, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0872633288, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0836207682, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1258748278, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0657201784, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0330630096, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01641487525, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0088563472875, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00446961043125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025364414375, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00908963440781, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00085462370469, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00044471177969, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = n / size;\n\tint start = rank * chunk_size;\n\tint end = (rank + 1) * chunk_size;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0623783356, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.083574162, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, 
distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0623763316, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0842600498, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0441550587, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02218342345, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.011708943225, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00577372735, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00300186745625, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00385264248125, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00089960968125, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01106321063906, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004126412293, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the distance between its elements\n\tstd::vector local_distances;\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tfor (size_t j = i + 1; j < x.size(); j += size) {\n\t\t\tlocal_distances.push_back(distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Gather all distances to the root process\n\tstd::vector global_distances;\n\tMPI_Gather(local_distances.data(), local_distances.size(), MPI_DOUBLE,\n\t\tglobal_distances.data(), local_distances.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the minimum distance\n\tdouble min_distance = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double d : global_distances) {\n\t\t\tmin_distance = std::min(min_distance, d);\n\t\t}\n\t}\n\n\t// Broadcast the minimum distance to all processes\n\tMPI_Bcast(&min_distance, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn min_distance;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "cuda", + "prompt": "__device__ double distanceBetweenPoints(double x1, double x2) {\n\treturn fabs(x1 - x2);\n}\n\n/* Compute the distance between the closest two elements in the vector x.\n Store the result in distance.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\n__global__ void closestPair(const double *x, size_t N, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= N) return;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = bid * blockDim.x + tid;\n\n\tif (i >= N) {\n\t\treturn;\n\t}\n\n\tdouble min_dist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint blockId = blockIdx.x;\n\tint blockDim = blockDim.x;\n\tint gridDim = gridDim.x;\n\n\t// Each thread computes the distance between its element and the next element.\n\tif (tid < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tif (tid == 0 || dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n\n\t// Each block reduces the distances computed by its threads.\n\t__shared__ double blockDistance;\n\tif (tid == 0) {\n\t\tblockDistance = *distance;\n\t}\n\t__syncthreads();\n\tif (tid < blockDim - 1) {\n\t\tdouble dist = distanceBetweenPoints(blockDistance, x[blockId * blockDim + tid + 1]);\n\t\tif (dist < blockDistance) {\n\t\t\tblockDistance = dist;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*distance = blockDistance;\n\t}\n\n\t// The block with the smallest distance broadcasts it to all other blocks.\n\t__shared__ double gridDistance;\n\tif (blockId == 0) {\n\t\tgridDistance = *distance;\n\t}\n\t__syncthreads();\n\tif (blockId < gridDim - 1) {\n\t\tdouble dist = distanceBetweenPoints(gridDistance, x[(blockId + 1) * blockDim]);\n\t\tif (dist < gridDistance) {\n\t\t\tgridDistance = dist;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (blockId == 0) {\n\t\t*distance = gridDistance;\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\t__shared__ double minDistance;\n\tminDistance = DBL_MAX;\n\n\tfor (size_t i = threadIdx.x; i < N - 1; i += blockDim.x) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDistance) {\n\t\t\t\tminDistance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (threadIdx.x == 0) {\n\t\t*distance = minDistance;\n\t}\n}\n", + "\n\n\t// Each thread computes the distance between its element and all subsequent elements.\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\t// Compute the distance between the current element and all subsequent elements.\n\tfor (int i = tid + bid * stride; i < N - 1; i += stride) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[i + 1]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = 
DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\tdouble min_dist = DBL_MAX;\n\twhile (i < N) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t\ti += blockDim.x * gridDim.x;\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = min_dist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x * gridDim.x;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tdouble minDist = DBL_MAX;\n\twhile (i < N) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\ti += blockDim.x * gridDim.x;\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (i == tid + 1 || dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= N) {\n\t\treturn;\n\t}\n\tdouble minDist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) 
{\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\tdistance[tid] = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i < N - 1) {\n\t\tdistance[i] = distance(x[i], x[i + 1]);\n\t}\n}\n", + "\n\n\t// Each thread computes the distance between its element and all elements to its right.\n\tint tid = threadIdx.x;\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min : local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tint size;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = my_rank * local_n;\n\tint end = (my_rank + 1) * local_n;\n\tif (my_rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min : local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between any two elements in its local subvector.\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all processes to find the global minimum distance.\n\tMPI_Allreduce(&min_dist, 
&min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\t// Each process computes the distance between all pairs of elements in its local subvector.\n\tstd::vector local_min_distances(n / num_procs);\n\t#pragma omp parallel for\n\tfor (int i = my_rank * (n / num_procs); i < (my_rank + 1) * (n / num_procs); i++) {\n\t\tfor (int j = i + 1; j < (my_rank + 1) * (n / num_procs); j++) {\n\t\t\tlocal_min_distances[i - my_rank * (n / num_procs)] = std::min(local_min_distances[i - my_rank * (n / num_procs)], distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Each process sends its local minimum distance to the root process.\n\tstd::vector global_min_distances(num_procs);\n\tMPI_Gather(local_min_distances.data(), n / num_procs, MPI_DOUBLE, global_min_distances.data(), n / num_procs, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// The root process finds the minimum distance among all the local minimum distances.\n\tdouble global_min_distance = std::numeric_limits::max();\n\tif (my_rank == 0) {\n\t\tfor (int i = 0; i < num_procs; i++) {\n\t\t\tglobal_min_distance = std::min(global_min_distance, global_min_distances[i]);\n\t\t}\n\t}\n\n\t// Return the global minimum distance.\n\treturn global_min_distance;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tint n = x.size();\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint my_rank, nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the closest pair in its local subvector\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = my_rank; i < n; i += nprocs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, 
&min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min_dist)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\t// Get the number of processes and the rank of the current process\n\tint num_procs, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\t// Divide the vector into chunks and assign each chunk to a process\n\tint chunk_size = x.size() / num_procs;\n\tstd::vector local_x(x.begin() + rank * chunk_size, x.begin() + (rank + 1) * chunk_size);\n\n\t// Find the closest pair in the local chunk\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min_dist)\n\tfor (int i = 0; i < local_x.size(); i++) {\n\t\tfor (int j = i + 1; j < local_x.size(); j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(local_x[i], local_x[j]));\n\t\t}\n\t}\n\n\t// Gather the local minimum distances from all processes\n\tstd::vector global_min_dists(num_procs);\n\tMPI_Gather(&local_min_dist, 1, MPI_DOUBLE, global_min_dists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the smallest global minimum distance\n\tdouble global_min_dist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double dist : global_min_dists) {\n\t\t\tglobal_min_dist = std::min(global_min_dist, dist);\n\t\t}\n\t}\n\n\t// Broadcast the global minimum distance to all processes\n\tMPI_Bcast(&global_min_dist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tif (n <= 1) {\n\t\treturn -1;\n\t}\n\n\t// Sort the vector in ascending order\n\tstd::sort(x.begin(), x.end());\n\n\t// Find the closest pair in each half of the vector\n\tdouble minDist = distance(x[0], x[1]);\n\t#pragma omp parallel\n\t{\n\t\tint tid = omp_get_thread_num();\n\t\tint numThreads = omp_get_num_threads();\n\t\tint start = tid * n / numThreads;\n\t\tint end = (tid + 1) * n / numThreads;\n\t\tdouble localMinDist = minDist;\n\t\tfor (int i = start; i < end; i++) {\n\t\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\t\tif (dist < localMinDist) {\n\t\t\t\t\tlocalMinDist = dist;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t#pragma omp critical\n\t\t{\n\t\t\tif (localMinDist < minDist) {\n\t\t\t\tminDist = localMinDist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint my_rank, 
nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\t// Each process computes the closest pair in its local subvector.\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = my_rank; i < n; i += nprocs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum.\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the closest pair in its local subvector\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint n = x.size();\n\tint n_local = n / omp_get_num_threads();\n\tint tid = omp_get_thread_num();\n\tdouble min_local = std::numeric_limits::max();\n\t\n\t#pragma omp parallel reduction(min:min_local)\n\tfor (int i = tid * n_local; i < (tid + 1) * n_local; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_local = std::min(min_local, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble min_global;\n\tMPI_Allreduce(&min_local, &min_global, 1, MPI_DOUBLE, MPI_MIN);\n\treturn min_global;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Use OpenMP to parallelize the computation of distances between pairs of elements\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Reduce(&local_min_dist, 1, MPI_DOUBLE, &global_min_dist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance.\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t}\n\treturn -1;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n double min_dist = std::numeric_limits::max();\n\n // Compute the distance between each pair of points\n parallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int& i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n double dist = distance(points(i), points(j));\n if (dist < min_dist) {\n min_dist = dist;\n }\n }\n });\n\n return min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View distances(\"Distances\", points.size());\n\tKokkos::parallel_for(\"Compute distances\", points.size(), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tdistances[i] = std::min(distances[i], distance(points[i], points[j]));\n\t\t}\n\t});\n\treturn *Kokkos::min_element(distances);\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(\"Closest Pair\", points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tparallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tparallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int j) {\n\t\t\tif (i != j) {\n\t\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\t\tif (dist < minDist) {\n\t\t\t\t\tminDist = dist;\n\t\t\t\t}\n\t\t\t}\n\t\t});\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn 
minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tminDist = std::min(minDist, dist);\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing TeamPolicy = Kokkos::TeamPolicy>;\n\tusing TeamMember = Kokkos::TeamMember;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const TeamMember& team) {\n\t\tfor (int i = team.league_rank(); i < points.extent(0); i += team.league_size()) {\n\t\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\t\tminDist = std::min(minDist, distance(points(i), points(j)));\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing namespace std;\n\n\t// Initialize the minimum distance to a large value\n\tdouble minDist = numeric_limits::max();\n\n\t// Create a team policy for parallel execution\n\tTeamPolicy policy(Kokkos::DefaultExecutionSpace(), points.extent(0));\n\n\t// Define the kernel function to compute the distance between each pair of points\n\tKokkos::parallel_for(policy, KOKKOS_LAMBDA(const int& i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::RangePolicy;\n\tusing view_type = Kokkos::View;\n\n\tconst int num_points = points.extent(0);\n\tview_type distances(\"Distances\", num_points * num_points);\n\n\tKokkos::parallel_for(policy_type(0, num_points * num_points), KOKKOS_LAMBDA(const int i) {\n\t\tconst int p1_idx = i / num_points;\n\t\tconst int p2_idx = i % num_points;\n\t\tif (p1_idx != p2_idx) {\n\t\t\tdistances(i) = distance(points(p1_idx), points(p2_idx));\n\t\t}\n\t\telse {\n\t\t\tdistances(i) = std::numeric_limits::max();\n\t\t}\n\t});\n\n\tdouble min_distance = std::numeric_limits::max();\n\tfor (int i = 0; i < num_points * num_points; i++) {\n\t\tmin_distance = std::min(min_distance, distances(i));\n\t}\n\n\treturn min_distance;\n}\n", + "\n\n\tKok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::make_span(points.begin(), points.end()), 0.0,\n\t\t[](double d, const Point* p) {\n\t\t\treturn d + distance(*p, {0, 0});\n\t\t},\n\t\t[](double a, double b) { return a + b; });\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Initialize the minimum distance to a large value\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\t// Calculate the distance between the two points\n\t\t\tdouble dist = distance(points(i), points(j));\n\n\t\t\t// Update the minimum distance if necessary\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\t// Return the minimum distance\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all points\n\tfor (size_t i 
= 0; i < points.size(); ++i) {\n\t\t// Iterate over all points after i\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\t// Calculate the distance between the two points\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\t// Check if the distance is less than the current minimum distance\n\t\t\tif (dist < minDist) {\n\t\t\t\t// Update the minimum distance\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Return the minimum distance\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View distances(\"Distances\", points.extent(0));\n\tKokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdistances(i) = std::min(distances(i), distance(points(i), points(j)));\n\t\t}\n\t});\n\treturn Kokkos::min_reduce(Kokkos::RangePolicy(0, points.extent(0)), distances, distances(0));\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use OpenMP to compute in parallel.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j 
< points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = points.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) 
{\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_distance = std::numeric_limits::max();\n#pragma omp parallel for reduction(min:min_distance)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tmin_distance = std::min(min_distance, dist);\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "\n\n\t// Check if the vector is empty or has only one point\n\tif (points.empty() || points.size() == 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\t// Sort the points by x-coordinate\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Initialize the minimum distance to the maximum possible value\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble 
min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn -1;\n\t}\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = 
distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], 
points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tauto compareX = [](Point const& p1, Point const& p2) { return p1.x < p2.x; };\n\tauto compareY = [](Point const& p1, Point const& p2) { return p1.y < p2.y; };\n\n\tstd::sort(points.begin(), points.end(), compareX);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tauto compareX = [](Point const& p1, Point const& p2) { return p1.x < p2.x; };\n\tauto compareY = [](Point const& p1, Point const& p2) { return p1.y < p2.y; };\n\tstd::sort(points.begin(), points.end(), compareX);\n\tdouble minDist = distance(points[0], points[1]);\n\tstd::sort(points.begin(), points.end(), compareY);\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tif (points[j].y - points[i].y >= minDist) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn 0.0;\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = distance(points[0], points[1]);\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distanceBetweenPoints(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the distance between the closest two points in the vector points.\n Store the result in distance.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\n__global__ void closestPair(const Point *points, size_t numPoints, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Initialize the distance to a large value\n\t*distance = DBL_MAX;\n\n\t// Iterate over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid < numPoints) {\n\t\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < num) {\n\t\tdistance[i] = distance(points[i], points[(i + 1) % num]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < num) {\n\t\tdistance[idx] = distance(points[idx], points[(idx + 1) % num]);\n\t}\n}\n", + "\n\n\t// Initialize the distance to a large value\n\t*distance = DBL_MAX;\n\n\t// Loop over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) return;\n\tdouble minDist = distanceBetweenPoints(points[tid], points[(tid+1)%numPoints]);\n\tfor (int i = tid+1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\tatomicMin(distance, minDist);\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid < numPoints) {\n\t\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = gid; i < numPoints; i += hipBlockDim_x) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = 
distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint idx = hipGetGlobalIndex();\n\tif (idx < num) {\n\t distance[idx] = distance(points[idx], points[(idx + 1) % num]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\t// Initialize the distance to a large value\n\t*distance = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "(1.21,12.12.1222.12.12.", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = distanceBetweenPoints(points[gid], points[(gid + 1) % numPoints]);\n\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Compute the distance between each pair of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tdouble minDist = distanceBetweenPoints(points[tid], points[(tid + 1) % numPoints]);\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\t__shared__ double minDist;\n\t__shared__ Point p1, p2;\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t\tp1 = points[tid];\n\t\t\t\tp2 = points[i];\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, 
y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local copy of points.\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0.\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.374832404, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3756252852, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3949964795, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39464165825, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.391818010075, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.4007531910875, + "num_procs": 16 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.3746530088, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3753219388, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3931852028, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39213049755, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39716558355, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + 
"num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n\t// Find the closest pair of points on this rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.3748143052, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3747634574, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3960045139, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.396598926, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.393636247525, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3966249108375, + "num_procs": 16 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_distance = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); ++i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_distance) {\n\t\t\t\tmin_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_distance;\n\tMPI_Allreduce(&min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_distance;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.3734209064, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3763237788, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3940853379, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.37501163945, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.396821963775, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.395514424625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39831733899375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.406508226725, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.41461057453125, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.46356314349922, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.47683600396094, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < 
points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tmin_dist = std::min(min_dist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t}\n\n\treturn -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.3745795242, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3762327326, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3946550453, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39616640185, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39440741255, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39466648485, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3966179722625, + "num_procs": 32 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 64 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.3733864876, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3753196826, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3929383779, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39483598195, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.378839647325, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3950640796625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3945560939625, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.40247892344688, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.40882971967187, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.44361294251875, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.48036452556641, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\tif (rank == 0) 
{\n\t\treturn min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.373047881, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3754438204, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3940066922, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39496359225, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39531365825, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3995342389375, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.4047649905375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.4025725397625, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.61809578733438, + "num_procs": 128 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 256 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.374286336, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.375459034, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3953834881, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.391333864, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39153548345, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3942639046625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.40018955355, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39728346807812, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.40833900958906, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.44597507201953, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.41900167996914, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n\tint rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": 
true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.3748996386, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3764584492, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3935863177, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39790507665, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.390010541675, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3930170811, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3959005521, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.40192105776875, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.40934436203125, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.46549220196797, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.48954455655664, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.3745048138, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3748450094, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3944112789, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3929849031, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39335914545, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3990184192, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.40121617171875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.40198130523125, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.40487991114375, + "num_procs": 128 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 256 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the closest pair on this rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 
0\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.3753833706, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3761409262, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3933791951, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39747984765, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.38962778635, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.400024168125, + "num_procs": 16 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_distance) {\n\t\t\t\tlocal_min_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_distance;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.3747206712, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3755328368, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.395443079, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39319609895, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3959990412, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.3747741178, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3755548566, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": 
true, + "runtime": 0.3940815305, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3944876634, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3952766061, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3952847846625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3966216323375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39957583698125, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its subset of points\n\tstd::vector> closestPairs;\n\tfor (size_t i = rank; i < points.size(); i += size) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tclosestPairs.push_back({points[i], points[j]});\n\t\t}\n\t}\n\n\t// Each rank sends its closest pairs to rank 0\n\tstd::vector> allClosestPairs;\n\tMPI_Gather(&closestPairs, closestPairs.size(), MPI_DOUBLE, &allClosestPairs, allClosestPairs.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 finds the closest pair of points among all the closest pairs\n\tdouble minDist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (auto const& pair : allClosestPairs) {\n\t\t\tdouble dist = distance(pair.first, pair.second);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Rank 0 broadcasts the minimum distance to all other ranks\n\tMPI_Bcast(&minDist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.3743776822, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.376262406, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3951197412, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3944443021, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.394877495275, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.397308777225, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3966020434625, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.40476416474688, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.41728479484375, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.42746005873516, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.58367715045781, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.3745794852, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3750275282, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3948549203, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3942458835, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3966284787, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.394915682775, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39973215119375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.40211894087813, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.42315484648906, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.41949288799531, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.42053214002812, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.3752089462, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3756660246, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3901053547, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3910914847, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3888241985, + "num_procs": 8 + }, + { + 
"did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its subset of points.\n\tstd::vector local_points = points;\n\tstd::sort(local_points.begin(), local_points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (size_t i = 0; i < local_points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < local_points.size(); ++j) {\n\t\t\tdouble dist = distance(local_points[i], local_points[j]);\n\t\t\tif (dist < local_min_distance) {\n\t\t\t\tlocal_min_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Each rank sends its local minimum distance to rank 0.\n\tMPI_Gather(&local_min_distance, 1, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 computes the global minimum distance.\n\tdouble global_min_distance = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (int i = 0; i < size; ++i) {\n\t\t\tdouble recv_min_distance;\n\t\t\tMPI_Gather(&recv_min_distance, 1, MPI_DOUBLE, &recv_min_distance, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\t\t\tif (recv_min_distance < global_min_distance) {\n\t\t\t\tglobal_min_distance = recv_min_distance;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn global_min_distance;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the closest pair on each rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Reduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.3748665876, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3760877202, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3909585897, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.38336385485, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.378763094725, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3813809604625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.37952524278125, + 
"num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.37930937224375, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.38109482776094, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.38446110067266, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3814572277918, + "num_procs": 512 + } + ] + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distanceBetweenPoints(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the distance between the closest two points in the vector points.\n Store the result in distance.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\n__global__ void closestPair(const Point *points, size_t numPoints, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) return;\n\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint num_blocks = gridDim.x;\n\n\tdouble min_dist = DBL_MAX;\n\tfor (int i = tid; i < numPoints; i += bid_size * num_blocks) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = min_dist;\n}\n", + "\n\n\tint idx = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (idx >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDistance = DBL_MAX;\n\tfor (int i = idx + 1; i < numPoints; i++) {\n\t\tdouble distance = 
distanceBetweenPoints(points[idx], points[i]);\n\t\tif (distance < minDistance) {\n\t\t\tminDistance = distance;\n\t\t}\n\t}\n\n\tif (idx == 0) {\n\t\t*distance = minDistance;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tdouble minDist = DBL_MAX;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int j = i + 1; j < numPoints; j++) {\n\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tdouble minDist = DBL_MAX;\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\tdistance[tid] = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = 
blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int j = i + 1; j < numPoints; j++) {\n\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all ranks\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_distance = DBL_MAX;\n\t#pragma omp parallel for reduction(min: min_distance)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t for (size_t j = i + 1; j < points.size(); ++j) {\n\t min_distance = std::min(min_distance, distance(points[i], points[j]));\n\t }\n\t}\n\n\tif (rank == 0) {\n\t return min_distance;\n\t}\n\treturn 0.0;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; 
i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Gather all the local minimum distances on rank 0\n\tstd::vector allMinDists(size);\n\tMPI_Gather(&minDist, 1, MPI_DOUBLE, allMinDists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the global minimum distance\n\tif (rank == 0) {\n\t\tfor (double dist : allMinDists) {\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocal_min_dist = std::min(local_min_dist, dist);\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all ranks\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local subvector\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (int i = 0; i < points.size(); ++i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tlocalMinDist = std::min(localMinDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t} else {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, 
MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the closest pair on this rank\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN);\n\n\t// Return the global minimum distance\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma opm parallel for reduction(min:min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i+1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\treturn min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\telse {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t} else {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocalMinDist = std::min(localMinDist, dist);\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma 
omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min_dist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocal_min_dist = std::min(local_min_dist, dist);\n\t\t}\n\t}\n\n\t// Gather all local minima on rank 0\n\tstd::vector all_min_dists(size);\n\tMPI_Gather(&local_min_dist, 1, MPI_DOUBLE, all_min_dists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 finds the global minimum\n\tdouble global_min_dist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double dist : all_min_dists) {\n\t\t\tglobal_min_dist = std::min(global_min_dist, dist);\n\t\t}\n\t}\n\n\t// Return the global minimum on rank 0\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "(1.41421, 1.41421)", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tlocalMinDist = std::min(localMinDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = 
std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t}\n\treturn -1;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(Kokkos::View &points, Kokkos::View &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Sort points by x-coordinate\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return points_sorted(i)->x < points_sorted(j)->x;\n });\n\n // Find the upper and lower hulls\n auto upper_hull = hull.slice(0, hull.extent(0) / 2);\n auto lower_hull = hull.slice(hull.extent(0) / 2, hull.extent(0));\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n while (upper_hull_size >= 2 &&\n (upper_hull(upper_hull_size - 1)->y - upper_hull(upper_hull_size - 2)->y) *\n (points_sorted(i)->x - upper_hull(upper_hull_size - 1)->x) <\n (upper_hull(upper_hull_size - 1)->x - upper_hull(upper_hull_size - 2)->x) *\n (points_sorted(i)->y - upper_hull(upper_hull_size - 1)->y)) {\n upper_hull_size--;\n }\n upper_hull(upper_hull_size++) = points_sorted(i);\n }\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = points_sorted.extent(0) - 1; i >= 0; i--) {\n while (lower_hull_size >= 2 &&\n (lower_hull(lower_hull_size - 1)->y - lower_hull(lower_hull_size - 2)->y) *\n (points_sorted(i)->x - lower_hull(lower_hull_size - 1)->x) <\n (lower_hull(lower_hull_size - 1)->x - lower_hull(lower_hull_size - 2)->x) *\n (points_sorted(i)->y - lower_hull(lower_hull_size - 1)->y)) {\n lower_hull_size--;\n }\n lower_hull(lower_hull_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_hull_size + lower_hull_size - 1;\n for (int i = 0; i < lower_hull_size - 1; i++) {\n hull(hull_size--) = lower_hull(i);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int &i) {\n points_sorted(i) = points(i);\n });\n\n // Find the upper and lower hulls\n std::vector upper_hull;\n std::vector lower_hull;\n\n for (int i = 0; i < points_sorted.extent(0); i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upper_hull.size() >= 2 &&\n (upper_hull[upper_hull.size() - 2]->y - upper_hull[upper_hull.size() - 1]->y) *\n (points_sorted(i)->x - upper_hull[upper_hull.size() - 1]->x) <=\n (upper_hull[upper_hull.size() - 1]->y - points_sorted(i)->y) *\n (upper_hull[upper_hull.size() - 2]->x - 
upper_hull[upper_hull.size() - 1]->x)) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lower_hull.size() >= 2 &&\n (lower_hull[lower_hull.size() - 2]->y - lower_hull[lower_hull.size() - 1]->y) *\n (points_sorted(i)->x - lower_hull[lower_hull.size() - 1]->x) >=\n (lower_hull[lower_hull.size() - 1]->y - points_sorted(i)->y) *\n (lower_hull[lower_hull.size() - 2]->x - lower_hull[lower_hull.size() - 1]->x)) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 2);\n int j = 0;\n for (int i = 0; i < upper_hull.size() - 1; i++) {\n hull(j++) = upper_hull[i];\n }\n for (int i = lower_hull.size() - 2; i >= 0; i--) {\n hull(j++) = lower_hull[i];\n }\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_dimensions = points.extent(1);\n\n // Find the point with the minimum y-coordinate.\n int min_y_index = 0;\n for (int i = 1; i < num_points; ++i) {\n if (points(i, 1) < points(min_y_index, 1)) {\n min_y_index = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points(0, 0);\n points(0, 0) = points(min_y_index, 0);\n points(min_y_index, 0) = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double angle_a = atan2(a.y - points(0, 1), a.x - points(0, 0));\n double angle_b = atan2(b.y - points(0, 1), b.x - points(0, 0));\n return angle_a < angle_b;\n };\n parallel_sort(Kokkos::RangePolicy(1, num_points), points, compare_polar_angle);\n\n // Find the convex hull using Graham's scan.\n int hull_size = 0;\n for (int i = 0; i < num_points; ++i) {\n while (hull_size >= 2 && (points(hull_size - 1, 0) - points(hull_size - 2, 0)) * (points(i, 1) - points(hull_size - 2, 1)) - (points(hull_size - 1, 1) - points(hull_size - 2, 1)) * (points(i, 0) - points(hull_size - 2, 0)) < 0) {\n hull_size--;\n }\n hull(hull_size, 0) = points(i, 0);\n hull_size++;\n }\n\n // Resize the hull to its final size.\n hull.resize(hull_size, num_dimensions);\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the upper and lower hulls\n Kokkos::View upperHull(\"Upper Hull\", points.extent(0));\n Kokkos::View lowerHull(\"Lower Hull\", points.extent(0));\n int upperHullSize = 0;\n int lowerHullSize = 0;\n for (int i = 0; i < points.extent(0); i++) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHullSize >= 2 &&\n (upperHull(upperHullSize - 1)->y - upperHull(upperHullSize - 2)->y) *\n (points(i)->x - upperHull(upperHullSize - 2)->x) <\n (upperHull(upperHullSize - 1)->x - upperHull(upperHullSize - 2)->x) *\n (points(i)->y - upperHull(upperHullSize - 2)->y)) {\n upperHullSize--;\n }\n upperHull(upperHullSize++) = points(i);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while (lowerHullSize >= 2 &&\n (lowerHull(lowerHullSize - 1)->y - lowerHull(lowerHullSize - 2)->y) *\n (points(i)->x - lowerHull(lowerHullSize - 2)->x) >\n (lowerHull(lowerHullSize - 1)->x - lowerHull(lowerHullSize - 2)->x) *\n (points(i)->y - lowerHull(lowerHullSize - 2)->y)) {\n lowerHullSize--;\n 
}\n lowerHull(lowerHullSize++) = points(i);\n }\n\n // Merge the upper and lower hulls into the convex hull\n int hullSize = upperHullSize + lowerHullSize - 2;\n for (int i = 0; i < upperHullSize; i++) {\n hull(i) = upperHull(i);\n }\n for (int i = upperHullSize; i < hullSize; i++) {\n hull(i) = lowerHull(i - upperHullSize + 1);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the upper and lower hulls\n Kokkos::View upperHull(\"upperHull\", points.extent(0));\n Kokkos::View lowerHull(\"lowerHull\", points.extent(0));\n\n int upperHullSize = 0;\n int lowerHullSize = 0;\n\n for (int i = 0; i < points.extent(0); i++) {\n // Add the point to the upper hull if it is to the right of the current rightmost point\n while (upperHullSize >= 2 &&\n (upperHull(upperHullSize - 1)->y - upperHull(upperHullSize - 2)->y) *\n (points(i)->x - upperHull(upperHullSize - 2)->x) <=\n (upperHull(upperHullSize - 1)->x - upperHull(upperHullSize - 2)->x) *\n (points(i)->y - upperHull(upperHullSize - 2)->y)) {\n upperHullSize--;\n }\n upperHull(upperHullSize++) = points(i);\n\n // Add the point to the lower hull if it is to the left of the current leftmost point\n while (lowerHullSize >= 2 &&\n (lowerHull(lowerHullSize - 1)->y - lowerHull(lowerHullSize - 2)->y) *\n (points(i)->x - lowerHull(lowerHullSize - 2)->x) >=\n (lowerHull(lowerHullSize - 1)->x - lowerHull(lowerHullSize - 2)->x) *\n (points(i)->y - lowerHull(lowerHullSize - 2)->y)) {\n lowerHullSize--;\n }\n lowerHull(lowerHullSize++) = points(i);\n }\n\n // Merge the upper and lower hulls\n int hullSize = upperHullSize + lowerHullSize - 2;\n for (int i = 0; i < upperHullSize; i++) {\n hull(i) = upperHull(i);\n }\n for (int i = upperHullSize; i < hullSize; i++) {\n hull(i) = lowerHull(i - upperHullSize + 1);\n }\n\n // Return the hull\n hull.resize(hullSize);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points_sorted.extent(0)), points_sorted, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n while (upper_hull_size >= 2 && cross(upper_hull[upper_hull_size - 2], upper_hull[upper_hull_size - 1], points_sorted[i]) <= 0) {\n upper_hull_size--;\n }\n upper_hull[upper_hull_size++] = points_sorted[i];\n }\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = points_sorted.extent(0) - 1; i >= 0; i--) {\n while (lower_hull_size >= 2 && cross(lower_hull[lower_hull_size - 2], lower_hull[lower_hull_size - 1], points_sorted[i]) <= 0) {\n lower_hull_size--;\n }\n lower_hull[lower_hull_size++] = points_sorted[i];\n }\n\n // Merge the upper and lower hulls\n int hull_size = 0;\n for (int i = 0; i < upper_hull_size; i++) {\n hull[hull_size++] = upper_hull[i];\n }\n for (int i = lower_hull_size - 2; i >= 0; i--) {\n hull[hull_size++] = lower_hull[i];\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_size);\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_hull_points = 4;\n\n // Find the leftmost point\n int leftmost_index = 0;\n for (int i = 1; i < num_points; ++i) {\n if (points(i).x < points(leftmost_index).x) 
{\n leftmost_index = i;\n }\n }\n\n // Sort the points by their polar angle with respect to the leftmost point\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double dx1 = a.x - points(leftmost_index).x;\n double dy1 = a.y - points(leftmost_index).y;\n double dx2 = b.x - points(leftmost_index).x;\n double dy2 = b.y - points(leftmost_index).y;\n return (atan2(dy1, dx1) < atan2(dy2, dx2));\n };\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, num_points), points_sorted, compare_polar_angle);\n\n // Find the convex hull using Graham's scan\n int hull_index = 0;\n for (int i = 0; i < num_points; ++i) {\n while (hull_index >= 2 && (points_sorted(i).x - hull(hull_index - 1).x) * (hull(hull_index - 2).y - hull(hull_index - 1).y) - (points_sorted(i).y - hull(hull_index - 1).y) * (hull(hull_index - 2).x - hull(hull_index - 1).x) < 0) {\n hull_index--;\n }\n hull(hull_index++) = points_sorted(i);\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_index);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, points_sorted.extent(0)),\n KOKKOS_LAMBDA(const int &i, const auto &upper_hull) {\n auto &hull = upper_hull.access();\n if (i == 0) {\n hull[0] = points_sorted(i);\n return hull;\n }\n while (hull.extent(0) >= 2 &&\n (hull(hull.extent(0) - 2).y - hull(hull.extent(0) - 1).y) *\n (points_sorted(i).x - hull(hull.extent(0) - 1).x) <\n (hull(hull.extent(0) - 1).y - points_sorted(i).y) *\n (hull(hull.extent(0) - 2).x - hull(hull.extent(0) - 1).x)) {\n hull.resize(hull.extent(0) - 1);\n }\n hull.resize(hull.extent(0) + 1);\n hull(hull.extent(0) - 1) = points_sorted(i);\n return hull;\n },\n Kokkos::View(nullptr, 0));\n\n auto lower_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1),\n KOKKOS_LAMBDA(const int &i, const auto &lower_hull) {\n auto &hull = lower_hull.access();\n if (i == points_sorted.extent(0) - 1) {\n hull[0] = points_sorted(i);\n return hull;\n }\n while (hull.extent(0) >= 2 &&\n (hull(hull.extent(0) - 2).y - hull(hull.extent(0) - 1).y) *\n (points_sorted(i).x - hull(hull.extent(0) - 1).x) <\n (hull(hull.extent(0) - 1).y - points_sorted(i).y) *\n (hull(hull.extent(0) - 2).x - hull(hull.extent(0) - 1).x)) {\n hull.resize(hull.extent(0) - 1);\n }\n hull.resize(hull.extent(0) + 1);\n hull(hull.extent(0) - 1) = points_sorted(i);\n return hull;\n },\n Kokkos::View(nullptr, 0));\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.extent(0) + lower_hull.extent(0) - 2);\n Kokkos::parallel_for(\n Kokkos::RangePolicy(0, upper_hull.extent(0)),\n KOKKOS_LAMBDA(const int &i) { hull(i) = upper_hull(i); });\n Kokkos::parallel_for(\n Kokkos::RangePolicy(0, lower_hull.extent(0) - 1),\n KOKKOS_LAMBDA(const int &i) { hull(upper_hull.extent(0) + i) = lower_hull(i); });\n}\n", + "\n\n using Kok\u30b3\u30af::parallel_for;\n using Kok\u30b3\u30af::sort;\n\n // Sort the points by their x-coordinates.\n sort(Kok\u30b3\u30af::begin(points), Kok\u30b3\u30af::end(points), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper hull.\n int upperHullSize = 0;\n for (int i = 0; i < points.size(); ++i) {\n while (upperHullSize >= 2 && Kok\u30b3\u30af::cross(hull[upperHullSize - 2], hull[upperHullSize - 1], points[i]) <= 0) {\n --upperHullSize;\n }\n 
hull[upperHullSize++] = points[i];\n }\n\n // Find the lower hull.\n int lowerHullSize = 0;\n for (int i = points.size() - 1; i >= 0; --i) {\n while (lowerHullSize >= 2 && Kok\u30b3\u30af::cross(hull[lowerHullSize - 2], hull[lowerHullSize - 1], points[i]) <= 0) {\n --lowerHullSize;\n }\n hull[lowerHullSize++] = points[i];\n }\n\n // Remove the duplicate points from the lower hull.\n while (lowerHullSize > 0 && hull[lowerHullSize - 1] == hull[lowerHullSize - 2]) {\n --lowerHullSize;\n }\n\n // Combine the upper and lower hulls.\n hull.resize(upperHullSize + lowerHullSize - 1);\n std::copy(hull.begin(), hull.begin() + upperHullSize, hull.begin());\n std::copy(hull.begin() + upperHullSize, hull.begin() + upperHullSize + lowerHullSize - 1, hull.begin() + upperHullSize);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points_sorted.extent(0)), KOKKOS_LAMBDA(const int &i, auto &upper_hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n upper_hull.push_back(points_sorted(i));\n } else {\n while (upper_hull.size() >= 2 && cross(upper_hull[upper_hull.size() - 2], upper_hull[upper_hull.size() - 1], points_sorted(i)) <= 0) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n }\n });\n\n auto lower_hull = Kokkos::parallel_reduce(Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1), KOKKOS_LAMBDA(const int &i, auto &lower_hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n lower_hull.push_back(points_sorted(i));\n } else {\n while (lower_hull.size() >= 2 && cross(lower_hull[lower_hull.size() - 2], lower_hull[lower_hull.size() - 1], points_sorted(i)) <= 0) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n });\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 2);\n std::copy(upper_hull.begin(), upper_hull.end() - 1, hull.begin());\n std::copy(lower_hull.begin() + 1, lower_hull.end(), hull.begin() + upper_hull.size() - 1);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points_sorted(i)->x < points_sorted(j)->x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n int upper_hull_size = 0;\n int lower_hull_size = 0;\n\n // Add the first point to the upper hull\n upper_hull(upper_hull_size++) = points_sorted(0);\n\n // Add the last point to the lower hull\n lower_hull(lower_hull_size++) = points_sorted(points.extent(0) - 1);\n\n // Iterate over the remaining points\n for (int i = 1; i < points.extent(0) - 1; i++) {\n // Add the point to the upper hull\n while (upper_hull_size >= 2 &&\n (points_sorted(i)->y - upper_hull(upper_hull_size - 2)->y) *\n (upper_hull(upper_hull_size - 1)->x - upper_hull(upper_hull_size - 2)->x) <=\n (upper_hull(upper_hull_size - 1)->y - upper_hull(upper_hull_size - 2)->y) *\n (points_sorted(i)->x - upper_hull(upper_hull_size - 2)->x)) {\n upper_hull_size--;\n }\n upper_hull(upper_hull_size++) = points_sorted(i);\n\n // Add the point to the lower hull\n while (lower_hull_size >= 2 &&\n (points_sorted(i)->y - lower_hull(lower_hull_size - 2)->y) *\n (lower_hull(lower_hull_size - 1)->x - lower_hull(lower_hull_size 
- 2)->x) >=\n (lower_hull(lower_hull_size - 1)->y - lower_hull(lower_hull_size - 2)->y) *\n (points_sorted(i)->x - lower_hull(lower_hull_size - 2)->x)) {\n lower_hull_size--;\n }\n lower_hull(lower_hull_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_hull_size + lower_hull_size - 2;\n for (int i = 0; i < upper_hull_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_hull_size; i < hull_size; i++) {\n hull(i) = lower_hull(i - upper_hull_size + 1);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinate\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int &i) {\n points_sorted(i) = points(i);\n });\n\n // Find the upper and lower hulls\n std::vector upper_hull, lower_hull;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upper_hull.size() >= 2 && cross(upper_hull[upper_hull.size() - 2], upper_hull[upper_hull.size() - 1], points_sorted(i)) <= 0) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lower_hull.size() >= 2 && cross(lower_hull[lower_hull.size() - 2], lower_hull[lower_hull.size() - 1], points_sorted(i)) >= 0) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n\n // Merge the upper and lower hulls\n for (int i = lower_hull.size() - 2; i >= 0; i--) {\n upper_hull.push_back(lower_hull[i]);\n }\n\n // Copy the convex hull to the output view\n Kokkos::parallel_for(Kokkos::RangePolicy(0, upper_hull.size()), KOKKOS_LAMBDA(const int &i) {\n hull(i) = upper_hull[i];\n });\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, points_sorted.extent(0)),\n KOKKOS_LAMBDA(const int &i, auto &hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n hull.push_back(points_sorted(i));\n } else {\n while (hull.size() >= 2 &&\n (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points_sorted(i).x - hull[hull.size() - 1].x) <\n (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points_sorted(i).y - hull[hull.size() - 1].y)) {\n hull.pop_back();\n }\n hull.push_back(points_sorted(i));\n }\n },\n hull);\n\n auto lower_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1),\n KOKKOS_LAMBDA(const int &i, auto &hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n hull.push_back(points_sorted(i));\n } else {\n while (hull.size() >= 2 &&\n (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points_sorted(i).x - hull[hull.size() - 1].x) <\n (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points_sorted(i).y - hull[hull.size() - 1].y)) {\n hull.pop_back();\n }\n hull.push_back(points_sorted(i));\n }\n },\n hull);\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 1);\n std::copy(upper_hull.begin(), upper_hull.end(), hull.begin());\n std::copy(lower_hull.begin() + 1, lower_hull.end(), hull.begin() + upper_hull.size());\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_dimensions = points.extent(1);\n const int num_hull_points = 0;\n\n // Find the point 
with the minimum y-coordinate.\n int min_y_index = 0;\n for (int i = 1; i < num_points; i++) {\n if (points(i, 1) < points(min_y_index, 1)) {\n min_y_index = i;\n }\n }\n\n // Swap the point with the minimum y-coordinate to the first position.\n Point temp = points(0, 0);\n points(0, 0) = points(min_y_index, 0);\n points(min_y_index, 0) = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double angle_a = atan2(a.y - points(0, 1), a.x - points(0, 0));\n double angle_b = atan2(b.y - points(0, 1), b.x - points(0, 0));\n return angle_a < angle_b;\n };\n Kokkos::sort(Kokkos::RangePolicy(1, num_points), points, compare_polar_angle);\n\n // Find the convex hull using Graham's scan.\n int hull_size = 0;\n for (int i = 0; i < num_points; i++) {\n while (hull_size >= 2 && (points(hull[hull_size - 2], 0) - points(hull[hull_size - 1], 0)) * (points(i, 1) - points(hull[hull_size - 1], 1)) - (points(i, 0) - points(hull[hull_size - 1], 0)) * (points(hull[hull_size - 2], 1) - points(hull[hull_size - 1], 1)) < 0) {\n hull_size--;\n }\n hull[hull_size++] = i;\n }\n\n // Resize the hull to the correct size.\n hull = Kokkos::View(hull, hull_size);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)), points_sorted, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n int upper_size = 0;\n int lower_size = 0;\n\n // Add the first point to the upper hull\n upper_hull(upper_size++) = points_sorted(0);\n\n // Add the last point to the lower hull\n lower_hull(lower_size++) = points_sorted(points_sorted.extent(0) - 1);\n\n // Iterate over the remaining points\n for (int i = 1; i < points_sorted.extent(0) - 1; i++) {\n // Add the point to the upper hull\n while (upper_size >= 2 && (upper_hull(upper_size - 2).y - upper_hull(upper_size - 1).y) * (points_sorted(i).x - upper_hull(upper_size - 1).x) < (upper_hull(upper_size - 1).y - points_sorted(i).y) * (upper_hull(upper_size - 2).x - upper_hull(upper_size - 1).x)) {\n upper_size--;\n }\n upper_hull(upper_size++) = points_sorted(i);\n\n // Add the point to the lower hull\n while (lower_size >= 2 && (lower_hull(lower_size - 2).y - lower_hull(lower_size - 1).y) * (points_sorted(i).x - lower_hull(lower_size - 1).x) < (lower_hull(lower_size - 1).y - points_sorted(i).y) * (lower_hull(lower_size - 2).x - lower_hull(lower_size - 1).x)) {\n lower_size--;\n }\n lower_hull(lower_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_size + lower_size - 2;\n for (int i = 0; i < upper_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_size; i < hull_size; i++) {\n hull(i) = lower_hull(lower_size - (i - upper_size + 1));\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_size);\n}\n", + "\n\n // Find the leftmost point\n auto leftmost = points.reduce(\n KOKKOS_LAMBDA(const Point &a, const Point &b) { return a.x < b.x ? 
a : b; },\n points.extent(0));\n\n // Sort the points by the angle they make with the leftmost point\n Kokkos::View sorted_points(\"sorted_points\", points.extent(0));\n Kokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i) {\n sorted_points[i] = &points[i];\n });\n\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n sorted_points,\n KOKKOS_LAMBDA(const Point *&a, const Point *&b) {\n return std::atan2(a->y - leftmost->y, a->x - leftmost->x) <\n std::atan2(b->y - leftmost->y, b->x - leftmost->x);\n });\n\n // Find the convex hull using Graham scan\n Kokkos::View stack(\"stack\", points.extent(0));\n int top = 0;\n for (int i = 0; i < points.extent(0); i++) {\n Point *p = sorted_points[i];\n while (top >= 2 &&\n (stack[top - 1]->y - stack[top - 2]->y) * (p->x - stack[top - 2]->x) <\n (stack[top - 1]->x - stack[top - 2]->x) * (p->y - stack[top - 2]->y))\n top--;\n stack[top++] = p;\n }\n\n // Copy the convex hull to the output view\n Kokkos::parallel_for(Kokkos::RangePolicy(0, top),\n KOKKOS_LAMBDA(const int &i) { hull[i] = stack[i]; });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int N = points.extent(0);\n const int HULL_SIZE = 2 * N;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n parallel_sort(Kokkos::Cuda(), points_sorted, [=](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the lower and upper hulls\n auto lower_hull = hull.slice(0, N);\n auto upper_hull = hull.slice(N, HULL_SIZE);\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = 0; i < N; i++) {\n while (lower_hull_size >= 2 && cross(lower_hull[lower_hull_size - 2], lower_hull[lower_hull_size - 1], points_sorted[i]) <= 0) {\n lower_hull_size--;\n }\n lower_hull[lower_hull_size++] = points_sorted[i];\n }\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = N - 1; i >= 0; i--) {\n while (upper_hull_size >= 2 && cross(upper_hull[upper_hull_size - 2], upper_hull[upper_hull_size - 1], points_sorted[i]) <= 0) {\n upper_hull_size--;\n }\n upper_hull[upper_hull_size++] = points_sorted[i];\n }\n\n // Merge the lower and upper hulls\n int hull_size = lower_hull_size + upper_hull_size - 1;\n for (int i = 0; i < hull_size; i++) {\n hull[i] = (i < lower_hull_size) ? 
lower_hull[i] : upper_hull[i - lower_hull_size];\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort points by x-coordinate\n auto points_view = Kokkos::subview(points, Kokkos::ALL(), Kokkos::ALL(), Kokkos::Rank<2>{0, 1});\n auto sorted_points = points_view;\n Kokkos::sort(Kokkos::RangePolicy(0, points_view.extent(0)),\n KOKKOS_LAMBDA(const int i, const int j) {\n return sorted_points(i, 0).x < sorted_points(j, 0).x;\n });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::View(Kokkos::ALL(), \"Upper Hull\");\n auto lower_hull = Kokkos::View(Kokkos::ALL(), \"Lower Hull\");\n int upper_size = 0, lower_size = 0;\n\n // Find the upper hull\n for (int i = 0; i < points_view.extent(0); i++) {\n while (upper_size >= 2 &&\n (upper_hull(upper_size - 1).y - upper_hull(upper_size - 2).y) *\n (sorted_points(i, 0).x - upper_hull(upper_size - 2).x) <\n (upper_hull(upper_size - 1).x - upper_hull(upper_size - 2).x) *\n (sorted_points(i, 0).y - upper_hull(upper_size - 2).y)) {\n upper_size--;\n }\n upper_hull(upper_size++) = sorted_points(i, 0);\n }\n\n // Find the lower hull\n for (int i = points_view.extent(0) - 1; i >= 0; i--) {\n while (lower_size >= 2 &&\n (lower_hull(lower_size - 1).y - lower_hull(lower_size - 2).y) *\n (sorted_points(i, 0).x - lower_hull(lower_size - 2).x) <\n (lower_hull(lower_size - 1).x - lower_hull(lower_size - 2).x) *\n (sorted_points(i, 0).y - lower_hull(lower_size - 2).y)) {\n lower_size--;\n }\n lower_hull(lower_size++) = sorted_points(i, 0);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_size + lower_size - 2;\n hull = Kokkos::View(hull_size, \"Convex Hull\");\n for (int i = 0; i < upper_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_size; i < hull_size; i++) {\n hull(i) = lower_hull(i - upper_size + lower_size - 1);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n const int N = points.extent(0);\n const int M = hull.extent(0);\n\n auto points_d = create_mirror_view(points);\n auto hull_d = create_mirror_view(hull);\n\n parallel_for(N, KOKKOS_LAMBDA(const int i) {\n points_d(i) = points(i);\n });\n\n parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &min_index) {\n if (points_d(i).y < points_d(min_index).y || (points_d(i).y == points_d(min_index).y && points_d(i).x < points_d(min_index).x)) {\n min_index = i;\n }\n }, 0);\n\n int p = 0;\n hull_d(p) = points_d(min_index);\n\n int q;\n do {\n q = (p + 1) % N;\n for (int i = 0; i < N; i++) {\n if (orientation(points_d(p), points_d(i), points_d(q)) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != min_index);\n\n int count = 0;\n do {\n hull_d(count) = points_d(p);\n count++;\n q = (p + 1) % N;\n for (int i = 0; i < N; i++) {\n if (orientation(points_d(p), points_d(i), points_d(q)) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != min_index);\n\n parallel_for(count, KOKKOS_LAMBDA(const int i) {\n hull(i) = hull_d(i);\n });\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the leftmost and rightmost points\n const Point *leftmost = points(0);\n const Point *rightmost = points(points.extent(0) - 1);\n\n // Create a stack to store the points on the hull\n std::stack stack;\n\n // Add the leftmost and rightmost points to the stack\n stack.push(leftmost);\n stack.push(rightmost);\n\n // Iterate over the remaining points\n for (int i = 1; i < points.extent(0) - 1; i++) {\n const Point *p = points(i);\n\n // 
Check if the point is to the left of the line defined by the last two points on the stack\n if (orientation(stack.top(), stack.top()->next, p) == LEFT) {\n // If it is, pop the last point from the stack\n stack.pop();\n // And add the new point to the stack\n stack.push(p);\n }\n }\n\n // Copy the points from the stack to the hull\n int hull_size = stack.size();\n for (int i = 0; i < hull_size; i++) {\n hull(i) = stack.top();\n stack.pop();\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Find the point with the smallest y-coordinate.\n int min_index = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].y < points[min_index].y) {\n min_index = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n std::swap(points[0], points[min_index]);\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points.begin() + 1, points.end(), [&](const Point& a, const Point& b) {\n return (atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x));\n });\n\n // Initialize the convex hull with the first three points.\n hull.push_back(points[0]);\n hull.push_back(points[1]);\n hull.push_back(points[2]);\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < points.size(); i++) {\n while (hull.size() >= 2 && (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y) - (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) >= 0) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Find the first point in the upper hull that is to the right of the current point\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Find the first point in the lower hull that is to the left of the current point\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), 
upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n Point leftmost = points[0];\n for (const Point& p : points) {\n if (p.x < leftmost.x) {\n leftmost = p;\n }\n }\n\n // Sort the points by their angle with the leftmost point\n std::sort(points.begin(), points.end(), [&leftmost](const Point& a, const Point& b) {\n return std::atan2(a.y - leftmost.y, a.x - leftmost.x) < std::atan2(b.y - leftmost.y, b.x - leftmost.x);\n });\n\n // Find the convex hull using Graham's scan\n std::stack convex_hull;\n convex_hull.push(leftmost);\n convex_hull.push(points[1]);\n for (size_t i = 2; i < points.size(); i++) {\n Point top = convex_hull.top();\n while (convex_hull.size() >= 2 && (top.x - convex_hull.at(convex_hull.size() - 2).x) * (points[i].y - top.y) - (top.y - convex_hull.at(convex_hull.size() - 2).y) * (points[i].x - top.x) < 0) {\n convex_hull.pop();\n top = convex_hull.top();\n }\n convex_hull.push(points[i]);\n }\n\n // Copy the convex hull to the output vector\n while (!convex_hull.empty()) {\n hull.push_back(convex_hull.top());\n convex_hull.pop();\n }\n}\n", + ".", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the leftmost and rightmost points\n Point leftmost = points[0];\n Point rightmost = points[points.size() - 1];\n\n // Find the points that are on the upper and lower hulls\n std::vector upperHull;\n std::vector lowerHull;\n\n #pragma omp parallel sections\n {\n #pragma omp section\n {\n upperHull.push_back(leftmost);\n for (int i = 1; i < points.size(); i++) {\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n }\n }\n\n #pragma omp section\n {\n lowerHull.push_back(rightmost);\n for (int i = points.size() - 2; i >= 0; i--) {\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) <= 0) {\n 
lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n }\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size());\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.begin(), lowerHull.end());\n\n // Remove the duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the lower hull\n for (const Point& point : points) {\n while (hull.size() >= 2 && (hull.back() - hull[hull.size() - 2]).cross(point - hull.back()) <= 0) {\n hull.pop_back();\n }\n hull.push_back(point);\n }\n\n // Find the upper hull\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() >= 2 && (hull.back() - hull[hull.size() - 2]).cross(points[i] - hull.back()) <= 0) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n int n = points.size();\n if (n < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n int l = 0, r = 0;\n hull.push_back(points[l]);\n for (int i = 1; i < n; i++) {\n while (l >= 1 && (points[i].y - hull[l - 1].y) * (hull[l].x - hull[l - 1].x) <= (points[i].x - hull[l - 1].x) * (hull[l].y - hull[l - 1].y))\n l--;\n while (r >= 1 && (points[i].y - hull[r - 1].y) * (hull[r].x - hull[r - 1].x) <= (points[i].x - hull[r - 1].x) * (hull[r].y - hull[r - 1].y))\n r--;\n hull.push_back(points[i]);\n l++;\n r++;\n }\n\n hull.push_back(points[n - 1]);\n for (int i = n - 2; i >= 0; i--) {\n while (l >= 1 && (points[i].y - hull[l - 1].y) * (hull[l].x - hull[l - 1].x) <= (points[i].x - hull[l - 1].x) * (hull[l].y - hull[l - 1].y))\n l--;\n while (r >= 1 && (points[i].y - hull[r - 1].y) * (hull[r].x - hull[r - 1].x) <= (points[i].x - hull[r - 1].x) * (hull[r].y - hull[r - 1].y))\n r--;\n hull.push_back(points[i]);\n l++;\n r++;\n }\n\n hull.pop_back();\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n hull.pop_back();\n}\n", + "\n\n // Find the leftmost 
point\n int l = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[l].x) {\n l = i;\n }\n }\n\n int p = l, q;\n do {\n hull.push_back(points[p]);\n q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != l);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) *\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) *\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Combine the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * 
(point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n int l = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[l].x) {\n l = i;\n }\n }\n\n int p = l;\n int q;\n do {\n hull.push_back(points[p]);\n q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == -1) {\n q = i;\n }\n }\n p = q;\n } while (p != l);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n int leftMost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftMost].x) {\n leftMost = i;\n }\n }\n\n // Start from leftmost point, keep moving counterclockwise until we reach the start point again\n int p = leftMost;\n do {\n // Add current point to result\n hull.push_back(points[p]);\n\n // Search for a point 'q' such that the vector 'p-q' has the smallest positive orientation.\n int q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == 2) {\n q = i;\n }\n }\n\n // If 'q' is not 'p', then update 'p' and continue\n p = q;\n\n } while (p != leftMost);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (point.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - point.x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n 
while (lowerHull.size() >= 2 &&\n (point.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - point.x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n Point p = points[leftmost];\n hull.push_back(p);\n points.erase(points.begin() + leftmost);\n\n while (points.size() > 0) {\n int next = -1;\n for (int i = 0; i < points.size(); i++) {\n if (next == -1 || (orientation(p, points[i], points[next]) == 2)) {\n next = i;\n }\n }\n p = points[next];\n hull.push_back(p);\n points.erase(points.begin() + next);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) *\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) *\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower 
hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "serial", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Find the leftmost point\n int leftMost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftMost].x) {\n leftMost = i;\n }\n }\n\n // Start from the leftmost point and keep moving counter-clockwise until we reach the leftmost point again\n Point p = points[leftMost];\n do {\n hull.push_back(p);\n int next = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i] != p && orientation(p, points[i], points[next]) == 2) {\n next = i;\n }\n }\n p = points[next];\n } while (p != points[leftMost]);\n}\n", + "\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n int n = points.size();\n std::vector up(n), down(n);\n int m = 0, k = 0;\n\n for (int i = 0; i < n; i++) {\n if ((i == 0) || ((i == 1) && (points[i].x == points[i - 1].x) && (points[i].y == points[i - 1].y)))\n continue;\n down[m++] = points[i];\n }\n\n for (int i = n - 1; i >= 0; i--) {\n if ((i == n - 1) || ((i == n - 2) && (points[i].x == points[i + 1].x) && (points[i].y == points[i + 1].y)))\n continue;\n up[k++] = points[i];\n }\n\n hull.clear();\n int t[m + k + 1];\n int q = 0;\n for (int i = 0; i < m; i++)\n t[q++] = i;\n for (int i = k - 1; i >= 0; i--)\n t[q++] = i + m;\n t[q] = t[0];\n t[q + 1] = t[1];\n\n for (int i = 0; i <= q; i++) {\n int a = t[i], b = t[i + 1];\n if (a == b)\n continue;\n int c = t[i + 2];\n if (a != c) {\n hull.push_back(down[a]);\n continue;\n }\n int x1 = down[a].x, y1 = down[a].y;\n int x2 = down[b].x, y2 = down[b].y;\n int x3 = down[c].x, y3 = down[c].y;\n int dx1 = x2 - x1, dy1 = y2 - y1;\n int dx2 = x3 - x2, dy2 = y3 - y2;\n int cross = dx1 * dy2 - dx2 * dy1;\n if (cross < 0)\n hull.push_back(down[a]);\n else if (cross > 0)\n hull.push_back(down[b]);\n else {\n hull.push_back(down[a]);\n hull.push_back(down[b]);\n }\n }\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n int n = points.size();\n std::vector upper(n);\n std::vector lower(n);\n\n int m = 0;\n for (int i = 0; i < n; i++) {\n while (m >= 2 && (upper[m - 1].y - upper[m - 2].y) * (points[i].x - upper[m - 2].x) <=\n (upper[m - 1].x - upper[m - 2].x) * (points[i].y - upper[m - 2].y)) {\n m--;\n }\n upper[m++] = points[i];\n }\n\n m = 0;\n for (int i = n - 1; i >= 0; i--) {\n while (m >= 2 && (lower[m - 1].y - lower[m - 2].y) * (points[i].x - lower[m - 2].x) <=\n (lower[m - 1].x - lower[m - 2].x) * (points[i].y - lower[m - 2].y)) {\n m--;\n }\n lower[m++] = points[i];\n }\n\n for (int i = 0; i < m; i++) {\n hull.push_back(upper[i]);\n }\n for (int i = m 
- 2; i >= 0; i--) {\n hull.push_back(lower[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x;\n });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Find the first point in the upper hull that is to the right of the current point\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x) *\n (point.y - upperHull[upperHull.size() - 1].y) -\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Find the first point in the lower hull that is to the left of the current point\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x) *\n (point.y - lowerHull[lowerHull.size() - 1].y) -\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n hull.pop_back();\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n hull.push_back(points[0]);\n hull.push_back(points[1]);\n\n for (size_t i = 2; i < points.size(); i++) {\n while (hull.size() >= 2 &&\n (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <\n (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n int t = hull.size();\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() > t &&\n (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <\n (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n hull.pop_back();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n 
// Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n std::vector upper;\n for (auto const& point : points) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper.back(), point) <= 0) {\n upper.pop_back();\n }\n upper.push_back(point);\n }\n upper.pop_back();\n\n std::vector lower;\n for (auto const& point : points) {\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower.back(), point) >= 0) {\n lower.pop_back();\n }\n lower.push_back(point);\n }\n lower.pop_back();\n\n hull.insert(hull.end(), upper.begin(), upper.end());\n hull.insert(hull.end(), lower.rbegin(), lower.rend());\n}\n", + "\n\n if (points.size() <= 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n std::vector upper;\n for (const Point& p : points) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], p) <= 0)\n upper.pop_back();\n upper.push_back(p);\n }\n upper.pop_back();\n\n std::vector lower;\n for (int i = points.size() - 1; i >= 0; i--) {\n const Point& p = points[i];\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], p) <= 0)\n lower.pop_back();\n lower.push_back(p);\n }\n lower.pop_back();\n\n hull.insert(hull.end(), upper.begin(), upper.end());\n hull.insert(hull.end(), lower.rbegin(), lower.rend());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) == -1) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) == 1) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) 
{ return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Combine the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n int n = points.size();\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n std::vector upper;\n for (int i = 0; i < n; i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], points[i]) <= 0)\n upper.pop_back();\n upper.push_back(points[i]);\n }\n\n std::vector lower;\n for (int i = n - 1; i >= 0; i--) {\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], points[i]) <= 0)\n lower.pop_back();\n lower.push_back(points[i]);\n }\n\n for (int i = 1; i < upper.size() - 1; i++)\n hull.push_back(upper[i]);\n for (int i = lower.size() - 2; i >= 1; i--)\n hull.push_back(lower[i]);\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the 
points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull.back(), point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull.back(), point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper hull\n for (const Point& point : points) {\n while (hull.size() >= 2 && (hull.back().y - hull[hull.size() - 2].y) * (point.x - hull[hull.size() - 2].x) <=\n (hull.back().x - hull[hull.size() - 2].x) * (point.y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(point);\n }\n\n // Find the lower hull\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() >= 2 && (hull.back().y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) <=\n (hull.back().x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinate\n std::sort(points.begin(), points.end(), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper convex hull\n for (const Point &p : points) {\n while (hull.size() >= 2 && cross(hull[hull.size() - 2], hull[hull.size() - 1], p) <= 0) {\n hull.pop_back();\n }\n hull.push_back(p);\n }\n\n // Find the lower convex hull\n std::reverse(points.begin(), points.end());\n for (const Point &p : points) {\n while (hull.size() >= 2 && cross(hull[hull.size() - 2], hull[hull.size() - 1], p) <= 0) {\n hull.pop_back();\n }\n hull.push_back(p);\n }\n\n // Remove the last point from the lower convex hull\n 
hull.pop_back();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x;\n });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point a, Point b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point p : points) {\n // Build the lower hull\n while (lowerHull.size() >= 2 &&\n orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], p) <= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(p);\n\n // Build the upper hull\n while (upperHull.size() >= 2 &&\n orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], p) >= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(p);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), lowerHull.begin(), lowerHull.end());\n for (int i = upperHull.size() - 2; i >= 0; i--) {\n hull.push_back(upperHull[i]);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "hip", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\n__global__ void convexHull(const Point *points, size_t numPoints, Point *hull, size_t hullSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int numThreads = hipGetNumThreads();\n int threadIdx = hipGetThreadIdx();\n int blockIdx = hipGetBlockIdx();\n int blockDim = hipGetBlockDim();\n int gridDim = hipGetGridDim();\n\n // Each thread computes the convex hull of a subset of points\n int start = threadIdx.x + blockIdx.x * blockDim.x;\n int end = fmin(start + numThreads, numPoints);\n\n // Compute the convex hull of the subset of points\n vector subset(end - start);\n for (int i = start; i < end; i++) {\n subset[i - start] = points[i];\n }\n vector hullSubset = convexHull(subset);\n\n // Merge the convex hull of the subset of points with the overall convex hull\n for (int i = 0; i < hullSubset.size(); i++) {\n hull[hullSize + i] = hullSubset[i];\n }\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize > 2 && crossProduct(hull[hullSize - 2], hull[hullSize - 1], points[i]) <= 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size() - 2;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[i] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[upperHull.size() + i - 1] = lowerHull[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by polar angle with respect to the 
leftmost point\n std::sort(points, points + numPoints, [leftmost](const Point &a, const Point &b) {\n return atan2(a.y - points[leftmost].y, a.x - points[leftmost].x) < atan2(b.y - points[leftmost].y, b.x - points[leftmost].x);\n });\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last two points in the hull, add it to the hull\n while (hullSize >= 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < numPoints) {\n hull[gid] = points[gid];\n }\n\n __syncthreads();\n\n if (tid == 0) {\n std::sort(hull, hull + numPoints, [](const Point &a, const Point &b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n\n hullSize = m;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < numPoints) {\n hull[gid] = points[gid];\n }\n __syncthreads();\n\n if (tid == 0) {\n sort(hull, hull + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n for (int i = numPoints - 2, t = m + 1; i >= 0; i--) {\n while (t >= m + 1 && (hull[t - 1].y - hull[t - 2].y) * (hull[i].x - hull[t - 1].x) <\n (hull[t - 1].x - hull[t - 2].x) * (hull[i].y - hull[t - 1].y)) {\n t--;\n }\n hull[t++] = hull[i];\n }\n hullSize = t - m;\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n thrust::convex_hull(thrust::device, points, points + numPoints,\n std::back_inserter(upperHull), std::back_inserter(lowerHull));\n\n // Merge the upper and lower hulls into the final hull\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n\n // Return the size of the hull\n *hullSize = upperHull.size() + lowerHull.size();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while 
(lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Combine the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size() - 2;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[i] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[i + upperHull.size() - 1] = lowerHull[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the minimum point with the first point.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the remaining points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [&](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize > 2 && crossProduct(hull[hullSize - 2], hull[hullSize - 1], points[i]) <= 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by their polar angle with respect to the leftmost point\n auto cmp = [&](const Point &a, const Point &b) {\n return (a.y - points[leftmost].y) * (b.x - points[leftmost].x) - (a.x - points[leftmost].x) * (b.y - points[leftmost].y);\n };\n std::sort(points + leftmost + 1, points + numPoints, cmp);\n\n // Build the convex hull using Graham's scan\n int hullIndex = 0;\n for (int i = leftmost + 1; i < numPoints; i++) {\n while (hullIndex >= 2 && cmp(hull[hullIndex - 1], hull[hullIndex - 2], points[i])) {\n hullIndex--;\n }\n hull[hullIndex++] = points[i];\n }\n\n // Copy the convex hull to the output buffer\n std::copy(hull, hull + hullIndex, hull);\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((hull[hullSize - 1].x - points[i].x) * (hull[hullSize - 1].y - points[leftmost].y) -\n (hull[hullSize - 1].y - points[i].y) * (hull[hullSize - 1].x - points[leftmost].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n\n // If the current point is to the right of the last point in the hull, remove the last point from the hull\n else if ((hull[hullSize - 1].x - points[i].x) * (hull[hullSize - 1].y - points[leftmost].y) -\n (hull[hullSize - 1].y - points[i].y) * (hull[hullSize - 1].x - points[leftmost].x) >\n 0) {\n hullSize--;\n }\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n 
}\n\n // Initialize the hull\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the point is to the left of the line defined by the last two points in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hull[hullSize++] = points[i];\n }\n }\n\n // If the last point is not the same as the first point, add it to the hull\n if (hull[hullSize - 1] != hull[0]) {\n hull[hullSize++] = hull[0];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n\n // Find the convex hull using Graham's scan\n int hullSize = 1;\n for (int i = 1; i < numPoints; i++) {\n // Find the next point that is not on the same line as the previous two points\n while (hullSize > 1 &&\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 2].x - hull[hullSize - 1].x) <=\n (points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 2].y - hull[hullSize - 1].y)) {\n hullSize--;\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n }\n\n // If the last point in the hull is not the leftmost point, add it to the hull\n if (hull[hullSize - 1] != points[leftmost]) {\n hull[hullSize] = points[leftmost];\n hullSize++;\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Find the convex hull using Graham's scan\n int p = leftmost;\n do {\n // Find the next point that is not on the same line as the previous two points\n int q = (p + 1) % numPoints;\n for (int i = 0; i < numPoints; i++) {\n if (i != p && i != q &&\n (points[i].y - points[p].y) * (points[q].x - points[p].x) >\n (points[i].x - points[p].x) * (points[q].y - points[p].y)) {\n q = i;\n }\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[q];\n p = q;\n\n } while (p != leftmost);\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the remaining points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [&](const Point &a, const Point &b) {\n double 
angleA = atan2(a.y - points[0].y, a.x - points[0].x);\n double angleB = atan2(b.y - points[0].y, b.x - points[0].x);\n return angleA < angleB;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // TODO: Implement the convex hull algorithm\n // The kernel is launched with at least as many threads as points.\n // Store the result in `hull`.\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hullSize = 0;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[hullSize++] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[hullSize++] = lowerHull[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the minimum point with the first point.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize > 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n 
points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of points. The final hull is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n 
convexHullHelper(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Send the leftmost point to all other ranks\n MPI_Bcast(&leftmost, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the convex hull on each rank\n std::vector localHull;\n grahamScan(points, localHull, leftmost);\n\n // Gather all the local hulls on rank 0\n if (rank == 0) {\n hull = localHull;\n for (int i = 1; i < size; i++) {\n std::vector recvHull;\n MPI_Recv(recvHull.data(), recvHull.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n hull.insert(hull.end(), recvHull.begin(), recvHull.end());\n }\n } else {\n MPI_Send(localHull.data(), localHull.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n\n // Remove duplicate points from the hull\n std::sort(hull.begin(), hull.end());\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull.push_back(points[leftmost]);\n\n // Find the convex hull using Graham's scan\n int p = leftmost;\n do {\n // Find the next point that is not on the same line as the previous two points\n int q = (p + 1) % points.size();\n while (orientation(points[p], points[q], points[(q + 1) % points.size()]) == 0) {\n q = (q + 1) % points.size();\n }\n\n // Add the next point to the hull\n hull.push_back(points[q]);\n\n // Update the current point\n p = q;\n } while (p != leftmost);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, 
size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the points by x-coordinate\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n\n // Return the hull on rank 0\n if (rank == 0) {\n return hull;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n 
MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n if (rank == 0) {\n hull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, hull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(hull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the 
points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Start from the leftmost point, keep moving counter-clockwise\n // until we reach the start point again\n int p = leftmost;\n do {\n hull.push_back(points[p]);\n int q = (p + 1) % points.size();\n\n // Find the most counter-clockwise point from p\n for (int i = 0; i < points.size(); i++) {\n // Skip if i is the same as p or q\n if (i == p || i == q) {\n continue;\n }\n\n // Calculate the cross product of (q - p) x (i - p)\n double cross = (points[q].x - points[p].x) * (points[i].y - points[p].y) -\n (points[q].y - points[p].y) * (points[i].x - points[p].x);\n\n // If the cross product is positive, then i is more counter-clockwise than q\n if (cross > 0) {\n q = i;\n }\n }\n\n // Update p to be q\n p = q;\n } while (p != leftmost);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the leftmost point\n Point leftmost = points[0];\n for (size_t i = 1; i < points.size(); i++) {\n if (points[i].x < leftmost.x) {\n leftmost = points[i];\n }\n }\n\n // Broadcast the leftmost point to all ranks\n MPI_Bcast(&leftmost, sizeof(Point), MPI_BYTE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull on each rank\n std::vector localHull;\n convexHull(points, leftmost, localHull);\n\n // Gather the local convex hull from all ranks\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], 
localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "cuda", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\n__global__ void convexHull(const Point *points, size_t numPoints, Point *hull, size_t hullSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n if (tid == 0) {\n qsort(hull, numPoints, sizeof(Point), [](const void *a, const void *b) {\n Point *pa = (Point *)a;\n Point *pb = (Point *)b;\n if (pa->x != pb->x) {\n return pa->x < pb->x ? -1 : 1;\n }\n return pa->y < pb->y ? -1 : 1;\n });\n int m = 0;\n for (int i = 0; i < numPoints; ++i) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m].y - hull[m - 1].y) * (hull[m - 1].x - hull[m - 2].x)) {\n --m;\n }\n hull[m++] = hull[i];\n }\n hullSize = m;\n }\n __syncthreads();\n if (tid < hullSize) {\n hull[tid] = hull[tid];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(b.y - points[0].y, b.x - points[0].x) < atan2(a.y - points[0].y, a.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHull(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + 
lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.begin(), lowerHull.end(), hull + upperHull.size());\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n double angleA = atan2(a.y - points[0].y, a.x - points[0].x);\n double angleB = atan2(b.y - points[0].y, b.x - points[0].x);\n return angleA < angleB;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n\n if (numPoints <= 1) {\n return;\n }\n\n // Sort the points by their x-coordinates\n thrust::sort(hull, hull + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the leftmost and rightmost points\n Point leftmost = hull[0];\n Point rightmost = hull[numPoints - 1];\n\n // Find the convex hull of the points to the left of the leftmost point\n int leftHullSize = 0;\n convexHull(hull, numPoints - 1, hull, leftHullSize);\n\n // Find the convex hull of the points to the right of the rightmost point\n int rightHullSize = 0;\n convexHull(hull + 1, numPoints - 1, hull + leftHullSize, 
rightHullSize);\n\n // Merge the two convex hulls\n int hullIndex = 0;\n int leftIndex = 0;\n int rightIndex = 0;\n while (leftIndex < leftHullSize && rightIndex < rightHullSize) {\n if (hull[leftIndex].y < hull[rightIndex].y) {\n hull[hullIndex++] = hull[leftIndex++];\n } else {\n hull[hullIndex++] = hull[rightIndex++];\n }\n }\n while (leftIndex < leftHullSize) {\n hull[hullIndex++] = hull[leftIndex++];\n }\n while (rightIndex < rightHullSize) {\n hull[hullIndex++] = hull[rightIndex++];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device_ptr(points), thrust::device_ptr(points + numPoints),\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(thrust::device_ptr(points), numPoints, &upperHull, &lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), thrust::device_ptr(hull));\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), thrust::device_ptr(hull + upperHull.size()));\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current convex hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n double angle1 = atan2(a.y - points[0].y, a.x - points[0].x);\n double angle2 = atan2(b.y - points[0].y, b.x - points[0].x);\n return angle1 < angle2;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex 
hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Start from the leftmost point and keep moving counter-clockwise\n int hullSize = 0;\n int current = leftmost;\n do {\n // Add the current point to the hull\n hull[hullSize] = points[current];\n hullSize++;\n\n // Find the next point that is most counter-clockwise from the current point\n int next = (current + 1) % numPoints;\n for (int i = 0; i < numPoints; i++) {\n if (orientation(points[current], points[next], points[i]) == 2) {\n next = i;\n }\n }\n\n // Set the current point to the next point\n current = next;\n } while (current != leftmost);\n\n // Return the size of the hull\n return hullSize;\n}\n", + "\n\n // Sort the points by their x-coordinate\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), hull);\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= numPoints) {\n return;\n }\n\n // Find the point with the smallest y-coordinate\n int minIdx = idx;\n for (int i = idx + 1; i < numPoints; i++) {\n if (points[i].y < points[minIdx].y) {\n minIdx = i;\n }\n }\n\n // Swap the point with the smallest y-coordinate to the front of the array\n Point temp = points[idx];\n points[idx] = points[minIdx];\n points[minIdx] = temp;\n\n // Sort the points by their angle with respect to the point with the smallest y-coordinate\n std::sort(points + idx + 1, points + numPoints, [points, idx](const Point &a, const Point &b) {\n return (a.x - points[idx].x) * (b.y - points[idx].y) > (b.x - points[idx].x) * (a.y - points[idx].y);\n });\n\n // Initialize the hull with the first three points\n hull[0] = points[idx];\n hull[1] = points[idx + 1];\n hull[2] = points[idx + 2];\n int hullSize = 3;\n\n // Iterate over the remaining points\n for (int i = idx + 3; i < numPoints; i++) {\n // Check if the current point is inside the hull\n while (hullSize > 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) <=\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x)) {\n hullSize--;\n }\n\n // Add the current point to the hull\n hull[hullSize] = points[i];\n hullSize++;\n }\n\n // Return the hull\n hullSize = hullSize;\n}\n\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find 
the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device_ptr(points), thrust::device_ptr(points + numPoints),\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), thrust::device_ptr(hull));\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), thrust::device_ptr(hull + upperHull.size()));\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n\n // Find the convex hull using Graham's scan\n int hullSize = 1;\n for (int i = 1; i < numPoints; i++) {\n // Find the next point that is not on the same line as the previous two points\n while (hullSize >= 2 &&\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 2].x - hull[hullSize - 1].x) <=\n (points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 2].y - hull[hullSize - 1].y)) {\n hullSize--;\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates.\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the leftmost point.\n Point leftmostPoint = points[0];\n\n // Find the rightmost point.\n Point rightmostPoint = points[numPoints - 1];\n\n // Find the points that are above the line defined by the leftmost and rightmost points.\n thrust::device_vector pointsAboveLine;\n thrust::copy_if(thrust::device, points, points + numPoints,\n thrust::make_device_vector(pointsAboveLine),\n [leftmostPoint, rightmostPoint](const Point &point) {\n return point.y > leftmostPoint.y + (rightmostPoint.y - leftmostPoint.y) *\n (point.x - leftmostPoint.x) /\n (rightmostPoint.x - leftmostPoint.x);\n });\n\n // Find the points that are below the line defined by the leftmost and rightmost points.\n thrust::device_vector pointsBelowLine;\n thrust::copy_if(thrust::device, points, points + numPoints,\n thrust::make_device_vector(pointsBelowLine),\n [leftmostPoint, rightmostPoint](const Point &point) {\n return point.y < leftmostPoint.y + (rightmostPoint.y - leftmostPoint.y) *\n (point.x - leftmostPoint.x) /\n (rightmostPoint.x - leftmostPoint.x);\n });\n\n // Find the convex hull of the points above the line.\n thrust::device_vector hullAboveLine;\n convexHull(thrust::device_ptr(pointsAboveLine.data()), pointsAboveLine.size(),\n thrust::device_ptr(hullAboveLine.data()), hullAboveLine.size());\n\n // Find the convex hull of the points below the line.\n thrust::device_vector hullBelowLine;\n convexHull(thrust::device_ptr(pointsBelowLine.data()), pointsBelowLine.size(),\n thrust::device_ptr(hullBelowLine.data()), hullBelowLine.size());\n\n // Merge the two convex hulls.\n thrust::copy(thrust::device, hullAboveLine.begin(), hullAboveLine.end(), hull);\n 
thrust::copy(thrust::device, hullBelowLine.begin(), hullBelowLine.end(), hull + hullAboveLine.size());\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(hull, hull + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <= (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n hullSize = m;\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n }\n\n // If the hull has more than 2 points, remove the last point\n if (hullSize > 2) {\n hullSize--;\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of points. 
The final hull is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n 
hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n\n // Return the hull on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return hull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = points.size();\n std::vector localHull;\n // Find the convex hull of the local points\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n std::vector upper, lower;\n for (int i = 0; i < n; i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], points[i]) <= 0)\n upper.pop_back();\n upper.push_back(points[i]);\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], points[i]) >= 0)\n lower.pop_back();\n lower.push_back(points[i]);\n }\n localHull.insert(localHull.end(), upper.begin(), upper.end());\n for (int i = lower.size() - 2; i >= 0; i--)\n localHull.push_back(lower[i]);\n }\n }\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n // Find the convex hull of the global points\n if (rank == 0) {\n std::sort(globalHull.begin(), globalHull.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n std::vector upper, lower;\n for (int i = 0; i < globalHull.size(); i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], globalHull[i]) <= 0)\n upper.pop_back();\n upper.push_back(globalHull[i]);\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], globalHull[i]) >= 0)\n lower.pop_back();\n lower.push_back(globalHull[i]);\n }\n hull.insert(hull.end(), upper.begin(), upper.end());\n for (int i = lower.size() - 2; i >= 0; i--)\n hull.push_back(lower[i]);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by polar angle with respect to the leftmost point\n std::sort(points.begin(), points.end(), [leftmost](Point a, Point b) {\n double dx1 = a.x - points[leftmost].x;\n double dy1 = a.y - points[leftmost].y;\n double dx2 = b.x - points[leftmost].x;\n double dy2 = b.y - points[leftmost].y;\n return (atan2(dy1, dx1) < atan2(dy2, dx2));\n });\n\n // Use Graham's scan to find the convex hull\n hull.push_back(points[leftmost]);\n int m = 1;\n for (int i = 1; i < points.size(); i++) {\n while (m >= 2 && (points[i].x - hull[m - 1].x) * (hull[m].y - hull[m - 1].y) - (points[i].y - hull[m - 1].y) * (hull[m].x - hull[m - 1].x) <= 0) {\n hull.pop_back();\n m--;\n }\n 
hull.push_back(points[i]);\n m++;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Reduce the local hulls to a single global hull\n std::vector globalHull;\n MPI_Reduce(&localHull[0], &globalHull[0], localHull.size() * sizeof(Point), MPI_BYTE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // If this is rank 0, copy the global hull to the output hull\n if (rank == 0) {\n hull = globalHull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(points.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Find the convex hull of the points on each rank\n std::vector local_hull;\n convexHull(points, local_hull);\n\n // Gather all the local hulls on rank 0\n std::vector global_hull;\n if (my_rank == 0) {\n global_hull.resize(local_hull.size() * num_procs);\n }\n MPI_Gather(local_hull.data(), local_hull.size(), MPI_DOUBLE, global_hull.data(), local_hull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (my_rank == 0) {\n convexHull(global_hull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n if (rank == 0) {\n hull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, hull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(hull, hull);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(*convexHull[i], *convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n\t\tKOKKOS_LAMBDA(const int& i, const int& j) {\n\t\t\treturn points(i)->x < points(j)->x;\n\t\t});\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\thull.push_back(points(0));\n\thull.push_back(points(1));\n\tfor (int i = 2; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(hull[hull.size() - 2]->x - hull[hull.size() - 1]->x) * (points(i)->y - hull[hull.size() - 1]->y) -\n\t\t\t(hull[hull.size() - 2]->y - hull[hull.size() - 1]->y) * (points(i)->x - hull[hull.size() - 1]->x) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::TeamPolicy>;\n\tusing team_member_type = typename policy_type::member_type;\n\n\tconst size_t num_points = points.extent(0);\n\tconst size_t num_teams = (num_points + policy_type::team_size - 1) / policy_type::team_size;\n\n\t// Find the leftmost point\n\tteam_member_type team_member;\n\tPoint leftmost_point = points(0);\n\tfor (size_t i = 1; i < num_points; ++i) {\n\t\tif (points(i).x < leftmost_point.x) {\n\t\t\tleftmost_point = points(i);\n\t\t}\n\t}\n\n\t// Sort the points by polar angle with respect to the leftmost point\n\tstd::vector sorted_points(num_points);\n\tfor (size_t i = 0; i < num_points; ++i) {\n\t\tsorted_points[i] = points(i);\n\t}\n\tstd::sort(sorted_points.begin(), sorted_points.end(), [leftmost_point](Point const& p1, Point const& p2) {\n\t\tdouble angle1 = std::atan2(p1.y - leftmost_point.y, p1.x - leftmost_point.x);\n\t\tdouble angle2 = std::atan2(p2.y - leftmost_point.y, p2.x - leftmost_point.x);\n\t\treturn angle1 < angle2;\n\t});\n\n\t// Compute the perimeter of the convex hull using Graham's scan\n\tstd::vector convex_hull;\n\tconvex_hull.push_back(sorted_points[0]);\n\tconvex_hull.push_back(sorted_points[1]);\n\tfor (size_t i = 2; i < num_points; ++i) {\n\t\twhile (convex_hull.size() >= 2 &&\n\t\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t (sorted_points[i].y - convex_hull[convex_hull.size() - 1].y) -\n\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) *\n\t\t\t\t 
(sorted_points[i].x - convex_hull[convex_hull.size() - 1].x) < 0) {\n\t\t\tconvex_hull.pop_back();\n\t\t}\n\t\tconvex_hull.push_back(sorted_points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convex_hull.size(); ++i) {\n\t\tperimeter += distance(convex_hull[i], convex_hull[(i + 1) % convex_hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.data(), points.data() + points.extent(0), [](const Point* p1, const Point* p2) { return p1->x < p2->x; });\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector hull;\n\tfor (int i = 0; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tconst Point* p1 = hull[i];\n\t\t\tconst Point* p2 = hull[i + 1];\n\t\t\tconst Point* p3 = points[i];\n\t\t\tdouble crossProduct = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\tsum += crossProduct;\n\t\t}, Kokkos::Sum(0.0)) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tfor (int i = points.extent(0) - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tconst Point* p1 = hull[i];\n\t\t\tconst Point* p2 = hull[i + 1];\n\t\t\tconst Point* p3 = points[i];\n\t\t\tdouble crossProduct = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\tsum += crossProduct;\n\t\t}, Kokkos::Sum(0.0)) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points\n\tstd::vector hull;\n\tconvex_hull(points.data(), points.size(), std::back_inserter(hull));\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n\t\tKOKKOS_LAMBDA(const int& i, const int& j) {\n\t\t\treturn points(i)->x < points(j)->x;\n\t\t});\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points(0));\n\tconvexHull.push_back(points(points.extent(0) - 1));\n\tfor (int i = 1; i < points.extent(0) - 1; i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2]->y - convexHull[convexHull.size() - 1]->y) *\n\t\t\t(convexHull[convexHull.size() - 1]->x - points(i)->x) <\n\t\t\t(convexHull[convexHull.size() - 1]->y - points(i)->y) *\n\t\t\t(convexHull[convexHull.size() - 2]->x - convexHull[convexHull.size() - 1]->x)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(*convexHull[i], *convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector 
hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tgrahamScan(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Find the convex hull of the points\n\tstd::vector hull_points;\n\tconvex_hull(points, hull_points);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull_points.size(); ++i) {\n\t\tsize_t j = (i + 1) % hull_points.size();\n\t\tperimeter += distance(hull_points[i], hull_points[j]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points\n\tstd::vector hull;\n\tconvex_hull(points.data(), points.extent(0), std::back_inserter(hull));\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.extent(0));\n\tif (points.extent(0) < 3) {\n\t\tfor (int i = 0; i < points.extent(0); ++i) {\n\t\t\tconvexHull.push_back(points[i]);\n\t\t}\n\t} else {\n\t\t// Find the leftmost point.\n\t\tint leftmost = 0;\n\t\tfor (int i = 1; i < points.extent(0); ++i) {\n\t\t\tif (points[i].x < points[leftmost].x) {\n\t\t\t\tleftmost = i;\n\t\t\t}\n\t\t}\n\n\t\t// Start from the leftmost point and keep moving clockwise until we reach the leftmost point again.\n\t\tint current = leftmost;\n\t\tdo {\n\t\t\tconvexHull.push_back(points[current]);\n\t\t\tint next = (current + 1) % points.extent(0);\n\t\t\tfor (int i = 0; i < points.extent(0); ++i) {\n\t\t\t\tif (orientation(points[current], points[next], points[i]) == 2) {\n\t\t\t\t\tnext = i;\n\t\t\t\t}\n\t\t\t}\n\t\t\tcurrent = next;\n\t\t} while (current != leftmost);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute 
the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int& i, const int& j) {\n\t\treturn points(i)->x < points(j)->x;\n\t});\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\tfor (int i = 0; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tPoint* p1 = hull[i];\n\t\t\tPoint* p2 = hull[i + 1];\n\t\t\tPoint* p3 = points(i);\n\t\t\tdouble cross = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\treturn sum + cross;\n\t\t}, 0.0) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower 
hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[points.size() - 1]);\n\n\tfor (int i = 1; i < points.size() - 1; i++) {\n\t\tPoint const& p = points[i];\n\t\tif (convexHull.size() < 2 || (convexHull.back().x - convexHull[convexHull.size() - 2].x) * (p.y - convexHull.back().y) - (convexHull.back().y - convexHull[convexHull.size() - 2].y) * (p.x - convexHull.back().x) > 0) {\n\t\t\tconvexHull.push_back(p);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && crossProduct(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) 
{\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && (upperHull.back().y - upperHull[upperHull.size() - 2].y) * (point.x - upperHull.back().x) < (upperHull.back().x - upperHull[upperHull.size() - 2].x) * (point.y - upperHull.back().y)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\t\twhile (lowerHull.size() >= 2 && (lowerHull.back().y - lowerHull[lowerHull.size() - 2].y) * (point.x - lowerHull.back().x) > (lowerHull.back().x - lowerHull[lowerHull.size() - 2].x) * (point.y - lowerHull.back().y)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 2; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\tfor (int i = 2; i < n; i++) {\n\t\twhile (hull.size() >= 2 && crossProduct(hull[hull.size() - 2], hull[hull.size() - 1], points[i]) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t(points[i].y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t(points[i].x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper 
hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tstd::vector hull;\n\tstd::vector visited(points.size(), false);\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\tint current = leftmost;\n\tdo {\n\t\thull.push_back(points[current]);\n\t\tvisited[current] = true;\n\n\t\t// Find the next point on the hull\n\t\tint next = -1;\n\t\tfor (int i = 0; i < points.size(); i++) {\n\t\t\tif (!visited[i] && (next == -1 || points[i].x * hull.back().y - points[i].y * hull.back().x > points[next].x * hull.back().y - points[next].y * hull.back().x)) {\n\t\t\t\tnext = i;\n\t\t\t}\n\t\t}\n\t\tcurrent = next;\n\t} while (current != leftmost);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[i + 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (point.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) >\n\t\t\t\t (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - point.x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (point.y - 
lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) <\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - point.x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n\t\t\t\t (point.x - upperHull[upperHull.size() - 1].x) <=\n\t\t\t\t (upperHull[upperHull.size() - 1].y - point.y) *\n\t\t\t\t\t (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n\t\t\t\t (point.x - lowerHull[lowerHull.size() - 1].x) >=\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - point.y) *\n\t\t\t\t\t (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n < 3) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::vector visited(n, false);\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\tint p = leftmost;\n\tdo {\n\t\thull.push_back(points[p]);\n\t\tvisited[p] = true;\n\t\tint q = (p + 1) % n;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\tif (!visited[i] && (orientation(points[p], points[i], points[q]) == 2)) {\n\t\t\t\tq = i;\n\t\t\t}\n\t\t}\n\t\tp = q;\n\t} while (p != leftmost);\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (Point const& p : points) {\n\t\twhile (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], p) <= 0)\n\t\t\tupperHull.pop_back();\n\t\tupperHull.push_back(p);\n\n\t\twhile (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], p) >= 0)\n\t\t\tlowerHull.pop_back();\n\t\tlowerHull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < 
upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::vector visited(n, false);\n\tint p = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x < points[p].x) {\n\t\t\tp = i;\n\t\t}\n\t}\n\tint q = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x > points[q].x) {\n\t\t\tq = i;\n\t\t}\n\t}\n\thull.push_back(points[p]);\n\thull.push_back(points[q]);\n\tvisited[p] = true;\n\tvisited[q] = true;\n\twhile (p != q) {\n\t\tint next = -1;\n\t\tdouble max_dist = -1;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\tif (!visited[i]) {\n\t\t\t\tdouble dist = distance(points[i], hull.back());\n\t\t\t\tif (dist > max_dist) {\n\t\t\t\t\tmax_dist = dist;\n\t\t\t\t\tnext = i;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\thull.push_back(points[next]);\n\t\tvisited[next] = true;\n\t\tp = q;\n\t\tq = next;\n\t}\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && crossProduct(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinate\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the upper 
and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tupperHull.push_back(points[0]);\n\tupperHull.push_back(points[1]);\n\tlowerHull.push_back(points[0]);\n\tlowerHull.push_back(points[n - 1]);\n\n\tfor (int i = 2; i < n; i++) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - upperHull[upperHull.size() - 1].x) <=\n\t\t\t\t (upperHull[upperHull.size() - 1].y - points[i].y) *\n\t\t\t\t\t (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - lowerHull[lowerHull.size() - 1].x) >=\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - points[i].y) *\n\t\t\t\t\t (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (int i = 0; i < lowerHull.size() - 1; i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tperimeter += distance(upperHull[upperHull.size() - 1], lowerHull[lowerHull.size() - 1]);\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(points[hull[i]], points[hull[(i + 1) % hull.size()]]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the lower and upper hulls\n\tstd::vector lowerHull, upperHull;\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) == -1) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) == 1) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < lowerHull.size() - 1; ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tfor (size_t i = upperHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (points.size() <= 2) {\n\t\treturn 
0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (auto& point : points) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t (point.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t\t (hull.back().y - hull[hull.size() - 2].y) * (point.x - hull.back().x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(point);\n\t}\n\n\tint t = hull.size() - 1;\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() > t &&\n\t\t\t (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t\t (hull.back().y - hull[hull.size() - 2].y) * (points[i].x - hull.back().x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <= (points[i].x - hull.back().x) * (hull[hull.size() - 2].y - hull.back().y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (p.x - hull[hull.size() - 2].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan algorithm\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && orientation(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += 
distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.empty()) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t(points[i].y - hull.back().y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() < 3) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n\t\treturn a.x < b.x;\n\t});\n\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n\t\treturn a.y < b.y;\n\t});\n\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (p.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (p.y - hull[hull.size() - 2].y) * (hull.back().x - hull[hull.size() - 1].x)) 
{\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (p.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (p.y - hull[hull.size() - 2].y) * (hull.back().x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (points[i].x - hull.back().x) * (hull[hull.size() - 2].y - hull.back().y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the lower convex hull\n\tfor (Point const& point : points) {\n\t\twhile (hull.size() >= 2 && (point.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) > (point.y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - point.x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(point);\n\t}\n\n\t// Find the upper convex hull\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) > (points[i].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + 
"\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\n\tfor (Point const& p : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t(p.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) >\n\t\t\t(upperHull[upperHull.size() - 1].y - upperHull[upperHull.size() - 2].y) * (p.x - upperHull[upperHull.size() - 2].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(p);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t(p.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) <\n\t\t\t(lowerHull[lowerHull.size() - 1].y - lowerHull[lowerHull.size() - 2].y) * (p.x - lowerHull[lowerHull.size() - 2].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(p);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; i--) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y) - (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the leftmost and rightmost points\n\tPoint leftmost = points[0];\n\tPoint rightmost = points[points.size() - 1];\n\n\t// Add the leftmost and rightmost points to the hull\n\thull.push_back(leftmost);\n\thull.push_back(rightmost);\n\n\t// Find the upper and lower tangents of the convex hull\n\tstd::vector upperTangent;\n\tstd::vector lowerTangent;\n\n\t// Find the upper tangent\n\tfor (int i = 1; i < points.size(); i++) {\n\t\twhile (upperTangent.size() >= 2 && crossProduct(upperTangent[upperTangent.size() - 2], upperTangent[upperTangent.size() - 1], points[i]) <= 0) {\n\t\t\tupperTangent.pop_back();\n\t\t}\n\t\tupperTangent.push_back(points[i]);\n\t}\n\n\t// Find the lower tangent\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (lowerTangent.size() >= 2 && crossProduct(lowerTangent[lowerTangent.size() - 2], lowerTangent[lowerTangent.size() - 1], points[i]) <= 0) 
{\n\t\t\tlowerTangent.pop_back();\n\t\t}\n\t\tlowerTangent.push_back(points[i]);\n\t}\n\n\t// Merge the upper and lower tangents into the hull\n\tfor (int i = 0; i < lowerTangent.size(); i++) {\n\t\thull.push_back(lowerTangent[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y) - (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(points[i].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the lower hull\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(p.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(hull[hull.size() - 1].y - p.y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Find the upper hull\n\tfor (int i = points.size() - 2; i >= 0; i--) 
{\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(p.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(hull[hull.size() - 1].y - p.y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) -\n\t\t\t(points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distance(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Store the result in perimeter.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\n__global__ void convexHullPerimeter(const Point *points, size_t numPoints, double *perimeter) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points.\n\tsize_t numVertices;\n\tPoint *vertices;\n\tconvexHull(points, numPoints, vertices, numVertices);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0.0;\n\tfor (size_t i = 0; i < numVertices; i++) {\n\t\t*perimeter += distance(vertices[i], vertices[i + 1]);\n\t}\n}\n", + "\n\n\tPoint p0 = points[0];\n\tdouble minX = p0.x;\n\tfor (size_t i = 1; i < numPoints; i++) {\n\t\tif (points[i].x < minX) {\n\t\t\tminX = points[i].x;\n\t\t\tp0 = points[i];\n\t\t}\n\t}\n\n\tstd::vector hull;\n\thull.push_back(p0);\n\n\tPoint next = p0;\n\tdo {\n\t\tPoint bestPoint = points[0];\n\t\tdouble maxAngle = -1;\n\t\tfor (size_t i = 1; i < numPoints; i++) {\n\t\t\tif (points[i] == next) continue;\n\t\t\tdouble angle = atan2(points[i].y - next.y, points[i].x - next.x);\n\t\t\tif (angle > maxAngle) {\n\t\t\t\tmaxAngle = angle;\n\t\t\t\tbestPoint = points[i];\n\t\t\t}\n\t\t}\n\t\thull.push_back(bestPoint);\n\t\tnext = bestPoint;\n\t} while (next != p0);\n\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tstd::vector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tPoint p0 = points[0];\n\tdouble minX = p0.x, maxX = p0.x;\n\tdouble minY = p0.y, maxY = p0.y;\n\tfor (size_t i = 1; i < numPoints; i++) {\n\t\tPoint p = points[i];\n\t\tminX = min(minX, p.x);\n\t\tmaxX = max(maxX, p.x);\n\t\tminY = min(minY, p.y);\n\t\tmaxY = max(maxY, p.y);\n\t}\n\n\tvector upper, lower;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tPoint p = points[i];\n\t\tif (p.y == minY) {\n\t\t\tlower.push_back(p);\n\t\t}\n\t\tif (p.y == maxY) {\n\t\t\tupper.push_back(p);\n\t\t}\n\t}\n\n\tsort(lower.begin(), lower.end(), [](Point const& p1, Point const& p2) { return p1.x < 
p2.x; });\n\tsort(upper.begin(), upper.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\tvector ch;\n\tch.push_back(lower[0]);\n\tfor (size_t i = 1; i < lower.size(); i++) {\n\t\twhile (ch.size() >= 2 && (ch.back().y - ch[ch.size() - 2].y) * (lower[i].x - ch.back().x) < (ch.back().x - ch[ch.size() - 2].x) * (lower[i].y - ch.back().y)) {\n\t\t\tch.pop_back();\n\t\t}\n\t\tch.push_back(lower[i]);\n\t}\n\n\tfor (size_t i = upper.size() - 1; i >= 0; i--) {\n\t\twhile (ch.size() >= 2 && (ch.back().y - ch[ch.size() - 2].y) * (upper[i].x - ch.back().x) < (ch.back().x - ch[ch.size() - 2].x) * (upper[i].y - ch.back().y)) {\n\t\t\tch.pop_back();\n\t\t}\n\t\tch.push_back(upper[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < ch.size(); i++) {\n\t\tperimeter += distance(ch[i], ch[(i + 1) % ch.size()]);\n\t}\n\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint numBlocks = hipGridDim_x;\n\tint numThreads = hipBlockDim_x;\n\n\tPoint p1 = points[tid];\n\tdouble minDist = DBL_MAX;\n\tPoint p2;\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tif (i == tid) continue;\n\t\tPoint p3 = points[i];\n\t\tdouble dist = distance(p1, p3);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tp2 = p3;\n\t\t}\n\t}\n\n\t__shared__ double sharedMinDist[numBlocks];\n\t__shared__ Point sharedP2[numBlocks];\n\n\tsharedMinDist[bid] = minDist;\n\tsharedP2[bid] = p2;\n\n\t__syncthreads();\n\n\tdouble globalMinDist = DBL_MAX;\n\tPoint globalP2;\n\tfor (int i = 0; i < numBlocks; i++) {\n\t\tif (sharedMinDist[i] < globalMinDist) {\n\t\t\tglobalMinDist = sharedMinDist[i];\n\t\t\tglobalP2 = sharedP2[i];\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*perimeter = 2 * globalMinDist + distance(p1, globalP2);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < numPoints; ++i) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\t// Initialize the convex hull with the leftmost point\n\tstd::vector hull;\n\thull.push_back(points[leftmost]);\n\n\t// Find the convex hull using Graham's scan\n\tint p = leftmost;\n\tdo {\n\t\tint q = (p + 1) % numPoints;\n\t\tfor (int i = 0; i < numPoints; ++i) {\n\t\t\tif (i == p || i == q) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tif (orientation(points[p], points[q], points[i]) == -1) {\n\t\t\t\tq = i;\n\t\t\t}\n\t\t}\n\t\thull.push_back(points[q]);\n\t\tp = q;\n\t} while (p != leftmost);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (int i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < numPoints; i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\t// Initialize the 
convex hull with the leftmost point\n\tvector hull = { points[leftmost] };\n\n\t// Find the next point to add to the convex hull\n\twhile (true) {\n\t\t// Find the point that is farthest from the line connecting the last two points in the convex hull\n\t\tint farthest = -1;\n\t\tdouble maxDistance = 0;\n\t\tfor (int i = 0; i < numPoints; i++) {\n\t\t\t// Skip points that are already in the convex hull\n\t\t\tif (find(hull.begin(), hull.end(), points[i]) != hull.end()) {\n\t\t\t\tcontinue;\n\t\t\t}\n\n\t\t\t// Compute the distance from the point to the line connecting the last two points in the convex hull\n\t\t\tPoint p1 = hull[hull.size() - 2];\n\t\t\tPoint p2 = hull[hull.size() - 1];\n\t\t\tdouble distance = fabs((p2.y - p1.y) * points[i].x - (p2.x - p1.x) * points[i].y + p2.x * p1.y - p2.y * p1.x) / sqrt(pow(p2.y - p1.y, 2) + pow(p2.x - p1.x, 2));\n\n\t\t\t// If the distance is greater than the maximum distance so far, update the farthest point and the maximum distance\n\t\t\tif (distance > maxDistance) {\n\t\t\t\tfarthest = i;\n\t\t\t\tmaxDistance = distance;\n\t\t\t}\n\t\t}\n\n\t\t// If no point is farther from the line connecting the last two points in the convex hull, the convex hull is complete\n\t\tif (farthest == -1) {\n\t\t\tbreak;\n\t\t}\n\n\t\t// Add the farthest point to the convex hull\n\t\thull.push_back(points[farthest]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[i + 1]);\n\t}\n\n\t// Store the perimeter in the output variable\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points, points + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && (upperHull[upperHull.size() - 1].y - upperHull[upperHull.size() - 2].y) * (points[i].x - upperHull[upperHull.size() - 2].x) < (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) * (points[i].y - upperHull[upperHull.size() - 2].y)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && (lowerHull[lowerHull.size() - 1].y - lowerHull[lowerHull.size() - 2].y) * (points[i].x - lowerHull[lowerHull.size() - 2].x) > (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) * (points[i].y - lowerHull[lowerHull.size() - 2].y)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\t*perimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\t*perimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the 
points using the Graham scan algorithm.\n\tvector hull;\n\tfor (size_t i = 0; i < numPoints; ++i) {\n\t Point p = points[i];\n\t while (hull.size() >= 2 && crossProduct(hull[hull.size() - 2], hull.back(), p) <= 0) {\n\t hull.pop_back();\n\t }\n\t hull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinate\n\tstd::sort(points, points + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\t*perimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\t*perimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points\n\tPoint *hull = new Point[numPoints];\n\tint hullSize = convexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hullSize; i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hullSize]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 
2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector<Point> const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector<Point> convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[points.size() - 1]);\n\tfor (int i = 1; i < points.size() - 1; i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector<Point> convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": 
false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::vector hullIndices;\n\thullIndices.reserve(points.size());\n\tgrahamScan(points, convexHull, hullIndices);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint nextIndex = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[nextIndex]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2) {\n\t\t\tPoint p1 = convexHull[convexHull.size() - 2];\n\t\t\tPoint p2 = convexHull[convexHull.size() - 1];\n\t\t\tPoint p3 = points[i];\n\t\t\tdouble cross = (p2.x - p1.x) * (p3.y - p2.y) - (p2.y - p1.y) * (p3.x - p2.x);\n\t\t\tif (cross > 0) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint left = 0, right = points.size() - 1;\n\twhile (left < right) {\n\t\tconvexHull.push_back(points[left++]);\n\t\twhile (left < right && (convexHull.back().y - points[left].y) * (convexHull.back().x - points[right].x) <= (convexHull.back().y - 
points[right].y) * (convexHull.back().x - points[left].x)) {\n\t\t\tright--;\n\t\t}\n\t\tconvexHull.push_back(points[right--]);\n\t\twhile (left < right && (convexHull.back().y - points[left].y) * (convexHull.back().x - points[right].x) <= (convexHull.back().y - points[right].y) * (convexHull.back().x - points[left].x)) {\n\t\t\tleft++;\n\t\t}\n\t}\n\tconvexHull.push_back(points[left]);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Send the convex hull to rank 0\n\tstd::vector globalConvexHull;\n\tMPI_Gather(&convexHull[0], convexHull.size(), MPI_DOUBLE, &globalConvexHull[0], globalConvexHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Compute the perimeter of the convex hull on rank 0\n\tif (rank == 0) {\n\t\tdouble perimeter = 0;\n\t\tfor (size_t i = 0; i < globalConvexHull.size(); i++) {\n\t\t\tperimeter += distance(globalConvexHull[i], globalConvexHull[(i + 1) % globalConvexHull.size()]);\n\t\t}\n\t\treturn perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n\tint const rank = MPI::COMM_WORLD.Get_rank();\n\tint const size = MPI::COMM_WORLD.Get_size();\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tconvexHull.push_back(points[0]);\n\tfor (size_t i = 1; i < points.size(); ++i) {\n\t\t// Find the first point in the convex hull that is to the left of the line through the last two points\n\t\tsize_t j = convexHull.size() - 1;\n\t\twhile (j > 0 && (convexHull[j].x - convexHull[j-1].x) * (points[i].y - convexHull[j].y) -\n\t\t\t(convexHull[j].y - convexHull[j-1].y) * (points[i].x - convexHull[j].x) < 0) {\n\t\t\t--j;\n\t\t}\n\t\t// Add the point to the convex hull\n\t\tconvexHull.insert(convexHull.begin() + j + 1, points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI::COMM_WORLD.Reduce(&perimeter, &globalPerimeter, 
1, MPI::DOUBLE, MPI::SUM, 0);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeters from all ranks to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the global perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the total perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (auto const& point : points) {\n\t\tif (convexHull.empty()) {\n\t\t\tconvexHull.push_back(point);\n\t\t} else {\n\t\t\twhile (convexHull.size() >= 2 &&\n\t\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t\t(point.y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t(point.x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\t\tconvexHull.pop_back();\n\t\t\t}\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Calculate the perimeter of the convex 
hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2 && (points[next] - convexHull.back()).cross(points[i] - convexHull.back()) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || convexHull.back() != point) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t\twhile (convexHull.size() >= 3 &&\n\t\t\t (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 3].x) *\n\t\t\t\t\t (point.y - convexHull[convexHull.size() - 2].y) -\n\t\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 3].y) *\n\t\t\t\t\t (point.x - convexHull[convexHull.size() - 2].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 
null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Find the total perimeter of the convex hulls on all ranks\n\tdouble totalPerimeter = 0;\n\tfor (int i = 0; i < size; i++) {\n\t\tif (i == rank) {\n\t\t\tfor (int j = 0; j < convexHull.size(); j++) {\n\t\t\t\ttotalPerimeter += distance(convexHull[j], convexHull[(j + 1) % convexHull.size()]);\n\t\t\t}\n\t\t}\n\t\tMPI_Reduce(&totalPerimeter, &totalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\t}\n\n\t// Return the total perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn totalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (auto const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Reduce the convex hulls from all ranks to rank 0\n\tstd::vector globalConvexHull;\n\tMPI_Reduce(&convexHull[0], &globalConvexHull[0], convexHull.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Compute the perimeter of the global convex hull on rank 0\n\tif (rank == 0) {\n\t\tdouble perimeter = 0;\n\t\tfor (size_t i = 0; i < globalConvexHull.size(); i++) {\n\t\t\tperimeter += distance(globalConvexHull[i], globalConvexHull[(i + 1) % globalConvexHull.size()]);\n\t\t}\n\t\treturn perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (auto it = points.begin(); it != points.end(); ++it) {\n\t\twhile (convexHull.size() >= 2 && (it->y - convexHull[convexHull.size() - 2].y) * (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) < (it->x - convexHull[convexHull.size() - 2].x) * (convexHull[convexHull.size() - 1].y - convexHull[convexHull.size() - 2].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(*it);\n\t}\n\tconvexHull.pop_back();\n\tfor (auto it = points.rbegin(); it != points.rend(); ++it) {\n\t\twhile (convexHull.size() >= 2 && 
(it->y - convexHull[convexHull.size() - 2].y) * (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) < (it->x - convexHull[convexHull.size() - 2].x) * (convexHull[convexHull.size() - 1].y - convexHull[convexHull.size() - 2].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(*it);\n\t}\n\tconvexHull.pop_back();\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2 && (points[next] - convexHull.back()).cross(points[i] - convexHull.back()) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& p : points) {\n\t}\n\n\t// Compute the total perimeter\n\tdouble totalPerimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t}\n\n\t// Reduce the total perimeter to rank 0\n\tdouble finalPerimeter;\n\tMPI_Reduce(&totalPerimeter, &finalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the final perimeter on rank 0\n\tif (rank == 0) {\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "Hull: 1. 
Synag", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || convexHull.back() != point) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t\twhile (convexHull.size() >= 3 &&\n\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 3].x) *\n\t\t\t\t(point.y - convexHull[convexHull.size() - 2].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 3].y) *\n\t\t\t\t(point.x - convexHull[convexHull.size() - 2].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distance(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Store the result in perimeter.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\n__global__ void convexHullPerimeter(const Point *points, size_t numPoints, double *perimeter) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint numBlocks = gridDim.x;\n\tint numThreads = blockDim.x;\n\tint start = tid + bid * numThreads;\n\tdouble localPerimeter = 0;\n\tfor (int i = start; i < numPoints - 1; i++) {\n\t\tlocalPerimeter += distance(points[i], points[i + 1]);\n\t}\n\tlocalPerimeter += distance(points[numPoints - 1], points[0]);\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tdouble globalPerimeter = 0;\n\t\tfor (int i = 0; i < numBlocks; i++) {\n\t\t\tdouble blockPerimeter;\n\t\t\tcudaMemcpy(&blockPerimeter, &localPerimeter, sizeof(double), cudaMemcpyDeviceToHost);\n\t\t\tglobalPerimeter += blockPerimeter;\n\t\t}\n\t\t*perimeter = globalPerimeter;\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\tPoint *convexHull;\n\tsize_t convexHullSize;\n\tconvexHull2(points, numPoints, &convexHull, &convexHullSize);\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < convexHullSize; i++) {\n\t\t*perimeter += distance(convexHull[i], convexHull[(i + 1) % convexHullSize]);\n\t}\n\tfree(convexHull);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tPoint p = points[tid];\n\tdouble minDist = DBL_MAX;\n\tPoint closestPoint;\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tif (i == tid) {\n\t\t\tcontinue;\n\t\t}\n\t\tPoint q = points[i];\n\t\tdouble dist = distance(p, q);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tclosestPoint = q;\n\t\t}\n\t}\n\t*perimeter += minDist;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % 
hull.size()]);\n\t}\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tPoint *sortedPoints = new Point[numPoints];\n\tthrust::copy(points, points + numPoints, sortedPoints);\n\tthrust::sort(sortedPoints, sortedPoints + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\twhile (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], sortedPoints[i]) <= 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(sortedPoints[i]);\n\t\twhile (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], sortedPoints[i]) >= 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(sortedPoints[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= numPoints) return;\n\n\tPoint p = points[i];\n\tdouble minDist = DBL_MAX;\n\tPoint minPoint;\n\tfor (int j = 0; j < numPoints; j++) {\n\t\tif (i == j) continue;\n\t\tPoint q = points[j];\n\t\tdouble dist = distance(p, q);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tminPoint = q;\n\t\t}\n\t}\n\t*perimeter += minDist;\n}\n", + "Hull.2)", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tPoint *sortedPoints = new Point[numPoints];\n\tmemcpy(sortedPoints, points, numPoints * sizeof(Point));\n\tstd::sort(sortedPoints, sortedPoints + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], sortedPoints[i]) <= 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(sortedPoints[i]);\n\t\twhile (lowerHull.size() >= 2 && 
crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], sortedPoints[i]) >= 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(sortedPoints[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size() - 1; i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tperimeter += distance(upperHull[upperHull.size() - 1], lowerHull[lowerHull.size() - 1]);\n\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tPoint p = points[tid];\n\t\tdouble minDist = DBL_MAX;\n\t\tPoint closestPoint;\n\t\tfor (int i = 0; i < numPoints; i++) {\n\t\t\tif (i == tid) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tPoint q = points[i];\n\t\t\tdouble dist = distance(p, q);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t\tclosestPoint = q;\n\t\t\t}\n\t\t}\n\t\tatomicAdd(perimeter, minDist);\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. 
The final result is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (int i = 0; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\tint next = i + 1;\n\t\tif (next == points.size()) {\n\t\t\tnext = 0;\n\t\t}\n\t\tint prev = i - 1;\n\t\tif (prev == -1) {\n\t\t\tprev = points.size() - 1;\n\t\t}\n\t\tdouble area = (points[next].x - points[prev].x) * (points[i].y - points[prev].y) - (points[next].x - points[prev].x) * (points[i].y - points[next].y);\n\t\tif (area >= 0) {\n\t\t\tconvexHull.push_back(points[i]);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = i + 1;\n\t\tif (next == convexHull.size()) {\n\t\t\tnext = 0;\n\t\t}\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < 
p2.x; });\n\tfor (int i = 0; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint n = points.size();\n\tif (n <= 2) {\n\t\tconvexHull = points;\n\t} else {\n\t\tint m = 0;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\twhile (m >= 2 && (points[i].x - convexHull[m-1].x) * (convexHull[m-2].y - convexHull[m-1].y) - (points[i].y - convexHull[m-1].y) * (convexHull[m-2].x - convexHull[m-1].x) < 0) {\n\t\t\t\tm--;\n\t\t\t}\n\t\t\tconvexHull[m++] = points[i];\n\t\t}\n\t\tfor (int i = n-2, t = m+1; i >= 0; i--) {\n\t\t\twhile (m >= t && (points[i].x - convexHull[m-1].x) * (convexHull[m-2].y - convexHull[m-1].y) - (points[i].y - convexHull[m-1].y) * (convexHull[m-2].x - convexHull[m-1].x) < 0) {\n\t\t\t\tm--;\n\t\t\t}\n\t\t\tconvexHull[m++] = points[i];\n\t\t}\n\t\tconvexHull.resize(m);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham scan algorithm\n\tstd::vector convexHull = grahamScan(points);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else 
{\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t\t (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) *\n\t\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t(convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham scan\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y); });\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size()-2].y - hull[hull.size()-1].y) * (p.x - hull[hull.size()-1].x) < (hull[hull.size()-1].x - hull[hull.size()-2].x) * (p.y - hull[hull.size()-1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\tint n = hull.size();\n\tfor (int i = n-2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size()-2].y - hull[hull.size()-1].y) * (p.x - hull[hull.size()-1].x) < (hull[hull.size()-1].x - hull[hull.size()-2].x) * (p.y - hull[hull.size()-1].y)) 
{\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < n; i++) {\n\t\tperimeter += distance(hull[i], hull[(i+1)%n]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the hull in parallel\n\tdouble perimeter = 0.0;\n\t#pragma omp parallel for reduction(+:perimeter)\n\tfor (int i = 0; i < hull.size(); ++i) {\n\t\tint next = (i + 1) % hull.size();\n\t\tperimeter += distance(hull[i], hull[next]);\n\t}\n\n\t// Return the perimeter\n\treturn perimeter;\n}\n\n", + "\n\n\tint const rank = 0;\n\tint const size = MPI::COMM_WORLD.Get_size();\n\tint const points_per_rank = points.size() / size;\n\tint const remainder = points.size() % size;\n\n\tstd::vector local_points = points;\n\tlocal_points.resize(points_per_rank + (rank < remainder));\n\n\tstd::vector convex_hull;\n\tconvexHull(local_points, convex_hull);\n\n\tdouble local_perimeter = 0;\n\tfor (size_t i = 0; i < convex_hull.size(); ++i) {\n\t\tlocal_perimeter += distance(convex_hull[i], convex_hull[(i+1)%convex_hull.size()]);\n\t}\n\n\tdouble global_perimeter;\n\tMPI::COMM_WORLD.Reduce(&local_perimeter, &global_perimeter, 1, MPI_DOUBLE, MPI_SUM, rank);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint n = points.size();\n\tfor (int i = 0; i < n; i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <= (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = n - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <= (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of 
the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint left = 0, right = 0;\n\twhile (left < points.size()) {\n\t\twhile (right < points.size() && points[right].x == points[left].x) {\n\t\t\tright++;\n\t\t}\n\t\tint n = right - left;\n\t\tif (n > 1) {\n\t\t\tstd::sort(points.begin() + left, points.begin() + right, [](Point const& p1, Point const& p2) { return p1.y < p2.y; });\n\t\t\tfor (int i = 0; i < n; i++) {\n\t\t\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[left + i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) * (points[left + i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\t\t\tconvexHull.pop_back();\n\t\t\t\t}\n\t\t\t\tconvexHull.push_back(points[left + i]);\n\t\t\t}\n\t\t}\n\t\tleft = right;\n\t}\n\tconvexHull.push_back(convexHull[0]);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size() - 1; i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[i + 1]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = points.size();\n\tstd::vector hull;\n\n\t// Find the convex hull of the points using Graham's scan.\n\tfor (int i = 0; i < n; i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points[i].x - hull[hull.size() - 1].x) <\n\t\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tint m = hull.size();\n\tfor (int i = n - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= m &&\n\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points[i].x - hull[hull.size() - 1].x) <\n\t\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\thull.pop_back();\n\n\t// Compute the perimeter of the convex hull in parallel.\n\tdouble perimeter = 0;\n\t#pragma omp parallel for reduction(+ : perimeter)\n\tfor (int i = 0; i < m; i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % m]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble global_perimeter;\n\tMPI_Reduce(&perimeter, &global_perimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 
-1;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (Point const& p : points) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size()-2].y - convexHull[convexHull.size()-1].y) * (p.x - convexHull[convexHull.size()-1].x) < (convexHull[convexHull.size()-1].x - convexHull[convexHull.size()-2].x) * (p.y - convexHull[convexHull.size()-1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(p);\n\t}\n\tconvexHull.pop_back();\n\tfor (int i = points.size()-2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size()-2].y - convexHull[convexHull.size()-1].y) * (points[i].x - convexHull[convexHull.size()-1].x) < (convexHull[convexHull.size()-1].x - convexHull[convexHull.size()-2].x) * (points[i].y - convexHull[convexHull.size()-1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tconvexHull.pop_back();\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t\t (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) *\n\t\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 
0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint const rank = MPI::COMM_WORLD.Get_rank();\n\tint const size = MPI::COMM_WORLD.Get_size();\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convex_hull;\n\tconvex_hull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tconvex_hull.push_back(points[0]);\n\tconvex_hull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (convex_hull.size() >= 2 &&\n\t\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t\t (convex_hull[convex_hull.size() - 1].y - points[i].y) -\n\t\t\t\t\t (convex_hull[convex_hull.size() - 1].x - points[i].x) *\n\t\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) <\n\t\t\t\t 0) {\n\t\t\tconvex_hull.pop_back();\n\t\t}\n\t\tconvex_hull.push_back(points[i]);\n\t}\n\tint const convex_hull_size = convex_hull.size();\n\twhile (convex_hull.size() >= 2 &&\n\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t (convex_hull[convex_hull.size() - 1].y - convex_hull[0].y) -\n\t\t\t\t (convex_hull[convex_hull.size() - 1].x - convex_hull[0].x) *\n\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) <\n\t\t\t 0) {\n\t\tconvex_hull.pop_back();\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convex_hull_size; ++i) {\n\t\tperimeter += distance(convex_hull[i], convex_hull[(i + 1) % convex_hull_size]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble global_perimeter;\n\tMPI::COMM_WORLD.Allreduce(&perimeter, &global_perimeter, 1, MPI::DOUBLE, MPI::SUM);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. 
A is an undirected graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &size) {\n if (visited(i) == 0) {\n size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n });\n }\n }, size);\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices.\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices.\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0.\n int largestComponentSize = 0;\n\n // Iterate over all vertices.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a depth-first search to find the size of its component.\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n\n // Update the largest component size if necessary.\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n // Return the largest component size.\n return largestComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int componentSize = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &componentSize) {\n if (visited[i] == 0) {\n componentSize++;\n visited[i] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[i][j] == 1 && visited[j] == 0) {\n visited[j] = 1;\n }\n }\n }\n });\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n return maxComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n depthFirstSearch(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a 
depth-first search to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n\n // Update the largest component size if necessary\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited(i) == 0) {\n component_size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n });\n }\n }, component_size);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, false);\n int largest = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int size = 0;\n std::stack s;\n s.push(i);\n while (!s.empty()) {\n size_t v = s.top();\n s.pop();\n if (!visited[v]) {\n visited[v] = true;\n size++;\n for (size_t j = 0; j < N; j++) {\n if (A(i, j) && !visited[j]) {\n s.push(j);\n }\n }\n }\n }\n largest = std::max(largest, size);\n }\n }\n return largest;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited[i] == 0) {\n component_size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n visited[j] = 1;\n }\n });\n }\n }, component_size);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n using namespace Kokkos;\n\n // Initialize the visited array to false for all vertices\n View visited(\"visited\", N);\n visited() = false;\n\n // Initialize the component size array to 0 for all vertices\n View componentSize(\"componentSize\", N);\n componentSize() = 0;\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component containing it\n if (!visited(i)) {\n int componentSize = 0;\n DFS(A, i, visited, componentSize);\n\n // Update the largest component size if necessary\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n return largestComponentSize;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n int max_component = 0;\n Kokkos::parallel_reduce(\n \"largestComponent\",\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_component) {\n // Perform a depth-first search to count the number of vertices in the\n // connected component containing vertex i.\n std::vector visited(N, false);\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n int v = stack.top();\n stack.pop();\n if (!visited[v]) {\n visited[v] = true;\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A(i, j) && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n },\n max_component);\n return 
max_component;\n}\n", + "\n\n // Initialize the visited array to false\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n\n // Initialize the component size array to 0\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize() = 0;\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited(i)) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n\n // Update the largest component size\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n return largestComponentSize;\n}\n", + "Kokok", + "\n\n // Create a vector to store the size of each component.\n Kokkos::View component_sizes(\"component_sizes\", N);\n\n // Initialize the component sizes to 1.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n component_sizes[i] = 1;\n });\n\n // Iterate over the adjacency matrix.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) {\n // If there is an edge between i and j, then they are in the same component.\n if (A(i, j) == 1) {\n // Find the smaller component size.\n int smaller_component_size = std::min(component_sizes[i], component_sizes[j]);\n\n // Update the component sizes.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int k) {\n if (component_sizes[k] == smaller_component_size) {\n component_sizes[k] = std::max(component_sizes[i], component_sizes[j]);\n }\n });\n }\n }\n });\n\n // Find the largest component size.\n int largest_component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &largest_component_size) {\n largest_component_size = std::max(largest_component_size, component_sizes[i]);\n });\n\n return largest_component_size;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component_size = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int &j, int &local_component_size) {\n if (A(i, j) == 1) {\n local_component_size += 1;\n }\n }, component_size);\n max_component_size = std::max(max_component_size, component_size);\n visited(i) = 1;\n }\n });\n return max_component_size;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t j, int &component_size) {\n if (visited(j) == 0 && A(i, j) == 1) {\n component_size++;\n visited(j) = 1;\n }\n });\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n int max_component = 0;\n Kokkos::parallel_reduce(\n \"Largest Component\",\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_component) {\n int component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n int current = q.front();\n q.pop();\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A(current, j) && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n },\n max_component);\n\n return max_component;\n}\n", + 
"\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited(i) == 0) {\n component_size += 1;\n visited(i) = 1;\n for (size_t j = 0; j < N; ++j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n }\n }\n });\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::TeamPolicy;\n using Kokkos::Work;\n\n // Initialize the visited array\n Kokkos::View visited(\"visited\", N, N);\n parallel_for(TeamPolicy(Kokkos::DefaultExecutionSpace(), N), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) { visited(i, j) = false; }\n });\n\n // Perform DFS on each unvisited node\n int max_component = 0;\n parallel_for(TeamPolicy(Kokkos::DefaultExecutionSpace(), N), KOKKOS_LAMBDA(const int i) {\n if (!visited(i, i)) {\n int component_size = 0;\n dfs(i, i, visited, A, component_size);\n max_component = std::max(max_component, component_size);\n }\n });\n\n return max_component;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::View componentSize(\"componentSize\", N);\n Kokkos::View componentId(\"componentId\", N);\n Kokkos::View queue(\"queue\", N);\n\n int maxComponentSize = 0;\n int componentCount = 0;\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (visited(i) == 0) {\n int start = 0;\n int end = 0;\n queue(end++) = i;\n visited(i) = 1;\n componentId(i) = componentCount;\n while (start != end) {\n int current = queue(start++);\n componentSize(componentId(current))++;\n for (int j = 0; j < N; j++) {\n if (A(current, j) == 1 && visited(j) == 0) {\n queue(end++) = j;\n visited(j) = 1;\n componentId(j) = componentCount;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize(componentCount));\n componentCount++;\n }\n });\n\n return maxComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n int max_component_size = 0;\n\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(Kokkos::View &A, size_t i, Kokkos::View &visited, int &component_size) {\n visited[i] = true;\n component_size++;\n\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(i, j) && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n\n // If the vertex has not been visited, perform a depth-first search to find the size of its component\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit in the stack\n while (!stack.empty()) {\n // Pop the next vertex from the stack\n size_t vertex = stack.top();\n stack.pop();\n\n // If the vertex has not been visited, mark it as visited and add it to the component\n if (!visited[vertex]) {\n visited[vertex] = true;\n componentSize++;\n\n // Push all of the vertex's neighbors onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the largest component size if the current component is larger\n if (componentSize > largestComponentSize) {\n largestComponentSize = componentSize;\n }\n\n // Update the component size array with the size of the current component\n componentSize[i] = componentSize;\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n int max_component_size = 0;\n int component_size = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n #pragma omp parallel for reduction(max:max_component_size)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n delete[] visited;\n return max_component_size;\n}\n", + "\n\n // Create a vector to store the visited nodes.\n std::vector visited(N, false);\n\n // Create a vector to store the component sizes.\n std::vector componentSizes(N, 0);\n\n // Create a parallel region.\n #pragma omp parallel\n {\n // Loop over the nodes.\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, perform a depth-first search to find all the nodes in its component.\n if (! visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n while (! stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (! 
visited[node]) {\n componentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[node * N + j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n componentSizes[i] = componentSize;\n }\n }\n }\n\n // Find the largest component size.\n int largestComponentSize = *std::max_element(componentSizes.begin(), componentSizes.end());\n\n // Return the largest component size.\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, 0);\n int largest_component_size = 0;\n #pragma omp parallel\n {\n int local_largest_component_size = 0;\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n ++component_size;\n visited[vertex] = 1;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n local_largest_component_size = std::max(local_largest_component_size, component_size);\n }\n }\n #pragma omp critical\n largest_component_size = std::max(largest_component_size, local_largest_component_size);\n }\n return largest_component_size;\n}\n", + "\n\n int* visited = new int[N];\n int* componentSize = new int[N];\n int maxComponentSize = 0;\n int componentCount = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = 0;\n dfs(A, i, visited, componentSize, size);\n componentCount++;\n maxComponentSize = std::max(maxComponentSize, size);\n }\n }\n delete[] visited;\n delete[] componentSize;\n return maxComponentSize;\n}\n", + "\n\n int max_size = 0;\n int max_component = -1;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = dfs(A, i, visited);\n if (size > max_size) {\n max_size = size;\n max_component = i;\n }\n }\n }\n delete[] visited;\n return max_size;\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n", + "\n\n int max_component = 0;\n int visited[N];\n memset(visited, 0, sizeof(visited));\n\n #pragma omp parallel for reduction(max:max_component)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n 
component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n\n return max_component;\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n", + "\n\n int N = A.size();\n std::vector component(N, -1);\n int maxComponent = 0;\n int currentComponent = 0;\n #pragma omp parallel for reduction(max:maxComponent) reduction(+:currentComponent)\n for (int i = 0; i < N; i++) {\n if (component[i] == -1) {\n currentComponent = 0;\n dfs(A, i, component, currentComponent);\n maxComponent = std::max(maxComponent, currentComponent);\n }\n }\n return maxComponent;\n}\n\n", + "\n\n int max_component_size = 0;\n int component_size = 0;\n int visited[N];\n memset(visited, 0, sizeof(visited));\n\n #pragma omp parallel for reduction(max: max_component_size)\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0) {\n component_size = 0;\n dfs(i, A, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(int i, std::vector const& A, int visited[], int& component_size) {\n visited[i] = 1;\n component_size++;\n for (int j = 0; j < A.size(); j++) {\n if (A[i][j] == 1 && visited[j] == 0) {\n dfs(j, A, visited, component_size);\n }\n }\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n int max_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n visited[i] = true;\n component_size++;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a 
depth-first search to find the size of the component containing the vertex\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // While the stack is not empty, pop a vertex from the stack and visit all its unvisited neighbors\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n componentSize++;\n\n // Iterate over all neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is unvisited and there is an edge between the vertex and the neighbor, push the neighbor onto the stack and mark it as visited\n if (!visited[j] && A[vertex][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Update the largest component size if the current component size is larger\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Create a vector to store the size of each component.\n std::vector componentSize(N, 0);\n\n // Create a vector to store the component ID for each vertex.\n std::vector componentID(N, -1);\n\n // Initialize the component ID for each vertex.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n componentID[i] = i;\n }\n\n // Iterate over the adjacency matrix.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n // If there is an edge between vertex i and vertex j, then they are in the same component.\n if (A[i][j] == 1) {\n // Find the component ID for vertex i.\n int iComponentID = componentID[i];\n\n // Find the component ID for vertex j.\n int jComponentID = componentID[j];\n\n // If the component IDs are different, then merge the two components.\n if (iComponentID != jComponentID) {\n // Find the smaller component ID.\n int smallerComponentID = std::min(iComponentID, jComponentID);\n\n // Find the larger component ID.\n int largerComponentID = std::max(iComponentID, jComponentID);\n\n // Update the component ID for each vertex in the smaller component.\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n if (componentID[k] == smallerComponentID) {\n componentID[k] = largerComponentID;\n }\n }\n\n // Update the size of the larger component.\n componentSize[largerComponentID] += componentSize[smallerComponentID];\n }\n }\n }\n }\n\n // Find the largest component.\n int largestComponentSize = 0;\n for (size_t i = 0; i < N; i++) {\n largestComponentSize = std::max(largestComponentSize, componentSize[i]);\n }\n\n // Return the size of the largest component.\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component containing the vertex\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // While there are vertices in the stack, pop a vertex and visit all its unvisited neighbors\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n componentSize++;\n\n // Iterate over all neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is unvisited and 
there is an edge between the vertex and the neighbor, push the neighbor onto the stack and mark it as visited\n if (!visited[j] && A[vertex][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Update the largest component size if the current component size is larger\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n int nComponents = 0;\n std::vector visited(N, 0);\n std::vector componentSize(N, 0);\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int size = dfs(A, i, visited, componentSize);\n nComponents++;\n if (size > componentSize[0]) {\n componentSize[0] = size;\n }\n }\n }\n return componentSize[0];\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "serial", + "prompt": "/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector parent(N);\n std::vector size(N);\n for (size_t i = 0; i < N; ++i) {\n parent[i] = i;\n size[i] = 1;\n }\n\n auto find = [&](int x) {\n if (parent[x] != x) {\n parent[x] = find(parent[x]);\n }\n return parent[x];\n };\n\n auto union_ = [&](int x, int y) {\n int rootX = find(x);\n int rootY = find(y);\n if (rootX != rootY) {\n if (size[rootX] < size[rootY]) {\n parent[rootX] = rootY;\n size[rootY] += size[rootX];\n } else {\n parent[rootY] = rootX;\n size[rootX] += size[rootY];\n }\n }\n };\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n if (A[i][j] == 1) {\n union_(i, j);\n }\n }\n }\n\n int maxSize = 0;\n for (size_t i = 0; i < N; ++i) {\n maxSize = std::max(maxSize, size[find(i)]);\n }\n\n return maxSize;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n std::vector parent(N);\n std::vector size(N);\n for (size_t i = 0; i < N; i++) {\n parent[i] = i;\n size[i] = 1;\n }\n\n auto find = [&](int x) {\n if (parent[x] != x) {\n parent[x] = find(parent[x]);\n }\n return parent[x];\n };\n\n auto union_ = [&](int a, int b) {\n int 
rootA = find(a);\n int rootB = find(b);\n if (rootA != rootB) {\n if (size[rootA] > size[rootB]) {\n parent[rootB] = rootA;\n size[rootA] += size[rootB];\n } else {\n parent[rootA] = rootB;\n size[rootB] += size[rootA];\n }\n }\n };\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i][j] == 1) {\n union_(i, j);\n }\n }\n }\n\n int maxSize = 0;\n for (size_t i = 0; i < N; i++) {\n maxSize = std::max(maxSize, size[find(i)]);\n }\n\n return maxSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int max_component_size = 0;\n\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n\n visited[i] = true;\n component_size++;\n\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; i++) {\n int size = 0;\n std::queue q;\n if (visited[i] == 0) {\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 0) {\n visited[v] = 1;\n size++;\n for (size_t j = 0; j < N; j++) {\n if (A[v][j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int max_component_size = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component containing it\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\n// Perform a DFS to find the size of the component containing vertex i\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n // Mark the vertex as visited\n visited[i] = true;\n component_size++;\n\n // Iterate over all adjacent vertices\n for (size_t j = 0; j < A.size(); j++) {\n // If there is an edge between vertex i and vertex j and vertex j has not been visited, perform a DFS on vertex j\n if (A[i][j] == 1 && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = dfs(i, A, visited);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n\nint dfs(size_t i, std::vector const& A, std::vector& visited) {\n if (visited[i] != 0) {\n return 0;\n }\n visited[i] = 1;\n int component_size = 1;\n for (size_t j = 0; j < A.size(); 
++j) {\n if (A[i][j] == 1) {\n component_size += dfs(j, A, visited);\n }\n }\n return component_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int largest = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int size = dfs(A, i, visited);\n largest = std::max(largest, size);\n }\n }\n\n return largest;\n}\n", + "\n\n // Create a vector to store the size of each component.\n std::vector componentSize(N, 0);\n\n // Create a vector to store the component ID for each vertex.\n std::vector componentID(N, -1);\n\n // Initialize the component ID for each vertex.\n for (size_t i = 0; i < N; ++i) {\n componentID[i] = i;\n }\n\n // Iterate over each vertex.\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, perform a DFS to find all vertices in the same component.\n if (componentID[i] == i) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit in the stack, perform a DFS.\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n\n // If the vertex has not been visited, mark it as visited and add it to the stack.\n if (componentID[vertex] == i) {\n componentSize++;\n componentID[vertex] = i;\n\n // Iterate over the neighbors of the vertex.\n for (size_t j = 0; j < N; ++j) {\n // If the neighbor is connected to the vertex and has not been visited, add it to the stack.\n if (A[i][j] == 1 && componentID[j] == i) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the size of the component.\n componentSize[i] = componentSize;\n }\n }\n\n // Return the size of the largest component.\n return *std::max_element(componentSize.begin(), componentSize.end());\n}\n", + "\n\n std::vector visited(N, 0);\n int largest = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = 0;\n dfs(A, visited, i, size);\n largest = std::max(largest, size);\n }\n }\n return largest;\n}\n\nvoid dfs(std::vector const& A, std::vector& visited, size_t i, int& size) {\n if (visited[i] == 1) {\n return;\n }\n visited[i] = 1;\n size++;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] == 1) {\n dfs(A, visited, j, size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n component_size++;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n component_size++;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n dfs(A, i, visited, size);\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited,\n int& size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i * A.size() + j] == 1) {\n dfs(A, j, visited, size);\n }\n }\n}\n", + "\n\n // Perform DFS on each vertex to find all connected vertices\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if 
(!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++component_size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) {\n continue;\n }\n visited[u] = 1;\n size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (int i = 0; i < N; i++) {\n int size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 1) continue;\n visited[v] = 1;\n size++;\n for (int j = 0; j < N; j++) {\n if (A[v][j] == 1 && visited[j] == 0) q.push(j);\n }\n }\n max_size = std::max(max_size, size);\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n if (visited[current] == 0) {\n visited[current] = 1;\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n ++size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++component_size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "hip", + "prompt": "/* Compute the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n Store the result in 
largestComponentSize.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void largestComponent(const int *A, size_t N, int *largestComponentSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n int count = 0;\n if (tid == 0) {\n count = 1;\n for (int i = 1; i < N; i++) {\n if (A[bid * N + i] == 1) {\n count++;\n }\n }\n }\n __syncthreads();\n if (tid == 0) {\n *largestComponentSize = count;\n }\n}\n", + "\n\n\tint tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint local_max = 0;\n\tint global_max = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[tid_y] == 0) {\n\t\tint count = 0;\n\t\tvisited[tid_y] = 1;\n\t\tcount++;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] != 0 && visited[i] == 0) {\n\t\t\t\tcount += dfs(A, N, i, visited);\n\t\t\t}\n\t\t}\n\t\tlocal_max = count;\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (local_max > global_max) {\n\t\t\t\tglobal_max = local_max;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = global_max;\n\t}\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tidy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tif (tidx >= N || tidy >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[tidx] == 0) {\n\t\tcount++;\n\t\tvisited[tidx] = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tidx * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcount += bfs(A, N, i, visited);\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tidx == 0 && tidy == 0) {\n\t\t*largestComponentSize = count;\n\t}\n}\n", + "\n\n\tint tidx = blockIdx.x + blockIdx.y * blockDim.x;\n\tint tidx_x = tidx % N;\n\tint tidx_y = tidx / N;\n\tint componentSize = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tidx * N + i] == 1) {\n\t\t\tcomponentSize++;\n\t\t}\n\t}\n\tatomicAdd(largestComponentSize, componentSize);\n}\n", + "\n\n int localComponentSize = 0;\n int globalComponentSize = 0;\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int start = tid;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (int i = start; i < N; i += stride) {\n if (A[i * N + i] == 0) {\n continue;\n }\n localComponentSize++;\n for (int j = i + 1; j < N; j++) {\n if (A[i * N + j] == 1) {\n localComponentSize++;\n }\n }\n }\n hipAtomicMax(&globalComponentSize, &localComponentSize);\n *largestComponentSize = globalComponentSize;\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tidy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tint size = 0;\n\tif (tidx < N && tidy < N) {\n\t\tif (A[tidx * N + tidy] == 1) {\n\t\t\tsize = 1;\n\t\t\tint stack[N];\n\t\t\tint top = 0;\n\t\t\tstack[top++] = tidx;\n\t\t\twhile (top > 0) {\n\t\t\t\tint v = stack[--top];\n\t\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\t\tif (A[v * N + i] == 1 && i != v) {\n\t\t\t\t\t\tif (i == tidy) {\n\t\t\t\t\t\t\tsize++;\n\t\t\t\t\t\t} else {\n\t\t\t\t\t\t\tstack[top++] = 
i;\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tint max = 0;\n\thipReduceBlockMax(&max, &size, 1);\n\tif (tidx == 0 && tidy == 0) {\n\t\t*largestComponentSize = max;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint blockId = hipBlockIdx_x;\n\tint numBlocks = hipGridDim_x;\n\tint numThreads = hipBlockDim_x * hipGridDim_x;\n\tint visited[N];\n\tint componentSize[N];\n\tint maxComponentSize = 0;\n\tint maxComponentId = -1;\n\tint componentId = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t\tcomponentSize[i] = 0;\n\t}\n\twhile (componentId < N) {\n\t\tint startVertex = -1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (!visited[i]) {\n\t\t\t\tstartVertex = i;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tif (startVertex == -1) {\n\t\t\tbreak;\n\t\t}\n\t\tint queue[N];\n\t\tint head = 0;\n\t\tint tail = 0;\n\t\tqueue[tail++] = startVertex;\n\t\tvisited[startVertex] = 1;\n\t\tcomponentSize[componentId] = 1;\n\t\twhile (head != tail) {\n\t\t\tint vertex = queue[head++];\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tif (A[vertex * N + i] && !visited[i]) {\n\t\t\t\t\tqueue[tail++] = i;\n\t\t\t\t\tvisited[i] = 1;\n\t\t\t\t\tcomponentSize[componentId]++;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tif (componentSize[componentId] > maxComponentSize) {\n\t\t\tmaxComponentSize = componentSize[componentId];\n\t\t\tmaxComponentId = componentId;\n\t\t}\n\t\tcomponentId++;\n\t}\n\t*largestComponentSize = maxComponentSize;\n}\n", + "\n\n int local_max = 0;\n int global_max = 0;\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (A[i * N + j] == 1) {\n local_max = 1;\n for (int k = 0; k < N; k++) {\n if (A[i * N + k] == 1) {\n local_max++;\n }\n if (A[k * N + j] == 1) {\n local_max++;\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx.x == 0 && threadIdx.y == 0) {\n for (int k = 0; k < N * N; k++) {\n if (local_max > global_max) {\n global_max = local_max;\n }\n local_max = 0;\n }\n *largestComponentSize = global_max;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint numThreads = hipGridDim_x * hipBlockDim_x;\n\tint visited[N];\n\tint componentSize = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[tid] == 0) {\n\t\tvisited[tid] = 1;\n\t\tcomponentSize = 1;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tif (A[tid * N + j] == 1 && visited[j] == 0) {\n\t\t\t\tcomponentSize += largestComponent(A, N, &visited[j]);\n\t\t\t}\n\t\t}\n\t}\n\tint maxComponentSize = 0;\n\thipMemcpy(&maxComponentSize, largestComponentSize, sizeof(int), hipMemcpyDeviceToHost);\n\tif (componentSize > maxComponentSize) {\n\t\t*largestComponentSize = componentSize;\n\t}\n\treturn;\n}\n", + "\n\n int *visited = (int *)malloc(N * N * sizeof(int));\n int *componentSize = (int *)malloc(N * N * sizeof(int));\n int maxComponentSize = 0;\n int componentId = 0;\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n visited[i * N + j] = 0;\n componentSize[i * N + j] = 0;\n }\n }\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (visited[i * N + j] == 0 && A[i * N + j] == 1) {\n componentId++;\n int currentComponentSize = 0;\n __syncthreads();\n dfs(A, i, j, N, visited, componentSize, &currentComponentSize, componentId);\n __syncthreads();\n maxComponentSize = max(maxComponentSize, currentComponentSize);\n }\n }\n }\n\n *largestComponentSize = maxComponentSize;\n free(visited);\n free(componentSize);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + 
hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = max(*largestComponentSize, count);\n\t}\n}\n", + "\n\n int myRow = blockIdx.x * blockDim.x + threadIdx.x;\n int myCol = blockIdx.y * blockDim.y + threadIdx.y;\n int myComponentSize = 0;\n if (myRow < N && myCol < N) {\n if (A[myRow * N + myCol] == 1) {\n myComponentSize = 1;\n for (int i = 0; i < N; i++) {\n if (A[myRow * N + i] == 1) {\n myComponentSize++;\n }\n if (A[i * N + myCol] == 1) {\n myComponentSize++;\n }\n }\n }\n }\n atomicMax(largestComponentSize, myComponentSize);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == 0) {\n *largestComponentSize = 0;\n }\n __shared__ int shared_component_size;\n shared_component_size = 0;\n bool visited = false;\n int component_size = 0;\n while (!visited) {\n visited = true;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (A[bid * N + i] == 1) {\n visited = false;\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n A[i * N + j] = 0;\n A[j * N + i] = 0;\n }\n }\n }\n }\n __syncthreads();\n }\n atomicAdd(largestComponentSize, component_size);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i, j, k, count = 0;\n\tif (tid < N) {\n\t\tbool visited[N] = {false};\n\t\tvisited[tid] = true;\n\t\tfor (i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1 && !visited[i]) {\n\t\t\t\tcount++;\n\t\t\t\tfor (j = 0; j < N; j++) {\n\t\t\t\t\tif (A[i * N + j] == 1) {\n\t\t\t\t\t\tfor (k = 0; k < N; k++) {\n\t\t\t\t\t\t\tif (A[j * N + k] == 1 && !visited[k]) {\n\t\t\t\t\t\t\t\tcount++;\n\t\t\t\t\t\t\t\tvisited[k] = true;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == 0) {\n *largestComponentSize = 0;\n }\n __syncthreads();\n int localComponentSize = 0;\n for (int i = 0; i < N; i++) {\n if (A[tid + i * N] == 1) {\n localComponentSize++;\n }\n }\n __syncthreads();\n if (tid == 0) {\n if (localComponentSize > *largestComponentSize) {\n *largestComponentSize = localComponentSize;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int size = 0;\n if (tid < N && tid_y < N) {\n if (A[tid * N + tid_y] == 1) {\n size = 1;\n int *visited = new int[N];\n for (int i = 0; i < N; i++) {\n visited[i] = 0;\n }\n visited[tid] = 1;\n int *stack = new int[N];\n stack[0] = tid;\n while (stack[0] != -1) {\n int v = stack[0];\n stack[0] = -1;\n for (int i = 0; i < N; i++) {\n if (A[v * N + i] == 1 && visited[i] == 0) {\n visited[i] = 1;\n size++;\n stack[0] = i;\n }\n }\n }\n delete[] visited;\n delete[] stack;\n }\n }\n atomicAdd(largestComponentSize, size);\n}\n", + "\n\n\tint myComponentSize = 0;\n\tint 
myId = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (A[N * myId + myId] == 1) {\n\t\tmyComponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[N * myId + i] == 1 && i != myId) {\n\t\t\t\tmyComponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(&*largestComponentSize, myComponentSize);\n}\n", + "(", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_x;\n\tint componentSize = 0;\n\tif (tid == 0) {\n\t\tcomponentSize = 1;\n\t\tlargestComponentSize[bid] = 0;\n\t}\n\t__syncthreads();\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tif (A[bid * N + i] == 1) {\n\t\t\tatomicAdd(&componentSize, 1);\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tif (componentSize > largestComponentSize[bid]) {\n\t\t\tlargestComponentSize[bid] = componentSize;\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Perform a breadth-first search (BFS) from each vertex to find the connected components.\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the max_component_size across all ranks to find the global maximum.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.084019605, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0751373116, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0761373072, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07582223515, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.076195484375, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "_t.", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Perform a 
breadth-first search from each vertex to find the size of each component.\n std::vector visited(N, false);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++componentSize;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Allreduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n std::queue q;\n int componentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n q.push(i);\n visited[i] = 1;\n componentSize = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize++;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest one\n int globalComponentSize;\n MPI_Reduce(&componentSize, &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0813760664, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0795369466, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0786195537, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.077884592, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07765271515, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0795963713625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0794077578375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07947871717813, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = 
true;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0792045628, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0822446576, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0840632426, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08436096665, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.084122741275, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08785407465, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.088737279725, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.09169907404688, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.09789123857969, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.09389534247891, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.09587460209648, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(my_rank);\n visited[my_rank] = true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n local_component_size++;\n for (size_t v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return global_component_size;\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector component_size(num_ranks, 0);\n\n // Perform a DFS from each rank\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int local_component_size = 0;\n std::stack s;\n s.push(i);\n visited[i] = true;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n local_component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n visited[j] = true;\n }\n }\n }\n component_size[my_rank] = std::max(component_size[my_rank], local_component_size);\n }\n }\n\n // Reduce the component size array to find the largest component\n int 
global_component_size = 0;\n MPI_Reduce(&component_size[0], &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0777463482, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0831093328, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0835731012, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08221872405, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.081966876025, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0847993055875, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08371215451875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0836110409375, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08356119899063, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08467228278984, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08407549661172, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int local_max = 0;\n int global_max = 0;\n\n // Perform a DFS on the graph to find the size of the largest component\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n local_max = std::max(local_max, component_size);\n }\n }\n\n // Reduce the local maximum values to find the global maximum\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = true;\n my_component_size = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n ++my_component_size;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&my_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + 
"best_sequential_runtime": 0.0786477492, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0831441326, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0838985395, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08237236785, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.082624011075, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0839527683, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0841526090875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08428292740312, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08481531156875, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08540740275, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0847256894957, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int max_component = 0;\n int my_component = 0;\n int* visited = new int[N];\n for (size_t i = 0; i < N; i++) {\n visited[i] = 0;\n }\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n my_component = 0;\n dfs(i, A, N, visited, my_component);\n max_component = std::max(max_component, my_component);\n }\n }\n delete[] visited;\n int global_max;\n MPI_Allreduce(&max_component, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Perform a breadth-first search from each vertex to find the connected components.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 0) {\n visited[v] = 1;\n componentSize++;\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Allreduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(rank);\n visited[rank] = true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n local_component_size++;\n for (int v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 
1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of vertices in its component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n local_component_size = std::max(local_component_size, dfs(A, i, visited));\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n\n// Depth-first search to compute the number of vertices in a component\nint dfs(std::vector const& A, size_t i, std::vector& visited) {\n if (visited[i]) {\n return 0;\n }\n\n visited[i] = true;\n int component_size = 1;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n component_size += dfs(A, j, visited);\n }\n }\n\n return component_size;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "_t.", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n int local_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_component_size = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n ++local_component_size;\n }\n }\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0804649386, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0805987324, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0809295586, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08078455045, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.080728216025, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.084410542725, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.08573615369375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0909425103375, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.09414055661094, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.14348979727734, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.09107781882148, + "num_procs": 512 + } + ] + }, + { + "generated_output": ":", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n int local_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_component_size = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n local_component_size++;\n }\n }\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.079994453, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0727448086, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0730210794, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0730586272, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.072783664675, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0741929654625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07443807721875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07557915548437, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0754185585625, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07553975070469, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07472759016602, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its component\n std::vector visited(N, 0);\n std::queue q;\n int componentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n q.push(i);\n visited[i] = 1;\n componentSize = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize++;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest one\n int globalComponentSize;\n MPI_Reduce(&componentSize, &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if 
(rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0798383784, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0774367036, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0778618629, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07779216975, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.077520761275, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0809131761125, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07873606906875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0795152711375, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07949649400156, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08014844248047, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07948454057305, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, false);\n int largest_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t v = stack.top();\n stack.pop();\n if (!visited[v]) {\n ++component_size;\n visited[v] = true;\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n largest_component_size = std::max(largest_component_size, component_size);\n }\n }\n\n // Reduce the largest component size across all ranks.\n int global_largest_component_size;\n MPI_Allreduce(&largest_component_size, &global_largest_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_largest_component_size;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the largest component size\n int largestComponentSize = 0;\n\n // Loop over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "cuda", + "prompt": "/* Compute the number of vertices in the largest component of the undirected graph defined by the 
adjacency matrix A.\n Store the result in largestComponentSize.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void largestComponent(const int *A, size_t N, int *largestComponentSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myComponentSize = 0;\n int myId = blockIdx.x * blockDim.x + threadIdx.x;\n if (A[myId * N + myId] == 0) {\n *largestComponentSize = 0;\n return;\n }\n int *visited = new int[N];\n for (int i = 0; i < N; i++) { visited[i] = 0; }\n visited[myId] = 1;\n myComponentSize++;\n for (int i = 0; i < N; i++) {\n if (A[myId * N + i] == 1 && visited[i] == 0) {\n myComponentSize += dfs(A, N, visited, i);\n }\n }\n *largestComponentSize = max(*largestComponentSize, myComponentSize);\n}\n", + "\n\n int localComponentSize = 0;\n int globalComponentSize = 0;\n int threadIdx = threadIdx.x + blockIdx.x * blockDim.x;\n int threadIdy = threadIdx.y + blockIdx.y * blockDim.y;\n if (threadIdx < N && threadIdy < N) {\n if (A[threadIdx * N + threadIdy] == 1) {\n localComponentSize++;\n for (int i = 0; i < N; i++) {\n if (A[threadIdx * N + i] == 1) {\n localComponentSize++;\n }\n if (A[i * N + threadIdy] == 1) {\n localComponentSize++;\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx == 0 && threadIdy == 0) {\n globalComponentSize = localComponentSize;\n }\n for (int i = 1; i < N; i++) {\n if (localComponentSize > globalComponentSize) {\n globalComponentSize = localComponentSize;\n }\n localComponentSize = 0;\n if (threadIdx < N && threadIdy + i < N) {\n if (A[threadIdx * N + threadIdy + i] == 1) {\n localComponentSize++;\n for (int j = 0; j < N; j++) {\n if (A[threadIdx * N + j] == 1) {\n localComponentSize++;\n }\n if (A[(threadIdy + i) * N + j] == 1) {\n localComponentSize++;\n }\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx == 0 && threadIdy == 0) {\n if (localComponentSize > globalComponentSize) {\n globalComponentSize = localComponentSize;\n }\n *largestComponentSize = globalComponentSize;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int tid_row = tid / N;\n int tid_col = tid % N;\n int local_max = 0;\n int global_max = 0;\n\n // Count the number of adjacent vertices\n for (int i = 0; i < N; i++) {\n if (A[tid_row * N + i] == 1) {\n local_max++;\n }\n }\n\n // Find the maximum number of adjacent vertices across all threads\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (local_max > global_max) {\n global_max = local_max;\n }\n }\n\n *largestComponentSize = global_max;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tint count = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n // Each thread computes the size of its component\n int myComponentSize = 0;\n for (int i = 0; i < N; i++) {\n if (A[i * N + blockIdx.x] == 1) {\n myComponentSize++;\n }\n }\n\n // Reduce the component sizes across all threads\n __shared__ int sharedComponentSize[32];\n sharedComponentSize[threadIdx.x] = myComponentSize;\n __syncthreads();\n for (int i = 1; i < 32; i *= 2) {\n if (threadIdx.x % (2 * i) == 0) {\n sharedComponentSize[threadIdx.x] += sharedComponentSize[threadIdx.x + 
i];\n }\n __syncthreads();\n }\n\n // Store the largest component size in the output variable\n if (threadIdx.x == 0) {\n *largestComponentSize = sharedComponentSize[0];\n }\n}\n", + "\n\n\t// Initialize the largest component size to 0\n\t*largestComponentSize = 0;\n\n\t// Iterate over each vertex\n\tfor (int i = 0; i < N; i++) {\n\t\t// If the vertex is not visited, perform a depth-first search to find the size of the component\n\t\tif (A[i * N + i] == 0) {\n\t\t\tint componentSize = 0;\n\t\t\tdepthFirstSearch(A, N, i, &componentSize);\n\n\t\t\t// Update the largest component size if necessary\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\n\tint count = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcount = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*largestComponentSize = max(*largestComponentSize, count);\n\t}\n}\n", + "\n\n // Each thread checks if it is in the largest component.\n int myComponentSize = 0;\n for (int i = blockIdx.x; i < N; i += blockDim.x) {\n for (int j = blockIdx.y; j < N; j += blockDim.y) {\n if (A[i * N + j] == 1) {\n myComponentSize++;\n }\n }\n }\n\n // Each thread writes its component size to shared memory.\n __shared__ int sharedComponentSize[blockDim.x * blockDim.y];\n sharedComponentSize[threadIdx.x * blockDim.y + threadIdx.y] = myComponentSize;\n\n // Each thread block finds the maximum component size in shared memory.\n __syncthreads();\n int maxComponentSize = 0;\n for (int i = 0; i < blockDim.x * blockDim.y; i++) {\n maxComponentSize = max(maxComponentSize, sharedComponentSize[i]);\n }\n\n // The thread with the maximum component size writes it to global memory.\n if (threadIdx.x == 0 && threadIdx.y == 0) {\n *largestComponentSize = maxComponentSize;\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 0) {\n\t\treturn;\n\t}\n\tint componentSize = 0;\n\tint visited[N];\n\tfor (int k = 0; k < N; k++) {\n\t\tvisited[k] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[i] == 0) {\n\t\tcomponentSize = dfs(A, N, i, visited);\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tif (componentSize > *largestComponentSize) {\n\t\t\t*largestComponentSize = componentSize;\n\t\t}\n\t}\n}\n", + "\n\n int myComponentSize = 0;\n int myComponentId = threadIdx.x + blockIdx.x * blockDim.x;\n if (myComponentId < N) {\n bool visited[N];\n for (int i = 0; i < N; i++) { visited[i] = false; }\n visited[myComponentId] = true;\n for (int i = 0; i < N; i++) {\n if (A[myComponentId * N + i] && !visited[i]) {\n myComponentSize += 1;\n dfs(A, N, visited, i);\n }\n }\n }\n __syncthreads();\n int maxComponentSize = 0;\n for (int i = 0; i < N; i++) {\n maxComponentSize = max(maxComponentSize, myComponentSize);\n }\n *largestComponentSize = maxComponentSize;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\n\tint row = tid / N;\n\tint col = tid % N;\n\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\n\tint componentSize = 1;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\n\tvisited[row] = 1;\n\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row 
* N + i] == 1 && visited[i] == 0) {\n\t\t\tcomponentSize += dfs(A, N, i, visited);\n\t\t}\n\t}\n\n\tif (componentSize > *largestComponentSize) {\n\t\t*largestComponentSize = componentSize;\n\t}\n}\n", + "\n\n\tint myRow = blockIdx.x * blockDim.x + threadIdx.x;\n\tint myCol = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (myRow < N && myCol < N) {\n\t\tif (A[myRow * N + myCol] == 1) {\n\t\t\tint componentSize = 0;\n\t\t\tint visited[N];\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tvisited[i] = 0;\n\t\t\t}\n\t\t\tdfs(myRow, myCol, A, N, visited, &componentSize);\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint componentSize = 0;\n\tint maxComponentSize = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tfor (int i = 0; i < N; i++) {\n\t\tif (visited[i] == 0) {\n\t\t\tcomponentSize = 0;\n\t\t\tdfs(A, i, visited, &componentSize);\n\t\t\tmaxComponentSize = max(maxComponentSize, componentSize);\n\t\t}\n\t}\n\t*largestComponentSize = maxComponentSize;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid_row == tid_col) {\n\t\tint componentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_row * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = max(*largestComponentSize, componentSize);\n\t}\n}\n", + "\n\n\tint myComponentSize = 0;\n\tint myId = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (A[N * myId + myId] == 1) {\n\t\tmyComponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[N * myId + i] == 1 && i != myId) {\n\t\t\t\tmyComponentSize += 1;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, myComponentSize);\n}\n", + "\n\n\tint threadIdx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint threadIdy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (threadIdx < N && threadIdy < N) {\n\t\tif (A[threadIdx * N + threadIdy] == 1) {\n\t\t\t// Found a 1 in the adjacency matrix.\n\t\t\t// Perform a breadth-first search to find the size of the component.\n\t\t\tint componentSize = 0;\n\t\t\tstd::queue q;\n\t\t\tq.push(threadIdx);\n\t\t\twhile (!q.empty()) {\n\t\t\t\tint v = q.front();\n\t\t\t\tq.pop();\n\t\t\t\tcomponentSize++;\n\t\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\t\tif (A[v * N + i] == 1 && i != v) {\n\t\t\t\t\t\tq.push(i);\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t\t// Update the largest component size if necessary.\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, componentSize);\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, componentSize);\n}\n", + "\n\n\tint localLargestComponentSize 
= 0;\n\tint globalLargestComponentSize = 0;\n\tint myRow = blockIdx.y * blockDim.y + threadIdx.y;\n\tint myCol = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (myRow < N && myCol < N) {\n\t\tif (A[myRow * N + myCol] == 1) {\n\t\t\tlocalLargestComponentSize++;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (threadIdx.x == 0 && threadIdx.y == 0) {\n\t\tfor (int i = 1; i < gridDim.x * gridDim.y; i++) {\n\t\t\tif (localLargestComponentSize < globalLargestComponentSize) {\n\t\t\t\tlocalLargestComponentSize = globalLargestComponentSize;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = localLargestComponentSize;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\t*largestComponentSize = max(*largestComponentSize, componentSize);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce to find the largest component size\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform DFS on each rank\n std::vector visited(N, false);\n int max_component_size = 0;\n #pragma omp parallel\n {\n int local_max_component_size = 0;\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n std::stack s;\n s.push(i);\n int component_size = 0;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n if (!visited[u]) {\n visited[u] = true;\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n }\n }\n }\n }\n local_max_component_size = std::max(local_max_component_size, component_size);\n }\n }\n #pragma omp critical\n {\n max_component_size = std::max(max_component_size, 
local_max_component_size);\n }\n }\n\n // Gather the maximum component size from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int recv_max_component_size;\n MPI_Recv(&recv_max_component_size, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n max_component_size = std::max(max_component_size, recv_max_component_size);\n }\n } else {\n MPI_Send(&max_component_size, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n\n return max_component_size;\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int largest_component_size;\n MPI_Reduce(&my_component_size, &largest_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find the connected components.\n std::vector visited(N, 0);\n std::vector component_size(N, 0);\n int max_component_size = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n int component_size = 0;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n component_size[i] = component_size;\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n MPI_Allreduce(&max_component_size, &max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank 
computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int largest_component_size;\n MPI_Reduce(&my_component_size, &largest_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the size of its own component\n int local_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_size = std::max(local_size, size);\n }\n }\n\n // Reduce the local sizes to find the global maximum\n int global_size;\n MPI_Reduce(&local_size, &global_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n // Perform a breadth-first search from each vertex to find all the vertices in the same component.\n int max_component_size = 0;\n #pragma omp parallel for reduction(max : max_component_size)\n for (size_t i = 0; i < N; ++i) {\n if (A[i][i] == 0) {\n continue;\n }\n\n // Initialize the queue with the current vertex.\n std::queue queue;\n queue.push(i);\n\n // Mark the current vertex as visited.\n A[i][i] = 0;\n\n // Perform a breadth-first search to find all the vertices in the same component.\n int component_size = 0;\n while (!queue.empty()) {\n size_t current_vertex = queue.front();\n queue.pop();\n component_size++;\n\n // Add all the unvisited neighbors of the current vertex to the queue.\n for (size_t j = 0; j < N; ++j) {\n if (A[current_vertex][j] == 1 && A[j][j] != 0) {\n queue.push(j);\n A[j][j] = 0;\n }\n }\n }\n\n max_component_size = std::max(max_component_size, component_size);\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n componentSize++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 
&& visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Reduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (rank == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(size, 0);\n\n // Perform a DFS on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int localComponentSize = 0;\n std::stack s;\n s.push(i);\n visited[i] = true;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n localComponentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n visited[j] = true;\n }\n }\n }\n componentSize[rank] = std::max(componentSize[rank], localComponentSize);\n }\n }\n\n // Reduce the component size array to find the largest component\n int globalComponentSize;\n MPI_Reduce(&componentSize[0], &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform a breadth-first search (BFS) from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n componentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Reduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (rank == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n // Perform a breadth-first search to find the largest component.\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the max_component_size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the result on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each 
rank computes the number of vertices in its largest component\n int local_max = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int count = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n count++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_max = std::max(local_max, count);\n }\n }\n\n // Reduce the local maxima to find the global maximum\n int global_max;\n MPI_Reduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the global maximum on rank 0\n if (my_rank == 0) {\n return global_max;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size\n int largestComponentSize = 0;\n\n // Loop over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n\n // Update the largest component size\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Reduce the largest component size to rank 0\n int globalLargestComponentSize;\n MPI_Reduce(&largestComponentSize, &globalLargestComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size\n return globalLargestComponentSize;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(N, 0);\n\n // Initialize the maximum component size\n int maxComponentSize = 0;\n\n // Loop over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited\n if (!visited[i]) {\n // Perform a depth-first search to find the size of the component containing vertex i\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n componentSize++;\n // Push all adjacent vertices onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the maximum component size\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Return the maximum component size\n return maxComponentSize;\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int max_component_size = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, 
MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int global_component_size;\n MPI_Reduce(&my_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n std::vector componentSize(N, 0);\n int largest = 0;\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n componentSize[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize[i]++;\n }\n }\n }\n largest = std::max(largest, componentSize[i]);\n }\n }\n\n int globalLargest = 0;\n MPI_Reduce(&largest, &globalLargest, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalLargest;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector component_size(num_ranks, 0);\n\n // Perform a parallel depth-first search to find the largest component\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int local_component_size = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n ++local_component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex * N + j] && !visited[j]) {\n stack.push(j);\n 
visited[j] = true;\n }\n }\n }\n component_size[my_rank] = std::max(component_size[my_rank], local_component_size);\n }\n }\n\n // Reduce the component size array to find the largest component\n int global_component_size = 0;\n MPI_Reduce(component_size.data(), &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n\t Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n if (visited[i] == 0) {\n visited[i] = 1;\n Kokkos::View stack(\"stack\", N);\n size_t stack_size = 0;\n stack[stack_size++] = i;\n while (stack_size > 0) {\n size_t j = stack[--stack_size];\n for (size_t k = 0; k < N; k++) {\n if (A(j, k) == 1 && visited[k] == 0) {\n visited[k] = 1;\n stack[stack_size++] = k;\n }\n }\n }\n }\n });\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n count += (visited[i] == 1);\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n componentCount(A, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n dfs(A, visited, j, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::atomic_fetch_add(&visited(j), 1);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t k) {\n if (A(j, k) == 1 && visited(k) == 0) {\n Kokkos::atomic_fetch_add(&visited(k), 1);\n }\n });\n }\n });\n }\n }\n return count;\n}\n", + "\n\n KokConfigService::getSingleton().setNumThreads(N);\n KokConfigService::getSingleton().setParallelMode(KokConfigService::ParallelMode::OMP);\n KokConfigService::getSingleton().setOmpNumThreads(N);\n\n Kok::View counts(N);\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) { counts[i] = i; });\n\n Kok::parallel_for(Kok::Range(0, 
N), KOK_LAMBDA(int i) {\n for (size_t j = 0; j < N; j++) {\n if (A(i, j) == 1 && i != j) { counts[j] = counts[i]; }\n }\n });\n\n Kok::View unique_counts(N);\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) { unique_counts[i] = counts[i]; });\n Kok::sort(unique_counts);\n int num_components = 0;\n for (size_t i = 0; i < N; i++) {\n if (i == 0 || unique_counts[i] != unique_counts[i - 1]) { num_components++; }\n }\n return num_components;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n count++;\n dfs(A, i, visited);\n }\n }\n\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n visit(A, i, visited);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited[k] == 0) { visited[k] = 1; }\n });\n visited[j] = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n ++count;\n dfs(A, visited, i, count);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View labels(\"labels\", N);\n int num_components = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (labels(i) == 0) {\n int new_label = ++num_components;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &j) {\n if (A(i, j) == 1 && labels(j) == 0) {\n labels(j) = new_label;\n }\n });\n }\n });\n return num_components;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n dfs(A, visited, j, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n dfs(A, visited, i, N);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const 
size_t &k) {\n if (A(j, k) == 1 && visited[k] == 0) { visited[k] = 1; }\n });\n visited[j] = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::atomic_fetch_add(&visited(j), 1);\n componentCount(A, j);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n componentCount(A, N, j);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1) visited(k) = 1;\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int nComponents = 0;\n std::vector visited(N, 0);\n\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n nComponents++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t curr = q.front();\n q.pop();\n if (visited[curr] == 0) {\n visited[curr] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[curr][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return nComponents;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) continue;\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n count++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n bool 
visited = false;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n if (!visited) {\n count++;\n visited = true;\n }\n break;\n }\n }\n }\n return count;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t curr = q.front();\n q.pop();\n if (visited[curr] == 0) {\n visited[curr] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[curr][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false for all nodes\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all nodes\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all nodes in the component\n dfs(A, i, visited);\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int* visited = new int[N];\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n delete[] visited;\n return count;\n}\n", + "\n\n int nComponents = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n nComponents++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n delete[] visited;\n return nComponents;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n count++;\n }\n }\n return count;\n}\n", + "\n\n int num_components = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n visited[current] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[current * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n delete[] visited;\n return num_components;\n}\n", + "\n\n int num_components = 0;\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n bool is_new_component = true;\n for (size_t j = 0; j < i; j++) {\n if (A[i * N + j] == 1) {\n is_new_component = false;\n break;\n }\n }\n if (is_new_component) { num_components++; }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n\n#pragma omp 
parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) { q.push(j); }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t v = stack.top();\n stack.pop();\n if (!visited[v]) {\n visited[v] = true;\n // Push all the unvisited neighbors of v onto the stack\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n num_components++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n num_components++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false for all nodes\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all nodes\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over all neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the number of connected components\n return componentCount;\n}\n", + "_", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) {\n 
q.push(j);\n }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n#pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t cur = q.front();\n q.pop();\n if (visited[cur] == 0) {\n visited[cur] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[cur][j] == 1) { q.push(j); }\n }\n }\n }\n }\n }\n delete[] visited;\n return num_components;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, false);\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n dfs(A, visited, i);\n count++;\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "serial", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(i, A, visited);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(size_t i, std::vector const& A, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(j, A, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, visited, i, N);\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, visited, i);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, std::vector& visited, size_t i) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, visited, j);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + ":", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, 
visited);\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(i, A, visited);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(size_t i, std::vector const& A, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(j, A, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, visited, i);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n dfs(A, i, visited);\n count++;\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "hip", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void componentCount(const int *A, size_t N, int *numComponents) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tif (count == 0) {\n\t\tcount = 1;\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n\tint tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\tif (tid >= N || tid_y >= N) {\n\t\treturn;\n\t}\n\tint component_id = tid;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tcomponent_id = min(component_id, i);\n\t\t}\n\t}\n\tatomicAdd(&numComponents[component_id], 1);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tint root = tid;\n\twhile (A[root * N + bid] != root) {\n\t\troot = A[root * N + bid];\n\t}\n\tif (tid == 0) {\n\t\tA[bid * N + bid] = root;\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint numThreads = hipGridDim_x * hipBlockDim_x;\n\tint label = tid;\n\tint numLabels = numThreads;\n\tbool changed = true;\n\twhile (changed) {\n\t\tchanged = false;\n\t\tfor (int i = tid; i < N; i += numThreads) {\n\t\t\tfor (int j = 0; j < N; j++) {\n\t\t\t\tif (A[i * N + j] && label > j) {\n\t\t\t\t\tlabel = j;\n\t\t\t\t\tchanged = true;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t\tif (tid == 0) {\n\t\t\tif (label < numLabels) {\n\t\t\t\tnumLabels = label;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n\t*numComponents = numLabels;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] != 0) {\n count++;\n for (int j = i + 1; j < N; j++) {\n if (A[tid * N + j] != 0) {\n count++;\n }\n }\n break;\n }\n }\n atomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint component = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcomponent = 1;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tatomicAdd(numComponents, component);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\tif (A[i * N + j] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N || tid_y >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tbool visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = false;\n\t}\n\tvisited[tid] = true;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N 
+ i] == 1 && !visited[i]) {\n\t\t\tcount++;\n\t\t\t__syncblocks();\n\t\t\tDFS(A, i, visited, N);\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx.x + hipBlockIdx.x * hipBlockDim.x;\n\tint bid = hipBlockIdx.y + hipBlockIdx.z * hipGridDim.z;\n\tint i = tid + bid * hipBlockDim.x * hipGridDim.x;\n\tint count = 0;\n\twhile (i < N * N) {\n\t\tif (A[i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t\ti += N * N;\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint id = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (id >= N) return;\n\tint count = 0;\n\tbool visited[N];\n\tmemset(visited, 0, sizeof(visited));\n\tfor (int i = 0; i < N; i++) {\n\t if (visited[i]) continue;\n\t visited[i] = true;\n\t count++;\n\t for (int j = 0; j < N; j++) {\n\t if (A[i * N + j] == 0) continue;\n\t visited[j] = true;\n\t }\n\t}\n\tnumComponents[id] = count;\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tint count = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tint n = hipBlockDim_x * hipGridDim_x;\n\tint count = 0;\n\tif (tid == 0) {\n\t\tcount = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*numComponents = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The component count is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n int global_count = 0;\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n local_count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM);\n if (rank == 0) {\n return global_count;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its own subset of rows.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N];\n std::fill(visited, visited + N, false);\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && !visited[j]) {\n ++localCount;\n std::queue q;\n q.push(j);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n for (size_t v = 0; v < N; ++v) {\n if (A[u * N + v] && !visited[v]) {\n q.push(v);\n }\n }\n }\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int component_count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n component_count++;\n }\n }\n\n int global_component_count;\n MPI_Reduce(&component_count, &global_component_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_component_count;\n }\n return 0;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0835847036, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0729391326, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // 
Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit\n while (!stack.empty()) {\n // Pop the top vertex from the stack\n size_t vertex = stack.top();\n stack.pop();\n\n // Mark the vertex as visited\n visited[vertex] = true;\n\n // Iterate over all adjacent vertices\n for (size_t j = 0; j < N; j++) {\n // If the adjacent vertex is connected and has not been visited, push it onto the stack\n if (A[vertex][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n count++;\n std::queue q;\n q.push(i);\n\n // Perform a breadth-first search to visit all nodes in the component\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n\n // Iterate over the neighbors of u\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all the unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return 
globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of connected components in its portion of the graph.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n localCount++;\n break;\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n count++;\n while (!q.empty()) {\n int cur = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[cur * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) return global_count;\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0845572542, + "runs": [ + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.0740467426, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int n_components = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n n_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n\n int global_n_components;\n MPI_Allreduce(&n_components, &global_n_components, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_n_components;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n\n int globalCount;\n MPI_Reduce(&count, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.083231197, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0726962444, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then push the neighbor onto the stack\n if (!visited[j] && A[i][j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + 
"did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Count the number of connected components\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) continue;\n visited[u] = true;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.078723571, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1216199038, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its local rows\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited = false;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] && !visited) {\n localCount++;\n visited = true;\n }\n }\n }\n\n // Reduce the local counts to get the global count\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + 
"are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t u = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[u] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then push the neighbor onto the stack\n if (!visited[j] && A[i][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n // Iterate over the neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its local part of the graph\n int local_count = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N] = {false};\n for (size_t j = 0; j < N; j++) {\n if (!visited[j] && A[i][j] == 1) {\n // Found a new component\n local_count++;\n std::queue q;\n q.push(j);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n for 
(size_t v = 0; v < N; v++) {\n if (!visited[v] && A[u][v] == 1) {\n q.push(v);\n }\n }\n }\n }\n }\n }\n\n // Reduce the local counts to get the global count\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is connected and has not been visited, then add it to the stack\n if (A[node][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of connected components in its own subset of rows.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N] = {false};\n for (size_t j = 0; j < N; j++) {\n if (!visited[j] && A[i][j] == 1) {\n localCount++;\n dfs(A, visited, j, N);\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "cuda", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void componentCount(const int *A, size_t N, int *numComponents) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tint row = tid / N;\n\tint col = tid % N;\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\tatomicAdd(numComponents, 1);\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] != 0) {\n\t\t\tA[row * N + i] = 0;\n\t\t\tA[i * N + row] = 0;\n\t\t\tcomponentCount<<<1, N>>>(A, N, numComponents);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = -1;\n\t\t} else {\n\t\t\tnumComponents[tid] = tid;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && numComponents[tid] != -1) {\n\t\t\tnumComponents[k] = numComponents[tid];\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid < N && tid_y < N) {\n\t if (A[tid * N + tid_y] == 1) {\n\t atomicAdd(numComponents, 1);\n\t }\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tint row = tid / N;\n\tint col = tid % N;\n\tif (A[tid] == 0) {\n\t\tnumComponents[tid] = 0;\n\t\treturn;\n\t}\n\tint component = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] == 1) {\n\t\t\tcomponent = i;\n\t\t\tbreak;\n\t\t}\n\t}\n\tnumComponents[tid] = component;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tbool visited = false;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tvisited = true;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tif (visited) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\t// Each thread checks if the current element is non-zero.\n\t// If so, it increments the component count.\n\tint row = blockIdx.x;\n\tint col = blockIdx.y;\n\tif (A[row * N + col] != 0) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint bid = blockIdx.y * blockDim.y + blockIdx.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[bid] == 0) {\n\t\tvisited[bid] = 1;\n\t\t(*numComponents)++;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcomponentCount(A, N, numComponents);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\tif (A[i * N + 
j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = 0;\n\t\t} else {\n\t\t\tnumComponents[tid] = 1;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tint numComponents_local = numComponents[tid];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && numComponents_local != -1) {\n\t\t\tnumComponents[k] = numComponents_local;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (i == 0) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (numComponents[k] != -1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t*numComponents = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint bid = blockIdx.y * blockDim.y + blockIdx.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[bid] == 0) {\n\t\tvisited[bid] = 1;\n\t\t*numComponents += 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcomponentCount(A, N, numComponents);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tidx = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tidy = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tidx >= N || tidy >= N) {\n\t\treturn;\n\t}\n\tif (A[tidx * N + tidy] == 0) {\n\t\treturn;\n\t}\n\tatomicAdd(numComponents, 1);\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[tid] == 0) {\n\t\tvisited[tid] = 1;\n\t\t__syncthreads();\n\t\tint count = 1;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tif (A[tid * N + j] == 1 && visited[j] == 0) {\n\t\t\t\tvisited[j] = 1;\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t\tatomicAdd(numComponents, count);\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\tint component = tid;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tcomponent = min(component, i);\n\t\t}\n\t}\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tA[tid_y * N + i] = component;\n\t\t}\n\t}\n\tif (tid == component) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = 0;\n\t\t} else {\n\t\t\tnumComponents[tid] = 1;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tint numComponents_local = numComponents[tid];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1) {\n\t\t\tnumComponents_local = min(numComponents_local, numComponents[k * N + j]);\n\t\t}\n\t}\n\tnumComponents[tid] = numComponents_local;\n\t__syncthreads();\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tcount += (numComponents[k * N + j] == numComponents_local);\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x 
* blockDim.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid_row == tid_col) {\n\t\tif (A[tid] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j && A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tidx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tidy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (tidx >= N || tidx >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tidx * N + i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The component count is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n local_count++;\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+: count)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top vertex from the stack\n size_t v = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and is connected to the vertex, add it to the stack\n if (!visited[j] && A[i][j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n count++;\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a visited array to keep track of which nodes 
have been visited\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over all the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then add the neighbor to the stack\n if (!visited[j] && A[node * N + j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int count = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (visited[u] == 0) {\n count++;\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) { q.push(j); }\n }\n }\n }\n local_count += count > 0;\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) { return global_count; }\n return -1;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; 
++j) {\n // If the neighbor has not been visited and is connected to the node, then visit the neighbor\n if (!visited[j] && A[i][j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n count++;\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Visit all the unvisited neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && !visited[j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n MPI_Reduce(&componentCount, &componentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push 
all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a DFS to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // Perform a DFS to visit all connected nodes\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack and mark it as visited\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n while (!stack.empty()) {\n // Get the current node\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the current node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and is connected to the current node, then add it to the stack\n if (!visited[j] && A[i][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n MPI_Reduce(&componentCount, nullptr, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return componentCount;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n int global_count = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n int my_start = rank * (N / size);\n int my_end = (rank + 1) * (N / size);\n if (rank == size - 1) {\n my_end = N;\n }\n #pragma omp parallel for reduction(+ : local_count)\n for (int i = my_start; i < my_end; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_count++;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n delete[] visited;\n if (rank == 0) {\n return global_count;\n }\n return 
-1;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[j] = true;\n\n // Push all the unvisited neighbors of the node onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[i * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all the neighbors of the node onto the stack\n for (size_t j = 0; j < N; ++j) {\n if (A[node * N + j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) 
{\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n // Push all the unvisited neighbors of the vertex onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n count++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; k++) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count across all ranks\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the connected components for a subset of the rows\n int start_row = rank * (N / size);\n int end_row = (rank + 1) * (N / size);\n if (rank == size - 1) {\n end_row = N;\n }\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Count the number of connected components\n int count = 0;\n for (int i = start_row; i < end_row; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n\n // Reduce the count across all ranks\n int global_count;\n 
MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "&" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A represents a directed graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) { count += A(i, j); });\n });\n return count;\n}\n", + "\n\n int edge_count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { edge_count += A(i, j); }\n });\n return edge_count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(\"edgeCount\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) {\n count += A(i, j);\n }\n });\n return count;\n}\n", + "\n\n Kokkos::View edgeCount(\"edgeCount\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n edgeCount(i) = 0;\n for (size_t j = 0; j < N; ++j) { edgeCount(i) += A(i, j); }\n });\n int totalEdgeCount = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &total) { total += edgeCount(i); }, totalEdgeCount);\n return totalEdgeCount;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, int &edgeCount) {\n for (size_t j = 0; j < N; ++j) {\n edgeCount += A(i, j);\n }\n },\n edgeCount);\n return edgeCount;\n}\n", + "\n\n int edge_count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n if (A(i, j) > 0) {\n ++edge_count;\n }\n }\n });\n return edge_count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, 
KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) { count += A(i, j); });\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n count += A(i, j);\n }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA (const int i) {\n for (int j = 0; j < N; j++) {\n if (A(i, j) != 0) {\n count++;\n }\n }\n });\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) { count += A[i * N + j] > 0; }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int edge_count = 0;\n#pragma omp parallel for reduction(+ : edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n edge_count += A[i * N + j];\n }\n }\n return edge_count;\n}\n", + "\n\n int count = 0;\n#pragma opm parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n 
}\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i][j] > 0) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "serial", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A represents a directed graph.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": 
"cpp", + "name": "15_graph_edge_count", + "parallelism_model": "hip", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n Store the result in numEdges. A represents a directed graph.\n A is an NxN adjacency matrix stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void edgeCount(const int *A, size_t N, int *numEdges) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int count = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n hipAtomicAdd(numEdges, count);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[i * N + j];\n }\n }\n __syncthreads();\n atomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint numEdgesLocal = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tnumEdgesLocal += A[tid * N + i];\n\t}\n\thipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[i * N + j] > 0;\n }\n }\n atomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n int tid = threadIdx.x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += (A[tid * N + i] != 0);\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n for (int j = 0; j < N; j++) {\n if (A[tid * N + j] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t j = 0; j 
< N; j++) {\n numEdgesPerThread += A[tid * N + j];\n }\n hipAtomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int row = tid;\n int col = tid;\n int count = 0;\n while (row < N) {\n count += A[row * N + col];\n row += hipBlockDim_x;\n col += hipBlockDim_x;\n }\n hipAtomicAdd(numEdges, count);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0026771166, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0650521446, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.002778788, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.009069429, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j 
< N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0026422986, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.065404994, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.002695335, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0665092554, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0037887314, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0639408684, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.002763856, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0672863172, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0033004562, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0071701242, + "num_procs": 
1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0027401032, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0109085164, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0033331102, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0077127902, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0033579568, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0082044714, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0027319478, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0649699474, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": ", 
\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2 Saxe\u51a2 Saxe\u51a2\u51a2\u51a2 Saxe\u51a2\u51a2\u51a2\u51a2", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0031820088, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0647860628, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0026791544, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.010950822, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0026598658, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0654556092, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + 
"best_sequential_runtime": 0.0033157852, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0110831218, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] != 0) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0033935094, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.064200181, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0027084678, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0653015046, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0026535136, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0649506986, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0026572866, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0641148218, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "cuda", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n Store the result in numEdges. A represents a directed graph.\n A is an NxN adjacency matrix stored in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void edgeCount(const int *A, size_t N, int *numEdges) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x;\n int count = 0;\n for (int col = 0; col < N; col++) {\n count += A[row * N + col];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int rowStart = blockIdx.x * blockDim.x + threadIdx.x;\n if (rowStart >= N) {\n return;\n }\n int rowEnd = rowStart + blockDim.x;\n if (rowEnd > N) {\n rowEnd = N;\n }\n int count = 0;\n for (int i = rowStart; i < rowEnd; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] != 0) {\n count++;\n }\n }\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int j = 0; j < N; j++) {\n if (A[tid * N + j] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\nint rowStart = blockIdx.x * blockDim.x + threadIdx.x;\nif (rowStart >= N) {\n return;\n}\nint rowEnd = rowStart + blockDim.x;\nif (rowEnd > N) {\n rowEnd = N;\n}\nint count = 0;\nfor (int i = rowStart; i < rowEnd; ++i) {\n for (int j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n}\natomicAdd(numEdges, count);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; ++col) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row < N && col < N) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] == 1) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row >= N) {\n return;\n }\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; ++col) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n 
}\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma parallel for reduction (+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Allgather(&local_count, 1, MPI_INT, &global_count, 1, MPI_INT, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n int nthreads;\n omp_set_num_threads(nthreads);\n int local_count = 0;\n int global_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (myrank == 0) {\n return global_count;\n } else {\n return 0;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 
0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n int global_count;\n\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "\n\n int local_edge_count = 0;\n int global_edge_count;\n #pragma omp parallel for reduction(+: local_edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_edge_count += A[i * N + j];\n }\n }\n MPI_Reduce(&local_edge_count, &global_edge_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_edge_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_edge_count = 0;\n #pragma omp parallel for reduction(+: local_edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_edge_count += A[i * N + j];\n }\n }\n int global_edge_count;\n MPI_Reduce(&local_edge_count, &global_edge_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_edge_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n 
}\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n int global_count = 0;\n #pragma omp parallel for reduction(+: local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max = 0;\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; ++i) {\n // Iterate over the columns of the adjacency matrix\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between node i and node j, increment the degree of node i\n if (A(i, j) == 1) {\n ++max;\n }\n }\n }\n // Return the maximum degree\n return max;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n Kokkos::View maxDegree(\"maxDegree\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n maxDegree(i) = degree;\n });\n return *Kokkos::max_element(maxDegree);\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &max_degree) {\n int degree = 0;\n for (int j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A(i, j);\n }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &maxDegree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n maxDegree = Kokkos::max(maxDegree, degree);\n });\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, 
KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::View;\n\n int max_degree = 0;\n Kokkos::parallel_reduce(\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_degree) {\n int degree = 0;\n for (int j = 0; j < N; j++) {\n degree += A(i, j);\n }\n max_degree = std::max(max_degree, degree);\n },\n max_degree);\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n if (degree > max_degree) { max_degree = degree; }\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n Kokkos::View maxDegree(\"maxDegree\", N);\n Kokkos::parallel_for(\"maxDegree\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A(i, j);\n }\n maxDegree(i) = degree;\n });\n int max = 0;\n for (size_t i = 0; i < N; i++) {\n if (maxDegree(i) > max) {\n max = maxDegree(i);\n }\n }\n return max;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "_t>", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n 
max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "serial", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_deg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n max_deg = std::max(max_deg, deg);\n }\n return max_deg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) { deg += A[i * N + j]; }\n maxDeg = std::max(maxDeg, deg);\n }\n return maxDeg;\n}\n", + "\n\n int max_deg = 0;\n for (int i = 0; i < N; i++) {\n int deg = 0;\n for (int j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n max_deg = std::max(max_deg, deg);\n }\n return max_deg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i][j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int 
max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) { deg += A[i * N + j]; }\n maxDeg = std::max(maxDeg, deg);\n }\n return maxDeg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "hip", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Store the result in maxDegree.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void maxDegree(const int *A, size_t N, int *maxDegree) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n localMax = max(localMax, *maxDegree);\n if (threadIdx.x == 0) {\n *maxDegree = localMax;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n atomicMax(&(*maxDegree), max);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n hipAtomicMax(&(*maxDegree), degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n hipAtomicMax(maxDegree, degree);\n}\n", + "\n\n int myDegree = 0;\n for (int i = blockIdx.x * blockDim.x + 
threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; ++i) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint row = bid * hipBlockDim_x + tid;\n\tint degree = 0;\n\tif (row < N) {\n\t\tfor (int col = 0; col < N; col++) {\n\t\t\tdegree += A[row * N + col];\n\t\t}\n\t}\n\thipAtomicMax(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n *maxDegree = hipAtomicMax(*maxDegree, degree);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int my_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n }\n int max_degree;\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int my_max = 0;\n for (size_t i = 0; i < N; i++) {\n int row_sum = 0;\n for (size_t j = 0; j < N; j++) {\n row_sum += A[i * N + j];\n }\n my_max = std::max(my_max, row_sum);\n }\n\n int global_max;\n MPI_Allreduce(&my_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0025393062, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027233456, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0048826049, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00497119415, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0049465437, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0111632284375, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00981988785, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01266170022812, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026648474, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031269246, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0046095738, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004618392, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00436597, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01375002205, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0204176828125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02381641275625, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02801090759375, + "num_procs": 128 + }, + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.02869943685469, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05783916369844, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int rowSum = 0;\n for (size_t j = 0; j < N; j++) { rowSum += A[i * N + j]; }\n myDegree = std::max(myDegree, rowSum);\n }\n\n int globalDegree;\n MPI_Allreduce(&myDegree, &globalDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0025530426, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00306606, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045297461, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0050856305, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0047961943, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005571916975, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01149699019375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01286284650625, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02008200871562, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03162122494141, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05214466809883, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] != 0) {\n myDegree++;\n }\n }\n }\n int maxDegree = 0;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0025883086, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026839672, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0048327909, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0054786123, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004636182825, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0121157900125, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01069406981875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0120951088625, 
+ "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01811330699063, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03409430865313, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05031279865039, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n maxDeg = std::max(maxDeg, deg);\n }\n int globalMaxDeg;\n MPI_Allreduce(&maxDeg, &globalMaxDeg, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDeg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002715128, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031559976, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0044805516, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00466477895, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00430231635, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00598277395, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00884953898125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.013955364275, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02517514360312, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03223246612109, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05049946059805, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026148768, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032372708, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042767445, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0046025372, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004885425775, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.011542037525, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01621688085625, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01609907413125, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02030759550937, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02860573581406, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0512280777375, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n 
myDegree++;\n }\n }\n }\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int local_max = 0;\n for (size_t i = 0; i < N; i++) {\n int local_degree = 0;\n for (size_t j = 0; j < N; j++) {\n local_degree += A[i * N + j];\n }\n local_max = std::max(local_max, local_degree);\n }\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.002691469, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030240108, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0044314882, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00521163545, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0056582887, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0079540659125, + "num_procs": 16 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + }, + { + "generated_output": "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n if (degree > maxDegree) {\n maxDegree = degree;\n }\n }\n return maxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0027569772, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031643452, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038626379, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00332532235, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027979572, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029444232625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00493664543125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0060696160875, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int local_max = 0;\n for (size_t i = 0; i < N; i++) {\n int row_sum = 0;\n for (size_t j = 0; j < N; j++) {\n row_sum += A[i * N + j];\n }\n local_max = std::max(local_max, row_sum);\n }\n\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0025882686, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026449634, + "num_procs": 1 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.0045663262, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004912353, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00468148505, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int my_max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n my_max_degree = std::max(my_max_degree, degree);\n }\n\n int max_degree;\n MPI_Allreduce(&my_max_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0025417648, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031731194, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0048160967, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004767938, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004844869575, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.006110426425, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0298803431875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01308648732813, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02118011570625, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02524797503203, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05741283904258, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int my_degree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n\n int max_degree;\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "Degree)", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n myDegree += A[i * N + j];\n }\n }\n\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n\n int globalMaxDegree;\n 
MPI_Allreduce(&myDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n myDegree += A[i * N + j];\n }\n }\n\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0025484652, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032187594, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0043379731, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00514859005, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005636718325, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0051174354125, + "num_procs": 16 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + }, + { + "generated_output": "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n degree++;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, °ree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n }\n return maxDegree;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "cuda", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph. \n Store the result in maxDegree.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void maxDegree(const int *A, size_t N, int *maxDegree) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n if (localMax > *maxDegree) {\n *maxDegree = localMax;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n if (tid == 0) {\n *maxDegree = max;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, myDegree);\n}\n", + "\n\n int tid = threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n maxDegree[tid] = max;\n}\n", + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = max(*maxDegree, localMax);\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint max = 0;\n\tif (tid < N) {\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tmax += A[tid * N + i];\n\t\t}\n\t}\n\tmaxDegree[tid] = max;\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tint max_degree = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tmax_degree = max(max_degree, maxDegree[i]);\n\t\t}\n\t\t*maxDegree = max_degree;\n\t}\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, 
myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = myDegree;\n for (size_t i = 1; i < gridDim.x; i++) {\n int otherDegree;\n cudaMemcpy(&otherDegree, &A[i * blockDim.x * N], sizeof(int), cudaMemcpyDeviceToHost);\n *maxDegree = max(*maxDegree, otherDegree);\n }\n }\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = max(*maxDegree, myDegree);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "Degree,", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(&*maxDegree, myDegree);\n}\n", + "\n\n\tint myMaxDegree = 0;\n\tint rowStart = blockIdx.x * blockDim.x + threadIdx.x;\n\tfor (int i = rowStart; i < N; i += blockDim.x * gridDim.x) {\n\t\tint degree = 0;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tdegree += A[i * N + j];\n\t\t}\n\t\tmyMaxDegree = max(myMaxDegree, degree);\n\t}\n\tatomicAdd(maxDegree, myMaxDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n myDegree++;\n }\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n int my_degree = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk_size = N / nprocs;\n int start_row = rank * chunk_size;\n int end_row = start_row + chunk_size;\n if (rank == nprocs - 1) {\n end_row = N;\n }\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_degree;\n }\n return -1;\n}\n", + "\n\n int max_degree = 0;\n int my_max_degree = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel for reduction(max : my_max_degree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_max_degree += A[i * N + j];\n }\n }\n\n MPI_Allreduce(&my_max_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return max_degree;\n } else {\n return -1;\n }\n}\n", + "\n\n int maxDegree = 0;\n int localMaxDegree = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel for reduction(max : localMaxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n localMaxDegree += A[i * N + j];\n }\n }\n\n MPI_Allreduce(&localMaxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int local_max_degree = 0;\n int global_max_degree = 0;\n\n // Compute the local maximum degree\n #pragma omp parallel for reduction(max : local_max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n local_max_degree = std::max(local_max_degree, degree);\n }\n\n // Reduce the local maximum degrees to find the global maximum degree\n MPI_Allreduce(&local_max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max_degree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n\n #pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, my_degree);\n my_degree = 0;\n }\n\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int localMaxDegree = 0;\n #pragma omp parallel for reduction(max:localMaxDegree)\n for (size_t i = 0; i < N; i++) {\n 
int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n localMaxDegree = std::max(localMaxDegree, degree);\n }\n MPI_Reduce(&localMaxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int myDegree = 0;\n #pragma omp parallel for reduction(max:myDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n myDegree = std::max(myDegree, degree);\n }\n\n int globalDegree;\n MPI_Reduce(&myDegree, &globalDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return globalDegree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n #pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int local_max = 0;\n for (int i = 0; i < N; i++) {\n int local_count = 0;\n for (int j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n local_max = std::max(local_max, local_count);\n }\n\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n #pragma omp parallel for reduction(max: maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n #pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, my_degree);\n my_degree = 0;\n }\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n MPI_Reduce(&maxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n int max_degree = 
0;\n #pragma opm parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int num_threads;\n omp_set_num_threads(4);\n #pragma omp parallel\n {\n #pragma omp for reduction(+:my_degree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n }\n MPI_Reduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(Kokkos::View &A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n visited[i] = 0;\n });\n\n dist[source] = 0;\n visited[source] = 1;\n\n int minDist = INT_MAX;\n\n while (visited[dest] == 0) {\n int minDistNode = -1;\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0 && (minDistNode == -1 || dist[i] < dist[minDistNode])) {\n minDistNode = i;\n }\n }\n\n if (minDistNode == -1) {\n break;\n }\n\n visited[minDistNode] = 1;\n\n for (int i = 0; i < N; i++) {\n if (A(minDistNode, i) == 1 && visited[i] == 0) {\n if (dist[i] == -1 || dist[i] > dist[minDistNode] + 1) {\n dist[i] = dist[minDistNode] + 1;\n }\n }\n }\n }\n\n if (dist[dest] == -1) {\n return -1;\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n Kokkos::View visited(\"visited\", N);\n Kokkos::parallel_for(\"bfs\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n visited[i] = false;\n });\n visited[source] = true;\n Kokkos::View queue(\"queue\", N);\n int head = 0;\n int tail = 0;\n queue[tail++] = source;\n while (head < tail) {\n int u = queue[head++];\n for (int v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n queue[tail++] = v;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { distances[i] = std::numeric_limits::max(); });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int current = q.front();\n q.pop();\n\n // Iterate over all neighbors of the current node\n for (size_t i = 0; i < N; i++) {\n if (A(current, i) == 1 && distances[i] == std::numeric_limits::max()) {\n distances[i] = distances[current] + 1;\n q.push(i);\n }\n }\n }\n\n // Return the distance to the destination\n return distances[dest];\n}\n", + "\n\n 
Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n dist() = -1;\n dist(source) = 0;\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n visited(source) = true;\n Kokkos::View queue(\"queue\", N);\n queue() = -1;\n int head = 0;\n int tail = 0;\n queue(tail++) = source;\n while (head != tail) {\n int u = queue(head++);\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited(v)) {\n dist(v) = dist(u) + 1;\n visited(v) = true;\n queue(tail++) = v;\n }\n }\n }\n return dist(dest);\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) { distances[i] = INT_MAX; });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Queue of nodes to visit\n std::queue q;\n q.push(source);\n\n // While there are nodes to visit\n while (!q.empty()) {\n // Get the current node\n int current = q.front();\n q.pop();\n\n // If the current node is the destination, return the distance\n if (current == dest) {\n return distances[current];\n }\n\n // Visit all neighbors of the current node\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A(current, i) == 1) {\n // If the distance to the neighbor is greater than the distance to the current node plus 1\n if (distances[i] > distances[current] + 1) {\n // Update the distance to the neighbor\n distances[i] = distances[current] + 1;\n // Add the neighbor to the queue\n q.push(i);\n }\n }\n }\n }\n\n // If there is no path from the source to the destination, return -1\n return -1;\n}\n", + "\n\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = Kokkos::TeamMember;\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::View distance(\"distance\", N);\n Kokkos::View queue(\"queue\", N);\n\n visited[source] = 1;\n distance[source] = 0;\n queue[0] = source;\n\n int head = 0;\n int tail = 1;\n\n while (head < tail) {\n int u = queue[head++];\n for (int v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n visited[v] = 1;\n distance[v] = distance[u] + 1;\n queue[tail++] = v;\n }\n }\n }\n\n return distance[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) { distances[i] = std::numeric_limits::max(); });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Initialize the queue with the source\n std::queue q;\n q.push(source);\n\n // While the queue is not empty\n while (!q.empty()) {\n // Get the current vertex\n int current = q.front();\n q.pop();\n\n // If the current vertex is the destination, return the distance\n if (current == dest) {\n return distances[current];\n }\n\n // For each neighbor of the current vertex\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current vertex and the neighbor\n if (A(current, i) == 1) {\n // If the distance to the neighbor is greater than the distance to the current vertex plus 1\n if (distances[i] > distances[current] + 1) {\n // Update the distance to the neighbor\n distances[i] = distances[current] + 1;\n\n // Add the neighbor to the queue\n 
q.push(i);\n }\n }\n }\n }\n\n // If the destination is not reachable, return -1\n return -1;\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n size_t head = 0;\n size_t tail = 0;\n queue[tail++] = source;\n\n while (head != tail) {\n int u = queue[head++];\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n queue[tail++] = v;\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist(i) = -1;\n visited(i) = false;\n queue(i) = false;\n });\n\n dist(source) = 0;\n queue(source) = true;\n\n while (true) {\n int min_dist = -1;\n int min_idx = -1;\n for (int i = 0; i < N; i++) {\n if (queue(i) && (min_dist == -1 || dist(i) < min_dist)) {\n min_dist = dist(i);\n min_idx = i;\n }\n }\n\n if (min_idx == -1) {\n break;\n }\n\n queue(min_idx) = false;\n visited(min_idx) = true;\n\n for (int i = 0; i < N; i++) {\n if (A(min_idx, i) > 0 && !visited(i)) {\n if (dist(i) == -1 || dist(i) > dist(min_idx) + 1) {\n dist(i) = dist(min_idx) + 1;\n queue(i) = true;\n }\n }\n }\n }\n\n return dist(dest);\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t i = 0; i < N; i++) {\n if (A(u, i) && dist[i] == -1) {\n dist[i] = dist[u] + 1;\n q.push(i);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View distance = Kokkos::View(\"distance\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) { distance[i] = -1; });\n distance[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && distance[v] == -1) {\n distance[v] = distance[u] + 1;\n q.push(v);\n }\n }\n }\n return distance[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = INT_MAX;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n q.push(v);\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && dist[v] == -1) {\n 
dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n Kokkos::View queue_head(\"queue_head\", 1);\n Kokkos::View queue_tail(\"queue_tail\", 1);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n queue_head[0] = 0;\n queue_tail[0] = 1;\n queue[0] = source;\n\n while (queue_head[0] < queue_tail[0]) {\n int u = queue[queue_head[0]];\n queue_head[0]++;\n for (int v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n visited[v] = true;\n dist[v] = dist[u] + 1;\n queue[queue_tail[0]] = v;\n queue_tail[0]++;\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n dist() = -1;\n dist(source) = 0;\n\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n visited(source) = true;\n\n Kokkos::View queue(\"queue\", N);\n queue() = -1;\n int head = 0;\n int tail = 0;\n queue(tail++) = source;\n\n while (head != tail) {\n int u = queue(head++);\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && !visited(v)) {\n dist(v) = dist(u) + 1;\n visited(v) = true;\n queue(tail++) = v;\n }\n }\n }\n\n return dist(dest);\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n distances() = std::numeric_limits::max();\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n distances(source) = 0;\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t i = 0; i < N; i++) {\n if (A(u, i) == 1 && distances(i) == std::numeric_limits::max()) {\n q.push(i);\n distances(i) = distances(u) + 1;\n }\n }\n }\n\n // Return distance to destination\n return distances(dest);\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n distances() = std::numeric_limits::max();\n\n // Initialize queue with source node\n std::queue queue;\n queue.push(source);\n distances(source) = 0;\n\n // Perform BFS\n while (!queue.empty()) {\n int current = queue.front();\n queue.pop();\n\n // Iterate over neighbors of current node\n for (size_t i = 0; i < N; i++) {\n if (A(current, i) == 1 && distances(i) == std::numeric_limits::max()) {\n distances(i) = distances(current) + 1;\n queue.push(i);\n }\n }\n }\n\n // Return distance to destination\n return distances(dest);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector dist(N, -1);\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[u] + 1 < dist[v]) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n 
// Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] > dist[u] + 1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize the distance to all nodes to infinity\n std::vector distance(N, INT_MAX);\n\n // Set the distance to the source node to 0\n distance[source] = 0;\n\n // Create a queue to store the nodes to be visited\n std::queue queue;\n\n // Enqueue the source node\n queue.push(source);\n\n // While there are still nodes to be visited\n while (!queue.empty()) {\n // Dequeue the next node\n int current = queue.front();\n queue.pop();\n\n // Visit all the neighbors of the current node\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A[current][i] > 0) {\n // If the distance to the neighbor is greater than the distance to the current node plus the weight of the edge\n if (distance[i] > distance[current] + A[current][i]) {\n // Update the distance to the neighbor\n distance[i] = distance[current] + A[current][i];\n\n // Enqueue the neighbor\n queue.push(i);\n }\n }\n }\n }\n\n // Return the distance to the destination node\n return distance[dest];\n}\n", + "\n\n int dist[N];\n std::fill(dist, 
dist + N, -1);\n dist[source] = 0;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n if (u == dest) {\n return dist[u];\n }\n\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int min_dist = INT_MAX;\n std::vector dist(N, INT_MAX);\n std::vector visited(N, false);\n\n dist[source] = 0;\n visited[source] = true;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n if (A[source][i] == 1) {\n dist[i] = 1;\n visited[i] = true;\n }\n }\n\n #pragma omp for\n for (size_t k = 0; k < N; k++) {\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i][k] == 1 && A[k][j] == 1 && dist[i] != INT_MAX && dist[j] > dist[i] + 1) {\n dist[j] = dist[i] + 1;\n visited[j] = true;\n }\n }\n }\n }\n }\n\n min_dist = dist[dest];\n return min_dist == INT_MAX ? -1 : min_dist;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "serial", + "prompt": "/* 
Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, 0);\n std::queue> q;\n q.push({source, 0});\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n if (visited[node] == 1) {\n continue;\n }\n visited[node] = 1;\n for (size_t i = 0; i < N; i++) {\n if (A[node][i] == 1) {\n q.push({i, dist + 1});\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n visited[source] = true;\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n for (int i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n visited[i] = true;\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n visited[source] = 
true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n dist[v] = dist[u] + 1;\n visited[v] = true;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) return dist;\n if (visited[node]) continue;\n visited[node] = true;\n for (size_t i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n if (v == dest) {\n return dist[v];\n }\n }\n }\n }\n return -1;\n}\n", + "=", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n q.push(v);\n dist[v] = dist[u] + 1;\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == -1) {\n q.push(v);\n dist[v] = dist[u] + 1;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push(std::make_pair(source, 0));\n while (!q.empty()) {\n std::pair p = q.front();\n q.pop();\n if (p.first == dest) {\n return p.second;\n }\n if (visited[p.first]) {\n continue;\n }\n visited[p.first] = true;\n for (size_t i = 0; i < N; ++i) {\n if (A[p.first][i] == 1) {\n q.push(std::make_pair(i, p.second + 1));\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n visited[source] = true;\n\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n for (int i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n visited[i] = true;\n }\n }\n }\n\n return -1;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "hip", + "prompt": "/* Compute the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Store the result in pathLength.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\n__global__ void shortestPathLength(const int *A, size_t N, int source, int dest, int *pathLength) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\n if (tid == source) {\n pathLength[bid] = 0;\n } else {\n pathLength[bid] = -1;\n }\n\n __syncthreads();\n\n for (int i = 0; i < N; i++) {\n if (pathLength[bid] >= 0 && A[tid * N + i] == 1) {\n if (i == dest) {\n pathLength[bid] = pathLength[bid] + 1;\n break;\n } else if (pathLength[i * N + dest] == -1 ||\n pathLength[i * N + dest] > pathLength[bid] + 1) {\n pathLength[i * N + dest] = pathLength[bid] + 1;\n }\n }\n }\n}\n", + "\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint distance = INT_MAX;\n\tif (tid == source) {\n\t\tdistance = 0;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[tid * N + k] == 1) {\n\t\t\tif (distance + 1 < pathLength[k]) {\n\t\t\t\tpathLength[k] = distance + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n\tif (tid == dest) {\n\t\t*pathLength = distance;\n\t}\n}\n", + "\n\n int n = hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x;\n int m = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x;\n if (n == source) {\n pathLength[n] = 0;\n __syncthreads();\n for (int i = 0; i < N; ++i) {\n if (A[n * N + i] == 1) {\n pathLength[i] = 1;\n }\n }\n } else {\n __syncthreads();\n for (int i = 0; i < N; ++i) {\n if (A[n * N + i] == 1) {\n if (pathLength[i] != -1) {\n pathLength[n] = pathLength[i] + 1;\n }\n }\n }\n }\n __syncthreads();\n if (n == dest) {\n *pathLength = pathLength[n];\n }\n}\n", + "\n\n int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int k = 0; k < N; k++) {\n if (A[tid * N + k] == 1 && tid != k) {\n if (pathLength[tid] == -1 || pathLength[k] + 1 < pathLength[tid]) {\n pathLength[tid] = pathLength[k] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n if (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n pathLength[i] = pathLength[tid] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n if (pathLength[i] == -1 || pathLength[i] > 
pathLength[tid] + 1) {\n pathLength[i] = pathLength[tid] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == source) {\n pathLength[bid] = 0;\n } else {\n pathLength[bid] = -1;\n }\n __syncthreads();\n int minPath = pathLength[bid];\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] && pathLength[i] != -1) {\n minPath = min(minPath, pathLength[i] + 1);\n }\n }\n pathLength[bid] = minPath;\n __syncthreads();\n if (tid == dest) {\n *pathLength = minPath;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tif (tid == source && bid == 0) {\n\t\t*pathLength = 0;\n\t} else if (tid == dest && bid == N - 1) {\n\t\t*pathLength = -1;\n\t} else {\n\t\t*pathLength = INT_MAX;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tif (bid + 1 < N) {\n\t\t\t\tint next = i + (bid + 1) * N;\n\t\t\t\tatomicMin(&pathLength[next], *pathLength + 1);\n\t\t\t}\n\t\t\tif (bid - 1 >= 0) {\n\t\t\t\tint next = i + (bid - 1) * N;\n\t\t\t\tatomicMin(&pathLength[next], *pathLength + 1);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] == 1 && tid != i) {\n if (pathLength[tid] == -1) {\n pathLength[tid] = pathLength[i] + 1;\n } else {\n pathLength[tid] = min(pathLength[tid], pathLength[i] + 1);\n }\n }\n }\n __syncthreads();\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int i = tid / N;\n int j = tid % N;\n if (i == source) {\n pathLength[j] = 1;\n } else {\n pathLength[j] = -1;\n }\n __syncthreads();\n for (int k = 0; k < N; ++k) {\n if (pathLength[k] == -1) {\n continue;\n }\n if (A[i * N + k] == 1) {\n if (pathLength[j] == -1 || pathLength[j] > pathLength[k] + 1) {\n pathLength[j] = pathLength[k] + 1;\n }\n }\n }\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tridy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tint i = tidx;\n\tint j = tridy;\n\tif (i == source) {\n\t\tpathLength[i] = 0;\n\t} else {\n\t\tpathLength[i] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (tid == 
source && bid == dest) {\n *pathLength = 0;\n } else if (tid == dest && bid == source) {\n *pathLength = 0;\n } else if (A[bid * N + tid] == 1) {\n *pathLength = 1;\n } else {\n *pathLength = -1;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int distance = INT_MAX;\n if (tid == source) {\n distance = 0;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] && distance + 1 < pathLength[i]) {\n pathLength[i] = distance + 1;\n }\n }\n __syncthreads();\n if (tid == dest) {\n *pathLength = distance;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int distance = INT_MAX;\n if (tid == source) {\n distance = 0;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] == 1 && tid != i) {\n if (distance + 1 < distance) {\n distance = distance + 1;\n }\n }\n __syncthreads();\n }\n if (tid == dest) {\n *pathLength = distance;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "= Synagoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesogues Synag Synag Synag Synagoguesoguesoguesoguesogues", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The shortest path length is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform relaxation\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector gathered_dist(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, gathered_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, gathered_dist[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": ":", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return shortest path length on rank 0\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": 
true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0073180764, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.028355126, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0283503791, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02751829045, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02745030535, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0284296218375, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02819264056875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0283146559125, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02835256094688, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02976412724922, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03337884522695, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length from source to dest\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distance to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // If we have not reached the destination, return -1\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": 
true, + "are_all_valid": true, + "best_sequential_runtime": 0.0073247906, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.015830463, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0155548131, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0154319432, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0153092895, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01549350275, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0156428735875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.015656948825, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01606815653125, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01622695800859, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01771537716563, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // If u is the destination, return the distance\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // If no path exists, return -1\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0073024308, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0287594508, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0295109799, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02968219535, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.028665049225, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0294586886875, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02938616429375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02928030916875, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02968103217344, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.03137002966719, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03460305526758, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int dist[N];\n for (int i = 0; i < N; i++) {\n dist[i] = INT_MAX;\n }\n dist[source] = 0;\n\n int next[N];\n for (int i = 0; i < N; i++) {\n next[i] = -1;\n }\n\n int visited[N];\n for (int i = 0; i < N; i++) {\n visited[i] = false;\n }\n\n int u;\n while (true) {\n u = -1;\n for (int i = 0; i < N; i++) {\n if (!visited[i] && (u == -1 || dist[i] < dist[u])) {\n u = i;\n }\n }\n if (u == -1 || dist[u] == INT_MAX) {\n break;\n }\n visited[u] = true;\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] != 0 && dist[u] + 1 < dist[v]) {\n dist[v] = dist[u] + 1;\n next[v] = u;\n }\n }\n }\n\n if (rank == 0) {\n return dist[dest];\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.007350532, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0533482542, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0565852741, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0534849449, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0523688533, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0529474179, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05383852543125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05476643947188, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == std::numeric_limits::max()) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * num_ranks);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (my_rank == 0) {\n for (int i = 0; i < N * num_ranks; i 
+= N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.007318598, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0300258778, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0293998092, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03017816615, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.029238931175, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0308356934875, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0327008140125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.040325229675, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04071031918125, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06058281330234, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08943517906406, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0072925268, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0158148716, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0155042286, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0153414292, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0153105043, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0155894127375, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01559755570625, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01559154197812, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0159584099875, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01625304265547, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01792491309766, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n 
std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector dist(N, -1);\n dist[source] = 0;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int curr = q.front();\n q.pop();\n\n if (curr == dest) {\n return dist[curr];\n }\n\n for (int i = 0; i < N; i++) {\n if (A[curr][i] == 1 && dist[i] == -1) {\n dist[i] = dist[curr] + 1;\n q.push(i);\n }\n }\n }\n\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return shortest path length if destination is reachable\n if (dist[dest] != INT_MAX) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0073039126, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.028373673, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0281944944, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0275718993, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0274205131, + "num_procs": 8 + }, + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.027749985325, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.028148722875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02837442386875, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02848525955938, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02980624212656, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0337424907043, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Iterate over all vertices\n for (int i = 0; i < N; ++i) {\n // Each rank computes the shortest path from source to all other vertices\n for (int j = 0; j < N; ++j) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n\n // Gather the distances from all ranks\n std::vector all_dist(N * num_ranks);\n MPI_Allgather(dist.data(), N, MPI_INT, all_dist.data(), N, MPI_INT, MPI_COMM_WORLD);\n\n // Update the distances on each rank\n for (int j = 0; j < N; ++j) {\n for (int k = 0; k < num_ranks; ++k) {\n if (all_dist[k * N + j] < dist[j]) {\n dist[j] = all_dist[k * N + j];\n }\n }\n }\n }\n\n // Return the shortest path length on rank 0\n if (my_rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0072992998, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0286840246, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0290482607, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02925540375, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02921459995, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + 
"runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == std::numeric_limits::max()) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "cuda", + "prompt": "/* Compute the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Store the result in pathLength.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\n__global__ void shortestPathLength(const int *A, size_t N, int source, int dest, int *pathLength) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\n\tint count = 0;\n\twhile (pathLength[dest] == -1) {\n\t\t__syncthreads();\n\t\tif (pathLength[tid] == count) {\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tif (A[tid * N + i] && pathLength[i] == -1) {\n\t\t\t\t\tpathLength[i] = count + 1;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tcount++;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tif (pathLength[j] == -1) {\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] && pathLength[k] != -1) {\n\t\t\t\tpathLength[j] = pathLength[k] + 1;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "_t = 0, 1, 2, 3", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\n\tif (i == source && j == dest) {\n\t\t*pathLength = 0;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 1;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = (A[i * N + j] == 1) ? 
1 : INT_MAX;\n\t}\n\telse {\n\t\tpathLength[j] = INT_MAX;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t}\n\telse if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t}\n\telse {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] != 0) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid_row == source) {\n\t\tpathLength[tid_col] = 1;\n\t} else {\n\t\tpathLength[tid_col] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid_row] > 0 && A[tid_row * N + i] > 0) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid_row] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_row * N + i] == 1 && tid_row != tid_col) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid_row] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1) {\n\t\t\tif (A[tid * N + i] == 1 && pathLength[i] == -1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = (A[i * N + j] == 1) ? 
1 : INT_MAX;\n\t} else {\n\t\tpathLength[j] = INT_MAX;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] != INT_MAX && A[k * N + j] == 1) {\n\t\t\tpathLength[j] = min(pathLength[j], pathLength[k] + 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\n\t__syncthreads();\n\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_row * N + i] == 1 && pathLength[tid_row] != -1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid_row] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\n\t__syncthreads();\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint distance = 0;\n\tif (tid == source) {\n\t\tdistance = 0;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tif (distance + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = distance + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == dest) {\n\t\t*pathLength = distance;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] == -1) {\n\t\t\tcontinue;\n\t\t}\n\t\tif (A[k * N + j] == 1) {\n\t\t\tif (pathLength[j] == -1 || pathLength[j] > pathLength[k] + 1) {\n\t\t\t\tpathLength[j] = pathLength[k] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid_row == source) {\n\t\tpathLength[tid_col] = 1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[tid_row * N + k] && pathLength[tid_row] > 0) {\n\t\t\tpathLength[tid_col] = min(pathLength[tid_col], pathLength[tid_row] + 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tidx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tidy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (tidx == source && tidx == tidy) {\n\t\tpathLength[tidx] = 0;\n\t} else if (tidx == tidy) {\n\t\tpathLength[tidx] = -1;\n\t} else if (A[tidx * N + tidy] == 1) {\n\t\tpathLength[tidx] = 1;\n\t} else {\n\t\tpathLength[tidx] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[tidx] == -1 || pathLength[tidy] == -1) {\n\t\t\tcontinue;\n\t\t}\n\t\tif (pathLength[tidx] + pathLength[tidy] + 1 < pathLength[k]) {\n\t\t\tpathLength[k] = pathLength[tidx] + pathLength[tidy] + 1;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = A[i * N + j];\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tint minPath = pathLength[j];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] >= 0 && A[j * N + k] > 0) {\n\t\t\tint newPath = pathLength[k] + A[j * N + k];\n\t\t\tif (newPath < minPath) {\n\t\t\t\tminPath = newPath;\n\t\t\t}\n\t\t}\n\t}\n\tpathLength[j] = minPath;\n\t__syncthreads();\n\tif (j == dest) {\n\t\t*pathLength = pathLength[j];\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in 
row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The shortest path length is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the distance array to all infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize the queue with the source node\n std::queue q;\n q.push(source);\n\n // Perform a breadth-first search to find the shortest path\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to the destination\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Use OpenMP for parallel BFS\n #pragma omp parallel\n {\n while (true) {\n bool updated = false;\n\n // Relax all edges in parallel\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n updated = true;\n }\n }\n }\n\n // Check if any rank has updated distances\n int global_updated;\n MPI_Allreduce(&updated, &global_updated, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n // If no rank has updated distances, break\n if (!global_updated) {\n break;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n 
#pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length on rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector recv_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &recv_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return recv_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Return the shortest path length\n return shortest_path_length;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n for (int i = tid; i < N; i += num_threads) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Use OpenMP for parallelization within each rank\n #pragma omp parallel\n {\n // Perform BFS using a queue\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // For each neighbor of u\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n }\n\n // Gather the distances from all ranks to rank 0\n std::vector all_dist(N * MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Allgather(&dist[0], N, MPI::INT, &all_dist[0], N, MPI::INT);\n\n // Find the minimum distance to dest\n int min_dist = INT_MAX;\n for (int i = 0; i < N * MPI::COMM_WORLD.Get_size(); ++i) {\n if (all_dist[i] != INT_MAX && i / N == dest) {\n min_dist = std::min(min_dist, all_dist[i]);\n }\n }\n\n // Return the minimum distance\n return min_dist;\n}\n", + "\n\n // Initialize the distance to the source to 0 and all other distances to infinity\n std::vector distance(N, std::numeric_limits::max());\n distance[source] = 0;\n\n // Initialize the queue with the source\n std::queue queue;\n queue.push(source);\n\n // While the queue is not empty\n while (!queue.empty()) {\n // Dequeue the next node\n int current = queue.front();\n queue.pop();\n\n // If the current node is the destination, return the distance\n if (current == dest) {\n return distance[current];\n }\n\n // For each neighbor of the current node\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A[current * N + i] == 1) {\n // If the distance to the neighbor is greater than the distance to the current node plus 1\n if (distance[i] > distance[current] + 1) {\n // Set the distance to the neighbor to the distance to the current node plus 1\n distance[i] = distance[current] + 1;\n // Enqueue the neighbor\n queue.push(i);\n }\n }\n }\n }\n\n // If the destination is not reachable, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n 
std::vector global_dist(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i++) {\n if (i % N == dest) {\n shortest_path_length = std::min(shortest_path_length, global_dist[i]);\n }\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n for (int i = tid; i < N; i += num_threads) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances from all ranks to rank 0\n std::vector global_dist(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length on rank 0\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, global_dist[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int curr = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (curr == dest) {\n return dist[curr];\n }\n\n // Relax neighbors\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n if (A[curr * N + i] && dist[i] > dist[curr] + 1) {\n dist[i] = dist[curr] + 1;\n q.push(i);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n // Initialize the distance array.\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation.\n for (int i = 0; i < N - 1; i++) {\n#pragma omp parallel for\n for (int j = 0; j < N; j++) {\n for (int k = 0; k < N; k++) {\n if (A[j * N + k] != 0 && dist[j] + A[j * N + 
k] < dist[k]) {\n dist[k] = dist[j] + A[j * N + k];\n }\n }\n }\n }\n\n // Check for negative-weight cycles.\n for (int i = 0; i < N - 1; i++) {\n#pragma omp parallel for\n for (int j = 0; j < N; j++) {\n for (int k = 0; k < N; k++) {\n if (A[j * N + k] != 0 && dist[j] + A[j * N + k] < dist[k]) {\n return -1;\n }\n }\n }\n }\n\n // Return the shortest path length.\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n if (A[source][i] == 1) {\n dist[i] = 1;\n }\n }\n\n #pragma omp barrier\n\n for (int j = 1; j < N; j++) {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n if (dist[i] == j) {\n #pragma omp for\n for (int k = 0; k < N; k++) {\n if (A[i][k] == 1 && dist[k] > j + 1) {\n dist[k] = j + 1;\n }\n }\n }\n }\n\n #pragma omp barrier\n }\n }\n\n // Gather distances on rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length on rank 0\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n int min_odd = INT_MAX;\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n\n return min_odd;\n}\n", + "&", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n },\n min_odd);\n return min_odd;\n}\n", + "\n\n int result = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& result) {\n if (x(i) % 2 == 1 && x(i) < result) {\n result = x(i);\n }\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n Kokkos::View y(\"y\", x.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n y(i) = (x(i) % 2 == 1) ? 
x(i) : std::numeric_limits::max();\n });\n int min = std::numeric_limits::max();\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, int& min) {\n min = std::min(min, y(i));\n });\n return min;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int n = x.extent(0);\n View> y(\"y\", n);\n\n parallel_for(\"smallestOdd\", n, KOKKOS_LAMBDA(const int i) {\n if (i % 2 == 1) {\n y(i) = x(i);\n } else {\n y(i) = std::numeric_limits::max();\n }\n });\n\n int min_odd = std::numeric_limits::max();\n for (int i = 0; i < n; ++i) {\n min_odd = std::min(min_odd, y(i));\n }\n\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n\n int min_odd = INT_MAX;\n\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n\n return min_odd;\n}\n", + "\n\n int min_odd = Kokkos::parallel_reduce(Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1) {\n min_odd = std::min(min_odd, x(i));\n }\n }, min_odd);\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto min_odd = INT_MAX;\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int smallest = Kokkos::Max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& smallest) {\n if (x(i) % 2 == 1 && x(i) < smallest) {\n smallest = x(i);\n }\n });\n return smallest;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Policy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) { min_odd = x(i); }\n }, min_odd);\n return min_odd;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Policy = Kokkos::RangePolicy;\n\n int smallest = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(Policy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& smallest) {\n if (x(i) % 2 == 1 && x(i) < smallest) {\n smallest = x(i);\n }\n });\n\n return smallest;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n int min_odd = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(\n policy_t(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n },\n min_odd);\n\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + 
"do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min:min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n min_odd = std::min(min_odd, x[i]);\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n#pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int minOdd = INT_MAX;\n #pragma omp parallel for reduction(min:minOdd)\n for (int i = 0; i < static_cast(x.size()); ++i) {\n if (x[i] % 2 != 0 && x[i] < minOdd) {\n minOdd = x[i];\n }\n }\n return minOdd;\n}\n", + ": 1", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n min_odd = std::min(min_odd, x[i]);\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n#pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel for reduction(min:smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = 
INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min:min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "serial", + "prompt": "/* Return the value of the smallest odd number in the vector x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 != 0) {\n return x[i];\n }\n }\n return -1;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest_odd) {\n smallest_odd = x[i];\n }\n }\n return smallest_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest 
= INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "smallest&", + "\n\n int smallest = -1;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && (smallest == -1 || x[i] < smallest)) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "hip", + "prompt": "/* Find the value of the smallest odd number in the vector x. Store it in smallest.\n Use AMD HIP to compute in parallel. The kernel is launched with the same number of threads as elements in x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\n__global__ void smallestOdd(const int *x, size_t N, int *smallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < 
smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "(", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n __syncthreads();\n if (smallest_local % 2 == 0) {\n return;\n }\n if (smallest_local < *smallest) {\n atomicMin(smallest, smallest_local);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n\n // Find the smallest odd number in the local workgroup\n __shared__ int smallest_shared;\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n // Reduce the local smallest values to a single global smallest value\n __syncthreads();\n if (tid == 0) {\n smallest_shared = smallest_local;\n for (int i = 1; i < hipBlockDim_x; i++) {\n if (smallest_shared > smallest_local) {\n smallest_shared = smallest_local;\n }\n }\n *smallest = smallest_shared;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < 
smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0728273702, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.075192622, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0750332206, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07467899995, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.075019533125, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0791873734375, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.081192918025, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08750795361875, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.072770457, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0756110614, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0754794581, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0756452455, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0756277371, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0790725370125, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08011653971875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08181392500312, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08238235209219, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.09743008245156, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10729988487227, + "num_procs": 512 + } + ] + }, + { + "generated_output": 
"\n\n int local_min = std::numeric_limits::max();\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int local_min = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.072694926, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.076317791, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0761715679, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.076084795, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.076549113, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08299398075, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08063563440625, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08285750519688, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08459076566562, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.09852306910312, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08958336675742, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int my_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0728763942, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0744448816, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0754421614, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07513560215, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.074911555275, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0796914086, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07948525565, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08206071014687, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int my_smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return 
global_smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0727635254, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.076044649, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0766777235, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0774888535, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07687763455, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0795411056375, + "num_procs": 16 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + }, + { + "generated_output": "\n\n int my_smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < my_smallest) {\n my_smallest = x[i];\n }\n }\n\n int smallest;\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0727784878, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0754377304, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0753827556, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0755135396, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.075326357075, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0802108702125, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.081046254675, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08254202869063, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0729750564, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0750790816, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0745822384, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07498008835, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.075255876325, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0791056224125, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.079880977275, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0828351018875, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08287091383594, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08809709851016, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08559169177969, + 
"num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.072848828, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0759258708, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0759470252, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07616451745, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.076124078375, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0818505678625, + "num_procs": 16 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + }, + { + "generated_output": "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0728265624, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0797291092, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0745489697, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0747306466, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.074764536625, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0788777338125, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08055419169375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08273824815, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1527332248125, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.09265188459609, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08496803666172, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int local_min = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0730195124, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.075073579, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0750187423, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0748521853, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.075514917225, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + 
"runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0728585844, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.075069347, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0756214833, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0746480841, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07550657875, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0827829413875, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08063462488125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08063577989375, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08414371714844, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.09585830962656, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.11511616519102, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0728528362, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0749209692, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0745557496, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07542358535, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.074830795425, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08029805455, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0799839443875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08370437064687, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0841506748375, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.14185569713672, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.09355533116484, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int my_smallest = INT_MAX;\n for (auto v : x) {\n if (v % 2 == 1) {\n my_smallest = std::min(my_smallest, v);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.072986067, + "runs": [ + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.0759230474, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0761155336, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07615542935, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.076373701675, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0729862396, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.076056754, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0761875611, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0760840474, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0763370546, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0729003952, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0757650774, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0763541916, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08141678435, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.076113174025, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int min_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0728247266, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0754561648, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0751387701, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0751976074, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0751752945, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0761266993375, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0757161039625, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07616480708438, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": 
true, + "runtime": 0.07636092183437, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07695038531094, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.076237292725, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.072793824, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0752160558, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0753104819, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07522736515, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07514655225, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0757287633125, + "num_procs": 16 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int smallest = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0729480094, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.076015664, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0756720479, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0751913434, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0751248054, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int min_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0727007878, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0764475752, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0754054577, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0750949884, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0752067137, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0754942424, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0760734055375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0760557528625, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07590631394687, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07629945028047, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07633819817695, + "num_procs": 512 + } + ] + } + ] + }, + { + "problem_type": 
"reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "cuda", + "prompt": "/* Find the value of the smallest odd number in the vector x. Store it in smallest.\n Use CUDA to compute in parallel. The kernel is launched with the same number of threads as elements in x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\n__global__ void smallestOdd(const int *x, size_t N, int *smallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (threadIdx.x == 0) {\n *smallest = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "(", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (threadIdx.x == 0) {\n *smallest = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n atomicMin(smallest, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = start + block_size;\n int min = INT_MAX;\n for (int i = start; i < end && i < N; i++) {\n if (x[i] % 2 == 1 && x[i] < min) {\n min = x[i];\n }\n }\n smallest[tid] = min;\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < block_size; i++) {\n if (smallest[i] < smallest[0]) {\n smallest[0] = smallest[i];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] < smallest_local && x[i] % 2) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n if (tid == 0 || x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = 
block.thread.x;\n if (tid >= N) {\n return;\n }\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n atomicMin(smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1 && (x[tid] < *smallest || *smallest == -1)) {\n *smallest = x[tid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n #pragma omp parallel for reduction(min: my_smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n\n int smallest;\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on this rank\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n 
}\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int result = INT_MAX;\n #pragma omp parallel for reduction(min:result)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < result) {\n result = x[i];\n }\n }\n return result;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel\n {\n int local_smallest = INT_MAX;\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_smallest) {\n local_smallest = x[i];\n }\n }\n #pragma omp critical\n if (local_smallest < smallest) {\n smallest = local_smallest;\n }\n }\n return smallest;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on this rank\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Reduce to find the global minimum\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n // Find the smallest odd number on each rank\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int smallest_odd = INT_MAX;\n #pragma omp parallel for reduction(min: smallest_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest_odd) {\n smallest_odd = x[i];\n }\n }\n return smallest_odd;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on each rank\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n #pragma omp parallel for reduction(min:my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = std::numeric_limits::max();\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 != 0) {\n local_min = 
std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on this rank\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min : local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n #pragma omp parallel for reduction(min: my_smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n int my_smallest = INT_MAX;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n #pragma omp parallel for reduction(min:my_smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return smallest;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use Kokkos to compute product in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n double product = 1.0;\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (i % 2 == 0) {\n product *= x(i);\n } else {\n product /= x(i);\n }\n });\n return product;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n\n double product = 1.0;\n\n parallel_reduce(\n RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n\n return product;\n}\n", + "\n\n // Create a Kokkos view for the output\n Kokkos::View product = Kokkos::View(1);\n\n // Initialize the product to 1\n Kokkos::parallel_for(\"initialize_product\", Kokkos::RangePolicy(0, 1), KOKKOS_LAMBDA(const int& i) {\n product[i] = 1.0;\n });\n\n // Compute the product in parallel\n Kokkos::parallel_for(\"compute_product\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n if (i % 2 == 1) {\n product[0] *= 1.0 / x[i];\n } else {\n product[0] *= x[i];\n }\n });\n\n // Return the product\n return product[0];\n}\n", + "\n\n using namespace Kokkos;\n\n double product = 1.0;\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double product = 1.0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& result) {\n if (i % 2 == 1) {\n result *= 1.0 / x[i];\n } else {\n result *= x[i];\n }\n });\n\n return result;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, 
double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n },\n product);\n return product;\n}\n", + "\n\n double result = 1.0;\n for (int i = 0; i < x.extent(0); i += 2) {\n result *= x(i);\n }\n for (int i = 1; i < x.extent(0); i += 2) {\n result /= x(i);\n }\n return result;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x(i);\n } else {\n product *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n return product;\n}\n", + "_", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n },\n product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x(i);\n } else {\n product *= x(i);\n }\n }, product);\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. 
x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use OpenMP to compute product in parallel.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n int n = x.size();\n #pragma omp parallel for reduction(*: product)\n for (int i = 0; i < n; i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*: product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n int n = x.size();\n #pragma omp parallel for reduction(*:product)\n for (int i = 0; i < n; i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1.0 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for 
reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i += 2) {\n product *= x[i] / x[i + 1];\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "serial", + "prompt": "/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); 
i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i += 2) {\n product = product * x[i] / (x[i + 1] + 0.000000001);\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n for (size_t i = 1; i < x.size(); i += 2) {\n product /= x[i];\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "&", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "hip", + "prompt": "/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Store the result in product.\n Use AMD HIP to compute product in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\n__global__ void productWithInverses(const double *x, size_t N, double *product) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 1) {\n product[0] *= 1.0 / x[tid];\n } else {\n product[0] *= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (idx % 2 == 0) {\n product[0] *= x[idx];\n } else {\n product[0] /= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n atomicAdd(product, x[gid]);\n } else {\n atomicAdd(product, 1.0 / x[gid]);\n }\n }\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n product[tid] = x[tid];\n } else {\n product[tid] = 1 / x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 0) {\n product[0] *= x[tid];\n } else {\n product[0] /= x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double prod = 1.0;\n if (gid % 2 == 0) {\n prod *= x[gid];\n } else {\n prod /= x[gid];\n }\n\n hipAtomicAdd(product, prod);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n 
product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 1) {\n product[0] *= 1.0 / x[tid];\n } else {\n product[0] *= x[tid];\n }\n }\n}\n", + ".", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 0) {\n atomicAdd(product, x[tid]);\n } else {\n atomicAdd(product, 1.0 / x[tid]);\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use MPI to compute product in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the product on all ranks.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a new vector to store the product\n std::vector product(n);\n\n // Initialize the product to 1\n product[0] = 1;\n\n // Compute the product in parallel\n for (int i = 1; i < n; i++) {\n if (i % 2 == 1) {\n product[i] = product[i - 1] * (1 / x[i]);\n } else {\n product[i] = product[i - 1] * x[i];\n }\n }\n\n // Return the product\n return product[n - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0260403894, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0222262102, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0170575748, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00908668715, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00496587255, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0048469134375, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00310319728125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002325578625, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00273255350313, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02122549187891, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00135231328086, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0261481346, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.022190529, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0172657726, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00931595165, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005294496125, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004704174725, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00273249165, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00283197405, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00468148408437, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01991998557656, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00203342141914, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0270965882, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0222475166, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0178353107, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00919543695, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005171827375, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0063830468, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00421056069375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0036093906625, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "&", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 0) {\n local_product *= x[i];\n } else {\n 
local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD,\n MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0261141166, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0211525268, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0180064359, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00912784785, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00504577045, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0115889289875, + "num_procs": 16 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0255811654, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.022143412, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0176192599, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0093694289, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005297739, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00970568285, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0059498378875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00556036269688, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00416586171875, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01391756325156, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00126118083867, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0264774942, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0221668608, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0179559288, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00914434585, + 
"num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0053381514, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0086752482, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003364255925, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00272864176875, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0269558128, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0221987024, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0177039835, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00908145265, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0059524941, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.026756723, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.022192561, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0179627899, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0089859814, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00500819265, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0046747321, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00583247893125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00327608994375, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01962750291875, + "num_procs": 128 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 256 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, 
MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0262661796, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0221529004, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0176346056, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00912210295, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0053762474, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0264470468, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0222101686, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.017177226, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091875964, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00561251425, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007457895875, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002861937, + "num_procs": 32 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0255947194, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.022210958, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0177829217, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0090783972, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005140487025, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n 
MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.025583039, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0222015862, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0173366844, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.009060036, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005109046875, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0260939392, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0222058004, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0179676535, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091946719, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005084892125, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0063509531625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00490119761875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005951248125, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0260642892, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0221496544, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0173423358, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0094297413, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00513400685, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00780476355, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00290002735625, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.0027953966625, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00423527304844, + "num_procs": 128 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 256 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0253993928, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0212807896, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0180013651, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00966440345, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004683672775, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0064761070125, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021119754625, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0066437238625, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00952048891875, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01983038435547, + "num_procs": 256 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Reduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 0, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0265211264, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0221907162, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0176990694, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00739222075, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003804376225, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0036117157875, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018894846125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00409247373125, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00320082172969, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00104232508047, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007707386375, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0258267368, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0221826886, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0175409697, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00910185655, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005262621825, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085224704625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00494241673125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00349099918437, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00449913070312, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00969174059063, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00154861079062, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0262218, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0221710002, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0179373476, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00913503855, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.012093950775, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0049027975, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00237880086875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00249573546563, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00611792347187, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00466310900156, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00773148159727, + "num_procs": 512 + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "cuda", + "prompt": "/* Compute the product of the vector x with every odd indexed element inverted.\n i.e. 
x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Store the result in product.\n Use CUDA to compute product in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\n__global__ void productWithInverses(const double *x, size_t N, double *product) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n *product *= x[gid];\n } else {\n *product /= x[gid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n product[0] *= x[gid];\n } else {\n product[0] /= x[gid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n product[0] *= x[gid];\n } else {\n product[0] /= x[gid];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n product[idx] = 1.0;\n for (int i = 0; i < N; i += 2) {\n product[idx] *= x[i] / x[i + 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n product[0] *= x[idx];\n } else {\n product[0] /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n if (i % 2 == 1) {\n product[0] *= 1.0 / x[i];\n } else {\n product[0] *= x[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid 
< N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n *product *= x[gid];\n } else {\n *product /= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use MPI and OpenMP to compute product in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the product on all ranks.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*: local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n#pragma omp parallel for reduction(* : local_product)\n for (int i = start_index; i < end_index; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Reduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 0, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n 
}\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*: local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size();\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < local_size; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n #pragma omp parallel for reduction(* : local_product)\n for (int i = start_index; i < end_index; i++) {\n if ((i + 1) % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, 
&global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n double product = 1.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the product of the even indexed elements\n #pragma omp parallel for reduction(*: product)\n for (int i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n\n // Compute the product of the odd indexed elements\n double oddProduct = 1.0;\n #pragma omp parallel for reduction(*: oddProduct)\n for (int i = 1; i < x.size(); i += 2) {\n oddProduct *= 1.0 / x[i];\n }\n\n // Reduce the odd product across all ranks\n double globalOddProduct;\n MPI_Allreduce(&oddProduct, &globalOddProduct, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n // Multiply the even and odd products to get the final result\n product *= globalOddProduct;\n\n return product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for reduction(*:local_product)\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size();\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < local_size; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n double product = 1.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the product of the even indexed elements\n #pragma omp parallel for reduction(*:product)\n for (int i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n\n // Compute the product of the odd indexed elements\n double oddProduct = 1.0;\n #pragma omp parallel for reduction(*:oddProduct)\n for (int i = 1; i < x.size(); i += 2) {\n oddProduct *= 1.0 / x[i];\n }\n\n // Reduce the odd product across all ranks\n double globalOddProduct;\n MPI_Allreduce(&oddProduct, &globalOddProduct, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n // Multiply the even and odd products\n product *= globalOddProduct;\n\n return product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for reduction(*:local_product)\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use Kokkos to sum in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(Kokkos::View const& x, Kokkos::View const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Create a Kokkos view to store the minimum values at each index\n Kokkos::View min_values(\"min_values\", x.extent(0));\n\n // Compute the minimum values at each index in parallel\n Kokkos::parallel_for(\"min_values_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n min_values[i] = std::min(x[i], y[i]);\n });\n\n // Reduce the minimum values in parallel to get the sum\n double sum = Kokkos::parallel_reduce(\"sum_min_values_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += min_values[i];\n }, 0.0);\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Min;\n\n auto min_values = Kokkos::parallel_reduce(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, Reducer& red) {\n red.init(std::min(x(i), y(i)));\n });\n\n return min_values.get_value();\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMemberFunction = Kokkos::TeamMemberFunction;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::RangePolicy>(0, x.extent(0))),\n TeamMemberFunction([&](const int& i) -> double { return std::min(x(i), y(i)); }),\n 0.0);\n return sum;\n}\n", + "\n\n using execution_space = Kokkos::DefaultExecutionSpace;\n using l_double = Kokkos::LayoutRight;\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n\n Kokkos::View min_values(\"min_values\", x.extent(0));\n\n Kokkos::parallel_scan(x.extent(0), KOKKOS_LAMBDA(const int& i, 
double& update) {\n update = std::min(x(i), y(i));\n }, min_values);\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += min_values(i);\n }, sum);\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_left;\n using Reducer = Kokkos::Min;\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, Reducer& sum) {\n sum += std::min(x(i), y(i));\n });\n\n return sum.template get<0>();\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_left;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = typename TeamPolicy::member_type;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::DefaultExecutionSpace(), x.extent(0)), KOKKOS_LAMBDA(const TeamMember& team, double& sum) {\n const auto i = team.league_rank();\n sum += std::min(x(i), y(i));\n });\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = Kokkos::TeamMember;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::RangePolicy>(0, x.extent(0))),\n KOKKOS_LAMBDA(TeamMember, sum, x, y) {\n sum += std::min(x(TeamMember::team_rank()), y(TeamMember::team_rank()));\n });\n return sum;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n const auto min_xy = parallel_reduce(\n RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& sum) {\n sum += std::min(x(i), y(i));\n },\n 0.0);\n\n return min_xy;\n}\n", + "\n\n Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::make_span(x.data(), x.size()), 0.0,\n [](double a, double b) { return std::min(a, b); },\n [](double a, double b) { return a + b; });\n}\n", + "\n\n using namespace Kokkos;\n double sum = 0;\n auto min_lambda = KOKKOS_LAMBDA(const double& a, const double& b) { return std::min(a, b); };\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += min_lambda(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n // Create a Kokkos view of the sum\n Kokkos::View sum(\"sum\", x.extent(0));\n\n // Parallel loop to compute the sum of the minimum values\n Kokkos::parallel_for(\"sum_of_minimum_elements\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n sum(i) = std::min(x(i), y(i));\n });\n\n // Reduce the sum in parallel\n double sum_value = Kokkos::parallel_reduce(\n \"sum_of_minimum_elements_reduce\", Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& sum) { sum += sum(i); }, 0.0);\n\n return sum_value;\n}\n", + "Kokkoss", + "\n\n const int numElements = x.extent(0);\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::Range(0, numElements), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n auto min_xy = Kokkos::min(x, y);\n auto sum = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, min_xy.extent(0)),\n KOKKOS_LAMBDA(const int& i) -> double { return min_xy(i); },\n (double)0);\n return sum;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i, double& sum) {\n 
sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_lambda = KOKKOS_LAMBDA(const double& x, const double& y) { return std::min(x, y); };\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += min_lambda(x(i), y(i));\n }, sum);\n\n return sum;\n}\n", + "\n\n const int N = x.extent(0);\n double sum = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n auto min_values = min(x, y);\n double sum = 0;\n Kokkos::parallel_reduce(policy_type(0, min_values.extent(0)), KOKKOS_LAMBDA(int i, double& sum) {\n sum += min_values(i);\n }, sum);\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use OpenMP to sum in parallel.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n double sum = 0;\n\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + ".", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for 
(size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma openmp parallel for reduction (+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "serial", + "prompt": "/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < 
x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "hip", + "prompt": "/* Compute the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Store the result in sum.\n Use AMD HIP to sum in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\n__global__ void sumOfMinimumElements(const double *x, const double *y, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipGetGlobalIndex();\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n sum[tid] = min(x[gid], y[gid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n sum[gid] = min(x[gid], y[gid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n sum[tid] = min(x[tid], y[tid]);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n 
int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use MPI to sum in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Return the sum on all ranks.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0342226472, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1165016692, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0342327618, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.116566614, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0340284668, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.116652474, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return 
global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0342897556, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1118965562, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.034414419, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1115609114, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.034437869, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1165778532, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0343982926, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1167742858, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0341837398, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1167020958, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": 
null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0342663236, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1168062826, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0347075256, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1114972052, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0342628948, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1166946156, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.034420556, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1148570548, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return 
global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0341162104, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1190686618, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double sum = 0;\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.034022319, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1114372456, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.033426449, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1160629982, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0341640344, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.115884395, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0337068384, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1161783198, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + 
"num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.033681963, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1114492328, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0340851274, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1163962586, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0338811608, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.116582562, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "cuda", + "prompt": "/* Compute the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Store the result in sum.\n Use CUDA to sum in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\n__global__ void sumOfMinimumElements(const double *x, const double *y, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ", Synagogues_", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[0] += fmin(x[tid], y[tid]);\n }\n}\n", + "{", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n sum[tid] = fmin(x[tid], y[tid]);\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n sum[tid] = fmin(x[i], y[i]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use MPI and OpenMP to sum in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Return the sum on all ranks.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+ : local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n double sum = 0.0;\n int n = x.size();\n #pragma opm parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n double sum = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int chunk = n / nprocs;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == nprocs - 1) {\n end = n;\n }\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+ : local_sum)\n for (int i = start; i < end; i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, 
MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n int n = x.size();\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < n; i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n double sum = 0.0;\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int num_elements = x.size();\n int elements_per_rank = num_elements / num_ranks;\n int start_index = my_rank * elements_per_rank;\n int end_index = start_index + elements_per_rank;\n\n for (int i = start_index; i < end_index; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n double sum = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double sum;\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel 
for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::sum;\n\n const int N = x.extent(0);\n\n double sum_x = 0.0;\n parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) { sum += x(i); }, sum_x);\n\n return sum_x / N;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n double sum = 0;\n parallel_reduce(RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); }, sum);\n\n double count = x.extent(0);\n return sum / count;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n using namespace Kokkos;\n\n double sum = 0;\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n auto sum = Kokkos::parallel_reduce(Policy(0, x.extent(0)), Reducer(0.0),\n KOKKOS_LAMBDA(const int& i, Reducer& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n using namespace Kokkos;\n double sum = 0;\n auto policy = Kokkos::RangePolicy<>(0, x.size());\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.size();\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n using l_double = Kokkos::LDDouble;\n\n auto sum = Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, l_double& sum) {\n sum += x(i);\n },\n l_double(0.0));\n\n return static_cast(sum) / x.extent(0);\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n policy_type policy(x.extent(0));\n\n double sum = 0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n const auto sum = 
Kokkos::parallel_reduce(Policy(0, x.size()), Reducer(0.0),\n KOKKOS_LAMBDA(const int& i, const Reducer& sum) {\n return sum + x(i);\n });\n\n return sum / x.size();\n}\n", + "\n\n typedef Kokkos::RangePolicy policy_type;\n policy_type policy(x.extent(0));\n\n double sum = 0.0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n return Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::Kok\u30b3\u30af_EXEC_SPACE, x, 0.0, Kok\u30b3\u30af::plus(), Kok\u30b3\u30af::divide(x.size()));\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n auto sum = parallel_reduce(\n RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); }, 0.0);\n\n return sum / x.size();\n}\n", + "\n\n using execution_space = typename Kokkos::View::execution_space;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto sum = Kokkos::parallel_reduce(\n policy_type(x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n },\n (double)0.0, tag_type());\n\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.size();\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n double sum = 0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(int i, double& sum) { sum += x(i); }, sum);\n return sum / x.size();\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using reduction_type = Kokkos::Sum;\n\n auto sum = Kokkos::parallel_reduce(policy_type(0, x.size()), reduction_type(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); });\n\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use OpenMP to compute in parallel.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma 
omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "serial", + "prompt": "/* Return the average of the vector x.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (double n : x) {\n sum += n;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / 
x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double num : x) {\n sum += num;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "hip", + "prompt": "/* Compute the average of the vector x. Store the result in average.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Examples:\n \n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\n__global__ void average(const double *x, size_t N, double *average) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (size_t i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double shared_sum;\n shared_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = shared_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (int i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double block_sum;\n block_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = block_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n atomicAdd(average, x[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int 
tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n double sum = 0;\n size_t idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum += x[idx];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (int i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double sharedSum;\n sharedSum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = sharedSum / N;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n average[0] += x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = block.x;\n if (tid == 0) {\n *x = 0;\n for (int i = 0; i < N; i++) {\n *x += x[i];\n }\n *x = *x / N;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n double sum = 0;\n for (int i = tid; i < N; i += blockDim.x) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the average on all ranks.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg = sum / x.size();\n\n double global_avg;\n MPI_Allreduce(&avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n\n return global_avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0056205548, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0159737816, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0161278965, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01619956105, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n for (double xi : x) {\n sum += xi;\n }\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0.0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0052609186, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0159221982, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0160062885, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01618535965, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0160548091, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02616686145, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02587973900625, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03478184344688, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05135422642656, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05797444135, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06204224458906, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg = 
sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0053018956, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.015899744, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0161089629, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0161127329, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01607705895, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0228546361125, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0213171357875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03077871142188, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05203310977031, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05985742038438, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05459744359648, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / size;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n double 
sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0056511728, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.016059812, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (auto xi : x) {\n sum += xi;\n }\n double avg = sum / x.size();\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0052510776, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0159009684, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0159669997, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0160083415, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.015943542075, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0164158546, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01705711461875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01765766985312, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02140579231406, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02956935126406, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04236680679648, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0052602652, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0159550522, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0159884975, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0161828745, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0160754608, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0290551260375, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0265218012125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03734386269375, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04860805772969, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06083265768125, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05624691868789, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n 
double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0055573134, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0160244794, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (auto v : x) {\n sum += v;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg = sum / x.size();\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0051724414, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.015953291, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0159994041, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.015998789, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.015956527775, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01631386265, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0163474990125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01700783965, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02031679267813, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02753813014297, + "num_procs": 256 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double 
xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0056008192, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0160453156, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / x.size();\n double global_avg;\n MPI_Allreduce(&avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n return global_avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0056390514, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.015950533, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0161979029, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01754286595, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.016311638325, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n double local_sum = 0;\n for (double xi : x) {\n local_sum += xi;\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0057074864, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.016007266, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "cuda", + "prompt": "/* Compute the average of the vector x. Store the result in average.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\n__global__ void average(const double *x, size_t N, double *average) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n average[0] += x[idx];\n }\n}\n", + "\n\n double sum = 0;\n for (size_t i = block_idx.x * block_dim.x + thread_idx.x; i < N; i += block_dim.x * grid_dim.x) {\n sum += x[i];\n }\n * average = sum / N;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n __shared__ double sum;\n sum = 0;\n if (i < N) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n\n double sum = 0;\n for (int i = tid; i < N; i += bid_size * grid_size) {\n sum += x[i];\n }\n\n __shared__ double shared_sum;\n shared_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = shared_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n double sum = 0;\n for (int i = tid + bid * bid_size; i < N; i += grid_size * bid_size) {\n sum += x[i];\n }\n double avg = sum / N;\n *average = avg;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n __shared__ double sum;\n if (i < N) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n double sum = 0;\n if (tid < N) {\n sum = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n double sum_all = 0;\n for (int i = 0; i < gridDim.x; i++) {\n sum_all += sum;\n }\n *average = sum_all / N;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + 
"parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the average of the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the average on all ranks.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / size;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_avg = 0.0;\n #pragma omp parallel for reduction(+:local_avg)\n for (int i = 0; i < x.size(); ++i) {\n local_avg += x[i];\n }\n local_avg /= x.size();\n\n double global_avg;\n MPI_Allreduce(&local_avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n\n return global_avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a sum variable to store the sum of the elements in the vector\n double sum = 0.0;\n\n // Use OpenMP to parallelize the loop over the elements in the vector\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n // Add the current element to 
the sum\n sum += x[i];\n }\n\n // Get the average by dividing the sum by the number of elements\n double avg = sum / n;\n\n // Return the average\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n double local_avg = 0;\n int local_size = x.size();\n double sum = 0;\n\n#pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < local_size; i++) {\n sum += x[i];\n }\n\n local_avg = sum / local_size;\n\n double global_avg = 0;\n MPI_Allreduce(&local_avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= MPI_Comm_size(MPI_COMM_WORLD);\n\n return global_avg;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n", + "\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / x.size();\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n double global_sum = 0;\n\n #pragma omp parallel for reduction(+: local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n return sum / n;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n" + ] + }, + 
{ + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use Kokkos to reduce in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n auto xor_reduce = reduce(x, (bool)false, [](bool a, bool b) { return a ^ b; });\n return xor_reduce;\n}\n", + "\n\n // Create a Kokkos::TeamPolicy for the reduction.\n Kokkos::TeamPolicy policy(x.extent(0));\n\n // Define the reduction functor.\n struct XORFunctor {\n bool result = false;\n\n KOKKOS_INLINE_FUNCTION\n void operator()(const bool& x) const { result ^= x; }\n };\n\n // Create a Kokkos::ParallelReduce object to perform the reduction.\n Kokkos::ParallelReduce reduce(result);\n\n // Invoke the reduction.\n Kokkos::parallel_for(policy, reduce);\n\n // Return the result.\n return reduce.result;\n}\n", + "\n\n bool result = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& result) { result ^= x(i); }, result);\n return result;\n}\n", + "\n\n using namespace Kokkos;\n\n bool result = false;\n auto policy = Kokkos::RangePolicy(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n }, result);\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy>;\n using TeamMemberType = typename Policy::member_type;\n\n bool result = false;\n\n Kokkos::parallel_reduce(Policy(1), KOKKOS_LAMBDA(const TeamMemberType& team) {\n bool local_result = false;\n const bool* x_data = x.data();\n for (int i = team.league_rank(); i < x.size(); i += team.league_size()) {\n local_result ^= x_data[i];\n }\n team.team_reduce(KOKKOS_LAMBDA(const bool& a, const bool& b) { return a ^ b; }, local_result);\n }, result);\n\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x[i];\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x[i];\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy;\n using TeamMemberType = typename Policy::member_type;\n\n auto reduce = KOKKOS_LAMBDA(TeamMemberType team) {\n bool local_result = false;\n for (int i = team.league_rank(); i < x.extent(0); i += team.league_size()) {\n local_result ^= x(i);\n }\n return local_result;\n };\n\n bool global_result = Kokkos::parallel_reduce(Policy(x.extent(0)), reduce);\n return global_result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n return 
Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(x), Kokkos::Scan);\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i) {\n result ^= x(i);\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(false), x);\n}\n", + "\n\n using namespace Kokkos;\n bool result = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& result) {\n if (x(i)) {\n result = !result;\n }\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy;\n using Reducer = Kokkos::LogicalXor;\n\n return parallel_reduce(Policy(x.extent(0)), Reducer(false), x);\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(false),\n KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n}\n", + "\n\n using namespace Kokkos;\n\n auto reduce_functor = KOKKOS_LAMBDA(const bool& a, const bool& b) {\n return a ^ b;\n };\n\n auto reduce_result = reduce(x, reduce_functor, false);\n\n return reduce_result;\n}\n", + "\n\n bool result = x(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(1, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x(i);\n });\n return result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use OpenMP to reduce in parallel.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n {\n for (auto const& xi : x) {\n result ^= xi;\n }\n }\n return result;\n}\n", + ")", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(+:result)\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= 
x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel for reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "serial", + "prompt": "/* Return the logical XOR reduction of the vector of bools x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "(", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + ")", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n 
for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "hip", + "prompt": "/* Compute the logical XOR reduction of the vector of bools x. Store the result in output.\n Use AMD HIP to reduce in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\n__global__ void reduceLogicalXOR(const bool *x, size_t N, bool *output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n bool local_result = x[gid];\n __syncthreads();\n while (blockDim.x <= N) {\n if (tid == 0) {\n local_result ^= x[gid + blockDim.x];\n }\n __syncthreads();\n blockDim.x *= 2;\n __syncthreads();\n }\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int size = blockDim.x;\n bool local_result = x[bid * size + tid];\n __syncthreads();\n for (int i = size / 2; i > 0; i /= 2) {\n if (tid < i) {\n local_result ^= x[bid * size + tid + i];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int warp_size = warpSize;\n int num_warps = bid_size / warp_size;\n int warp_id = tid / warp_size;\n int lane_id = tid % warp_size;\n bool warp_result = x[bid * bid_size + tid];\n for (int i = 1; i < warp_size; i++) {\n warp_result ^= x[bid * bid_size + tid + i];\n }\n __shared__ bool shared_result[num_warps];\n shared_result[warp_id] = warp_result;\n __syncthreads();\n if (warp_id == 0) {\n bool result = shared_result[0];\n for (int i = 1; i < num_warps; i++) {\n result ^= shared_result[i];\n }\n *output = result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n bool local_result = x[tid];\n __syncthreads();\n while (blockDim.x <= N) {\n if (tid < N) { local_result ^= x[tid + blockDim.x]; }\n __syncthreads();\n blockDim.x *= 2;\n __syncthreads();\n }\n if (tid == 0) { *output = local_result; }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n bool local_result = x[gid];\n\n __syncthreads();\n\n while (bid > 0) {\n if (tid == 0) {\n local_result ^= x[gid + hipBlockDim_x];\n }\n bid >>= 1;\n gid >>= 1;\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n bool local_result = x[gid];\n\n while (gid < N) {\n local_result ^= x[gid];\n gid += 
hipBlockDim_x;\n }\n\n output[tid] = local_result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n bool result = false;\n while (gid < N) {\n result ^= x[gid];\n gid += hipBlockDim_x * hipGridDim_x;\n }\n output[tid] = result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n output[0] ^= output[i];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use MPI to reduce in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = false;\n for (bool b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + 
"runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0131308762, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0106079912, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0107066851, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01080384035, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0105484742, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010614469, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01066742495, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01062829049688, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01069840841563, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01101355036172, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01115921189414, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0131297564, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0107211626, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0106233999, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01055181045, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010543767675, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0108575926625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01056860528125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0106400720625, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n 
int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool result = x[rank];\n MPI_Allreduce(&result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + 
}, + { + "generated_output": "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0131199438, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0106902302, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0106026542, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0105457053, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0105406006, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010765738675, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01071432601875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0106501716625, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01081931260937, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01077212985703, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01123242311953, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n bool result = x[0];\n MPI_Allxor(&result, &result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0131380276, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.010833266, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0105762042, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0105660003, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01057393205, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010907055775, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01066547976875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01062328485625, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0130996954, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0105462424, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0107347292, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0106987722, + "num_procs": 4 + }, + { + "did_run": true, 
+ "is_valid": true, + "runtime": 0.0105501411, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0107557054625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01064026825, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01063663938125, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n bool result = x[0];\n for (size_t i = 1; i < x.size(); i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.013089448, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0106548994, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0106478283, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0105371919, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010536446775, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010565680925, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010627109675, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01064286155625, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "cuda", + "prompt": "/* Compute the logical XOR reduction of the vector of bools x. Store the result in output.\n Use CUDA to reduce in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\n__global__ void reduceLogicalXOR(const bool *x, size_t N, bool *output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Each thread reduces a single element of the output.\n int tid = threadIdx.x;\n output[tid] = x[tid];\n __syncthreads();\n\n // Reduce the output in place.\n while (tid < N) {\n output[tid] ^= output[tid + blockDim.x];\n tid += blockDim.x * 2;\n __syncthreads();\n }\n}\n", + "\n\n // Each thread reduces a single element of the output\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the output in place\n for (int stride = 1; stride < N; stride *= 2) {\n for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < N; tid += 2 * stride) {\n output[tid] ^= output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n // Each thread reduces a subset of the input.\n int tid = threadIdx.x;\n int stride = blockDim.x;\n bool local_result = false;\n for (int i = tid; i < N; i += stride) {\n local_result ^= x[i];\n }\n\n // Reduce the local results using a shared memory reduction.\n __shared__ bool shared_result[blockDim.x];\n shared_result[tid] = local_result;\n __syncthreads();\n\n // The first thread writes the final result to output.\n if (tid == 0) {\n bool final_result = false;\n for (int i = 0; i < blockDim.x; ++i) {\n final_result ^= shared_result[i];\n }\n *output = final_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n // Each thread reduces a single element of the output\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the output in place\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid] ^= output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncbarriers();\n while (N > 1) {\n if (tid < N / 2) {\n output[tid] ^= output[tid + N / 2];\n }\n __syncbarriers();\n N /= 2;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n bool result = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n result ^= x[i];\n }\n output[bid] = result;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n bool result = x[bid * stride + tid];\n __syncthreads();\n while (stride > 1) {\n if (tid < stride) {\n result ^= x[bid * stride + tid + stride];\n }\n stride /= 2;\n __syncthreads();\n }\n if (tid == 0) {\n *output = result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int num_elements = N;\n int num_blocks = (num_elements + bid_size - 1) / bid_size;\n\n bool local_result = x[tid];\n __syncthreads();\n while (bid_size > 1) {\n if (tid < bid_size / 2) {\n local_result ^= x[tid + bid_size / 2];\n }\n bid_size /= 2;\n __syncthreads();\n }\n if (bid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n 
__syncthreads();\n}\n", + "\n\n // Each thread reduces a single element of the vector\n int tid = threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the vector in parallel\n while (N > 1) {\n if (tid < N / 2) {\n output[tid] ^= output[tid + N / 2];\n }\n N /= 2;\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = blockDim.x; i < N; i += blockDim.x) {\n output[0] ^= output[i];\n }\n }\n}\n", + "\n\n // Each thread reduces a single element of the output.\n int idx = threadIdx.x;\n output[idx] = x[idx];\n\n // Each thread reduces a pair of elements of the output.\n while (idx < N) {\n output[idx] ^= x[idx + 1];\n idx += 2;\n }\n\n // Reduce the remaining elements of the output.\n for (int i = 1; i < N; i *= 2) {\n for (int j = 0; j < N; j += 2 * i) {\n output[j] ^= output[j + i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n bool local_result = x[tid];\n\n for (int i = tid + stride; i < N; i += stride) {\n local_result ^= x[i];\n }\n\n output[bid] = local_result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int num_blocks = gridDim.x;\n\n bool local_result = x[tid];\n\n __syncthreads();\n\n while (bid_size > 1) {\n if (tid < bid_size / 2) {\n local_result = local_result ^ x[tid + bid_size / 2];\n }\n bid_size /= 2;\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n bool result = x[bid * stride + tid];\n\n __syncthreads();\n\n for (int offset = stride; offset < N; offset *= 2) {\n if (tid < offset) {\n result ^= x[bid * stride + tid + offset];\n }\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x * gridDim.x;\n\n bool local_result = x[tid + bid * stride];\n\n for (int i = 2 * stride; i <= N; i *= 2) {\n if (tid < N / i) {\n local_result ^= x[tid + bid * i];\n }\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use MPI and OpenMP to reduce in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on all ranks.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(^: local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int nranks;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n\n bool local_result = false;\n for (auto b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Reduce on each rank\n bool local_result = false;\n #pragma omp parallel for reduction(^:local_result)\n for (size_t i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n // Reduce across all ranks\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n\n int nranks;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n\n bool local_result = false;\n for (bool b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR,\n MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i 
< x.size(); ++i) {\n local_result ^= x[i];\n }\n }\n\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int nranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(+:local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n #pragma omp parallel for reduction(^:result)\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool local_result = x[my_rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + ".", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool local_result = x[my_rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n\n bool result;\n MPI_Reduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, 0, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n bool result = x[0];\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (int i = 1; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = false;\n #pragma omp parallel for reduction(^:local_result)\n for (size_t i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n // Reduce the vector of bools using MPI_Reduce.\n bool result;\n MPI_Allreduce(MPI_IN_PLACE, &x[0], x.size(), MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n // Return the result.\n return result;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the array x into output.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(Kokkos::View const& x, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA (const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int i) {\n output(i) = 0;\n for (int j = i; j >= 0; j--) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n output(i) = 0;\n for (int j = i; j < n; ++j) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::View temp = output.slice(0, output.size() - 1);\n Kok\u30b3\u30af::View temp2 = output.slice(1, output.size());\n Kok\u30b3\u30af::View x_rev = x.reversed();\n Kok\u30b3\u30af::View temp3 = x_rev.slice(1, x_rev.size());\n Kok\u30b3\u30af::View temp4 = x_rev.slice(0, x_rev.size() - 1);\n Kok\u30b3\u30af::View temp5 = temp4 + temp3;\n temp = temp5;\n temp2 = temp5 + temp;\n\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n output(i) = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, i),\n KOKKOS_LAMBDA(const int& j, const int& val) { return val + x(i - j - 1); },\n 0);\n });\n}\n", + "(", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(\"reversePrefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i, int& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = x(i) + output(i - 1);\n }\n },\n output(n - 1));\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i) { output(i) = output(n - 1) - output(i); });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i, int& sum) {\n sum = (i == 0) ? 
x(i) : sum + x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n // Compute the reverse prefix sum\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n Kokkos::ScanSum(Kokkos::Reverse));\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n output(N - 1));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n output(i) = x(x.extent(0) - 1 - i);\n if (i > 0) output(i) += output(i - 1);\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA (const int i) {\n output(i) = 0;\n for (int j = n - 1; j >= i; j--) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n output(i) = (i == 0) ? x(i) : x(i) + output(i - 1);\n });\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Tag = Kokkos::TagType;\n\n Kokkos::parallel_for(Policy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n output(i) = x(x.extent(0) - 1 - i);\n if (i > 0) output(i) += output(i - 1);\n }, Tag());\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n 0);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use OpenMP to compute in parallel.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n 
for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n output[i] = x[i];\n } else {\n output[i] = output[i + 1] + x[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n output[i] = 0;\n for (int j = n - 1; j >= i; --j) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n output[i] = 0;\n for (int j = i; j < n; j++) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "serial", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 
1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output.back() = x.back();\n for (int i = x.size() - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n for (int i = x.size() - 1; i >= 0; --i) {\n if (i + 1 < x.size()) {\n output[i] = x[i] + output[i + 1];\n } else {\n output[i] = x[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output.back() = x.back();\n for (int i = x.size() - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\n__global__ void reversePrefixSum(const int *x, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n if (tid > 0) {\n output[tid] += output[tid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n output[gid] = x[N - 1 - gid];\n if (gid > 0) { output[gid] += output[gid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[N - 1 - gid] = x[gid];\n }\n __syncthreads();\n for (int i = N - 2; i >= 0; i--) {\n if (gid < N) {\n output[i] = output[i + 1] + x[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n int i = tid + bid * blockSize;\n if (i < N) {\n output[i] = 0;\n for (int j = N - 1; j >= i; j--) { output[i] += x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = blockDim.x / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int offset = bid * hipBlockDim_x + tid;\n if (offset < N) {\n output[offset] = x[N - 1 - offset];\n if (offset > 0) {\n output[offset] += output[offset - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n 
__syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum in reverse order\n std::vector prefix_sum(x.size());\n prefix_sum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n prefix_sum[i] = prefix_sum[i + 1] + x[i];\n }\n\n // Gather the prefix sums from all ranks\n std::vector gathered_prefix_sums(x.size() * size);\n MPI_Gather(prefix_sum.data(), x.size(), MPI_INT, gathered_prefix_sums.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gathered_prefix_sums[i * size + size - 1];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_sum = 0;\n for (int i = n - 1; i >= 0; i--) {\n local_sum += x[i];\n if (rank == 0) {\n output[i] = local_sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n, 0);\n std::vector global_sums(n, 0);\n\n // Compute the partial sums on each rank\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n if (i > 0) {\n partial_sums[i] += partial_sums[i - 1];\n }\n }\n\n // Gather the partial sums from all ranks\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n // Compute the 
reverse prefix sums on rank 0\n if (rank == 0) {\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n global_sums[i] += global_sums[i + 1];\n }\n output[i] = global_sums[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.reserve(n);\n std::iota(std::execution::par, output.begin(), output.end(), 0);\n std::transform(std::execution::par, x.rbegin(), x.rend(), output.rbegin(),\n [](int a, int b) { return a + b; });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n std::vector prefix_sum(n);\n MPI_Allreduce(x.data(), prefix_sum.data(), n, MPI_INT, MPI_SUM, comm);\n\n if (rank == 0) {\n output.resize(n);\n for (int i = 0; i < n; i++) {\n output[i] = prefix_sum[n - 1 - i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n local_output[i] = x[i];\n } else {\n local_output[i] = x[i] + local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n for (int i = 0; i < n; i++) {\n if (i == 0) {\n output[i] = global_output[i];\n } else {\n output[i] = global_output[i] - global_output[i - 1];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = 
x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n for (int i = n - 1; i >= 0; i--) {\n if (rank == 0) {\n output[i] = local_x[i];\n }\n MPI_Allreduce(MPI_IN_PLACE, &local_x[i], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n local_output[i] = output[i];\n }\n }\n if (rank == 0) {\n output = local_output;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_sum = 0;\n for (int i = x.size() - 1; i >= 0; i--) {\n local_sum += x[i];\n if (rank == 0) {\n output[i] = local_sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector global_sums(n);\n\n // Compute the partial sums on each rank\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n if (i > 0) {\n partial_sums[i] += partial_sums[i - 1];\n }\n }\n\n // Gather the partial sums from all ranks\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n global_sums[i] += global_sums[i + 1];\n }\n output[i] = global_sums[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector partial_sums(n);\n std::partial_sum(x.rbegin(), x.rend(), partial_sums.rbegin());\n\n std::vector global_sums(n);\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_sums;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n local_output[i] = x[i];\n } else {\n local_output[i] = x[i] + 
local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Reverse the global prefix sum\n for (int i = 0; i < n / 2; i++) {\n std::swap(global_output[i], global_output[n - i - 1]);\n }\n\n // Store the result in output on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0071522442, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0089186442, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the reverse prefix sum\n std::vector reverse_prefix_sum(x.size());\n reverse_prefix_sum[x.size() - 1] = prefix_sum[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] - x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n output = reverse_prefix_sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefix_sum(n);\n std::vector reverse_prefix_sum(n);\n\n // Compute the prefix sum\n prefix_sum[0] = x[0];\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the reverse prefix sum\n reverse_prefix_sum[n - 1] = prefix_sum[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] + prefix_sum[i];\n }\n\n // Gather the reverse prefix sum on rank 0\n MPI_Gather(&reverse_prefix_sum[0], n, MPI_INT, &output[0], n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n local_output[i] = x[i];\n if (i + 1 < n) {\n local_output[i] += local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i + 1 < n) {\n global_output[i] -= global_output[i + 1];\n }\n }\n\n // 
Store the result on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector global_sums(n);\n\n // Compute the partial sums on each rank\n partial_sums[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n partial_sums[i] = partial_sums[i + 1] + x[i];\n }\n\n // Gather the partial sums into the global_sums vector on rank 0\n MPI_Gather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n output[0] = global_sums[0];\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + global_sums[n - i - 1];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x in reverse order\n std::vector reversePrefixSum(x.size());\n reversePrefixSum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reversePrefixSum[i] = reversePrefixSum[i + 1] + x[i];\n }\n\n // Gather the reverse prefix sums from all ranks\n std::vector gatheredReversePrefixSum(x.size() * size);\n MPI_Gather(reversePrefixSum.data(), x.size(), MPI_INT, gatheredReversePrefixSum.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the final reverse prefix sum\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gatheredReversePrefixSum[i * size];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Compute the prefix sum of the vector x in reverse order.\n std::vector reverse_prefix_sum(x.size());\n reverse_prefix_sum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] + x[i];\n }\n\n // Gather the reverse prefix sums from all ranks.\n std::vector gathered_reverse_prefix_sums(x.size() * num_ranks);\n MPI_Gather(reverse_prefix_sum.data(), x.size(), MPI_INT, gathered_reverse_prefix_sums.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the output vector.\n if (my_rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gathered_reverse_prefix_sums[i * num_ranks + my_rank];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + 
"did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n for (int i = 0; i < n; i++) {\n local_output[i] = x[n - i - 1];\n }\n\n std::vector global_output(n);\n MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector recvbuf(n);\n std::vector sendbuf(n);\n for (int i = 0; i < n; i++) {\n sendbuf[i] = x[n - i - 1];\n }\n MPI_Allgather(sendbuf.data(), n, MPI_INT, recvbuf.data(), n, MPI_INT, MPI_COMM_WORLD);\n int sum = 0;\n for (int i = 0; i < n; i++) {\n sum += recvbuf[i];\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0062362022, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.006903541, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\n__global__ void reversePrefixSum(const int *x, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (int j = block_size / 2; j > 0; j /= 2) {\n if (tid < j) {\n output[tid] += output[tid + j];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n for (int i = tid + 1; i < N; i++) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n int offset = bid * stride;\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i + offset];\n output[i + offset] = sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N - 1; i >= 0; i--) {\n if (tid == i) {\n output[i] += output[i + 1];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid == 0) {\n output[0] = x[0];\n } else {\n output[tid] = output[tid - 1] + x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int i = blockIdx.x * blockDim.x + tid;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (int j = i + blockDim.x; j < N; j += blockDim.x) {\n output[j] = output[j - blockDim.x] + x[j];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int sum = 0;\n for (int i = tid; i < N; i += stride) {\n sum += x[i];\n output[N - 1 - i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n output[i] = 0;\n for (int j = i + 1; j < N; j++) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n int offset = bid * stride;\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i + offset];\n output[i + offset] = sum;\n }\n}\n", + "\n\n 
int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n if (i > 0) { output[i] += output[i - 1]; }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n int offset = block * stride + tid;\n\n if (offset < N) {\n output[offset] = x[offset];\n }\n\n __syncthreads();\n\n for (int i = offset + stride; i < N; i += stride) {\n output[i] = output[i - stride] + x[i];\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n if (tid == 0) {\n output[0] = x[0];\n } else {\n output[tid] = output[tid - 1] + x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_size * block_id + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int stride = blockDim * gridDim;\n int start = tid + blockId * blockDim;\n int end = min(start + stride, N);\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector reverse_partial_sums(n);\n\n // Compute the partial sums in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n partial_sums[i] = x[i];\n for (int j = i + 1; j < n; ++j) {\n partial_sums[j] += partial_sums[j - 1];\n }\n }\n\n // Compute the reverse partial sums in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; --i) {\n reverse_partial_sums[i] = partial_sums[i];\n if (i > 0) {\n reverse_partial_sums[i] += reverse_partial_sums[i - 1];\n }\n }\n\n // Gather the reverse partial sums on rank 0\n MPI_Gather(&reverse_partial_sums[0], n, MPI_INT, &output[0], n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local prefix sum\n std::vector local_prefix_sum(n);\n local_prefix_sum[0] = x[0];\n for (int i = 1; i < n; i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n\n // Compute the global prefix sum\n std::vector global_prefix_sum(n);\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n if (rank == 0) {\n output.resize(n);\n output[n - 1] = global_prefix_sum[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = global_prefix_sum[i] - global_prefix_sum[i + 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n 
std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_output[i] = local_x[n - 1 - i];\n }\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n std::vector local_prefix_sum(x.size());\n local_prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n\n // Compute the global prefix sum\n std::vector global_prefix_sum(x.size());\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), x.size(), MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = global_prefix_sum[x.size() - 1] - global_prefix_sum[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int n = x.size();\n std::vector partial_sums(n);\n\n // Compute the partial sums in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n partial_sums[j] += partial_sums[j - 1];\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sums(n);\n MPI_Reduce(partial_sums.data(), global_sums.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum on rank 0\n if (my_rank == 0) {\n output.resize(n);\n output[n - 1] = global_sums[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = global_sums[i] - global_sums[i + 1];\n }\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local reverse prefix sum\n std::vector local_output(n);\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (my_rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n std::vector global_output(n);\n MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the prefix sum in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_output[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n local_output[i] += x[j];\n }\n }\n\n // Reduce the local prefix sums to the global prefix sum on rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Reverse the prefix sum on rank 0\n if (rank == 0) {\n for (int i = 0; i < n / 2; i++) {\n std::swap(output[i], output[n - i - 1]);\n }\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n prefix_sum[i] += prefix_sum[i + 1];\n }\n }\n\n // Gather the results on rank 0\n if (my_rank == 0) {\n output = prefix_sum;\n }\n}\n", + "\n\n int n = x.size();\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &n_procs);\n int local_n = n / n_procs;\n int remainder = n % n_procs;\n if (my_rank == 0) {\n output.resize(n);\n }\n std::vector local_x(local_n + (my_rank < remainder ? 1 : 0));\n MPI_Scatter(x.data(), local_n, MPI_INT, local_x.data(), local_n, MPI_INT, 0,\n MPI_COMM_WORLD);\n std::vector local_output(local_n + (my_rank < remainder ? 1 : 0));\n local_output[local_n - 1] = local_x[local_n - 1];\n #pragma omp parallel for\n for (int i = local_n - 2; i >= 0; --i) {\n local_output[i] = local_x[i] + local_output[i + 1];\n }\n MPI_Gather(local_output.data(), local_n, MPI_INT, output.data(), local_n, MPI_INT,\n 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Reverse the prefix sum in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n output[i] = prefix_sum[n - 1 - i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nranks; i++) {\n std::vector recv_buf(n);\n MPI_Recv(recv_buf.data(), n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n output[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; 
i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n prefix_sum[i] -= prefix_sum[i + 1];\n }\n output[i] = prefix_sum[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = x.size() - 1; i >= 0; i--) {\n if (i < x.size() - 1) {\n prefix_sum[i] -= prefix_sum[i + 1];\n }\n output[i] = prefix_sum[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n local_output[i] = local_x[i];\n if (i + 1 < n) {\n local_output[i] += local_output[i + 1];\n }\n }\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of the vector x\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum of the vector x\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n output[i] = prefix_sum[x.size() - 1] - prefix_sum[i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(&output[0], x.size(), MPI_INT, &output[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + 
"parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n using namespace Kokkos;\n\n const size_t n = x.extent(0);\n View y(\"y\", n);\n\n parallel_for(n, KOKKOS_LAMBDA(const size_t i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return sum(y);\n}\n", + "\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n using l_d_type = Kokkos::LayoutLeft;\n using l_d_view_type = Kokkos::View;\n\n const size_t n = x.extent(0);\n l_d_view_type x_copy(\"x_copy\", n);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::parallel_scan(policy_type(0, n), KOKKOS_LAMBDA(const size_t& i, l_d_view_type& x_copy) {\n if (i > 0) { x_copy(i) += x_copy(i - 1); }\n }, tag_type());\n double sum = 0;\n for (size_t i = 0; i < n; ++i) { sum += x_copy(i); }\n return sum;\n}\n", + "\n using namespace Kokkos;\n\n const int N = x.extent(0);\n View y(\"y\", N);\n\n parallel_for(N, KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return reduce(N, KOKKOS_LAMBDA(int i, double sum) { return sum + y(i); }, 0.0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_type;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = typename TeamPolicy::member_type;\n\n double sum = 0.0;\n Kokkos::parallel_reduce(TeamPolicy(Kokkos::Experimental::partition_by_coordinates(x)),\n KOKKOS_LAMBDA(TeamMember& team) {\n double local_sum = 0.0;\n for (auto i = team.league_rank(); i < x.extent(0);\n i += team.league_size()) {\n local_sum += x(i);\n }\n team.team_reduce(KOKKOS_LAMBDA(double& tsum,\n double& lsum) { tsum += lsum; });\n if (team.team_rank() == 0) { sum = team.team_reduce(); }\n });\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using execution_space = Kokkos::DefaultExecutionSpace;\n using policy_type = Kokkos::RangePolicy;\n\n // Compute the prefix sum array\n View prefix_sum(\"prefix_sum\", x.size());\n parallel_for(policy_type(0, x.size()), KOKKOS_LAMBDA(const int i) {\n prefix_sum(i) = (i == 0) ? 
x(i) : x(i) + prefix_sum(i - 1);\n });\n\n // Compute the sum of the prefix sum array\n double sum = 0;\n parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int i, double& sum) {\n sum += prefix_sum(i);\n }, sum);\n\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n using execution_space = Kokkos::DefaultExecutionSpace;\n\n const int n = x.extent(0);\n auto y = Kokkos::View(\"y\", n);\n\n Kokkos::parallel_for(\n \"prefix_sum\", policy_t(0, n), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return Kokkos::parallel_reduce(\n \"sum_prefix_sum\", policy_t(0, n), KOKKOS_LAMBDA(const double& a,\n const double& b) {\n return a + b;\n },\n y);\n}\n", + "\n using namespace Kokkos;\n\n using policy_type = Kokkos::TeamPolicy;\n policy_type policy(x.extent(0));\n\n double sum = 0.0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n double prefix_sum = 0.0;\n for (int j = 0; j <= i; ++j) { prefix_sum += x(j); }\n sum += prefix_sum;\n });\n\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n auto prefix_sum = Kokkos::scan(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n },\n Reducer(0));\n\n return Kokkos::parallel_reduce(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += prefix_sum(i);\n },\n Reducer(0));\n}\n", + "\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::TeamPolicy;\n\n double sum = 0;\n Kokkos::parallel_reduce(\n TeamPolicy<>(x.extent(0), Kokkos::AUTO),\n KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n sum += update;\n },\n sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a prefix sum view\n View prefix_sum = create_mirror_view(x);\n\n // Compute the prefix sum\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = prefix_sum(i - 1) + x(i);\n }\n prefix_sum(i) = sum;\n });\n\n // Compute the sum of the prefix sum array\n double sum_of_prefix_sum = 0.0;\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += prefix_sum(i);\n }, sum_of_prefix_sum);\n\n return sum_of_prefix_sum;\n}\n", + "\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using ViewType = Kokkos::View;\n\n const size_t n = x.extent(0);\n ViewType prefix_sum(\"prefix_sum\", n);\n\n parallel_for(\"prefix_sum\", Policy(0, n), KOKKOS_LAMBDA(const size_t& i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n\n return prefix_sum(n - 1);\n}\n", + "\n Kokkos::View prefixSum = Kokkos::View(x.extent(0));\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = prefixSum(i - 1) + x(i);\n }\n }, prefixSum);\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n update += prefixSum(i);\n }, sum);\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.extent(0));\n Kokkos::parallel_for(\"prefix_sum_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n return Kokkos::parallel_reduce(\"sum_prefix_sum\", 
Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double& sum) {\n sum += prefix_sum(i);\n }, 0.0);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto prefix_sum = Kokkos::scan(policy_type(0, x.size()), tag_type(), x,\n [=](const double& a, const double& b) { return a + b; });\n\n return Kokkos::parallel_reduce(policy_type(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += prefix_sum(i); },\n 0.0);\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n prefix_sum(i) = (i == 0) ? x(i) : x(i) + prefix_sum(i - 1);\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n double sum = 0;\n Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n sum += update;\n },\n tag_type());\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = prefix_sum(i - 1) + x(i);\n }\n prefix_sum(i) = sum;\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reduction = Kokkos::Sum;\n\n // Compute the prefix sum array\n View y = Kokkos::scan(Policy(0, x.size()), Reduction(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n // Compute the sum of the prefix sum array\n double sum = Kokkos::reduce(Policy(0, y.size()), Reduction(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += y(i);\n });\n\n return sum;\n}\n", + "\n using Kokkos::parallel_reduce;\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n }, sum);\n\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n std::vector prefixSum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefixSum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for 
reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = (i > 0 ? prefix_sum[i - 1] : 0) + x[i];\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n, 0);\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n sum += x[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) prefix_sum[i] += prefix_sum[i - 1];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n }\n for (double v : x) {\n sum += v;\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n }\n for (double v : x) {\n sum += v;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n #pragma omp parallel for reduction(+: sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n std::vector prefixSum(x.size(), 0.0);\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) { prefixSum[i] += prefixSum[i - 1]; }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 
1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n // Compute the prefix sum array\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sum array\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n return sum;\n}\n", + "\n\n // Create a vector to store the prefix sum\n std::vector prefix_sum(x.size());\n\n // Compute the prefix sum in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sum\n double sum = 0;\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n return sum;\n}\n", + "\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = (i > 0 ? prefix_sum[i - 1] : 0) + x[i];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "serial", + "prompt": "/* Compute the prefix sum array of the vector x and return its sum.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (auto xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "(", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double 
sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "hip", + "prompt": "/* Compute the prefix sum array of the vector x and compute its sum. Store the result in sum.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\n__global__ void sumOfPrefixSum(const double *x, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n sum[tid + stride] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n\n double sum_local = 0;\n for (int i = gid; i < N; i += bid_size) {\n sum_local += x[i];\n }\n\n double sum_global;\n hipReduce(&sum_global, &sum_local, 1, hipReduceSum, 0);\n *sum = sum_global;\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid < N) {\n sum[gid] = x[gid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < hipBlockDim_x; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n double prefix_sum = 0;\n for (int i = 0; i <= gid; i++) {\n prefix_sum += x[i];\n }\n sum[gid] = prefix_sum;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n sum[tid] += sum[tid + i];\n }\n __syncthreads();\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum_local = x[tid];\n __syncthreads();\n for (int i = 1; i < blockDim.x; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n sum_local += x[tid + i];\n }\n __syncthreads();\n }\n sum[tid] = sum_local;\n }\n __syncthreads();\n if (tid == 0) {\n double sum_global = 0;\n for (int i = 0; i < N; i++) {\n sum_global += sum[i];\n }\n *sum = sum_global;\n }\n}\n", + "\n\n int tid = blockIdx.x 
* blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n sum[bid] += x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n size_t stride = blockDim.x * gridDim.x;\n double localSum = 0;\n for (size_t i = tid; i < N; i += stride) {\n localSum += x[i];\n }\n __syncthreads();\n double globalSum = localSum;\n for (size_t i = 1; i < blockDim.x; i++) {\n globalSum += localSum;\n }\n __syncthreads();\n *sum = globalSum;\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n sum[tid + stride] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n if (tid > 0) { sum[tid] += sum[tid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n __syncthreads();\n if (tid > 0) { sum[tid] += sum[tid - 1]; }\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 0; i < N; i++) { sum[0] += sum[i]; }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "1\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + 
"did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sum\n double sum = 0;\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0101444848, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0109921156, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0114530866, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0128741368, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01077313305, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0125071258875, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01386151433125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01947684621562, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02652412350781, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04394469089297, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07211332072383, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n 
std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = std::accumulate(prefix_sum.begin(), prefix_sum.end(), 0.0);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.010620192, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.010875049, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0113252245, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01109867605, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0110256716, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01320140405, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01446881225, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01905763139688, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03248615465625, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04322387347734, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07020692964375, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n\n double prefix_sum = 0;\n MPI_Allreduce(&sum, &prefix_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return prefix_sum;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n 
int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefixSum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum array\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sum array\n double sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.010250609, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0107819494, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0114808049, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0109665352, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0110902027, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0111647092375, + "num_procs": 16 + }, + { + "did_run": false, + 
"is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n MPI_Allreduce(&sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n }\n return 0;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": 
true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "cuda", + "prompt": "/* Compute the prefix sum array of the vector x and compute its sum. Store the result in sum.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\n__global__ void sumOfPrefixSum(const double *x, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n double sum_local = 0;\n for (int i = tid; i < N; i += stride) {\n sum_local += x[i];\n }\n __shared__ double sum_shared[blockDim.x];\n sum_shared[tid] = sum_local;\n __syncthreads();\n if (tid == 0) {\n double sum_block = 0;\n for (int i = 0; i < blockDim.x; i++) {\n sum_block += sum_shared[i];\n }\n atomicAdd(sum, sum_block);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n const int bid = blockIdx.x;\n const int bid_size = blockDim.x;\n const int gid = bid * bid_size + tid;\n\n if (gid < N) {\n double partial_sum = x[gid];\n if (gid > 0) {\n partial_sum += x[gid - 1];\n }\n sum[gid] = partial_sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = tid + bid * bid_size;\n if (i < N) {\n sum[i] = x[i];\n }\n __syncthreads();\n for (int s = bid_size / 2; s > 0; s >>= 1) {\n if (i < N && tid < s) {\n sum[i] += sum[i + s];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum_prefix = 0;\n for (int i 
= 0; i <= tid; i++) {\n sum_prefix += x[i];\n }\n double sum_suffix = 0;\n for (int i = tid + 1; i < N; i++) {\n sum_suffix += x[i];\n }\n sum[tid] = sum_prefix + sum_suffix;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double partial_sum = 0;\n for (int i = 0; i <= tid; i++) { partial_sum += x[i]; }\n sum[tid] = partial_sum;\n }\n __syncthreads();\n if (tid == 0) {\n double total_sum = 0;\n for (int i = 0; i < N; i++) { total_sum += sum[i]; }\n *sum = total_sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = 1; i < N; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double prefixSum = 0;\n for (int i = 0; i <= tid; i++) {\n prefixSum += x[i];\n }\n sum[tid] = prefixSum;\n }\n __syncthreads();\n if (tid == 0) {\n double totalSum = 0;\n for (int i = 0; i < N; i++) {\n totalSum += sum[i];\n }\n *sum = totalSum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = bid * bid_size + tid;\n\n double prefix_sum = 0;\n while (i < N) {\n prefix_sum += x[i];\n x[i] = prefix_sum;\n i += grid_size * bid_size;\n }\n\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) { *sum += x[i]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n double s = 0;\n for (int i = 0; i <= gid; i++) {\n s += x[i];\n }\n sum[gid] = s;\n if (gid > 0) {\n sum[gid] += sum[gid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(sum, x[tid]);\n }\n __syncthreads();\n if (tid == 0) {\n double sum_temp = *sum;\n atomicAdd(sum, sum_temp);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use MPI 
and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(prefixSum.data(), &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) return sum;\n else return 0.0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[0], &sum, x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefix_sum[x.size() - 1], 1, MPI_DOUBLE, &sum, 1, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n double global_sum = 0.0;\n\n // Compute the prefix sum of the local vector\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 1; i < 
x.size(); i++) {\n x[i] += x[i - 1];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum of the global prefix sum on rank 0\n if (rank == 0) {\n return global_sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n double sum = 0.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); i++) {\n double local_sum = 0.0;\n for (int j = 0; j <= i; j++) {\n local_sum += x[j];\n }\n prefix_sum[i] = local_sum;\n }\n\n MPI_Reduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < prefixSum.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) prefix_sum[i] += prefix_sum[i - 1];\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) return sum;\n else return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefixSum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < n; ++i) {\n sum += prefixSum[i];\n }\n\n if (rank == 0) return sum;\n return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sumOfPrefixSum = 0.0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sumOfPrefixSum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n 
if (rank == 0) {\n return sumOfPrefixSum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(prefix_sum.data(), &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < n; ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of each rank's portion of x\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Reduce the prefix sums across all ranks\n std::vector global_prefix_sum(x.size());\n MPI_Allreduce(prefix_sum.data(), global_prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the sum of the prefix sum array\n double sum = 0.0;\n for (int i = 0; i < x.size(); i++) {\n sum += global_prefix_sum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefixSum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n double sum = 0;\n MPI_Reduce(&prefixSum[n - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < prefixSum.size(); i++) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector prefix_sum(n);\n std::vector global_prefix_sum(n);\n\n // Compute the prefix sum on each rank\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Reduce the prefix sums from all ranks\n MPI_Allreduce(prefix_sum.data(), global_prefix_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum of the prefix sum on rank 0\n if (rank == 0) {\n return global_prefix_sum[n - 1];\n } else {\n return 0.0;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace the i-th element of the array x with the minimum value from indices 0 through i.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 0; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n\n Kokkos::parallel_for(\n Kokkos::RangePolicy>(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n x(i) = x(i);\n } else {\n x(i) = Kokkos::min(x(i), x(i - 1));\n }\n },\n Layout());\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), 
KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = Kokkos::min(x(i), Kokkos::min(x, i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n float min = x[0];\n for (int j = 1; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n float 
min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "", + "\n\n int n = x.size();\n float min = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for shared(min)\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n std::vector mins(n);\n mins[0] = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n mins[i] = std::min(mins[i - 1], x[i]);\n }\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = mins[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n std::vector min(n);\n min[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min[i] = std::min(min[i - 1], x[i]);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = min[i];\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "serial", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], *std::min_element(x.begin(), x.begin() + i));\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "<, <, 
\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:>\u51a2\u51a2\u51a28\u51a2\u51a2:>\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:>:>", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "s\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2cernecernecerne,\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "hip", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\n__global__ void partialMinimums(float *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n float min = x[0];\n for (int j = 1; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n if (x[i] < min) { min = x[i]; }\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int 
bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n x[i] = min;\n }\n if (rank == 0) {\n MPI_Reduce(&x[0], &x[0], x.size(), MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(&x[0], nullptr, x.size(), MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n } else {\n MPI_Send(&x[rank], 1, MPI_FLOAT, 0, 0, 
MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0004515502, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009856478, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004928871, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002541269, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000125799825, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 6.4283675e-05, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 3.63549625e-05, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.599132812e-05, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = x[i - 1];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < size; i++) {\n MPI_Send(&x[i - 1], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n float min;\n MPI_Recv(&min, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x[i - 1] = std::min(x[i - 1], min);\n }\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Bcast(&x[i - 1], 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, 
MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks onto rank 0\n std::vector min_values(x.size());\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &min_values[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Update the values on rank 0\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], min_values[i]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0004411522, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012652506, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int 
rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = x[i - 1];\n }\n }\n}\n", + "source_write_success": true, + 
"did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < size; i++) {\n MPI_Send(&x[i - 1], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n float min;\n MPI_Recv(&min, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x[i - 1] = std::min(x[i - 1], min);\n }\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Bcast(&x[i - 1], 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "cuda", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\n__global__ void partialMinimums(float *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n float min = x[0];\n for (size_t j = 1; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n __shared__ float min;\n\n if (tid == 0) {\n min = x[bid * bid_size];\n }\n\n __syncthreads();\n\n for (int i = bid * bid_size + tid; i < N; i += bid_size) {\n min = fminf(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < size) {\n float min = x[i];\n for (int j = i + 1; j < size; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, 
x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n if (tid == 0) {\n x[bid * bid_size] = x[bid * bid_size];\n } else {\n x[bid * bid_size + tid] = fminf(x[bid * bid_size + tid], x[bid * bid_size + tid - 1]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank], 1, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks to rank 0\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &x[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct partial minimums\n if (rank != 0) {\n x.clear();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks to rank 0\n MPI_Gather(&x[0], n, MPI_FLOAT, &x[0], n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete list of partial minimums\n if (rank == 0) {\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float *x_arr = &x[0];\n float *min_arr = new float[n];\n\n // Compute the minimums for each rank\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n float min = 
x_arr[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x_arr[j]);\n }\n min_arr[i] = min;\n }\n\n // Gather the minimums from all ranks\n MPI_Gather(min_arr, n, MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n x_arr[i] = min_arr[i];\n }\n }\n\n delete[] min_arr;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector min_values(n);\n#pragma omp parallel for\n for (int i = 0; i < n; i++) {\n min_values[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n min_values[i] = std::min(min_values[i], x[j]);\n }\n }\n MPI_Allgather(&min_values[0], n, MPI_FLOAT, &x[0], n, MPI_FLOAT, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min = x[0];\n\n #pragma omp parallel for schedule(static)\n for (int i = 1; i < n; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_FLOAT, x.data(), n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(x.data(), n, MPI_FLOAT, NULL, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min_val = x[0];\n\n #pragma omp parallel for reduction(min : min_val)\n for (int i = 1; i < n; i++) {\n min_val = std::min(min_val, x[i]);\n }\n\n MPI_Bcast(&min_val, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::min(x[i], min_val);\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_FLOAT, x.data(), n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(x.data(), n, MPI_FLOAT, NULL, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n std::vector globalMinimums(x.size());\n MPI_Allgather(x.data(), x.size(), MPI_FLOAT, globalMinimums.data(), x.size(), MPI_FLOAT, MPI_COMM_WORLD);\n\n // Update the minimums on rank 0\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], globalMinimums[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n std::vector global_min(n);\n\n // Compute local minimums\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n // Reduce local minimums to global minimums\n MPI_Allreduce(local_min.data(), global_min.data(), n, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n // Store global minimums on rank 0\n if (rank == 0) {\n x = global_min;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n MPI_Reduce(local_min.data(), x.data(), n, MPI_FLOAT, MPI_MIN, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector localMin(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n localMin[i] = x[i];\n for (int j = 0; j < i; j++) {\n localMin[i] = std::min(localMin[i], x[j]);\n }\n }\n\n MPI_Allgather(localMin.data(), n, MPI_FLOAT, x.data(), n, MPI_FLOAT, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n MPI_Reduce(local_min.data(), x.data(), n, MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &x[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank != 0) {\n x.clear();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float localMin = x[0];\n float globalMin;\n\n #pragma omp parallel for reduction(min : localMin)\n for (int i = 1; i < n; i++) {\n localMin = std::min(localMin, x[i]);\n }\n\n MPI_Allreduce(&localMin, &globalMin, 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n for (int i = 1; i < n; i++) {\n x[i] = globalMin;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of ranks\n int rank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the partial minimums in parallel\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n x[i] = min;\n }\n\n // Gather the partial minimums from all ranks\n std::vector partialMins(x.size() * numRanks);\n MPI_Gather(x.data(), x.size(), MPI_FLOAT, partialMins.data(), x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // If rank 0, store the result in x\n if (rank == 0) {\n x.clear();\n for (int i = 0; i < partialMins.size(); i += numRanks) {\n x.push_back(partialMins[i]);\n }\n }\n}\n", + "<, Synag", + "\n\n int n = x.size();\n int nthreads = omp_get_num_threads();\n int tid = omp_get_thread_num();\n int chunk = n / nthreads;\n int start = tid * chunk;\n int end = start + chunk;\n if (tid == nthreads - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n x[i] = std::min(x[i], *std::min_element(x.begin(), x.begin() + i));\n }\n omp_barrier();\n if (tid == 0) {\n for (int i = chunk; i < n; i += chunk) {\n for (int j = 0; j < chunk; j++) {\n x[i + j] = 
std::min(x[i + j], x[i + j - chunk]);\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector localMin(n);\n std::vector globalMin(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n localMin[i] = x[i];\n for (int j = 0; j <= i; j++) {\n localMin[i] = std::min(localMin[i], x[j]);\n }\n }\n\n MPI_Allreduce(localMin.data(), globalMin.data(), n, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n x = globalMin;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n std::vector min_values(x.size(), 0.0);\n MPI_Allgather(&x[0], x.size(), MPI_FLOAT, &min_values[0], x.size(), MPI_FLOAT, MPI_COMM_WORLD);\n\n // Update the values on rank 0\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::min(x[i], min_values[i]);\n }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n const int n = x.extent(0);\n View max_so_far = \"max_so_far\";\n View max_ending_here = \"max_ending_here\";\n Kokkos::parallel_for(\"max_subarray\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i) {\n if (i == 0) {\n max_ending_here[i] = x[i];\n max_so_far[i] = x[i];\n } else {\n max_ending_here[i] = std::max(x[i], max_ending_here[i - 1] + x[i]);\n max_so_far[i] = std::max(max_so_far[i - 1], max_ending_here[i]);\n }\n });\n return max_so_far[n - 1];\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::View dp = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n dp(i) = x(i);\n } else {\n dp(i) = std::max(dp(i - 1) + x(i), x(i));\n }\n });\n return *Kokkos::max_element(dp);\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n for (int i = 1; i < x.extent(0); ++i) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int max_sum = 0;\n int current_sum = 0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, int& max_sum) {\n current_sum = std::max(x[i], current_sum + x[i]);\n max_sum = std::max(max_sum, current_sum);\n }, max_sum, tag_type());\n return max_sum;\n}\n", + "\n\n using namespace Kokkos;\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n Kokkos::parallel_scan(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far, int& max_ending_here) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n });\n\n return 
max_so_far;\n}\n", + "\n\n // Initialize the maximum sum to the first element of the array.\n int max_sum = x(0);\n\n // Iterate over the remaining elements of the array.\n for (int i = 1; i < x.extent(0); i++) {\n // If the current element is greater than the sum of the previous element and\n // the maximum sum, then set the maximum sum to the current element.\n if (x(i) > max_sum + x(i - 1)) {\n max_sum = x(i);\n } else {\n // Otherwise, add the current element to the maximum sum.\n max_sum += x(i);\n }\n }\n\n // Return the maximum sum.\n return max_sum;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_for(\n \"maximumSubarray\", Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_so_far);\n\n return max_so_far;\n}\n", + "\n\n Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::Kok\u30b3\u30af_VectorRange(x.extent(0)),\n Kok\u30b3\u30af::plus(),\n Kok\u30b3\u30af::make_iota_view(x.extent(0)),\n [&](int i) {\n return Kok\u30b3\u30af::parallel_reduce(\n Kok\u30b3\u30af::Kok\u30b3\u30af_VectorRange(i, x.extent(0)),\n Kok\u30b3\u30af::plus(),\n x[i],\n [&](int j) { return x[j]; });\n });\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Max;\n using reduction_functor_type = Kokkos::Max;\n using reduction_policy_type = Kokkos::Scan;\n\n int max_so_far = 0;\n int max_ending_here = 0;\n\n Kokkos::parallel_reduce(\n reduction_policy_type(x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_so_far) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_so_far);\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_ending_here, int& max_so_far) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_ending_here, 
max_so_far);\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(int i, int& max_so_far, int& curr_max) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n },\n max_so_far, curr_max);\n\n return max_so_far;\n}\n", + "\n\n // Initialize the maximum sum to the first element of the array\n int max_sum = x(0);\n\n // Initialize the current sum to the first element of the array\n int current_sum = x(0);\n\n // Loop through the remaining elements of the array\n for (int i = 1; i < x.extent(0); i++) {\n // Update the current sum by adding the current element\n current_sum = std::max(x(i), current_sum + x(i));\n\n // Update the maximum sum if the current sum is greater\n max_sum = std::max(max_sum, current_sum);\n }\n\n // Return the maximum sum\n return max_sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. 
if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_sum = INT_MIN;\n int sum = 0;\n #pragma omp parallel for reduction(max:max_sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n max_sum = std::max(max_sum, sum);\n if (sum < 0) {\n sum = 0;\n }\n }\n return max_sum;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n max_ending_here = std::max(x[i], max_ending_here + x[i]);\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n int n = x.size();\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n max_ending_here = std::max(x[i], max_ending_here + x[i]);\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < 
max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (size_t i = 0; i < x.size(); ++i) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0, max_ending_here = 0;\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (size_t i = 0; i < x.size(); ++i) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_sum = INT_MIN;\n int current_sum = 0;\n #pragma omp parallel for reduction(max:max_sum)\n for (int i = 0; i < x.size(); ++i) {\n current_sum = std::max(x[i], current_sum + x[i]);\n max_sum = std::max(max_sum, current_sum);\n }\n return max_sum;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n int n = x.size();\n #pragma parallel for reduction(+:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n" + ] 
+ }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "serial", + "prompt": "/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN;\n int 
max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "ontiguous, Synag", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Store the result in sum.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\n__global__ void maximumSubarray(const int *x, size_t N, int *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = hipMax(x[i], max_ending_here + x[i]);\n max_so_far = hipMax(max_so_far, max_ending_here);\n }\n hipAtomicMax(sum, &max_so_far);\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid >= N) return;\n\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = block.thread.x;\n int stride = block.thread.x + block.thread.y * block.dim.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int stride = hipBlockDim_x * hipGridDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int s = 0;\n int max = 0;\n for (size_t i = tid; i < N; i += blockDim.x * gridDim.x) {\n s = max(x[i], s + x[i]);\n max = max(max, s);\n }\n atomicAdd(sum, max);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int stride = bid_size * hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = start; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int tid_offset = tid + bid * bid_size;\n int sum_local = 0;\n int max_sum = 0;\n if (tid_offset < N) {\n sum_local = x[tid_offset];\n }\n __syncthreads();\n for (int i = tid_offset + bid_size; i < N; i += bid_size) {\n sum_local = max(sum_local + x[i], x[i]);\n max_sum = max(max_sum, 
sum_local);\n }\n if (tid == 0) {\n *sum = max_sum;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n __syncthreads();\n if (tid == 0) *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (i < N) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n i += hipBlockDim_x * hipGridDim_x;\n }\n *sum = max_so_far;\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int stride = blockDim.x * gridDim.x;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = tid; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bsize = hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int stride = bsize * hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N && i < start + stride; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n if (tid == 0) *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipBlockDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n hipAtomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int 
max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n hipAtomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int stride = bid_size * hipGridDim_x;\n\n int local_max = 0;\n int global_max = 0;\n\n for (int i = tid; i < N; i += stride) {\n local_max = max(x[i], local_max + x[i]);\n global_max = max(global_max, local_max);\n }\n\n if (tid == 0) {\n *sum = global_max;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = 0, max_ending_here = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_max;\n }\n return 0;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_sum = x[0];\n int sum = x[0];\n for (int i = 1; i < n; i++) {\n sum = std::max(x[i], sum + x[i]);\n max_sum = std::max(max_sum, sum);\n }\n return max_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, 
+ "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n int global_max;\n MPI_Allreduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < n; i++) {\n max_ending_here = 
max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "(4, -1, 2, 1) = 6.", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n int maxSum = INT_MIN;\n int sum = 0;\n for (int i = myRank; i < x.size(); i += numRanks) {\n sum += x[i];\n maxSum = std::max(maxSum, sum);\n if (sum < 0) {\n sum = 0;\n }\n }\n\n MPI_Allreduce(&maxSum, &maxSum, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxSum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = 
INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Store the result in sum.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\n__global__ void maximumSubarray(const int *x, size_t N, int *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Each thread computes the sum of its subarray\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int partialSum = 0;\n for (int i = tid; i < N; i += blockDim.x * gridDim.x) {\n partialSum += x[i];\n }\n\n // Each thread writes its partial sum to global memory\n __shared__ int partialSums[blockDim.x];\n partialSums[threadIdx.x] = partialSum;\n __syncthreads();\n\n // Each thread computes the maximum of its partial sum and the partial sums of the previous threads\n for (int i = blockDim.x / 2; i > 0; i >>= 1) {\n if (threadIdx.x < i) {\n partialSums[threadIdx.x] = max(partialSums[threadIdx.x], partialSums[threadIdx.x + i]);\n }\n __syncthreads();\n }\n\n // The first thread writes the maximum sum to global memory\n if (threadIdx.x == 0) {\n *sum = partialSums[0];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n\n int maxSoFar = x[tid];\n int maxEndingHere = x[tid];\n\n for (int i = tid + blockDim; i < N; i += blockDim * gridDim) {\n maxEndingHere = maxEndingHere + x[i];\n if (maxSoFar < maxEndingHere) {\n maxSoFar = maxEndingHere;\n }\n if (maxEndingHere < 0) {\n maxEndingHere = 0;\n }\n }\n\n int maxOverall = maxSoFar;\n\n if (blockId == 0) {\n for (int i = 1; i < gridDim; i++) {\n int temp = maxOverall;\n maxOverall = max(maxOverall, maxSoFar);\n maxSoFar = max(temp, maxSoFar);\n }\n }\n\n if (blockId == 0) {\n *sum = maxOverall;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n int max_sum = INT_MIN;\n int current_sum = 0;\n for (int i = start; i < end; i++) {\n current_sum = max(x[i], current_sum + x[i]);\n max_sum = max(max_sum, current_sum);\n }\n sum[tid] = max_sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int block_start = block_id * block_size;\n int block_end = min(block_start + block_size, N);\n int max_so_far = x[block_start];\n int max_ending_here = x[block_start];\n for (int i = block_start + 1; i < block_end; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n __shared__ int shared_max[block_size];\n shared_max[tid] = max_so_far;\n __syncthreads();\n if (tid == 0) {\n int max_overall = shared_max[0];\n for (int i = 1; i < block_size; i++) {\n max_overall = max(max_overall, shared_max[i]);\n }\n *sum = max_overall;\n }\n}\n", 
+ "\n\n int max_ending_here = x[0];\n int max_so_far = x[0];\n for (size_t i = 1; i < N; ++i) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n int start = tid + block * stride;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (start < N) {\n max_ending_here = max_ending_here + x[start];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n start += stride;\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int sum = 0;\n int maxSum = -2147483648;\n for (int i = tid; i < N; i += blockIdx.x * blockDim.x) {\n sum += x[i];\n maxSum = max(maxSum, sum);\n if (sum < 0) sum = 0;\n }\n sum = maxSum;\n *sum = maxSum;\n}\n", + "\n\n int my_sum = 0;\n int max_sum = 0;\n int max_sum_so_far = 0;\n int start = 0;\n int end = 0;\n int s = threadIdx.x;\n int e = threadIdx.x + blockDim.x - 1;\n if (e >= N) e = N - 1;\n for (int i = s; i <= e; i++) {\n my_sum += x[i];\n if (my_sum > max_sum_so_far) {\n max_sum_so_far = my_sum;\n start = s;\n end = i;\n }\n if (my_sum < 0) {\n my_sum = 0;\n }\n }\n if (max_sum_so_far > max_sum) {\n max_sum = max_sum_so_far;\n *sum = max_sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = tid; i < N; i += blockDim.x) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; ++i) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int stride = blockDim * gridDim;\n int start = tid + blockId * blockDim;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (start < N) {\n max_ending_here = max_ending_here + x[start];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n start += stride;\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid 
+ 1; i < N; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n }\n *sum = max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n for (size_t i = 1; i < N; ++i) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int sum_so_far = 0;\n int max_sum = 0;\n if (tid < N) {\n sum_so_far = x[tid];\n max_sum = x[tid];\n }\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n sum_so_far = fmax(x[i], sum_so_far + x[i]);\n max_sum = fmax(max_sum, sum_so_far);\n __syncthreads();\n }\n if (tid == 0) { *sum = max_sum; }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int max_so_far = 0;\n int max_ending_here = 0;\n\n for (int i = tid; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n sum[bid] = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n int chunk_size = n / num_ranks;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == num_ranks - 1) {\n end = n;\n }\n#pragma omp parallel for reduction(max : max_ending_here)\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n MPI_Allreduce(&max_so_far, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_so_far;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk = n / nprocs;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == nprocs - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int max_global;\n MPI_Reduce(&max_so_far, &max_global, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_global;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk_size = n / nprocs;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == nprocs - 1) {\n end = n;\n }\n#pragma omp parallel\n {\n int max_so_far_local = 0, max_ending_here_local = 0;\n#pragma omp for\n for (int i = start; i < end; i++) {\n max_ending_here_local = max_ending_here_local + x[i];\n if (max_so_far_local < max_ending_here_local) {\n max_so_far_local = max_ending_here_local;\n }\n if (max_ending_here_local < 0) {\n max_ending_here_local = 0;\n }\n }\n#pragma omp critical\n {\n if (max_so_far < max_so_far_local) {\n max_so_far = max_so_far_local;\n }\n }\n }\n MPI_Reduce(&max_so_far, &max_ending_here, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_ending_here;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallel for\n#pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // MPI reduction\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel\n {\n int local_max_so_far = INT_MIN, local_max_ending_here = 0;\n #pragma omp for\n for (int i = rank; i < n; i += nprocs) {\n local_max_ending_here = local_max_ending_here + x[i];\n if (local_max_so_far < 
local_max_ending_here)\n local_max_so_far = local_max_ending_here;\n if (local_max_ending_here < 0)\n local_max_ending_here = 0;\n }\n #pragma omp critical\n {\n if (max_so_far < local_max_so_far)\n max_so_far = local_max_so_far;\n }\n }\n return max_so_far;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int max_local = 0;\n int max_global = 0;\n\n #pragma omp parallel for reduction(max : max_local)\n for (int i = 0; i < n; i++) {\n max_local = std::max(x[i], max_local + x[i]);\n }\n\n MPI_Allreduce(&max_local, &max_global, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return max_global;\n } else {\n return 0;\n }\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel num_threads(4)\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n\n int global_max = max_so_far;\n MPI_Allreduce(&global_max, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallelization\n #pragma omp parallel\n {\n // Each thread computes the maximum subarray sum for its portion of the array\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n\n // MPI reduction to find the maximum subarray sum across all ranks\n int global_max;\n MPI_Allreduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n int chunk_size = n / num_threads;\n int start = tid * chunk_size;\n int end = start + chunk_size;\n if (tid == num_threads - 1) {\n end = n;\n }\n\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n int chunk_size = n / num_ranks;\n int start = my_rank * chunk_size;\n int end = start + chunk_size;\n if (my_rank == num_ranks - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if 
(max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // Reduce the maximum subarray sum across all ranks\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallel for loop\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // MPI reduce to get the maximum subarray sum\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_max;\n } else {\n return 0;\n }\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = std::min(start + chunk_size, n);\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n 
}\n\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int nthreads = omp_get_num_threads();\n int chunk = n / nthreads;\n int start = tid * chunk;\n int end = (tid + 1) * chunk;\n if (tid == nthreads - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_max;\n }\n return 0;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(Kokkos::View const& x, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n const int N = x.extent(0);\n parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = sum + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output);\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = output(i - 1) + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i, double& update) {\n update += x(i);\n output(i) = update;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = sum + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output(0));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) 
{\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n Kokkos::parallel_for(\"prefix_sum\", Kokkos::RangePolicy(0, output.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = x(i);\n } else {\n output(i) = output(i - 1) + x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output(0));\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n const size_t n = x.extent(0);\n output(0) = x(0);\n parallel_for(policy_type(1, n), KOKKOS_LAMBDA(const size_t i) {\n output(i) = output(i - 1) + x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"prefixSum\", output.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "Kok", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefix_sum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n const size_t size = x.extent(0);\n output(0) = x(0);\n parallel_for(policy_type(1, size), KOKKOS_LAMBDA(const size_t i) {\n output(i) = output(i - 1) + x(i);\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::Range(0, N), KOKKOS_LAMBDA (const int i) {\n if (i == 0) {\n output(i) = x(i);\n } else {\n output(i) = output(i - 1) + x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use OpenMP to compute in parallel.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp 
parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "<", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n double sum = 0;\n for (int j = 0; j <= i; j++) {\n sum += x[j];\n }\n output[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "serial", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (std::size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "<", + "<", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for 
(size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\n__global__ void prefixSum(const double* x, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n if (tid > 0) { output[tid] += output[tid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n \n if (gid < N) {\n output[gid] = x[gid];\n }\n \n __syncthreads();\n \n for (int stride = 1; stride < N; stride *= 2) {\n if (gid < N && gid + stride < N) {\n output[gid + stride] += output[gid];\n }\n 
__syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bsize = blockDim.x;\n int stride = bsize * bid;\n int i = stride + tid;\n if (i < N) {\n output[i] = x[i];\n if (i > 0) { output[i] += output[i - 1]; }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (gid % (2 * i) == 0 && gid + i < N) {\n output[gid + i] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n output[idx] = x[idx];\n if (idx > 0) {\n output[idx] += output[idx - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n 
Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n double global_sum = 0;\n\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n output.push_back(global_sum);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00812469, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081494986, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n MPI_Allreduce(&local_sum, &output[i], 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0081473768, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.226509496, + "num_procs": 1 + }, + { + 
"did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 
0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partialSum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n partialSum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partialSums(size);\n MPI_Gather(&partialSum, 1, MPI_DOUBLE, partialSums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partialSums[0];\n for (size_t i = 1; i < size; ++i) {\n output[i] = output[i - 1] + partialSums[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0081676754, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081493828, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + 
"runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum on each rank\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local sums to rank 0\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n for (int i = 1; i < size; ++i) {\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.008142676, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081995954, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double 
local_sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allgather(&local_sum, 1, MPI_DOUBLE, &global_sum, 1, MPI_DOUBLE, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector partial_sum(n);\n MPI_Allreduce(x.data(), partial_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n output = partial_sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] - x[i - 1];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\n__global__ void prefixSum(const double *x, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (gid % (2 * i) == 0 && gid + i < N) {\n output[gid] += output[gid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n output[idx] = 0;\n for (int i = 0; i <= idx; i++) {\n output[idx] += x[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 2; stride <= N; stride *= 2) {\n if (tid % stride == 0 && tid + stride / 2 < N) {\n output[tid] += output[tid + stride / 2];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n // Compute the prefix sum of the first bid_size elements in x\n double sum = 0;\n for (int i = tid; i < bid_size; i++) {\n sum += x[i];\n }\n\n // Store the prefix sum in the output array\n output[tid] = sum;\n\n // Synchronize the threads in the block\n __syncthreads();\n\n // Compute the prefix sum of the remaining elements in x\n for (int i = bid_size; i < N; i += bid_size) {\n // Load the prefix sum of the previous block\n sum = output[i - bid_size];\n\n // Compute the prefix sum of the current block\n for (int j = tid; j < bid_size; j++) {\n sum += x[i + j];\n }\n\n // Store the prefix sum in the output array\n output[i + tid] = sum;\n\n // Synchronize the threads in the block\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n // Compute the prefix sum\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n if (gid < N) {\n output[gid] = x[gid];\n }\n\n __syncthreads();\n\n for (int offset = 1; offset < N; offset *= 2) {\n if (gid + offset < N) {\n output[gid + offset] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n 
__syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n // Each thread computes the prefix sum of a subset of the elements in x.\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n // Compute the prefix sum of the current block\n double block_sum = 0;\n for (int i = tid; i < N; i += bid_size) {\n block_sum += x[i];\n output[i] = block_sum;\n }\n\n // Synchronize the threads in the block\n __syncthreads();\n\n // Compute the prefix sum of the previous blocks\n if (tid == 0) {\n for (int i = bid + 1; i < N; i += bid_size) {\n output[i] += output[i - 1];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "(", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int size = blockDim.x;\n int stride = size * gridDim.x;\n int start = tid + bid * stride;\n double sum = 0;\n for (int i = start; i < N; i += stride) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "_t", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the prefix sum of the vector x into output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Store the result in output on rank 0.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+: local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Gather the local prefix sums\n std::vector global_sum(x.size(), 0);\n MPI_Allreduce(output.data(), global_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM);\n\n // Store the global prefix sum on rank 0\n if (rank == 0) {\n output = global_sum;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n // Compute the local prefix sum\n std::vector local_prefix_sum(n);\n local_prefix_sum[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n // Compute the global prefix sum\n std::vector global_prefix_sum(n);\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n // Store the result on rank 0\n if (rank == 0) {\n output = global_prefix_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Reduce(partial_sum.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Bcast(output.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n #pragma omp parallel for\n for (int j = 1; j < n; j++) {\n output[j] += output[j - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes the sum of its local data\n double local_sum = 0;\n #pragma omp parallel for reduction(+: local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n // Gather the local sums into the output vector on rank 0\n MPI_Gather(&local_sum, 1, MPI_DOUBLE, output.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n \n // Rank 0 computes the prefix sum\n if (rank == 0) {\n for (size_t i = 1; i < size; ++i) {\n output[i] += output[i - 1];\n }\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the partial sum on each rank\n double partialSum = 0;\n #pragma omp parallel for reduction(+:partialSum)\n for (int i = 0; i < x.size(); i++) {\n partialSum += x[i];\n }\n\n // Gather the partial sums from all ranks\n std::vector partialSums(numRanks);\n MPI_Gather(&partialSum, 1, MPI_DOUBLE, partialSums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (myRank == 0) {\n output[0] = 0;\n for (int i = 1; i < numRanks; i++) {\n output[i] = output[i - 1] + partialSums[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce local prefix sums to get global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce local prefix sums to global prefix sum\n MPI_Reduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n, 0);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n if (i > 0) {\n partial_sum[i] += partial_sum[i - 1];\n }\n }\n\n std::vector global_sum(n, 0);\n MPI_Allreduce(partial_sum.data(), 
global_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0.0;\n #pragma omp parallel for reduction(+:partial_sum)\n for (int i = 0; i < x.size(); ++i) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; ++i) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n if (i > 0) {\n partial_sum[i] += partial_sum[i - 1];\n }\n }\n\n MPI_Reduce(partial_sum.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": 
"transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) *= -1;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n int &xi = x(i);\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n int &xi = x(i);\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n int val = x(i);\n if (val % 2 == 0) {\n x(i) = val / 2;\n } else {\n x(i) = -val;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) = x(i) / 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + 
"\n\n Kok koste::parallel_for(0, x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "omp", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + ":", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma\u00a0omp\u00a0parallel\u00a0for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] *= -1;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "serial", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "negate, Synag", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (auto &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "negate, Synag", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "hip", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\n__global__ void negateOddsAndHalveEvens(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipGetGlobalThreadIdx();\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0)\n x[tid] /= 2;\n else\n x[tid] = -x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n" + ] + }, + { + "problem_type": 
"transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": ".", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0011523674, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0205041622, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "E", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0,\n MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0,\n MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00108775, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.020423981, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[0], n / size, MPI_INT, &x[0], n / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + 
"best_sequential_runtime": 0.0010324496, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0203444672, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0010174516, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0204162766, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Negate odd values\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n }\n }\n\n // Halve even values\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n }\n }\n\n // Gather results on rank 0\n if (rank == 0) {\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_INT, result.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n x = result;\n } else {\n MPI_Gather(nullptr, 0, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.000983034, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0448064734, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0009794774, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0203800726, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n 
for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.001038112, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.020399982, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0010227532, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.020403837, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0010212604, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0233623922, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0009802894, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0203477014, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0009894522, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0203923356, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0009785776, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009930394, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0009920714, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010017336, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its portion of x.\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(x.data(), x.size(), MPI_INT, x.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0010609794, + "runs": [ + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.0202087608, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0009801002, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0233697724, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0010069398, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0208216492, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data(), x.size(), MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its portion of the vector.\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0010453682, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0205772208, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": 
"cuda", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\n__global__ void negateOddsAndHalveEvens(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0)\n x[tid] /= 2;\n else\n x[tid] = -x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use MPI and OpenMP to compute in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int n_local = n / nprocs;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], n_local, MPI_INT, &x[0], n_local, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its part of the vector\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], end - start, MPI_INT, &x[0], end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int local_size = x.size() / size;\n int remain = x.size() % size;\n int local_start = rank * local_size;\n int local_end = local_start + local_size;\n if (rank < remain) {\n local_end++;\n }\n\n#pragma omp parallel for\n for (int i = local_start; i < local_end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(MPI_IN_PLACE, local_end - local_start, MPI_INT, x.data(), local_end - local_start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int myRank, nProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &nProcs);\n\n int n = x.size();\n int chunkSize = (n + nProcs - 1) / nProcs;\n int begin = myRank * chunkSize;\n int end = std::min(begin + chunkSize, n);\n\n #pragma omp parallel for\n for (int i = begin; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[begin], end - begin, MPI_INT, &x[0], end - begin, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] = x[i] / 2;\n }\n }\n MPI_Gather(&x[start], chunk, MPI_INT, &x[0], chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its part of the vector\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int local_n = n / nprocs;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = 
start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, n_local, MPI_INT, x.data(), n_local, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], chunk, MPI_INT, &x[0], chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "kokkos", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(Kokkos::View const& x, Kokkos::View &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKok\u96f9::ParallelFor(Kok\u96f9::Range(x.size()), [&](int i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t});\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const int& i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t});\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", mask.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + ",", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "omp", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t#pragma omp parallel 
for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "serial", + "prompt": "bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t mask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tmask.reserve(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + 
"\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.resize(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tmask.reserve(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t mask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "hip", + "prompt": "__device__ bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\n__global__ void mapPowersOfTwo(const int *x, bool *mask, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = get_global_id(0);\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + ", 0, 9, 7, 15, 64, 3]", + ", 0, 9, 7, 15, 64, 3]", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "mpi", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored in mask on rank 0.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint chunk_size = (n + size - 1) / size;\n\tint start = rank * chunk_size;\n\tint end = std::min(start + chunk_size, n);\n\n\tstd::vector local_mask(end - start);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), end - start, MPI_BOOL, mask.data(), end - start, MPI_BOOL, 0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tmask.resize(n);\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), n / size, MPI_BOOL, mask.data(), n / size, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_C_BOOL, mask.data(), local_n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tmask.resize(x.size());\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), mask.size(), MPI_C_BOOL, nullptr, 0, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_C_BOOL, mask.data(), local_n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tstd::vector localMask(x.size());\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tif (rank == 0) {\n\t\tmask = localMask;\n\t\tfor (int i = 1; i < size; i++) 
{\n\t\t\tstd::vector recvMask(x.size());\n\t\t\tMPI_Recv(&recvMask[0], x.size(), MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t\tfor (size_t j = 0; j < x.size(); j++) {\n\t\t\t\tmask[j] = mask[j] || recvMask[j];\n\t\t\t}\n\t\t}\n\t} else {\n\t\tMPI_Send(&localMask[0], x.size(), MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the result\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), mask.size(), MPI_BOOL, mask.data(), mask.size(), MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the mask\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t}\n\telse {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the mask\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tmask[i] = 
isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_BOOL, mask.data(), local_n, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "cuda", + "prompt": "__device__ bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\n__global__ void mapPowersOfTwo(const int *x, bool *mask, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\twhile (i < N) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t\ti += blockDim.x * gridDim.x;\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored in mask on rank 0.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own mask\n\tstd::vector local_mask(x.size());\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the local masks into the final mask on rank 0\n\tif (rank == 0) {\n\t\tmask.resize(x.size());\n\t}\n\tMPI_Gather(local_mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < static_cast(x.size()); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) 
{\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own mask\n\tstd::vector localMask(x.size());\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Rank 0 gathers all the local masks\n\tif (rank == 0) {\n\t\tmask.resize(x.size());\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tstd::vector recvMask(x.size());\n\t\t\tMPI_Recv(recvMask.data(), x.size(), MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t\tfor (int j = 0; j < x.size(); j++) {\n\t\t\t\tmask[j] |= recvMask[j];\n\t\t\t}\n\t\t}\n\t} else {\n\t\tMPI_Send(localMask.data(), x.size(), MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint n = x.size();\n\tmask.resize(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), n, MPI_BOOL, mask.data(), n, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its local part of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < 
n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the result\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the result\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * mask.size() / size], mask.size() / size, MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * mask.size() / size], mask.size() / size, MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "serial", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &element : x) {\n element = 1 - 1 / element;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &i : x) {\n i = 1 - 1 / i;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &val : x) {\n val = 1 - 1 / val;\n }\n}\n", + "\n\n for (double &i : x) {\n i = 1 - 1 / i;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": 
"57_transform_inverse_offset", + "parallelism_model": "hip", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\n__global__ void oneMinusInverse(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n x[gid] = 1 - 1 / x[gid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0014656284, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014788718, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.001464733, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037947304, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0014703136, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0036561522, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "source_write_success": 
true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0014672678, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037610288, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, gathered_x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0014648718, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037416214, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], x.size() / size, MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.001464936, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014668374, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.001476251, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00147682, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": 
true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0014653482, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014769742, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.001466238, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037899054, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = 1 - 1 / x[i];\n }\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0014687114, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.005119895, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.001465837, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014750886, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size() * size);\n MPI_Gather(x.data(), x.size(), 
MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0014677264, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037020734, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE, NULL, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 has the final result\n if (rank == 0) {\n x.resize(x.size() * size);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0014627854, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014713072, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], x.size() / size,\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0014646432, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015051694, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": 
true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0014645864, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014704902, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the inverse of its elements\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0014634348, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001472151, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.001464346, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014703334, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], x.size() / size,\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0014637158, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014686568, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its part of the result\n for (size_t i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], 
x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0014657754, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029402462, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "cuda", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\n__global__ void oneMinusInverse(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) { x[tid] = 1.0 - 1.0 / x[tid]; }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use MPI and OpenMP to compute in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int local_size = x.size() / num_procs;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n if (rank == num_procs - 1) {\n end = x.size();\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n MPI_Reduce(x.data(), x.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank == size - 1) ? 
n : start + chunk_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * chunk_size;\n int end = (i == size - 1) ? n : start + chunk_size;\n MPI_Recv(&x[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = global_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n const int n = x.size();\n const int chunk = (n + size - 1) / size;\n const int start = rank * chunk;\n const int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; ++i) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. 
Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "serial", + "prompt": "/* Compute the ReLU function on every element of x. 
Elements less than zero become zero,\n while elements greater than zero stay the same.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "hip", + "prompt": "/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\n__global__ void relu(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = fmax(0.0, x[idx]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? 
x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + ": [-1.8, 24.0, 1.2, 0.0, 5.1, 0.2, 4.5]", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n std::vector recv_buf(x.size());\n for (int i = 1; i < MPI_COMM_WORLD.size(); i++) {\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector temp(x.size());\n MPI_Recv(temp.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += temp[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0009689054, + "runs": [ + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.0200264194, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0010980014, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.020048261, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0015748664, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0199923132, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0009368652, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0199834784, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0009807842, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0199841736, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0011345058, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0201034486, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0011972346, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0200904304, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks to rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0012428088, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0200400874, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": 
"\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0018470662, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0200789848, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0012507254, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0200810446, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0011632044, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0201081278, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0016834554, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0207631394, + "num_procs": 1 + 
}, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0012825192, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0200673466, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0012012662, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0201084144, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0015332604, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0202868806, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n 
}\n }\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013954804, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0200816316, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0114830824, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00713106155, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007152496825, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003807469375, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031340807875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00182405265313, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00345382065156, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016478378875, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00110979264922, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0009155674, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0200222376, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0013656626, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0200063036, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n 
for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0010704112, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0201424186, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "cuda", + "prompt": "/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\n__global__ void relu(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = fmax(x[tid], 0.0);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = fmax(0, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = fmax(0, x[idx]);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = fmax(0.0, x[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] < 0 ? 0 : x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = fmax(0, x[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int index = blockIdx.x * blockDim.x + threadIdx.x;\n if (index < N) {\n x[index] = fmax(0, x[index]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int tid = threadId.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? 
x[tid] : 0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = fmax(0.0, x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma opm parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Reduce(&x[0], &x[0], x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int chunk_size = n / nprocs;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == nprocs - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, local_size, MPI_DOUBLE, x.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n #pragma omp parallel for\n for (int i = start_index; i < end_index; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start_index], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n if (rank == size - 1) {\n end = x.size();\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n MPI_Gather(&x[start], chunk_size, MPI_DOUBLE, &x[0], chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the ReLU function on its portion of 
x\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the ReLU function on its own copy of x\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks onto rank 0\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 has the final result\n if (rank == 0) {\n x.resize(x.size() * size);\n MPI_Gatherv(nullptr, 0, MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf;\n MPI_Recv(&recv_buf, x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv_buf.begin(), recv_buf.end());\n }\n } else {\n MPI_Send(&x, x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the ReLU function on every element of x\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < MPI_COMM_WORLD; i++) {\n std::vector recv_buf(n);\n MPI_Recv(recv_buf.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n 
}\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, x.extent(0)), KOK_LAMBDA(int i) {\n x[i] = x[i] * x[i];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel 
for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n #pragma simd\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "serial", + "prompt": "/* Replace every element of x with the square of its value.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + ":", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &xi : x) {\n xi = xi * xi;\n }\n}\n", + "\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "hip", + "prompt": "/* Replace every element of x with the square of its value.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\n__global__ void squareEach(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data(), n / size, MPI_INT, x.data(), n / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0008050968, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023895576, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0006718516, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026238434, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_Comm_world, &rank);\n\n // Get the number of processes\n int size;\n MPI_Comm_size(MPI_Comm_world, &size);\n\n // Each process squares its own elements\n for (int i = rank; i < x.size(); i += size) {\n x[i] = x[i] * x[i];\n }\n\n // Gather all the squared elements on rank 0\n MPI_Allgather(MPI_IN_PLACE, 0, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_Comm_world);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0006621892, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006672464, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + 
"generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank squares its elements\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the squared elements\n std::vector squared;\n if (rank == 0) {\n squared.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, squared.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Rank 0 stores the final result\n if (rank == 0) {\n x = squared;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0006604942, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029145378, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Status status;\n MPI_Request request;\n MPI_Irecv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0009139122, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006822288, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (int i = rank; i < x.size(); i += size) {\n x[i] *= x[i];\n }\n MPI_Gather(x.data(), x.size() / size, MPI_INT, x.data(), x.size() / size,\n MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0006605596, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023468718, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Square each element of x\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Gather all the squared elements on rank 0\n std::vector squared_x;\n if (rank == 0) {\n squared_x.resize(x.size() * MPI::COMM_WORLD.Get_size());\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, squared_x.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Copy the squared elements back to x on rank 0\n if (rank == 0) {\n x = squared_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0006636942, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030185498, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, 
+ "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0009101188, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006687474, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0006568308, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006673024, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector temp(x.size());\n MPI_Recv(&temp[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += temp[j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0007780514, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006608366, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank 
!= 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0007128224, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000670326, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0006928688, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0024342718, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n MPI_Gather(x.data(), n / size, MPI_INT, x.data(), n / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0008860774, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025867998, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector tmp(n);\n MPI_Recv(tmp.data(), n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0009104254, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.004358732, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[0], n / size, MPI_INT, &x[0], n / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.000770259, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0024610666, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0007060238, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006717552, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0007234866, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006937466, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "cuda", + "prompt": "/* Replace every element of x with the square of its value.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\n__global__ void squareEach(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = thread_idx_1;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace every element of x with the square of its value.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank squares its own elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(n);\n MPI_Recv(&recv[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, chunk, MPI_INT, x.data(), chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 
1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n int recv_buf[n];\n MPI_Recv(&recv_buf[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv_buf[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank squares its own elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector temp(n);\n MPI_Recv(&temp[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += temp[j];\n }\n }\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Each rank squares its elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < MPI_COMM_WORLD; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(n);\n MPI_Recv(&recv[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Square each element of x in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Gather the squared elements 
from all ranks to rank 0\n std::vector squared_x(x.size());\n MPI_Gather(x.data(), x.size(), MPI_INT, squared_x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = squared_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_size, MPI_INT, &x[0], local_size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_size = x.size() / size;\n int start = my_rank * local_size;\n int end = start + local_size;\n if (my_rank == size - 1) {\n end = x.size();\n }\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n MPI_Gather(&x[start], local_size, MPI_INT, &x[0], local_size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use Kokkos to parallelize. 
Assume Kokkos has already been initialized.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, Kokkos::View const& A, Kokkos::View const& x, double beta, Kokkos::View &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(i); j < A.extent(i+1); ++j) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(i); j > 0; --j) {\n y(i) += alpha * A(i, j - 1).value * x(A(i, j - 1).column);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i, double &y_i) {\n y_i = beta * y_i;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n y_i += alpha * A(j).value * x(A(j).column);\n }\n }\n }, y);\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n auto spvmv_lambda = KOKKOS_LAMBDA (const COOElement *a, double *y) {\n y[a->row] += a->value * alpha * x[a->column];\n };\n\n KOKKOS_PARALLEL_EXEC(KOKKOS_RANGE_POLICY(0, A.size), spvmv_lambda, A.data, y);\n\n KOKKOS_PARALLEL_FOR(KOKKOS_RANGE_POLICY(0, M), KOKKOS_LAMBDA(size_t i) {\n y[i] = y[i] * alpha + 
x[i] * alpha;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", M, KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i, double &y_i) {\n y_i = beta * y_i;\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n y_i += alpha * A(j).value * x(A(j).column);\n }\n }\n }, y);\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.row_map(i); j < A.row_map(i + 1); ++j) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, M), [&](size_t i) {\n y[i] = alpha * Kok\u30b3\u30af::reduce(Kok\u30b3\u30af::Range(0, A.size()), 0.0, [&](size_t j, double sum) {\n if (A[j].row == i) {\n return sum + A[j].value * x[A[j].column];\n }\n return sum;\n }) + uyi;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use OpenMP to parallelize.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * 
x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for schedule(static)\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n 
}\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n if (e.row < M && e.column < N) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] = y[i] * alpha + y[i] *beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n if (e.row < M && e.column < N) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : 
A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format with sizeA elements.\n A has dimensions MxN, x has N values, and y has M values.\n Use AMD HIP to parallelize. The kernel will be launched with at least sizeA threads.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\n__global__ void spmv(double alpha, const COOElement *A, size_t sizeA, const double *x, double beta, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n size_t tid = hipThreadIdx_x;\n if (tid < M) {\n atomicMul(&y[tid], beta);\n }\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < M) {\n double dot = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == tid) {\n dot += A[i].value * x[A[i].column];\n }\n }\n y[tid] = alpha * dot + \u3079\u305f * y[tid];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[blockIdx.x], beta);\n}\n", + "\n\n size_t i = hipThreadIdx_x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n atomicAdd(&y[row], alpha * val * x[col]);\n }\n atomicMul(&y[hipThreadIdx_x], beta);\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double 
sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = beta * y[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += alpha * A[i].value * x[A[i].column];\n }\n }\n y[row] = sum;\n }\n}\n", + "\nfor (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {\n double dot = 0.0;\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == i) {\n dot += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * dot + \u3079\u305f * y[i];\n }\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[i], beta);\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double dot = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n dot += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * dot + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t block_size = hipBlockDim_x;\n size_t block_id = hipBlockIdx_x;\n size_t start = block_id * block_size + tid;\n size_t end = min(start + block_size, sizeA);\n\n for (size_t i = start; i < end; i++) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n\n y[tid] *= beta;\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": 
"47_sparse_la_spmv", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use MPI to parallelize. Assume MPI has already been initialized.\n Every rank has a complete copy of A, x, and y. Store the result in y on rank 0.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the result\n for (size_t i = rank; i < M; i += size) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[i * M], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[rank * M], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0015316292, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.5558159466, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per rank\n size_t elements_per_rank = A.size() / size;\n\n // Compute the starting index for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = beta * y[i];\n }\n\n // Compute the local portion of A*x\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0015296972, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001511407, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + 
"are_all_valid": false, + "best_sequential_runtime": 0.0015202738, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015109586, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015246665, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015823099, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001513564875, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015276459125, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00156096909375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00158628720312, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local contribution to y\n for (size_t i = 0; i < A.size(); ++i) {\n if (A[i].row == rank) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(MPI_IN_PLACE, y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n for (size_t i = rank; i < M; i += size) {\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to the result on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; i++) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015174562, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015168756, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015151374, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00152236035, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00151347525, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.001560409625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015719629875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00163038116875, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00202932014062, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00214482827813, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00347549852266, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (M + size - 1) / size;\n\n // Compute the start and end rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, M);\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (auto const& element : A) {\n if (element.row >= start_row && element.row < end_row) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": ":", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0015237402, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015154668, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Compute the size of the local part of y\n size_t local_M = M / MPI::COMM_WORLD.Get_size();\n size_t start_row = local_M * MPI::COMM_WORLD.Get_rank();\n size_t end_row = start_row + local_M;\n\n // Compute the local part of y\n for (size_t i = start_row; i < end_row; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Gather the local parts of y to rank 0\n MPI::COMM_WORLD.Gather(&y[start_row], local_M, MPI::DOUBLE, &y[0], local_M, MPI::DOUBLE, 0);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + 
"did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0015195444, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.5358386768, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2742633329, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1359903108, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0675814266, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.038124115525, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0188680447625, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01364920541563, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own partial sum\n std::vector partial_y(M, 0.0);\n for (auto const& element : A) {\n if (element.row < M / size * rank || element.row >= M / size * (rank + 1)) {\n continue;\n }\n partial_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce partial sums to rank 0\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 scales the result by beta\n if (rank == 0) {\n for (auto& value : y) {\n value *= beta;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (COOElement const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.001520909, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015164326, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015228524, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015145521, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00151146955, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001533231925, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00154475380625, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016781165125, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n // Compute the local portion of the result\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the results from all ranks to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Scale the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (auto& value : y) {\n value *= 
beta;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // COO format:\n // A[i] = {row, column, value}\n // where A[i] is the i-th non-zero element in the matrix\n\n // y = alpha*A*x + y\n std::iota(y.begin(), y.end(), 0);\n std::transform(y.begin(), y.end(), x.begin(), y.begin(),\n [alpha, &A](double y, double x) {\n double sum = 0;\n for (auto const& a : A) {\n sum += a.value * (alpha * x);\n }\n return y + sum;\n });\n\n // y = y + (beta*y)\n std::transform(y.begin(), y.end(), y.begin(),\n [beta](double y) {\n return y + (beta * y);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local contribution to y\n std::vector local_y(M, 0.0);\n for (auto const& element : A) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Add beta*y to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] += ampi * y[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements each rank is responsible for\n size_t elements_per_rank = A.size() / size;\n\n // Compute the starting index of the elements for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index of the elements for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n for (size_t i = rank; i < M; i += size) {\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * 
x[element.column];\n }\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to the result on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Compute the number of elements per rank\n size_t num_elements_per_rank = A.size() / MPI::COMM_WORLD.Get_size();\n\n // Compute the starting index for this rank\n size_t start_index = num_elements_per_rank * MPI::COMM_WORLD.Get_rank();\n\n // Compute the ending index for this rank\n size_t end_index = start_index + num_elements_per_rank;\n\n // Initialize the local portion of y to beta*y\n std::vector local_y(M, beta * y[MPI::COMM_WORLD.Get_rank()]);\n\n // Perform the sparse matrix-vector multiplication for this rank\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI::COMM_WORLD.Reduce(local_y.data(), y.data(), M, MPI::DOUBLE, MPI::SUM, 0);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Broadcast alpha, beta, M, and N to all ranks\n MPI_Bcast(&alpha, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&beta, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&M, 1, MPI_INT, 0, MPI_COMM_WORLD);\n MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the number of elements per rank\n int num_elements_per_rank = A.size() / MPI_COMM_SIZE;\n\n // Scatter A to all ranks\n std::vector local_A(num_elements_per_rank);\n MPI_Scatter(A.data(), num_elements_per_rank, MPI_BYTE, local_A.data(), num_elements_per_rank, MPI_BYTE, 0, MPI_COMM_WORLD);\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (const auto& element : local_A) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather the local contributions to y\n std::vector global_y(M);\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, global_y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Apply beta to y\n if (beta != 0.0) {\n for (auto& element : global_y) {\n element += beta * element;\n }\n }\n\n // Replace y with the global result\n y = std::move(global_y);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize y on all ranks to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute the local contribution of each rank to the result\n for (auto const& element : A) {\n if (element.row < M) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n }\n\n // Reduce the local contributions to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format with sizeA elements.\n A has dimensions MxN, x has N values, and y has M values.\n Use CUDA to parallelize. The kernel will be launched with at least sizeA threads.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\n__global__ void spmv(double alpha, const COOElement *A, size_t sizeA, const double *x, double beta, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n y[blockIdx.x] *= beta;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n y[tid] *= beta;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (gid < sizeA) {\n size_t row = A[gid].row;\n size_t column = A[gid].column;\n double value = A[gid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n\n if (gid < M) {\n atomicAdd(&y[gid], beta * y[gid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n const COOElement &e = A[tid];\n y[e.row] += alpha * e.value * x[e.column];\n }\n y[threadIdx.x] *= beta;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n atomicMul(&y[i], beta);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = 
A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[threadIdx.x], beta);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[blockIdx.x], beta);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (gid < sizeA) {\n const COOElement &a = A[gid];\n y[a.row] += alpha * a.value * x[a.column];\n }\n\n if (gid < M) {\n y[gid] *= beta;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n y[row] += alpha * value * x[column];\n }\n y[threadIdx.x] *= beta;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n y[row] += alpha * value * x[column];\n }\n y[blockIdx.x] *= beta;\n}\n", + "{" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use MPI and OpenMP to parallelize. Assume MPI has already been initialized.\n Every rank has a complete copy of A, x, and y. 
Store the result in y on rank 0.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its own partial sum\n std::vector partial_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n partial_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the partial sums to rank 0\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 scales the result by beta\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the result\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (auto const& element : A) {\n if (element.row == i) {\n sum += element.value * x[element.column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&y[0], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[0], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the sparse matrix that each rank is responsible for\n size_t num_elements_per_rank = A.size() / size;\n\n // Compute the starting index of the elements in the sparse matrix that each rank is responsible for\n size_t start_index = rank * num_elements_per_rank;\n\n // Compute the ending index of the elements in the sparse matrix that each rank is responsible for\n size_t end_index = (rank + 1) * num_elements_per_rank;\n\n // Compute the local portion of y that each rank is responsible for\n std::vector local_y(M);\n\n // Compute the local portion of y that each rank is responsible for\n #pragma omp parallel for\n for (size_t i = start_index; i < end_index; i++) {\n const COOElement& element = A[i];\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the local portions of y into the global y vector on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n}\n", + "\n\n // Initialize y on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::fill(y.begin(), y.end(), 0.0);\n }\n\n 
// Parallelize over rows of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n COOElement const& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n\n // Parallelize over elements of y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n\n // Gather y from all ranks to rank 0\n MPI::COMM_WORLD.Gather(y.data(), M, MPI::DOUBLE, y.data(), M, MPI::DOUBLE, 0);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of elements each rank is responsible for\n size_t elements_per_rank = A.size() / nprocs;\n\n // Compute the starting index for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local portion of y\n std::vector local_y(M, 0.0);\n\n // Perform the sparse matrix-vector multiplication in parallel\n #pragma omp parallel for\n for (size_t i = start_index; i < end_index; ++i) {\n COOElement element = A[i];\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Compute the local portion of the result\n std::vector local_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Calculate the number of rows per rank\n size_t num_rows_per_rank = (M + size - 1) / size;\n\n // Calculate the starting and ending rows for the current rank\n size_t start_row = rank * num_rows_per_rank;\n size_t end_row = std::min((rank + 1) * num_rows_per_rank, M);\n\n // Initialize the local portion of y to beta*y\n std::vector local_y(num_rows_per_rank, beta * y[rank]);\n\n // OpenMP parallelization\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n local_y[i - start_row] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n\n // Gather the local portions of y into the global y on rank 0\n MPI_Gather(local_y.data(), num_rows_per_rank, MPI_DOUBLE, y.data(), num_rows_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of rows per rank\n size_t num_rows_per_rank = (M + MPI_Comm_size(MPI_COMM_WORLD) - 1) / MPI_Comm_size(MPI_COMM_WORLD);\n\n // Compute the starting row for this rank\n size_t start_row = num_rows_per_rank * MPI_Comm_rank(MPI_COMM_WORLD);\n\n // Compute the ending row for this rank\n size_t end_row = std::min(num_rows_per_rank * (MPI_Comm_rank(MPI_COMM_WORLD) + 1), M);\n\n // Initialize the local portion of y to beta*y\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n y[i] = beta * y[i];\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n\n if (row >= start_row && row < end_row) {\n y[row] += alpha * value * x[column];\n }\n }\n\n // Reduce the results from all ranks to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(MPI_IN_PLACE, y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / nprocs; i < (rank + 1) * M / nprocs; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local contribution to y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double local_y = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n local_y += alpha * element.value * x[element.column];\n }\n }\n y[i] = local_y;\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&y[0], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[0], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& element = A[i];\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather y from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, 
y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_M = M / size;\n size_t start_row = rank * local_M;\n size_t end_row = start_row + local_M;\n\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[i * local_M], local_M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[start_row], local_M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(Kokkos::View &A, Kokkos::View &X, Kokkos::View &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; k++) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::Range(0, M), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.extent(0); ++l) {\n if (A(l).row == i && A(l).column == k) {\n for (size_t m = 0; m < X.extent(0); ++m) {\n if (X(m).row == k && X(m).column == j) {\n sum += A(l).value * X(m).value;\n }\n }\n }\n }\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n // Create a parallel reduction to sum up the sparse matrix multiplication.\n auto policy = KOKKOS_RANGE_POLICY(0, M);\n auto sum_lambda = KOKKOS_LAMBDA(const size_t i, double** sum) {\n for (size_t j = 0; j < N; ++j) {\n sum[i][j] = 0.0;\n }\n };\n auto reduce_lambda = KOKKOS_LAMBDA(const size_t i, const size_t j, const COOElement a, const COOElement b, double** sum) {\n sum[a.row][j] += a.value * b.value;\n };\n auto view = KokKOS::parallel_reduce(\n \"Sparse matrix multiplication\", policy, sum_lambda, reduce_lambda, A, X);\n view.wait();\n\n // Copy the result to the output matrix.\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = view(i, j);\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int &i) {\n for (size_t j = 0; j < N; j++) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; k++) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + ".", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, M), [&](size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i][j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i][j] += A[i][k].value * X[k][j].value;\n }\n }\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n 
#pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; k++) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + ",", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n sum += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n sum += A[l].value * X[m].value;\n break;\n }\n }\n break;\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (auto const& a : A) {\n if (a.row == i && a.column == k) {\n for (auto const& x : X) {\n if (x.row == k && x.column == j) {\n sum += a.value * x.value;\n }\n }\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; k++) {\n for (size_t l = 0; l < A.size(); l++) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); m++) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Y.resize(M * N, 0);\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n for (auto const& e : X) {\n if (A_map.count(e.row) > 0) {\n for (auto const& [column, value] : A_map[e.row]) {\n Y[e.column * M + column] += e.value * value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> row_map;\n for (auto const& element : A) {\n row_map[element.row].push_back(element.value);\n }\n for (auto const& element : X) {\n if (row_map.count(element.column)) {\n for (size_t i = 0; i < row_map[element.column].size(); ++i) {\n Y[element.row * N + i] += row_map[element.column][i] * element.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto& e : A) {\n for (auto& f : X) {\n if (e.column == f.row) {\n Y[e.row * N + f.column] += e.value * f.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M*N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n Y.resize(M * N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot = 0;\n for (size_t k = 0; k < K; ++k) {\n dot += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = dot;\n }\n }\n}\n", + "\n\n Y.resize(M*N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.assign(M * N, 0);\n for (COOElement e : A) {\n for (COOElement f : X) {\n if (e.column == f.row) {\n Y[e.row * N + f.column] += e.value * f.value;\n 
}\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n std::unordered_map> A_row_major;\n for (auto const& e : A) {\n A_row_major[e.row][e.column] = e.value;\n }\n\n std::unordered_map> X_col_major;\n for (auto const& e : X) {\n X_col_major[e.column][e.row] = e.value;\n }\n\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot_product = 0;\n for (size_t k = 0; k < K; ++k) {\n dot_product += A_row_major[i][k] * X_col_major[j][k];\n }\n Y[i * N + j] = dot_product;\n }\n }\n}\n", + "\n\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format with sizeA elements.\n X is a sparse KxN matrix in COO format with sizeX elements. Y is a dense MxN matrix in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\n__global__ void spmm(const COOElement *A, size_t sizeA, const COOElement *X, size_t sizeX, double *Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t k = 0; k < K; k++) {\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == k) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n int i = tid + bid * bdim;\n if (i < sizeA) {\n size_t rowA = A[i].row;\n size_t colA = A[i].column;\n double valA = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == colA) {\n Y[rowA * N + X[j].row] += valA * X[j].value;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n int i = tid + bid * bdim;\n if (i < sizeA) {\n size_t rowA = A[i].row;\n size_t colA = A[i].column;\n double valA = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == colA) {\n Y[rowA * N + X[j].row] += valA * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipGridDim_x;\n int tid_per_block = hipBlockDim_x;\n\n for (int i = tid; i < sizeA; i += tid_per_block * bid_size) {\n int row = A[i].row;\n int col = A[i].column;\n double val = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column < K) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x * bid_size;\n\n for (size_t i = tid; i < sizeA; i += num_threads) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = 
blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n Y[row * N + col] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t a = 0; a < sizeA; ++a) {\n if (A[a].row == row && A[a].column == k) {\n for (size_t x = 0; x < sizeX; ++x) {\n if (X[x].row == k && X[x].column == col) {\n Y[row * N + col] += A[a].value * X[x].value;\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].column) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n for (int i = tid + bid * bdim; i < sizeA; i += gdim * bdim) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n int rowA = A[i].row;\n int colA = A[i].column;\n double valA = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n int rowX = X[j].row;\n int colX = X[j].column;\n double valX = X[j].value;\n if (colA == rowX) {\n atomicAdd(&Y[rowA * N + colX], valA * valX);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x;\n\n for (size_t i = tid; i < sizeA; i += num_threads * bid_size) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == column) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; 
j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and X. Store the result in Y on rank 0.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recv;\n MPI::COMM_WORLD.Recv(recv, M * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < M * N; ++j) {\n Y[j] += recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(Y.data(), M * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.080021675, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0210924256, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0.\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0802785554, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 
0.022070083, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its local contribution to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the local contributions to Y on rank 0.\n if (rank == 0) {\n Y.resize(M * N);\n MPI_Gather(localY.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(localY.data(), M * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.080393308, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0216836202, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.075709881, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0197101482, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": ",", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.078766942, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0220540414, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 
0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0762671264, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0218465196, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the local portion of Y\n std::vector localY(M*N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n \n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M*N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.081154634, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0216423664, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0817381746, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0197361646, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0810742004, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0218734962, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double dot_product = 0;\n for (size_t k = 0; k < K; k++) {\n dot_product += A[i * K + k].value * X[k * N + j].value;\n }\n Y[i * N + j] = dot_product;\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buffer(M * N);\n MPI_Recv(recv_buffer.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n Y[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(Y.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in each row of A and X\n std::vector A_row_counts(M, 0);\n std::vector X_row_counts(K, 0);\n for (auto const& element : A) {\n A_row_counts[element.row]++;\n }\n for (auto const& element : X) {\n X_row_counts[element.row]++;\n }\n\n // Compute the offsets for each row of A and X\n std::vector A_offsets(M, 0);\n std::vector X_offsets(K, 0);\n for (size_t i = 1; i < M; i++) {\n A_offsets[i] = A_offsets[i - 1] + A_row_counts[i - 1];\n }\n for (size_t i = 1; i < K; i++) {\n X_offsets[i] = X_offsets[i - 1] + X_row_counts[i - 1];\n }\n\n // Create a buffer to store the local results\n std::vector local_Y(M * N, 0.0);\n\n // Compute the local results\n for (auto const& element : A) {\n for (size_t j = X_offsets[element.column]; j < X_offsets[element.column] + X_row_counts[element.column]; j++) {\n local_Y[element.row * N + X[j].column] += element.value * X[j].value;\n }\n }\n\n // Reduce the local results to the global result\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0823705274, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0057512046, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Compute the number of elements in the resulting matrix\n size_t num_elements = M * N;\n\n // Initialize the result matrix to zero\n Y.resize(num_elements, 0.0);\n\n // Scatter the result matrix to all ranks\n MPI_Scatter(Y.data(), num_elements, MPI_DOUBLE, Y.data(), num_elements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local part of the result matrix\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local results to the result matrix on rank 0\n MPI_Reduce(Y.data(), num_elements, MPI_DOUBLE, Y.data(), num_elements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": 
false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double dot_product = 0;\n for (size_t k = 0; k < K; k++) {\n dot_product += A[i * K + k].value * X[k * N + j].value;\n }\n Y[i * N + j] = dot_product;\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buffer(M * N);\n MPI_Recv(recv_buffer.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n Y[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(Y.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.080538696, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0213043088, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Initialize Y to zero\n Y.assign(M * N, 0);\n\n // Compute the partial sum on each rank\n std::vector partial_sum(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n partial_sum[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sum(M * N, 0);\n MPI_Allreduce(partial_sum.data(), global_sum.data(), M * N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n Y = global_sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1095955072, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0225319886, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement& a = A[i];\n for (size_t j = 0; j < X.size(); j++) {\n const COOElement& x = X[j];\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n 
// Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.110013858, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0212124896, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0.\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.076652772, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0201869814, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0770262028, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0210005444, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format with sizeA elements.\n X is a sparse KxN matrix in COO format with sizeX elements. Y is a dense MxN matrix in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\n__global__ void spmm(const COOElement *A, size_t sizeA, const COOElement *X, size_t sizeX, double *Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n atomicAdd(&Y[row * N + X[j].column], A[i].value * X[j].value);\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= M) return;\n for (int i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (int j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < K; k++) {\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == row && A[j].column == k) {\n for (size_t l = 0; l < sizeX; l++) {\n if (X[l].row == k && X[l].column == i) {\n Y[row * N + i] += A[j].value * X[l].value;\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int tx = threadIdx.x;\n int bx = blockIdx.x;\n int tid = tx + bx * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t i = 0; i < sizeX; i++) {\n if (X[i].column == column) {\n Y[row * N + X[i].row] += value * X[i].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == A[k].column) {\n Y[row * N + X[j].row] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) 
{\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t k = 0; k < K; ++k) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (X[j].column == k) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t k = 0; k < K; ++k) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row && A[i].column == k) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (X[j].row == k && X[j].column == j) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + k] = sum;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n size_t rowA = A[blockIdx.x].row;\n size_t columnA = A[blockIdx.x].column;\n double valueA = A[blockIdx.x].value;\n\n for (size_t i = 0; i < N; i++) {\n if (columnA == X[threadIdx.x].row) {\n Y[rowA * N + i] += valueA * X[threadIdx.x].value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n atomicAdd(&Y[row * N + X[j].column], A[i].value * X[j].value);\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and X. Store the result in Y on rank 0.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n for (size_t j = 0; j < X.size(); ++j) {\n if (X[j].row == column) {\n localY[row * N + X[j].column] += value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n if (rank == 0) {\n Y.resize(M * N);\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(localY.data(), nullptr, M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n}\n", + ",", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localY[i * N + j] += A[i * K + k].value * X[k * N + j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "_t,", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 
0.0);\n\n // Compute the matrix multiplication in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local row range for this rank\n size_t local_m = M / nprocs;\n size_t start_row = rank * local_m;\n size_t end_row = (rank == nprocs - 1) ? M : start_row + local_m;\n\n // Initialize the local part of Y to zero\n std::vector local_Y(local_m * N, 0.0);\n\n // Perform the matrix multiplication in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_Y[i * N + j] += A[i * K + k].value * X[k * N + j].value;\n }\n }\n }\n\n // Gather the local results into Y on rank 0\n MPI_Gather(local_Y.data(), local_m * N, MPI_DOUBLE, Y.data(), local_m * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector local_Y(M * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n local_Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of rows of A and X owned by each rank\n size_t nrows_A = A.size() / nprocs;\n size_t nrows_X = X.size() / nprocs;\n\n // Compute the starting row index of A and X owned by each rank\n size_t start_row_A = rank * nrows_A;\n size_t start_row_X = rank * nrows_X;\n\n // Compute the local portion of Y owned by each rank\n std::vector local_Y(M * N, 0.0);\n\n // Compute the local portion of Y owned by each rank\n #pragma omp parallel for\n for (size_t i = 0; i < nrows_A; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n if (A[start_row_A + 
i].column == X[start_row_X + k].row) {\n local_Y[A[start_row_A + i].row * N + j] += A[start_row_A + i].value * X[start_row_X + k].value;\n }\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix-matrix product in parallel\n #pragma mpi_for_schedule(static)\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& a = A[i];\n for (size_t j = 0; j < X.size(); ++j) {\n const COOElement& x = X[j];\n if (a.column == x.row) {\n #pragma openmp atomic\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(MPI_IN_PLACE, Y.size(), MPI_DOUBLE, Y.data(), Y.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the partial sum for each row of Y on each rank\n std::vector partial_Y(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n partial_Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_Y(M * N, 0.0);\n MPI_Reduce(partial_Y.data(), global_Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Store the result in Y on rank 0\n if (rank == 0) {\n Y = global_Y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t k = 0; k < K; ++k) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * 
X[m].value;\n }\n }\n }\n }\n }\n }\n }\n\n // Reduce the result to rank 0\n MPI_Reduce(MPI_IN_PLACE, Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n for (size_t j = 0; j < X.size(); j++) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(Kokkos::View &A, Kokkos::View &b, Kokkos::View &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View r(\"r\", N);\n Kokkos::View z(\"z\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n r[i] = b[i];\n z[i] = 0.0;\n });\n\n for (size_t iter = 0; iter < N; ++iter) {\n for (size_t i = 0; i < A.extent(0); ++i) {\n const auto &e = A[i];\n r[e.row] -= e.value * z[e.column];\n }\n for (size_t i = 0; i < A.extent(0); ++i) {\n const auto &e = A[i];\n z[e.column] += e.value * r[e.row];\n }\n }\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) { x[i] = r[i]; });\n}\n", + "\n\n Kokkos::parallel_for(\"solve_linear_system\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::View r(\"r\", N);\n Kokkos::View z(\"z\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n r[i] = b[i];\n z[i] = 0.0;\n });\n for (size_t k = 0; k < A.extent(0); ++k) {\n const COOElement &e = A[k];\n r[e.column] -= e.value * z[e.row];\n }\n for (size_t k = 0; k < A.extent(0); ++k) {\n const COOElement &e = A[k];\n z[e.row] += e.value * r[e.column];\n }\n Kokkos::parallel_for(\"copy\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = z[i];\n });\n}\n", + "\n\n Kok\uf09e::View A_h(A.data(), A.extent(0));\n Kok\uf09e::View b_h(b.data(), b.extent(0));\n Kok\uf09e::View x_h(x.data(), x.extent(0));\n\n Kok\uf09e::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n x_h[i] = b_h[i];\n for (size_t j = A_h(i, 0); j < A_h(i, 1); ++j) {\n x_h[i] -= A_h(i, 2 + j) * x_h[A_h(i, 2 + j)];\n }\n x_h[i] /= A_h(i, 1);\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_coo\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, 
column_indices);\n\n // Solve the linear system\n Kokkos::View x_host(\"x_host\", N);\n Kokkos::parallel_for(\"solve\", N, KOKKOS_LAMBDA(size_t i) {\n x_host(i) = 0.0;\n });\n Kokkos::deep_copy(x, x_host);\n Kokkos::sparse_gmres(A_sparse, x, b, 1e-6, 100);\n\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View A_sparse(\"A_sparse\", N, N);\n Kokkos::parallel_for(\"init_A\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n A_sparse(i, j) = 0.0;\n }\n });\n Kokkos::parallel_for(\"fill_A\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n A_sparse(A(i).row, A(i).column) = A(i).value;\n });\n\n // Solve the linear system\n Kokkos::View x_temp(\"x_temp\", N);\n Kokkos::sparse::cg(A_sparse, b, x_temp, 1e-6, 100);\n\n // Copy the solution to x\n Kokkos::parallel_for(\"copy_x\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x(i) = x_temp(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * b(A(j).column);\n }\n }\n x(i) = sum;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_COO\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::View A_sparse(\"A_sparse\", N, N);\n Kokkos::parallel_for(\"initialize_A\", N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; j++) {\n A_sparse(i, j) = 0.0;\n }\n });\n Kokkos::parallel_for(\"populate_A\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n A_sparse(row_indices(i), column_indices(i)) = values(i);\n });\n\n // Solve the linear system\n Kokkos::View x_temp(\"x_temp\", N);\n Kokkos::sparse::CrsMatrix A_crs(N, N, A_sparse);\n Kokkos::sparse::solve(\"CrsMatrix\", A_crs, b, x_temp);\n\n // Copy the result to x\n Kokkos::parallel_for(\"copy_result\", N, KOKKOS_LAMBDA(size_t i) {\n x(i) = x_temp(i);\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values = Kokkos::View(A.data(), A.extent(0));\n Kokkos::View row_indices = Kokkos::View(A.data() + 1, A.extent(0));\n Kokkos::View column_indices = Kokkos::View(A.data() + 2, A.extent(0));\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::SparseSolveCG(A_sparse, x, b, 1e-6, 1000);\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x(i) = b(i);\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x(A(j).column) -= A(j).value * x(i);\n }\n }\n x(i) /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View col_indices(\"col_indices\", A.extent(0));\n Kokkos::parallel_for(A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values[i] = A(i).value;\n 
row_indices[i] = A(i).row;\n col_indices[i] = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, col_indices);\n\n // Solve the linear system\n Kokkos::View x_copy(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::SparseSolveCG>(A_sparse, x_copy, b, 1e-6, 1000);\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(j).row == i) {\n x[A(j).column] -= A(j).value * x[i];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n using team_member = KokKOS::TeamMember;\n using team_view = KokKOS::TeamView;\n using team_scheduler = KokKOS::TeamScheduler;\n\n team_scheduler team(N);\n\n team.parallel_for(\n \"solve_linear_system\",\n KOKKOS_LAMBDA(const team_member member) {\n const size_t row = member.league_rank();\n double sum = 0;\n for (size_t i = 0; i < A.size(); ++i) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_coo\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::View x_copy(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::sparse_solve(A_sparse, x_copy, b);\n\n // Copy the solution back to x\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. 
x and b are dense vectors with N elements.\n Use OpenMP to compute in parallel.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + ".", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n COOElement const& e = A[i];\n x[e.column] -= e.value * x[e.row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t k = 0; k < A.size(); ++k) {\n const COOElement& e = A[k];\n x[e.column] -= e.value * x[e.row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + 
"\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] -= element.value * x[element.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] -= element.value * x[element.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n // Initialize x to 0\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = 0;\n }\n\n // Iterate over the elements of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n // Add A[i].value * b[A[i].column] to x[A[i].row]\n x[A[i].row] += A[i].value * b[A[i].column];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. 
x and b are dense vectors with N elements.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector y(N, 0);\n for (auto const& e : A) {\n y[e.column] += e.value * b[e.row] / row_sums[e.row];\n }\n x = y;\n}\n", + "\n\n std::vector r(N, 0);\n std::vector row_start(N + 1, 0);\n for (auto const& e : A) {\n row_start[e.row + 1]++;\n }\n for (size_t i = 1; i <= N; i++) {\n row_start[i] += row_start[i - 1];\n }\n for (auto const& e : A) {\n r[row_start[e.row]++] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = row_start[i]; j < row_start[i + 1]; j++) {\n sum += r[j] * x[j];\n }\n x[i] = (b[i] - sum) / r[row_start[i]];\n }\n}\n", + "\n\n std::vector r(N, 0);\n for (auto const& e : A) {\n r[e.row] += e.value * x[e.column];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = (b[i] - r[i]) / A[i].value;\n }\n}\n", + "\n\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / rowSum[i];\n }\n}\n", + "\n\n std::vector r(N, 0);\n for (auto const& e : A) {\n r[e.row] += e.value * b[e.column];\n }\n x.resize(N);\n for (size_t i = 0; i < N; ++i) {\n x[i] = r[i] / A[i].value;\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (const COOElement& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector y(N, 0);\n for (const COOElement& e : A) {\n y[e.row] += e.value * b[e.column] / row_sum[e.row];\n }\n x = y;\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector diag(N, 0);\n std::vector offdiag(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] += e.value;\n } else {\n offdiag[e.row] += e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / (diag[i] - offdiag[i]);\n }\n for (auto const& e : A) {\n if (e.row != e.column) {\n x[e.column] -= e.value * x[e.row] / (diag[e.column] - offdiag[e.column]);\n }\n }\n}\n", + "\n\n std::vector> A_dense(N, std::vector(N, 0));\n for (auto const& e : A) {\n A_dense[e.row][e.column] = e.value;\n }\n std::vector L(N, 0), U(N, 0);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n L[i] += A_dense[i][j] * U[j];\n }\n U[i] = b[i] - L[i];\n }\n x[N - 1] = U[N - 1] / A_dense[N - 1][N - 1];\n for (size_t i = N - 2; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_dense[i][j] * x[j];\n }\n x[i] = (U[i] - sum) / A_dense[i][i];\n }\n}\n", + "\n\n std::vector diag(N, 0);\n std::vector lower(N, 0);\n std::vector upper(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n } else if (e.row < e.column) {\n lower[e.row] += e.value;\n } else {\n upper[e.row] += e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n double d = diag[i];\n if (d == 0) {\n throw std::runtime_error(\"Matrix is singular\");\n }\n x[i] = (b[i] - lower[i] - upper[i]) / d;\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sum[i];\n }\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector diag(N, 0);\n for 
(auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / diag[i];\n for (auto const& e : A) {\n if (e.row == i && e.column != i) {\n x[e.column] -= e.value * x[i] / row_sums[e.column];\n }\n }\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& element : A) {\n row_sum[element.row] += element.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sum[i];\n }\n}\n", + "\n\n std::vector diag(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n }\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n for (size_t j = e.column + 1; j < N; j++) {\n x[j] -= e.value * x[e.column];\n }\n }\n }\n x[i] /= diag[i];\n }\n for (size_t i = N - 1; i > 0; i--) {\n for (auto const& e : A) {\n if (e.column == i) {\n for (size_t j = 0; j < e.row; j++) {\n x[j] -= e.value * x[e.row];\n }\n }\n }\n }\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sums[i];\n }\n}\n", + "\n\n std::vector rowSum(N, 0.0);\n for (COOElement const& e : A) {\n rowSum[e.row] += e.value;\n }\n std::vector y(N, 0.0);\n for (COOElement const& e : A) {\n y[e.row] += e.value * b[e.column] / rowSum[e.row];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] - y[i];\n }\n}\n", + "\n\n std::vector> A_dense(N, std::vector(N, 0));\n for (auto const& e : A) {\n A_dense[e.row][e.column] = e.value;\n }\n std::vector x_dense(N, 0);\n solveLinearSystemDense(A_dense, b, x_dense);\n x = x_dense;\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n\n std::vector y(N, 0);\n for (auto const& e : A) {\n y[e.column] += e.value * b[e.row] / row_sums[e.row];\n }\n\n x = y;\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector diag_inv(N, 0);\n for (size_t i = 0; i < N; ++i) {\n diag_inv[i] = 1.0 / row_sums[i];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] * diag_inv[i];\n for (auto const& e : A) {\n if (e.row == i) continue;\n x[e.row] -= e.value * x[i] * diag_inv[e.row];\n }\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector x_tmp(N, 0);\n for (auto const& e : A) {\n x_tmp[e.column] += e.value * b[e.row] / row_sum[e.row];\n }\n x = x_tmp;\n}\n", + "\n\n std::vector diag(N, 0);\n std::vector offdiag(N, 0);\n std::vector rowptr(N + 1, 0);\n for (auto const& e : A) {\n rowptr[e.row + 1]++;\n if (e.row == e.column)\n diag[e.row] += e.value;\n else\n offdiag[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n rowptr[i + 1] += rowptr[i];\n }\n std::vector L(rowptr[N]);\n std::vector U(rowptr[N]);\n for (auto const& e : A) {\n if (e.row == e.column)\n continue;\n if (e.row < e.column)\n L[rowptr[e.row]++] = {e.row, e.column, e.value};\n else\n U[rowptr[e.column]++] = {e.row, e.column, e.value};\n }\n std::vector y(N, 0);\n for (size_t i = 0; i < N; i++) {\n y[i] = b[i];\n for (size_t j = rowptr[i]; j < rowptr[i + 1]; j++) {\n y[i] -= L[j].value * x[L[j].column];\n }\n y[i] /= diag[i];\n }\n for (size_t i = N - 1; i < N; i--) {\n x[i] = y[i];\n for (size_t j = rowptr[i + 1]; j < rowptr[i + 2]; j++) {\n x[i] -= U[j].value * x[U[j].column];\n }\n x[i] /= diag[i];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", 
+ "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format with sizeA elements. x and b are dense vectors with N elements.\n Use AMD HIP to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\n__global__ void solveLinearSystem(const COOElement *A, size_t sizeA, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t col = A[tid].column;\n double val = A[tid].value;\n atomicAdd(&x[col], val * b[row]);\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n x[i] = b[i];\n }\n __syncthreads();\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].column == i) {\n x[A[j].row] -= A[j].value * x[i];\n }\n }\n __syncthreads();\n if (i < N) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = b[i];\n }\n __syncthreads();\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == i) {\n x[A[k].column] -= A[k].value * x[i];\n }\n }\n __syncthreads();\n if (i < N) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int tid = 
threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&x[column], value * b[row]);\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + ".", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = 0;\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == i) {\n x[i] += A[j].value * b[A[j].column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n \n // Create a CSR matrix from the COO format\n std::vector row_offsets(N + 1, 0);\n std::vector column_indices;\n std::vector values;\n for (auto const& element : A) {\n row_offsets[element.row + 1]++;\n column_indices.push_back(element.column);\n values.push_back(element.value);\n }\n for (size_t i = 1; i < N + 1; i++) {\n row_offsets[i] += row_offsets[i - 1];\n }\n \n // Create a dense vector for the solution\n x.resize(N);\n \n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(values[row_offsets[j] + i]) > std::abs(values[row_offsets[pivot_row] + i])) {\n pivot_row = j;\n }\n }\n \n // Swap the pivot row with the current row\n if (pivot_row != i) {\n std::swap(row_offsets[pivot_row], row_offsets[i]);\n std::swap(column_indices[row_offsets[pivot_row] - 1], column_indices[row_offsets[i] - 1]);\n std::swap(values[row_offsets[pivot_row] - 1], values[row_offsets[i] - 1]);\n }\n \n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = values[row_offsets[j] + i] / values[row_offsets[i] + i];\n for (size_t k = i; k < N; k++) {\n values[row_offsets[j] + k] -= factor * values[row_offsets[i] + k];\n }\n b[j] -= factor * b[i];\n }\n }\n \n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= values[row_offsets[i] + j] * x[j];\n }\n x[i] /= values[row_offsets[i] + i];\n }\n \n // Gather the solution from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, comm);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a vector object from the dense vector\n Eigen::VectorXd vector_b(N);\n for (size_t i = 0; i < N; ++i) {\n vector_b(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd vector_x = matrix.ldlt().solve(vector_b);\n\n // Copy the result to the output vector\n for (size_t i = 0; i < N; ++i) {\n x[i] = vector_x(i);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a dense vector from b\n 
Eigen::VectorXd b_vector(N);\n for (size_t i = 0; i < N; ++i) {\n b_vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_vector = matrix.ldlt().solve(b_vector);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_vector(i);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n\n // Create a vector object from the dense vector\n Eigen::VectorXd vector_b(N);\n for (size_t i = 0; i < N; ++i) {\n vector_b(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd vector_x = matrix.ldlt().solve(vector_b);\n\n // Copy the result to the output vector\n for (size_t i = 0; i < N; ++i) {\n x[i] = vector_x(i);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a sparse matrix object\n MPI_Sparse_matrix A_mpi;\n MPI_Sparse_matrix_create_coo(MPI_COMM_WORLD, N, N, A.size(), &A[0], &A_mpi);\n\n // Create a dense vector object for b\n MPI_Vector b_mpi;\n MPI_Vector_create(MPI_COMM_WORLD, N, &b_mpi);\n MPI_Vector_set_values(b_mpi, N, &b[0]);\n\n // Create a dense vector object for x\n MPI_Vector x_mpi;\n MPI_Vector_create(MPI_COMM_WORLD, N, &x_mpi);\n\n // Solve the linear system\n MPI_Sparse_solve(A_mpi, b_mpi, x_mpi);\n\n // Get the result from rank 0\n if (rank == 0) {\n MPI_Vector_get_values(x_mpi, N, &x[0]);\n }\n\n // Free the MPI objects\n MPI_Sparse_matrix_free(&A_mpi);\n MPI_Vector_free(&b_mpi);\n MPI_Vector_free(&x_mpi);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Scatter b to all ranks\n std::vector local_b(b.begin() + rank * b.size() / size, b.begin() + (rank + 1) * b.size() / size);\n\n // Solve the local system Ax=b for x\n std::vector local_x(N, 0.0);\n for (auto const& e : A) {\n local_x[e.row] += e.value * local_b[e.column];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_x(N, 0.0);\n for (auto const& element : A) {\n if 
(element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n \n int *rows = new int[A.size()];\n int *cols = new int[A.size()];\n double *vals = new double[A.size()];\n for (size_t i = 0; i < A.size(); i++) {\n rows[i] = A[i].row;\n cols[i] = A[i].column;\n vals[i] = A[i].value;\n }\n \n // Create a sparse matrix object\n int nrows = N;\n int ncols = N;\n int nnz = A.size();\n int *ia = new int[nrows + 1];\n int *ja = new int[nnz];\n double *a = new double[nnz];\n \n // Convert the COO format to CSR format\n ia[0] = 0;\n for (int i = 0; i < nnz; i++) {\n ja[i] = cols[i];\n a[i] = vals[i];\n }\n for (int i = 1; i < nrows + 1; i++) {\n ia[i] = ia[i - 1];\n for (int j = ia[i - 1]; j < nnz; j++) {\n if (ja[j] >= i) {\n ia[i] = j;\n break;\n }\n }\n }\n \n // Create a parallel sparse matrix object\n Mat A_csr(nrows, ncols, nnz, ia, ja, a);\n \n // Create a parallel dense vector for b\n Vec b_vec(nrows);\n for (int i = 0; i < nrows; i++) {\n b_vec[i] = b[i];\n }\n \n // Create a parallel dense vector for x\n Vec x_vec(nrows);\n \n // Solve the linear system\n A_csr.solve(b_vec, x_vec);\n \n // Gather the solution from all ranks to rank 0\n if (rank == 0) {\n for (int i = 0; i < nrows; i++) {\n x[i] = x_vec[i];\n }\n }\n \n // Clean up\n delete[] rows;\n delete[] cols;\n delete[] vals;\n delete[] ia;\n delete[] ja;\n delete[] a;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the number of elements in each row of A\n std::vector rowCounts(N, 0);\n for (auto const& e : A) {\n rowCounts[e.row]++;\n }\n\n // Compute the offsets of each row in the CSR format\n std::vector rowOffsets(N, 0);\n for (size_t i = 1; i < N; i++) {\n rowOffsets[i] = rowOffsets[i - 1] + rowCounts[i - 1];\n }\n\n // Convert A to CSR format\n std::vector colIndices(A.size());\n std::vector values(A.size());\n for (auto const& e : A) {\n colIndices[rowOffsets[e.row] + rowCounts[e.row] - 1] = e.column;\n values[rowOffsets[e.row] + rowCounts[e.row] - 1] = e.value;\n rowCounts[e.row]--;\n }\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivotRow = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(values[rowOffsets[j] + rowCounts[j] - 1]) > std::abs(values[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1])) {\n pivotRow = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivotRow != i) {\n std::swap(rowOffsets[i], rowOffsets[pivotRow]);\n std::swap(rowCounts[i], rowCounts[pivotRow]);\n std::swap(values[rowOffsets[i] + 
rowCounts[i] - 1], values[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1]);\n std::swap(colIndices[rowOffsets[i] + rowCounts[i] - 1], colIndices[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = values[rowOffsets[j] + rowCounts[j] - 1] / values[rowOffsets[i] + rowCounts[i] - 1];\n for (size_t k = rowOffsets[j] + rowCounts[j] - 1; k >= rowOffsets[j]; k--) {\n values[k] -= factor * values[rowOffsets[i] + rowCounts[i] - 1 + k - rowOffsets[j]];\n colIndices[k] = colIndices[rowOffsets[i] + rowCounts[i] - 1 + k - rowOffsets[j]];\n }\n rowCounts[j] = rowCounts[i] - 1;\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = rowOffsets[i] + rowCounts[i] - 1; j < rowOffsets[i] + rowCounts[i]; j++) {\n x[i] -= values[j] * x[colIndices[j]];\n }\n x[i] /= values[rowOffsets[i] + rowCounts[i] - 1];\n }\n\n // Gather the solution on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N / size], N / size, MPI_DOUBLE, i, 0, comm, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N / size], N / size, MPI_DOUBLE, 0, 0, comm);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Create a new MPI datatype for COOElement\n MPI_Datatype coo_element_type;\n MPI_Type_contiguous(3, MPI_DOUBLE, &coo_element_type);\n MPI_Type_commit(&coo_element_type);\n\n // Scatter the COO matrix and b vector to all ranks\n std::vector local_A(A.size() / MPI::COMM_WORLD.Get_size());\n std::vector local_b(b.size() / MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Scatter(A.data(), A.size(), coo_element_type, local_A.data(), local_A.size(), coo_element_type, 0);\n MPI::COMM_WORLD.Scatter(b.data(), b.size(), MPI_DOUBLE, local_b.data(), local_b.size(), MPI_DOUBLE, 0);\n\n // Solve the local linear system\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (auto& element : local_A) {\n if (element.row == i) {\n sum += element.value * local_b[element.column];\n }\n }\n local_x[i] = sum;\n }\n\n // Gather the local solutions to rank 0\n MPI::COMM_WORLD.Gather(local_x.data(), local_x.size(), MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the number of elements per rank\n size_t num_elements_per_rank = A.size() / size;\n // Compute the starting index of the elements for this rank\n size_t start_index = rank * num_elements_per_rank;\n // Compute the ending index of the elements for this rank\n size_t end_index = (rank + 1) * num_elements_per_rank;\n\n // Create a local copy of the elements for this rank\n std::vector local_A(A.begin() + start_index, A.begin() + end_index);\n\n // Create a local copy of the vector b for this rank\n std::vector 
local_b(b.begin() + start_index, b.begin() + end_index);\n\n // Create a local copy of the vector x for this rank\n std::vector local_x(N);\n\n // Solve the local linear system\n for (size_t i = 0; i < local_A.size(); i++) {\n const COOElement& element = local_A[i];\n local_x[element.row] += element.value * local_b[element.column];\n }\n\n // Reduce the local solutions to the global solution on rank 0\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, comm);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local row indices and values for this rank\n std::vector local_rows;\n std::vector local_values;\n for (auto const& element : A) {\n if (element.row / N == rank) {\n local_rows.push_back(element.row % N);\n local_values.push_back(element.value);\n }\n }\n\n // Create a sparse matrix in CSR format\n std::vector csr_row_offsets(size + 1);\n std::vector csr_column_indices(A.size());\n std::vector csr_values(A.size());\n MPI_Allgather(&local_rows[0], local_rows.size(), MPI_UNSIGNED_LONG, &csr_column_indices[0], local_rows.size(), MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n MPI_Allgather(&local_values[0], local_values.size(), MPI_DOUBLE, &csr_values[0], local_values.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n for (size_t i = 0; i < size; ++i) {\n csr_row_offsets[i + 1] = csr_row_offsets[i] + local_rows.size();\n }\n\n // Solve the linear system using the CSR matrix\n std::vector local_x(N);\n if (rank == 0) {\n x.resize(N);\n }\n sparse_solve(csr_row_offsets, csr_column_indices, csr_values, b, local_x);\n\n // Gather the solution from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the local number of rows and the local row indices\n size_t local_N = N / size;\n size_t start_row = rank * local_N;\n size_t end_row = (rank + 1) * local_N;\n\n // Compute the local contribution to the solution\n std::vector local_x(local_N);\n for (size_t i = start_row; i < end_row; ++i) {\n local_x[i - start_row] = b[i];\n for (COOElement const& e : A) {\n if (e.row == i) {\n local_x[i - start_row] -= e.value * x[e.column];\n }\n }\n local_x[i - start_row] /= A[i].value;\n }\n\n // Gather the local solutions into the global solution\n MPI_Gather(local_x.data(), local_N, MPI_DOUBLE, x.data(), local_N, MPI_DOUBLE, 0, comm);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for 
(auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a vector object from the dense vector\n Eigen::VectorXd vector(N);\n for (size_t i = 0; i < N; ++i) {\n vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd result = matrix.ldlt().solve(vector);\n\n // Copy the result to the output vector\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = result(i);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n int nrows = N / size + (rank < N % size);\n std::vector localA(A.begin() + rank * (N / size), A.begin() + rank * (N / size) + nrows);\n std::vector localb(b.begin() + rank * (N / size), b.begin() + rank * (N / size) + nrows);\n std::vector localx(nrows);\n \n // Solve the local system\n for (size_t i = 0; i < nrows; i++) {\n double sum = 0;\n for (size_t j = 0; j < localA.size(); j++) {\n if (localA[j].row == i) {\n sum += localA[j].value * localx[localA[j].column];\n }\n }\n localx[i] = (localb[i] - sum) / localA[i].value;\n }\n \n // Gather the local solutions\n MPI_Gather(localx.data(), nrows, MPI_DOUBLE, x.data(), nrows, MPI_DOUBLE, 0, comm);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the local part of the matrix\n size_t local_size = A.size() / size;\n\n // Compute the local part of the matrix\n std::vector local_A(A.begin() + rank * local_size, A.begin() + (rank + 1) * local_size);\n\n // Compute the local part of the vector b\n std::vector local_b(b.begin() + rank * local_size, b.begin() + (rank + 1) * local_size);\n\n // Compute the local part of the solution vector\n std::vector local_x(N);\n\n // Solve the local linear system\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (auto& element : local_A) {\n if (element.column == i) {\n sum += element.value * local_x[element.row];\n }\n }\n local_x[i] = (local_b[i] - sum) / local_A[i].value;\n }\n\n // Gather the local solutions into the global solution vector\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the number of elements per rank\n size_t elementsPerRank = A.size() / size;\n \n // Compute the starting index of the elements for this rank\n size_t startIndex = 
rank * elementsPerRank;\n \n // Compute the ending index of the elements for this rank\n size_t endIndex = (rank + 1) * elementsPerRank;\n \n // Create a local copy of the elements for this rank\n std::vector localA(A.begin() + startIndex, A.begin() + endIndex);\n \n // Create a local copy of the vector b for this rank\n std::vector localb(b.begin() + startIndex, b.begin() + endIndex);\n \n // Create a local copy of the vector x for this rank\n std::vector localx(N);\n \n // Solve the local linear system\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < localA.size(); j++) {\n if (localA[j].column == i) {\n sum += localA[j].value * localb[j];\n }\n }\n localx[i] = sum;\n }\n \n // Gather the local solutions into the global solution\n MPI_Gather(localx.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_x(N, 0.0);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the local part of the matrix\n size_t local_size = A.size() / size;\n size_t start_index = rank * local_size;\n size_t end_index = (rank + 1) * local_size;\n\n // Compute the local part of the matrix-vector product\n std::vector local_Ax(N, 0.0);\n for (size_t i = start_index; i < end_index; ++i) {\n local_Ax[A[i].column] += A[i].value * b[A[i].row];\n }\n\n // Reduce the local results to the global result\n MPI_Allreduce(local_Ax.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Rank 0 stores the final result\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Each rank computes its part of the result\n std::vector local_x(N);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n \n // 
Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format with sizeA elements. x and b are dense vectors with N elements.\n Use CUDA to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\n__global__ void solveLinearSystem(const COOElement *A, size_t sizeA, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = tid / N;\n int col = tid % N;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column == col) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[tid].value;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n // Each thread computes one element of x.\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = A[row].column[col]; k < A[row].column[col + 1]; ++k) {\n sum += A[row].value[k] * x[A[row].column[k]];\n }\n b[row] = (b[row] - sum) / A[row].value[A[row].column[col + 1] - 1];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n 
}\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n double sum = 0;\n for (int k = 0; k < size; ++k) {\n if (A[k].row == row && A[k].column == col) {\n sum += A[k].value * x[A[k].column];\n }\n }\n x[row] = (b[row] - sum) / A[size - 1].value;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int tid_offset = tid + bid * blockDim.x;\n if (tid_offset < sizeA) {\n size_t row = A[tid_offset].row;\n size_t column = A[tid_offset].column;\n double value = A[tid_offset].value;\n atomicAdd(&x[column], value * b[row]);\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = tid + bid * blockDim.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "&", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the products of A and b for each row\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value * b[A[i].column];\n }\n\n // Gather the row sums to rank 0\n std::vector globalRowSums(N);\n MPI_Allgather(rowSums.data(), N, MPI_DOUBLE, globalRowSums.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = globalRowSums[i];\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a dense vector for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; ++i) {\n b_eigen(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_eigen = matrix.ldlt().solve(b_eigen);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_eigen(i);\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix A_sparse(N, N);\n for (auto const& e : A) {\n A_sparse.insert(e.row, e.column) = e.value;\n }\n A_sparse.makeCompressed();\n\n // Create dense vectors for x and b\n Eigen::VectorXd x_dense(N), b_dense(N);\n for (size_t i = 0; i < N; ++i) {\n x_dense[i] = x[i];\n b_dense[i] = b[i];\n }\n\n // Solve the linear system using Eigen\n x_dense = A_sparse.colPivHouseholderQr().solve(b_dense);\n\n // Copy the result back to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_dense[i];\n }\n }\n}\n", + "\n\n // Get the number of processes and the rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per process\n int elementsPerProcess = N / size;\n\n // Create a buffer to store the local part of A and b\n std::vector localA(elementsPerProcess);\n std::vector localb(elementsPerProcess);\n\n // Scatter A and b to all processes\n MPI_Scatter(A.data(), elementsPerProcess, MPI_DOUBLE, localA.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(b.data(), elementsPerProcess, MPI_DOUBLE, localb.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Create a local copy of x\n std::vector localx(N, 0.0);\n\n // Perform the Gaussian elimination in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n // Find the pivot element\n int pivotRow = i;\n double maxPivot = std::abs(localA[i].value);\n for (int j = i + 1; j < elementsPerProcess; j++) {\n if (std::abs(localA[j].value) > maxPivot) {\n pivotRow = j;\n maxPivot = std::abs(localA[j].value);\n }\n }\n\n // Swap the pivot row with the first row\n std::swap(localA[i], localA[pivotRow]);\n std::swap(localb[i], localb[pivotRow]);\n\n // Eliminate the non-zero elements below the pivot\n for (int j = i + 1; j < elementsPerProcess; j++) {\n double factor = localA[j].value / localA[i].value;\n localA[j].value -= 
factor * localA[i].value;\n localb[j] -= factor * localb[i];\n }\n }\n\n // Gather the local solutions into a global solution\n MPI_Gather(localx.data(), elementsPerProcess, MPI_DOUBLE, x.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform the back substitution to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (int j = i + 1; j < N; j++) {\n sum += localA[j].value * x[j];\n }\n x[i] = (localb[i] - sum) / localA[i].value;\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the local contribution of each rank to the solution\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column];\n }\n\n // Reduce the local contributions to the global solution on rank 0\n MPI_Reduce(x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0);\n\n // Compute the sum of the products of A and x for each row\n std::vector rowSums(N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value * x[A[i].column];\n }\n\n // Gather the row sums to rank 0\n std::vector globalRowSums(N);\n MPI_Allgather(rowSums.data(), N, MPI_DOUBLE, globalRowSums.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Solve the system of equations on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = (b[i] - globalRowSums[i]) / A[i].value;\n }\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Iterate over the elements of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement& element = A[i];\n x[element.row] += element.value * b[element.column];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(&rowSums[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n\n // Gather x from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "<}", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Step 1: Distribute the rows of A and b to each rank\n std::vector localA;\n std::vector localb;\n distributeRows(A, b, localA, localb, rank, size, N);\n\n // Step 2: Solve the local system on each rank\n std::vector localx(N, 0.0);\n for (auto& element : localA) {\n localx[element.row] += element.value * localb[element.column];\n }\n\n // Step 3: Gather the local solutions to rank 0\n MPI_Gather(localx.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(rowSums.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if 
(MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n}\n", + "\n\n // Initialize x to 0\n x.resize(N);\n std::fill(x.begin(), x.end(), 0.0);\n\n // Perform Gauss-Seidel iteration\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n x[i] = (b[i] - sum) / A[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.resize(N);\n std::fill(x.begin(), x.end(), 0.0);\n\n // Perform Gauss-Jordan elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i].value) > std::abs(A[pivot_row * N + i].value)) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the ith row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the ith column\n double pivot = A[i * N + i].value;\n for (size_t j = 0; j < N; ++j) {\n if (j != i) {\n double factor = A[j * N + i].value / pivot;\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k].value -= factor * A[i * N + k].value;\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j].value * x[j];\n }\n x[i] /= A[i * N + i].value;\n }\n\n // Gather the solution on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n MPI::COMM_WORLD.Gather(&x[0], N, MPI::DOUBLE, &x[0], N, MPI::DOUBLE, 0);\n } else {\n MPI::COMM_WORLD.Gather(&x[0], N, MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n\n // Create a dense vector for b\n Eigen::VectorXd b_vector(N);\n for (size_t i = 0; i < N; ++i) {\n b_vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_vector = matrix.ldlt().solve(b_vector);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_vector(i);\n }\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0);\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] += element.value * b[element.column];\n }\n }\n }\n\n // Gather the results to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the local contribution of each rank to the solution\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n x[e.row] += e.value * b[e.column];\n }\n\n // Reduce the local contributions to the global solution on rank 0\n MPI_Reduce(x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix A_sparse(N, N);\n for (auto const& element : A) {\n A_sparse.insert(element.row, 
element.column) = element.value;\n }\n\n // Create dense vectors for x and b\n Eigen::VectorXd x_dense(N), b_dense(N);\n for (size_t i = 0; i < N; ++i) {\n x_dense[i] = x[i];\n b_dense[i] = b[i];\n }\n\n // Solve the linear system\n Eigen::SimplicialLDLT> solver;\n solver.compute(A_sparse);\n x_dense = solver.solve(b_dense);\n\n // Copy the result back to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_dense[i];\n }\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(rowSums.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n\n // Gather x from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Perform Gauss-Jordan elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i].value) > std::abs(A[pivot_row * N + i].value)) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j].value, A[pivot_row * N + j].value);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n double pivot = A[i * N + i].value;\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j].value /= pivot;\n }\n b[i] /= pivot;\n for (size_t j = 0; j < N; ++j) {\n if (j != i) {\n double factor = A[j * N + i].value;\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k].value -= factor * A[i * N + k].value;\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(Kokkos::View &A, Kokkos::View &L, Kokkos::View &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Iterate over the rows of A\n Kokkos::parallel_for(A.extent(0), KOKKOS_LAMBDA(size_t i) {\n // Get the row of A\n const COOElement* row = &A(i);\n\n // Iterate over the columns of the row\n for (size_t j = 0; j < row->column; ++j) {\n // Get the value of the element\n double value = row->value;\n\n // Update L and U\n for (size_t k = 0; k < i; ++k) {\n L(i, k) -= value * U(k, j);\n }\n U(i, j) = value;\n }\n });\n}\n", + "\n\n // Create a parallel for loop to iterate over the rows of A\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n // Initialize the ith row of L and U to the identity matrix\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n\n // Iterate over the non-zero elements in the ith row of A\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n // Update the ith row of L and U\n L(i, A(j).column) = A(j).value;\n U(i, A(j).column) = A(j).value;\n\n // Iterate over the rows below the ith row\n for (size_t k = i + 1; k < N; ++k) {\n // Update the kth row of L and U\n L(k, A(j).column) -= A(j).value * L(k, i);\n U(k, A(j).column) -= A(j).value * U(k, i);\n }\n }\n }\n });\n}\n", + "\n\n // Initialize L and U\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = A(i, i).value;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n L(i, k) = A(i, k).value / U(k, k);\n for (size_t j = k + 1; j < N; ++j) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Iterate over the rows of A\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n // Iterate over the columns of A\n for (size_t j = 0; j < N; j++) {\n // If A(i, j) is non-zero, then update L and U\n if (A(i, j).value != 0.0) {\n // Update L\n L(i, j) = A(i, j).value / U(j, j);\n\n // Update U\n for (size_t k = j + 1; k < N; k++) {\n U(j, k) -= L(i, j) * A(i, k).value;\n }\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 
0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy>(0, N), KOKKOS_LAMBDA(const int &i, const int &j) {\n if (i == j) {\n L(i, j) = 1.0;\n U(i, j) = A(i, j);\n } else if (i > j) {\n L(i, j) = A(i, j) / U(j, j);\n U(i, j) = 0.0;\n } else {\n U(i, j) = A(i, j) / L(i, i);\n L(i, j) = 0.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n L(j, i) -= L(j, k) * U(k, i);\n }\n L(j, i) /= U(i, i);\n }\n });\n}\n", + "\n\n // Create a copy of A\n auto Acopy = KokKOS::make_view(A);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n Acopy(j, i) -= Acopy(j, i) * Acopy(i, i);\n }\n }\n\n // Store L and U\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n L(i, j) = 1.0;\n } else if (i < j) {\n L(i, j) = Acopy(i, j);\n } else {\n U(i, j) = Acopy(i, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0.0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n L(i, j) -= L(i, k) * U(k, j);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n U(i, j) /= L(i, i);\n }\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n U(i, j) += A(i, j).value;\n } else {\n L(i, j) += A(i, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0;\n U(i, j) = 0;\n }\n L(i, i) = 1;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), 
KOKKOS_LAMBDA(const Kokkos::TeamPolicy::member_type &team) {\n const size_t i = team.league_rank();\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < i; k++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n if (i == j) {\n L(i, i) = 1 / U(i, i);\n } else {\n L(i, j) = -U(i, j) * L(j, j);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L(i, k) = A(i, k) / U(k, k);\n for (size_t j = k + 1; j < N; j++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA(const size_t k) {\n const auto &e = A(k);\n const double a_kk = U(e.row, e.row);\n const double l_ik = L(e.column, e.row);\n U(e.row, e.column) = e.value - l_ik * a_kk;\n for (size_t i = e.row + 1; i < N; i++) {\n const double l_ik = L(e.column, e.row);\n const double a_ki = U(e.row, i);\n L(i, e.column) = (e.value - a_ki * a_kk) / l_ik;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t row) {\n double sumL = 0.0;\n double sumU = 0.0;\n for (size_t i = 0; i < row; i++) {\n sumL += L(row, i) * U(i, row);\n sumU += L(i, row) * U(row, i);\n }\n L(row, row) = 1.0;\n U(row, row) = A(row, row) - sumU;\n for (size_t col = row + 1; col < N; col++) {\n L(row, col) = (A(row, col) - sumL) / U(row, row);\n U(row, col) = A(row, col) - sumU;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA(const int& k) {\n const COOElement& element = A(k);\n const size_t i = element.row;\n const size_t j = element.column;\n const double value = element.value;\n U(i, j) = value;\n for (size_t p = 0; p < i; p++) {\n L(i, p) += value * L(j, p);\n U(j, p) -= value * U(i, p);\n }\n L(i, i) -= value * L(j, i);\n U(j, j) += value * U(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; ++j) {\n double sum = 0;\n for (size_t k = 0; k < j; ++k) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n 
Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U using the k-th row of A\n for (size_t i = k + 1; i < N; i++) {\n L(i, k) = A(i, k) / U(k, k);\n for (size_t j = k + 1; j < N; j++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sumL = 0.0, sumU = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization in parallel\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k].value / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * U[i * N + k];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n U[i * N + j] = A[i * N + j] - L[i * N + k] * U[k 
* N + j];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n #pragma opm parallel for\n for (size_t k = 0; k < A.size(); ++k) {\n size_t row = A[k].row;\n size_t column = A[k].column;\n double value = A[k].value;\n #pragma opm atomic\n L[row * N + column] += value;\n #pragma opm atomic\n U[column * N + row] += value;\n }\n #pragma opm parallel for\n for (size_t i = 0; i < N; ++i) {\n #pragma opm atomic\n L[i * N + i] = 1;\n }\n}\n", + "<=====,", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n if (row == column)\n L[row * N + column] = 1;\n if (row < column)\n U[row * N + column] = value;\n if (row > column)\n L[row * N + column] = value;\n }\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = 0; i < k; ++i) {\n U[k * N + j] -= L[k * N + i] * U[i * N + j];\n }\n }\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = 0; j < k; ++j) {\n L[i * N + k] -= U[i * N + j] * L[j * N + k];\n }\n L[i * N + k] /= U[k * N + k];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement &e = A[i];\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1.0;\n U[e.row * N + e.column] = e.value;\n } else if (e.row < e.column) {\n L[e.row * N + e.column] = e.value / U[e.row * N + e.row];\n U[e.column * N + e.row] = 0.0;\n } else {\n U[e.row * N + e.column] = e.value;\n L[e.column * N + e.row] = 0.0;\n }\n }\n\n #pragma omp parallel for\n for (size_t k = 1; k < N; k++) {\n for (size_t i = k; i < N; i++) {\n for (size_t j = 0; j < k; j++) {\n U[i * N + k] -= L[i * N + j] * U[j * N + k];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n L[k * N + j] -= L[k * N + k] * U[k * N + j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sumL = 0.0, sumU = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n U[i * N + j] = A[i * N + j] - L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L[i * N + k] * U[k * N + j];\n }\n if (i == j) {\n L[i * N + i] = 1;\n U[i * N + i] = A[i].value - sum;\n } else {\n L[i * N + j] = (A[i].value - sum) / U[j * N + j];\n U[i * N + j] = 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel 
for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform the factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n }\n\n // Update U\n for (size_t j = k + 1; j < N; j++) {\n for (size_t i = k + 1; i < N; i++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n A[i * N + j] -= U[k * N + j] * A[i * N + k];\n L[i * N + j] = A[i * N + j] / L[k * N + k];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n double sum = 0;\n for (size_t k = 0; k < j; ++k) {\n sum += L[i][k] * U[k][j];\n }\n L[i][j] = (A[i][j] - sum) / U[j][j];\n }\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[i][k] * U[k][i];\n }\n U[i][i] = A[i][i] - sum;\n for (size_t j = i + 1; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[j][k] * U[k][j];\n }\n U[i][j] = (A[i][j] - sum) / L[i][i];\n }\n }\n\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n A[i * N + j] -= U[k * N + j] * A[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + 
"do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.emplace_back(e.row, e.column, 1);\n U.emplace_back(e.row, e.column, e.value);\n } else {\n L.emplace_back(e.row, e.column, e.value / rowSum[e.row]);\n U.emplace_back(e.row, e.column, 0);\n }\n }\n for (size_t i = 1; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n for (size_t k = 0; k < N; k++) {\n U[i * N + k] -= L[i * N + j] * U[j * N + k];\n }\n }\n }\n}\n", + "\n\n std::vector row(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1;\n U[e.row * N + e.column] = e.value;\n } else {\n row[e.column] = e.value / U[e.row * N + e.row];\n for (size_t i = e.column; i < N; ++i) {\n L[e.row * N + i] = row[i];\n U[e.row * N + i] -= row[i] * U[e.column * N + i];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& e : A) {\n L[e.row * N + e.column] = (e.row == e.column) ? 1 : 0;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = 0; i < k; ++i) {\n U[k * N + j] -= L[k * N + i] * U[i * N + j];\n }\n L[j * N + k] = U[j * N + k] / U[k * N + k];\n for (size_t i = k + 1; i < N; ++i) {\n U[j * N + i] -= L[j * N + k] * U[k * N + i];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& element : A) {\n L[element.row * N + element.column] = element.value;\n U[element.row * N + element.column] = element.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (L[i * N + j] != 0) {\n double factor = L[i * N + j] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= factor * L[i * N + k];\n U[i * N + k] -= factor * U[j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n L[i * N + j] = 1;\n } else if (i > j) {\n L[i * N + j] = L_temp[i][j] / L_temp[j][j];\n for (size_t k = 0; k < N; k++) {\n U_temp[i][k] -= L[i * N + j] * U_temp[j][k];\n }\n } else {\n U[i * N + j] = U_temp[i][j];\n }\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.push_back(1);\n U.push_back(e.value / rowSum[e.row]);\n } else if (e.row < e.column) {\n L.push_back(0);\n U.push_back(e.value / rowSum[e.row]);\n } else {\n L.push_back(e.value / rowSum[e.column]);\n U.push_back(0);\n }\n }\n}\n", + "\n\n // Initialize L and U to be identity matrices\n L.resize(N * N);\n U.resize(N * N);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n U[i * N + i] = 1;\n }\n\n // Iterate over the elements of A\n for (auto const& element : A) {\n // Update L and U\n for (size_t i = 0; i < element.row; i++) {\n L[element.row * N + i] -= L[element.column * N + i] * element.value;\n }\n for (size_t j = element.column + 1; j < N; j++) {\n U[element.row * N + j] -= U[element.column * N + j] * element.value;\n }\n }\n}\n", + "\n\n std::vector> row_ptr(N + 1);\n std::vector col_idx(A.size());\n 
std::vector values(A.size());\n for (size_t i = 0; i < A.size(); i++) {\n row_ptr[A[i].row + 1]++;\n col_idx[i] = A[i].column;\n values[i] = A[i].value;\n }\n for (size_t i = 1; i <= N; i++) {\n row_ptr[i] += row_ptr[i - 1];\n }\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < A.size(); i++) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double value = A[i].value;\n U[row_ptr[row] + col - 1] = value;\n for (size_t j = row_ptr[row]; j < row_ptr[row + 1]; j++) {\n if (col_idx[j] < col) {\n L[row_ptr[col] + col_idx[j] - 1] += value * U[row_ptr[row] + col_idx[j] - 1];\n } else if (col_idx[j] > col) {\n U[row_ptr[row] + col_idx[j] - 1] -= value * L[row_ptr[col] + col_idx[j] - 1];\n }\n }\n }\n for (size_t i = 0; i < N; i++) {\n L[row_ptr[i] + i - 1] = 1;\n }\n}\n", + "\n\n std::vector L_row, L_col, U_row, U_col;\n std::vector L_val, U_val;\n L_row.reserve(A.size());\n L_col.reserve(A.size());\n L_val.reserve(A.size());\n U_row.reserve(A.size());\n U_col.reserve(A.size());\n U_val.reserve(A.size());\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < A.size(); k++) {\n if (A[k].row == i && A[k].column == j) {\n sumU = A[k].value;\n U_row.push_back(i);\n U_col.push_back(j);\n U_val.push_back(sumU);\n }\n if (A[k].row == i && A[k].column < j) {\n sumL += A[k].value;\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(sumL);\n }\n if (A[k].row < i && A[k].column == j) {\n sumL += A[k].value;\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(sumL);\n }\n }\n if (sumU == 0) {\n U_row.push_back(i);\n U_col.push_back(j);\n U_val.push_back(0);\n }\n if (sumL == 0) {\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(0);\n }\n }\n }\n L.resize(L_row.size());\n U.resize(U_row.size());\n for (size_t i = 0; i < L_row.size(); i++) {\n L[i].row = L_row[i];\n L[i].column = L_col[i];\n L[i].value = L_val[i];\n }\n for (size_t i = 0; i < U_row.size(); i++) {\n U[i].row = U_row[i];\n U[i].column = U_col[i];\n U[i].value = U_val[i];\n }\n}\n", + "\n\n std::vector> L_matrix(N, std::vector(N, 0.0));\n std::vector> U_matrix(N, std::vector(N, 0.0));\n\n for (auto const& element : A) {\n L_matrix[element.row][element.column] = element.value;\n U_matrix[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n L_matrix[i][j] = 1.0;\n } else if (i > j) {\n L_matrix[i][j] = 0.0;\n } else {\n U_matrix[i][j] = 0.0;\n }\n }\n }\n\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n double factor = L_matrix[i][k] / L_matrix[k][k];\n for (size_t j = k; j < N; ++j) {\n L_matrix[i][j] -= factor * U_matrix[k][j];\n U_matrix[i][j] -= factor * L_matrix[k][j];\n }\n }\n }\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n L[i * N + j] = L_matrix[i][j];\n U[i * N + j] = U_matrix[i][j];\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n\n for (size_t i = 0; i < A.size(); i++) {\n L[A[i].row * N + A[i].column] = A[i].value;\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = L[i * N + k] / L[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] = U[i * N + j] - L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector row(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) 
{\n L.emplace_back(COOElement{e.row, e.column, 1});\n row[e.column] = e.value;\n } else {\n U.emplace_back(COOElement{e.row, e.column, e.value});\n }\n }\n for (auto const& e : A) {\n if (e.row != e.column) {\n L.emplace_back(COOElement{e.row, e.column, e.value / row[e.column]});\n for (auto& u : U) {\n if (u.row == e.row && u.column == e.column) {\n u.value -= e.value * L.back().value;\n }\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n U[row * N + column] = value;\n for (size_t j = 0; j < row; ++j) {\n L[row * N + j] -= L[i * N + j] * value / U[j * N + j];\n }\n for (size_t j = row + 1; j < N; ++j) {\n U[row * N + j] -= L[i * N + j] * value / U[j * N + j];\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n for (size_t i = 0; i < N; ++i) {\n L_temp[i][i] = 1;\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n L_temp[i][k] = L_temp[i][k] / L_temp[k][k];\n for (size_t j = k + 1; j < N; ++j) {\n U_temp[k][j] = U_temp[k][j] - L_temp[i][k] * U_temp[i][j];\n }\n }\n }\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n L.push_back(L_temp[i][j]);\n U.push_back(U_temp[i][j]);\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n rowSum[e.row] += e.value;\n }\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.push_back({e.row, e.column, 1});\n U.push_back({e.row, e.column, e.value / rowSum[e.row]});\n } else if (e.row < e.column) {\n L.push_back({e.row, e.column, e.value / rowSum[e.row]});\n U.push_back({e.row, e.column, 0});\n } else {\n L.push_back({e.row, e.column, 0});\n U.push_back({e.row, e.column, e.value});\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n L_temp[i][j] = 1;\n } else if (i > j) {\n L_temp[i][j] = 0;\n }\n }\n }\n\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n double factor = U_temp[i][k] / U_temp[k][k];\n L_temp[i][k] = factor;\n for (size_t j = k; j < N; j++) {\n U_temp[i][j] -= factor * U_temp[k][j];\n }\n }\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n L.push_back(L_temp[i][j]);\n U.push_back(U_temp[i][j]);\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto &e : A) {\n L[e.row * N + e.column] = e.value;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (L[j * N + i] != 0) {\n double factor = L[j * N + i] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= factor * L[i * N + k];\n U[i * N + k] -= factor * U[j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n std::vector rowSum(N, 0);\n for (auto const& element : A) {\n rowSum[element.row] += element.value;\n }\n\n for (auto const& element : A) {\n if (element.row == 
element.column) {\n L.push_back(1);\n U.push_back(element.value / rowSum[element.row]);\n } else if (element.row < element.column) {\n L.push_back(0);\n U.push_back(element.value / rowSum[element.row]);\n } else {\n L.push_back(element.value / rowSum[element.column]);\n U.push_back(0);\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& element : A) {\n L[element.row * N + element.column] = element.value;\n U[element.row * N + element.column] = element.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = L[j * N + i] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= multiplier * L[i * N + k];\n U[i * N + k] -= multiplier * U[j * N + k];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (auto const& e : A) {\n L[e.row * N + e.column] = e.value;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n L[j * N + i] = U[j * N + i] / U[i * N + i];\n for (size_t k = i; k < N; k++) {\n U[j * N + k] -= L[j * N + i] * U[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format with sizeA elements.\n Use AMD HIP to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\n__global__ void luFactorize(const COOElement *A, size_t sizeA, double *L, double *U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[row * N + col];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n COOElement element = A[tid];\n int row = element.row;\n int col = element.column;\n double val = element.value;\n\n if (row == col) {\n L[row * N + row] = 1.0;\n U[row * N + col] = val;\n } else if (row < col) {\n L[row * N + col] = val / U[col * N + col];\n U[row * N + col] = 0.0;\n } else {\n U[row * N + col] -= L[row * N + row] * val;\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x;\n int start = tid + bid * num_threads;\n for (int i = start; i < sizeA; i += bid_size * num_threads) {\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = A[i].value / U[A[i].column];\n U[i] = 0.0;\n }\n }\n __syncthreads();\n for (int i = start; i < sizeA; i += bid_size * num_threads) {\n if (A[i].row > A[i].column) {\n U[i] -= L[i] * U[A[i].column];\n } else {\n L[i] -= L[i] * U[A[i].column];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n COOElement e = A[i];\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1.0;\n U[e.row * N + e.column] = e.value;\n for (int k = e.row + 1; k < N; k++) {\n L[k * N + e.column] = A[i].value / 
e.value;\n U[e.row * N + k] -= L[k * N + e.column] * e.value;\n }\n }\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1;\n } else {\n U[row * N + column] = value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n for (size_t i = tid; i < sizeA; i += stride) {\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = 0.0;\n U[i] = 0.0;\n }\n }\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n for (size_t j = tid; j < sizeA; j += stride) {\n if (A[j].row == i && A[j].column == k) {\n L[j] = A[j].value / U[k];\n U[j] = 0.0;\n }\n }\n for (size_t j = tid; j < sizeA; j += stride) {\n if (A[j].row == i && A[j].column > k) {\n U[j] -= L[j] * U[A[j].column];\n }\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else {\n L[tid] = 0.0;\n U[tid] = 0.0;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t i = A[tid].row;\n size_t j = A[tid].column;\n double v = A[tid].value;\n\n if (i == j) {\n L[i * N + i] = 1.0;\n U[i * N + j] = v;\n } else if (i < j) {\n L[i * N + j] = v;\n } else {\n U[i * N + j] = v;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[col * N + row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n\n if (row == column) {\n L[row * N + row] = 1.0;\n U[row * N + column] = value;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n U[row * N + column] = 0.0;\n } else {\n U[row * N + column] -= L[row * N + row] * value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n if (A[tid].row == A[tid].column) {\n L[tid] = 1;\n U[tid] = A[tid].value;\n } else if (A[tid].row < A[tid].column) {\n L[tid] = 0;\n U[tid] = A[tid].value;\n } else {\n L[tid] = A[tid].value / U[A[tid].column];\n U[tid] = 0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n const COOElement &a = A[tid];\n if (a.row == a.column) {\n L[a.row * N + a.column] = 1.0;\n U[a.row * N + a.column] = a.value;\n } else if (a.row < a.column) {\n L[a.row * N + a.column] = 0.0;\n U[a.row * N + a.column] = a.value;\n } else {\n L[a.row * N + a.column] = a.value / U[a.column * N + a.column];\n U[a.row * N + a.column] = 0.0;\n for (int k = a.column + 1; k < a.row; k++) {\n U[a.row * N + k] -= L[a.row * N + a.column] * U[a.column * N + k];\n }\n }\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n size_t col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row].value;\n } else if (row < col) {\n L[row * N + col] = 0;\n U[row * N + col] = A[row].value;\n } else {\n L[row * N + col] = A[row].value / U[col * N + col];\n U[row * N + col] = 0;\n }\n }\n}\n", + "\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= sizeA) return;\n if (A[i].row == A[i].column) {\n L[i] 
= 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = 0.0;\n U[i] = 0.0;\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n U[row * N + col] = 0.0;\n }\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column == col) {\n U[row * N + col] = A[i].value;\n } else if (A[i].row == row) {\n L[row * N + col] -= A[i].value * U[A[i].column * N + col];\n } else if (A[i].column == col) {\n U[row * N + col] -= A[i].value * L[row * N + A[i].row];\n }\n }\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n }\n U[row * N + col] = A[row * N + col].value;\n }\n __syncthreads();\n for (int k = 0; k < row; k++) {\n if (U[k * N + col] != 0.0) {\n L[row * N + k] = A[row * N + k].value / U[k * N + col];\n for (int j = col; j < N; j++) {\n U[row * N + j] -= L[row * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n if (A[tid].row == A[tid].column) {\n L[tid * N + A[tid].column] = 1;\n U[tid * N + A[tid].column] = A[tid].value;\n } else {\n L[tid * N + A[tid].column] = 0;\n U[tid * N + A[tid].column] = A[tid].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int i = A[tid].row;\n int j = A[tid].column;\n double v = A[tid].value;\n if (i == j) {\n L[i * N + i] = 1;\n U[i * N + i] = v;\n } else if (i < j) {\n L[i * N + j] = v / U[j * N + j];\n U[i * N + j] = 0;\n } else {\n U[i * N + j] -= L[i * N + i] * v;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) {\n return;\n }\n const COOElement &a = A[tid];\n if (a.row == a.column) {\n L[a.row * N + a.column] = 1.0;\n U[a.row * N + a.column] = a.value;\n } else if (a.row > a.column) {\n L[a.row * N + a.column] = a.value / U[a.column * N + a.column];\n U[a.row * N + a.column] = 0.0;\n } else {\n U[a.row * N + a.column] = a.value;\n L[a.row * N + a.column] = 0.0;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else if (row < column) {\n L[tid] = 0.0;\n U[tid] = value;\n } else {\n L[tid] = value / U[column * N + column];\n U[tid] = 0.0;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
Store the result in L and U on rank 0.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Find the pivot element in column k\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k * N + k].value);\n for (size_t i = k + 1; i < N; ++i) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows k and pivot_row in L and U\n for (size_t j = 0; j < N; ++j) {\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n std::swap(U[k * N + j], U[pivot_row * N + j]);\n }\n\n // Update the remaining rows in column k\n for (size_t i = k + 1; i < N; ++i) {\n double multiplier = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j].value -= multiplier * A[k * N + j].value;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector L_recv(N * N, 0.0);\n std::vector U_recv(N * N, 0.0);\n MPI::COMM_WORLD.Recv(L_recv.data(), N * N, MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), N * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; ++j) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), N * N, MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector L_recv, U_recv;\n MPI::COMM_WORLD.Recv(L_recv.data(), L_recv.size(), MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), U_recv.size(), MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; j++) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), L.size(), MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), U.size(), MPI::DOUBLE, 0, 0);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 
0.0);\n\n // Compute L and U in parallel\n for (size_t i = 0; i < N; ++i) {\n // Compute L(i, :)\n for (size_t j = 0; j < i; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < j; ++k) {\n sum += L[i * N + k] * U[k * N + j];\n }\n L[i * N + j] = (A[i].value - sum) / U[j * N + j];\n }\n\n // Compute U(i, :)\n for (size_t j = i; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[i * N + k] * U[k * N + j];\n }\n U[i * N + j] = A[i].value - sum;\n }\n }\n\n // Gather L and U on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n MPI::COMM_WORLD.Gather(&L[0], N * N, MPI::DOUBLE, &L[0], N * N, MPI::DOUBLE, 0);\n MPI::COMM_WORLD.Gather(&U[0], N * N, MPI::DOUBLE, &U[0], N * N, MPI::DOUBLE, 0);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j].value, A[pivot_row * N + j].value);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Eliminate the elements below the pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = factor;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= factor * A[k * N + j].value;\n }\n }\n }\n\n // Copy the upper triangular matrix to U\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i; j < N; j++) {\n U[i * N + j] = A[i * N + j].value;\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A and store the result in L\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k; j < N; ++j) {\n L[i * N + k] -= U[i * N + j] * U[k * N + j];\n }\n }\n\n // For each column j > k, subtract the k-th column of L from the j-th column of A and store the result in U\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = k; i < N; ++i) {\n U[k * N + j] -= L[i * N + k] * U[i * N + j];\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A\n for (size_t i = k + 1; i < N; i++) {\n double factor = U[i * N + k] / U[k * N + k];\n for (size_t j = k; j < N; j++) {\n U[i * N + j] -= factor * U[k * N + j];\n }\n L[i * N + k] = factor;\n }\n }\n\n // Gather the L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n std::vector L(A);\n std::vector U(A);\n\n // Perform Gaussian elimination\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the pivot row\n MPI_Bcast(&A[k], sizeof(COOElement) * A[k].size(), MPI_BYTE, k, MPI_COMM_WORLD);\n\n // Eliminate the pivot column in the remaining rows\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n double factor = A[i][k].value / A[k][k].value;\n for (size_t j = k; j < N; ++j) {\n A[i][j].value -= factor * A[k][j].value;\n }\n }\n\n // Update L and U\n L[k].value = 1.0;\n for (size_t j = k + 1; j < N; ++j) {\n L[j].value = A[j][k].value;\n U[k].value = A[k][j].value;\n }\n }\n\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU 
factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A\n for (size_t i = k + 1; i < N; ++i) {\n double factor = U[i * N + k] / U[k * N + k];\n for (size_t j = k; j < N; ++j) {\n U[i * N + j] -= factor * U[k * N + j];\n }\n L[i * N + k] = factor;\n }\n }\n\n // Gather the L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot row\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (A[i * N + k] > A[pivot_row * N + k]) {\n pivot_row = i;\n }\n }\n\n // Swap the pivot row with the k-th row in L and U\n for (size_t j = 0; j < N; j++) {\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n std::swap(U[k * N + j], U[pivot_row * N + j]);\n }\n\n // Eliminate the k-th column in the remaining rows\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n L[i * N + k] = factor;\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(A.size());\n U.resize(A.size());\n for (size_t i = 0; i < A.size(); ++i) {\n L[i].row = A[i].row;\n L[i].column = A[i].column;\n L[i].value = 0;\n U[i].row = A[i].row;\n U[i].column = A[i].column;\n U[i].value = 0;\n }\n\n // Perform LU factorization\n for (size_t i = 0; i < N; ++i) {\n // Update L\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i && 
A[j].column < i) {\n L[j].value = A[j].value / U[j].value;\n }\n }\n\n // Update U\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row < i && A[j].column == i) {\n U[j].value = A[j].value / L[j].value;\n }\n }\n\n // Update A\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row >= i && A[j].column >= i) {\n A[j].value -= L[j].value * U[j].value;\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recvL(A.size());\n std::vector recvU(A.size());\n MPI::COMM_WORLD.Recv(recvL.data(), A.size(), MPI_DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(recvU.data(), A.size(), MPI_DOUBLE, i, 0);\n for (size_t j = 0; j < A.size(); ++j) {\n L[j].value += recvL[j].value;\n U[j].value += recvU[j].value;\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), A.size(), MPI_DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), A.size(), MPI_DOUBLE, 0, 0);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < A.size(); i++) {\n if (std::abs(A[i].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i].value);\n }\n }\n\n // Swap the pivot row with the k-th row in A\n std::swap(A[k], A[pivot_row]);\n\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n double multiplier = A[i].value / A[k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; j++) {\n U[k * N + j] -= multiplier * A[i].value;\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n\n // Initialize the diagonal of L to 1\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Iterate over the rows of A\n for (size_t k = 0; k < A.size(); k++) {\n // Get the row, column, and value of the current element\n size_t row = A[k].row;\n size_t column = A[k].column;\n double value = A[k].value;\n\n // Update the elements of L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = value / U[k * N + k];\n U[i * N + column] -= L[i * N + k] * U[k * N + column];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n U[i * N + i] = A[i].value;\n 
}\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i > k\n for (size_t i = k + 1; i < N; i++) {\n // Calculate the multiplier\n double multiplier = A[i].value / U[k * N + k];\n\n // Update the i-th row of L\n for (size_t j = 0; j < N; j++) {\n L[i * N + j] += multiplier * U[k * N + j];\n }\n\n // Update the i-th row of U\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= multiplier * U[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = A[k].value;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > std::abs(pivot_value)) {\n pivot_row = i;\n pivot_value = A[i * N + k].value;\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j].value, A[pivot_row * N + j].value);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Eliminate the elements below the pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k].value / pivot_value;\n L[i * N + k] = factor;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= factor * A[k * N + j].value;\n U[i * N + j] = A[i * N + j].value;\n }\n }\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector L_recv(N * N, 0.0);\n std::vector U_recv(N * N, 0.0);\n MPI::COMM_WORLD.Recv(L_recv.data(), N * N, MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), N * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; j++) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), N * N, MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each rank, update the k-th row of L and the remaining rows of U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n\n // Gather the updated L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 
0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each rank, update the k-th row of L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "<,", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "_t* 0.", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format with sizeA elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\n__global__ void luFactorize(const COOElement *A, size_t sizeA, double *L, double *U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[row * N + col].value;\n for (int k = 0; k < row; k++) {\n U[row * N + col] -= L[row * N + k] * U[k * N + col];\n }\n L[row * N + col] = U[row * N + col] / U[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < i; j++) {\n sum += L[row * N + j] * U[j * N + A[i].column];\n }\n U[row * N + A[i].column] = A[i].value - sum;\n }\n if (A[i].column == row) {\n L[row * N + A[i].row] = A[i].value;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else if (row < column) {\n L[tid] = 0.0;\n U[tid] = value;\n } else {\n L[tid] = value / U[column * N + column];\n U[tid] = 0.0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + column] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) {\n return;\n }\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n }\n }\n}\n", + "\n\n int row = block.y * block.x + thread.y * N + thread.x;\n if (row < sizeA) {\n if (A[row].row == A[row].column) {\n L[A[row].row * N + A[row].column] = 1.0;\n U[A[row].row * N + A[row].column] = A[row].value;\n } else {\n L[A[row].row * N + A[row].column] = A[row].value / U[A[row].column * N + A[row].column];\n U[A[row].row * N + A[row].column] = 0.0;\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n double sum = 0.0;\n for (int k = col; k < N; k++) {\n sum += L[row * N + k] * U[k * N + col];\n }\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < 
sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row < col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n for (size_t k = col + 1; k < N; k++) {\n U[row * N + k] -= L[row * N + col] * U[col * N + k];\n }\n }\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n double sum = 0.0;\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row && A[k].column == col) {\n sum += A[k].value * U[col * N + A[k].column];\n }\n }\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row >= N || col >= N) return;\n\n double sum = 0;\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row && A[k].column < col) {\n sum += L[row * N + A[k].column] * U[A[k].column * N + col];\n }\n }\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row * N + col] - sum;\n } else {\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n U[row * N + col] = 0;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n }\n U[row * N + col] = A[row * N + col].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n } else if (row < column) {\n L[tid] = 0.0;\n } else {\n U[tid] = value / L[column * N + column];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0;\n U[row * N + col] = A[row * N + 
col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n U[row * N + column] = value;\n } else if (row < column) {\n L[row * N + column] = value;\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < sizeA; ++i) {\n const auto row = A[i].row;\n const auto column = A[i].column;\n const auto value = A[i].value;\n if (row == column) {\n L[row * N + column] = 1;\n } else if (row < column) {\n U[row * N + column] = value;\n } else {\n L[row * N + column] = value;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. Store the result in L and U on rank 0.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n 
// Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Initialize the diagonal elements of L to 1\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update the k-th row of L and U\n #pragma omp parallel for\n for (size_t j = k + 1; j < N; j++) {\n L[k * N + j] = A[k].value / U[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n U[i * N + j] -= L[k * N + j] * A[i].value;\n }\n }\n\n // Update the remaining rows of L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n for (size_t j = k + 1; j < N; j++) {\n L[i * N + j] -= L[i * N + k] * L[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Factorize A\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n 
L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n A[i].value -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Update U\n #pragma omp parallel for\n for (size_t j = k + 1; j < N; ++j) {\n U[k * N + j] = A[k].value / L[k * N + k];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; ++j) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform Gauss elimination\n for (size_t k = 0; k < N; ++k) {\n // Rank 0 broadcasts the k-th row of U to all other ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 updates the k-th row of L\n for (size_t j = k + 1; j < N; ++j) {\n L[k * N + j] = A[j].value / U[k * N + k];\n }\n\n // All ranks update 
the remaining rows of U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[k * N + j] * U[k * N + i];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k * N + k].value);\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n double multiplier = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= multiplier * A[k * N + j].value;\n U[k * N + j].value += multiplier * A[i * N + j].value;\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < N; ++i) {\n if (std::abs(A[i].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i].value);\n }\n }\n\n // Swap the pivot row with the k-th row in A, L, and U\n if (pivot_row != k) {\n std::swap(A[k], A[pivot_row]);\n for (size_t i = 0; i < N; ++i) {\n std::swap(L[k * N + i], L[pivot_row * N + i]);\n std::swap(U[k * N + i], U[pivot_row * N + i]);\n }\n }\n\n // Eliminate the elements below the pivot element\n for (size_t i = k + 1; i < N; ++i) {\n double multiplier = A[i].value / A[k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; ++j) {\n A[i].value -= multiplier * A[k].value;\n U[i * N + j] -= multiplier * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 
0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, Kokkos::View &x, Kokkos::View &y, Kokkos::View &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x[j].index == i) sum += alpha * x[j].value;\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y[j].index == i) sum += y[j].value;\n }\n z[i] = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA (const int i) {\n z[i] = alpha * x[i].value + y[i].value;\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(\"sparse_py\", 0, z.extent(0), KOKKOK_LAMBDA(size_t i) {\n z[i] = alpha * x[i].value + y[i].value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n sum += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n sum += y[j].value;\n }\n }\n z[i] = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) sum += alpha * x(j).value;\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) sum += y(j).value;\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n // Create a Kokkos kernel to perform the sparse axpy operation\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum 
= 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x(j).index == i) sum += alpha * x(j).value;\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y(j).index == i) sum += y(j).value;\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy<>(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i, double &sum) {\n sum += alpha * x(i).value + y(i).value;\n }, z(0));\n}\n", + "\n\n const size_t num_elements_x = x.extent(0);\n const size_t num_elements_y = y.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, num_elements_x + num_elements_y),\n KOKKOS_LAMBDA(const size_t i) {\n if (i < num_elements_x) {\n const size_t index = x(i).index;\n z(index) += alpha * x(i).value;\n } else {\n const size_t index = y(i - num_elements_x).index;\n z(index) += y(i - num_elements_x).value;\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i, double &sum) {\n sum += alpha * x(i).value + y(i).value;\n }, z(0));\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. 
Store the result in z.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t n = x.size();\n size_t m = y.size();\n z.resize(std::max(n, m));\n #pragma omp parallel for\n for (size_t i = 0; i < std::max(n, m); i++) {\n z[i] = 0;\n }\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n z[x[i].index] += alpha * x[i].value;\n }\n #pragma omp parallel for\n for (size_t i = 0; i < m; i++) {\n z[y[i].index] += y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n 
}\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "serial", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. 
Store the result in z.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0, j = 0; i < x.size() || j < y.size();) {\n if (j == y.size() || (i < x.size() && x[i].index < y[j].index)) {\n z[x[i].index] += alpha * x[i].value;\n i++;\n } else if (i == x.size() || (j < y.size() && y[j].index < x[i].index)) {\n z[y[j].index] += y[j].value;\n j++;\n } else {\n z[x[i].index] += alpha * x[i].value + y[j].value;\n i++;\n j++;\n }\n }\n}\n", + "\n\n z.clear();\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z.push_back(alpha * x[i].value);\n i++;\n } else {\n z.push_back(y[j].value);\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(alpha * x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n z.clear();\n z.resize(x.size() + y.size());\n auto itx = x.begin();\n auto ity = y.begin();\n auto itz = z.begin();\n while (itx != x.end() && ity != y.end()) {\n if (itx->index < ity->index) {\n *itz = itx->value;\n ++itx;\n } else if (itx->index > ity->index) {\n *itz = ity->value;\n ++ity;\n } else {\n *itz = alpha * itx->value + ity->value;\n ++itx;\n ++ity;\n }\n ++itz;\n }\n while (itx != x.end()) {\n *itz = itx->value;\n ++itx;\n ++itz;\n }\n while (ity != y.end()) {\n *itz = ity->value;\n ++ity;\n ++itz;\n }\n}\n", + ".", + "\n\n // Initialize z with zeros\n z.assign(z.size(), 0);\n\n // Iterate over x and y\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n // If the indices match, add the values and increment both i and j\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n }\n // If the index of x is smaller, add the value to z and increment i\n else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n // If the index of y is smaller, add the value to z and increment j\n else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n\n // Add the remaining elements of x\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n\n // Add the remaining elements of y\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n z.clear();\n z.resize(x.size() + y.size());\n size_t i = 0, j = 0, k = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z[k].index = x[i].index;\n z[k].value = alpha * x[i].value;\n ++i;\n } else if (x[i].index > y[j].index) {\n z[k].index = y[j].index;\n z[k].value = y[j].value;\n ++j;\n } else {\n z[k].index = x[i].index;\n z[k].value = alpha * x[i].value + y[j].value;\n ++i;\n ++j;\n }\n ++k;\n }\n while (i < x.size()) {\n z[k].index = x[i].index;\n 
z[k].value = alpha * x[i].value;\n ++i;\n ++k;\n }\n while (j < y.size()) {\n z[k].index = y[j].index;\n z[k].value = y[j].value;\n ++j;\n ++k;\n }\n z.resize(k);\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n size_t m = z.size();\n for (size_t i = 0, j = 0, k = 0; i < m; ++i) {\n if (j < x.size() && x[j].index == i) {\n z[i] += alpha * x[j].value;\n ++j;\n }\n if (k < y.size() && y[k].index == i) {\n z[i] += y[k].value;\n ++k;\n }\n }\n}\n", + "\n\n for (size_t i = 0, j = 0; i < x.size() || j < y.size();) {\n if (j == y.size() || (i < x.size() && x[i].index < y[j].index)) {\n z[x[i].index] += alpha * x[i].value;\n ++i;\n } else if (i == x.size() || (j < y.size() && y[j].index < x[i].index)) {\n z[y[j].index] += y[j].value;\n ++j;\n } else {\n z[x[i].index] += alpha * x[i].value + y[j].value;\n ++i;\n ++j;\n }\n }\n}\n", + "\n\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n } else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n i++;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n j++;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n z.clear();\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z.push_back({x[i].index, alpha * x[i].value + y[j].value});\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z.push_back({x[i].index, alpha * x[i].value});\n i++;\n } else {\n z.push_back({y[j].index, y[j].value});\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back({x[i].index, alpha * x[i].value});\n i++;\n }\n while (j < y.size()) {\n z.push_back({y[j].index, y[j].value});\n j++;\n }\n}\n", + "\n\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n } else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n i++;\n } else if (x[i].index > y[j].index) {\n 
z.push_back(y[j].value);\n j++;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n size_t x_size = x.size();\n size_t y_size = y.size();\n size_t z_size = 0;\n size_t x_index = 0;\n size_t y_index = 0;\n while (x_index < x_size && y_index < y_size) {\n if (x[x_index].index == y[y_index].index) {\n z.push_back(alpha * x[x_index].value + y[y_index].value);\n z_size++;\n x_index++;\n y_index++;\n } else if (x[x_index].index < y[y_index].index) {\n z.push_back(alpha * x[x_index].value);\n z_size++;\n x_index++;\n } else {\n z.push_back(y[y_index].value);\n z_size++;\n y_index++;\n }\n }\n while (x_index < x_size) {\n z.push_back(alpha * x[x_index].value);\n z_size++;\n x_index++;\n }\n while (y_index < y_size) {\n z.push_back(y[y_index].value);\n z_size++;\n y_index++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n auto itx = x.begin();\n auto ity = y.begin();\n while (itx != x.end() && ity != y.end()) {\n if (itx->index == ity->index) {\n z.push_back(alpha * itx->value + ity->value);\n ++itx;\n ++ity;\n } else if (itx->index < ity->index) {\n z.push_back(alpha * itx->value);\n ++itx;\n } else {\n z.push_back(ity->value);\n ++ity;\n }\n }\n while (itx != x.end()) {\n z.push_back(alpha * itx->value);\n ++itx;\n }\n while (ity != y.end()) {\n z.push_back(ity->value);\n ++ity;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "hip", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors of size Nx and Ny. Store the result in z.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x or y.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\n__global__ void sparseAxpy(double alpha, const Element *x, const Element *y, double *z, size_t Nx, size_t Ny, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < Nx) {\n size_t index = x[tid].index;\n z[index] = alpha * x[tid].value + y[index];\n } else if (tid < Nx + Ny) {\n size_t index = y[tid - Nx].index;\n z[index] = alpha * 0 + y[tid - Nx].value;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = 0;\n\t\tfor (size_t j = 0; j < Nx; j++) {\n\t\t\tif (x[j].index == i) {\n\t\t\t\tz[i] += alpha * x[j].value;\n\t\t\t}\n\t\t}\n\t\tfor (size_t j = 0; j < Ny; j++) {\n\t\t\tif (y[j].index == i) {\n\t\t\t\tz[i] += y[j].value;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n int i;\n for (i = tid; i < N; i += blockDim.x * gridDim.x) {\n if (i < Nx) {\n z[x[i].index] += alpha * x[i].value;\n }\n if (i < Ny) {\n z[y[i].index] += y[i].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double val = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) val += alpha * x[i].value;\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) val += y[i].value;\n }\n z[tid] = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double val = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n val += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n val += y[i].value;\n }\n }\n z[tid] = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = alpha * x[i].value + y[i].value;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < Nx) {\n\t\tz[x[tid].index] += alpha * x[tid].value;\n\t}\n\telse if (tid < Nx + Ny) {\n\t\tz[y[tid - Nx].index] += y[tid - Nx].value;\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = 
alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) z[i] += alpha * x[j].value;\n }\n for (size_t j = 0; j < N; j++) {\n if (y[j].index == i) z[i] += y[j].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n\tsize_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tdouble sum = 0;\n\t\tfor (size_t i = 0; i < Nx; i++) {\n\t\t\tif (x[i].index == tid) {\n\t\t\t\tsum += alpha * x[i].value;\n\t\t\t}\n\t\t}\n\t\tfor (size_t i = 0; i < Ny; i++) {\n\t\t\tif (y[i].index == tid) {\n\t\t\t\tsum += y[i].value;\n\t\t\t}\n\t\t}\n\t\tz[tid] = sum;\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n \n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& e : x) {\n z_size = std::max(z_size, e.index + 1);\n }\n for (auto const& e : y) {\n z_size = std::max(z_size, e.index + 1);\n }\n \n // Allocate memory for the output vector\n z.resize(z_size);\n \n // Compute the output vector on rank 0\n if (rank == 0) {\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n }\n \n // Broadcast the output vector to all other ranks\n MPI_Bcast(&z[0], z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the partial sums\n std::vector partial_sums(z_size);\n for (auto const& element : x) {\n partial_sums[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_sums[element.index] += element.value;\n }\n\n // Reduce the partial sums\n std::vector global_sums(z_size);\n MPI_Allreduce(partial_sums.data(), global_sums.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (my_rank == 0) {\n z = global_sums;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the size of the result vector\n size_t zSize = 0;\n for (auto const& element : x) {\n zSize = std::max(zSize, element.index + 1);\n }\n for (auto const& element : y) {\n zSize = std::max(zSize, element.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(zSize);\n\n // Compute the result vector\n for (auto const& element : x) {\n z[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n z[element.index] += element.value;\n }\n\n // Gather the result vector on rank 0\n if (myRank == 0) {\n for (int i = 1; i < numRanks; ++i) {\n std::vector recvBuffer;\n MPI_Recv(&recvBuffer[0], recvBuffer.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, 
MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recvBuffer.size(); ++j) {\n z[j] += recvBuffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n std::vector local_x, local_y;\n \n // Scatter x and y to all ranks\n MPI_Scatter(&x[0], x.size(), MPI_DOUBLE, &local_x[0], local_x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(&y[0], y.size(), MPI_DOUBLE, &local_y[0], local_y.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n \n // Compute local z\n std::vector local_z(z.size(), 0);\n for (size_t i = 0; i < local_x.size(); ++i) {\n local_z[local_x[i].index] += alpha * local_x[i].value;\n }\n for (size_t i = 0; i < local_y.size(); ++i) {\n local_z[local_y[i].index] += local_y[i].value;\n }\n \n // Gather local z to rank 0\n MPI_Gather(&local_z[0], local_z.size(), MPI_DOUBLE, &z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Compute the number of elements in the sparse vector\n size_t num_elements = x.size();\n\n // Create a vector of zeros to store the result\n z.resize(num_elements);\n\n // Loop over the elements in the sparse vector\n for (size_t i = 0; i < num_elements; i++) {\n // Get the index and value of the element\n size_t index = x[i].index;\n double value = x[i].value;\n\n // Add the value of the element to the result\n z[index] += alpha * value;\n }\n\n // Loop over the elements in the sparse vector\n for (size_t i = 0; i < num_elements; i++) {\n // Get the index and value of the element\n size_t index = y[i].index;\n double value = y[i].value;\n\n // Add the value of the element to the result\n z[index] += value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements = x.size();\n\n // Create a buffer to store the result on each rank\n std::vector local_z(num_elements, 0.0);\n\n // Compute the local result\n for (size_t i = 0; i < num_elements; ++i) {\n local_z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_z.data(), z.data(), num_elements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + 
"generated_output": "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n std::vector result;\n\n // Iterate over the elements in x and y\n for (size_t i = 0; i < num_elements_x; i++) {\n // Get the index and value of the current element in x\n size_t index_x = x[i].index;\n double value_x = x[i].value;\n\n // Iterate over the elements in y\n for (size_t j = 0; j < num_elements_y; j++) {\n // Get the index and value of the current element in y\n size_t index_y = y[j].index;\n double value_y = y[j].value;\n\n // If the indices of the current elements in x and y are equal, then add the values of the elements to the result\n if (index_x == index_y) {\n result.push_back({index_x, alpha * value_x + value_y});\n }\n }\n }\n\n // Sort the result vector by index\n std::sort(result.begin(), result.end(), [](Element const& a, Element const& b) { return a.index < b.index; });\n\n // Store the result in z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(result.size());\n for (size_t i = 0; i < result.size(); i++) {\n z[i] = result[i].value;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the output vector\n z.resize(z_size, 0.0);\n\n // Compute the partial sums\n std::vector partial_sums(z_size, 0.0);\n for (auto const& element : x) {\n partial_sums[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_sums[element.index] += element.value;\n }\n\n // Reduce the partial sums\n std::vector global_sums(z_size, 0.0);\n MPI_Allreduce(partial_sums.data(), global_sums.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result in z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z = global_sums;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t resultSize = 0;\n for (auto const& element : x) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n for (auto const& element : y) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n\n // Create a vector to store the result on rank 0\n std::vector result(resultSize, 0);\n\n // Compute the result on each rank\n for (auto const& elementX : x) {\n result[elementX.index] += alpha * elementX.value;\n }\n for (auto const& elementY : y) {\n result[elementY.index] += elementY.value;\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recvBuffer(resultSize);\n MPI_Recv(recvBuffer.data(), 
resultSize, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < resultSize; j++) {\n result[j] += recvBuffer[j];\n }\n }\n z = result;\n } else {\n MPI_Send(result.data(), resultSize, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t result_size = 0;\n for (auto const& element : x) {\n result_size = std::max(result_size, element.index + 1);\n }\n for (auto const& element : y) {\n result_size = std::max(result_size, element.index + 1);\n }\n\n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(result_size, 0);\n }\n\n // Compute the partial result on each rank\n std::vector partial_result(result_size, 0);\n for (auto const& element : x) {\n partial_result[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_result[element.index] += element.value;\n }\n\n // Reduce the partial results to rank 0\n MPI_Reduce(partial_result.data(), z.data(), result_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the partial sum on each rank\n for (auto const& elem : x) {\n z[elem.index] += alpha * elem.value;\n }\n for (auto const& elem : y) {\n z[elem.index] += elem.value;\n }\n\n // Reduce the partial sums to rank 0\n MPI_Reduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the number of elements in the result vector\n size_t num_elements = x.size() + y.size();\n \n // Allocate memory for the result vector\n z.resize(num_elements);\n \n // Compute the local indices of the elements in the result vector\n std::vector local_indices(num_elements);\n for (size_t i = 0; i < num_elements; ++i) {\n local_indices[i] = i;\n }\n \n // Compute the local values of the elements in the result vector\n std::vector local_values(num_elements);\n for (size_t i = 0; i < num_elements; ++i) {\n local_values[i] = 0.0;\n }\n \n // 
Compute the global indices of the elements in the result vector\n std::vector global_indices(num_elements);\n MPI_Allgatherv(&local_indices[0], local_indices.size(), MPI_UNSIGNED_LONG, &global_indices[0], nullptr, nullptr, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n \n // Compute the global values of the elements in the result vector\n std::vector global_values(num_elements);\n MPI_Allgatherv(&local_values[0], local_values.size(), MPI_DOUBLE, &global_values[0], nullptr, nullptr, MPI_DOUBLE, MPI_COMM_WORLD);\n \n // Compute the final values of the elements in the result vector\n for (size_t i = 0; i < num_elements; ++i) {\n z[global_indices[i]] = alpha * x[global_indices[i]].value + y[global_indices[i]].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n \n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(z_size, 0);\n }\n \n // Compute the partial sums on each rank\n for (auto const& element : x) {\n if (rank == 0) {\n z[element.index] += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (rank == 0) {\n z[element.index] += element.value;\n }\n }\n \n // Reduce the partial sums to rank 0\n MPI_Reduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t result_size = 0;\n for (auto const& element : x) {\n result_size = std::max(result_size, element.index + 1);\n }\n for (auto const& element : y) {\n result_size = std::max(result_size, element.index + 1);\n }\n\n // Create a vector to store the result on rank 0\n std::vector result(result_size, 0.0);\n\n // Compute the result on each rank\n for (auto const& element : x) {\n result[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n result[element.index] += element.value;\n }\n\n // Gather the results from all ranks on rank 0\n MPI_Gather(result.data(), result_size, MPI_DOUBLE, z.data(), result_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the number of elements in the sparse vectors\n int num_elements = x.size();\n\n // Create a buffer to store the results\n std::vector 
z(num_elements);\n\n // Compute the results in parallel\n #pragma omp parallel for\n for (int i = 0; i < num_elements; i++) {\n z[i] = alpha * x[i].value * y[i].value;\n }\n\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = x.size() + y.size();\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the local size of the output vector\n size_t local_z_size = z_size / size;\n\n // Compute the starting index of the local output vector\n size_t local_z_start = rank * local_z_size;\n\n // Compute the local output vector\n for (size_t i = local_z_start; i < local_z_start + local_z_size; ++i) {\n z[i] = 0;\n }\n\n // Compute the local contribution to the output vector\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value;\n }\n\n for (size_t i = 0; i < y.size(); ++i) {\n z[y[i].index] += y[i].value;\n }\n\n // Reduce the local output vectors to the global output vector\n MPI_Allreduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 stores the result\n if (rank == 0) {\n // Sort the output vector by index\n std::sort(z.begin(), z.end(), [](const double& a, const double& b) { return a.index < b.index; });\n\n // Remove duplicate elements from the output vector\n z.erase(std::unique(z.begin(), z.end()), z.end());\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the size of the result vector\n size_t resultSize = 0;\n for (auto const& element : x) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n for (auto const& element : y) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n\n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(resultSize, 0);\n }\n\n // Compute the partial sums on each rank\n for (auto const& element : x) {\n if (rank == 0) {\n z[element.index] += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (rank == 0) {\n z[element.index] += element.value;\n }\n }\n\n // Reduce the partial sums to rank 0\n MPI_Allreduce(MPI_IN_PLACE, z.data(), resultSize, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Create a new vector z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(&z[0], z.size(), MPI::DOUBLE, &z[0], z.size(), MPI::DOUBLE, 0);\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Create a new vector z with the same size as x\n z.resize(x.size());\n\n // Loop over the elements of x and y and perform the operation z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[i].index = x[i].index;\n z[i].value = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks into rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a new vector z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(&z[0], z.size(), MPI::DOUBLE, &z[0], z.size(), MPI::DOUBLE, 0);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "cuda", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors of size Nx and Ny. Store the result in z.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x or y.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\n__global__ void sparseAxpy(double alpha, const Element *x, const Element *y, double *z, size_t Nx, size_t Ny, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) sum += alpha * x[j].value;\n }\n for (size_t j = 0; j < Ny; j++) {\n if (y[j].index == i) sum += y[j].value;\n }\n z[i] = sum;\n }\n}\n", + "sparseSparseArray(1,", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) { sum += alpha * x[i].value; }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) { sum += y[i].value; }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = alpha * x[i].value + y[i].value;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n\tsize_t i = threadIdx.x;\n\tif (i < Nx) {\n\t\tz[x[i].index] += alpha * x[i].value;\n\t}\n\tif (i < Ny) {\n\t\tz[y[i].index] += y[i].value;\n\t}\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tz[tid].value = alpha * x[tid].value + y[tid].value;\n\t}\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n size_t i = tid;\n double sum = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) {\n sum += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < Ny; j++) {\n if (y[j].index == i) {\n sum += y[j].value;\n }\n }\n z[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + 
"\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) sum += alpha * x[i].value;\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) sum += y[i].value;\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == idx) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == idx) {\n sum += y[i].value;\n }\n }\n z[idx] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < Nx) {\n atomicAdd(&z[x[tid].index], alpha * x[tid].value);\n } else if (tid < Nx + Ny) {\n atomicAdd(&z[y[tid - Nx].index], y[tid - Nx].value);\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Store the result in z on rank 0.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the output vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the corresponding element in x\n for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n break;\n }\n }\n\n // Find the corresponding element in y\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n break;\n }\n }\n\n // Compute the output value\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the output vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buffer;\n MPI_Recv(&recv_buffer[0], recv_buffer.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_buffer.size(); ++j) {\n z[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n 
for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n }\n }\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n }\n }\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the result vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buffer;\n MPI_Recv(&recv_buffer[0], z_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < z_size; ++j) {\n z[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n z.resize(num_elements_x + num_elements_y);\n\n // Loop over the elements in x and y and add them to the result\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x + num_elements_y; i++) {\n if (i < num_elements_x) {\n z[i] += alpha * x[i].value;\n }\n if (i < num_elements_y) {\n z[i] += y[i].value;\n }\n }\n\n // Reduce the result to rank 0\n MPI_Reduce(&z[0], z.size(), MPI_DOUBLE, &z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize z on rank 0\n if (rank == 0) {\n z.resize(x.size());\n }\n\n // Compute the result in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (rank == 0) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n }\n\n // Gather the results to rank 0\n MPI_Gather(z.data(), x.size(), MPI_DOUBLE, z.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n z.resize(num_elements_x + num_elements_y);\n\n // Perform the sparse axpy operation in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x + num_elements_y; ++i) {\n // Initialize the result to 0\n z[i] = 0.0;\n\n // Add the corresponding elements from x and y\n for (size_t j = 0; j < num_elements_x; ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < num_elements_y; ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Initialize the output vector\n z.resize(z_size, 0.0);\n\n // Compute the partial sums\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value;\n }\n\n #pragma omp parallel for\n for (size_t i = 0; i < y.size(); ++i) {\n z[y[i].index] += y[i].value;\n }\n\n // Reduce the partial sums\n MPI_Allreduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a temporary vector to store the partial results on each rank\n std::vector partial_z(z.size(), 0.0);\n\n // Compute the partial results in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n for (auto& element : x) {\n if (element.index == i) {\n partial_z[i] += 
alpha * element.value;\n }\n }\n for (auto& element : y) {\n if (element.index == i) {\n partial_z[i] += element.value;\n }\n }\n }\n\n // Reduce the partial results to rank 0 using MPI\n MPI_Reduce(partial_z.data(), z.data(), z.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = x.size() + y.size();\n\n // Allocate the output vector on rank 0\n if (rank == 0) {\n z.resize(z_size);\n }\n\n // Compute the local size of the output vector\n size_t local_z_size = (z_size + size - 1) / size;\n\n // Allocate the local output vector\n std::vector local_z(local_z_size);\n\n // Compute the local indices of the output vector\n std::vector local_z_indices(local_z_size);\n for (size_t i = 0; i < local_z_size; i++) {\n local_z_indices[i] = i + rank * local_z_size;\n }\n\n // Compute the local contributions to the output vector\n #pragma omp parallel for\n for (size_t i = 0; i < local_z_size; i++) {\n size_t index = local_z_indices[i];\n double value = 0.0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == index) {\n value += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == index) {\n value += y[j].value;\n }\n }\n local_z[i] = value;\n }\n\n // Gather the local contributions to the output vector on rank 0\n MPI_Gather(local_z.data(), local_z_size, MPI_DOUBLE, z.data(), local_z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements = x.size() + y.size();\n\n // Create a vector to store the result\n std::vector result(num_elements);\n\n // Merge the two sparse vectors into the result vector\n size_t i = 0, j = 0, k = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n result[k++] = x[i++];\n } else if (x[i].index > y[j].index) {\n result[k++] = y[j++];\n } else {\n result[k++] = {x[i].index, x[i++].value + y[j++].value};\n }\n }\n while (i < x.size()) {\n result[k++] = x[i++];\n }\n while (j < y.size()) {\n result[k++] = y[j++];\n }\n\n // Reduce the result vector to rank 0\n MPI_Reduce(result.data(), z.data(), num_elements, MPI_DOUBLE_INT, MPI_SUM, 0);\n}\n", + "\n\n // Initialize z to 0\n z.assign(x.size(), 0);\n\n // Compute z = alpha*x+y in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t z_size = x.size() + y.size();\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma openmp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n // Find the index of the element in x that matches the index of the current element in z\n auto it_x = std::find_if(x.begin(), x.end(), [&](Element const& element) { return element.index == i; });\n\n // Find the index of the element in y that matches the index of the current element in z\n auto it_y = std::find_if(y.begin(), y.end(), [&](Element const& element) { return element.index == i; });\n\n // Compute the value of the current element in z\n z[i] = (it_x != x.end() ? 
alpha * it_x->value : 0.0) + (it_y != y.end() ? it_y->value : 0.0);\n }\n\n // Gather the result vector on rank 0\n MPI_Gather(&z[0], z_size, MPI_DOUBLE, &z[0], z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vectors\n size_t n = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(n);\n\n // Compute the results in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results on rank 0\n MPI_Gather(temp.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z on all ranks\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0);\n}\n", + "\n\n // Check if the rank is 0\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n if (rank == 0) {\n // Initialize z with zeros\n z.resize(x.size());\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n }\n }\n\n // Perform sparse axpy operation in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (rank == 0) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t numElements = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(numElements);\n\n // Compute the result in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < numElements; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(temp.data(), numElements, MPI_DOUBLE, z.data(), numElements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t numElements = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(numElements);\n\n // Compute the result in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < numElements; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results on rank 0\n MPI_Gather(&temp[0], numElements, MPI_DOUBLE, &z[0], numElements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast alpha to all ranks\n MPI_Bcast(&alpha, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the output vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(z_size);\n }\n\n // Compute the output vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double value = 0.0;\n for (auto const& element : x) {\n if (element.index == i) {\n value += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (element.index == i) {\n value += element.value;\n }\n }\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z[i] = value;\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n 
for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the value of x at index i\n for (auto const& elem : x) {\n if (elem.index == i) {\n x_value = elem.value;\n break;\n }\n }\n\n // Find the value of y at index i\n for (auto const& elem : y) {\n if (elem.index == i) {\n y_value = elem.value;\n break;\n }\n }\n\n // Compute the value of z at index i\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the result vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::vector recv_buf;\n MPI_Recv(&recv_buf[0], recv_buf.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_buf.size(); ++j) {\n z[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the result vector\n z.resize(z_size, 0.0);\n\n // Compute the partial result on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the value of x at index i\n for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n break;\n }\n }\n\n // Find the value of y at index i\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n break;\n }\n }\n\n // Compute the partial result\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the partial results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::vector partial_result(z_size);\n MPI_Recv(partial_result.data(), z_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < z_size; ++j) {\n z[j] += partial_result[j];\n }\n }\n } else {\n MPI_Send(z.data(), z_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n std::vector result(num_elements_x);\n\n // Compute z = alpha*x+y in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x; i++) {\n result[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(result.data(), num_elements_x, MPI_DOUBLE, z.data(), num_elements_x, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(Kokkos::View const& s, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < s.extent(0); ++j) {\n if (s(j)[0] == 'a' + i) {\n ++bins[i];\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < s.extent(0); ++j) {\n if (s(j)[0] == 'a' + i) {\n ++bins[i];\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(0, s.extent(0), [&](size_t i) {\n char first = s(i)[0];\n size_t index = first - 'a';\n ++output(index);\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[s[i][0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins[s[i][0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "Counts, 0 0, 0,0,0, 0,0, ~:~:", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n int idx = s(i)[0] - 'a';\n if (idx >= 0 && idx < 26) {\n ++bins[idx];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s[i][0] - 'a']++;\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": 
"23_histogram_first_letter_counts", + "parallelism_model": "omp", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < 26; ++i) {\n char c = 'a' + i;\n for (auto const& str : s) {\n if (str[0] == c) {\n ++counts[i];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n size_t index = static_cast(s[i][0] - 'a');\n #pragma omp atomic\n bins[index]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n size_t index = s[i][0] - 'a';\n ++ didik[index];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "serial", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. 
Store the output in `bins` array.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n ++bins[str[0] - 'a'];\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n bins[str[0] - 'a']++;\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n ++bins[str[0] - 'a'];\n }\n}\n", + "\n\n for (std::string const& str : s) {\n if (str.empty()) {\n continue;\n }\n char first = str[0];\n if (first >= 'a' && first <= 'z') {\n bins[first - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n bins[str[0] - 'a']++;\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "hip", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\n__global__ void firstLetterCounts(const char **s, size_t N, size_t bins[26]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n char first = s[tid][0];\n atomicAdd(&bins[first - 'a'], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid < N) {\n char c = s[gid][0];\n atomicAdd(&bins[c - 'a'], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int index = s[gid][0] - 'a';\n atomicAdd(&bins[index], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n char first = s[idx][0];\n atomicAdd(&bins[first - 'a'], 1);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int bin = s[gid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n if (idx >= 0 && idx < 26) {\n atomicAdd(&bins[idx], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (size_t i = tid; i < N; i += stride) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n int bin = s[tid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n bins[s[gid][0] - 'a']++;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int index = s[gid][0] - 'a';\n atomicAdd(&bins[index], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) return;\n char first = s[idx][0];\n int offset = first - 'a';\n atomicAdd(&firstCounts[offset], 1);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int bin = s[gid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = 
bid * hipBlockDim_x + tid;\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of s. The result is stored in bins on rank 0.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": ": \u51a2, Synag\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2 Saxe\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2zernezerne.", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (const auto& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": 
false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.005988434, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0065155808, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0059263574, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0064262924, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00590888, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0064129652, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet.\n std::array local_bins;\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0060498824, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0066618524, + "num_procs": 1 + }, + { + 
"did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n local_bins.fill(0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0065648516, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.006539157, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t n = s.size();\n std::array local_bins;\n local_bins.fill(0);\n for (size_t i = 0; i < n; i++) {\n if (rank == i % size) {\n local_bins[s[i][0] - 'a']++;\n }\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0059524966, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0095271318, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0099739173, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0096299788, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.009448739625, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.011295169475, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00958468525, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01002822902813, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01010547244375, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00971497567891, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00950283624141, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n std::fill(local_bins.begin(), local_bins.end(), 0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0063159512, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0078854696, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, 
+ "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings starting with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0059670954, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0065228432, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) continue;\n ++local_bins[str[0] - 'a'];\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0059793744, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0065765388, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = s.size() / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = s.size();\n }\n std::array local_bins = {};\n for (int i = start; i < end; i++) {\n 
local_bins[s[i][0] - 'a']++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00595914, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061024248, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037521692, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00180960445, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000970256925, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008920898, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000453807575, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00030541297188, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00162360208437, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023299778125, + "num_procs": 256 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0059958196, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0065226472, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "cuda", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\n__global__ void firstLetterCounts(const char **s, size_t N, size_t bins[26]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int firstLetter = s[idx][0] - 'a';\n atomicAdd(&bins[firstLetter], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int start = tid * 26 / N;\n int end = (tid + 1) * 26 / N;\n for (int i = start; i < end; i++) {\n bins[i] = 0;\n }\n for (int i = 0; i < N; i++) {\n int idx = s[i][0] - 'a';\n if (idx >= 0 && idx < 26) {\n atomicAdd(&bins[idx], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int firstLetter = s[idx][0] - 'a';\n atomicAdd(&bins[firstLetter], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n char firstLetter = s[idx][0];\n atomicAdd(&bins[firstLetter - 'a'], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int bin = s[tid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx >= N) {\n return;\n }\n bins[s[idx][0] - 'a']++;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int idx = s[tid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of s. 
The result is stored in bins on rank 0.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n local_bins.fill(0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the first letter of its own strings\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n local_bins[s[i][0] - 'a']++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk = s.size() / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = s.size();\n }\n std::array local_bins;\n local_bins.fill(0);\n for (int i = start; i < end; i++) {\n local_bins[s[i][0] - 'a']++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins;\n local_bins.fill(0);\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n local_bins[s[i][0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank counts the first letter of its own strings\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = my_rank; i < s.size(); i += num_ranks) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the counts for a subset of the alphabet\n int start = rank * 26 / size;\n int end = (rank + 1) * 26 / size;\n\n std::array local_bins;\n local_bins.fill(0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n int index = s[i][0] - 'a';\n if (index >= start && index < end) {\n local_bins[index]++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the first letters in its portion of the vector\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = rank; i < s.size(); i += size) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(std::begin( bins ), std::end( bins ), 0);\n\n // Count the number of words starting with each letter in parallel\n #pragma openmp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n ++ bins[s[i][0] - 'a'];\n }\n\n // Gather the results on rank 0\n MPI_Gather(MPI_IN_PLACE, 26, MPI_UNSIGNED_LONG, bins.data(), 26, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the first letter of the strings in its portion of the vector\n std::array local_bins = {};\n #pragma omp parallel for\n for (size_t i = rank * s.size() / nprocs; i < (rank + 1) * s.size() / nprocs; ++i) {\n local_bins[s[i][0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings in s that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins = {};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(Kokkos::View const& x, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(i / 10) += (x(i) < 100) ? 1 : 0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[x(i) / 10]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA(const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[x(i) / 10]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i / 10] += (x(i) >= 10 * i / 10 && x(i) < 10 * (i / 10 + 1));\n });\n}\n", + "\n\n Kokkos::parallel_for(bins.size(), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j] >= 10 * i && x[j] < 10 * (i + 1)) { bins[i] += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n const double val = x(i);\n update += (val >= 0 && val < 10) + (val >= 10 && val < 20) + (val >= 20 && val < 30) +\n (val >= 30 && val < 40) + (val >= 40 && val < 50) + (val >= 50 && val < 60) +\n (val >= 60 && val < 70) + (val >= 70 && val < 80) + (val >= 80 && val < 90) +\n (val >= 90 && val < 100);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n const double val = x(i);\n bins(val / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n const double val = x(i);\n bins(val / 10) += 1;\n });\n}\n", + "\n\n 
Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i / 10] += 1;\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n size_t idx = static_cast(x[i] / 10);\n ++counts[idx];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n size_t bin = static_cast(x[i] / 10);\n bins[bin]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)] += 1;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "serial", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. 
Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n std::fill(\u5803.begin(), \u5803.end(), 0);\n for (double const val : x) {\n \u5803[static_cast(val / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double val : x) {\n bins[static_cast(val / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n int i = std::min((int)(v - bins.front()), (int)bins.size() - 1);\n bins[i]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "hip", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use AMD HIP to compute in parallel. 
The kernel is initialized with at least as many threads as values in x.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\n__global__ void binsBy10Count(const double *x, size_t N, size_t bins[10]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipGridDim_x;\n for (size_t i = idx; i < N; i += stride) {\n bins[x[i] / 10] += 1;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipGridDim_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n int bin = (int)(x[i] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n int bin = (int)(x[i] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n int bin = x[i] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n int bin = (int)x[tid] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n bins[(int)(x[i] / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use MPI to compute in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. The result is stored in bins on rank 0.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n bins[bin]++;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv;\n MPI_Recv(recv.data(), recv.size(), MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < bins.size(); j++) {\n bins[j] += recv[j];\n }\n }\n } else {\n MPI_Send(bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0061446644, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0062158162, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of values in each bin\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n\n // Reduce the bins across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0061649842, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.006140356, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local range.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0061608668, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.006121899, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n bins[bin]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + 
"are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.006185922, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061915404, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Count the number of values in each bin on each rank\n std::array local_count = {};\n for (auto v : x) {\n local_count[static_cast(v / 10)]++;\n }\n\n // Gather the local counts into bins on rank 0\n MPI_Gather(local_count.data(), 10, MPI_UNSIGNED_LONG, bins.data(), 10, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0061830844, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0062799896, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": ":", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its subset of x\n std::array local_bins = {};\n for (double xi : x) {\n local_bins[static_cast(xi / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": ":", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double xi : x) {\n int bin = static_cast(xi / 10);\n bins[bin]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0061732522, + 
"runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0062086364, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n for (double val : x) {\n int bin = static_cast(val / 10);\n if (bin < 10) {\n bins[bin]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0,\n MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array localBins{};\n for (double v : x) {\n localBins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(localBins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (auto v : x) {\n bins[v / 10]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.006165784, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.006163796, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] >= 0 && x[i] < 10) {\n bins[0]++;\n } else if (x[i] >= 10 && x[i] < 20) {\n bins[1]++;\n } else if (x[i] >= 20 && x[i] < 30) {\n bins[2]++;\n } else if (x[i] >= 30 && x[i] < 40) {\n bins[3]++;\n } else if (x[i] >= 40 && x[i] < 50) {\n bins[4]++;\n } else if (x[i] >= 50 && x[i] < 60) {\n bins[5]++;\n } else if (x[i] >= 60 && x[i] < 70) {\n bins[6]++;\n } else if (x[i] >= 70 && x[i] < 80) {\n bins[7]++;\n } else if (x[i] >= 80 && x[i] < 90) {\n bins[8]++;\n } else if (x[i] >= 90 && x[i] < 100) {\n bins[9]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local part of x.\n std::array localBins = {};\n for 
(double v : x) {\n localBins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts.\n MPI_Reduce(localBins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its subset of x\n std::array local_bins = {};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), recv_bins.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < bins.size(); j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), local_bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0061682422, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.006186493, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local range.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0061764532, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061253704, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n if (bin < 10) {\n bins[bin]++;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array tmp;\n MPI_Recv(tmp.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; j++) {\n bins[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.006171718, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0063022702, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its own bins.\n std::array local_bins = {};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), recv_bins.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_bins.size(); j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), local_bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.006174461, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0061764388, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "cuda", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use CUDA to compute in parallel. 
The kernel is initialized with at least as many threads as values in x.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\n__global__ void binsBy10Count(const double *x, size_t N, size_t bins[10]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x;\nif (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = (int)floor(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of values in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of values in its local portion of x\n std::array local_bins;\n std::fill(local_bins.begin(), local_bins.end(), 0);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce local bins to global bins\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins = {};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the bin size\n const double bin_size = (x.back() - x.front()) / bins.size();\n\n // Compute the local histogram\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n const double value = x[i];\n const int bin_index = std::min(static_cast(bins.size() - 1),\n static_cast(std::floor((value - x.front()) / bin_size)));\n ++local_bins[bin_index];\n }\n\n // Reduce the local histograms to the global histogram\n MPI_Reduce(MPI_IN_PLACE, local_bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ", 10]", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double value : x) {\n local_bins[static_cast(value / 10)]++;\n }\n\n // Reduce the local counts into the global bins array on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n int bin = static_cast(x[i] / 10);\n if (bin < 10) {\n #pragma omp atomic\n ++bins[bin];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::array local_bins = {};\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n int bin = static_cast(x[i] / 10);\n local_bins[bin]++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the counts for a subset of the bins.\n size_t start_bin = rank * bins.size() / nprocs;\n size_t end_bin = (rank + 1) * bins.size() / nprocs;\n\n std::fill(bins.begin() + start_bin, bins.begin() + end_bin, 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n size_t bin = static_cast(x[i] / 10);\n if (bin >= start_bin && bin < end_bin) {\n ++bins[bin];\n }\n }\n\n // Gather the counts from all ranks into bins on rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n int bin = static_cast(x[i]) / 10;\n #pragma omp atomic\n ++bins[bin];\n }\n\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::array recv;\n MPI_Recv(recv.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; ++j) {\n bins[j] += recv[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its portion of x\n std::array local_bins = {0};\n #pragma omp parallel for\n for (size_t i = rank * x.size() / size; i < (rank + 1) * x.size() / size; ++i) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the local counts into the global bins on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; ++i) {\n std::array recv_bins;\n 
MPI_Recv(recv_bins.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of values in its local portion of x\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins = {};\n for (size_t i = rank; i < x.size(); i += nprocs) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x\n std::array local_bins = {0};\n #pragma omp parallel for\n for (size_t i = rank; i < x.size(); i += nprocs) {\n int bin = static_cast(x[i] / 10);\n local_bins[bin]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use Kokkos to count in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(Kokkos::View const& points, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_for;\n using Kokkos::Functor;\n\n struct CountQuadrants {\n Kokkos::View bins;\n Kokkos::View points;\n\n KOKKOS_FUNCTION\n void operator()(const int i) const {\n const auto& point = points[i];\n if (point.x > 0) {\n if (point.y > 0) {\n ++bins[0];\n } else {\n ++bins[2];\n }\n } else {\n if (point.y > 0) {\n ++bins[1];\n } else {\n ++bins[3];\n }\n }\n }\n };\n\n parallel_for(Kokkos::RangePolicy(0, points.size()), CountQuadrants{bins, points});\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA (const size_t& i, size_t& update) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++update[0];\n } else {\n ++update[3];\n }\n } else {\n if (points(i)->y > 0) {\n ++update[1];\n } else {\n ++update[2];\n }\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.size(), KOKKOS_LAMBDA (const size_t i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x >= 0 && y >= 0) {\n ++bins(0);\n } else if (x < 0 && y >= 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "{x=1.5,y=1.1}", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x >= 0) {\n if (y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n const Point& p = points(i);\n if (p.x > 0) {\n if (p.y > 0) {\n ++update[0];\n } else {\n ++update[3];\n }\n } else {\n if (p.y > 0) {\n ++update[1];\n } else {\n ++update[2];\n }\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n double x = points[i]->x;\n double y = points[i]->y;\n if (x >= 0 && y >= 0) {\n ++update;\n } 
else if (x < 0 && y >= 0) {\n ++update;\n } else if (x < 0 && y < 0) {\n ++update;\n } else {\n ++update;\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = points(i);\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", Kokkos::RangePolicy<>(0, points.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x > 0 && y > 0) {\n ++bins(0);\n } else if (x < 0 && y > 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else if (x > 0 && y < 0) {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy<>(0, points.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x > 0 && y > 0) {\n ++bins(0);\n } else if (x < 0 && y > 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else if (x > 0 && y < 0) {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n size_t idx = 0;\n if (x >= 0) {\n idx += 1;\n }\n if (y >= 0) {\n idx += 2;\n }\n ++bins(idx);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n size_t bin = 0;\n if (x > 0) {\n bin += 1;\n }\n if (y > 0) {\n bin += 2;\n }\n Kokkos::atomic_fetch_add(&bins(bin), 1);\n });\n}\n", + "\n\n using exec_space = KokKos::DefaultExecutionSpace;\n using league_space = KokKos::TeamVectorRange;\n\n league_space range(exec_space{}, points.size());\n\n auto policy = KokKos::TeamVectorRange\u653f\u7b56(range);\n\n auto lambda = KOKKOS_LAMBDA (const Point point, size_t* count) {\n if (point.x > 0) {\n if (point.y > 0) {\n ++count[0];\n } else {\n ++count[3];\n }\n } else {\n if (point.y > 0) {\n ++count[1];\n } else {\n ++count[2];\n }\n }\n };\n\n exec_space exec;\n exec.parallel_reduce(\n \"count_quadrants\", policy, lambda, Rcpp::as(count));\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy>(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = *points(i);\n if (p.x > 0) {\n if (p.y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. 
The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use OpenMP to count in parallel.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const Point& p = points[i];\n if (p.x >= 0 && p.y >= 0) {\n ++bins[0];\n } else if (p.x < 0 && p.y >= 0) {\n ++bins[1];\n } else if (p.x < 0 && p.y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); i++) {\n const auto& p = points[i];\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n 
if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++oient[0];\n } else {\n ++oient[3];\n }\n } else {\n if (points[i].y > 0) {\n ++oient[1];\n } else {\n ++oient[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n double x = points[i].x;\n double y = points[i].y;\n if (x > 0) {\n if (y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "serial", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. 
The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (const Point& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (const auto& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (point.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (Point const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (Point const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else if (p.x < 0) {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else if 
(p.y < 0) {\n bins[3]++;\n }\n } else if (p.x < 0) {\n if (p.y > 0) {\n bins[1]++;\n } else if (p.y < 0) {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& pt : points) {\n if (pt.x > 0) {\n if (pt.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (pt.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "(", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "hip", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use AMD HIP to count in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\n__global__ void countQuadrants(const Point *points, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n while (idx < N) {\n if (points[idx].x >= 0) {\n if (points[idx].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[idx].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n idx += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int quadrant = 0;\n if (points[gid].x > 0) {\n quadrant += 1;\n }\n if (points[gid].y > 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int quadrant = 0;\n if (points[tid].x > 0) {\n quadrant += 1;\n }\n if (points[tid].y > 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid >= N) return;\n const Point &p = points[tid];\n int quadrant = 0;\n if (p.x > 0) quadrant |= 1;\n if (p.y > 0) quadrant |= 2;\n atomicAdd(&bins[quadrant], 1);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n const Point &p = points[gid];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n Point p = points[tid];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if 
(points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "(4)", + "(4)", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n if (idx >= N) {\n return;\n }\n\n const Point &p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int blockId = hipBlockIdx_x;\n int blockDim = hipBlockDim_x;\n\n for (int i = tid; i < N; i += blockDim) {\n if (points[i].x > 0 && points[i].y > 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y > 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[i].x > 0 && points[i].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n const Point p = points[gid];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 
1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[i].x >= 0 && points[i].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n Point p = points[tid];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = tid + bid * hipBlockDim_x;\n if (idx >= N) {\n return;\n }\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use MPI to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The result is stored in bins on rank 0.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its own quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (const auto& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; 
j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0429427348, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.039145127, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0428417218, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.039226737, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0 && point.y >= 0) {\n local_bins[0]++;\n } else if (point.x < 0 && point.y >= 0) {\n local_bins[1]++;\n } else if (point.x >= 0 && point.y < 0) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + 
"are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0427194454, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.038230335, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (auto const& p : points) {\n int quadrant = 0;\n if (p.x > 0) quadrant += 1;\n if (p.y > 0) quadrant += 2;\n bins[quadrant]++;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array tmp;\n MPI_Recv(tmp.data(), tmp.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(bins.data(), bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_bins[4] = {0};\n for (Point const& p : points) {\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0426141152, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.038290779, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0424683998, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0390273714, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 
0};\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y > 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.042640871, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0382191144, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y > 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0435733664, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0385042962, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0426505534, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.038210572, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } 
else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0427184034, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0382887996, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0427548026, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0382893226, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0428673442, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0382012928, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant.\n std::array 
local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0422851278, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.038101119, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array localBins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n localBins[0]++;\n } else {\n localBins[3]++;\n }\n } else {\n if (point.y > 0) {\n localBins[1]++;\n } else {\n localBins[2]++;\n }\n }\n }\n\n MPI_Reduce(localBins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0424448936, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0382616006, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "cuda", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use CUDA to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\n__global__ void countQuadrants(const Point *points, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int bin = 0;\n if (points[tid].x > 0) bin += 1;\n if (points[tid].y > 0) bin += 2;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int bin = 0;\n if (points[tid].x >= 0) {\n bin += 1;\n }\n if (points[tid].y >= 0) {\n bin += 2;\n }\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += gridDim.x * blockDim.x;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n while (idx < N) {\n if (points[idx].x >= 0 && points[idx].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[idx].x < 0 && points[idx].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[idx].x < 0 && points[idx].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[idx].x >= 0 && points[idx].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n idx += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n int quadrant = 0;\n if (p.x > 0) {\n quadrant |= 1;\n }\n if (p.y > 0) {\n quadrant |= 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n Point p = points[idx];\n int quadrant = 0;\n if (p.x >= 0) {\n quadrant += 2;\n }\n if (p.y >= 0) {\n quadrant += 1;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else 
{\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx >= N) {\n return;\n }\n\n const Point &p = points[idx];\n int quadrant = 0;\n if (p.x >= 0) {\n quadrant += 1;\n }\n if (p.y >= 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant - 1], 1);\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[1], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n const Point p = points[tid];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n if (idx >= N) {\n return;\n }\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n while (i < N) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[i].y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += gridDim.x * blockDim.x;\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int count = 0;\n for (int i = idx; i < N; i += blockDim.x) {\n if (points[i].x > 0 && points[i].y > 0) {\n count++;\n } else if (points[i].x < 0 && points[i].y > 0) {\n count++;\n } else if (points[i].x < 0 && points[i].y < 0) {\n count++;\n } else if (points[i].x > 0 && points[i].y < 0) {\n count++;\n }\n }\n bins[idx] = count;\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n 
atomicAdd(&bins[3], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use MPI and OpenMP to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. The result is stored in bins on rank 0.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); i++) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to zero\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n ++bins[0];\n } else if (points[i].x < 0 && points[i].y >= 0) {\n ++bins[1];\n } else if (points[i].x < 0 && points[i].y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the vector\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n 
local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& point = points[i];\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+:local_bins)\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n ++local_bins[0];\n } else if (points[i].x < 0 && points[i].y >= 0) {\n ++local_bins[1];\n } else if (points[i].x < 0 && points[i].y < 0) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n Point p = points[i];\n if (p.x >= 0 && p.y >= 0) {\n local_bins[0]++;\n } else if (p.x < 0 && p.y >= 0) {\n local_bins[1]++;\n } else if (p.x < 0 && p.y < 0) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to zero\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const Point& point = points[i];\n if (point.x >= 0 && point.y >= 0) {\n ++bins[0];\n } else if (point.x < 0 && point.y >= 0) {\n ++bins[1];\n } else if (point.x < 0 && point.y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local subset of points\n std::array local_bins = {0, 0, 0, 0};\n #pragma opm parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n if (points[i].x > 0 && points[i].y > 0) {\n ++local_bins[0];\n } else if (points[i].x < 0 && points[i].y > 0) {\n ++local_bins[1];\n } else if (points[i].x < 0 && points[i].y 
< 0) {\n ++local_bins[2];\n } else if (points[i].x > 0 && points[i].y < 0) {\n ++local_bins[3];\n }\n }\n\n // Gather the local counts into bins on rank 0\n MPI_Gather(local_bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for reduction(+: bins)\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the points in its portion of the image\n #pragma omp parallel for reduction(+: bins)\n for (size_t i = rank; i < points.size(); i += nprocs) {\n const Point& point = points[i];\n if (point.x >= 0) {\n if (point.y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the array\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Use Kokkos to compute in parallel. Assume that Kokkos has already been initialized.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(Kokkos::View const& x, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n const size_t num_elements = x.extent(0);\n Kokkos::parallel_for(\"countQuartiles\", Kokkos::RangePolicy(0, num_elements), KOKKOS_LAMBDA(const size_t i) {\n const double frac = x(i) - floor(x(i));\n if (frac < 0.25) {\n ++quart[0];\n } else if (frac < 0.5) {\n ++quart[1];\n } else if (frac < 0.75) {\n ++quart[2];\n } else {\n ++quart[3];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n\n parallel_for(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i) {\n double frac = x(i) - floor(x(i));\n if (frac < 0.25) {\n ++quart[0];\n } else if (frac < 0.5) {\n ++quart[1];\n } else if (frac < 0.75) {\n ++quart[2];\n } else {\n ++quart[3];\n }\n });\n}\n", + "\n\n const size_t n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n auto exec_space = Kokkos::DefaultExecutionSpace::instance();\n auto policy = Kokkos::RangePolicy<>(exec_space, 0, x.extent(0));\n\n 
Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const size_t& i) {\n double frac = std::modf(x(i), &x(i));\n if (frac < 0.25)\n ++bins(0);\n else if (frac < 0.5)\n ++bins(1);\n else if (frac < 0.75)\n ++bins(2);\n else\n ++bins(3);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double fract = std::fmod(x[i], 1.0);\n if (fract < 0.25) {\n ++bins[0];\n } else if (fract < 0.5) {\n ++bins[1];\n } else if (fract < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "<,\u51a2~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma simd\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = std::fmod(x[i], 1.0);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++(std::get<0>(std::ref(x)));\n } else if (frac < 0.5) {\n ++(std::get<1>(std::ref(x)));\n } else if (frac < 0.75) {\n ++(std::get<2>(std::ref(x)));\n } else {\n ++(std::get<3>(std::ref(x)));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n #pragma omp parallel for\n for (size_t 
i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = std::fmod(x[i], 1.0);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "serial", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double const& xi : x) {\n if (xi < 0.25) {\n ++\u0648\u064a\u0627\u062a[0];\n } else if (xi < 0.5) {\n ++\u0648\u064a\u0627\u062a[1];\n } else if (xi < 0.75) {\n ++\u0648\u064a\u0627\u062a[2];\n } else {\n ++\u0648\u064a\u0627\u062a[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (auto v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n bins[0]++;\n else if (frac < 0.5)\n bins[1]++;\n else if (frac < 0.75)\n bins[2]++;\n else\n bins[3]++;\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n bins[0]++;\n else if (frac < 0.5)\n bins[1]++;\n else if (frac < 0.75)\n 
bins[2]++;\n else\n bins[3]++;\n }\n}\n", + "\n\n bins.fill(0);\n for (const auto &xi : x) {\n const auto frac = std::modf(xi, nullptr);\n if (frac >= 0.25 && frac < 0.5) {\n ++bins[0];\n } else if (frac >= 0.5 && frac < 0.75) {\n ++bins[1];\n } else if (frac >= 0.75 && frac < 1.0) {\n ++bins[2];\n } else if (frac == 0.0) {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (auto const& v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "hip", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\n__global__ void countQuartiles(const double *x, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = hipGetGlobalIndex();\n if (idx >= N) return;\n\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&quartiles[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&quartiles[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&quartiles[2], 1);\n } else {\n atomicAdd(&quartiles[3], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n double frac = fmod(x[i], 1);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n 
} else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n 
atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n double f = fmod(x[idx], 1.0);\n if (f < 0.25)\n atomicAdd(&bins[0], 1);\n else if (f < 0.5)\n atomicAdd(&bins[1], 1);\n else if (f < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid >= N) {\n return;\n }\n\n double frac = x[gid] - floor(x[gid]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use MPI to compute in parallel. Assume that MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double d : x) {\n double frac = fmod(d, 1.0);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Count the number of doubles in each quartile.\n for (double v : x) {\n if (v < 0.25) {\n ++bins[0];\n } else if (v < 0.5) {\n ++bins[1];\n } else if (v < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Gather the counts from all ranks on rank 0.\n MPI_Gather(bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.063703701, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0772706608, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each quartile.\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local 
counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0636897264, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0787348168, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n for (auto v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of the vector\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(&recv_bins, 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(&local_bins, 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0636619902, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0752917898, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each quartile\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.063579278, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0785759108, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": 
null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0634491468, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0753881156, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the local bins\n std::array local_bins = {0, 0, 0, 0};\n\n // Count the number of doubles in each quartile\n for (double d : x) {\n double frac = fmod(d, 1.0);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.062753282, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1372210916, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local counts\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + 
"runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_bins[4] = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n MPI_Reduce(local_bins, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0636642262, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0788523098, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n size_t n = x.size();\n std::array local_bins = {0, 0, 0, 0};\n\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, comm);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0636888812, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0787899206, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + 
"generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0636827914, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0752985226, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n\n for (double val : x) {\n double frac = val - std::floor(val);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0633713338, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.077317399, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0637134934, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0772856588, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (auto xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n 
local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0633102046, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0786206172, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0628820206, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0753115152, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "cuda", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\n__global__ void countQuartiles(const double *x, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n double frac = x[idx] - floor(x[idx]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double frac = x[tid] - floor(x[tid]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n 
atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + ", 4]", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n for (size_t i = tid + bid * stride; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double f = fmod(x[idx], 1.0);\n if (f < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (f < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (f < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use MPI and OpenMP to compute in parallel. Assume that MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each quartile\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n size_t const rank = ::rank();\n size_t const size = x.size();\n std::fill(counts.begin(), counts.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = rank; i < size; i += ::size()) {\n double const value = x[i];\n if (value < 0.25) {\n ++counts[0];\n } else if (value < 0.5) {\n ++counts[1];\n } else if (value < 0.75) {\n ++counts[2];\n } else {\n ++counts[3];\n }\n }\n\n #pragma omp parallel sections\n {\n #pragma omp section\n {\n if (rank == 0) {\n std::fill(counts.begin(), \n counts.end(), \n 0);\n }\n }\n #pragma omp section\n {\n if (rank == 0) {\n std::transform(counts.begin(), \n counts.end(), \n counts.begin(), \n [](size_t const a, size_t const b) {\n return a + b;\n });\n }\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of doubles in its portion of x that have a fractional part in each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < x.size(); i += size) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Rank 0 gathers the local counts from all ranks and stores the result in bins\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n 
++bins[3];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each bin\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each quartile\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the counts to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of elements in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of 
elements in its local vector that fall into each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its local copy of x that fall into each quartile.\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+:local_bins)\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Rank 0 gathers the local counts from all ranks and stores the result in bins.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; ++i) {\n std::array recv_bins;\n MPI_Recv(&recv_bins, 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(&local_bins, 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n 
MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of elements in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the counts from all ranks to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local counts\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Gather the local counts into bins on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of ranks\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < nranks; ++i) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use Kokkos to count in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(Kokkos::View const& image, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.size(), KOKKOS_LAMBDA (const size_t i) {\n ++bins(image(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const size_t& i, size_t& count) {\n count += Kokkos::count(image, i);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n bins[image[i]]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256),\n KOKKOS_LAMBDA(const int& i, size_t& update) {\n update += Kokkos::count(Kokkos::View(image.data(), image.size()), i);\n },\n bins);\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA (const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA(const int& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA(const int& i) {\n ++bins[image(i)];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy<>(0, 256), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy<>(0, image.extent(0)), KOKKOS_LAMBDA (const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = Kokkos::atomic_fetch_add(&bins[image[i]], 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixel_count\", Kokkos::RangePolicy<>(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixel_count\", Kokkos::RangePolicy<>(0, image.size()), KOKKOS_LAMBDA(const int& 
i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i, size_t& sum) {\n sum += Kokkos::count(image, i);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA (const int& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA(const int& i) {\n ++bins(image(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n bins[image[i]]++;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const size_t& i, size_t& count) {\n count += Kokkos::count(Kokkos::ViewConstRange(image, image.extent(0)), i);\n }, bins);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use OpenMP to count in parallel.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); 
i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "serial", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int pixel : image) {\n ++Counts[pixel];\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (auto const& pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "hip", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use AMD HIP to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n__global__ void pixelCounts(const int *image, size_t N, size_t bins[256]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx.x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = image[idx];\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx >= N) return;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n ++bins[image[idx]];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = image[idx];\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) { bins[image[idx]]++; }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (size_t i = idx; i < N; i += stride) {\n atomicAdd(&bins[image[i]], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use MPI to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. 
The result is stored in bins on rank 0.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.002917862, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021109758, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array localBins{};\n for (int pixel : image) {\n localBins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = localBins;\n for (int i = 1; i < size; i++) {\n std::array recvBins;\n MPI_Recv(recvBins.data(), 256, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recvBins[j];\n }\n }\n } else {\n MPI_Send(localBins.data(), 256, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.002741267, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018625714, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0030433968, + "runs": [ 
+ { + "did_run": true, + "is_valid": true, + "runtime": 0.0018999114, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0032452418, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.002047374, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.002731204, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020778732, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n for (int pixel : image) {\n bins[pixel]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0029874034, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023927234, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + 
"best_sequential_runtime": 0.002776501, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019164612, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array localBins;\n for (size_t i = 0; i < image.size(); ++i) {\n localBins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(localBins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0030160838, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018997796, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = rank; i < image.size(); i += size) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.002760345, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.002087151, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t chunk_size = image.size() / 
size;\n size_t start = rank * chunk_size;\n size_t end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = image.size();\n }\n std::array local_bins;\n local_bins.fill(0);\n for (size_t i = start; i < end; i++) {\n local_bins[image[i]]++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.002733895, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018987076, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000968323, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005109219, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000251444125, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001680186375, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 7.64467875e-05, + "num_procs": 32 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0029418394, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020178928, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0027279922, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019203358, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t n = image.size();\n size_t local_bins[256] = {0};\n for (size_t i = rank; i < n; i += size) {\n local_bins[image[i]]++;\n }\n MPI_Reduce(local_bins, bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null 
+ }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0030909744, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022536442, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = rank; i < image.size(); i += size) {\n bins[image[i]]++;\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "cuda", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use CUDA to count in 
parallel. The kernel is launched with at least N threads.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n__global__ void pixelCounts(const int *image, size_t N, size_t bins[256]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n ++bins[image[idx]];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use MPI and OpenMP to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. 
The result is stored in bins on rank 0.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the results from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (size_t i = rank; i < image.size(); i += nprocs) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t const n = image.size();\n size_t const chunk = (n + size - 1) / size;\n size_t const start = rank * chunk;\n size_t const end = std::min(start + chunk, n);\n std::array localBins = {};\n for (size_t i = start; i < end; ++i) {\n ++localBins[image[i]];\n }\n MPI_Reduce(localBins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels with each intensity in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int i = rank; i < image.size(); i += size) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n size_t local_bins[256] = {0};\n #pragma omp parallel for\n for (size_t i = rank * image.size() 
/ nprocs; i < (rank + 1) * image.size() / nprocs; ++i) {\n ++local_bins[image[i]];\n }\n\n // Gather the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins, bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n\n // Reduce the bins from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to all zeros\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of pixels in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the bins across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in each bin on each rank.\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++local_bins[image[i]];\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ".", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n // Use OpenMP to count in parallel.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int i = rank; i < image.size(); i += nprocs) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the results from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image\n std::array local_bins;\n 
local_bins.fill(0);\n #pragma omp parallel for\n for (size_t i = rank * image.size() / nprocs; i < (rank + 1) * image.size() / nprocs; ++i) {\n ++local_bins[image[i]];\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use Kokkos to parallelize the search. Assume Kokkos has already been initialized.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n const size_t n = x.extent(0);\n size_t result = n;\n Kokkos::parallel_reduce(\n RangePolicy<>(0, n),\n KOKKOS_LAMBDA(const size_t i) {\n if (x(i) % 2 == 0) {\n result = std::min(result, i);\n }\n },\n result);\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag, int>;\n auto result = Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, int& result) {\n if (x(i) % 2 == 0) {\n result = i;\n }\n }, result, tag_type());\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even(i) = x(i) % 2 == 0;\n });\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even(i)) {\n firstEven = i;\n }\n }, firstEven);\n return firstEven;\n}\n", + "_", + "\n\n size_t result = x.size();\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(size_t i, Kokkos::Experimental::Automic& atom) {\n if (x(i) % 2 == 0) {\n atom.fetch_min(result, i);\n }\n });\n return result;\n}\n", + "\n\n size_t result = -1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n Kokkos::fence();\n }\n },\n result);\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even(i) = (x(i) % 2 == 0);\n });\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even(i)) {\n firstEven = i;\n }\n }, firstEven);\n return firstEven;\n}\n", + "\n\n size_t result = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n Kokkos::parallel_scan(Kokkos::RangePolicy(i + 1, x.size()), KOKKOS_LAMBDA(const size_t& j, size_t& result) {\n result = 0;\n });\n }\n });\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t result = n;\n Kokkos::parallel_for(\"findFirstEven\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) { result = i; }\n });\n return result;\n}\n", + "\n\n Kokkos::parallel_for(\"findFirstEven\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n Kokkos::atomic_fetch_add(&findFirstEven_result, 1);\n Kokkos::fence();\n if (findFirstEven_result == 1) {\n findFirstEven_index = i;\n Kokkos::fence();\n 
Kokkos::parallel_scan(\"findFirstEven_break\", x.extent(0),\n KOKKOS_LAMBDA(const size_t& i, const size_t& my_result) {\n if (findFirstEven_index == i) {\n return my_result + 1;\n }\n return my_result;\n },\n 0);\n Kokkos::fence();\n Kokkos::parallel_for(\"findFirstEven_break\", x.extent(0),\n KOKKOS_LAMBDA(const size_t& i) {\n if (findFirstEven_index == i) {\n Kokkos::fence();\n Kokkos::break_();\n }\n });\n }\n }\n });\n return findFirstEven_index;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t result = n;\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n\n return result;\n}\n", + "&", + "\n\n size_t N = x.extent(0);\n size_t result = N;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n size_t result = Kokkos::parallel_reduce(\n policy_type(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n },\n x.size());\n return result;\n}\n", + "\n\n size_t N = x.size();\n size_t result = N;\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even[i] = (x[i] % 2 == 0);\n });\n\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even[i] && i < firstEven) {\n firstEven = i;\n }\n }, firstEven);\n\n return firstEven;\n}\n", + "\n\n size_t result = Kokkos::parallel_reduce(\n x.extent(0), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n },\n result);\n return result;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n result = Kokkos::atomic_fetch_add(&result, 1);\n }\n },\n result);\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t firstEven = n;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n firstEven = i;\n return;\n }\n });\n return firstEven;\n}\n", + "\n\n size_t N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n Kokkos::atomic_fetch_max(&findFirstEven_result, i);\n }\n });\n return findFirstEven_result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use OpenMP to parallelize the search.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 
0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n#pragma omp cancel for\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t result = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0 && i < result) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t first_even = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n return first_even;\n}\n", + "\n\n size_t firstEven = -1;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (firstEven == -1) {\n firstEven = i;\n }\n }\n }\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t firstEven = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0 && i < firstEven) {\n#pragma omp critical\n firstEven = i;\n }\n }\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t firstEven = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < firstEven) {\n firstEven = i;\n }\n }\n }\n return firstEven;\n}\n", + "\n\n size_t n = x.size();\n size_t first_even = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n return first_even;\n}\n", + "\n\n size_t index = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < index) {\n index = i;\n }\n }\n }\n return index;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t result = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < result) {\n result = i;\n }\n 
}\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "serial", + "prompt": "/* Return the index of the first even number in the vector x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "hip", + "prompt": "/* Find the index of the first even number in the vector x. Store it in firstEvenIndex.\n Use AMD HIP to parallelize the search. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\n__global__ void findFirstEven(const int *x, size_t N, size_t *firstEvenIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int myIndex = hipGetGlobalIndex();\n if (myIndex < N && x[myIndex] % 2 == 0) {\n *firstEvenIndex = myIndex;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N && x[i] % 2 == 0) {\n *firstEvenIndex = i;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int blockId = hipBlockIdx_x;\n int blockDim = hipBlockDim_x;\n int gridDim = hipGridDim_x;\n\n *firstEvenIndex = N;\n\n if (x[tid] % 2 == 0) {\n atomicMin(firstEvenIndex, tid);\n }\n\n __syncthreads();\n\n if (tid == 0) {\n if (*firstEvenIndex == N) {\n *firstEvenIndex = -1;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = block.thread.x;\n if (tid == 0) {\n *firstEvenIndex = N;\n }\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = min(*firstEvenIndex, tid);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use MPI to parallelize the search. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t n = x.size();\n size_t result = n;\n for (size_t i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, &result, 1, MPI::INT, MPI::MINLOC);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = -1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result = -1;\n MPI_Allgather(&result, 1, MPI_INT, &global_result, 1, MPI_INT, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n for (size_t i = 0; i < x.size(); ++i) {\n if (global_result != -1 && global_result <= i) {\n return global_result;\n }\n }\n }\n\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0002533836, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004918392, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003891717, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 5.90481e-05, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return n;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0002859306, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028638582, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t n = x.size();\n size_t firstEven = n;\n for (size_t i = rank; i < n; i += size) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &firstEven, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0002570908, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002754134, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004060232, + "num_procs": 2 + }, + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.0003538366, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004286942, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t result = -1;\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0002558624, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000241441, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003670278, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00051215305, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004632898, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007011656625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00148747781875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00091835509062, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0002584612, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002524458, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002646922, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00027821185, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002686491, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00071747955, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00103677618125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00090514422813, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n size_t firstEven = x.size();\n for (size_t i = myRank; i < x.size(); i += numRanks) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + 
"runs": null + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = -1;\n for (size_t i = my_rank; i < x.size(); i += my_size) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0002577888, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000239015, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004449296, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003469633, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00040580285, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000257618, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000257414, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002854004, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00030898345, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00028396515, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00051723305, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00074266381875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00106795334375, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01051772976563, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0058018277375, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00219007457539, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t firstEven = -1;\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n 
return global_result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002564736, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002661196, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002583424, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00028190095, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000259326, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004046806625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000574948875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011813452375, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01567833330937, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01014524770625, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00244349097852, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_result;\n } else {\n return x.size();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0002569986, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000260818, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002788409, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003001582, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00026442755, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004615230125, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00042204136875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00074671189063, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_first_even = -1;\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n local_first_even = i;\n break;\n }\n }\n int global_first_even = -1;\n MPI_Allreduce(&local_first_even, &global_first_even, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_first_even;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0002580468, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030956614, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0035245688, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00196098645, + "num_procs": 4 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.001028845325, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008511563125, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00156231128125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00120928318437, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n size_t result = x.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n size_t n = x.size();\n size_t firstEven = n;\n for (size_t i = myRank; i < n; i += MPI::procs()) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_MINLOC, MPI_INT);\n return firstEven;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank searches for the first even number in its portion of the vector.\n size_t my_first_even = x.size();\n for (size_t i = my_rank; i < x.size(); i += num_ranks) {\n if (x[i] % 2 == 0) {\n my_first_even = i;\n break;\n }\n }\n\n // Gather the results from all ranks.\n std::vector first_evens(num_ranks);\n MPI_Gather(&my_first_even, 1, MPI_UNSIGNED_LONG, first_evens.data(), 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Find the smallest index among the gathered results.\n size_t first_even = x.size();\n if (my_rank == 0) {\n for (size_t i = 0; i < num_ranks; ++i) {\n if (first_evens[i] < first_even) {\n first_even = first_evens[i];\n }\n }\n }\n\n return first_even;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002569568, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002554678, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003452231, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003321359, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00037157485, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004663680125, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00059167209375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018256235938, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000192750925, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00014061787109, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00011047122227, + "num_procs": 512 + } + 
] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t n = x.size();\n size_t local_first_even = n;\n for (size_t i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n local_first_even = i;\n break;\n }\n }\n size_t first_even;\n MPI_Allreduce(&local_first_even, &first_even, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return first_even;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_result;\n } else {\n return x.size();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002568128, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002620484, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002927313, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002740286, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00027088415, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004141916875, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006650717625, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00074932914375, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00108495261719, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00147159439063, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00428002221836, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = x.size();\n for (size_t i = my_rank; i < x.size(); i += my_size) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t local_min = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n local_min = i;\n break;\n }\n }\n size_t global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_UNSIGNED_LONG, MPI_MIN,\n MPI_COMM_WORLD);\n return global_min;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0002557884, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002395, + "num_procs": 1 + }, 
+ { + "did_run": true, + "is_valid": true, + "runtime": 0.0002600715, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00027038525, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000276528975, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003881295375, + "num_procs": 16 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "cuda", + "prompt": "/* Find the index of the first even number in the vector x. Store it in firstEvenIndex.\n Use CUDA to parallelize the search. The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\n__global__ void findFirstEven(const int *x, size_t N, size_t *firstEvenIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "(", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "mpi+omp", + "prompt": "#include 
\n#include \n\n/* Return the index of the first even number in the vector x.\n Use MPI and OpenMP to parallelize the search. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n #pragma openmp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma openmp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n int result = -1;\n #pragma omp parallel for reduction(min:result)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (i < result) {\n result = i;\n }\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_INT, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_result = -1;\n int global_result = -1;\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (my_result == -1) {\n my_result = i;\n }\n }\n }\n }\n\n MPI_Allreduce(&my_result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int my_rank, n_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);\n\n size_t first_even = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n }\n\n if (my_rank == 0) {\n MPI_Reduce(&first_even, &first_even, 1, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(&first_even, nullptr, 1, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n\n return first_even;\n}\n", + "\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int my_result = -1;\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n my_result = i;\n break;\n }\n }\n if (my_result != -1) {\n #pragma omp critical\n {\n if 
(result == -1 || my_result < result) {\n result = my_result;\n }\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t result = -1;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (result == -1) {\n result = i;\n }\n }\n break;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "_", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n #pragma omp cancel for\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n size_t first_even = x.size();\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n first_even = i;\n break;\n }\n }\n }\n\n MPI_Allreduce(&first_even, &first_even, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return first_even;\n } else {\n return x.size();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t firstEven = -1;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (firstEven == -1) {\n firstEven = i;\n }\n }\n }\n\n if (rank == 0) {\n MPI_Bcast(&firstEven, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel\n {\n size_t local_result = x.size();\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n local_result = i;\n break;\n }\n }\n #pragma omp critical\n result = std::min(result, local_result);\n }\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = x.size();\n #pragma omp parallel for reduction(min : result)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n\n if (my_rank == 0) {\n return result;\n } else {\n return x.size();\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": 
"cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use Kokkos to search in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(Kokkos::View const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto contains_lambda = KOKKOS_LAMBDA(const int& v) { return v == target; };\n return any(policy_t(x.extent(0)), contains_lambda, x);\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n auto contains = Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) {\n contains |= (x(i) == target);\n },\n false);\n return contains;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto contains_target = Kokkos::scan(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, const bool& contains_target) {\n return contains_target || (x(i) == target);\n },\n false, tag_type());\n\n return contains_target;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, false);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n Kokkos::Scan(policy_type(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& result) {\n if (x(i) == target) { result = 1; }\n }, tag_type());\n\n return Kokkos::Max(Kokkos::View(1)) != 0;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n auto contains_lambda = KOKKOS_LAMBDA(const int& value) { return value == target; };\n return 
Kokkos::any(policy_type(0, x.extent(0)), contains_lambda, x);\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto result = Kokkos::parallel_reduce(\n policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n },\n false, tag_type());\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto contains = Kokkos::scan(policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, const bool& contains_so_far) {\n return contains_so_far || (x(i) == target);\n }, false, tag_type());\n\n return contains;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(\n x.extent(0), KOKKOS_LAMBDA(const int& i, bool& found) {\n if (x(i) == target) { found = true; }\n },\n found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use OpenMP to search in parallel.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n 
bool found = false;\n #pragma omp parallel for shared(found)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n #pragma omp critical\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "serial", + "prompt": "/* Return true if the vector x contains the value `target`. 
Return false otherwise.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "<", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "&", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "hip", + "prompt": "/* Set `found` to true if the vector x contains the value `target`. Set it to false otherwise.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\n__global__ void contains(const int *x, size_t N, int target, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + ", 8, 2, 6, 4, 6", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int offset = bid * hipBlockDim_x + tid;\n if (offset < N && x[offset] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int n = x.size();\n bool found = false;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (found) {\n return true;\n }\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n found = true;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (found) {\n return true;\n }\n }\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002455634, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000522153, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005306661, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005470674, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054386015, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000623513925, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005583624625, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005995856375, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00064314811719, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00075497748125, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00101210257969, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int result = 0;\n for (int i = 0; i < x.size(); i++) {\n result |= (x[i] == target);\n }\n int global_result;\n MPI_Allreduce(&result, 1, MPI_INT, &global_result, 1, MPI_INT, MPI_BOR, MPI_COMM_WORLD);\n if (global_result) {\n return true;\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002458624, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002964712, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001882935, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020368935, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019919935, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013147503125, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013433901875, + 
"num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00027099113437, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00033931022656, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00031004546641, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00014419651836, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0002462984, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002636454, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002796476, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00019085385, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int has_target = (std::find(x.begin(), x.end(), target) != x.end());\n int global_has_target;\n MPI_Allreduce(&has_target, &global_has_target, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n return global_has_target;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0002948564, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002056358, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000251168, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002306615, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002633251, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003573308125, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00038946899375, + "num_procs": 32 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.0010856291875, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000245898, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002646554, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002739235, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018414795, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001646735, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00031937485, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007275347125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00022741203438, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00037440746875, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01454588913984, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001965141793, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return global_found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0002465414, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002647532, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001830155, + "num_procs": 2 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.00030890985, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000157983425, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009435611875, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000469484775, + "num_procs": 32 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_contains = 0;\n for (int i = rank; i < n; i += size) {\n if (x[i] == target) {\n local_contains = 1;\n break;\n }\n }\n\n int global_contains;\n MPI_Allreduce(&local_contains, &global_contains, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_contains;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0002489574, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000262782, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002470419, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018658495, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00018317625, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n bool found = false;\n for (int i = rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n MPI::COMM_WORLD.Allreduce(MPI::IN_PLACE, &found, 1, MPI::BOOL, MPI::LOR);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0002970164, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033995264, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015713785, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00079819725, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000412928125, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0002477992, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002648994, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003069018, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002028422, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000194506475, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00036333885, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00078428154375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00024697415625, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023686456406, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00021608959531, + "num_procs": 256 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n bool local_contains = false;\n for (int i = start; i < end; i++) {\n if (x[i] == target) {\n local_contains = true;\n break;\n }\n }\n\n bool global_contains;\n MPI_Allreduce(&local_contains, &global_contains, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_contains;\n } else {\n return false;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002513018, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000265936, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001557162, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 8.60684e-05, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 4.685875e-05, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 4.43224375e-05, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 6.798471875e-05, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 5.373052813e-05, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 5.702435781e-05, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 6.104592891e-05, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 6.939795078e-05, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n bool found = false;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n 
bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002489936, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003104298, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001986167, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001715799, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000152185025, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001221130975, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00046991365, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00040171685313, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020259867344, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020902169531, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0001665621582, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool found = std::find(x.begin(), x.end(), target) != x.end();\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002460758, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000166506, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002035121, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002657932, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002241866, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000294928925, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00064081406875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00065428958125, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00099508815937, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00151522284219, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00219394726953, + "num_procs": 512 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": 
"cuda", + "prompt": "/* Set `found` to true if the vector x contains the value `target`. Set it to false otherwise.\n Use CUDA to search in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\n__global__ void contains(const int *x, size_t N, int target, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n found[0] = true;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n int local_result = 0;\n #pragma omp parallel for reduction(+:local_result)\n for (int i = rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n local_result = 1;\n }\n }\n int global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_result > 0;\n }\n return false;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "(", + "\n\n int rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel shared(result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n result = true;\n break;\n }\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n MPI_Reduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, 
MPI_COMM_WORLD);\n if (rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int found = 0;\n #pragma omp parallel for reduction(+:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = 1;\n break;\n }\n }\n int result;\n MPI_Allreduce(&found, &result, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int found = 0;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = 1;\n }\n }\n int result;\n MPI_Allreduce(&found, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n bool found = false;\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n }\n }\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_result = 0;\n #pragma omp parallel for reduction(||:local_result)\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n local_result = 1;\n }\n }\n int result;\n MPI_Reduce(&local_result, &result, 1, MPI_INT, MPI_LOR, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_size = n / 
MPI_COMM_SIZE;\n int start = my_rank * local_size;\n int end = start + local_size;\n bool found = false;\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_found;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use Kokkos to search in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(Kokkos::View const& x, Kokkos::View const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View found_x(\"found_x\", x.extent(0));\n Kokkos::View found_y(\"found_y\", y.extent(0));\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n found_x(i) = (x(i) == val);\n });\n\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(const int& i) {\n found_y(i) = (y(i) == val);\n });\n\n Kokkos::View found_both(\"found_both\", 1);\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& found_both) {\n found_both |= (found_x(i) && found_y(i));\n });\n\n return !found_both(0);\n}\n", + "\n\n return Kok \u86cb\u7cd5::parallel_reduce(Kok\u86cb\u7cd5::make_range(x.size()),\n Kok\u86cb\u7cd5::plus(),\n [x, y, val](int i) { return (x[i] == val) ^ (y[i] == val); },\n 0);\n}\n", + "\n\n // Create a Kokkos::View of the union of the two input vectors\n Kokkos::View union_xy(\"union_xy\", x.extent(0) + y.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(int i) {\n union_xy(i) = x(i);\n });\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(int i) {\n union_xy(x.extent(0) + i) = y(i);\n });\n\n // Create a Kokkos::View of the intersection of the two input vectors\n Kokkos::View intersection_xy(\"intersection_xy\", x.extent(0) + y.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(int i) {\n for (int j = 0; j < y.extent(0); j++) {\n if (x(i) == y(j)) {\n intersection_xy(i) = x(i);\n break;\n }\n }\n });\n\n // Check if the value is in the union but not in the intersection\n bool found_in_union = false;\n bool found_in_intersection = false;\n Kokkos::parallel_for(union_xy.extent(0), KOKKOS_LAMBDA(int i) {\n if (union_xy(i) == val) {\n found_in_union = true;\n }\n if (intersection_xy(i) == val) {\n found_in_intersection = true;\n }\n });\n\n return found_in_union && !found_in_intersection;\n}\n", + "\n\n using namespace Kokkos;\n const int N = x.extent(0);\n const int M = y.extent(0);\n\n auto lambda = KOKKOS_LAMBDA(const int i) {\n bool found_x = false;\n bool found_y = false;\n for (int j = 0; j < N; ++j) {\n if (x(j) == val) { found_x = true; }\n }\n for (int j = 0; j < M; ++j) {\n if (y(j) == val) { found_y = true; }\n }\n return found_x != found_y;\n };\n\n bool result = parallel_reduce(N, lambda, (bool)false);\n return result;\n}\n", + "\n\n using namespace Kokkos;\n\n auto lambda = KOKKOS_LAMBDA(int i) {\n bool found_x = false;\n bool found_y = false;\n for (int j = 0; j < x.extent(0); ++j) {\n if (x(j) == val) {\n found_x = true;\n }\n }\n for (int j = 0; j < y.extent(0); ++j) {\n if (y(j) == val) 
{\n found_y = true;\n }\n }\n return found_x != found_y;\n };\n\n return parallel_reduce(x.extent(0), lambda, (bool)false);\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = find(x, val) != x.end();\n auto y_contains = find(y, val) != y.end();\n return x_contains ^ y_contains;\n}\n", + "\n\n // Create a Kokkos::View of the result\n Kokkos::View result(\"result\", x.extent(0));\n\n // Use Kokkos to search for `val` in `x` and `y` in parallel\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n result(i) = (x(i) == val) ^ (y(i) == val);\n });\n\n // Reduce the result to a single boolean value\n bool xorContains = Kokkos::reduce(result, Kokkos::And(), true);\n\n return xorContains;\n}\n", + "\n\n Kokkos::View result(\"result\", x.size());\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int i) {\n result[i] = (x[i] == val) ^ (y[i] == val);\n });\n return Kokkos::any(result);\n}\n", + "\n bool found_x = false;\n bool found_y = false;\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (x(i) == val) {\n found_x = true;\n }\n });\n\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (y(i) == val) {\n found_y = true;\n }\n });\n\n return found_x != found_y;\n}\n", + "\n\n Kokkos::View result(\"result\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(int i) {\n result[i] = (x[i] == val) ^ (y[i] == val);\n });\n return Kokkos::any(result);\n}\n", + "\n\n Kokkos::View found_x(\"found_x\", x.extent(0));\n Kokkos::View found_y(\"found_y\", y.extent(0));\n\n Kokkos::parallel_for(\"xorContains_x\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n found_x(i) = (x(i) == val);\n });\n\n Kokkos::parallel_for(\"xorContains_y\", y.extent(0), KOKKOS_LAMBDA(const int i) {\n found_y(i) = (y(i) == val);\n });\n\n bool found_in_x = false;\n bool found_in_y = false;\n\n Kokkos::parallel_reduce(\"xorContains_reduce\", x.extent(0), KOKKOS_LAMBDA(const int i, bool& update) {\n update = update || found_x(i);\n }, found_in_x);\n\n Kokkos::parallel_reduce(\"xorContains_reduce\", y.extent(0), KOKKOS_LAMBDA(const int i, bool& update) {\n update = update || found_y(i);\n }, found_in_y);\n\n return (found_in_x != found_in_y);\n}\n", + "\n\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = Kokkos::parallel_reduce(\n policy_t(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) { contains |= (x(i) == val); },\n false);\n auto y_contains = Kokkos::parallel_reduce(\n policy_t(0, y.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) { contains |= (y(i) == val); },\n false);\n return x_contains ^ y_contains;\n}\n", + "\n\n using namespace Kokkos;\n\n auto lambda = KOKKOS_LAMBDA(const int& x_i, const int& y_i) {\n return (x_i == val) ^ (y_i == val);\n };\n\n auto result = reduce(x.extent(0), lambda, (bool)false);\n\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n auto x_contains = find(x, val, policy_type(0, x.extent(0)));\n auto y_contains = find(y, val, policy_type(0, y.extent(0)));\n return (x_contains != x.extent(0)) ^ (y_contains != y.extent(0));\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& contains) {\n contains |= (x(i) == val);\n }, (int)0);\n\n auto y_contains = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, y.extent(0)), KOKKOS_LAMBDA(int i, int& contains) {\n contains |= (y(i) == val);\n }, (int)0);\n\n return 
(x_contains && !y_contains) || (!x_contains && y_contains);\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int sum) {\n return sum + (x(i) == val);\n }, Kokkos::Sum(0));\n\n auto y_contains = Kokkos::parallel_reduce(y.extent(0), KOKKOS_LAMBDA(int i, int sum) {\n return sum + (y(i) == val);\n }, Kokkos::Sum(0));\n\n return x_contains != y_contains;\n}\n", + "\n\n // Create a Kokkos::View to store the results of the search\n Kokkos::View result(\"result\", x.extent(0));\n\n // Perform the search in parallel\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i) {\n result(i) = (x(i) == val) ^ (y(i) == val);\n });\n\n // Reduce the results to a single boolean value\n bool found = false;\n Kokkos::parallel_reduce(\"xorContains_reduce\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i, bool& found) {\n found |= result(i);\n }, found);\n\n return found;\n}\n", + "\n\n // Create a Kokkos view for the result\n Kokkos::View result(\"result\", x.extent(0));\n\n // Parallel for loop to check if val is in only one of x or y\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n bool found_in_x = false;\n bool found_in_y = false;\n for (int j = 0; j < x.extent(0); ++j) {\n if (x(j) == val) {\n found_in_x = true;\n }\n if (y(j) == val) {\n found_in_y = true;\n }\n }\n result(i) = (found_in_x != found_in_y);\n });\n\n // Reduce the result to a single boolean value\n bool final_result = Kokkos::reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(bool a, bool b) { return a || b; }, result);\n\n return final_result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = find(x, val, policy_t(0, x.extent(0)));\n auto y_contains = find(y, val, policy_t(0, y.extent(0)));\n return (x_contains != x.extent(0)) ^ (y_contains != y.extent(0));\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& contains) {\n if (x(i) == val) { contains = 1; }\n }, Kokkos::Scan::plus());\n\n auto y_contains = Kokkos::parallel_reduce(y.extent(0), KOKKOS_LAMBDA(int i, int& contains) {\n if (y(i) == val) { contains = 1; }\n }, Kokkos::Scan::plus());\n\n return (x_contains == 1) ^ (y_contains == 1);\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use OpenMP to search in parallel.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n #pragma omp critical\n {\n found_x = true;\n }\n }\n }\n #pragma omp for\n for (int i = 0; i < 
y.size(); ++i) {\n if (y[i] == val) {\n #pragma omp critical\n {\n found_y = true;\n }\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n #pragma omp critical\n found_x = true;\n }\n }\n\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n #pragma omp critical\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n #pragma omp critical\n {\n found_x = true;\n }\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n #pragma omp critical\n {\n found_y = true;\n }\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n return x_count != y_count;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return (x_count + y_count) == 1;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == 
val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool in_x = false;\n bool in_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n in_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n in_y = true;\n }\n }\n }\n\n return in_x ^ in_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return x_count != y_count;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n } else {\n found = false;\n #pragma omp cancel for\n }\n }\n }\n }\n #pragma omp parallel for\n for (size_t i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n } else {\n found = false;\n #pragma omp cancel for\n }\n }\n }\n }\n return found;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (size_t i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return x_count != y_count;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n return x_count != y_count;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "serial", + "prompt": "/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n return std::find(x.begin(), x.end(), val) 
!= x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) !=\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) !=\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n bool in_x = false;\n bool in_y = false;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n in_x = true;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n in_y = true;\n }\n }\n return in_x != in_y;\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "hip", + "prompt": "/* Set `found` to true if `val` is only in one of vectors x or y.\n Set it to false if it is in both or neither.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\n__global__ void xorContains(const int *x, const int *y, size_t N, int val, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = tid + block * blockDim;\n int end = min(start + blockDim, N);\n bool found_x = false;\n bool found_y = false;\n for (int i = start; i < end; i++) {\n found_x |= (x[i] == val);\n found_y |= (y[i] == val);\n }\n found[tid] = found_x ^ found_y;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n *found = found_x ^ found_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipGridSize_x) {\n if (x[i] == val) found_x = true;\n if (y[i] == val) found_y = true;\n }\n found[tid] = found_x ^ found_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool x_contains = x[tid] == val;\n bool y_contains = y[tid] == val;\n found[tid] = x_contains ^ y_contains;\n }\n}\n", + ".", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool x_has = (x[tid] == val);\n bool y_has = (y[tid] == val);\n found[tid] = (x_has ^ y_has);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int block_size = hipBlockDim_x;\n int block_id = hipBlockIdx_x;\n int i = block_id * block_size + tid;\n\n if (i < N) {\n bool x_contains = x[i] == val;\n bool y_contains = y[i] == val;\n found[i] = x_contains ^ y_contains;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n *found = inX ^ inY;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n bool found_x = (x[i] == val);\n bool found_y = (y[i] == val);\n found[i] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool found_x = (x[tid] == val);\n bool found_y = (y[tid] == val);\n found[tid] = (found_x ^ found_y);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int idx = tid + N;\n if (idx < 2 * N) {\n *found = (x[tid] == val) || (y[idx - N] == val);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n found[tid] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n *found = (x[gid] == val) ^ (y[gid] == val);\n }\n}\n", 
+ "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = (x_has_val != y_has_val);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool found_x = (x[tid] == val);\n bool found_y = (y[tid] == val);\n found[tid] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = (in_x != in_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n *found = (x[gid] == val) ^ (y[gid] == val);\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Return the result on rank 0.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int myRank, nRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &nRanks);\n\n bool xContains = std::find(x.begin(), x.end(), val) != x.end();\n bool yContains = std::find(y.begin(), y.end(), val) != y.end();\n\n int xContainsInt = xContains ? 1 : 0;\n int yContainsInt = yContains ? 
1 : 0;\n\n int xorContainsInt = xContainsInt ^ yContainsInt;\n\n int globalXorContainsInt;\n MPI_Reduce(&xorContainsInt, &globalXorContainsInt, 1, MPI_INT, MPI_LOR, 0, MPI_COMM_WORLD);\n\n bool globalXorContains = globalXorContainsInt != 0;\n\n if (myRank == 0) {\n return globalXorContains;\n } else {\n return false;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 1.11408e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 9.3078e-06, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.56436e-05, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.37805e-05, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.19875e-05, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 2.42616e-05, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.787988125e-05, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 2.363875e-05, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 9.718e-06, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000748187, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001029918, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011913996, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001478457175, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int sum;\n MPI_Allreduce(&x_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return sum == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + 
"are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.01012e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007412904, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010819831, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011693506, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00109328225, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0034662722125, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0024996299875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00421181461562, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00622019490938, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02658941659063, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00857157424805, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int total_count;\n MPI_Allreduce(&x_count, &total_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &total_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return total_count == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.01352e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000682261, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010153896, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008943347, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009200414, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003748158075, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028900825, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00464669591875, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0054753696125, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07538091842969, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01263368794922, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 9.6722e-06, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006757008, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010749796, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00111067175, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.0010804978, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016068859625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00184498314375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00602371777813, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00591203862969, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00910275203437, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00830936325273, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int x_count = 0;\n int y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.00972e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026159666, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026708163, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00272376065, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027553047, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0050479235875, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00456679930625, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00615999239687, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00482300561563, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05682275269922, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00874710725938, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int x_count = 0;\n int y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "source_write_success": true, + 
"did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 1.18482e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026128326, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026598326, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027190108, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002725433525, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003991036625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00455194158125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0052243433625, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int x_contains = std::find(x.begin(), x.end(), val) != x.end();\n int y_contains = std::find(y.begin(), y.end(), val) != y.end();\n\n int result = x_contains ^ y_contains;\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 1.15178e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 9.5038e-06, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.48308e-05, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.488205e-05, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 2.6844725e-05, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 2.91281625e-05, + "num_procs": 16 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + }, + { + "generated_output": "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int sum;\n MPI_Allreduce(&x_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return sum == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 9.7406e-06, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006806518, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012283405, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009617263, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00103653675, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001710964775, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00196956488125, + "num_procs": 32 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 64 + } + ] + }, + { + "generated_output": "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, 
MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 1.01368e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007386774, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010399652, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00101656665, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009831201, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001559748525, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029099961875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00416456619375, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01595119955781, + "num_procs": 128 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 256 + } + ] + }, + { + "generated_output": "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n\n int global_x_count;\n int global_y_count;\n\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (global_x_count + global_y_count) == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 9.7806e-06, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007397654, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009290262, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011365679, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000928114625, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00410936685, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0048983081375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00454511325, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03628171846406, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01097735837656, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00880919356406, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 9.6038e-06, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007441496, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012357716, + 
"num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010526086, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00102902975, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00341688315, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00396556119375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00376255278125, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00514490667344, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00748725600859, + "num_procs": 256 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int x_count = 0, y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) x_count++;\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) y_count++;\n }\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (global_x_count == 0 && global_y_count > 0) return true;\n if (global_x_count > 0 && global_y_count == 0) return true;\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 9.8686e-06, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026130912, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027227651, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00270174355, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002744282775, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0039060297625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042335133125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0039219326875, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00574593704531, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00745489005078, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008547413625, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int x_count = 0, y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) x_count++;\n if (y[i] == val) y_count++;\n }\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 9.6384e-06, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019613396, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019847658, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00196959425, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020038529, + "num_procs": 8 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.0021791561125, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00358911238125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0047115984125, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00556520997031, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00699454168203, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00897104268633, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n\n int x_count_sum;\n int y_count_sum;\n\n MPI_Allreduce(&x_count, &x_count_sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &y_count_sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (x_count_sum + y_count_sum) % 2 == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int x_count = 0;\n int y_count = 0;\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int global_x_count;\n int global_y_count;\n\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (global_x_count == 1 || global_y_count == 1);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.18904e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026243124, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026578873, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00266535265, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003545800125, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029878706, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00385204828125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00626507689062, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0054777702, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00845535704531, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00950047193047, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": 
false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "cuda", + "prompt": "/* Set `found` to true if `val` is only in one of vectors x or y.\n Set it to false if it is in both or neither.\n Use CUDA to search in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\n__global__ void xorContains(const int *x, const int *y, size_t N, int val, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool x_has = false;\n bool y_has = false;\n for (int i = 0; i < N; i++) {\n if (x[i] == val) x_has = true;\n if (y[i] == val) y_has = true;\n }\n found[tid] = x_has != y_has;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *found = (*found) ^ (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n found[tid] = (inX ^ inY);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n *found = *found || (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *found = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = x_has_val ^ y_has_val;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + 
"=cernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecerne,cernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecerne)cerne)cerne)cerne)", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = x_has_val ^ y_has_val;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = (in_x ^ in_y);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = (x_has_val != y_has_val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = 0; i < N; i++) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n found[tid] = (found_x != found_y);\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Return the result on rank 0.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int xor_count = x_count + y_count;\n\n int result;\n MPI_Allreduce(&xor_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (global_x_count == 1 || global_y_count == 1) {\n return true;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count;\n int global_y_count;\n MPI_Reduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_x_count != global_y_count;\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int result;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < n; i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int j = 0; j < m; j++) {\n if (y[j] == val) {\n y_count++;\n }\n }\n }\n\n int result = x_count + y_count;\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return result == 1;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel for reduction(+:x_count)\n for (int i = 0; i < n; i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = 0; i < m; i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result = x_count + y_count;\n bool ans = (result == 1);\n\n 
MPI_Reduce(&ans, &ans, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n return ans;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int local_x_count = 0;\n int local_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n local_x_count++;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n local_y_count++;\n }\n }\n #pragma omp critical\n {\n x_count += local_x_count;\n y_count += local_y_count;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(MPI_IN_PLACE, &y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return (x_count == 1 || y_count == 1);\n }\n return false;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n#pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int nt = omp_get_num_threads();\n for (int i = tid; i < n; i += nt) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = tid; i < m; i += nt) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int x_count_global;\n int y_count_global;\n MPI_Allreduce(&x_count, &x_count_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &y_count_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (x_count_global == 1 || y_count_global == 1) {\n return true;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int n_threads = omp_get_num_threads();\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel for\n for (int i = rank; i < n; i += n_threads) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found_x, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n MPI_Allreduce(&found_y, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank searches for `val` in its portion of x and y.\n int my_start = my_rank * n / nprocs;\n int my_end = (my_rank + 1) * n / nprocs;\n bool found_in_x = false;\n bool found_in_y = false;\n #pragma omp parallel for\n for (int i = my_start; i < my_end; i++) {\n if (x[i] == val) {\n found_in_x = true;\n }\n if (y[i] == val) {\n found_in_y = true;\n }\n }\n\n // Reduce the results from all ranks to rank 0.\n bool result;\n MPI_Reduce(&found_in_x, &result, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n MPI_Reduce(&found_in_y, &result, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n\n // Rank 0 returns the final result.\n if (my_rank == 0) {\n return result != (found_in_x && found_in_y);\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n ++x_count;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n ++y_count;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n 
x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int count_x = 0;\n int count_y = 0;\n\n #pragma omp parallel for reduction(+:count_x)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n count_x++;\n }\n }\n\n #pragma omp parallel for reduction(+:count_y)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n count_y++;\n }\n }\n\n int result = count_x + count_y;\n bool ans = (result == 1);\n\n MPI_Allreduce(&ans, &ans, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n return ans;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n #pragma omp parallel\n {\n int my_x_count = 0, my_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) my_x_count++;\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) my_y_count++;\n }\n #pragma omp critical\n {\n x_count += my_x_count;\n y_count += my_y_count;\n }\n }\n int result;\n MPI_Reduce(&x_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (result == 1) return true;\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result = x_count + y_count;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result == 1;\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel reduction(+:x_count, y_count)\n {\n int tid = omp_get_thread_num();\n int nthreads = omp_get_num_threads();\n for (int i = tid; i < x.size(); i += nthreads) {\n if (x[i] == val) {\n x_count++;\n }\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count, global_y_count;\n MPI_Reduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return (global_x_count == 1 || global_y_count == 1);\n }\n return false;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n bool result = false;\n MPI_Allreduce(&found_x, &result, 1, MPI_C_BOOL, MPI_BXOR, MPI_COMM_WORLD);\n MPI_Allreduce(&found_y, &result, 1, MPI_C_BOOL, MPI_BXOR, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return result;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel for 
reduction(+:x_count)\n for (int i = rank; i < x.size(); i += nprocs) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = rank; i < y.size(); i += nprocs) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result;\n MPI_Reduce(&x_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result == 1;\n }\n return false;\n}\n", + "\n\n int x_contains = 0;\n int y_contains = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_contains)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_contains = 1;\n }\n }\n\n #pragma omp for reduction(+:y_contains)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_contains = 1;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_contains, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n MPI_Allreduce(&y_contains, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int local_x_count = 0;\n int local_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n local_x_count++;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n local_y_count++;\n }\n }\n #pragma omp critical\n {\n x_count += local_x_count;\n y_count += local_y_count;\n }\n }\n if (my_rank == 0) {\n return x_count != y_count;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Book {\n const char *title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use Kokkos to search in parallel. 
Assume Kokkos is already initialized.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(Kokkos::View const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using policy_t = Kokkos::RangePolicy;\n using range_t = Kokkos::Range;\n\n size_t result = books.size();\n Kokkos::parallel_reduce(range_t(0, books.size()), KOKKOS_LAMBDA(size_t i, size_t& result) {\n if (books(i)->pages < 100) {\n result = i;\n }\n });\n return result;\n}\n", + "\n\n size_t last_short_book_index = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_short_book_index = books.size() - 1;\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& last_short_book_index) {\n if (books[i]->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_for(books.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t result = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& result) {\n if (books(i)->pages < 100) {\n result = i;\n }\n });\n return result;\n}\n", + "\n\n size_t last_short_book = books.size() - 1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book) {\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n });\n return last_short_book;\n}\n", + "\n\n Kokkos::View lastShortBook(\"lastShortBook\", books.extent(0));\n Kokkos::parallel_for(\"findLastShortBook\", books.extent(0), KOKKOS_LAMBDA(const size_t i) {\n if (books(i).pages < 100) {\n lastShortBook(i) = i;\n }\n });\n size_t result = books.extent(0);\n for (size_t i = books.extent(0) - 1; i >= 0; i--) {\n if (lastShortBook(i) != 0) {\n result = lastShortBook(i);\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t last_short_book = books.size();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& last) {\n if (books[i].pages < 100) {\n last = i;\n }\n }, last_short_book);\n return last_short_book;\n}\n", + "\n\n size_t last_idx = books.size();\n Kokkos::parallel_for(books.size(), KOKKOS_LAMBDA(const size_t& i) {\n if (books[i]->pages < 100) {\n last_idx = i;\n }\n });\n return last_idx;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const int i, const size_t& last) {\n if (books(i).pages < 100) {\n return i;\n }\n return last;\n }, 0);\n}\n", + "\n\n size_t last_index = books.size() - 1;\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books(i).pages >= 100) {\n last_index = i - 1;\n }\n }\n return last_index;\n}\n", + "\n\n size_t last_short_book = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last) {\n if (books(i)->pages < 100) {\n last = i;\n }\n }, last_short_book);\n return last_short_book;\n}\n", + "\n\n size_t last_short_book = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& last_short_book) 
{\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n });\n return last_short_book;\n}\n", + "\n\n size_t last_short_book = books.size();\n\n Kokkos::parallel_reduce(books.extent(0), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book) {\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n }, last_short_book);\n\n return last_short_book;\n}\n", + "\n\n size_t lastShortBook = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& lastShortBook) {\n if (books[i]->pages < 100) {\n lastShortBook = i;\n }\n });\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBook = books.size() - 1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& lastShortBook) {\n if (books(i)->pages < 100) {\n lastShortBook = i;\n }\n });\n return lastShortBook;\n}\n", + "\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_index) {\n if (books(i)->pages < 100) {\n last_index = i;\n }\n });\n return last_index;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n using l_type = long;\n\n auto last_short_book = Kokkos::parallel_scan(\n policy_type(0, books.size()),\n KOKKOS_LAMBDA(const size_t& i, l_type& last) {\n if (books[i]->pages < 100) {\n last = i;\n }\n },\n last_short_book);\n\n return last_short_book;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_scan(policy_type(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n }, tag_type());\n\n return last_short_book_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use OpenMP to search in parallel.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for 
(size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t last_short_book_index = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n last_short_book_index = i;\n }\n }\n return last_short_book_index;\n}\n", + "\n\n size_t last_short = books.size();\n #pragma omp parallel for reduction(max:last_short)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n last_short = i;\n }\n }\n return last_short;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n#pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "_", + "\n\n size_t last_idx = books.size() - 1;\n#pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n last_idx = i - 1;\n break;\n }\n }\n return last_idx;\n}\n", + "\n\n size_t last_short_book_index = books.size() - 1;\n#pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages >= 100) {\n#pragma omp critical\n last_short_book_index = i - 1;\n }\n }\n return last_short_book_index;\n}\n", + "\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n#pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n" + ] + }, + { + "problem_type": "search", + 
"language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "serial", + "prompt": "struct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "hip", + "prompt": "struct Book {\n const char* title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books 
where Book.pages is less than 100.\n Store the result in lastShortBookIndex.\n Use AMD HIP to search in parallel. The kernel is launched with one thread for every book element.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\n__global__ void findLastShortBook(const Book *books, size_t N, size_t *lastShortBookIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use MPI to search in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of books. 
Return the result on rank 0.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return lastShortBook;\n } else {\n return books.size();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00296245, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.003210614, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003721397, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0036117669, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004310472075, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0064852154625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00849946305625, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02676978824688, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "_", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = -1;\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return lastShortBook;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + 
"did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0029012706, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032244944, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038805127, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00444159175, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n lastShortBook = i - 1;\n break;\n }\n }\n\n size_t globalLastShortBook;\n MPI_Allreduce(&lastShortBook, &globalLastShortBook, 1, MPI_UNSIGNED_LONG, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalLastShortBook;\n } else {\n return books.size();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n for (size_t i = result + 1; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n result = i - 1;\n 
break;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n size_t result = 0;\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n int globalResult;\n MPI_Allreduce(&result, &globalResult, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalResult;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0029062798, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032437624, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0039640189, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00374375085, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00362283375, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0166780939625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0158739083, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01368804871875, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02180197417344, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03961391543125, + "num_procs": 256 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, 
+ "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n if (result == books.size()) {\n result = -1;\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "cuda", + "prompt": "struct Book {\n const char* title;\n int pages;\n};\n\n/* Find the index of the last Book item in the vector books where Book.pages is less than 100.\n Store the result in lastShortBookIndex.\n Use CUDA to search in parallel. 
The kernel is launched with one thread for every book element.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\n__global__ void findLastShortBook(const Book *books, size_t N, size_t *lastShortBookIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n if (books[idx].pages < 100) {\n *lastShortBookIndex = idx;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid >= N) return;\n\tif (books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx >= N) {\n return;\n }\n if (books[idx].pages < 100) {\n *lastShortBookIndex = idx;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use MPI and OpenMP to search in parallel. 
Assume MPI is already initialized.\n Every rank has a complete copy of books. Return the result on rank 0.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = 0;\n #pragma omp parallel for reduction(max:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = 0;\n #pragma omp parallel for reduction(max:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n if (rank == 0) {\n return lastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n result = i - 1;\n }\n }\n return result;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_UNSIGNED_LONG, MPI_MAX, MPI_COMM_WORLD);\n return lastShortBook;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n size_t local_result = books.size();\n\n #pragma omp parallel reduction(min: local_result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n local_result = i;\n }\n }\n\n MPI_Allreduce(&local_result, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min: result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return 
result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n int myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n size_t result = books.size();\n #pragma omp parallel for reduction(min: result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n if (myRank == 0) {\n return result;\n }\n return books.size();\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n size_t rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n\n if (rank == 0) {\n return result;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "_", + "\n\n size_t result = books.size() - 1;\n size_t myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n #pragma openmp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n MPI_Allgather(&result, 1, MPI_UNSIGNED_LONG, &result, 1, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n if (myRank == 0) {\n result = books.size() - 1;\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use Kokkos to search in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n auto min_distance = Kokkos::Max();\n size_t min_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n },\n min_distance, min_index);\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n size_t N = x.extent(0);\n size_t closest_idx = 0;\n double closest_diff = std::abs(x(0) - M_PI);\n Kokkos::parallel_for(\n policy_type(0, N), KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n });\n return closest_idx;\n}\n", + "\n\n const double pi = M_PI;\n Kokkos::View closest_distance = Kokkos::View(1);\n Kokkos::View closest_index = Kokkos::View(1);\n closest_distance[0] = std::numeric_limits::max();\n closest_index[0] = 0;\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double distance = std::abs(x[i] - pi);\n if (distance < closest_distance[0]) {\n closest_distance[0] = distance;\n closest_index[0] = i;\n }\n });\n\n return closest_index[0];\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n size_t closest_index = 0;\n double closest_distance = std::abs(x(0) - M_PI);\n Kokkos::parallel_reduce(policy_t(0, x.size()), KOKKOS_LAMBDA(size_t i, double& closest_distance, size_t& closest_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }, closest_distance, closest_index);\n return closest_index;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n const double pi = M_PI;\n size_t N = x.extent(0);\n size_t closest_idx = 0;\n double min_dist = std::abs(x(0) - pi);\n\n Kokkos::parallel_for(\n policy_t(0, N), KOKKOS_LAMBDA(const size_t& i) {\n double dist = std::abs(x(i) - pi);\n if (dist < min_dist) {\n closest_idx = i;\n min_dist = dist;\n }\n });\n\n return closest_idx;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::Functor;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n struct findToPiFunctor {\n Kokkos::View const x;\n double min_diff;\n int min_index;\n\n findToPiFunctor(Kokkos::View const x) : x(x), min_diff(std::numeric_limits::max()), min_index(-1) {}\n\n KOKKOS_INLINE_FUNCTION\n void operator()(const Kokkos::Work& work) const {\n const int i = work.league_rank();\n const double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n };\n\n findToPiFunctor f(x);\n\n Kokkos::parallel_for(RangePolicy<>(0, x.size()), f);\n\n return f.min_index;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View min_diff = Kokkos::View(1);\n Kokkos::View min_index = Kokkos::View(1);\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff(0) || (diff == min_diff(0) && x(i) > x(min_index(0)))) {\n min_diff(0) = diff;\n min_index(0) = i;\n }\n });\n\n return min_index(0);\n}\n", + "\n\n using namespace Kokkos;\n\n auto closest_idx = 0;\n auto min_diff 
= std::numeric_limits::max();\n auto pi = M_PI;\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& closest_idx) {\n auto diff = std::abs(x(i) - pi);\n if (diff < min_diff) {\n min_diff = diff;\n closest_idx = i;\n }\n }, min_diff, closest_idx);\n\n return closest_idx;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Policy = Kokkos::RangePolicy;\n\n auto min_distance = Kokkos::Max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(Policy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, double& min_distance_local) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < min_distance_local) {\n min_distance_local = distance;\n min_index = i;\n }\n }, min_distance);\n\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n using namespace std;\n\n const double pi = M_PI;\n const size_t n = x.extent(0);\n double min_diff = numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(const size_t& i, double& min_diff_local, size_t& min_idx_local) {\n double diff = abs(x(i) - pi);\n if (diff < min_diff_local) {\n min_diff_local = diff;\n min_idx_local = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n\n size_t N = x.extent(0);\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& min_idx) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n\n size_t N = x.extent(0);\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& min_idx) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x(0) - M_PI);\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < closest_diff) {\n closest_idx = i;\n closest_diff = diff;\n }\n });\n\n return closest_idx;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n auto min_distance = std::numeric_limits::max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(policy_t(0, x.size()), KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n }, min_distance, min_index);\n\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_distance = std::numeric_limits::max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n },\n min_distance, min_index);\n\n return min_index;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::value_type;\n\n size_t N = x.size();\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const size_t& i, value_type& min_diff_local, value_type& min_idx_local) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff_local) 
{\n min_diff_local = diff;\n min_idx_local = i;\n }\n },\n min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n size_t n = x.extent(0);\n size_t closest_index = 0;\n double closest_distance = std::abs(x(0) - M_PI);\n\n Kokkos::parallel_for(\n policy_type(0, n), KOKKOS_LAMBDA(const size_t& i) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n });\n\n return closest_index;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Min;\n\n auto min_dist = Reducer(x.data(), x.size());\n double min_val = min_dist.template reduce();\n\n auto dist_from_pi = Reducer(x.data(), x.size(),\n [min_val](double& a, const double& b) {\n a = std::min(a, std::abs(b - M_PI));\n });\n double closest_dist = dist_from_pi.template reduce();\n\n auto closest_idx = Reducer(x.data(), x.size(),\n [closest_dist, min_val](size_t& a, const double& b) {\n if (std::abs(b - M_PI) == closest_dist &&\n std::abs(b - M_PI) < std::abs(min_val - M_PI)) {\n a = a + 1;\n }\n });\n size_t idx = closest_idx.template reduce();\n\n return idx - 1;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_dist = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(size_t i, size_t& min_idx, double& min_dist) {\n double dist = std::abs(x(i) - M_PI);\n if (dist < min_dist) {\n min_dist = dist;\n min_idx = i;\n }\n }, min_idx, min_dist);\n\n return min_idx;\n}\n", + "\n\n auto closest = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(size_t i, size_t closest) {\n auto diff = std::abs(x(i) - M_PI);\n if (closest == -1 || diff < std::abs(x(closest) - M_PI)) {\n closest = i;\n }\n return closest;\n },\n -1);\n return closest;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use OpenMP to search in parallel.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n#pragma omp critical\n {\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n #pragma omp parallel for reduction(min:min_diff) reduction(min:min_idx)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n\n return min_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::numeric_limits::max();\n\n#pragma omp parallel for\n 
for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n#pragma omp critical\n {\n if (diff < closest_diff) {\n closest_idx = i;\n closest_diff = diff;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_dist = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double dist = std::abs(x[i] - M_PI);\n if (dist < closest_dist) {\n #pragma omp critical\n {\n if (dist < closest_dist) {\n closest_dist = dist;\n closest_idx = i;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma for reduction(min:closest_diff)\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n // Find the index of the value in the vector x that is closest to the math constant PI\n // Use M_PI for the value of PI\n // Use OpenMP to search in parallel\n\n // Get the number of threads to use\n int num_threads = omp_get_num_threads();\n\n // Create a reduction variable to store the index of the closest value\n int closest_index = 0;\n\n // Create a reduction variable to store the smallest difference between a value in x and PI\n double smallest_difference = std::numeric_limits::max();\n\n // Parallel for loop to search for the closest value\n #pragma omp parallel for reduction(min : smallest_difference) reduction(min : closest_index)\n for (int i = 0; i < static_cast(x.size()); i++) {\n // Calculate the difference between the current value and PI\n double difference = std::abs(x[i] - M_PI);\n\n // If the difference is smaller than the smallest difference found so far, update the closest index and smallest difference\n if (difference < smallest_difference) {\n closest_index = i;\n smallest_difference = difference;\n }\n }\n\n // Return the index of the closest value\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n\n#pragma omp parallel for reduction(min : min_diff) reduction(min : min_index)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n#pragma omp parallel for reduction(min : min_diff) reduction(min : min_idx)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp 
parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n #pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double min_diff = std::abs(x[0] - M_PI);\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n min_diff = diff;\n closest_index = i;\n }\n }\n }\n }\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n }\n }\n\n return min_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n #pragma omp critical\n {\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n }\n }\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double min_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n#pragma omp critical\n {\n if (diff < min_diff) {\n closest_index = i;\n min_diff = diff;\n }\n }\n }\n }\n\n return closest_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "serial", + "prompt": "/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t 
findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double minDiff = std::numeric_limits::max();\n size_t minDiffIndex = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n return minDiffIndex;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + ": 2", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_dist = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double dist = std::abs(x[i] - M_PI);\n if (dist < min_dist) {\n min_dist = dist;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", 
+ "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double closest = std::abs(x[0] - M_PI);\n size_t closest_index = 0;\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n return closest_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "hip", + "prompt": "/* Find the index of the value in the vector x that is closest to the math constant PI. Store the index in closestToPiIndex.\n Use M_PI for the value of PI.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\n__global__ void findClosestToPi(const double *x, size_t N, size_t *closestToPiIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n double minDiff = abs(x[tid] - M_PI);\n int minDiffIndex = tid;\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n double diff = abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n\n if (tid == 0) {\n *closestToPiIndex = minDiffIndex;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n double diff = fabs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < minDiff) {\n minDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "losest", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double diff = abs(x[gid] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = gid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n 
if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n double diff = abs(x[gid] - M_PI);\n if (gid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = gid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double diff = fabs(x[gid] - M_PI);\n if (gid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = gid;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0013085144, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013701526, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014205962, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00142667375, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001428834425, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n double closest = std::numeric_limits::max();\n size_t closest_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n\n // Reduce the closest values to rank 0\n MPI_Reduce(&closest_index, &closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return closest_index;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013038676, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013140788, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014308396, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00134901945, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00135729575, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014755891875, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015298898625, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00169681658125, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00225788962187, + 
"num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00349066939531, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00539256524453, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n size_t closest_index = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_index = i;\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(size);\n MPI_Gather(&closest_index, 1, MPI_UNSIGNED_LONG, closest_indices.data(), 1, MPI_UNSIGNED_LONG, 0);\n\n // Find the overall closest index\n size_t overall_closest_index = 0;\n double overall_closest_diff = std::abs(x[closest_indices[0]] - M_PI);\n for (size_t i = 1; i < size; ++i) {\n double diff = std::abs(x[closest_indices[i]] - M_PI);\n if (diff < overall_closest_diff) {\n overall_closest_diff = diff;\n overall_closest_index = closest_indices[i];\n }\n }\n\n // Return the overall closest index\n return overall_closest_index;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013093098, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013783042, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014038288, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014224903, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014294292, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015708618125, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00149933320625, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017362089, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00248621965156, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00334390903281, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00525281047187, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n\n int global_min_diff_index;\n MPI_Allreduce(&min_diff_index, &global_min_diff_index, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n\n return global_min_diff_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": 
false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.001305945, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013080452, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013959953, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00137248225, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00136783685, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00130435, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014157546, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001407833, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00141863325, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014153942, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015078313375, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00146315419375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00163110042188, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00223025632187, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00334918674766, + "num_procs": 256 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013022882, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013598952, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014261346, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014160784, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001438859425, + 
"num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015463316, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001653827175, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00164859844375, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00233558989375, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00334584047578, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0052880797418, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n size_t n = x.size();\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n for (size_t i = 1; i < n; ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013047712, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013102056, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013931395, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013980633, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00137794895, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00143349715, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00165889981875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00155757385313, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00251244103906, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032987738375, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00537619525078, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n size_t n = x.size();\n size_t closest = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < n; i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest = i;\n }\n }\n return closest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013104262, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013211298, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013505145, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00137994365, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00135485135, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001509059125, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014357400125, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00148509511563, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00245267570625, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00316630231328, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.00537379475156, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n int result;\n MPI_Reduce(&min_index, &result, 1, MPI_INT, MPI_MINLOC, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Find the index of the value in the vector that is closest to PI on each rank\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n // Reduce the closest indices to find the global closest index\n size_t globalClosestIndex;\n MPI_Reduce(&closestIndex, &globalClosestIndex, 1, MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the global closest index on rank 0\n if (rank == 0) {\n return globalClosestIndex;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013408294, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001969064, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020464727, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020257789, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002018657675, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002165082575, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00222333804375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00236422397812, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00280085420156, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00398667761563, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00576595828203, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n size_t result = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0013031476, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013080134, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013515856, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00135996895, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.001401855775, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n double closest = std::numeric_limits::max();\n size_t closest_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_idx = i;\n }\n }\n return closest_idx;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013034448, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013709016, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014462325, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014155878, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00142735105, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015658688875, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016042511375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018731195875, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00244242995781, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00313028948984, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00516544586602, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013020662, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013447044, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013353458, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013602502, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013580864, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00143880095, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001595603275, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00178527197813, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00255251166875, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033888471125, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00539256257891, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n size_t result = 0;\n double min_diff = std::numeric_limits::max();\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + 
"are_all_valid": false, + "best_sequential_runtime": 0.0013025346, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013796424, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014127184, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014617641, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001446123, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016225428375, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001539665125, + "num_procs": 32 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 64 + } + ] + }, + { + "generated_output": "\n\n size_t closest = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest = i;\n }\n }\n return closest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013042098, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014068336, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001393432, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028164975, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001360812975, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016787547625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00164143784375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00165196969375, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00242858680625, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00339083976875, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00542708770469, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n int global_min_idx;\n MPI_Allreduce(&min_idx, &global_min_idx, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return global_min_idx;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n double closest = std::numeric_limits::max();\n size_t closest_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n\n // Reduce the closest values to rank 0\n MPI_Reduce(&closest_index, &closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return 
closest_index;\n } else {\n return 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "cuda", + "prompt": "/* Find the index of the value in the vector x that is closest to the math constant PI. Store the index in closestToPiIndex.\n Use M_PI for the value of PI.\n Use CUDA to search in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\n__global__ void findClosestToPi(const double *x, size_t N, size_t *closestToPiIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n double minDiff = abs(x[tid] - M_PI);\n int minDiffIndex = tid;\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n double diff = abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n if (tid == 0) {\n *closestToPiIndex = minDiffIndex;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = fabs(x[i] - M_PI);\n if (i == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int 
tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < minDiff) {\n minDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double diff = abs(x[idx] - M_PI);\n if (idx == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = idx;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestDistance = distance;\n closestIndex = i;\n }\n }\n\n int globalClosestIndex;\n MPI_Allreduce(&closestIndex, &globalClosestIndex, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return globalClosestIndex;\n}\n", + "\n\n // Find the closest value to PI in parallel\n size_t closest_index = 0;\n double min_diff = std::numeric_limits::max();\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest_index = i;\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Allgather(&closest_index, 1, MPI::UNSIGNED_LONG, &closest_indices[0], 1, MPI::UNSIGNED_LONG);\n\n // Find the global closest index\n size_t global_closest_index = 0;\n min_diff = std::numeric_limits::max();\n for (size_t i = 0; i < closest_indices.size(); ++i) {\n double diff = std::abs(x[closest_indices[i]] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n global_closest_index = closest_indices[i];\n }\n }\n\n return global_closest_index;\n}\n", + "\n\n // Find the closest value to PI on each rank\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_index = i;\n closest_distance = distance;\n }\n }\n\n // Reduce the closest index to rank 0\n size_t global_closest_index;\n MPI_Reduce(&closest_index, &global_closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the global closest index on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double min_diff = std::numeric_limits::max();\n int min_diff_index = -1;\n\n #pragma omp parallel for reduction(min : min_diff) reduction(min : min_diff_index)\n for (int i = 0; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n\n return min_diff_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_distance)\n for (size_t i = 0; i < x.size(); ++i) {\n double distance 
= std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n\n return result;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the index of the value in the vector x that is closest to the math constant PI.\n size_t index = 0;\n double min_diff = std::numeric_limits::max();\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n index = i;\n }\n }\n\n // Return the result on rank 0.\n if (rank == 0) {\n return index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n // Get the number of ranks\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Find the closest value to PI on this rank\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_index = i;\n closest_distance = distance;\n }\n }\n\n // Reduce the closest indices and distances to find the overall closest value\n size_t global_closest_index;\n double global_closest_distance;\n MPI_Reduce(&closest_index, &global_closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n MPI_Reduce(&closest_distance, &global_closest_distance, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_distance)\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double minDiff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < minDiff) {\n 
#pragma omp critical\n if (diff < minDiff) {\n closestIndex = i;\n minDiff = diff;\n }\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t closest_idx = 0;\n double min_diff = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Initialize the index of the closest value to -1\n size_t closest_index = -1;\n\n // Initialize the minimum distance to a large value\n double min_distance = std::numeric_limits::max();\n\n // Loop over the elements of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n // Calculate the distance between the current element and PI\n double distance = std::abs(x[i] - M_PI);\n\n // Update the closest index and minimum distance if the current distance is smaller\n #pragma omp critical\n {\n if (distance < min_distance) {\n closest_index = i;\n min_distance = distance;\n }\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(num_ranks);\n MPI_Gather(&closest_index, 1, MPI_UNSIGNED_LONG, closest_indices.data(), 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Find the index with the smallest distance\n if (rank == 0) {\n size_t global_closest_index = -1;\n double global_min_distance = std::numeric_limits::max();\n for (size_t i = 0; i < num_ranks; ++i) {\n if (closest_indices[i] != -1 && x[closest_indices[i]] < global_min_distance) {\n global_closest_index = closest_indices[i];\n global_min_distance = x[closest_indices[i]];\n }\n }\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t closest_idx = 0;\n double min_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n closest_idx = i;\n min_diff = diff;\n }\n }\n }\n }\n\n return closest_idx;\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(Kokkos::View*> const& x, Kokkos::View &r, Kokkos::View &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n // Compute the Fourier transform of x\n auto y = fft_transform(x);\n\n // Extract the real and imaginary parts of the result\n r = real(y);\n i = imag(y);\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& n) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double angle = 2 * M_PI * k * n / x.extent(0);\n real += x(k).real() * cos(angle) + x(k).imag() * sin(angle);\n imag += x(k).imag() * cos(angle) - x(k).real() * sin(angle);\n }\n r(n) = real;\n i(n) = imag;\n });\n}\n", + "\n\n // Compute the FFT of x\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2.0 * M_PI * j * i / x.extent(0);\n sum += x(j) * std::exp(-std::complex(0.0, angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n real += x(j).real() * cos(angle) - x(j).imag() * sin(angle);\n imag += x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA (const int& n) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double angle = 2 * M_PI * k * n / x.extent(0);\n real += x(k).real() * cos(angle) - x(k).imag() * sin(angle);\n imag += x(k).real() * sin(angle) + x(k).imag() * cos(angle);\n }\n r(n) = real;\n i(n) = imag;\n });\n}\n", + "\n\n // Compute the DFT of x\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& n) {\n double re = 0.0;\n double im = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double theta = 2 * M_PI * k * n / x.extent(0);\n re += x(k).real() * cos(theta) + x(k).imag() * sin(theta);\n im += x(k).real() * sin(theta) - x(k).imag() * cos(theta);\n }\n r(n) = re;\n i(n) = im;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double re = 0.0;\n double im = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n re += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n im += x(j).real() * 
sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = re;\n i(i) = im;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += -x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n\n const int n = x.extent(0);\n const int n_half = n / 2;\n\n // Create a plan for the FFT\n FFTPlan plan(n);\n\n // Create a view of the input and output arrays\n View x_real = create_view(x, 0);\n View x_imag = create_view(x, 1);\n View r_view = create_view(r);\n View i_view = create_view(i);\n\n // Execute the FFT\n plan.execute(x_real, x_imag, r_view, i_view);\n\n // Normalize the output\n r_view /= n;\n i_view /= n;\n}\n", + "\nKokkos::parallel_for(Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA (const int i) {\n r(i) = 0;\n i(i) = 0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n r(i) += x(j) * cos(angle);\n i(i) -= x(j) * sin(angle);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += -x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n\n // Create a plan for the FFT\n auto plan = FFTPlan(x.extent(0), true);\n\n // Create a workspace for the FFT\n auto workspace = plan.get_workspace();\n\n // Execute the FFT\n plan.execute(x, workspace);\n\n // Extract the real and imaginary parts of the results\n r = plan.get_output_real();\n i = plan.get_output_imag();\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n // Create a team policy for parallel execution\n TeamPolicy policy(x.extent(0));\n\n // Define the kernel lambda to compute the FFT\n KernelFunctor kernel;\n kernel.template run(policy, [=](const int& i) {\n 
std::complex sum = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sum += x(j) * std::exp(-std::complex(0, angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) - x[t].imag() * sin(angle);\n sumi += x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int N = x.size();\n r.resize(N);\n i.resize(N);\n\n #pragma omp parallel for\n for (int k = 0; k < N; ++k) {\n std::complex sum = 0;\n for (int n = 0; n < N; ++n) {\n double angle = 2 * M_PI * n * k / N;\n sum += x[n] * std::polar(1.0, -angle);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const size_t n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (size_t j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const size_t n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute the fourier transform of x.\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (size_t j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = 
sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::exp(-std::complex(0, 2 * M_PI * k * t / n));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n int num_threads = omp_get_max_threads();\n int chunk_size = n / num_threads;\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int start = tid * chunk_size;\n int end = start + chunk_size;\n if (tid == num_threads - 1) {\n end = n;\n }\n\n for (int j = start; j < end; j++) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < n; k++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[k].real() * cos(angle) + x[k].imag() * sin(angle);\n imag += -x[k].real() * sin(angle) + x[k].imag() * cos(angle);\n }\n r[j] = real;\n i[j] = imag;\n }\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n std::complex omega = std::polar(1.0, -2 * M_PI * t * k / n);\n sum += x[t] * omega;\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, -angle);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumi += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute the DFT of x\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::exp(-2 * M_PI * std::complex(0, 1) * k * t / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::exp(std::complex(0, -angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n 
}\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n 
return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumi += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sumr += x[j].real() * cos(angle) + x[j].imag() * sin(angle);\n sumi += -x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int k = 0; k < n / 2; k++) {\n even[k] = x[2 * k];\n odd[k] = x[2 * k + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int k = 0; k < n / 2; k++) {\n std::complex t = w * odd[k];\n r[k] = r[k] + t.real();\n i[k] = i[k] + t.imag();\n r[k + n / 2] = r[k] - t.real();\n i[k + n / 2] = i[k] - t.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n 
if (n == 0) {\n return;\n }\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2);\n std::vector> odd(n / 2);\n for (int k = 0; k < n / 2; k++) {\n even[k] = x[2 * k];\n odd[k] = x[2 * k + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double pi = std::acos(-1);\n for (int k = 0; k < n / 2; k++) {\n double t = r[k];\n r[k] = t + std::cos(2 * pi * k / n) * r[k + n / 2] - std::sin(2 * pi * k / n) * i[k + n / 2];\n i[k] = t - std::cos(2 * pi * k / n) * r[k + n / 2] + std::sin(2 * pi * k / n) * i[k + n / 2];\n }\n}\n", + "\n\n int n = x.size();\n if (n == 0) {\n return;\n }\n r.resize(n);\n i.resize(n);\n for (int k = 0; k < n; k++) {\n double real = 0;\n double imag = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[j].real() * cos(angle) - x[j].imag() * sin(angle);\n imag += x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = real;\n i[k] = imag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "fft, 4,", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double real = 0;\n double imag = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[j].real() * cos(angle) - x[j].imag() * sin(angle);\n imag += x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r.push_back(real);\n i.push_back(imag);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sumr += x[j].real() * cos(angle) + x[j].imag() * sin(angle);\n sumi += -x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n std::vector> X(n);\n for (int k = 0; k < n; ++k) {\n for (int j = 0; j < n; ++j) {\n X[k] += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n }\n for (int i = 0; i < n; ++i) {\n r[i] = X[i].real();\n i[i] = X[i].imag();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\n__global__ void fft(const hipDoubleComplex *x, double *r, double *i, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n hipDoubleComplex sum = {0, 0};\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n if (i < N) {\n hipDoubleComplex sum = hipCreateDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex rot = hipCreateDoubleComplex(cos(angle), sin(angle));\n sum = hipCadd(sum, hipCmul(x[j], rot));\n }\n r[i] = hipCreal(sum);\n i[i] = hipCimag(sum);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; k++) {\n double arg = -2 * M_PI * k * gid / N;\n hipDoubleComplex w = {cos(arg), sin(arg)};\n sum += x[k] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n}\n", + "fft:", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[i] = sum.x;\n i[i] = sum.y;\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n size_t N2 = N / 2;\n if (tid < N2) {\n r[tid] = x[tid].x + x[tid + N2].x;\n i[tid] = x[tid].y + x[tid + N2].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * gid / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[i] = sum.x;\n i[i] = sum.y;\n}\n", + "\n\n int id = blockIdx.x * blockDim.x + threadIdx.x;\n if (id < N) {\n r[id] = x[id].real();\n i[id] = x[id].imag();\n }\n __syncbarriers();\n for (size_t n = 2; n <= N; n *= 2) {\n double a = 2 * M_PI / n;\n for (size_t k = 0; k < N; k += n) {\n double c = cos(a * k);\n double s = sin(a * k);\n for (size_t j = 0; j < n / 2; j++) {\n size_t p = k + j;\n size_t q = p + n / 2;\n double t = r[q] * c - i[q] * s;\n i[q] = r[q] * s + i[q] * c;\n r[q] = r[p] - t;\n i[p] = i[p] - t;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * i / N;\n hipDoubleComplex w = hipMakeDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = 
hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n if (i < N) {\n r[i] = x[i].x;\n i[i] = x[i].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n r[gid] = x[gid].x;\n i[gid] = x[gid].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * gid * j / N});\n sum += x[j] * w;\n }\n r[gid] = hipCreal(sum);\n i[gid] = hipCimag(sum);\n }\n}\n", + "\n\n size_t idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx >= N) return;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * idx * k / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[k] * w;\n }\n r[idx] = sum.x;\n i[idx] = sum.y;\n}\n", + "\n\n size_t k = blockIdx.x * blockDim.x + threadIdx.x;\n if (k < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0, -2 * M_PI * j * k / N));\n sum += x[j] * w;\n }\n r[k] = hipCreal(sum);\n i[k] = hipCimag(sum);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx >= N) return;\n hipDoubleComplex sum = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * idx * k / N;\n hipDoubleComplex w = hipMakeDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[idx] = creal(sum);\n i[idx] = cimag(sum);\n}\n", + "\n\n // Compute the fourier transform of x.\n hipDoubleComplex *X = hipMalloc(N);\n hipMemcpy(X, x, N * sizeof(hipDoubleComplex), hipMemcpyHostToDevice);\n hipfftPlan plan;\n hipfftPlanCreate(&plan, N, HIPFFT_C2C, HIPFFT_FORWARD);\n hipfftExecC2C(plan, X, X, HIPFFT_FORWARD);\n hipfftDestroyPlan(plan);\n hipMemcpy(r, X, N * sizeof(double), hipMemcpyDeviceToHost);\n hipMemcpy(i, X + 1, N * sizeof(double), hipMemcpyDeviceToHost);\n hipFree(X);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n int stride = hipBlockDim_x * hipGridDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * k * gid / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[k] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n}\n", + "fft," + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y);\n\n // Gather the results on rank 0\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT of z on rank 0\n if (rank == 0) {\n fft_local(z, y);\n }\n\n // Scatter the results to all ranks\n MPI_Scatter(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the results\n for (int j = 0; j < n; j++) {\n r[j] = z[j].real();\n i[j] = z[j].imag();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n y[k] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Send the DFT of x to rank 0\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse DFT of z\n if (rank == 0) {\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n x[k] += z[j] * std::exp(2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n x[k] /= n;\n r[k] = x[k].real();\n i[k] = x[k].imag();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the global result\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + 
"generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector r_local(n), i_local(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y, r_local, i_local);\n\n // Gather the results on rank 0\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < n; i++) {\n r[i] = r_local[i];\n i[i] = i_local[i];\n }\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n), i_recv(n);\n MPI_Recv(r_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n r[j] += r_recv[j];\n i[j] += i_recv[j];\n }\n }\n } else {\n MPI_Send(r_local.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i_local.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_fft(n);\n fft_local(x, local_fft);\n\n // Gather local FFTs\n std::vector> global_fft(n * size);\n MPI_Gather(local_fft.data(), n, MPI_DOUBLE, global_fft.data(), n, MPI_DOUBLE, 0);\n\n // Compute global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter global FFT\n MPI_Scatter(global_fft.data(), n, MPI_DOUBLE, local_fft.data(), n, MPI_DOUBLE, 0);\n\n // Extract real and imaginary parts\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = local_fft[j].real();\n i[j] = local_fft[j].imag();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n);\n 
std::vector> local_r(local_n), local_i(local_n);\n fft_recursive(local_x, local_r, local_i);\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].real();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n std::vector> Z(n);\n\n for (int j = 0; j < n; j++) {\n X[j] = x[j];\n }\n\n MPI_Alltoall(X.data(), 1, MPI_DOUBLE_INT, Y.data(), 1, MPI_DOUBLE_INT, MPI_COMM_WORLD);\n\n for (int j = 0; j < n; j++) {\n Z[j] = Y[j] * std::exp(-2.0 * M_PI * std::complex(0.0, 1.0) * j / n);\n }\n\n MPI_Alltoall(Z.data(), 1, MPI_DOUBLE_INT, Y.data(), 1, MPI_DOUBLE_INT, MPI_COMM_WORLD);\n\n for (int j = 0; j < n; j++) {\n std::complex sum = std::complex(0.0, 0.0);\n for (int k = 0; k < n; k++) {\n sum += Y[k] * std::exp(2.0 * M_PI * std::complex(0.0, 1.0) * j * k / n);\n }\n X[j] = sum;\n }\n\n for (int j = 0; j < n; j++) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n if (rank == 0) {\n for (int j = 1; j < size; j++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Status status;\n MPI_Request request;\n MPI_Irecv(r_recv.data(), n, MPI_DOUBLE, j, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n MPI_Irecv(i_recv.data(), n, MPI_DOUBLE, j, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> 
local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_impl(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> r_all(x.size());\n std::vector> i_all(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_all.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_all.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r = std::vector(r_all.size());\n i = std::vector(i_all.size());\n for (size_t j = 0; j < r_all.size(); ++j) {\n r[j] = r_all[j].real();\n i[j] = r_all[j].imag();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y);\n\n // Gather the results on rank 0\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < size; i++) {\n std::vector> y_i(n);\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &y_i[0], n, MPI_DOUBLE_COMPLEX, i, MPI_COMM_WORLD);\n for (int j = 0; j < n; j++) {\n r[j] += y_i[j].real();\n i[j] += y_i[j].imag();\n }\n }\n } else {\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n 
MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); j++) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n if (n != (1 << (int)log2(n))) {\n throw std::invalid_argument(\"x must have a power of 2 length\");\n }\n\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = x[i];\n }\n\n for (int s = 1; s <= (int)log2(n); s++) {\n int m = 1 << s;\n int m2 = m / 2;\n double theta = 2 * M_PI / m;\n std::complex w(cos(theta), sin(theta));\n std::complex wn(1, 0);\n for (int j = 0; j < m2; j++) {\n for (int k = j; k < n; k += m) {\n std::complex t = wn * y[k + m2];\n y[k + m2] = y[k] - t;\n y[k] = y[k] + t;\n }\n wn = wn * w;\n }\n }\n\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n r[i] = y[i].real();\n i[i] = y[i].imag();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_impl(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_r(n_local);\n std::vector> local_i(n_local);\n\n // Compute local FFT\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(n);\n std::vector> global_i(n);\n MPI_Gather(local_r.data(), n_local, MPI_DOUBLE, global_r.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n_local, MPI_DOUBLE, global_i.data(), 
n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of the transform\n std::vector> local_r(n), local_i(n);\n for (int j = 0; j < n; ++j) {\n std::complex sum_r(0, 0), sum_i(0, 0);\n for (int k = 0; k < n; ++k) {\n double angle = -2 * M_PI * j * k / n;\n sum_r += std::exp(std::complex(0, angle)) * x[k];\n sum_i += std::exp(std::complex(0, angle)) * x[k] * std::complex(0, 1);\n }\n local_r[j] = sum_r;\n local_i[j] = sum_i;\n }\n\n // Gather the local transforms into a global array\n std::vector> global_r(n), global_i(n);\n MPI_Allgather(local_r.data(), n, MPI_DOUBLE_COMPLEX, global_r.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n MPI_Allgather(local_i.data(), n, MPI_DOUBLE_COMPLEX, global_i.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the global transform\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * n / size, x.begin() + (rank + 1) * n / size);\n std::vector> local_r(n / size), local_i(n / size);\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(n), i_gathered(n);\n MPI_Gather(local_r.data(), n / size, MPI_DOUBLE, r_gathered.data(), n / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n / size, MPI_DOUBLE, i_gathered.data(), n / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(n);\n i = std::vector(n);\n for (int j = 0; j < n; j++) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n\n // Compute the local Fourier transform\n std::vector> local_x(n);\n std::vector> local_r(n);\n std::vector> local_i(n);\n for (int j = 0; j < n; ++j) {\n local_x[j] = x[j];\n }\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(n * size);\n std::vector> i_gathered(n * size);\n MPI_Gather(local_r.data(), n, MPI_DOUBLE_COMPLEX, r_gathered.data(), n, MPI_DOUBLE_COMPLEX, 0, 
MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n, MPI_DOUBLE_COMPLEX, i_gathered.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n for (int j = 0; j < n; ++j) {\n r[j] = r_gathered[j].real();\n i[j] = i_gathered[j].real();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Allgather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), local_r.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n MPI_Allgather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), local_i.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].real();\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\n__global__ void fft(const cuDoubleComplex *x, double *r, double *i, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) {\n return;\n }\n double sum_r = 0.0;\n double sum_i = 0.0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * idx / N;\n sum_r += x[k].x * cos(angle) + x[k].y * sin(angle);\n sum_i += -x[k].x * sin(angle) + x[k].y * cos(angle);\n }\n r[idx] = sum_r;\n i[idx] = sum_i;\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t k = blockIdx.x * blockDim.x + threadIdx.x;\n if (k < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * k * j / N;\n cuDoubleComplex rot = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * rot;\n }\n r[k] = creal(sum);\n i[k] = cimag(sum);\n }\n}\n", + "\n\n // Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n // Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n // Example:\n\n // input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n // output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[gid] = creal(sum);\n i[gid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = block.thread.x + block.block.x * block.grid.x;\n if (tid >= N) {\n return;\n }\n double re = 0, im = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * tid * k / N;\n re += x[k] * cos(angle);\n im += x[k] * sin(angle);\n }\n r[tid] = re;\n i[tid] = im;\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i0 = tid + bid * blockDim.x;\n if (i0 < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i0 * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i0] = creal(sum);\n i[i0] = cimag(sum);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n int stride = blockDim.x * gridDim.x;\n for (int k = tid; k < N; k += stride) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * k / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(x[j], term));\n }\n r[k] = sum.x;\n i[k] = sum.y;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * k * tid / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx >= N) return;\n\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * idx * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[idx] = creal(sum);\n i[idx] = cimag(sum);\n}\n", + "\n\n size_t tid = 
threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = sum.x;\n i[tid] = sum.y;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n size_t offset = tid % N;\n double sumr = 0, sumi = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * offset / N;\n sumr += real(x[k]) * cos(angle) + imag(x[k]) * sin(angle);\n sumi += -real(x[k]) * sin(angle) + imag(x[k]) * cos(angle);\n }\n r[tid] = sumr;\n i[tid] = sumi;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid >= N) return;\n\n double re = 0, im = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n re += x[k].x * cos(angle) - x[k].y * sin(angle);\n im += x[k].x * sin(angle) + x[k].y * cos(angle);\n }\n r[tid] = re;\n i[tid] = im;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i0 = tid + bid * blockDim.x;\n if (i0 < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * i0 / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[i0] = creal(sum);\n i[i0] = cimag(sum);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> x_local(x.begin() + start, x.begin() + end);\n std::vector> y_local(n_local);\n std::vector> y(n);\n\n // Compute the DFT of the local data\n fft_local(x_local, y_local);\n\n // Gather the results from all ranks\n MPI_Allgather(y_local.data(), n_local, MPI_DOUBLE_COMPLEX, y.data(), n_local, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Compute the global DFT\n fft_global(y, n);\n\n // Extract the real and imaginary parts of the result\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = y[j].real();\n i[j] = y[j].imag();\n }\n\n // Send the result to rank 0\n if (rank != 0) {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n // Receive the results from all ranks\n for (int i = 1; i < size; i++) {\n MPI_Recv(r.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_fft(n);\n fftw_plan plan = fftw_plan_dft_1d(n, reinterpret_cast(&x[0]), reinterpret_cast(&local_fft[0]), FFTW_FORWARD, FFTW_ESTIMATE);\n fftw_execute(plan);\n fftw_destroy_plan(plan);\n\n // Gather local FFT results\n std::vector> global_fft(n * size);\n MPI_Gather(local_fft.data(), n, MPI_DOUBLE, global_fft.data(), n, MPI_DOUBLE, 0);\n\n // Compute global FFT\n fftw_plan plan2 = fftw_plan_dft_1d(n * size, reinterpret_cast(&global_fft[0]), reinterpret_cast(&global_fft[0]), FFTW_BACKWARD, FFTW_ESTIMATE);\n fftw_execute(plan2);\n fftw_destroy_plan(plan2);\n\n // Extract real and imaginary parts\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = real(global_fft[j] / n);\n i[j] = imag(global_fft[j] / n);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(local_n);\n std::vector> local_r(local_n);\n std::vector> local_i(local_n);\n std::copy(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n, local_x.begin());\n fft_serial(local_x, local_r, local_i);\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int n_local_padded = n_local * 2;\n\n std::vector> x_local(n_local);\n std::vector> x_local_padded(n_local_padded);\n std::vector> y_local(n_local_padded);\n std::vector> y(n);\n\n MPI_Scatter(x.data(), n_local, MPI_DOUBLE_COMPLEX, x_local.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Pad with zeros\n for (int i = 0; i < n_local; i++) {\n x_local_padded[i] = x_local[i];\n x_local_padded[i + n_local] = 0;\n }\n\n // Compute FFT in parallel\n fft_recursive(x_local_padded, y_local);\n\n // Gather results\n MPI_Gather(y_local.data(), n_local_padded, MPI_DOUBLE_COMPLEX, y.data(), n_local_padded, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < n; i++) {\n r[i] = y[i].real();\n i[i] = y[i].imag();\n }\n }\n}\n", + "\n\n // Compute the FFT of x.\n std::vector> X(x);\n fft_mpi(X);\n\n // Store the real and imaginary parts of the FFT in r and i.\n r.resize(X.size());\n i.resize(X.size());\n for (size_t j = 0; j < X.size(); ++j) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n // Gather the results on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector r_all(X.size() * MPI::COMM_WORLD.Get_size());\n std::vector i_all(X.size() * MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Gather(&r[0], X.size(), MPI::DOUBLE, &r_all[0], X.size(), MPI::DOUBLE, 0);\n MPI::COMM_WORLD.Gather(&i[0], X.size(), MPI::DOUBLE, &i_all[0], X.size(), MPI::DOUBLE, 0);\n r = r_all;\n i = i_all;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_all(x.size());\n std::vector> 
i_all(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_all.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_all.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); j++) {\n r[j] = r_all[j].real();\n i[j] = r_all[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n std::complex sum = std::complex(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n X[k] = sum;\n }\n\n // Compute the inverse DFT of X\n for (int k = 0; k < n; k++) {\n std::complex sum = std::complex(0, 0);\n for (int j = 0; j < n; j++) {\n sum += X[j] * std::exp(2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n Y[k] = sum / n;\n }\n\n // Store the real and imaginary parts of Y in r and i\n for (int k = 0; k < n; k++) {\n r[k] = Y[k].real();\n i[k] = Y[k].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Recv(&r_recv[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(&i_recv[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(&r[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(&i[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n y[k] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j] += y[k] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Store the real and imaginary parts of the DFT in r and i\n for (int j = 0; j < n; j++) {\n r[j] = z[j].real();\n i[j] = z[j].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_i(n);\n std::vector i_i(n);\n MPI_Recv(r_i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n r[j] += r_i[j];\n i[j] += i_i[j];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute local FFTs\n std::vector> local_r(n);\n std::vector> local_i(n);\n fft_local(x, local_r, local_i);\n\n // Gather local FFTs\n std::vector> global_r(n * size);\n std::vector> global_i(n * size);\n MPI_Gather(local_r.data(), n, MPI_DOUBLE_COMPLEX, global_r.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n, MPI_DOUBLE_COMPLEX, global_i.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute global FFT\n if (rank == 0) {\n 
fft_global(global_r, global_i);\n }\n\n // Scatter global FFT\n MPI_Scatter(global_r.data(), n, MPI_DOUBLE_COMPLEX, r.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Scatter(global_i.data(), n, MPI_DOUBLE_COMPLEX, i.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local partial sums\n std::vector> local_r(n, 0.0);\n std::vector> local_i(n, 0.0);\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n double angle = -2 * M_PI * j * k / n;\n local_r[k] += x[j] * std::cos(angle);\n local_i[k] += x[j] * std::sin(angle);\n }\n }\n\n // Reduce partial sums to rank 0\n std::vector> global_r(n, 0.0);\n std::vector> global_i(n, 0.0);\n MPI_Reduce(local_r.data(), n, MPI_DOUBLE, global_r.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Reduce(local_i.data(), n, MPI_DOUBLE, global_i.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts on rank 0\n if (rank == 0) {\n for (int k = 0; k < n; k++) {\n r[k] = global_r[k].real();\n i[k] = global_i[k].imag();\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n\n // Compute the FFT of x\n #pragma opm parallel for\n for (int j = 0; j < n; j++) {\n X[j] = x[j];\n }\n fft(X);\n\n // Compute the real and imaginary parts of the FFT\n #pragma opm parallel for\n for (int j = 0; j < n; j++) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int j = 1; j < size; j++) {\n std::vector r_j(n);\n std::vector i_j(n);\n MPI_Gather(&r[0], n, MPI_DOUBLE, &r_j[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&i[0], n, MPI_DOUBLE, &i_j[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (int k = 0; k < n; k++) {\n r[k] += r_j[k];\n i[k] += i_j[k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Allgather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n 
MPI_Allgather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_r(n_local);\n std::vector> local_i(n_local);\n\n // Compute the local FFT\n fft_local(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> r_gathered(n);\n std::vector> i_gathered(n);\n MPI_Gather(local_r.data(), n_local, MPI_DOUBLE, r_gathered.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n_local, MPI_DOUBLE, i_gathered.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r = std::vector(n);\n i = std::vector(n);\n for (int j = 0; j < n; j++) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(local_n);\n std::vector> local_r(local_n);\n std::vector> local_i(local_n);\n\n std::copy(x.begin() + local_start, x.begin() + local_end, local_x.begin());\n\n #pragma omp parallel for\n for (int j = 0; j < local_n; j++) {\n double sum_r = 0.0;\n double sum_i = 0.0;\n for (int k = 0; k < n; k++) {\n double theta = 2 * M_PI * k / n;\n sum_r += x[k] * cos(theta * j);\n sum_i += x[k] * sin(theta * j);\n }\n local_r[j] = sum_r;\n local_i[j] = sum_i;\n }\n\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> 
i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n y[k] = sum;\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j * n + k] = y[k * n + j];\n }\n }\n\n // Compute the DFT of z\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += z[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n y[k] = sum;\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j * n + k] = y[k * n + j];\n }\n }\n\n // Store the real and imaginary parts of the DFT in r and i\n for (int k = 0; k < n; k++) {\n r[k] = z[k].real();\n i[k] = z[k].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Recv(r_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const size_t n = x.extent(0);\n const size_t n_half = n / 2;\n\n // Compute the FFT of the first half of x.\n auto x_half = x.slice(0, n_half);\n fft(x_half);\n\n // Compute the FFT of the second half of x.\n auto x_half_plus = x.slice(n_half, n_half);\n fft(x_half_plus);\n\n // Compute the imaginary conjugate of the second half of x.\n for (size_t i = 0; i < n_half; ++i) {\n x_half_plus(i) = conj(x_half_plus(i));\n }\n\n // Swap the first and second halves of x.\n auto temp = x_half;\n x_half = x_half_plus;\n x_half_plus = temp;\n\n // Compute the inverse FFT of x.\n fft(x, Inverse);\n\n // Normalize the FFT by dividing by n.\n for (size_t i = 0; i < n; ++i) {\n x(i) /= n;\n }\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n 
Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const size_t n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n std::vector> w(n);\n std::vector> u(n);\n std::vector> t(n);\n\n#pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n y[i] = x[i];\n }\n\n for (size_t i = 0; i < n; i++) {\n for (size_t j = 0; j < n; j++) {\n w[j] = std::exp(std::complex(0, -2 * M_PI * i * j / n));\n }\n for (size_t j = 0; j < n; j++) {\n z[j] = y[j] * w[j];\n }\n for (size_t j = 0; j < n; j++) {\n u[j] = z[j];\n }\n for (size_t j = 0; j < n / 2; j++) {\n t[j] = u[j] + std::conj(u[n - j - 1]);\n t[n - j - 1] = std::complex(0, 1) * (u[j] - std::conj(u[n - j - 1]));\n }\n for (size_t j = 0; j < n; j++) {\n y[j] = t[j];\n }\n }\n\n#pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n x[i] = y[i];\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // Compute the fourier transform of x in-place.\n fft(x);\n\n // Compute the imaginary conjugate of each value.\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n int size = 1 << log_n;\n std::vector> y(size);\n for (int i = 0; i < n; i++) {\n y[i] = x[i];\n }\n for (int i = n; i < size; i++) {\n y[i] = 0;\n }\n std::vector> roots(size / 2);\n for (int i = 0; i < size / 2; i++) {\n double angle = 2 * M_PI * i / size;\n roots[i] = std::complex(cos(angle), sin(angle));\n }\n for (int i = 0; i < log_n; i++) {\n int step = 1 << i;\n int half_step = step / 2;\n for (int j = 0; j < size; j += step) {\n for (int k = 0; k < half_step; k++) {\n std::complex u = y[j + k];\n std::complex v = y[j + k + half_step] * roots[size / (2 * step) * k];\n y[j + k] = u + v;\n y[j + k + half_step] = u - v;\n }\n }\n }\n for (int i = 0; i < size; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int numThreads = omp_get_max_threads();\n const int chunkSize = n / numThreads;\n\n #pragma omp parallel\n {\n #pragma omp for schedule(static, chunkSize)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n }\n}\n", + "\n\n const int n = x.size();\n const int lg = __lg(n);\n int i0 = 0;\n for (int l = 2; l <= n; l <<= 1) {\n for (int i = 0; i < n; i += l) {\n for (int j = 0; j < l / 2; j++) {\n std::complex u = x[i0 + j];\n std::complex v = x[i0 + j + l / 2];\n x[i0 + j] = u + v;\n x[i0 + j + l / 2] = std::complex(u.real() - v.real(), -v.imag());\n }\n i0 += l;\n }\n i0 = 0;\n }\n for (int i = 0; i < n; i++) {\n int j = __builtin_reverse(i, lg);\n if (i < j) {\n std::swap(x[i], x[j]);\n }\n }\n}\n", + "\n\n int n = x.size();\n std::vector> y(n);\n for (int i 
= 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n fft(y);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n // Compute the Fourier transform of x in-place.\n fft(x);\n\n // Compute the imaginary conjugate of each value.\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x\n fft(x, y);\n\n // Compute the conjugate of each value in y\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n\n // Copy the conjugate values back into x\n x = z;\n}\n", + "\n\n const int n = x.size();\n if (n == 1) return;\n std::vector> even(n / 2), odd(n / 2);\n#pragma omp parallel for\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int n_threads = omp_get_max_threads();\n const int n_threads_per_group = 4;\n const int n_groups = n_threads / n_threads_per_group;\n\n std::vector> y(n);\n std::vector> z(n);\n\n #pragma omp parallel num_threads(n_threads)\n {\n const int tid = omp_get_thread_num();\n const int group_id = tid / n_threads_per_group;\n const int local_id = tid % n_threads_per_group;\n\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n y[rev] = x[i];\n }\n\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const double omega = 2 * M_PI / m;\n const std::complex w(cos(omega), sin(omega));\n\n #pragma omp for schedule(static)\n for (int k = 0; k < n; k += m) {\n std::complex wn(1, 0);\n for (int j = 0; j < m / 2; j++) {\n const std::complex t1 = y[k + j];\n const std::complex t2 = wn * y[k + j + m / 2];\n y[k + j] = t1 + t2;\n y[k + j + m / 2] = t1 - t2;\n wn *= w;\n }\n }\n }\n\n #pragma omp for schedule(static)\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n }\n\n x = z;\n}\n", + "\n\n const int n = x.size();\n const int n_half = n / 2;\n const double pi = 3.14159265358979323846;\n\n // Reverse-bit permutation\n for (int i = 0; i < n; i++) {\n int j = 0;\n for (int k = 0; k < n_half; k++) {\n if (i & (1 << k)) {\n j |= (1 << (n_half - k - 1));\n }\n }\n if (i < j) {\n std::swap(x[i], x[j]);\n }\n }\n\n // Compute the FFT in-place\n for (int s = 1; s <= n_half; s *= 2) {\n const double theta = pi / s;\n const std::complex wn(std::cos(theta), std::sin(theta));\n for (int i = 0; i < n; i += 2 * s) {\n std::complex w(1, 0);\n for (int j = 0; j < s; j++) {\n const std::complex u = x[i + j];\n const std::complex v = x[i + j + s] * w;\n x[i + j] = u + v;\n x[i + j + s] = u - v;\n w *= wn;\n }\n }\n }\n\n // Conjugate the imaginary part of each value\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x\n fft(x, y);\n\n // Compute the imaginary conjugate of each value in y\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n\n // Copy the values of z back into x\n x = z;\n}\n", + "\n\n const int n = x.size();\n const int num_threads = omp_get_max_threads();\n const int 
chunk_size = n / num_threads;\n\n #pragma omp parallel for schedule(static, chunk_size)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int nthreads = omp_get_max_threads();\n const int chunksize = n / nthreads;\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n if (i < rev) {\n std::swap(x[i], x[rev]);\n }\n }\n\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const double wn = std::polar(1.0, -2.0 * M_PI / m);\n const double w = 1.0;\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int j = 0; j < n; j += m) {\n double wr = w;\n double wi = 0.0;\n for (int k = 0; k < m / 2; k++) {\n const std::complex t = x[j + k + m / 2] * std::complex(wr, wi);\n x[j + k + m / 2] = x[j + k] - t;\n x[j + k] += t;\n const double newwr = wr * w.real() - wi * w.imag();\n const double newwi = wr * w.imag() + wi * w.real();\n wr = newwr;\n wi = newwi;\n }\n }\n }\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n std::vector> y(n);\n std::vector> w(n);\n for (int i = 0; i < n; i++) {\n w[i] = std::polar(1.0, -2 * M_PI * i / n);\n }\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * w[j * k % n];\n }\n y[k] = sum;\n }\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Take the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int n_threads = omp_get_max_threads();\n\n // Reverse bits\n #pragma omp parallel for schedule(static)\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n if (i < rev) {\n std::swap(x[i], x[rev]);\n }\n }\n\n // Compute the FFT\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const int m2 = m >> 1;\n const double theta = 2 * M_PI / m;\n std::complex w(1, 0);\n std::complex wn(std::cos(theta), std::sin(theta));\n\n #pragma omp parallel for schedule(static)\n for (int k = 0; k < n; k += m) {\n for (int j = 0; j < m2; j++) {\n std::complex t = w * x[k + j + m2];\n x[k + j + m2] = x[k + j] - t;\n x[k + j] += t;\n }\n w *= wn;\n }\n }\n\n // Conjugate the imaginary part\n #pragma omp parallel for schedule(static)\n for (int i = 0; i < n; i++) {\n x[i].imag(-x[i].imag());\n }\n}\n", + "\n\n // Compute the FFT in-place\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int num_threads = omp_get_max_threads();\n const int chunk_size = n / num_threads;\n\n #pragma omp parallel for schedule(static, chunk_size)\n for (int i = 0; i < n; ++i) {\n x[i] = std::conj(x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x in-place. 
Return the imaginary conjugate of each value.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) return;\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::polar(1.0, ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n x[i][1] = -x[i][1];\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex tmp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(tmp);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n return;\n }\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n fft(y);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) return;\n std::vector> even = x;\n std::vector> odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const size_t n = x.size();\n if (n <= 1) return;\n\n std::vector> even = x[std::slice(0, n / 2, 2)];\n std::vector> odd = x[std::slice(1, n / 2, 2)];\n\n fftConjugate(even);\n fftConjugate(odd);\n\n for (size_t i = 0; i < n / 2; 
++i) {\n std::complex t = std::exp(std::complex(0, -2 * M_PI * i / n)) * odd[i];\n x[i] = even[i] + t;\n x[i + n / 2] = even[i] - t;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n for (auto &v : x) {\n v = std::conj(v);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fftConjugate(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[N - 1 - i].x;\n x[i].y = -x[N - 1 - i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = hipThreadIdx.x;\n if (i < size) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[N - 1 - i].x;\n x[i].y = -x[N - 1 - i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * 
hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); ++i) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector> y_recv(x.size());\n MPI_Recv(y_recv.data(), y_recv.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < y_recv.size(); ++j) {\n y[j] += y_recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT of x in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int log2n = 0;\n while ((1 << log2n) < n) log2n++;\n\n std::vector> y(n);\n for (int i = 0; i < n; i++) y[i] = x[i];\n\n for (int s = 1; s <= log2n; s++) {\n int m = 1 << s;\n int m2 = m / 2;\n double theta = 2 * M_PI / m;\n std::complex w(cos(theta), sin(theta));\n std::complex wn(1, 0);\n for (int j = 0; j < m2; j++) {\n for (int k = j; k < n; k += m) {\n std::complex t = wn * y[k + m2];\n y[k + m2] = y[k] - t;\n y[k] = y[k] + t;\n }\n wn = wn * w;\n }\n }\n\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": ".", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t n = x.size();\n std::transform(x.begin(), x.end(), x.begin(), [](std::compelex z) { return std::compelex(z.re, -z.im); });\n std::sort(x.begin(), x.end(), [](std::compelex a, std::compelex b) { return std::im(a) < std::im(b); });\n std::transform(x.begin(), x.end(), x.begin(), [](std::compelex z) { return std::compelex(z.re, -z.im); });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the global FFT\n if (rank == 0) {\n fft(global_fft, x);\n }\n\n // Scatter the global FFT\n MPI_Scatter(x.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the conjugate of the local FFT\n for (size_t i = 0; i < local_fft.size(); i++) {\n local_fft[i] = std::conj(local_fft[i]);\n }\n\n // Gather the conjugate FFTs\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the inverse global FFT\n if (rank == 0) {\n ifft(global_fft, x);\n }\n\n // Scatter the inverse global FFT\n MPI_Scatter(x.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier 
transform\n std::vector> local_fft = fft(x);\n\n // Gather the local Fourier transforms on rank 0\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (auto &value : global_fft) {\n value = std::conj(value);\n }\n }\n\n // Scatter the imaginary conjugate Fourier transforms to all ranks\n MPI_Scatter(global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse Fourier transform\n x = ifft(local_fft);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv(x.size());\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n y[j] += recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of the result\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results from all ranks\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place on each rank\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + 
"source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x, y);\n\n // Conjugate the local FFT\n for (int i = 0; i < n_local; i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the local FFTs\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the result to the input vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place on each rank\n std::fft(x.begin(), x.end());\n\n // Conjugate the imaginary part of each value\n for (auto &v : x) {\n v.imag() = -v.imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, y.data(), x.size(), MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n x = std::move(y);\n } else {\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, nullptr, 0, MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft(x);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (auto &value : global_fft) {\n value = std::conj(value);\n }\n }\n\n // Scatter the global FFT back to each rank\n MPI_Scatter(global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the local FFT on rank 0\n if (rank == 0) {\n x = local_fft;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> y_recv(x.size());\n MPI_Recv(y_recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, 
MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < y.size(); j++) {\n y[j] += y_recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n MPI_FFT(x.data(), 1, x.size(), MPI_COMPLEX, MPI_FORWARD, MPI_COMM_WORLD, y.data(), &y.size());\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n x = y;\n for (int i = 1; i < size; i++) {\n std::vector> tmp(x.size());\n MPI_Recv(tmp.data(), tmp.size(), MPI_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(y.data(), y.size(), MPI_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv(x.size());\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n y[j] += recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT of x in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int 
n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of the local data\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the local data to rank 0\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the gathered data to the output vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft(x);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (int i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the global FFT back to each rank\n MPI_Scatter(global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = local_fft;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fftConjugate(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[N - i - 1].x;\n x[i].y = -x[N - i - 1].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n // Compute the FFT\n fft(x, N);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < N; i++) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "fft{4,0}", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid].x = x[tid].x;\n x[tid].y = -x[tid].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = cuConjf(x[i]);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[N - i - 1].x;\n x[i].y = -x[N - i - 1].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = make_cuDoubleComplex(creal(x[i]), -cimag(x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data in parallel\n fft(x, start, end);\n\n // Conjugate the imaginary part of the local data\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i].imag() = -x[i].imag();\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[start], n_local, MPI_DOUBLE_COMPLEX, &x[0], n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + ".4.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x.begin() + rank * n_local, x.begin() + rank * n_local + n_local, y.begin());\n\n // Conjugate the local FFT\n for (int i = 0; i < n_local; i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the local FFTs\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the result to rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel using MPI and OpenMP\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag() = -x[i].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform local FFT on each rank\n std::vector> localX = x;\n fft(localX);\n\n // Gather all the local FFT results on rank 0\n std::vector> globalX(x.size());\n MPI_Gather(localX.data(), localX.size(), MPI_DOUBLE_COMPLEX, globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value on rank 0\n if (rank == 0) {\n for (auto &value : globalX) {\n value = std::conj(value);\n }\n }\n\n // Scatter the imaginary conjugate results back to each rank\n MPI_Scatter(globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, x.data(), 
x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform local FFT on each rank\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather all the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (int i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the imaginary conjugate FFTs back to each rank\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Perform inverse FFT on each rank\n ifft(local_fft, x);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of the global FFT\n if (rank == 0) {\n for (size_t i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the imaginary conjugate of the global FFT to all ranks\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT\n ifft(local_fft, x);\n\n // Gather the local FFTs on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel 
for\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results on rank 0\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the results to the input vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n MPI_Gather(y.data(), n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> localX = x;\n fft(localX);\n\n // Conjugate the local FFT\n for (auto &v : localX) {\n v = std::conj(v);\n }\n\n // Gather the local FFTs on rank 0\n std::vector> globalX(x.size());\n MPI_Gather(localX.data(), localX.size(), MPI_DOUBLE_COMPLEX, globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, store the global FFT in x\n if (rank == 0) {\n x = globalX;\n }\n}\n", + "\n\n // Compute the FFT in parallel using MPI and OpenMP\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(&recv, x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x, x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(recv.data(), 
x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT on each rank\n std::vector> y(x.size());\n fft(x, y);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < y.size(); i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n x.resize(y.size());\n MPI_Gather(y.data(), y.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the conjugate of the global FFT\n if (rank == 0) {\n for (size_t i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the conjugate global FFT\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT\n ifft(local_fft, x);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary parts\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(Kokkos::View*> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x[i] = x[i] / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x[i] = std::conj(x[i]);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / static_cast(x.extent(0));\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n // compute fourier transform\n fft(x);\n\n // compute inverse fourier transform\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", 
x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use OpenMP to compute in parallel.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n fft(x);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugated input\n fft(x);\n\n // compute the conjugate of the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "-", + "\n\n // Compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated input\n fft(x);\n\n // Conjugate the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Divide the output by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n // conjugate the complex numbers\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute forward FFT\n fft(x);\n\n // conjugate the complex numbers again\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide by the size of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n fft(x);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n #pragma openmp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n fft(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n fft(x);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // divide by n to normalize\n const double n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] /= n;\n }\n}\n", + "\n\n // compute the forward FFT\n fft(x);\n\n // compute the inverse FFT\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute the forward FFT again to get the inverse FFT\n fft(x);\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n #pragma omp 
parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n // compute the conjugate of each element\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugated vector\n fft(x);\n\n // divide each element by the size of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // compute the conjugate of each element\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n const size_t n = x.size();\n std::vector> y(n);\n std::copy(x.begin(), x.end(), y.begin());\n fft(y);\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n y[i] = std::conj(y[i]) / n;\n }\n std::reverse(y.begin(), y.end());\n fft(y);\n std::copy(y.begin(), y.end(), x.begin());\n}\n", + "\n\n fft(x);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute forward fft\n fft(x);\n\n // compute inverse fft\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute forward fft again to get inverse fft\n fft(x);\n}\n", + "\n\n // compute the conjugate of x\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugate of x\n fft(x);\n\n // compute the conjugate of x again\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide each element of x by the size of x\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n fft(x);\n const double n = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // Compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the forward FFT\n fft(x);\n\n // Divide each element by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Compute the conjugate of the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "serial", + "prompt": "/* forward declare fft. 
computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n for (auto &xi : x) {\n xi /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v) / x.size();\n }\n std::reverse(x.begin(), x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi) / x.size();\n }\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v = std::conj(v) / x.size();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "hip", + "prompt": "/* Compute the inverse fourier transform of x in-place.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\n__global__ void ifft(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i >= N) return;\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipDoubleComplex(cos(2 * M_PI * i * j / N),\n -sin(2 * M_PI * i * j / N));\n sum += w * x[j];\n }\n x[i] = sum / N;\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n hipDoubleComplex c = {cos(angle), sin(angle)};\n sum = hipCadd(sum, hipCmul(x[j], c));\n }\n x[i] = hipCdiv(sum, N);\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (int j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0.0, 0.0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex c = {cos(angle), sin(angle)};\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i >= N) return;\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex t = {cos(angle), sin(angle)};\n sum += x[j] * t;\n }\n x[i] = sum / N;\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n hipDoubleComplex w = hipDoubleComplex(cos(2 * M_PI * i * j / N),\n sin(2 * M_PI * i * j / N));\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0.0, 0.0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0.0, -2.0 * M_PI * i * j / N));\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 
* M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex t = {cos(angle), sin(angle)};\n sum = hipCadd(sum, hipCmul(x[j], t));\n }\n x[i] = hipCdiv(sum, N);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex term = hipExp(hipCreateDoubleComplex(0, -2 * M_PI * i * j / N));\n sum = hipAdd(sum, hipMul(x[j], term));\n }\n x[i] = hipDiv(sum, hipCreateDoubleComplex(N, 0));\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0, -2 * M_PI * i * j / N));\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex c = hipDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n size_t j = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (i < N && j < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double theta = 2 * M_PI * (i * k / N);\n sum.x += x[k].x * cos(theta) + x[k].y * sin(theta);\n sum.y += -x[k].x * sin(theta) + x[k].y * cos(theta);\n }\n sum.x /= N;\n sum.y /= N;\n x[i * N + j] = sum;\n }\n}\n", + "\n\n size_t i = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(recv_buffer.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // 
Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, 
MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, y.data(), x.size(),\n MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, nullptr, 0,\n MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // compute the FFT\n fft(x);\n\n // compute the inverse FFT\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n\n // normalize the result\n int n = x.size();\n for (auto &xi : x) {\n xi /= n;\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(n);\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, y.data(), n, MPI::DOUBLE, 0);\n x = y;\n } else {\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, y.data(), x.size(),\n MPI::DOUBLE, 0);\n x = y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 
NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= num_ranks;\n }\n\n // gather the results on rank 0\n if (my_rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n\n // normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // compute the fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> result(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, result.data(),\n x.size(), MPI_DOUBLE_COMPLEX, 0);\n x = result;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n 
// normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&recv_buffer[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute the inverse fft in parallel\n fft(x);\n\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // compute the fourier transform\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fourier 
transform\n fft(x);\n\n // normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(n);\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, y.data(), n, MPI::DOUBLE, 0);\n x = y;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "cuda", + "prompt": "/* Compute the inverse fourier transform of x in-place.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\n__global__ void ifft(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n sum += x[j] * cuCexp(make_cuDoubleComplex(0, -2 * M_PI * i * j / N));\n }\n x[i] = sum / make_cuDoubleComplex(N, 0);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = cuCdiv(x[i], N);\n x[i] = cuCmulf(x[i], cuCexp(cuCmulf(cuCmul(cuCimagf(x[i]), -2 * M_PI), i / N)));\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= size) {\n return;\n }\n cuDoubleComplex result = cuCexp(cuCmul(cuDoubleComplex(0, -2 * M_PI * idx / size), x[idx]));\n x[idx] = result;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n cuDoubleComplex term = x[j] * cexp(-2 * M_PI * i * j / N);\n sum = cuCadd(sum, term);\n }\n x[i] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= N) return;\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double re = x[i].x;\n double im = x[i].y;\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n x[i].x += re * cos(angle) - im * sin(angle);\n x[i].y += re * 
sin(angle) + im * cos(angle);\n }\n x[i].x /= N;\n x[i].y /= N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = x[i] / N;\n x[i] = cuCmul(x[i], make_cuDoubleComplex(1.0, 0.0));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * term;\n }\n x[i] = sum / N;\n }\n}\n", + "_t_ = 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= N) return;\n cuDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = {cos(angle), sin(angle)};\n sum += c * x[j];\n }\n x[i] = sum / N;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform FFT on each rank\n fft(x);\n\n // Gather results from all ranks to rank 0\n std::vector> gathered_x;\n if (rank == 0) {\n gathered_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n // Compute the conjugate of x\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated x\n fft(x);\n\n // Normalize the result\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double factor = 1.0 / size;\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= factor;\n }\n\n // If rank 0, gather the results from all ranks\n if (rank == 0) {\n std::vector> recv_buffer(x.size() * size);\n MPI_Allgather(x.data(), x.size(), MPI_COMPLEX, recv_buffer.data(), x.size(), MPI_COMPLEX, MPI_COMM_WORLD);\n x = recv_buffer;\n }\n\n // Compute the inverse FFT of the normalized result\n fft(x);\n\n // Conjugate the result again to get the original values\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // compute the fourier transform\n fft(x);\n\n // compute the inverse fourier transform\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, y.data(), x.size(), MPI::DOUBLE, 0);\n x = y;\n } else {\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide the result among the ranks\n std::vector> recv(x.size() / size);\n MPI_Allgather(&x[rank * x.size() / size], x.size() / size, MPI_COMPLEX, &recv[0], x.size() / size, MPI_COMPLEX, MPI_COMM_WORLD);\n\n // only rank 0 has the complete result\n if (rank == 0) {\n x = recv;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide by the number of ranks to normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buffer[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the inverse fourier transform of x in-place.\n // Use MPI and OpenMP to compute in parallel.\n // Assume MPI has already been 
initialized.\n // Every rank has a complete copy of x.\n // The final result is stored on rank 0.\n\n // Get the number of ranks and the rank of the current process.\n int num_procs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local size of the vector.\n int local_size = x.size() / num_procs;\n\n // Compute the local inverse fourier transform.\n fft(x);\n\n // Reduce the local inverse fourier transforms to the root process.\n std::vector> reduced_x(x.size());\n MPI_Reduce(x.data(), reduced_x.data(), x.size(), MPI_COMPLEX, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // If the current process is the root process, store the reduced inverse fourier transform in the output vector.\n if (rank == 0) {\n x = reduced_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform\n fft(x);\n\n // divide by the number of ranks to normalize\n for (auto &xi : x) {\n xi /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // normalize the result\n std::transform(x.begin(), x.end(), x.begin(), [size](std::complex c) { return c / size; });\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the 
fft in-place\n fft(x);\n\n // compute the inverse fft in-place\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide the result by the number of ranks\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results from all ranks to rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += y[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n // Compute the conjugate of x\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated x\n fft(x);\n\n // Normalize the result\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double factor = 1.0 / size;\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= factor;\n }\n\n // If rank 0, gather the results from all ranks\n if (rank == 0) {\n std::vector> recv_buffer(x.size() * size);\n MPI_Allgather(x.data(), x.size(), MPI_COMPLEX, recv_buffer.data(), x.size(), MPI_COMPLEX, MPI_COMM_WORLD);\n x = recv_buffer;\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv(x.size() * size);\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, recv.data(), x.size(), MPI_DOUBLE_COMPLEX, 0);\n x = recv;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft of x\n fft(x);\n\n // compute the inverse fft of x\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) 
{\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT of the conjugated result\n fft(x);\n\n // Normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv(n);\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n MPI::COMM_WORLD.Recv(&recv[0], n, MPI::DOUBLE_COMPLEX, i, 0);\n for (int j = 0; j < n; ++j) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], n, MPI::DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the inverse Fourier transform in-place\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fourier transform\n fft(x);\n\n // compute the inverse fourier transform\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size() * size);\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = recv_buf;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(Kokkos::View &x, Kokkos::View*> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * std::exp(Kokkos::complex(0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * Kokkos::Math::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(std::cos(angle), std::sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const double PI = 3.14159265358979323846;\n\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n sum += x(j) * std::exp(-2 * PI * std::complex(0, 1) * i * j / N);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = -2 * Kokkos::Math::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(std::cos(angle), std::sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const double PI = 3.14159265358979323846;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * PI * j * i / N;\n sum += x[j] * Kokkos::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::Range(0, N), [=](const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * j * i / N;\n sum += x(j) * std::exp(std::complex(0.0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n const int n = x.size();\n Kokkos::parallel_for(\"DFT\", Kokkos::RangePolicy<>(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 
0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < N; j++) {\n sum += x(j) * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / N);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n const int N = x.extent(0);\n Kokkos::parallel_for(\"DFT\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const auto n = x.size();\n const auto pi = 3.14159265358979323846;\n Kok\u30b3\u30af::Range range(0, n);\n Kok\u30b3\u30af::ParallelFor(\"dft\", range, KOKKOK_LAMBDA(const int i) {\n Kok\u30b3\u30af::\u8907 sum(0.0, 0.0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2.0 * pi * i * j / n);\n }\n output[i] = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n const double pi = 3.14159265358979323846;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x(j) * Kokkos::complex(cos(-2 * pi * i * j / n), sin(-2 * pi * i * j / n));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * j * i / N;\n sum += x(j) * std::polar(1.0, angle);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "(121:1:1:1:1:1:1:", + "\n\n const int n = x.extent(0);\n const double two_pi = 2.0 * Kokkos::Pi;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < n; ++j) {\n sum += x(j) * std::exp(-two_pi * Kokkos::complex(0.0, 1.0) * i * j / n);\n }\n output(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. 
Store the result in output.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n output.resize(n);\n #pragma opm parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (size_t j = 0; j < n; ++j) {\n const double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp 
parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (size_t j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + ")", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "serial", + "prompt": "/* Compute the discrete fourier transform of x. 
Store the result in output.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "<", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "<", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::polar(1.0, -2 * M_PI * k * t / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, -angle);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::polar(1.0, -2 * M_PI * k * t / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int i = 0; i < n; ++i) {\n for (int j = 0; j < n; ++j) {\n output[i] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, angle);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n 
output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "hip", + "prompt": "/* Compute the discrete fourier transform of x. Store the result in output.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\n__global__ void dft(const double *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "(x,y)", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum_real = 0;\n double sum_imag = 0;\n\n for (size_t i = 0; i < N; i++) {\n double angle = 2 * M_PI * gid * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n\n output[gid] = make_cuDoubleComplex(sum_real, sum_imag);\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCadd(0, 0);\n for (int k = 0; k < N; k++) 
{\n double angle = -2 * M_PI * i * k / N;\n output[i] = cuCadd(output[i], cuCmulf(x[k], cuCexp(cuCmulf(0, angle))));\n }\n }\n}\n", + "\n\n int id = blockIdx.x * blockDim.x + threadIdx.x;\n if (id < N) {\n double sum_real = 0, sum_imag = 0;\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * id * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n output[id] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double real = 0;\n double imag = 0;\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n real += x[i] * cos(angle);\n imag += x[i] * sin(angle);\n }\n output[tid] = make_cuDoubleComplex(real, imag);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleReal sum_real = 0;\n cuDoubleReal sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplexf(sum_real, sum_imag);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid >= N) {\n return;\n }\n\n double real = 0;\n double imag = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * gid / N;\n real += x[k] * cos(angle);\n imag += x[k] * sin(angle);\n }\n\n output[gid] = make_cuDoubleComplex(real, imag);\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum_real = 0;\n double sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double real = 0;\n double imag = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * gid / 
N;\n real += x[k] * cos(angle);\n imag += x[k] * sin(angle);\n }\n output[gid] = make_cuDoubleComplex(real, imag);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n double angle = 2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[i] * term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n local_output[i] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / x.size());\n }\n }\n\n // Gather the local DFTs\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes a part of the output\n int start = rank * n / size;\n int end = (rank + 1) * n / size;\n\n for (int i = start; i < end; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * i * j / n;\n sum += std::complex(x[j], 0) * std::exp(std::complex(0, angle));\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0846686794, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1210315884, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + 
"are_all_valid": false, + "best_sequential_runtime": 0.0844717718, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08476673, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0848177456, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.084631725, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0845880634, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.084550925725, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08523510856875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.084820102425, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08474895135469, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08525337470625, + "num_procs": 256 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT for each rank\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n\n // Gather the results from all ranks\n std::vector> all_output(n * size);\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, all_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the gathered results to output\n if (rank == 0) {\n output = all_output;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.084275109, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0857176998, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0847378077, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0846091916, + "num_procs": 4 + }, 
+ { + "did_run": true, + "is_valid": true, + "runtime": 0.084628302025, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.084988841875, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.084994816725, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08517593561875, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08541503064063, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08585957753125, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0873843065125, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.084598002, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0850406552, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0846480528, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08463399705, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0846295295, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0848258356625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08497493589375, + "num_procs": 32 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for 
(int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + 
"did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local contribution of each process\n for (int i = rank; i < n; i += size) {\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n output[j] += x[i] * std::complex(std::cos(angle), std::sin(angle));\n }\n }\n\n // Reduce the local contributions to the global result\n MPI_Allreduce(MPI_IN_PLACE, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local DFT\n for (int i = rank; i < n; i += size) 
{\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n output[i] += x[j] * std::complex(cos(angle), sin(angle));\n }\n }\n\n // Reduce the local DFTs to the global DFT\n MPI_Allreduce(MPI_IN_PLACE, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "cuda", + "prompt": "/* Compute the discrete fourier transform of x. Store the result in output.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\n__global__ void dft(const double *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[i];\n sum += term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[i];\n sum += term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int 
j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int k = 0; k < N; ++k) {\n double angle = -2 * M_PI * i * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t kBlockSize = 256;\n const size_t kGridSize = (N + kBlockSize - 1) / kBlockSize;\n const size_t kSharedMemorySize = 2 * kBlockSize;\n\n dim3 blockDim(kBlockSize, 1, 1);\n dim3 gridDim(kGridSize, 1, 1);\n shared double sdata[2 * kBlockSize];\n\n for (size_t i = 0; i < N; ++i) {\n output[i] = make_cuDoubleComplex(0, 0);\n }\n\n for (size_t i = 0; i < N; ++i) {\n sdata[2 * threadIdx.x + 0] = x[i];\n sdata[2 * threadIdx.x + 1] = 0;\n }\n\n __syncthreads();\n\n for (size_t k = 0; k < N; ++k) {\n double e = -2 * M_PI * k / N;\n double c = cos(e);\n double s = sin(e);\n double t0 = 1;\n double t1 = 0;\n for (size_t i = 0; i < N / 2; ++i) {\n double a = sdata[2 * i + 0];\n double b = sdata[2 * i + 1];\n double ta = t0 * a - t1 * b;\n double tb = t0 * b + t1 * a;\n sdata[2 * i + 0] = ta;\n sdata[2 * i + 1] = tb;\n t0 = c;\n t1 = s;\n c = t0 * c - t1 * s;\n s = t0 * s + t1 * c;\n }\n __syncthreads();\n output[k] = make_cuDoubleComplex(sdata[0], sdata[1]);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = threadIdx.x;\n double sum_real = 0;\n double sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplex(sum_real, sum_imag);\n}\n", + "\n const int tid = threadIdx.x + blockIdx.x * blockDim.x;\n const int stride = blockDim.x * gridDim.x;\n for (size_t i = tid; i < N; i += stride) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n const double angle = 2 * M_PI * j * i / N;\n const cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(x[j], term));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n if (gid < N) {\n double sum_real = 0;\n double sum_imag = 0;\n for (size_t i = 0; i < N; i++) {\n double angle = -2 * M_PI * gid * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n output[gid] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += w * x[j];\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * 
blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(c, make_cuDoubleComplex(x[j], 0)));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(local_n);\n std::vector> local_output(local_n);\n std::copy(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n, local_x.begin());\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n sum += local_x[j] * std::exp(-2 * M_PI * i * j / local_n);\n }\n local_output[i] = sum;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_REAL, output.data(), local_n, MPI_DOUBLE_REAL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT for each frequency\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = rank; i < n; i += nprocs) {\n for (int j = 0; j < n; j++) {\n std::complex sum(0, 0);\n for (int k = 0; k < n; k++) {\n sum += x[k] * std::exp(-2 * M_PI * std::complex(0, 1) * i * k / n);\n }\n local_output[i] = sum;\n }\n }\n\n // Gather the local results into the output vector\n MPI_Allgather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 
MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the DFT of each element of x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial DFT for this rank\n std::vector> partial_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n partial_output[i] = sum;\n }\n\n // Gather the partial results into the output vector\n MPI_Gather(partial_output.data(), n, MPI_DOUBLE_COMPLEX,\n output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Allgather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the DFT of each element in x in parallel\n 
#pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n MPI_Allgather(MPI_IN_PLACE, n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element of x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[0], n, MPI_DOUBLE_COMPLEX, &output[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n std::vector> global_output(n * size);\n MPI_Gather(output.data(), 
n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is the root process, copy the results to the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX,\n output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store the result in output.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(Kokkos::View*> &x, Kokkos::View*> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n using namespace Kokkos::Impl;\n\n const int n = x.extent(0);\n const int n_half = n / 2;\n const int n_quarter = n / 4;\n const int n_eighth = n / 8;\n\n // Create a plan for the FFT\n auto plan = FFTPlan::create(n);\n\n // Execute the FFT\n plan->execute(x, output);\n\n // Reorder the output to be in the correct order\n parallel_for(n, KOKKOS_LAMBDA(int i) {\n if (i < n_half) {\n output(i) = output(i);\n } else {\n output(i) = output(n - i);\n }\n });\n\n // Compute the magnitude of the output\n parallel_for(n, KOKKOS_LAMBDA(int i) { output(i) = abs(output(i)); });\n\n // Normalize the output\n parallel_for(n, KOKKOS_LAMBDA(int i) { output(i) /= n; });\n}\n", + "\n\n int n = x.extent(0);\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n int m = 1 << log_n;\n Kokkos::View*> w(m);\n for (int i = 0; i < m; i++) {\n w(i) = Kokkos::complex(cos(2 * M_PI * i / m), sin(2 * M_PI * i / m));\n }\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int &i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < m; j++) {\n sum += x((i + j) % m) * w(j);\n }\n output(i) = sum;\n });\n}\n", + "\n using namespace Kokkos;\n using namespace std;\n const int N = x.extent(0);\n const int M = output.extent(0);\n const double pi = 3.14159265358979323846;\n parallel_for(\"Fourier Transform\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const int i) {\n output(i) = {0.0, 0.0};\n for (int j = 0; j < N; j++) {\n output(i) += x(j) * exp(complex(0, -2 * pi * i * j / N));\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n\n Kokkos::View*> temp(\"temp\", n);\n\n for (int i = 0; i < n; 
i++) {\n int rev = 0;\n for (int j = 0; j < log_n; j++) {\n rev |= ((i >> j) & 1) << (log_n - j - 1);\n }\n temp(rev) = x(i);\n }\n\n for (int s = 1; s <= log_n; s++) {\n int m = 1 << s;\n int m2 = m >> 1;\n Kokkos::complex w(cos(2 * Kokkos::Pi / m), sin(2 * Kokkos::Pi / m));\n for (int k = 0; k < n; k += m) {\n Kokkos::complex wm(1, 0);\n for (int j = 0; j < m2; j++) {\n Kokkos::complex t = temp(k + j);\n Kokkos::complex u = temp(k + j + m2) * wm;\n temp(k + j) = t + u;\n temp(k + j + m2) = t - u;\n wm *= w;\n }\n }\n }\n\n output = temp;\n}\n", + "\n\n const int N = x.extent(0);\n const int logN = std::log2(N);\n const int halfN = N / 2;\n\n // Reverse bits\n Kokkos::View reversed = Kokkos::View(\"reversed\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) {\n int rev = 0;\n int bit = 1;\n for (int j = 0; j < logN; j++) {\n if (i & bit) {\n rev |= 1 << (logN - j - 1);\n }\n bit <<= 1;\n }\n reversed(i) = rev;\n });\n\n // Compute the FFT\n Kokkos::View*> y = Kokkos::View*>(\"y\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { y(i) = x(reversed(i)); });\n\n for (int s = 1; s <= logN; s++) {\n const int m = 1 << s;\n const int m2 = m / 2;\n const double theta = 2 * M_PI / m;\n Kokkos::complex w(std::cos(theta), std::sin(theta));\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) {\n if (i % m < m2) {\n const Kokkos::complex t = w * y(i + m2);\n y(i + m2) = y(i) - t;\n y(i) = y(i) + t;\n }\n });\n }\n\n // Copy the result to the output\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { output(i) = y(i); });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const int logN = std::log2(N);\n\n // Reverse bits\n Kokkos::View reversed(N);\n Kokkos::parallel_for(\"fft_reverse_bits\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int reversed_i = 0;\n int j = i;\n for (int k = 0; k < logN; ++k) {\n reversed_i |= ((j & 1) << (logN - k - 1));\n j >>= 1;\n }\n reversed(i) = reversed_i;\n });\n\n // Compute the FFT\n Kokkos::parallel_for(\"fft_compute\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int j = reversed(i);\n output(i) = x(j);\n });\n\n // Compute the twiddle factors\n Kokkos::View*> twiddle(N);\n Kokkos::parallel_for(\"fft_twiddle\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n double angle = -2 * M_PI * i / N;\n twiddle(i) = Kokkos::complex(std::cos(angle), std::sin(angle));\n });\n\n // Compute the FFT\n for (int s = 1; s <= logN; ++s) {\n const int m = 1 << s;\n const int m2 = m / 2;\n for (int k = 0; k < m2; ++k) {\n Kokkos::parallel_for(\"fft_compute_s\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int j = i & (m - 1);\n if (j < m2) {\n const Kokkos::complex t = twiddle(k * N / m) * output(i + m2);\n output(i + m2) = output(i) - t;\n output(i) += t;\n }\n });\n }\n }\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int N = x.extent(0);\n const int logN = Kokkos::ilog2(N);\n\n auto bit_reverse = [&](int i) {\n int r = 0;\n for (int j = 0; j < logN; ++j) {\n r |= ((i >> j) & 1) << (logN - 1 - j);\n }\n return r;\n };\n\n auto twiddle = [&](int k) {\n return Kokkos::complex(cos(2 * M_PI * k / N), sin(2 * M_PI * k / N));\n };\n\n auto fft_rec = [&](View*> x, int n) {\n if (n == 1) {\n return;\n }\n auto x_even = View*>(x, Range(0, 
n / 2));\n auto x_odd = View*>(x, Range(n / 2, n));\n fft_rec(x_even, n / 2);\n fft_rec(x_odd, n / 2);\n for (int k = 0; k < n / 2; ++k) {\n auto t = x_odd(k) * twiddle(k);\n x_odd(k) = x_even(k) - t;\n x_even(k) += t;\n }\n };\n\n auto x_bit_reversed = View*>(x.extent(0));\n for (int i = 0; i < N; ++i) {\n x_bit_reversed(bit_reverse(i)) = x(i);\n }\n fft_rec(x_bit_reversed, N);\n output = x_bit_reversed;\n}\n", + "\n\n int n = x.extent(0);\n output.resize(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n if (n == 1) {\n output(0) = x(0);\n return;\n }\n Kokkos::View*> even = Kokkos::subview(output, Kokkos::ALL(), Kokkos::Slice(0, n / 2));\n Kokkos::View*> odd = Kokkos::subview(output, Kokkos::ALL(), Kokkos::Slice(n / 2, n));\n Kokkos::parallel_for(n / 2, KOKKOS_LAMBDA(int i) {\n even(i) = x(2 * i);\n odd(i) = x(2 * i + 1);\n });\n fft(even, even);\n fft(odd, odd);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex w = Kokkos::complex(cos(2 * M_PI * i / n), sin(2 * M_PI * i / n));\n output(i) = even(i % (n / 2)) + w * odd(i % (n / 2));\n });\n}\n", + "\n\n // Compute the size of the input\n const int n = x.extent(0);\n\n // Create a scratchpad for the intermediate results\n Kokkos::View*> scratchpad(\"scratchpad\", n);\n\n // Compute the FFT in parallel\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n // Compute the FFT of the i-th element of x\n Kokkos::complex result = std::complex(0.0, 0.0);\n for (int j = 0; j < n; ++j) {\n result += x(j) * std::exp(-2.0 * M_PI * std::complex(0.0, 1.0) * i * j / n);\n }\n\n // Store the result in the scratchpad\n scratchpad(i) = result;\n });\n\n // Copy the results from the scratchpad to the output\n Kokkos::deep_copy(output, scratchpad);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n output(i) = Kokkos::complex(real, imag);\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n output[i] = Kokkos::complex(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n output[i] += x[j] * Kokkos::exp(Kokkos::complex(0.0, -2.0 * Kokkos::Pi * i * j / n));\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n output(0) = x(0);\n for (int k = 1; k < n; ++k) {\n Kokkos::complex w = std::polar(1.0, -2 * Kokkos::Math::pi * k / n);\n Kokkos::complex t = x(0);\n for (int j = 1; j < n; ++j) {\n t += w * x(j);\n w *= w;\n output(j) = t;\n }\n }\n}\n", + "\n\n int n = x.extent(0);\n output.resize(n);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * Kokkos::Impl::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * std::exp(Kokkos::complex(0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n // Get the size of the input\n int n = 
x.extent(0);\n\n // Create a Kokkos view for the output\n Kokkos::View*> output_view(\"output\", n);\n\n // Compute the FFT in parallel\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) {\n output_view(i) = fft_helper(x, i, n);\n });\n\n // Copy the output view to the output array\n Kokkos::deep_copy(output, output_view);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x(j) * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy<>(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n output.resize(n);\n\n Kokkos::parallel_for(\"fft\", n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store the result in output.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n output.resize(n);\n #pragma openmp parallel for\n for (int i = 0; i < n; ++i) {\n double real = 0, imag = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n real += x[j].first * cos(angle) + x[j].second * sin(angle);\n imag += x[j].first * sin(angle) - x[j].second * cos(angle);\n }\n output[i] = {real, imag};\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex arg(2 * M_PI * i * j / n, 0);\n sum += x[j] * std::exp(-arg);\n }\n output[i] = sum;\n }\n}\n", + "<", + "\n\n const int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::polar(1.0, angle);\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n const double pi = std::acos(-1);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = {0, 0};\n for (size_t t = 0; t < n; ++t) {\n double angle = 2 * pi * t * k / n;\n sum += x[t] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < 
n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex omega = std::polar(1.0, -2 * M_PI * i * j / n);\n sum += x[j] * omega;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n std::complex arg(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * arg;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of x\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] 
= sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex omega = std::polar(1.0, -2 * M_PI * i * j / n);\n sum += x[j] * omega;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x. Store the result in output.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double pi = std::acos(-1);\n std::complex w(1), wn(std::cos(2 * pi / n), std::sin(2 * pi / n));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(cos(angle), sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int 
n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2);\n std::vector> odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, 
odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int i = 0; i < n; ++i) {\n for (int j = 0; j < n; ++j) {\n output[i] += x[j] * std::exp(-2 * std::complex(0, 1) * M_PI * i * j / n);\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x. Store the result in output.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fft(const hipDoubleComplex *x, hipDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid >= N) {\n return;\n }\n double arg = -2 * M_PI * gid / N;\n hipDoubleComplex w = hipDoubleComplex(cos(arg), sin(arg));\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n sum += x[i] * pow(w, i * gid);\n }\n output[gid] = sum;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(omega), sin(omega)};\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(theta), sin(theta));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex wp = hipDoubleComplex(1, 0);\n for (size_t k = j; k < j + m; k++) {\n hipDoubleComplex t = wp * output[k + m];\n output[k + m] = output[k] - t;\n output[k] += t;\n wp *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = make_hipDoubleComplex(cos(theta), sin(theta));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex t = make_hipDoubleComplex(1.0, 0.0);\n for 
(size_t k = 0; k < m; k++) {\n hipDoubleComplex u = output[j + k];\n hipDoubleComplex v = output[j + k + m] * t;\n output[j + k] = u + v;\n output[j + k + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / s), sin(M_PI / s));\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex(1.0, 0.0);\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / m), sin(M_PI / m));\n hipDoubleComplex wm = hipDoubleComplex(1.0, 0.0);\n for (size_t j = 0; j < m; j++) {\n for (size_t k = j; k < N; k += s) {\n hipDoubleComplex t = wm * output[k + m];\n output[k + m] = output[k] - t;\n output[k] += t;\n }\n wm *= w;\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n hipDoubleComplex wm = hipDoubleComplex(1, 0);\n for (size_t k = 0; k < m; k++) {\n for (size_t j = 0; j < N / s; j++) {\n size_t index = j * s + k;\n hipDoubleComplex t = wm * output[index + m];\n output[index + m] = output[index] - t;\n output[index] += t;\n }\n wm *= w;\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t half_s = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / half_s), sin(M_PI / half_s));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t k = 0; k < half_s; k++) {\n hipDoubleComplex u = output[j + k];\n hipDoubleComplex v = output[j + k + half_s] * t;\n output[j + k] = u + v;\n output[j + k + half_s] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t tid = hipGetGlobalThreadIdx();\n const size_t N_2 = N / 2;\n const size_t N_4 = N / 4;\n const double arg = -2 * M_PI / N;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n hipDoubleComplex w = {cos(arg * k), sin(arg * k)};\n hipDoubleComplex term = hipCMul(x[k], w);\n sum = hipCAdd(sum, term);\n }\n\n const size_t index = tid;\n if (tid >= N_2) {\n index -= N_2;\n index = N_4 - index - 1;\n index += N_2;\n }\n\n output[index] = sum;\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n double omega = M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(omega), sin(omega)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n 
hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + s] * t;\n output[k + j] = u + v;\n output[k + j + s] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(theta), sin(theta)};\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 1; s <= N; s *= 2) {\n for (size_t k = 0; k < N; k += 2 * s) {\n for (size_t j = k; j < k + s; j++) {\n hipDoubleComplex twiddle = hipDoubleComplex{cos(M_PI * j / s),\n -sin(M_PI * j / s)};\n hipDoubleComplex temp = output[j + s] * twiddle;\n output[j + s] = output[j] - temp;\n output[j] += temp;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / m), sin(M_PI / m));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the 
fourier transform of x. Store the result in output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFT results\n std::vector> global_output(n);\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, global_output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the result in output\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_output(n_local);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), n_local, MPI_DOUBLE_COMPLEX, output.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(n / size);\n fft_local(x, local_fft);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(n);\n MPI_Gather(local_fft.data(), n / size, 
MPI_C_DOUBLE_COMPLEX, global_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter the global FFT to the local FFTs\n MPI_Scatter(global_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, local_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Convert the local FFTs to the output format\n output.resize(n);\n for (int i = 0; i < n; i++) {\n output[i] = local_fft[i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = (rank + 1) * local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i / x); });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the output vector\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFT results\n std::vector> global_output(n);\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, global_output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the global FFT result to the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(),\n [](std::complex const& x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) {\n return std::exp(std::complex(0, -2 * M_PI * x.real()));\n });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft_local(x, rank, size);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(x.size());\n MPI_Allgather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the global FFT in the output vector\n output = global_fft;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + 
"did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) {\n return std::exp(std::complex(0, -2 * M_PI * x.real()));\n });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform of x.\n std::vector> local_fft = fft(x, rank, size);\n\n // Gather the local Fourier transforms into output.\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX,\n output.data(), output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int n_padded = n_local + 2;\n std::vector> padded_x(n_padded, 0.0);\n std::copy(x.begin() + rank * n_local, x.begin() + (rank + 1) * n_local, padded_x.begin() + 1);\n\n std::vector> padded_output(n_padded, 0.0);\n fft(padded_x, padded_output);\n\n std::vector> output_local(n_local, 0.0);\n std::copy(padded_output.begin() + 1, padded_output.begin() + n_local + 1, output_local.begin());\n\n MPI_Allgather(output_local.data(), n_local, MPI_DOUBLE_PRECISION, output.data(), n_local, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": 
false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft(local_x, local_output);\n\n // Gather the results\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If we are not rank 0, we can return now\n if (rank != 0) {\n return;\n }\n\n // If we are rank 0, we need to combine the results\n for (int i = 1; i < size; i++) {\n int start = i * local_n;\n int end = start + local_n;\n for (int j = start; j < end; j++) {\n output[j] += output[j - start];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local FFT\n std::vector> local_fft(n / size);\n MPI_Scatter(x.data(), n / size, MPI_DOUBLE_COMPLEX, local_fft.data(), n / size, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n fft_local(local_fft);\n\n // Gather the local FFTs\n MPI_Gather(local_fft.data(), n / size, MPI_DOUBLE_COMPLEX, output.data(), n / size, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(output);\n }\n\n // Broadcast the global FFT\n MPI_Bcast(output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute 
the local FFT\n std::vector> local_fft = fft_local(x, rank, size);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(x.size());\n MPI_Allgather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the global FFT in the output vector\n output = global_fft;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x. Store the result in output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fft(const cuDoubleComplex *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmul(2 * M_PI * i / N, cuCmulf(I, x[i])), -1));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(-2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmulf(2 * M_PI * i / N, cuCmulf(0, 1)), x[i]));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuDoubleComplex(0, -2 * M_PI * i / N), x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuDoubleComplex{0, -2 * M_PI * i / N}, x[i]));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmulf(2 * M_PI * i / N, cuDoubleComplex(0, 1)), x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(-2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex y = cuCexp(cuCmul(cuCbrt(-1.0), 2.0 * M_PI * i / N));\n output[i] = cuCmulf(x[i], y);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCmul(x[i], cuCexp(cuCmulf(-2 * M_PI * i / N, cuDoubleComplex{0, 1})));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + ":", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "mpi+omp", + "prompt": 
"#include \n#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local FFT results\n std::vector> global_output(x.size());\n MPI_Allgather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n global_output.data(), global_output.size(), MPI_DOUBLE_COMPLEX,\n MPI_COMM_WORLD);\n\n // Store the result in output\n output = global_output;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_C_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the global DFT into the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output.\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT of the gathered local DFTs.\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local results\n std::vector> global_output(x.size());\n MPI_Allgather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, global_output.data(), global_output.size(), 
MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the result in output\n output = global_output;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(n * size);\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is the root process, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n // Get the size of the input vector.\n int n = x.size();\n\n // Create a MPI communicator for the FFT.\n MPI_Comm fft_comm;\n MPI_Comm_split(MPI_COMM_WORLD, 0, MPI_THREAD_SINGLE, &fft_comm);\n\n // Create an OpenMP parallel region for the FFT.\n#pragma openmp parallel\n {\n // Get the rank of the current thread.\n int tid = openmp_get_thread_num();\n\n // Compute the local FFT of the input vector.\n std::vector> local_fft(n);\n fft(x, local_fft);\n\n // Gather the local FFTs into the output vector.\n#pragma openmp critical\n {\n for (int i = 0; i < n; ++i) {\n output[i] += local_fft[i];\n }\n }\n }\n\n // Free the MPI communicator.\n MPI_Comm_free(&fft_comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_output(n_local);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < n_local; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n_local; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), n_local, MPI_DOUBLE_COMPLEX, output.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = (rank + 1) * local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n std::complex w(cos(2 * M_PI * i * j / local_n), sin(2 * M_PI * i * j / local_n));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local FFT results\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // 
Compute the local DFT of x\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output.\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output\n MPI_Allgather(&local_output[0], n, MPI_DOUBLE_COMPLEX, &output[0], n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n double angle = 2 * M_PI * i * j / local_n;\n std::complex w(cos(angle), sin(angle));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX,\n output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft;\n fft_local(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX,\n global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, 0,\n MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter the global FFT\n MPI_ Scatter(global_fft.data(), global_fft.size() / size, MPI_DOUBLE_COMPLEX,\n local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0,\n MPI_COMM_WORLD);\n\n // Convert the local FFT to the output format\n output.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n output[i] = {local_fft[i].real(), local_fft[i].imag()};\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n local_output[i] = std::complex(0, 0);\n for (int j = 0; j < local_n; j++) {\n double angle = -2 * M_PI * i * j / local_n;\n local_output[i] += local_x[j] * std::exp(std::complex(0, angle));\n }\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), local_n, 
MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(n * size);\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is rank 0, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, store the global DFT in output\n if (rank == 0) {\n output = global_output;\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, Kokkos::View &x, Kokkos::View &y, Kokkos::View &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", Kokkos::RangePolicy<>(0, z.extent(0)), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.size(), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int &i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z[i] = alpha * x[i] + y[i];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int &i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"axpy\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. 
Store the result in z.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma simd\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "serial", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. 
Store the result in z.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (std::size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (std::size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "hip", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\n__global__ void axpy(double alpha, const double *x, const double *y, double *z, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "(x,y,z,t)", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n z[i] = alpha * x[i] + y[i];\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.015573587, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0158488872, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0286109669, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04176351485, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0930302443, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0599629775, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.11539802711875, + "num_procs": 32 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0159223432, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0159654282, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0274860694, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04127231205, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of z\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0188979988, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.022042115, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": 
true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0158712372, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0158123582, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0274225302, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04167106955, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0155546614, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0305292784, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0270769798, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.042804685, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": ".", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n z.resize(x.size());\n for (size_t i = 0; i < x.size(); i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Barrier(MPI_COMM_WORLD);\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0158676484, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.016882717, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0273192971, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04129084115, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0156992614, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0170764904, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0271016593, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0414961599, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + 
"num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.015875888, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0229695854, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0272311756, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0423150811, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0157745174, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0231307738, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0282059249, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0438766564, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0157883914, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0205156342, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0272775627, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0413086824, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0931253396, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0551964981875, + "num_procs": 16 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n\n // Compute the local portion of z\n std::vector local_z(n);\n for (int i = 0; i < n; ++i) {\n local_z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the local portions of z into the global vector z on rank 0\n if (rank == 0) {\n z.resize(n);\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + 
"did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0160288578, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.039639249, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its part of the result\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0200892804, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0201306984, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0159043918, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0159977814, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of the vectors\n int local_size = x.size() / size;\n\n // Allocate memory for the local portion of the vectors\n std::vector local_x(local_size);\n std::vector local_y(local_size);\n std::vector local_z(local_size);\n\n // Scatter the vectors to the ranks\n MPI_Scatter(&x[0], local_size, MPI_DOUBLE, &local_x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(&y[0], local_size, MPI_DOUBLE, &local_y[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local portion of the result\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * local_x[i] + local_y[i];\n }\n\n // Gather the results from the ranks\n MPI_Gather(&local_z[0], local_size, MPI_DOUBLE, &z[0], local_size, MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.016144057, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1195837562, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0609350756, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05426063855, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0681809595, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0447033422875, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04721612985, + "num_procs": 32 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n int n = x.size();\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n \n if (rank != 0) {\n MPI_Send(&z[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector tmp(n);\n MPI_Recv(&tmp[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n z[j] += tmp[j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0160278182, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0157078218, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0157777374, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0159379784, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0273153912, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0423796507, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0930624757, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.057649241225, + "num_procs": 16 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of z\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 
0.0159244916, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0159182198, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0157356764, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0157858566, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.027517924, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04167931025, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0994313401, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.059460675875, + "num_procs": 16 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "cuda", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\n__global__ void axpy(double alpha, const double *x, const double *y, double *z, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "{,", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * 
x[i] + y[i];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Store the result in z on rank 0.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, n_threads;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n #pragma omp parallel\n {\n n_threads = omp_get_num_threads();\n }\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "(", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector local_z(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n local_z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n z.resize(n);\n\n#pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the vector\n int local_size = x.size() / size;\n\n // Compute the local start index\n int local_start = rank * local_size;\n\n // Compute the local end index\n int local_end = (rank + 1) * local_size;\n\n // Compute the local part of the result\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_z[i] = alpha * x[local_start + i] + y[local_start + i];\n }\n\n // Gather the local results into the global result\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank == 0) {\n z.resize(x.size());\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < 
n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the local portion of x and y\n int local_size = x.size() / nprocs;\n\n // Compute the starting index of the local portion of x and y\n int start_index = rank * local_size;\n\n // Compute the local portion of z\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_z[i] = alpha * x[start_index + i] + y[start_index + i];\n }\n\n // Gather the local portions of z into the global vector z on rank 0\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of the vector\n int local_size = x.size() / size;\n\n // Compute the starting index of the local portion of the vector\n int start_index = rank * local_size;\n\n // Compute the local portion of the vector\n std::vector local_x(x.begin() + start_index, x.begin() + start_index + local_size);\n std::vector local_y(y.begin() + start_index, y.begin() + start_index + local_size);\n\n // Compute the local portion of the result\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * local_x[i] + local_y[i];\n }\n\n // Gather the local portions of the result into the global result\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of x and y\n int local_size = x.size() / size;\n\n // Compute the starting index of the local portion of x and y\n int start_index = rank * local_size;\n\n // Compute the local portion of z\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * x[start_index + i] + y[start_index + i];\n }\n\n // Gather the local portions of z into the global vector z on rank 0\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of each block\n int block_size = x.size() / size;\n\n // Compute the starting index of each block\n int start_index = rank * block_size;\n\n // Compute the ending index of each block\n int end_index = start_index + block_size;\n\n // If the rank is the last rank, then the ending index is the size of the vector\n if (rank == size - 1) {\n end_index = x.size();\n }\n\n // Compute the local result\n std::vector local_z(block_size);\n for (int i = start_index; i < end_index; i++) {\n local_z[i - start_index] = alpha * x[i] + y[i];\n }\n\n // Reduce the local results to the global result\n MPI_Allreduce(local_z.data(), z.data(), block_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n z.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of each chunk of data that each rank will process\n int chunk_size = x.size() / size;\n\n // Compute the starting index of the data that each rank will process\n int start_index = rank * chunk_size;\n\n // Compute the ending index of the data that each rank will process\n int end_index = (rank + 1) * chunk_size;\n\n // If the rank is the last rank, then the ending index is the size of the data\n if (rank == size - 1) {\n end_index = x.size();\n }\n\n // Compute the local result for this rank\n std::vector local_result(chunk_size);\n #pragma omp parallel for\n for (int i = start_index; i < end_index; i++) {\n local_result[i - start_index] = alpha * x[i] + y[i];\n }\n\n // Gather the local results from all ranks into the global result on rank 0\n MPI_Gather(local_result.data(), chunk_size, MPI_DOUBLE, z.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": 
"dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix. x and b have N elements.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(Kokkos::View &A, Kokkos::View &b, Kokkos::View &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::Range(0, N), [=](const int i) {\n double sum = 0;\n for (int j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n KokConfigService config;\n config.set_num_threads(KokConfigService::num_hardware_threads());\n KokConfigService::set_configuration(config);\n KokConfigService::print_configuration();\n\n KokConfigService::set_configuration(config);\n KokConfigService::print_configuration();\n\n KokKok::parallel_for(N, KOK_LAMBDA(size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A[i][j] * x[j]; }\n x[i] = (b[i] - sum) / A[i][i];\n });\n}\n", + "\n\n // Create a Kokkos view for the matrix A.\n Kokkos::View A_view = Kokkos::View(\"A\", N, N);\n Kokkos::deep_copy(A_view, A);\n\n // Create a Kokkos view for the vector b.\n Kokkos::View b_view = Kokkos::View(\"b\", N);\n Kokkos::deep_copy(b_view, b);\n\n // Create a Kokkos view for the vector x.\n Kokkos::View x_view = Kokkos::View(\"x\", N);\n\n // Solve the linear system using the Kokkos parallel backend.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A_view(i, j) * x_view(j);\n }\n x_view(i) = (b_view(i) - sum) / A_view(i, i);\n });\n\n // Copy the solution vector x back to the output view.\n Kokkos::deep_copy(x, x_view);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution.\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define a Kokkos kernel functor to perform the computation.\n Kokkos::KernelFunctor kernelFunctor([&](const int i) {\n // Initialize the solution vector x.\n x(i) = 0.0;\n\n // Perform forward substitution to solve the linear system.\n for (int j = 0; j < i; ++j) {\n x(i) -= A(i, j) * x(j);\n }\n\n // Perform backward substitution to solve the linear system.\n for (int j = i + 1; j < N; ++j) {\n x(i) -= A(i, j) * x(j);\n }\n\n // Divide by the diagonal element to get the final solution.\n x(i) /= A(i, i);\n });\n\n // Execute the kernel functor using the Kokkos team policy.\n Kokkos::parallel_for(policy, kernelFunctor);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution.\n Kokkos::TeamPolicy policy(N);\n\n // Create a Kokkos parallel loop to solve the linear system.\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Solve the ith equation of the linear system.\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, 
KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int& i) {\n // Initialize the solution vector x\n x[i] = 0.0;\n\n // Compute the dot product of the ith row of A and x\n double dot_product = 0.0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A(i, j) * x[j];\n }\n\n // Update the solution vector x\n x[i] = (b[i] - dot_product) / A(i, i);\n });\n}\n", + "\n\n Kokkos::View A_copy = Kokkos::View(\"A_copy\", N, N);\n Kokkos::deep_copy(A_copy, A);\n Kokkos::View b_copy = Kokkos::View(\"b_copy\", N);\n Kokkos::deep_copy(b_copy, b);\n Kokkos::View x_copy = Kokkos::View(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n\n for (size_t i = 0; i < N; i++) {\n double max_val = std::abs(A_copy(i, i));\n size_t max_idx = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A_copy(j, i)) > max_val) {\n max_val = std::abs(A_copy(j, i));\n max_idx = j;\n }\n }\n if (max_val == 0) {\n std::cout << \"The matrix is singular and cannot be inverted.\" << std::endl;\n return;\n }\n for (size_t j = 0; j < N; j++) {\n std::swap(A_copy(i, j), A_copy(max_idx, j));\n }\n std::swap(b_copy(i), b_copy(max_idx));\n for (size_t j = 0; j < N; j++) {\n if (i != j) {\n double factor = A_copy(j, i) / A_copy(i, i);\n for (size_t k = 0; k < N; k++) {\n A_copy(j, k) -= factor * A_copy(i, k);\n }\n b_copy(j) -= factor * b_copy(i);\n }\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A_copy(i, j) * x_copy(j);\n }\n x_copy(i) = (b_copy(i) - sum) / A_copy(i, i);\n }\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int& i) {\n // Initialize the solution vector x\n x[i] = 0.0;\n\n // Iterate over the rows of the matrix A\n for (size_t j = 0; j < N; ++j) {\n // Compute the dot product of the ith row of A and the solution vector x\n double dot_product = 0.0;\n for (size_t k = 0; k < N; ++k) {\n dot_product += A(i, k) * x[k];\n }\n\n // Update the ith element of the solution vector x\n x[i] = (b[i] - dot_product) / A(i, i);\n }\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to be executed in parallel\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Initialize the solution vector x\n x(i) = 0.0;\n\n // Compute the dot product of the ith row of A and x\n for (int j = 0; j < N; ++j) {\n x(i) += A(i, j) * x(j);\n }\n\n // Subtract the dot product from b(i) to get the solution\n x(i) = b(i) - x(i);\n });\n}\n", + "\n\n // Create a copy of A for in-place LU decomposition\n auto A_copy = Kokkos::subview(A, Kokkos::ALL(), Kokkos::ALL(), Kokkos::NO_OFFSET, Kokkos::NO_OFFSET);\n\n // Perform LU decomposition\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n for (size_t j = 0; j < N; ++j) {\n A_copy(i, j) /= A_copy(i, i);\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = 0; j < N; ++j) {\n if (k > i) {\n A_copy(k, j) -= 
A_copy(k, i) * A_copy(i, j);\n } else if (k < i) {\n A_copy(k, j) -= A_copy(k, i) * A_copy(i, j);\n }\n }\n }\n });\n\n // Solve the system\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n x(i) = b(i) / A_copy(i, i);\n for (size_t j = 0; j < N; ++j) {\n if (j > i) {\n x(j) -= x(i) * A_copy(j, i);\n }\n }\n });\n\n // Backward substitution\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n for (size_t j = 0; j < N; ++j) {\n if (j < i) {\n x(j) -= x(i) * A_copy(i, j);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Initialize the solution vector x\n x(i) = 0.0;\n\n // Compute the dot product of the ith row of A and x\n double dot_product = 0.0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A(i, j) * x(j);\n }\n\n // Update the solution vector x\n x(i) = (b(i) - dot_product) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "_t", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. 
x and b have N elements.\n Use OpenMP to compute in parallel.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n #pragma omp parallel for\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Forward elimination\n for (size_t j = i + 1; j < N; j++) {\n double factor = A_copy[j][i] / A_copy[i][i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j][k] -= factor * A_copy[i][k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Back substitution\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A_copy[i][j] * x[j];\n }\n x[i] /= A_copy[i][i];\n }\n}\n", + "\n\n // Solve the system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n x[N - 1] = b[N - 1] / A[(N - 1) * N + (N - 1)];\n for (int i = N - 2; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Solve the system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double inv_aii = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] *= inv_aii;\n }\n b[i] *= inv_aii;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n x[N - 1] = b[N - 1];\n for (int i = N - 2; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = b[i] - sum;\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Perform Gaussian elimination in parallel\n 
#pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A_local[i * N + i];\n for (size_t j = i; j < N; ++j) {\n A_local[i * N + j] *= factor;\n }\n b_local[i] *= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A_local[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A_local[j * N + k] -= multiplier * A_local[i * N + k];\n }\n b_local[j] -= multiplier * b_local[i];\n }\n }\n\n // Perform back substitution in parallel\n #pragma omp parallel for\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_local[i * N + j] * x[j];\n }\n x[i] = (b_local[i] - sum) / A_local[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = i; j < N; j++) {\n A_copy[i * N + j] *= factor;\n }\n b_copy[i] *= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A_copy[j * N + i];\n for (size_t k = i; k < N; k++) {\n A_copy[j * N + k] -= multiplier * A_copy[i * N + k];\n }\n b_copy[j] -= multiplier * b_copy[i];\n }\n }\n\n // Perform back substitution to solve for x\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (int j = i + 1; j < N; j++) {\n x[i] -= A_copy[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an NxN matrix in row-major. x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a copy of A and b for each thread.\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Solve the linear system for each thread.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Forward elimination.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_local[j][i] / A_local[i][i];\n for (size_t k = 0; k < N; ++k) {\n A_local[j][k] -= factor * A_local[i][k];\n }\n b_local[j] -= factor * b_local[i];\n }\n\n // Back substitution.\n x[i] = b_local[i] / A_local[i][i];\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A_local[i * N + j] * x[j];\n }\n x[i] = (b_local[i] - sum) / A_local[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b 
for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A_copy[i][i];\n for (size_t j = i; j < N; ++j) {\n A_copy[i][j] /= factor;\n }\n b_copy[i] /= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A_copy[j][i];\n for (size_t k = i; k < N; ++k) {\n A_copy[j][k] -= multiplier * A_copy[i][k];\n }\n b_copy[j] -= multiplier * b_copy[i];\n }\n }\n\n // Perform back substitution in parallel\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_copy[i][j] * x[j];\n }\n x[i] = (b_copy[i] - sum) / A_copy[i][i];\n }\n}\n", + "\n\n // Create a copy of A and b to avoid modifying the input\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A_copy[i * N + j] *= factor;\n }\n b_copy[i] *= factor;\n\n // Subtract the ith row from all other rows\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_copy[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A_copy[j * N + k] -= factor * A_copy[i * N + k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Perform back substitution\n x.resize(N);\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_copy[i * N + j] * x[j];\n }\n x[i] = (b_copy[i] - sum) / A_copy[i * N + i];\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an N*N matrix in row-major. 
x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a vector to store the solution.\n x.resize(N);\n\n // Create a vector to store the residual.\n std::vector r(N);\n\n // Initialize the residual.\n for (size_t i = 0; i < N; i++) {\n r[i] = b[i];\n }\n\n // Solve the linear system using the conjugate gradient method.\n for (size_t k = 0; k < N; k++) {\n // Compute the dot product of the residual and the residual.\n double r_dot_r = 0.0;\n #pragma omp parallel for reduction(+:r_dot_r)\n for (size_t i = 0; i < N; i++) {\n r_dot_r += r[i] * r[i];\n }\n\n // Check if the residual is small enough.\n if (r_dot_r < 1e-10) {\n break;\n }\n\n // Compute the dot product of the residual and the matrix-vector product.\n double r_dot_Ax = 0.0;\n #pragma omp parallel for reduction(+:r_dot_Ax)\n for (size_t i = 0; i < N; i++) {\n double tmp = 0.0;\n for (size_t j = 0; j < N; j++) {\n tmp += A[i * N + j] * x[j];\n }\n r_dot_Ax += r[i] * tmp;\n }\n\n // Compute the step size.\n double alpha = r_dot_r / r_dot_Ax;\n\n // Update the solution.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] += alpha * r[i];\n }\n\n // Update the residual.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n double tmp = 0.0;\n for (size_t j = 0; j < N; j++) {\n tmp += A[i * N + j] * x[j];\n }\n r[i] -= alpha * tmp;\n }\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Forward elimination\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_copy[j][i] / A_copy[i][i];\n for (size_t k = 0; k < N; ++k) {\n A_copy[j][k] -= factor * A_copy[i][k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Back substitution\n x.resize(N);\n for (int i = N - 1; i >= 0; --i) {\n x[i] = b_copy[i];\n for (int j = i + 1; j < N; ++j) {\n x[i] -= A_copy[i][j] * x[j];\n }\n x[i] /= A_copy[i][i];\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n #pragma omp parallel for\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel\n {\n // Each thread works on a different row of A\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A_local[i][i];\n for (size_t j = i; j < N; ++j) {\n A_local[i][j] /= factor;\n }\n b_local[i] /= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_local[j][i];\n for (size_t k = i; k < N; ++k) {\n A_local[j][k] -= factor * A_local[i][k];\n }\n b_local[j] -= factor * b_local[i];\n }\n }\n }\n\n // Back substitution to solve for x\n for (int 
i = N - 1; i >= 0; --i) {\n x[i] = b_local[i];\n for (int j = i + 1; j < N; ++j) {\n x[i] -= A_local[i][j] * x[j];\n }\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an NxN matrix in row-major. x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a vector to store the solution.\n x.resize(N);\n\n // Create a temporary vector to store the intermediate results.\n std::vector tmp(N);\n\n // Solve the linear system in parallel.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Compute the ith row of the solution.\n tmp[i] = b[i];\n for (size_t j = 0; j < i; ++j) {\n tmp[i] -= A[i * N + j] * x[j];\n }\n tmp[i] /= A[i * N + i];\n }\n\n // Copy the solution to the output vector.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = tmp[i];\n }\n}\n", + "\n\n // Create OpenMP parallel region\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Initialize x to 0.\n x.assign(N, 0);\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Forward substitution\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < i; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Backward substitution\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] -= sum;\n }\n}\n", + "\n\n // Create a copy of A and b for each thread\n std::vector> A_local(omp_get_num_threads(), std::vector(N * N));\n std::vector> b_local(omp_get_num_threads(), std::vector(N));\n\n // Copy A and b to the local arrays\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n A_local[tid][i * N + j] = A[i * N + j];\n }\n b_local[tid][i] = b[i];\n }\n }\n\n // Solve the linear system for each thread\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n std::vector x_local(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < i; j++) {\n sum += A_local[tid][i * N + j] * x_local[j];\n }\n x_local[i] = (b_local[tid][i] - sum) / A_local[tid][i * N + i];\n }\n\n // Copy the local solution to the global solution\n #pragma omp critical\n {\n for (size_t i = 0; i < N; i++) {\n x[i] += x_local[i];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "serial", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. 
x and b have N elements.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n x.resize(N);\n std::vector y(N);\n for (size_t i = 0; i < N; i++) {\n y[i] = b[i];\n }\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] -= sum;\n x[i] = y[i] / A[i * N + i];\n }\n for (size_t i = N - 1; i > 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] -= sum;\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(Ab[j * (N + 1) + i]) > std::abs(Ab[pivot_row * (N + 1) + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N + 1; ++j) {\n std::swap(Ab[i * (N + 1) + j], Ab[pivot_row * (N + 1) + j]);\n }\n }\n\n // Eliminate the current column below the pivot row\n for (size_t j = i + 1; j < N; ++j) {\n double factor = Ab[j * (N + 1) + i] / Ab[i * (N + 1) + i];\n for (size_t k = 0; k < N + 1; ++k) {\n Ab[j * (N + 1) + k] -= factor * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Check if the system is consistent\n for (size_t i = 0; i < N; ++i) {\n if (std::abs(Ab[i * (N + 1) + N]) < 1e-9) {\n 
throw std::runtime_error(\"The system is inconsistent.\");\n }\n }\n\n // Solve the system by back substitution\n x.resize(N);\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = Ab[i * (N + 1) + N];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= Ab[i * (N + 1) + j] * x[j];\n }\n x[i] /= Ab[i * (N + 1) + i];\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n std::vector a = A;\n std::vector bb = b;\n for (size_t i = 0; i < N; i++) {\n size_t pivot = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(a[j * N + i]) > std::abs(a[pivot * N + i])) {\n pivot = j;\n }\n }\n if (pivot != i) {\n std::swap(a[i * N + i], a[pivot * N + i]);\n std::swap(bb[i], bb[pivot]);\n }\n for (size_t j = i + 1; j < N; j++) {\n double factor = a[j * N + i] / a[i * N + i];\n for (size_t k = i; k < N; k++) {\n a[j * N + k] -= factor * a[i * N + k];\n }\n bb[j] -= factor * bb[i];\n }\n }\n for (size_t i = N - 1; i < N; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += a[i * N + j] * x[j];\n }\n x[i] = (bb[i] - sum) / a[i * N + i];\n }\n}\n", + "\n\n std::vector c(N);\n for (size_t i = 0; i < N; i++) {\n c[i] = b[i];\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double t = A[j][i] / A[i][i];\n for (size_t k = 0; k < N; k++) {\n A[j][k] -= t * A[i][k];\n }\n c[j] -= t * c[i];\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n double t = c[i];\n for (size_t j = i + 1; j < N; j++) {\n t -= A[i][j] * x[j];\n }\n x[i] = t / A[i][i];\n }\n}\n", + "\n\n // Create a copy of A\n std::vector A_copy(A);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A_copy[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A_copy[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j * N + k] -= factor * A_copy[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Perform back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A_copy[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A_copy[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find pivot\n size_t pivot = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot * N + i])) {\n pivot = j;\n }\n }\n // Swap rows\n if (pivot != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot * N + j]);\n }\n std::swap(b[i], b[pivot]);\n }\n // Eliminate\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] 
* x[j];\n }\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector> aug(N, std::vector(N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n aug[i][j] = A[i * N + j];\n }\n aug[i][N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double div = aug[i][i];\n for (size_t j = 0; j <= N; ++j) {\n aug[i][j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows\n for (size_t j = i + 1; j < N; ++j) {\n double mult = aug[j][i];\n for (size_t k = 0; k <= N; ++k) {\n aug[j][k] -= mult * aug[i][k];\n }\n }\n }\n\n // Extract the solution from the augmented matrix\n for (size_t i = 0; i < N; ++i) {\n x[i] = aug[i][N];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gauss-Jordan elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows to zero out the ith column\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; i++) {\n x[i] = Ab[i * (N + 1) + N];\n }\n}\n", + "\n\n std::vector A_copy = A;\n std::vector b_copy = b;\n for (size_t i = 0; i < N; i++) {\n double factor = A_copy[i][i];\n for (size_t j = 0; j < N; j++) {\n A_copy[i][j] /= factor;\n }\n b_copy[i] /= factor;\n for (size_t j = i + 1; j < N; j++) {\n double factor2 = A_copy[j][i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j][k] -= factor2 * A_copy[i][k];\n }\n b_copy[j] -= factor2 * b_copy[i];\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A_copy[i][j] * x[j];\n }\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = b[i] - sum;\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; i++) {\n x[i] = 
Ab[i * (N + 1) + N];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap pivot row with current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector augmentedMatrix(N * (N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n augmentedMatrix[i * (N + 1) + j] = A[i * N + j];\n }\n augmentedMatrix[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = augmentedMatrix[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; ++j) {\n augmentedMatrix[i * (N + 1) + j] /= factor;\n }\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = 0; j < N; ++j) {\n if (i == j) continue;\n double factor = augmentedMatrix[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; ++k) {\n augmentedMatrix[j * (N + 1) + k] -= factor * augmentedMatrix[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; ++i) {\n x[i] = augmentedMatrix[i * (N + 1) + N];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i 
* N + i];\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from all other rows\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution from the augmented matrix\n for (size_t i = 0; i < N; i++) {\n x[i] = Ab[i * (N + 1) + N];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "hip", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\n__global__ void solveLinearSystem(const double *A, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] -= sum * A[row * N + col] / A[col * N + col];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x + hipGridDim_x * hipBlockIdx_y;\n if (tid < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[bid * N + i] * x[i];\n }\n x[tid] = (b[bid] - sum) / A[bid * N + tid];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n x[i] += A[i * N + j] * b[j];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n x[row] = b[row] / A[row * N + col];\n } else {\n x[row] -= A[row * N + col] * x[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n int col = hipBlockIdx_x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n 
sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n }\n __syncthreads();\n for (int k = row + 1; k < N; k++) {\n if (A[k * N + col] != 0) {\n double factor = A[k * N + col] / A[row * N + col];\n for (int j = 0; j < N; j++) {\n A[k * N + j] -= factor * A[row * N + j];\n }\n b[k] -= factor * b[row];\n }\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n }\n __syncthreads();\n for (int i = row + 1; i < N; i++) {\n double factor = A[i * N + col] / A[row * N + col];\n for (int k = 0; k < N; k++) {\n A[i * N + k] -= factor * A[row * N + k];\n }\n b[i] -= factor * b[row];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; ++k) {\n sum += A[row * N + k] * x[k];\n }\n A[row * N + col] -= A[col * N + col] * sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[col] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n x[i] += A[i * N + j] * b[j];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row 
= blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. Store the result in x on rank 0.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the linear system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solutions from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n // Create a matrix object\n Eigen::MatrixXd A_eigen(N, N);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n A_eigen(i, j) = A[i * N + j];\n }\n }\n\n // Create a vector object for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; i++) {\n b_eigen(i) = b[i];\n }\n\n // Solve the system\n Eigen::VectorXd x_eigen = A_eigen.colPivHouseholderQr().solve(b_eigen);\n\n // Copy the solution to x\n for (size_t i = 0; i < N; i++) {\n x[i] = x_eigen(i);\n }\n }\n\n // Broadcast x to all ranks\n MPI_Bcast(x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "_t*", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector local_x(N, 0);\n std::vector local_b(N, 0);\n for (size_t i = 0; i < N; i++) {\n local_b[i] = b[i];\n }\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_x[i] += A[i * N + j] * local_b[j];\n }\n }\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct solution\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves for a portion of x\n size_t chunk_size = (N + size - 1) / size;\n size_t start_idx = rank * chunk_size;\n size_t end_idx = std::min(start_idx + chunk_size, N);\n\n std::vector local_x(chunk_size);\n for (size_t i = start_idx; i < end_idx; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n local_x[i - start_idx] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solutions from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Rank 0 gathers the solution\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N], N, MPI_DOUBLE, 0, 0, 
MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n x.resize(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n }\n\n // Gather the solution from rank 0 to all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Solve the linear system on rank 0\n if (rank == 0) {\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i > 0; --i) {\n for (size_t j = i - 1; j >= 0; --j) {\n b[j] -= A[j * N + i] * b[i];\n }\n }\n\n // Store the solution in x\n x = b;\n }\n\n // Broadcast the solution to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system locally on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the solution\n std::vector local_x(N);\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local 
solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on rank 0\n if (rank == 0) {\n // Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (int j = i + 1; j < N; j++) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n }\n\n // Broadcast the solution to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Each rank solves a part of the system\n for (size_t i = rank; i < N; i += size) {\n double factor = A[i * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Broadcast the updated row\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, rank, comm);\n MPI_Bcast(&b[i], 1, MPI_DOUBLE, rank, comm);\n\n // Update the remaining rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Gather the solution from all ranks\n MPI_Gather(&b[rank], 1, MPI_DOUBLE, &x[0], 1, MPI_DOUBLE, 0, comm);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system locally\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions to rank 0\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system on rank 0\n if (rank == 0) {\n // Create a matrix object\n Eigen::MatrixXd A_eigen(N, N);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n A_eigen(i, j) = A[i * N + j];\n }\n }\n\n // Create a vector for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; i++) {\n b_eigen(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_eigen = A_eigen.colPivHouseholderQr().solve(b_eigen);\n\n // Copy the result to x\n for (size_t i = 0; i < N; i++) {\n x[i] = x_eigen(i);\n }\n }\n\n // Broadcast the result to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Check if the input is valid\n if (A.size() != N * N || b.size() != N || x.size() != N) {\n throw std::invalid_argument(\"Invalid input\");\n }\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (int i 
= N - 1; i >= 0; i--) {\n double sum = 0;\n for (int j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "cuda", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\n__global__ void solveLinearSystem(const double *A, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * 
N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + ":", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n for (int k = 0; k < N; k++) {\n x[row] -= A[row * N + k] * b[k];\n }\n x[row] /= A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int i = 0; i < col; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n } else if (row > col) {\n double sum = 0;\n for (int i = 0; i < col; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n for (int i = col + 1; i < row; i++) {\n A[row * N + i] -= A[row * N + col] * A[i * N + col] / A[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n } else {\n x[row] -= A[row * N + col] * x[col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n x[row] = b[row] / A[row * N 
+ col];\n }\n for (int i = col + 1; i < N; i++) {\n x[row] -= A[row * N + i] * x[i] / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. Store the result in x on rank 0.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Each rank solves for a portion of x\n #pragma omp parallel for\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data() + rank * N / size, N / size, MPI_DOUBLE, x.data(), N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Compute the inverse of A on rank 0.\n std::vector invA;\n if (my_rank == 0) {\n invA.resize(N * N);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n invA[i * N + j] = (i == j) ? 
1.0 : 0.0;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n double inv = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n invA[i * N + j] *= inv;\n A[i * N + j] *= inv;\n }\n b[i] *= inv;\n }\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n invA[j * N + k] -= factor * invA[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n for (size_t i = N - 1; i > 0; --i) {\n for (size_t j = i - 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n invA[j * N + k] -= factor * invA[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Broadcast the inverse of A to all ranks.\n MPI_Bcast(invA.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on all ranks.\n x.resize(N);\n for (size_t i = 0; i < N; ++i) {\n x[i] = 0.0;\n for (size_t j = 0; j < N; ++j) {\n x[i] += invA[i * N + j] * b[j];\n }\n }\n\n // Gather the results to rank 0.\n if (my_rank == 0) {\n for (size_t i = 1; i < num_ranks; ++i) {\n std::vector recv_x(N);\n MPI_Recv(recv_x.data(), N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N; ++j) {\n x[j] += recv_x[j];\n }\n }\n } else {\n MPI_Send(x.data(), N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Rank 0 gathers the solution from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the solution\n std::vector local_x(N);\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_x[i] += A[i * N + j] * b[j];\n }\n }\n\n // Gather the local solutions into the global solution\n MPI_Allgather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete solution\n if (rank == 0) {\n for (size_t i = 1; i < size; i++) {\n for (size_t j = 0; j < N; j++) {\n x[j] += local_x[i * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves the system for a subset of the rows\n size_t rowsPerRank = (N + size - 1) / size;\n size_t startRow = rank * 
rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n // Solve the system for the subset of rows\n std::vector localX(endRow - startRow);\n for (size_t i = startRow; i < endRow; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n localX[i - startRow] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(localX.data(), endRow - startRow, MPI_DOUBLE, x.data(), endRow - startRow, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Each rank computes its part of the solution\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n \n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system on each rank\n std::vector local_x(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row.\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row.\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n\n // Eliminate the current column.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n std::vector local_x(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Check if the input is valid\n if (A.size() != N * N || b.size() != N || x.size() != N) {\n throw std::invalid_argument(\"Invalid input\");\n }\n\n // Initialize x to 0\n std::fill(x.begin(), x.end(), 0.0);\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for 
(size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(x.data(), N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the local portion of the solution\n std::vector local_x(N, 0.0);\n for (size_t i = my_rank; i < N; i += num_procs) {\n for (size_t j = 0; j < N; ++j) {\n local_x[i] += A[i * N + j] * b[j];\n }\n }\n\n // Reduce the local solutions to get the global solution\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves for a subset of the rows of x.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Solve for the subset of rows of x on this rank.\n for (size_t i = start_row; i < end_row; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks.\n MPI_Gather(x.data() + start_row, end_row - start_row, MPI_DOUBLE,\n x.data(), end_row - start_row, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the matrix\n size_t local_N = (N + size - 1) / size;\n\n // Allocate memory for the local matrix and vector\n std::vector local_A(local_N * N);\n std::vector local_b(local_N);\n\n // Scatter the matrix and vector to the local arrays\n MPI_Scatter(A.data(), N * N, MPI_DOUBLE, local_A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(b.data(), N, MPI_DOUBLE, local_b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the local linear system\n std::vector 
local_x(local_N);\n for (size_t i = 0; i < local_N; i++) {\n double sum = 0.0;\n for (size_t j = 0; j < N; j++) {\n sum += local_A[i * N + j] * local_x[j];\n }\n local_x[i] = (local_b[i] - sum) / local_A[i * N + i];\n }\n\n // Gather the local solutions to the global vector\n MPI_Gather(local_x.data(), local_N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Normalize the global solution\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n x[i] /= size;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n local_x[i] = b[i];\n for (size_t j = 0; j < i; ++j) {\n local_x[i] -= A[i * N + j] * x[j];\n }\n }\n for (size_t i = N - 1; i >= 0; --i) {\n for (size_t j = i + 1; j < N; ++j) {\n local_x[i] -= A[i * N + j] * local_x[j];\n }\n local_x[i] /= A[i * N + i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the linear system in parallel\n #pragma omp parallel\n {\n // Each thread solves a part of the system\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row.\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row.\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n\n // Eliminate the current column.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(Kokkos::View &A, Kokkos::View &B, Kokkos::View &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; ++j) {\n double sum = 0.0;\n for (int k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k) * B(k, j);\n }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n 
});\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. 
The matrices are stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma opm parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (int i = 0; i < M; ++i) {\n for (int j = 0; j < N; ++j) {\n C[i * N + j] = 0;\n for (int k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n 
}\n}\n", + "\n\n C.resize(M * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n C.resize(M * N);\n omp_set_num_threads(omp_get_max_threads());\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "serial", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. 
The matrices are stored in row-major.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n C[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A[i * K + k] * B[k * N + j]; }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 
0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "hip", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an MxN grid of threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\n__global__ void gemm(const double *A, const double *B, double *C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) { sum += A[row * K + k] * B[k * N + col]; }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n size_t row = blockIdx.x;\n size_t col = blockIdx.y;\n C[row * N + col] = 0;\n for (size_t k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = 
blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n C[row * N + col] = 0;\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0.0;\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n size_t col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and B. 
Store the result in C on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Compute the local portion of C.\n std::vector local_C(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to the global C on rank 0.\n MPI_Reduce(local_C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0035445926, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0355674598, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Get the number of processes and the rank of the current process\n int num_procs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the number of rows each process will compute\n int rows_per_proc = M / num_procs;\n\n // Compute the starting row for each process\n int start_row = rank * rows_per_proc;\n\n // Compute the ending row for each process\n int end_row = start_row + rows_per_proc;\n\n // Initialize the result matrix on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n\n // Compute the partial result on each process\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n double dot_product = 0;\n for (int k = 0; k < K; ++k) {\n dot_product += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = dot_product;\n }\n }\n\n // Gather the partial results on rank 0\n if (rank != 0) {\n MPI_Gather(&C[start_row * N], rows_per_proc * N, MPI_DOUBLE, &C[0], rows_per_proc * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0057208512, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.031177714, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0035643466, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.034574747, + 
"num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Compute the local part of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0035464886, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0359766638, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0056517338, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0312798162, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Broadcast M, K, and N to all ranks.\n MPI_Bcast(&M, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n MPI_Bcast(&K, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n MPI_Bcast(&N, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Compute the local portion of C.\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Reduce the local portions of C to rank 0.\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0056529596, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0344643176, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Broadcast A and B to all ranks\n MPI_Bcast(&A[0], M * K, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&B[0], K * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute C on each rank\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Gather C from all ranks to rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, 
MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Compute the local part of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0036754176, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0356713934, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local part of C\n std::vector local_C(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n MPI_Gather(local_C.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_C.data(), M * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0036097438, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0357612264, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and the number of processes\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the local size of the matrix C\n size_t local_M = M / num_procs;\n if (rank == num_procs - 1) {\n local_M += M % num_procs;\n }\n\n // Compute the local part of the matrix C\n std::vector local_C(local_M * N);\n for (size_t i = 0; i < local_M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N + j] = sum;\n }\n }\n\n // Gather the local parts of the matrix C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 0; i < num_procs; ++i) {\n size_t local_M = M / num_procs;\n if (i == num_procs - 1) {\n local_M += M % num_procs;\n }\n MPI_Recv(&C[i * local_M * N], local_M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_C[0], local_M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + 
"generated_output": "\n\n // Compute the local portion of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to the root rank.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0035771086, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.035656135, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Compute the local part of C.\n std::vector localC(M * N, 0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0036700196, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0355872338, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(M * N);\n MPI_Recv(recv_buf.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n C[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(C.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0056511084, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0339095456, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.016824244, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085018179, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0052875261, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0041608541625, + "num_procs": 16 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K 
+ k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.003554837, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.034891626, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Compute the local part of C\n std::vector localC(M * N, 0);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0035656492, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0356292746, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Each rank has a complete copy of A and B.\n // Store the result in C on rank 0.\n if (rank == 0) {\n C.clear();\n C.reserve(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C.push_back(sum);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result matrix\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local results into the global result matrix\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.003583643, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03470023, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Compute the local part of C.\n std::vector localC(M * N, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0035478778, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0356348028, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the matrix C\n size_t local_M = M / size;\n size_t local_N = N;\n\n // Compute the local part of the matrix C\n std::vector local_C(local_M * local_N);\n for (size_t i = 0; i < local_M; ++i) {\n for (size_t j = 0; j < local_N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * local_N + j] = sum;\n }\n }\n\n // Gather the local parts of the matrix C into the global matrix C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(local_M * local_N);\n MPI_Recv(recv_buf.data(), local_M * local_N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < local_M * local_N; ++j) {\n C[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(local_C.data(), local_M * local_N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and the number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N / size);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N / size + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.005653936, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0355486628, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "cuda", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched on an MxN grid of threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\n__global__ void gemm(const double *A, const double *B, double *C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t row = blockIdx.x;\n size_t col = blockIdx.y;\n size_t i = threadIdx.x;\n size_t j = threadIdx.y;\n C[row * N + col] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n double sum = 0;\n for (k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n double sum = 0;\n for (int k = 0; k < K; 
++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "(", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.y * blockDim.y + threadIdx.y;\n int j = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M && j < N) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and B. 
Store the result in C on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C to rank 0\n MPI_Gather(localC.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to zero\n C.assign(M * N, 0.0);\n\n // Compute the product of A and B in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(C.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the product of A and B\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n // Initialize C to zero\n C.assign(M * N, 0.0);\n\n // Compute C using MPI and OpenMP\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce C to rank 0\n MPI_Reduce(C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the local part of C\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the partial sum for each element of C\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the partial sums from all ranks to rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.resize(M * N);\n std::fill(C.begin(), C.end(), 0);\n\n // Compute the local part of C\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the 
local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; i++) {\n std::vector recvC(M * N / size);\n MPI_Recv(recvC.data(), M * N / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N / size; j++) {\n C[j] += recvC[j];\n }\n }\n } else {\n MPI_Send(localC.data(), M * N / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; ++i) {\n std::vector recvC(M * N / size);\n MPI_Recv(recvC.data(), M * N / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n std::copy(recvC.begin(), recvC.end(), C.begin() + i * M * N / size);\n }\n } else {\n MPI_Send(localC.data(), M * N / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n // Initialize C to 0\n C.assign(M * N, 0.0);\n\n // Perform the matrix multiplication in parallel\n #pragma opm parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result matrix\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local results into the global result matrix\n MPI_Gather(&local_C[0], M * N / size, MPI_DOUBLE, &C[0], M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot_product = 0.0;\n for (size_t k = 0; k < K; ++k) {\n dot_product += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = dot_product;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local portion of C\n std::vector local_C(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to rank 0\n MPI_Reduce(local_C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n 
double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix, x has N elements, and y has M elements.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(Kokkos::View &A, Kokkos::View &x, Kokkos::View &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) { y[i] += A(i, j) * x[j]; }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M),\n KOKKOS_LAMBDA(const size_t i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "mv", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n double dot = 0.0;\n for (size_t j = 0; j < N; ++j) { dot += A(i, j) * x(j); }\n y(i) = dot;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", 
+ "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA (const int i) {\n y(i) = 0;\n for (int j = 0; j < N; ++j) {\n y(i) += A(i, j) * x(j);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma openmp parallel for\n for (size_t i = 
0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n // Initialize y to 0\n y.assign(M, 0);\n\n // Compute the matrix-vector product in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double dot_product = 0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A[i * N + j] * x[j];\n }\n y[i] = dot_product;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0.0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "serial", + "prompt": "/* Multiply the matrix A by the vector x. 
Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) { dot += A[i * N + j] * x[j]; }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) {\n dot += A[i * N + j] * x[j];\n }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) {\n dot += A[i * N + j] * x[j];\n }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "hip", + "prompt": "/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least M threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\n__global__ void gemv(const double *A, const double *x, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int block_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = bid * block_size + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int i = tid / N;\n int j = tid % N;\n if (i < M && j < N) {\n y[i] += A[i * N + j] * x[j];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = bid * blockDim.x + tid;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; ++col) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) {\n y[tid] += A[tid * N + j] * x[j];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; ++col) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0.0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n int i = tid + bid * blockSize;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n size_t i = blockIdx.x * 
blockDim.x + threadIdx.x;\n if (i < M) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) { sum += A[i * N + j] * x[j]; }\n y[i] = sum;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n\n double sum = 0;\n for (int i = tid; i < M; i += blockSize) {\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n }\n y[tid] = sum;\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n\n y[tid] = 0;\n for (int j = 0; j < N; j++) {\n y[tid] += A[tid + bid * blockSize + j * M] * x[j];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and x. Store the result in y on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector multiplication\n std::vector local_y(M / size);\n for (size_t i = rank * (M / size); i < (rank + 1) * (M / size); ++i) {\n local_y[i - rank * (M / size)] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i - rank * (M / size)] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the local results into the global vector y on rank 0\n MPI_Gather(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0319362218, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.032038513, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0174212588, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00918412765, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial dot product on each rank\n std::vector partial_y(M, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n partial_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the partial dot products to get the final result\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank != 0) {\n y.clear();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0371713184, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0355724448, + "num_procs": 1 + 
}, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.031896197, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.032057663, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product.\n std::vector local_y(M);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the local results into the global vector y on rank 0.\n MPI_Gather(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0320316036, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.032072579, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M / size);\n for (size_t i = 0; i < M / size; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": 
false, + "best_sequential_runtime": 0.0320081896, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.032180129, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.032001924, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0321933624, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Each rank computes a partial sum of the dot product\n std::vector partial_sums(M, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n partial_sums[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sums(M, 0.0);\n MPI_Reduce(partial_sums.data(), global_sums.data(), M, MPI_DOUBLE, MPI_SUM, 0, comm);\n\n // Store the result in y on rank 0\n if (rank == 0) {\n y = global_sums;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.032629162, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0328860904, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.031907412, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0321773356, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": 
false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0319253856, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0319562612, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0320036828, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0320688258, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n y = local_y;\n for (int i = 1; i < size; ++i) {\n std::vector recv_y(M);\n MPI_Recv(recv_y.data(), M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M; ++j) {\n y[j] += recv_y[j];\n }\n }\n } else {\n MPI_Send(local_y.data(), M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0320500722, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0320527434, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0319966216, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0321970808, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, 
MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0321221476, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0321617284, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0319389912, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0320685168, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0320239828, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0320797766, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, comm);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0318984548, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0320488358, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) { local_y[i] += A[i * N + j] * x[j]; }\n }\n\n MPI_Reduce(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the dot product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local dot products into the global vector y on rank 0\n MPI_Reduce(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "cuda", + "prompt": "/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use CUDA to compute in parallel. The kernel is launched with at least M threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\n__global__ void gemv(const double *A, const double *x, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n int tid = block.thread.x;\n int row = block.block.x;\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) {\n y[row] += A[row * N + col] * x[col];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int i = 0; i < N; i++) {\n y[tid] += A[tid * N + i] * x[i];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n for (int i = bid * bid_size + tid; i < M; i += bid_size) {\n double sum = 0;\n for (int j = 0; j < N; j++) { sum += A[i * N + j] * x[j]; }\n y[i] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n y[row] = 0;\n for (int k = 0; k < N; k++) {\n y[row] += A[row * N + k] * x[k];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i 
* N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n y[row] = 0;\n for (int i = 0; i < N; i++) {\n y[row] += A[row * N + i] * x[i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n\n double sum = 0;\n\n for (int i = bid * bid_size + tid; i < M; i += grid_size * bid_size) {\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n }\n\n y[tid] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = bid * blockDim.x + tid;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and x. 
Store the result in y on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n\n // Gather the results on rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M, 0.0);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, 
size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root rank\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute local dot products\n std::vector local_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce local dot products to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (size_t i = 0; 
i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n\n // Gather the results on rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize y to 0 on all ranks\n y.assign(M, 0.0);\n\n // Compute the local portion of y on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A(k, j) -= A(k, i) * A(i, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) -= A(j, i) * A(i, i);\n }\n });\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n // Iterate over the columns of the matrix\n for (size_t j = 0; j < N; ++j) {\n // Divide the elements of the current column by the diagonal element\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n A(i, j) /= A(j, j);\n });\n\n // Subtract the current column from the remaining columns\n for (size_t k = j + 1; k < N; ++k) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n A(i, k) -= A(i, j) * A(j, k);\n });\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A(i, j) /= A(i, i);\n } else if (i > j) {\n A(i, j) -= A(i, j) * A(j, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = 0; k < i; k++) {\n A(j, k) -= multiplier * A(i, k);\n }\n A(j, i) = multiplier;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = 
i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; ++k) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j][i] /= A[i][i];\n }\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::RangePolicy(i + 1, N),\n KOK_LAMBDA(const Kok\u30b3\u30af::IndexType &j) {\n for (size_t k = 0; k < i; k++) {\n A[j][i] -= A[j][k] * A[i][k];\n }\n });\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) = A(j, i) / A(i, i);\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A(k, j) = A(k, j) - A(k, i) * A(i, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n double m = A(j, i) / A(i, i);\n for (size_t k = i; k < N; k++) {\n A(j, k) -= m * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; ++k) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += A(i, k) * A(k, j);\n }\n A(i, j) -= sum;\n } else if (i > j) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A(i, k) * A(k, j);\n }\n A(i, j) = (A(i, j) - sum) / A(j, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. 
\n A is an NxN matrix stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; k++) {\n // Normalize the kth row\n double norm = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k; j < N; j++) {\n A[k * N + j] /= norm;\n }\n\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k];\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; ++k) {\n // Normalize the k-th row\n double norm = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k; j < N; ++j) {\n A[k * N + j] /= norm;\n }\n\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n double factor = A[i * N + k];\n for (size_t j = k; j < N; ++j) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n double factor = A[i * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= A[i * N + k] * factor;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n // Divide the current row by the diagonal element\n double factor = A[i * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n\n // Subtract multiples of the current row from all the rows below it\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + ".", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < 
N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n#pragma omp parallel for\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = i; j < N; j++) {\n#pragma omp parallel for\n for (size_t k = 0; k < i; k++) {\n A[j * N + i] -= A[j * N + k] * A[k * N + i];\n }\n A[j * N + i] /= A[i * N + i];\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; ++k) {\n // Compute the multiplier for row k\n double multiplier = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k + 1; j < N; ++j) {\n A[j * N + k] /= multiplier;\n }\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "serial", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. 
\n A is an NxN matrix stored in row-major.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + i] * A[j * N + j];\n } else {\n A[i * N + j] /= A[j * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; ++k) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] = A[j * N + i] / A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] = A[j * N + k] - A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + j];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + j] * A[j * N + j];\n } else {\n A[i * N + j] /= A[j * N + j];\n A[i * N + j] *= -1;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) 
{\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i * N + k] * A[k * N + j];\n }\n A[i * N + j] = (A[i * N + j] - sum) / A[j * N + j];\n }\n for (size_t j = i; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += A[i * N + k] * A[k * N + j];\n }\n A[i * N + j] = A[i * N + j] - sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A[j][i] / A[i][i];\n for (size_t k = i; k < N; k++) {\n A[j][k] -= multiplier * A[i][k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1;\n } else if (i > j) {\n A[i * N + j] = 0;\n }\n }\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "hip", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\n__global__ void luFactorize(double *A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x + blockIdx.y * gridDim.x;\n int col = threadIdx.x + threadIdx.y * blockDim.x;\n if (row < N && col < N) {\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double temp = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= temp * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = A[row * N + col] - sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x + blockIdx.y * gridDim.x;\n int col = threadIdx.x + threadIdx.y * blockDim.x;\n if (row < col) {\n A[row * N + col] = 0.0;\n } else if (row == col) {\n A[row * N + col] = 1.0;\n } else {\n A[row * N + col] = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= A[row * N + col] * A[col * N + k];\n }\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * gridDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.x;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double temp = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= temp * A[j * N + k];\n }\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double temp = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= temp * 
A[j * N + k];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[col * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[col * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int tid = blockIdx.x + blockIdx.y * gridDim.x;\n int i = tid / N;\n int j = tid % N;\n if (i < j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; ++k) { A[k * N + j] -= A[k * N + i] * A[i * N + j]; }\n } else if (i == j) {\n A[i * N + j] = 1;\n for (int k = i + 1; k < N; ++k) { A[k * N + j] /= A[i * N + i]; }\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n\n if (row == col) {\n double pivot = A[row * N + col];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] /= pivot;\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col + 1; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int k = row + 1; k < N; k++) {\n A[k * N + col] -= A[k * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; 
k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double factor = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= factor * A[j * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. Store the result in A on rank 0.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Forward elimination\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row to all other ranks\n MPI_Bcast(&A[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row below the k-th row\n for (size_t i = k + 1; i < N; i++) {\n // Compute the multiplier\n double multiplier = A[i * N + k] / A[k * N + k];\n\n // Subtract the k-th row from the i-th row\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= multiplier * A[k * N + j];\n }\n }\n }\n\n // Gather the result to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // For each column j in row i\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in the lower triangular matrix L\n if (i > j) {\n // Subtract the product of the ith row of L and the jth column of U from the ith row of A\n for (size_t k = 0; k < j; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n // If i <= j, then A[i][j] is in the upper triangular matrix U\n else {\n // Divide the ith row of A by the jth column of L\n for (size_t k = 0; k < j; k++) {\n A[i * N + j] /= A[j * N + k];\n }\n\n // Subtract the product of the ith row of L and the jth column of U from the ith row of A\n for (size_t k = j + 1; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + 
"are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations to eliminate elements below the diagonal\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // For each column j\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in the lower triangular matrix L\n if (i > j) {\n // Compute A[i][j] as A[i][j] - A[i][0] * A[0][j]\n A[i * N + j] -= A[i * N] * A[j];\n }\n // Otherwise, A[i][j] is in the upper triangular matrix U\n else {\n // Compute A[i][j] as A[i][j] - A[i][0] * A[0][j] / A[0][0]\n A[i * N + j] -= A[i * N] * A[j] / A[0];\n }\n }\n\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j 
* N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N + i], 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // For each column j\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in L\n if (i > j) {\n // Compute A[i][j]\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i][k] * A[k][j];\n }\n A[i][j] = (A[i][j] - sum) / A[j][j];\n }\n // If i <= j, then A[i][j] is in U\n else {\n // Compute A[i][j]\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i][k] * A[k][j];\n }\n A[i][j] = A[i][j] - sum;\n }\n }\n\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Broadcast A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization on rank 0\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n\n // Gather the result from rank 0 to all other ranks\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Normalize the k-th row of A\n double pivot = A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[k * N + j] /= pivot;\n }\n\n // Update the remaining rows of A\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all ranks\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix A from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0979193814, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1308999088, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1339055373, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1360028418, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.14120273735, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Forward elimination\n for (size_t i = 0; i < N; i++) {\n // Broadcast the pivot row to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all other ranks\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // Rank 0 broadcasts the ith row of L to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each column j\n for (size_t j = i; j < N; j++) {\n // Compute A[i][j] -= A[i][k] * A[k][j] for k < i\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n\n // Divide A[i][j] by A[i][i]\n A[i * N + j] /= A[i * N + i];\n }\n }\n\n // Gather the result from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + j];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + j] * A[j * N + j];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + 
"generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i in A\n for (size_t i = 1; i < N; i++) {\n // If rank 0, compute the multiplier for row i\n double multiplier = 0;\n if (rank == 0) {\n multiplier = A[i * N] / A[0];\n }\n\n // Broadcast the multiplier to all ranks\n MPI_Bcast(&multiplier, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each element in row i, subtract the corresponding element in row 0 multiplied by the multiplier\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] -= multiplier * A[j];\n }\n }\n\n // If rank 0, divide each element in row 0 by A[0][0]\n if (rank == 0) {\n for (size_t j = 0; j < N; j++) {\n A[j] /= A[0];\n }\n }\n\n // For each row i in A\n for (size_t i = 1; i < N; i++) {\n // For each element in row i, divide by A[i][i]\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n\n // For each row k after row i, subtract the corresponding element in row i multiplied by A[k][i]\n for (size_t k = i + 1; k < N; k++) {\n double multiplier = A[k * N + i];\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= multiplier * A[i * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(A.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Each rank computes its own row of L and U\n for (size_t i = 1; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A[i * N + j] /= A[j * N + j];\n }\n for (size_t j = i; j < N; j++) {\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks into rank 0\n MPI_Gather(A.data(), N * N, MPI_DOUBLE, A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "cuda", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the 
results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\n__global__ void luFactorize(double *A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double temp = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= temp * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row < col) {\n A[row * N + col] = A[row * N + col] / A[row * N + row];\n } else if (row == col) {\n for (int k = 0; k < row; k++) {\n A[row * N + col] -= A[row * N + k] * A[k * N + col];\n }\n } else {\n for (int k = 0; k < col; k++) {\n A[row * N + col] -= A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] /= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i == j) {\n 
A[i * N + i] /= A[i * N + j];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n for (int k = j; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "(", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) return;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else if (row > col) {\n double factor = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= factor * A[col * N + k];\n }\n A[row * N + col] = factor;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else {\n double factor = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= factor * A[col * N + k];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = 
blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col + 1; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
Store the result in A on rank 0.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "(5.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n // Divide the ith row by A[i][i]\n double div = A[i * N + i];\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= div;\n }\n\n // Subtract the ith row from all rows below it\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n }\n\n // Broadcast the result to all ranks\n MPI_Bcast(A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization using Gaussian elimination\n for (size_t k = 0; k < N; k++) {\n // Pivot if necessary\n size_t max_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > std::abs(A[max_row * N + k])) {\n max_row = i;\n }\n }\n if (max_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[max_row * N + j]);\n }\n }\n\n // Eliminate the k-th column below the k-th row\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&A[i * N * N], N * N, MPI::DOUBLE, i, 0);\n }\n } else {\n MPI::COMM_WORLD.Send(&A[0], N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // MPI rank\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // MPI size\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Number of rows per rank\n int rows_per_rank = N / size;\n\n // Local matrix\n std::vector local_A(rows_per_rank * N);\n\n // Scatter the matrix to all ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Local lu factorization\n for (size_t i = 0; i < rows_per_rank; i++) {\n for (size_t j = i + 1; j < N; j++) {\n local_A[i * N + j] /= local_A[i * N + i];\n }\n for (size_t k = i + 1; k < rows_per_rank; k++) {\n for (size_t j = i + 1; j < N; j++) {\n local_A[k * N + j] -= local_A[k * N + i] * local_A[i * N + j];\n }\n }\n }\n\n // Gather the results back to rank 0\n MPI_Gather(&local_A[0], rows_per_rank * N, MPI_DOUBLE, &A[0], rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n 
if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= multiplier * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the rank and number of threads\n int rank = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n // Compute the block size for each thread\n size_t block_size = N / num_threads;\n\n // Each thread factorizes its own block of rows\n for (size_t i = rank * block_size; i < (rank + 1) * block_size; i++) {\n // Pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap pivot row with current row\n if (pivot_row != i) {\n for (size_t k = 0; k < N; k++) {\n std::swap(A[i * N + k], A[pivot_row * N + k]);\n }\n }\n\n // Eliminate lower triangular elements\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n }\n\n // Gather the factorized matrix from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix A from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Divide the current row by the pivot element\n double pivot = A[i * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= pivot;\n }\n\n // Subtract multiples of the current row from the remaining rows\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (MPI_Comm_rank(MPI_COMM_WORLD) == 0) {\n for (int i = 1; i < 
MPI_Comm_size(MPI_COMM_WORLD); i++) {\n MPI_Recv(&A[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the rank and number of threads\n int rank = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n // Compute the block size for each thread\n size_t block_size = N / num_threads;\n\n // Each thread factorizes its own block of the matrix\n for (size_t i = rank * block_size; i < (rank + 1) * block_size; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n }\n\n // Gather the results from all ranks and store them in A on rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform Gaussian elimination\n for (size_t k = 0; k < N; k++) {\n // Pivot row\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > std::abs(A[pivot_row * N + k])) {\n pivot_row = i;\n }\n }\n\n // Swap pivot row with row k\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n }\n }\n\n // Eliminate non-zero elements below pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Get the number of processes\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes a part of the matrix\n size_t chunk_size = N / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = start_row + chunk_size;\n\n // Perform LU factorization on the assigned part of the matrix\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < i; j++) {\n double multiplier = A[i][j] / A[j][j];\n for (size_t k = j; k < N; k++) {\n A[i][k] -= multiplier * A[j][k];\n }\n }\n for (size_t j = i; j < N; j++) {\n A[i][j] /= A[i][i];\n }\n }\n\n // Gather the results from all processes\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * chunk_size * N], chunk_size * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[start_row * N], chunk_size * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows per process\n int 
rowsPerProcess = N / size;\n\n // Compute the starting row index for this process\n int startRow = rank * rowsPerProcess;\n\n // Compute the ending row index for this process\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Factorize the submatrix of A owned by this process\n for (int i = startRow; i < endRow; ++i) {\n // Compute the pivot element\n double pivot = A[i][i];\n\n // Broadcast the pivot element to all other processes\n MPI_Bcast(&pivot, 1, MPI_DOUBLE, rank, MPI_COMM_WORLD);\n\n // Eliminate the elements below the pivot element\n for (int j = i + 1; j < N; ++j) {\n A[j][i] -= pivot * A[j][i] / A[i][i];\n }\n }\n\n // Reduce the matrix A to upper triangular form\n for (int k = 0; k < N - 1; ++k) {\n // Compute the pivot element\n double pivot = A[k][k];\n\n // Broadcast the pivot element to all other processes\n MPI_Bcast(&pivot, 1, MPI_DOUBLE, rank, MPI_COMM_WORLD);\n\n // Eliminate the elements above the pivot element\n for (int i = 0; i < k; ++i) {\n A[i][k] -= pivot * A[i][k] / A[k][k];\n }\n }\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Broadcast the pivot row to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all ranks\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 performs the row reduction on the ith row\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int myRank, numProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numProcs);\n\n // Each rank computes its own part of the matrix\n #pragma omp parallel for\n for (size_t i = myRank; i < N; i += numProcs) {\n for (size_t j = 0; j < i; j++) {\n A[i * N + j] /= A[j * N + j];\n for (size_t k = j + 1; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n }\n\n // Gather the 
results to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize L and U\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n double max = std::abs(A[k * N + k]);\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > max) {\n max = std::abs(A[i * N + k]);\n pivot_row = i;\n }\n }\n\n // Swap the pivot row with the k-th row\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n }\n }\n\n // Divide the pivot row by the pivot element\n double pivot = A[k * N + k];\n for (size_t j = 0; j < N; j++) {\n A[k * N + j] /= pivot;\n }\n\n // Subtract multiples of the pivot row from the other rows\n for (size_t i = 0; i < N; i++) {\n if (i != k) {\n double factor = A[i * N + k];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n }\n\n // Gather the result to rank 0\n if (rank != 0) {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(N * N);\n MPI_Recv(&recv_buf[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n A[j] += recv_buf[j];\n }\n }\n }\n}\n", + "\n\n // Copy the matrix A to rank 0\n MPI_Allgather(MPI_IN_PLACE, 0, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_WORLD);\n\n // Perform LU factorization on rank 0\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] -= A[i * N + i] * A[j * N + i];\n }\n\n for (size_t k = i + 1; k < N; k++) {\n for (size_t l = i + 1; l < N; l++) {\n A[k * N + l] -= A[k * N + i] * A[l * N + i];\n }\n }\n }\n\n // Scatter the factorized matrix A from rank 0 to all other processes\n MPI_Scatter(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_WORLD);\n}\n", + "\n for (size_t k = 0; k < N; ++k) {\n // MPI_Bcast the k-th row of A to all ranks.\n MPI_Bcast(&A[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Update the k-th row of A on all ranks.\n for (size_t j = k + 1; j < N; ++j) {\n A[k * N + j] /= A[k * N + k];\n }\n\n // Update the remaining rows of A on all ranks.\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n\n // MPI_Reduce the updated A from all ranks to rank 0.\n MPI_Reduce(&A[0], &A[0], N * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* For each value in the array x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(Kokkos::View const& x, Kokkos::View &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n auto rank = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n auto sorted_x = x;\n Kokkos::sort(sorted_x.data(), sorted_x.extent(0));\n ranks(i) = Kokkos::find(sorted_x, x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::find(sorted_x, x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n using policy_t = Kokkos::RangePolicy;\n Kokkos::parallel_for(policy_t(x.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n auto rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using rank_functor = Kokkos::Functor>;\n\n Kokkos::parallel_for(\n \"ranks\", Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = Kokkos::rank(x, x(i), Kokkos::greater());\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n 
Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) > x(j)) { ranks(i) += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) { ranks(i) += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) {\n ranks(i) += 1;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x);\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n std::vector sorted_x(x.data(), x.data() + x.extent(0));\n std::sort(sorted_x.begin(), sorted_x.end());\n ranks(i) = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x(i)));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = x;\n std::sort(sorted_x.data(), sorted_x.data() + sorted_x.extent(0));\n ranks(i) = std::distance(sorted_x.data(),\n std::lower_bound(sorted_x.data(), sorted_x.data() + sorted_x.extent(0), x(i)));\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "omp", + "prompt": "#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs;\n pairs.reserve(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs.push_back(std::make_pair(x[i], i));\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j] < x[i]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n for (size_t j = 0; j < n; ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> y(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n y[i] = std::make_pair(x[i], i);\n }\n std::sort(y.begin(), y.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[y[i].second] = i;\n }\n}\n", + "\n\n // Create a copy of the input vector\n std::vector sorted_x = x;\n\n // Sort the copy\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks in parallel\n #pragma omp 
parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "(", + "\n\n size_t n = x.size();\n std::vector> pairs(n);\n for (size_t i = 0; i < n; ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n ranks.resize(n);\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "ranks", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n std::vector idx(n);\n std::iota(idx.begin(), idx.end(), 0);\n std::sort(idx.begin(), idx.end(), [&x](size_t i1, size_t i2) { return x[i1] < x[i2]; });\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n ranks[i] = idx[i];\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] == x[j]) {\n ranks[i] = j;\n break;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector indices(x.size());\n std::iota(indices.begin(), indices.end(), 0);\n std::sort(indices.begin(), indices.end(), [&x](size_t i1, size_t i2) { return x[i1] < x[i2]; });\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = indices[i];\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "serial", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n // Sort the vector x\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Create a map from the sorted values to their original indices\n std::map 
sorted_index_map;\n for (size_t i = 0; i < sorted_x.size(); ++i) {\n sorted_index_map[sorted_x[i]] = i;\n }\n\n // Iterate over the original vector and store the rank of each value\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = sorted_index_map[x[i]];\n }\n}\n", + "\n\n std::map value_to_rank;\n for (size_t i = 0; i < x.size(); ++i) {\n auto it = value_to_rank.find(x[i]);\n if (it == value_to_rank.end()) {\n value_to_rank[x[i]] = i;\n }\n }\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = value_to_rank[x[i]] + 1;\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks.push_back(std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i])));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(), std::find(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(), std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector x_sorted(x);\n std::sort(x_sorted.begin(), x_sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::lower_bound(x_sorted.begin(), 
x_sorted.end(), x[i]) - x_sorted.begin();\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n std::iota(ranks.begin(), ranks.end(), 0);\n std::sort(ranks.begin(), ranks.end(), [&](size_t i, size_t j) { return x[i] < x[j]; });\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "hip", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use AMD HIP to compute in parallel. The kernel will be launched with at least as many threads as elements in x.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\n__global__ void ranks(const float *x, size_t *ranks, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t tid = hipThreadIdx_x;\n if (tid < N) {\n float v = x[tid];\n size_t r = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < v) r++;\n }\n ranks[tid] = r;\n }\n}\n", + "\n\n // Get the index of the current thread\n const size_t tid = hipThreadIdx_x;\n\n // Compute the rank of the current element\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n float *ptr = x;\n size_t rank = 0;\n while (ptr < x + N && *ptr <= val) {\n if (*ptr == val) {\n rank++;\n }\n ptr++;\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float v = x[tid];\n int r = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < v) {\n ++r;\n }\n }\n ranks[tid] = r;\n }\n}\n", + "\n\n int idx = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (idx >= N) {\n return;\n }\n\n float my_value = x[idx];\n int my_rank = 1;\n for (int i = 0; i < N; i++) 
{\n if (x[i] < my_value) {\n my_rank++;\n }\n }\n ranks[idx] = my_rank;\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] < v) r++;\n }\n ranks[i] = r;\n }\n}\n", + "\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] < v) r++;\n }\n ranks[i] = r;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n float *sorted_x = (float *)hipMalloc(N * sizeof(float));\n hipMemcpy(sorted_x, x, N * sizeof(float), hipMemcpyHostToDevice);\n hipSort(sorted_x, sorted_x + N, hipSortAscending);\n for (int i = 0; i < N; i++) {\n if (sorted_x[i] == val) {\n ranks[gid] = i;\n break;\n }\n }\n hipFree(sorted_x);\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = hipThreadIdx_x;\n\n // Compute the rank of the current element\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n size_t idx = hipThreadIdx_x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n ranks[tid] = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < x[tid]) {\n ranks[tid]++;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n float val = x[idx];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every process has a complete copy of x. 
Store the result in ranks on process 0.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), ranks.size(), MPI_SIZE_T, MPI_MAX, 0,\n MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, ranks.size(), MPI_SIZE_T, MPI_MAX, 0,\n MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; i++) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Create a vector of ranks\n ranks.resize(n);\n\n // Get the rank of each value in x\n for (size_t i = 0; i < n; i++) {\n ranks[i] = std::find(indices.begin(), indices.end(), i) - indices.begin();\n }\n\n // Gather the ranks on process 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector ranks_i;\n MPI::COMM_WORLD.Recv(ranks_i.data(), ranks_i.size(), MPI::UNSIGNED_LONG, i, 0);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI::COMM_WORLD.Send(ranks.data(), ranks.size(), MPI::UNSIGNED_LONG, 0, 0);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0126134334, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.9845404394, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // sort a copy of x on each process\n auto x_sorted = std::vector(x);\n std::sort(x_sorted.begin(), x_sorted.end());\n\n // find the rank of each value in x on each process\n std::vector> local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n auto it = std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]);\n local_ranks[i].push_back(std:: distance(x_sorted.begin(), it));\n }\n\n // gather the local_ranks on process 0\n std::gather(local_ranks, std::back_inserter(ranks), 0);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; ++i) {\n indices[i] = i;\n }\n\n // Sort the vector 
of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Get the rank of each value in x\n ranks.resize(n);\n for (size_t i = 0; i < n; ++i) {\n ranks[i] = std::find(indices.begin(), indices.end(), i) - indices.begin();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0127824894, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.9104602188, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.8036346674, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.8223576373, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, ranks.size(), MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0125535288, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0312987918, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0285750142, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02913551585, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0143310548, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0311165836, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), 
x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0138758212, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0285011758, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n local_ranks[i]++;\n }\n }\n }\n\n std::vector global_ranks(x.size());\n MPI_Allgather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG,\n global_ranks.data(), x.size(), MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n ranks = global_ranks;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; ++i) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Get the rank of each value in x\n ranks.resize(n);\n for (size_t i = 0; i < n; ++i) {\n ranks[indices[i]] = i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.012856454, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0133425286, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0133840056, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01333036235, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n ranks = local_ranks;\n }\n\n MPI_Gather(&local_ranks[0], local_ranks.size(), MPI_UNSIGNED_LONG, &ranks[0],\n ranks.size(), MPI_UNSIGNED_LONG, 0, comm);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0144712218, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.032159554, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } 
+ ] + }, + { + "generated_output": "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the rank of each element in the sorted vector\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; ++i) {\n std::vector ranks_i;\n MPI_Recv(&ranks_i[0], ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0126142214, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.667528579, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Recv(recv_ranks.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n ranks[j] += recv_ranks[j];\n }\n }\n } else {\n MPI_Send(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0127479632, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0274889982, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.012654265, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0279764392, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0283251577, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0279911004, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0282907168, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0283935392875, + "num_procs": 16 + }, + { + "did_run": false, + 
"is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0127400668, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0281270112, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = i;\n }\n\n std::sort(local_ranks.begin(), local_ranks.end(),\n [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n if (rank == 0) {\n ranks.resize(x.size());\n }\n\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(),\n MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector ranks_i;\n MPI_Recv(ranks_i.data(), ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0124945844, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.027824239, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // compute the ranks on each process\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n 
std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // gather the ranks from all processes on process 0\n if (my_rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(),\n MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG,\n 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0128324064, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.027435341, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the rank of each element in the sorted vector\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; ++i) {\n std::vector ranks_i;\n MPI_Recv(&ranks_i[0], ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0127617078, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 4.684261638, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector local_x = x;\n std::sort(local_x.begin(), local_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(local_x.begin(),\n std::lower_bound(local_x.begin(), local_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Recv(recv_ranks.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n ranks[j] += recv_ranks[j];\n }\n }\n } else {\n MPI_Send(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0127554454, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0273578632, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n size_t n = x.size();\n std::vector local_ranks(n);\n std::sort(x.begin(), x.end());\n for (size_t i = 0; i < n; ++i) {\n 
local_ranks[i] = std::lower_bound(x.begin(), x.end(), x[i]) - x.begin();\n }\n MPI_Gather(local_ranks.data(), n, MPI_UNSIGNED_LONG, ranks.data(), n, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "cuda", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use CUDA to compute in parallel. The kernel will be launched with at least as many threads as elements in x.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\n__global__ void ranks(const float *x, size_t *ranks, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n float v = x[i];\n int r = 0;\n for (size_t j = 0; j < N; ++j) {\n if (x[j] < v) {\n ++r;\n }\n }\n ranks[i] = r;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n ranks[i] = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n ranks[tid] = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ranks[tid]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float value = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n float *d_x = (float *)x;\n for (size_t i = 0; i < N; i++) {\n if (d_x[i] < val) {\n ranks[idx]++;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) rank++;\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t i = block.thread_id.x + block.block_id.x * block.block_dim.x;\n if (i < N) {\n float val = x[i];\n std::vector y(x, x + N);\n std::sort(y.begin(), y.end());\n ranks[i] = std::lower_bound(y.begin(), y.end(), val) - y.begin();\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = 
blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] <= v) {\n r++;\n }\n }\n ranks[i] = r - 1;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n float *sorted_x = new float[N];\n thrust::copy(x, x + N, sorted_x);\n thrust::sort(sorted_x, sorted_x + N);\n ranks[idx] = thrust::lower_bound(sorted_x, sorted_x + N, val) - sorted_x;\n delete[] sorted_x;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n float *d_x = (float *)x;\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (d_x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = threadIdx.x;\n\n // Compute the rank of the current element\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n ranks[idx] = 0;\n for (int i = 0; i < N; ++i) {\n if (x[i] < x[idx]) {\n ranks[idx]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = threadIdx.x;\n\n // Compute the rank of the current element\n float current_element = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < current_element) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every process has a complete copy of x. 
Store the result in ranks on process 0.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the number of processes\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Create a vector of ranks on each process\n std::vector local_ranks(x.size());\n\n // Compute the ranks on each process\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n // Sort the local copy of x\n std::vector local_x = x;\n std::sort(local_x.begin(), local_x.end());\n\n // Find the rank of x[i] in the sorted vector\n auto it = std::lower_bound(local_x.begin(), local_x.end(), x[i]);\n local_ranks[i] = std::distance(local_x.begin(), it);\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG,\n ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]) - sorted_x.begin();\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector to store the ranks\n ranks.resize(n);\n\n // Sort the vector\n std::sort(x.begin(), x.end());\n\n // Find the rank of each element in the sorted vector\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n ranks[i] = std::distance(x.begin(), std::find(x.begin(), x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (ranks.size() > 0) {\n MPI_Gather(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG,\n &ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int world_size;\n MPI_Comm_size(MPI_COMM_WORLD, &world_size);\n\n std::vector local_x(x.begin() + my_rank * x.size() / world_size,\n x.begin() + (my_rank + 1) * x.size() / world_size);\n std::vector local_ranks(local_x.size());\n\n #pragma omp parallel for\n for (size_t i = 0; i < local_x.size(); 
++i) {\n for (size_t j = 0; j < local_x.size(); ++j) {\n if (local_x[i] < local_x[j]) {\n ++local_ranks[i];\n }\n }\n }\n\n MPI_Reduce(local_ranks.data(), ranks.data(), local_ranks.size(), MPI_SIZE_T, MPI_SUM, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector x_sorted = x;\n std::sort(x_sorted.begin(), x_sorted.end());\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n ranks[i] = std::distance(x_sorted.begin(), std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]));\n }\n if (my_rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), n, MPI_SIZE_T, MPI_MAX, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, n, MPI_SIZE_T, MPI_MAX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the ranks of each element in the sorted vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_ranks;\n MPI_Recv(&recv_ranks[0], x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), recv_ranks.begin(), recv_ranks.end());\n }\n } else {\n MPI_Send(&ranks[0], x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::sort(local_x.begin(), local_x.end());\n\n std::vector local_ranks(local_x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < local_x.size(); ++i) {\n local_ranks[i] = std::distance(x.begin(), std::lower_bound(x.begin(), x.end(), local_x[i]));\n }\n\n MPI_Gather(local_ranks.data(), local_ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), local_ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a copy of x on each process\n std::vector local_x = x;\n\n // Sort the local copy of x\n std::sort(local_x.begin(), local_x.end());\n\n // Create a vector to store the ranks\n std::vector local_ranks(x.size());\n\n // Compute the ranks of the elements in the local copy of x\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(local_x.begin(), std::find(local_x.begin(), local_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Only process 0 has the complete list of ranks\n if (rank == 0) {\n ranks = std::move(ranks);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]) - sorted_x.begin();\n }\n\n // Gather the ranks 
from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n ranks.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n if (my_rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), n, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, n, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n local_ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector local_ranks(x.size());\n #pragma opm parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(x.begin(), std::min_element(x.begin(), x.end()));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Status status;\n MPI_Probe(i, 0, MPI_COMM_WORLD, &status);\n MPI_Get_count(&status, MPI_BYTE, &ranks.size());\n MPI_Send(&local_ranks, ranks.size(), MPI_BYTE, i, 0, MPI_COMM_WORLD);\n }\n } else {\n MPI_Send(&local_ranks, local_ranks.size(), MPI_BYTE, 0, 0, 
MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t n = x.size();\n std::vector local_ranks(n);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n local_ranks[i] = std::distance(x.begin(), std::min_element(x.begin(), x.end()));\n }\n\n MPI_Gather(local_ranks.data(), n, MPI_UNSIGNED_LONG, ranks.data(), n, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int world_size;\n MPI_Comm_size(MPI_COMM_WORLD, &world_size);\n int local_size = x.size() / world_size;\n int start = my_rank * local_size;\n int end = start + local_size;\n if (my_rank == world_size - 1) {\n end = x.size();\n }\n std::vector local_ranks(local_size);\n std::vector local_x(local_size);\n std::copy(x.begin() + start, x.begin() + end, local_x.begin());\n std::sort(local_x.begin(), local_x.end());\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_ranks[i] = std::distance(local_x.begin(), std::lower_bound(local_x.begin(), local_x.end(), x[start + i]));\n }\n if (my_rank == 0) {\n ranks.resize(x.size());\n }\n MPI_Gather(local_ranks.data(), local_size, MPI_UNSIGNED_LONG, ranks.data(), local_size, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), x.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "(", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n // Each process sorts its own copy of x\n std::vector x_sorted = x;\n std::sort(x_sorted.begin(), x_sorted.end());\n\n // Each process computes the ranks of its own elements\n std::vector ranks_local(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks_local[i] = std::distance(x_sorted.begin(), std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]));\n }\n\n // Process 0 gathers the ranks from all processes\n if (my_rank == 0) {\n ranks = ranks_local;\n for (int i = 1; i < MPI_COMM_WORLD->size; ++i) {\n std::vector ranks_recv(x.size());\n MPI_Recv(ranks_recv.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_recv.begin(), ranks_recv.end());\n }\n } else {\n MPI_Send(ranks_local.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use Kokkos to sort in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Create a view of the input array that ignores zero valued elements\n auto x_nz = view(\"x_nz\", x, LayoutRight(), {x.extent(0), 0});\n // Sort the non-zero elements in ascending order\n parallel_sort(x_nz.extent(0), KOKKOS_LAMBDA(const int i) {\n x_nz(i) = x(i);\n });\n // Copy the sorted non-zero elements back to the input array\n parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x_nz(i);\n });\n}\n", + "\n\n auto exec_space = x.execution_space();\n auto view_size = x.extent(0);\n auto lambda = KOKKOS_LAMBDA(const int& i) {\n return x(i) != 0;\n };\n auto mask = Kokkos::create_mask(exec_space, view_size, lambda);\n auto sorted = Kokkos::sort(exec_space, mask, x);\n Kokkos::deep_copy(x, sorted);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = std::numeric_limits::max();\n }\n });\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i == std::numeric_limits::max()) {\n x_i = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sortIgnoreZero\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n // Create a mask to identify non-zero elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"init_mask\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the non-zero elements\n Kokkos::sort(mask, x);\n\n // Shift the non-zero elements to the beginning of the array\n int non_zero_count = 0;\n Kokkos::parallel_for(\"shift_non_zero\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (mask(i)) {\n x(non_zero_count++) = x(i);\n }\n });\n\n // Fill the remaining elements with zeros\n Kokkos::parallel_for(\"fill_zeros\", Kokkos::RangePolicy(non_zero_count, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = 0;\n });\n}\n", + "\n\n // Create a mask to identify non-zero elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"mask_init\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the non-zero elements\n Kokkos::sort(mask, x);\n\n // Shift the non-zero elements to the beginning of the array\n Kokkos::parallel_for(\"shift_non_zero\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (mask(i)) {\n Kokkos::atomic_fetch_add(&x(0), x(i));\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i != 0) {\n x_i = std::numeric_limits::max();\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i == 
std::numeric_limits::max()) {\n x_i = 0;\n }\n });\n}\n", + "\n\n using namespace Kokkos;\n\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n auto lambda = KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n x(i) = x(i) < 0 ? -x(i) : x(i);\n }\n };\n parallel_for(\"sortIgnoreZero\", policy, lambda);\n parallel_sort(policy, x.data(), x.data() + x.extent(0));\n lambda = KOKKOS_LAMBDA(const int &i) {\n if (x(i) < 0) {\n x(i) = -x(i);\n }\n };\n parallel_for(\"sortIgnoreZero\", policy, lambda);\n}\n", + "\n\n Kokkos::parallel_for(\"sortIgnoreZero\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::View y = x.filter([](int x) { return x != 0; });\n Kok\u30b3\u30af::sort(y);\n Kok\u30b3\u30af::View z = x.map([](int x) { return x == 0 ? 0 : y[0]; });\n y.copy_to(z);\n}\n", + "\n\n // Create a mask to ignore zero valued elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"mask\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the array using the mask\n Kokkos::sort(mask, x);\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i, int &count) {\n if (x(i) != 0) { count++; }\n });\n\n // Create a temporary array to store the non-zero elements\n Kokkos::View y(\"y\", count);\n\n // Copy the non-zero elements to the temporary array\n int j = 0;\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) { y(j++) = x(i); }\n });\n\n // Sort the temporary array\n Kokkos::sort(y);\n\n // Copy the sorted non-zero elements back to the original array\n j = 0;\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) { x(i) = y(j++); }\n });\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n using exec_space = Kokkos::DefaultExecutionSpace;\n using view_t = Kokkos::View;\n\n auto x_nonzero = view_t(\"x_nonzero\", x.extent(0));\n auto x_nonzero_count = 0;\n\n Kokkos::parallel_for(\n \"sortIgnoreZero\", policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) {\n x_nonzero(x_nonzero_count++) = x(i);\n }\n });\n\n Kokkos::sort(x_nonzero, x_nonzero_count);\n\n x_nonzero_count = 0;\n Kokkos::parallel_for(\n \"sortIgnoreZero\", policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) {\n x(i) = x_nonzero(x_nonzero_count++);\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i == 0) {\n x_i = x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), 
KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i == 0) {\n x_i = x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use OpenMP to sort x in parallel. \n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n ++count;\n }\n }\n\n // Create a vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j];\n j++;\n }\n }\n}\n", + "\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(non_zero.begin(), non_zero.end());\n }\n }\n\n // Insert the non-zero elements back into x\n int j = 0;\n for (int i = 
0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + ".", + "\n\n std::vector temp;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n temp.push_back(x[i]);\n }\n }\n std::sort(temp.begin(), temp.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = temp[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int n = 0;\n #pragma omp parallel for reduction(+:n)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n n++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(n);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[index++] = x[i];\n }\n }\n\n // Sort the new vector\n 
std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[index++];\n }\n }\n}\n", + "\n\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n std::sort(non_zero.begin(), non_zero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count non-zero elements\n int n = 0;\n #pragma omp parallel for reduction(+:n)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n n++;\n }\n }\n\n // Create a vector to store non-zero elements\n std::vector y(n);\n\n // Copy non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the non-zero elements\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n ++count;\n }\n }\n\n // Create a new vector to store the non-zero 
elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Create a vector to store the non-zero elements\n std::vector non_zero;\n\n // Iterate over the input vector and copy non-zero elements to the new vector\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Copy the sorted non-zero elements back to the input vector\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j++];\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "serial", + "prompt": "/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place.\n Example:\n\n\t input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a == 0 && b != 0) || a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return a != 0 && (b == 0 || a < b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a != 0 && b != 0) ? a < b : a == 0;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::stable_partition(x.begin(), x.end(),\n [](int a) { return a != 0; });\n std::sort(x.begin(), x.end());\n\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a == 0 && b != 0) || (a != 0 && b == 0) ? 
a > b : a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(),\n [](int a, int b) { return a != 0 && (a < b || b == 0); });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) {\n return false;\n } else if (b == 0) {\n return true;\n } else {\n return a < b;\n }\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::vector y;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) { y.push_back(x[i]); }\n }\n std::sort(y.begin(), y.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector sorted;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted.push_back(x[i]);\n }\n }\n std::sort(sorted.begin(), sorted.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sorted[j];\n j++;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "hip", + "prompt": "/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use AMD HIP to compute in parallel. The kernel will be launched with 1 thread per element.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\n__global__ void sortIgnoreZero(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int temp = x[tid];\n int i = tid - 1;\n while (i >= 0 && x[i] > temp) {\n x[i + 1] = x[i];\n i--;\n }\n x[i + 1] = temp;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n // sort non-zero elements\n __shared__ int sdata[256];\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n int stride = blockDim.x * gridDim.x;\n int offset = bid * stride;\n int count = 0;\n int val = x[gid];\n if (val != 0) {\n sdata[tid] = val;\n count++;\n }\n __syncthreads();\n int n = count;\n int i = tid;\n while (i < n) {\n sdata[i] = sdata[tid];\n i += stride;\n }\n __syncthreads();\n i = tid;\n while (i < n) {\n int j = i + 1;\n while (j < n) {\n if (sdata[i] > sdata[j]) {\n int tmp = sdata[i];\n sdata[i] = sdata[j];\n sdata[j] = tmp;\n }\n j += stride;\n }\n i += stride;\n }\n __syncthreads();\n i = tid;\n while (i < n) {\n x[offset + i] = sdata[i];\n i += stride;\n }\n}\n", + "\n\n // Find the number of non-zero elements\n int num_non_zero = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n num_non_zero++;\n }\n }\n\n // Create a temporary array to store the non-zero elements\n int *temp = new int[num_non_zero];\n\n // Copy the non-zero elements to the temporary array\n int j = 0;\n for (size_t i = 
0; i < N; i++) {\n if (x[i] != 0) {\n temp[j] = x[i];\n j++;\n }\n }\n\n // Sort the temporary array\n std::sort(temp, temp + num_non_zero);\n\n // Copy the sorted non-zero elements back to the original array\n j = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n x[i] = temp[j];\n j++;\n }\n }\n\n // Delete the temporary array\n delete[] temp;\n}\n", + "\n\n\t// Find the number of non-zero elements\n\tint count = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tcount++;\n\t\t}\n\t}\n\n\t// Create an array to store the non-zero elements\n\tint *y = new int[count];\n\n\t// Copy the non-zero elements to the new array\n\tint j = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\ty[j] = x[i];\n\t\t\tj++;\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tstd::sort(y, y + count);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tj = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = y[j];\n\t\t\tj++;\n\t\t}\n\t}\n\n\t// Delete the temporary array\n\tdelete[] y;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i - 1] > x[i]) {\n int temp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = temp;\n i--;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\tint j = tid + 1;\n\t\twhile (j < N && x[j] == 0) {\n\t\t\tj++;\n\t\t}\n\t\tif (j < N) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[j];\n\t\t\tx[j] = tmp;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N && x[i] != 0) {\n int j = i + 1;\n while (j < N && x[j] == 0) {\n j++;\n }\n if (j < N) {\n int temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i - 1] > x[i]) {\n int tmp = x[i - 1];\n x[i - 1] = x[i];\n x[i] = tmp;\n i--;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = tmp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint temp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n for (int i = tid + 1; i < N; i++) {\n if (x[i] != 0 && x[i] < x[tid]) {\n int tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n 
i--;\n }\n }\n }\n}\n", + "\n\n // Sort the non-zero elements\n thrust::sort(thrust::device_ptr(x), thrust::device_ptr(x + N), thrust::greater());\n\n // Count the number of non-zero elements\n int num_non_zero = thrust::count(thrust::device_ptr(x), thrust::device_ptr(x + N), thrust::not_equal_to(0));\n\n // Move the non-zero elements to the beginning of the array\n thrust::copy(thrust::device_ptr(x), thrust::device_ptr(x + num_non_zero), thrust::device_ptr(x));\n\n // Fill the remaining elements with zeros\n thrust::fill(thrust::device_ptr(x + num_non_zero), thrust::device_ptr(x + N), 0);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tif (x[tid] != 0) {\n\t\t\tint i = tid;\n\t\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\t\tint temp = x[i];\n\t\t\t\tx[i] = x[i - 1];\n\t\t\t\tx[i - 1] = temp;\n\t\t\t\ti--;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Sort the array in ascending order ignoring elements with value 0.\n\t// Leave zero valued elements in-place.\n\tint my_rank = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (my_rank < N && x[my_rank] != 0) {\n\t\tfor (int i = my_rank + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[my_rank]) {\n\t\t\t\tint temp = x[my_rank];\n\t\t\t\tx[my_rank] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use MPI to sort x in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Store the result in x on rank 0.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n for (int i = count; i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Gather all non-zero elements to rank 0\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(nonZero.data(), nonZero.size(), MPI_INT, x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Get the size of the communicator\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the sorted elements\n std::vector sorted_x;\n\n // Gather all the non-zero elements from all the ranks\n std::vector non_zero_elements;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_elements.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n\n // Scatter the sorted non-zero elements to all the ranks\n MPI_Scatter(non_zero_elements.data(), non_zero_elements.size() / size, MPI_INT,\n sorted_x.data(), non_zero_elements.size() / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into the original vector\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sorted_x[index++];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // 
Get the size and rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size() * size);\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n if (rank != 0) {\n non_zero.resize(sorted_non_zero.size() / size);\n }\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into the original vector\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[index++];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0017817394, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018032722, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018864191, + "num_procs": 2 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 4 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size());\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the 
sorted non-zero elements back to all ranks\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, x.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with zeros\n for (int i = non_zero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector y(n);\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n y[count] = x[i];\n count++;\n }\n }\n std::sort(y.begin(), y.begin() + count);\n int index = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = y[index];\n index++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0018195228, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018634014, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001891798, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001863137, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_elements[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(non_zero_elements.data(), count / size, MPI_INT, x.data(), count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n for (int i = count / size; i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector x_non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x_non_zero.push_back(x[i]);\n }\n }\n std::sort(x_non_zero.begin(), x_non_zero.end());\n int count = x_non_zero.size();\n MPI_Gather(&count, 1, MPI_INT, NULL, 0, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n int total_count = 0;\n for (int i = 0; i < size; i++) {\n int count_i;\n MPI_Gather(&count_i, 1, MPI_INT, &count_i, 1, MPI_INT, i, MPI_COMM_WORLD);\n total_count += count_i;\n }\n x.resize(total_count);\n }\n MPI_Gatherv(x_non_zero.data(), x_non_zero.size(), MPI_INT, x.data(), NULL, NULL, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + 
"is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00180624, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001812121, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018064388, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00181197785, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sortedNonZero;\n if (rank == 0) {\n sortedNonZero.resize(nonZero.size() * size);\n }\n MPI_Gather(nonZero.data(), nonZero.size(), MPI_INT, sortedNonZero.data(), nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements to all ranks\n if (rank != 0) {\n sortedNonZero.resize(nonZero.size());\n }\n MPI_Scatter(sortedNonZero.data(), nonZero.size(), MPI_INT, sortedNonZero.data(), nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into x\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sortedNonZero[index++];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0017838094, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019131856, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020947394, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00258887755, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Insert the non-zero elements back into the original vector\n int j = 0;\n for (int i = 
0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.001807959, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018292754, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018318996, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00183829655, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001892515425, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Gather the non-zero elements from all ranks\n MPI_Gather(&x[0], x.size(), MPI_INT, &y[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements\n if (rank == 0) {\n std::sort(y.begin(), y.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&y[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size());\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, x.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with zeros\n for (int i = non_zero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 
0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sortedNonZero;\n MPI_Gather(&nonZero[0], nonZero.size(), MPI_INT, &sortedNonZero[0], nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements to all ranks\n MPI_Scatter(&sortedNonZero[0], nonZero.size(), MPI_INT, &x[0], nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n for (int i = nonZero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int myrank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (myrank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count 
= 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero_x.begin(), non_zero_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(non_zero_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n MPI_Gather(&non_zero[0], non_zero.size(), MPI_INT, &sorted_non_zero[0], non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&sorted_non_zero[0], non_zero.size(), MPI_INT, &x[0], non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the zero elements back into the vector\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = sorted_non_zero[index++];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + 
"are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "cuda", + "prompt": "/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use CUDA to compute in parallel. The kernel will be launched with 1 thread per element.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\n__global__ void sortIgnoreZero(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the number of non-zero elements\n\tint num_non_zero = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnum_non_zero++;\n\t\t}\n\t}\n\n\t// Allocate memory for the non-zero elements\n\tint *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n\t// Copy the non-zero elements to the new array\n\tint non_zero_index = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnon_zero_elements[non_zero_index] = x[i];\n\t\t\tnon_zero_index++;\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tthrust::sort(non_zero_elements, non_zero_elements + num_non_zero);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tnon_zero_index = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = non_zero_elements[non_zero_index];\n\t\t\tnon_zero_index++;\n\t\t}\n\t}\n\n\t// Free the memory allocated for the non-zero elements\n\tfree(non_zero_elements);\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid >= N) return;\n\tif (x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[tid] > x[i]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = tmp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + ".", + "\n\n\tint tid = threadIdx.x;\n\tif (x[tid] != 0) {\n\t\tx[tid] = x[tid] > 0 ? 
x[tid] : -x[tid];\n\t}\n}\n", + "\n\n\t// Each thread sorts its own element\n\tif (x[threadIdx.x] != 0) {\n\t\tfor (int i = threadIdx.x + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[threadIdx.x] > x[i]) {\n\t\t\t\tint temp = x[threadIdx.x];\n\t\t\t\tx[threadIdx.x] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Find the number of non-zero elements\n\tint num_non_zero = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnum_non_zero++;\n\t\t}\n\t}\n\n\t// Allocate memory for the non-zero elements\n\tint *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n\t// Copy the non-zero elements to the new array\n\tint idx = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnon_zero_elements[idx++] = x[i];\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tthrust::sort(non_zero_elements, non_zero_elements + num_non_zero);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tidx = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = non_zero_elements[idx++];\n\t\t}\n\t}\n\n\t// Free the memory allocated for the non-zero elements\n\tfree(non_zero_elements);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = bid * blockDim.x + tid;\n\n\tif (i < N && x[i] != 0) {\n\t\tint j = i + 1;\n\t\twhile (j < N && x[j] == 0) {\n\t\t\tj++;\n\t\t}\n\t\tif (j < N) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[j];\n\t\t\tx[j] = tmp;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i - 1] > x[i]) {\n\t\t\tint temp = x[i - 1];\n\t\t\tx[i - 1] = x[i];\n\t\t\tx[i] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "Ignore the zero value in the list.", + "\n\n\t// Get the thread index\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\n\t// If the element is not zero, sort it\n\tif (x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint start = bid * bid_size + tid;\n\tint end = start + bid_size;\n\tif (start >= N) return;\n\tif (end > N) end = N;\n\tint count = 0;\n\tfor (int i = start; i < end; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[count++] = x[i];\n\t\t}\n\t}\n\tfor (int i = count; i < end; i++) {\n\t\tx[i] = 0;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tswap(x[i], x[i - 1]);\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint temp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\t// Sort the non-zero elements\n\tint *d_sorted = (int *)malloc(N * sizeof(int));\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\td_sorted[count] = x[i];\n\t\t\tcount++;\n\t\t}\n\t}\n\tthrust::sort(d_sorted, d_sorted + count);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tcount = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = d_sorted[count];\n\t\t\tcount++;\n\t\t}\n\t}\n\tfree(d_sorted);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint val = x[tid];\n\tif (val == 0) {\n\t\treturn;\n\t}\n\tint i = tid;\n\twhile (i > 0 && x[i - 1] > val) 
{\n\t\tx[i] = x[i - 1];\n\t\ti--;\n\t}\n\tx[i] = val;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tif (x[tid] != 0) {\n\t\t\tint i = tid;\n\t\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\t\tswap(x[i], x[i - 1]);\n\t\t\t\ti--;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n // Find the number of non-zero elements\n int num_non_zero = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n num_non_zero++;\n }\n }\n\n // Create an array to store the non-zero elements\n int *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n // Copy the non-zero elements to the new array\n int index = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n non_zero_elements[index] = x[i];\n index++;\n }\n }\n\n // Sort the non-zero elements\n qsort(non_zero_elements, num_non_zero, sizeof(int), compare);\n\n // Copy the sorted non-zero elements back to the original array\n index = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n x[i] = non_zero_elements[index];\n index++;\n }\n }\n\n // Free the memory allocated for the non-zero elements array\n free(non_zero_elements);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use MPI and OpenMP to sort x in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the number of processes\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Create a vector to store the sorted values\n std::vector sorted(x.size());\n\n // Gather the sorted values from each rank\n MPI_Gather(x.data(), x.size(), MPI_INT, sorted.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // If the current rank is 0, store the sorted values in x\n if (rank == 0) {\n x = sorted;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort x on each rank\n std::sort(x.begin(), x.end());\n\n // Gather the sorted x from all ranks to rank 0\n std::vector gathered_x;\n if (rank == 0) {\n gathered_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, gathered_x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the gathered x on rank 0\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end());\n x = gathered_x;\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Scatter the sorted non-zero elements to all ranks\n MPI_Scatter(non_zero.data(), non_zero.size() / nprocs, MPI_INT,\n x.data(), non_zero.size() / nprocs, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Gather the sorted non-zero elements from all ranks to rank 0\n MPI_Gather(x.data(), non_zero.size() / nprocs, MPI_INT,\n non_zero.data(), non_zero.size() / nprocs, MPI_INT, 0, MPI_COMM_WORLD);\n\n // If rank is 0, insert the sorted non-zero elements into x\n if (rank == 0) {\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n continue;\n }\n x[i] = non_zero[index++];\n }\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector x_nonzero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x_nonzero.push_back(x[i]);\n }\n }\n std::sort(x_nonzero.begin(), x_nonzero.end());\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = x_nonzero[count];\n count++;\n }\n }\n if (my_rank == 0) {\n x.resize(n);\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n std::vector x_local(n);\n std::vector x_sorted(n);\n std::copy(x.begin(), x.end(), x_local.begin());\n std::sort(x_local.begin(), x_local.end(), [](int a, int b) { return a != 0 && (a < b); });\n MPI_Gather(x_local.data(), n, MPI_INT, x_sorted.data(), n, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n x = x_sorted;\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector non_zero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n std::sort(non_zero.begin(), non_zero.end());\n int non_zero_size = non_zero.size();\n int recv_count[MPI_COMM_SIZE];\n int recv_displs[MPI_COMM_SIZE];\n MPI_Gather(&non_zero_size, 1, MPI_INT, recv_count, 1, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n recv_displs[0] = 0;\n for (int i = 1; i < MPI_COMM_SIZE; i++) {\n recv_displs[i] = recv_displs[i - 1] + recv_count[i - 1];\n }\n }\n MPI_Gatherv(non_zero.data(), non_zero_size, MPI_INT, x.data(), recv_count, recv_displs, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n int j = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] == 0) {\n continue;\n }\n x[j] = x[i];\n j++;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Allocate memory for the non-zero elements\n std::vector 
y(count);\n\n // Copy the non-zero elements to y\n int index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[index++] = x[i];\n }\n }\n\n // Sort y\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to x on rank 0\n if (rank == 0) {\n index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[index++];\n }\n }\n }\n}\n", + "\n\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n int n = x.size();\n std::vector x_nonzero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x_nonzero.push_back(x[i]);\n }\n }\n std::sort(x_nonzero.begin(), x_nonzero.end());\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = x_nonzero[count];\n count++;\n }\n }\n if (myrank == 0) {\n x.resize(n);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the gathered non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector nonZero;\n if (rank == 0) {\n nonZero.resize(count);\n }\n MPI_Gather(&x[0], x.size(), MPI_INT, &nonZero[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(nonZero.begin(), nonZero.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&nonZero[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted.begin(), sorted.end());\n\n // Scatter the 
sorted non-zero elements to rank 0\n MPI_Scatter(sorted.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Allocate memory for the sorted array\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n MPI_Gather(&x[0], x.size(), MPI_INT, &sorted_x[0], global_count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the gathered array on rank 0\n if (my_rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end());\n }\n\n // Scatter the sorted array back to all ranks\n MPI_Scatter(&sorted_x[0], global_count / nprocs, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 fills in the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Create a vector of size x.size() to store the sorted values\n std::vector sorted_x(x.size());\n\n // Create a vector of size x.size() to store the indices of the sorted values\n 
std::vector sorted_indices(x.size());\n\n // Sort the indices of x in ascending order\n std::iota(sorted_indices.begin(), sorted_indices.end(), 0);\n std::sort(sorted_indices.begin(), sorted_indices.end(),\n [&x](int i, int j) { return x[i] < x[j]; });\n\n // Create a vector of size x.size() to store the sorted values\n std::vector sorted_x(x.size());\n\n // Copy the sorted values from x into sorted_x\n for (int i = 0; i < x.size(); i++) {\n sorted_x[i] = x[sorted_indices[i]];\n }\n\n // Broadcast the sorted values to all ranks\n MPI_Bcast(&sorted_x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Copy the sorted values into x on rank 0\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to their original positions\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = i;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted.begin(), sorted.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) 
{\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) count++;\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) std::sort(non_zero.begin(), non_zero.end());\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) x[i] = -1;\n }\n\n // Sort the elements in-place on each rank\n std::sort(x.begin(), x.end());\n\n // Replace -1 with 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == -1) x[i] = 0;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(Kokkos::View const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View sorted_x(\"sorted_x\", x.size());\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i) {\n sorted_x(i) = x(i);\n });\n Kokkos::sort(sorted_x.data(), sorted_x.data() + sorted_x.size());\n return sorted_x(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a Kokkos view of the input vector\n auto x_view = Kokkos::create_mirror_view(x);\n\n // Create a Kokkos view of the output vector\n auto y_view = Kokkos::create_mirror_view(1);\n\n // Create a Kokkos kernel to find the k-th smallest element\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x_view.extent(0)),\n KOKKOS_LAMBDA(const int i, int& y) {\n if (i < k) {\n y = std::min(y, x_view(i));\n }\n },\n y_view);\n\n // Copy the output vector back to the host\n int y;\n Kokkos::deep_copy(y, y_view);\n\n return y;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n\n // Find the k-th smallest element using a parallel reduction.\n int kth_smallest = parallel_reduce(\n Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i < k) {\n kth_smallest = std::min(kth_smallest, x(i));\n }\n },\n k);\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n auto x_copy = Kokkos::View(n);\n Kokkos::deep_copy(x_copy, x);\n\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& result) {\n if (i == k - 1) {\n result = x_copy[i];\n }\n }, tag_type());\n\n return x_copy[k - 1];\n}\n", + "\n\n Kokkos::View y(\"y\", x.size());\n Kokkos::parallel_for(\"copy\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i) {\n y(i) = x(i);\n });\n Kokkos::sort(y);\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n using l_d_type = 
Kokkos::LayoutLeft;\n using l_d_view_type = Kokkos::View;\n\n const int n = x.extent(0);\n l_d_view_type y(\"y\", n);\n\n parallel_for(policy_type(0, n), KOKKOS_LAMBDA(const int& i) { y(i) = x(i); }, tag_type());\n\n Kokkos::sort(y);\n\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n auto x_copy = Kokkos::View(n);\n Kokkos::deep_copy(x_copy, x);\n\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& result) {\n if (i == k - 1) {\n result = x_copy(i);\n }\n }, tag_type());\n\n return x_copy(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n int n = x.extent(0);\n if (k > n) {\n throw std::invalid_argument(\"k cannot be greater than the size of the vector\");\n }\n\n auto kth_smallest = parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(const int& i, int& kth) {\n if (i < k) {\n kth = std::min(kth, x(i));\n }\n },\n std::numeric_limits::max());\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Tag = Kokkos::Tag;\n\n int n = x.extent(0);\n if (k > n) {\n throw std::invalid_argument(\"k must be less than or equal to the size of the vector\");\n }\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(Policy(0, n), x, Tag());\n\n // Return the k-th smallest element\n return x(k - 1);\n}\n", + "\n\n Kok\u30b3\u30af::View y = Kok\u30b3\u30af::sort(x);\n return y[k - 1];\n}\n", + "\n\n // Create a Kokkos view for the input vector x\n Kokkos::View x_host = Kokkos::create_mirror_view(x);\n\n // Create a Kokkos view for the output vector y\n Kokkos::View y_host = Kokkos::create_mirror_view(Kokkos::View(1));\n\n // Create a Kokkos parallel for loop to find the k-th smallest element\n Kokkos::parallel_for(\"findKthSmallest\", Kokkos::RangePolicy(0, x_host.size()), KOKKOS_LAMBDA(int i) {\n // Sort the input vector x\n std::sort(x_host.begin(), x_host.end());\n\n // Get the k-th smallest element\n y_host(0) = x_host(k - 1);\n });\n\n // Copy the output vector y back to the device\n Kokkos::deep_copy(x, y_host);\n\n // Return the k-th smallest element\n return y_host(0);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n int kth_smallest = 0;\n Kokkos::parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i == k - 1) {\n kth_smallest = x(i);\n }\n },\n kth_smallest, tag_type());\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a Kokkos view of the input vector\n View y = Kokkos::create_mirror_view(x);\n\n // Sort the vector in parallel\n parallel_sort(y.extent(0), KOKKOS_LAMBDA(int i, int j) { return y(i) < y(j); });\n\n // Return the k-th smallest element\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n\n int n = x.extent(0);\n int kth_smallest = 0;\n Kokkos::parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(const int& i, int& kth_smallest) {\n if (i < k) {\n kth_smallest = std::max(kth_smallest, x(i));\n }\n },\n kth_smallest);\n return kth_smallest;\n}\n", + "\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(int i, int j) { return x(i) < x(j); });\n\n // Return the k-th smallest element\n return x(k - 1);\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::Work;\n\n const int n = 
static_cast(x.extent(0));\n Kokkos::View y(\"y\", n);\n\n parallel_for(Work(n), KOKKOS_LAMBDA(int i) {\n y[i] = x[i];\n });\n\n std::sort(y.data(), y.data() + n);\n\n return y[k - 1];\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n int n = x.extent(0);\n int kth = 0;\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(policy_t(0, n), x);\n\n // Get the k-th smallest element\n kth = x(k - 1);\n\n return kth;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n int kth_smallest = -1;\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i == k - 1) {\n kth_smallest = x(i);\n }\n }, tag_type());\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a parallel reduction to find the k-th smallest element\n auto kth_smallest = reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& min) {\n if (i < k) {\n min = std::min(min, x(i));\n }\n }, min);\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using view_type = Kokkos::View;\n\n // Create a copy of the input vector\n view_type y = Kokkos::create_mirror_view(x);\n\n // Sort the copy in parallel\n Kokkos::parallel_sort(y.extent(0), KOKKOS_LAMBDA(int i, int j) { return y(i) < y(j); });\n\n // Return the k-th smallest element\n return y(k - 1);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n int kth = 0;\n #pragma omp parallel for reduction(min:kth)\n for (int i = 0; i < n; i++) {\n if (x[i] < kth || kth == 0) {\n kth = x[i];\n }\n }\n return kth;\n}\n", + "\n\n // Sort the vector in parallel\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n for (int j = i + 1; j < x.size(); ++j) {\n if (x[i] > x[j]) {\n std::swap(x[i], x[j]);\n }\n }\n }\n }\n\n // Return the k-th smallest element\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n // Create a shared variable to store the k-th smallest element\n int kth_smallest;\n\n // Create a parallel region\n #pragma omp parallel\n {\n // Each thread computes the k-th smallest element of its local subarray\n int local_kth_smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < 
local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n\n // Update the shared variable with the minimum of the local k-th smallest elements\n #pragma omp critical\n {\n if (local_kth_smallest < kth_smallest) {\n kth_smallest = local_kth_smallest;\n }\n }\n }\n\n // Return the k-th smallest element\n return kth_smallest;\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(sorted_x.begin(), sorted_x.end());\n }\n }\n return sorted_x[k - 1];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "serial", + "prompt": "/* Find the k-th smallest element of the vector x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th element of the sorted vector.\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th smallest element.\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 
1];\n}\n", + "\n\n int n = x.size();\n if (k < 1 || k > n) {\n throw std::invalid_argument(\"k must be between 1 and n\");\n }\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n if (k < 1 || k > n) {\n throw std::invalid_argument(\"k must be between 1 and the size of the vector.\");\n }\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "=4", + "\n\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n", + "\n\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n", + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th smallest element.\n return x[k - 1];\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "hip", + "prompt": "/* Find the k-th smallest element of the vector x.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\n__global__ void findKthSmallest(const int *x, size_t N, int k, int *kthSmallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int local_min = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n local_min = min(local_min, x[i]);\n }\n atomicMin(kthSmallest, local_min);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int stride = blockDim.x;\n while (stride < N) {\n if (tid < N / stride) {\n if (kthSmallest[tid] > kthSmallest[tid + stride]) {\n int tmp = kthSmallest[tid];\n kthSmallest[tid] = kthSmallest[tid + stride];\n kthSmallest[tid + stride] = tmp;\n }\n }\n __syncthreads();\n stride *= 2;\n }\n if (tid == 0) {\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n hipSort(kthSmallest, kthSmallest + N, hipSortAscending);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int 
tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int local_min = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n local_min = min(local_min, x[i]);\n }\n atomicMin(kthSmallest, local_min);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid == 0) {\n *kthSmallest = x[tid];\n } else {\n if (x[tid] < *kthSmallest) {\n *kthSmallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int numBlocks = hipGridDim_x;\n int numThreads = hipBlockDim_x;\n int numElements = numBlocks * numThreads;\n int numLevels = 0;\n while (numElements > 1) {\n numLevels++;\n numElements = (numElements + numThreads - 1) / numThreads;\n }\n for (int level = 0; level < numLevels; level++) {\n int stride = 1 << level;\n int offset = tid & (stride - 1);\n int other = tid ^ offset;\n if (offset < other && other < numElements) {\n if (kthSmallest[tid] > kthSmallest[other]) {\n int tmp = kthSmallest[tid];\n kthSmallest[tid] = kthSmallest[other];\n kthSmallest[other] = tmp;\n }\n }\n __syncthreads();\n }\n if (tid == 0) {\n *kthSmallest = kthSmallest[0];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = blockId * blockDim + tid;\n int end = min(start + blockDim, N);\n int local_kthSmallest = x[start];\n for (int i = start + 1; i < end; i++) {\n if (x[i] < local_kthSmallest) {\n local_kthSmallest = x[i];\n }\n }\n atomicMin(kthSmallest, local_kthSmallest);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n __shared__ int s[hipBlockDim_x];\n s[tid] = x[tid];\n __syncthreads();\n if (tid == 0) {\n hipSort(s, hipBlockDim_x);\n *kthSmallest = s[k - 1];\n }\n }\n}\n", + "K", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int *d_temp = new int[N];\n for (int i = 0; i < N; i += 2) {\n if (i + 1 < N) {\n if (kthSmallest[i] > kthSmallest[i + 1]) {\n d_temp[i / 2] = kthSmallest[i + 1];\n d_temp[i / 2 + N / 2] = kthSmallest[i];\n } else {\n d_temp[i / 2] = kthSmallest[i];\n d_temp[i / 2 + N / 2] = kthSmallest[i + 1];\n }\n } else {\n d_temp[i / 2] = kthSmallest[i];\n }\n }\n kthSmallest = d_temp;\n __syncthreads();\n if (N % 2 == 1) {\n N = N / 2 + 1;\n } else {\n N = N / 2;\n }\n while (N > 1) {\n for (int i = 0; i < N; i += 2) {\n if (i + 1 < N) {\n if (kthSmallest[i] > kthSmallest[i + 1]) {\n d_temp[i / 2] = 
kthSmallest[i + 1];\n d_temp[i / 2 + N / 2] = kthSmallest[i];\n } else {\n d_temp[i / 2] = kthSmallest[i];\n d_temp[i / 2 + N / 2] = kthSmallest[i + 1];\n }\n } else {\n d_temp[i / 2] = kthSmallest[i];\n }\n }\n kthSmallest = d_temp;\n __syncthreads();\n if (N % 2 == 1) {\n N = N / 2 + 1;\n } else {\n N = N / 2;\n }\n }\n *kthSmallest = kthSmallest[0];\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n int local_kthSmallest = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < local_kthSmallest) {\n local_kthSmallest = x[i];\n }\n }\n __shared__ int shared_kthSmallest;\n shared_kthSmallest = local_kthSmallest;\n for (int i = hipBlockDim_x / 2; i > 0; i /= 2) {\n __syncthreads();\n if (hipThreadIdx_x < i) {\n if (shared_kthSmallest > x[tid + i]) {\n shared_kthSmallest = x[tid + i];\n }\n }\n }\n __syncthreads();\n if (hipThreadIdx_x == 0) {\n if (k == 0) {\n *kthSmallest = shared_kthSmallest;\n } else {\n k--;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_ranks);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_ranks);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int local_kth = x[k - 1];\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n MPI_Allreduce(&local_kth, &kth, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return 
kth;\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth_smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n\n int global_kth_smallest;\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements to rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among all the gathered elements on rank 0\n if 
(rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the k-th smallest elements from all ranks\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kthSmallest = x[k - 1];\n MPI_Gather(&kthSmallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (myRank == 0) {\n std::sort(kthSmallest, kthSmallest + numRanks);\n return kthSmallest[k - 1];\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": ".", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int my_rank, nprocs;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(nprocs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1; // Return -1 on all other ranks\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int n = x.size();\n int local_k = k / num_procs;\n int local_start = my_rank * local_k;\n int local_end = (my_rank + 1) * local_k;\n if (my_rank == num_procs - 1) {\n local_end = n;\n }\n\n std::nth_element(x.begin() + local_start, x.begin() + local_start + local_k - 1,\n x.begin() + local_end);\n int local_kth_smallest = x[local_start + local_k - 1];\n\n int global_kth_smallest;\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN,\n MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(comm_size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (comm_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + 
"did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the k-th smallest element on each rank\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n int local_kth = x[k - 1];\n\n // Gather the k-th smallest elements from all ranks\n std::vector gathered_kth(size);\n MPI_Gather(&local_kth, 1, MPI_INT, gathered_kth.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element\n if (rank == 0) {\n std::nth_element(gathered_kth.begin(), gathered_kth.begin() + k - 1, gathered_kth.end());\n return gathered_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n }\n\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes its local k-th smallest element\n std::sort(x.begin(), x.end());\n int local_kth_smallest = x[k - 1];\n\n // Gather all local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if 
(my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "cuda", + "prompt": "/* Find the k-th smallest element of the vector x.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\n__global__ void findKthSmallest(const int *x, size_t N, int k, int *kthSmallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n int val = x[gid];\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < val) {\n count++;\n }\n }\n if (count == k - 1) {\n *kthSmallest = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[0] = min(kthSmallest[0], x[tid]);\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n for (int i = 1; i < blockDim.x; i++) {\n kthSmallest[0] = min(kthSmallest[0], kthSmallest[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = tid + blockId * blockDim;\n int end = min(start + blockDim, N);\n int localKthSmallest = x[start];\n for (int i = start + 1; i < end; i++) {\n if (x[i] < localKthSmallest) {\n localKthSmallest = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n int globalKthSmallest = localKthSmallest;\n for (int i = 1; i < gridDim; i++) {\n int otherKthSmallest;\n cudaMemcpy(&otherKthSmallest, &globalKthSmallest, sizeof(int), cudaMemcpyDeviceToDevice);\n if (otherKthSmallest < globalKthSmallest) {\n globalKthSmallest = otherKthSmallest;\n }\n }\n *kthSmallest = globalKthSmallest;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockDim = blockDim.x;\n int blockIdx = blockIdx.x;\n int i = blockIdx * blockDim + tid;\n if (i < N) {\n kthSmallest[tid] = x[i];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + blockDim);\n if (blockIdx == 0) {\n *kthSmallest = kthSmallest[k - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n __shared__ int shared[blockDim.x];\n shared[threadIdx.x] = x[tid];\n __syncthreads();\n for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {\n if (threadIdx.x < stride) {\n shared[threadIdx.x] = min(shared[threadIdx.x], shared[threadIdx.x + stride]);\n }\n __syncthreads();\n }\n if (threadIdx.x == 0) {\n if (k == blockIdx.x) {\n *kthSmallest = shared[0];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n int temp;\n for (int i = 1; i < blockDim.x; i++) {\n temp = *kthSmallest;\n *kthSmallest = min(*kthSmallest, x[i]);\n x[i] = min(temp, 
x[i]);\n }\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n temp = *kthSmallest;\n *kthSmallest = min(*kthSmallest, x[i]);\n x[i] = min(temp, x[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int idx = bid * bid_size + tid;\n \n // Find the k-th smallest element in the current block\n int block_kth_smallest = x[idx];\n for (int i = idx + bid_size; i < N; i += grid_size * bid_size) {\n if (x[i] < block_kth_smallest) {\n block_kth_smallest = x[i];\n }\n }\n \n // Find the k-th smallest element among all blocks\n if (tid == 0) {\n for (int i = 1; i < grid_size; i++) {\n int other_block_kth_smallest;\n cudaMemcpy(&other_block_kth_smallest, &x[i * bid_size], sizeof(int), cudaMemcpyDeviceToDevice);\n if (other_block_kth_smallest < block_kth_smallest) {\n block_kth_smallest = other_block_kth_smallest;\n }\n }\n *kthSmallest = block_kth_smallest;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[0] = min(kthSmallest[0], x[tid]);\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < blockDim.x; i++) {\n kthSmallest[0] = min(kthSmallest[0], kthSmallest[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockDim = blockDim.x;\n int blockIdx = blockIdx.x;\n int idx = blockIdx * blockDim + tid;\n if (idx < N) {\n if (tid == 0) {\n *kthSmallest = x[idx];\n } else {\n if (x[idx] < *kthSmallest) {\n *kthSmallest = x[idx];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[0] = x[tid];\n }\n __syncthreads();\n int numThreads = blockDim.x * gridDim.x;\n while (numThreads > 1) {\n if (tid < numThreads / 2) {\n kthSmallest[tid] = min(kthSmallest[tid], kthSmallest[tid + numThreads / 2]);\n }\n __syncthreads();\n numThreads /= 2;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int val = x[tid];\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < val) {\n count++;\n }\n }\n if (count == k - 1) {\n *kthSmallest = val;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n __shared__ int shared[blockDim.x];\n shared[threadIdx.x] = x[tid];\n __syncthreads();\n for (int i = 0; i < blockDim.x; i++) {\n if (shared[threadIdx.x] > shared[i]) {\n shared[threadIdx.x] = shared[i];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n if (k == blockIdx.x + 1) {\n *kthSmallest = shared[threadIdx.x];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int my_kth_smallest = x[tid];\n for (int i = tid + 1; i < N; i++) {\n if (x[i] < my_kth_smallest) {\n my_kth_smallest = x[i];\n }\n }\n atomicMin(kthSmallest, my_kth_smallest);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n int tid_warp = tid % 32;\n int warp_id = tid / 32;\n int warp_kthSmallest = *kthSmallest;\n for (int i = 1; i < 32; i *= 2) {\n __syncthreads();\n if (tid_warp < i) {\n if (warp_kthSmallest > x[tid + i * blockDim.x]) {\n warp_kthSmallest = x[tid + i * blockDim.x];\n }\n }\n }\n __syncthreads();\n if (warp_id == 0) {\n *kthSmallest = warp_kthSmallest;\n }\n 
__syncthreads();\n if (tid == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n int other_kthSmallest;\n cudaMemcpy(&other_kthSmallest, &x[i * blockDim.x], sizeof(int), cudaMemcpyDeviceToDevice);\n if (*kthSmallest > other_kthSmallest) {\n *kthSmallest = other_kthSmallest;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n if (tid == k - 1) {\n *kthSmallest = x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n int temp;\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (x[i] > x[j]) {\n temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n *kthSmallest = x[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int start_idx = bid * bid_size + tid;\n int end_idx = min(start_idx + bid_size, N);\n int local_kth_smallest = x[start_idx];\n for (int i = start_idx + 1; i < end_idx; i++) {\n if (x[i] < local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n atomicMin(kthSmallest, local_kth_smallest);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Find the k-th smallest element of the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth = x[k - 1];\n int global_kth;\n\n // Find the k-th smallest element on each rank\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < local_kth) {\n local_kth = x[i];\n }\n }\n }\n\n // Reduce the local k-th smallest elements to find the global k-th smallest element\n MPI_Allreduce(&local_kth, &global_kth, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n // Return the global k-th smallest element on rank 0\n if (rank == 0) {\n return global_kth;\n } else {\n return -1;\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector to store the sorted elements\n std::vector sorted_x(n);\n\n // Sort the vector in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n sorted_x[i] = x[i];\n }\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Get the k-th smallest element\n int kth_smallest = sorted_x[k - 1];\n\n // Return the k-th smallest element on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 
finds the global k-th smallest element\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n return all_kth_smallest[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the global k-th smallest element on rank 0\n return global_kth_smallest;\n}\n", + "\n\n int n = x.size();\n int kth = -1;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_k = k / size;\n int local_start = rank * local_k;\n int local_end = (rank + 1) * local_k;\n if (local_end > n) {\n local_end = n;\n }\n std::nth_element(x.begin() + local_start, x.begin() + local_start + local_k - 1, x.begin() + local_end);\n int local_kth = x[local_start + local_k - 1];\n MPI_Gather(&local_kth, 1, MPI_INT, &kth, 1, MPI_INT, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n std::vector kth_vec(size);\n for (int i = 0; i < size; i++) {\n kth_vec[i] = kth[i];\n }\n std::nth_element(kth_vec.begin(), kth_vec.begin() + k - 1, kth_vec.end());\n kth = kth_vec[k - 1];\n }\n return kth;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kth_smallest;\n MPI_Gather(&x[k - 1], 1, MPI_INT, &kth_smallest, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the local kth smallest element\n std::nth_element(x.begin() + rank * x.size() / 
size,\n x.begin() + rank * x.size() / size + x.size() / size - 1,\n x.begin() + rank * x.size() / size + k - 1);\n int local_kth = *(x.begin() + rank * x.size() / size + k - 1);\n\n // Gather all local kth smallest elements\n std::vector all_local_kth(size);\n MPI_Gather(&local_kth, 1, MPI_INT, all_local_kth.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global kth smallest element\n if (rank == 0) {\n std::nth_element(all_local_kth.begin(), all_local_kth.begin() + k - 1, all_local_kth.end());\n return all_local_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_k = k / size;\n int local_remainder = k % size;\n int local_start = rank * local_k;\n int local_end = local_start + local_k;\n if (rank < local_remainder) {\n local_start += rank;\n local_end += rank + 1;\n } else {\n local_start += local_remainder;\n local_end += local_remainder;\n }\n\n std::vector local_x(x.begin() + local_start, x.begin() + local_end);\n std::sort(local_x.begin(), local_x.end());\n\n int local_kth_smallest = local_x[local_k - 1];\n if (rank == 0) {\n std::vector kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest.begin(), kth_smallest.end());\n return kth_smallest[k - 1];\n } else {\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kth_smallest;\n MPI_Gather(&x[k - 1], 1, MPI_INT, &kth_smallest, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (rank == 0) {\n std::vector received_elements(size);\n MPI_Gather(&x[k - 1], 1, MPI_INT, received_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(received_elements.begin(), received_elements.end());\n return received_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the global k-th smallest element on rank 0\n return global_kth_smallest;\n}\n", + "\n\n // Find the local k-th smallest element on each rank.\n int local_kth_smallest = *std::max_element(x.begin(), x.end());\n int global_kth_smallest;\n\n // Find the global k-th smallest element.\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN);\n\n // Find the k-th smallest element on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n } else {\n return global_kth_smallest;\n }\n}\n", + "\n\n int 
rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the result on rank 0\n return global_kth_smallest;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int local_k = k / nprocs;\n int local_remainder = k % nprocs;\n\n // Find the local k-th smallest element on each rank\n std::nth_element(x.begin(), x.begin() + local_k + (rank == nprocs - 1 ? local_remainder : 0), x.end());\n int local_kth_smallest = x[local_k + (rank == nprocs - 1 ? local_remainder : 0)];\n\n // Gather all local k-th smallest elements on rank 0\n std::vector kth_smallest_elements(nprocs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (rank == 0) {\n std::nth_element(kth_smallest_elements.begin(), kth_smallest_elements.begin() + k - 1, kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth = x[0];\n int global_kth;\n\n // Find the k-th smallest element on each rank\n #pragma omp parallel reduction(min : local_kth)\n {\n #pragma omp for\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < local_kth) {\n local_kth = x[i];\n }\n }\n }\n\n // Gather the local k-th smallest elements from all ranks\n MPI_Gather(&local_kth, 1, MPI_INT, &global_kth, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element\n if (rank == 0) {\n std::vector gathered_kth(size);\n for (int i = 0; i < size; i++) {\n gathered_kth[i] = global_kth[i];\n }\n std::sort(gathered_kth.begin(), gathered_kth.end());\n return gathered_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, 
MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector gathered_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, gathered_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(gathered_elements.begin(), gathered_elements.end());\n global_kth_smallest = gathered_elements[k - 1];\n }\n\n return global_kth_smallest;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x.\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0.\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element.\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n return all_kth_smallest[k - 1];\n }\n\n return -1; // Dummy return value for non-rank-0 processes.\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int local_k = k / num_ranks;\n int local_start = my_rank * local_k;\n int local_end = (my_rank + 1) * local_k;\n\n std::sort(x.begin() + local_start, x.begin() + local_end);\n\n int local_kth_smallest = x[local_start + local_k - 1];\n\n int global_kth_smallest;\n MPI_Reduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the k-th smallest element of its local data\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n int local_kth_smallest = x[k - 1];\n\n // Gather all local k-th smallest elements to rank 0\n std::vector all_local_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_local_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among all local k-th smallest elements\n if (rank == 0) {\n std::nth_element(all_local_kth_smallest.begin(), all_local_kth_smallest.begin() + k - 1, all_local_kth_smallest.end());\n return all_local_kth_smallest[k - 1];\n } else {\n return -1; // Dummy return value for non-rank-0 processes\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use Kokkos to sort in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(Kokkos::View &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA (const int& i, const int& j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n using exec_space = KokKOS::EXECSPACE;\n using device_type = KokKOS::DEVICE_TYPE;\n\n auto policy = KokKOS::parallel_sort_ascending(results,\n [](const Result &a, const Result &b) { return a.duration < b.duration; },\n exec_space(),\n device_type());\n auto result = KokKOS::sort_view(results, policy);\n results.copy(result);\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) -> bool { return results(i)->startTime < results(j)->startTime; });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int i, const int j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int& i, const int& j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::sort(results, Kokkos::Experimental::compare_fun(\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; 
}));\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, results.extent(0)),\n [results](const int &i, const int &j) { return results(i)->startTime < results(j)->startTime; });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use OpenMP to sort in parallel.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + ".", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = 0; j < results.size() - 1; j++) {\n if (results[j].startTime > results[j + 1].startTime) {\n std::swap(results[j], results[j + 1]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < static_cast(results.size()); i++) {\n for (int j = i + 1; j < static_cast(results.size()); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size() - 1; i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n#pragma opm parallel\n#pragma opm reduction(merge:results:opm_sort(results, [](const Result& a, const Result& b) { return a.startTime < b.startTime; }))\n{\n#pragma opm sections\n{\n#pragma opm section\n{\n std::sort(results.begin(), 
results.end(), [](const Result& a, const Result& b) { return a.startTime < b.startTime; });\n}\n}\n}\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size() - 1; i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma opm parallel\n {\n #pragma opm for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "serial", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n 
});\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "<*>", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.start < b.start;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "hip", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use AMD HIP to sort in parallel. 
The kernel is launched with at least as many threads as there are elements.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\n__global__ void sortByStartTime(Result *results, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) {\n return;\n }\n\n Result temp = results[idx];\n int minIdx = idx;\n for (int i = idx + 1; i < N; i++) {\n if (results[i].startTime < results[minIdx].startTime) {\n minIdx = i;\n }\n }\n\n results[idx] = results[minIdx];\n results[minIdx] = temp;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid >= N) return;\n for (int i = gid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int 
bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n __shared__ Result sharedResults[256];\n sharedResults[tid] = results[gid];\n __syncthreads();\n int stride = 256;\n while (stride > 0) {\n if (tid < stride && tid + stride < N && sharedResults[tid].startTime > sharedResults[tid + stride].startTime) {\n Result temp = sharedResults[tid];\n sharedResults[tid] = sharedResults[tid + stride];\n sharedResults[tid + stride] = temp;\n }\n __syncthreads();\n stride /= 2;\n }\n results[gid] = sharedResults[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid >= N) {\n return;\n }\n for (int i = gid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use MPI to sort in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of results. Store the output in results on rank 0.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": ".", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), 
results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n results = gatheredResults;\n } else {\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = 
gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.start < b.start; });\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n std::vector gatheredResults;\n for (int i = 1; i < size; i++) {\n int numResults;\n MPI_Recv(&numResults, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n std::vector recvResults(numResults);\n MPI_Recv(recvResults.data(), numResults * sizeof(Result), MPI_BYTE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n gatheredResults.insert(gatheredResults.end(), recvResults.begin(), recvResults.end());\n }\n results = gatheredResults;\n } else {\n // Send the number of results and the results to rank 0\n int numResults = results.size();\n MPI_Send(&numResults, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);\n MPI_Send(results.data(), numResults * sizeof(Result), MPI_BYTE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = 
gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(size * results.size());\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + 
} + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the sorted results on rank 0\n if (rank == 0) {\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n 
std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort locally\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather all results to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_DOUBLE, gatheredResults.data(), results.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Sort gathered results\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // 
Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "cuda", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use CUDA to sort in parallel. The kernel is launched with at least as many threads as there are elements.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\n__global__ void sortByStartTime(Result *results, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[i].startTime < results[tid].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n 
}\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + ".", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result tmp = results[tid];\n results[tid] = results[i];\n results[i] = tmp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use MPI and OpenMP to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of results. 
Store the outputs in results on rank 0.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks on rank 0\n std::vector allResults(results.size() * numRanks);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, allResults.data(), results.size(), MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n\n // Sort all results on rank 0\n if (myRank == 0) {\n std::sort(allResults.begin(), allResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = allResults;\n }\n}\n", + "\n\n int numRanks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather all results to rank 0\n std::vector allResults;\n if (rank == 0) {\n allResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, allResults.data(), results.size(),\n MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort all results on rank 0\n if (rank == 0) {\n std::sort(allResults.begin(), allResults.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = allResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(size * results.size());\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, gatheredResults.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the gathered results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, gatheredResults.data(), results.size(), MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, nullptr, 0, MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), 
results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, gatheredResults.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results = gatheredResults;\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the outputs in results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result 
&b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Sort results by start time using MPI_Allgather and OpenMP\n int numRanks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort results on each rank\n #pragma openmp parallel\n {\n #pragma openmp for\n for (int i = 0; i < results.size(); i++) {\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n }\n }\n\n // Gather sorted results on rank 0\n std::vector sortedResults;\n if (rank == 0) {\n int totalSize = results.size() * numRanks;\n MPI_Gather(&results[0], results.size(), MPI_DATATYPE_RESULT, &results[0], results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results.resize(totalSize);\n } else {\n MPI_Gather(&results[0], results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Get the number of processes\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Get the rank of the process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the results on each process\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather the sorted results from all processes\n std::vector sorted_results(results.size() * num_procs);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT,\n sorted_results.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n\n // Copy the sorted results back to the input vector on rank 0\n if (rank == 0) {\n results = std::move(sorted_results);\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the gathered results in results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n 
std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recvResults;\n MPI_Recv(&recvResults, results.size(), MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n results.insert(results.end(), recvResults.begin(), recvResults.end());\n }\n } else {\n MPI_Send(&results, results.size(), MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Sort the array x of complex numbers by their magnitude in ascending order.\n Use Kokkos to sort in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), [&](int i, int j) -> bool {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n 
Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use OpenMP to sort in parallel.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Sort the vector in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the number of threads\n int num_threads = omp_get_num_threads();\n\n // Get the thread ID\n int thread_id = omp_get_thread_num();\n\n // Sort the portion of the vector assigned to this thread\n std::sort(x.begin() + thread_id, x.begin() + thread_id + num_threads);\n }\n\n // Merge the sorted portions of the vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "Magnitude:1. Moderato Moderato Moderato: 0. Moderato: Moderato: Moderato:forte forte forte piano piano piano piano piano piano piano: piano: piano: Moderato: piano: piano: forte forte forte: forte forte forte: forte: forte: piano piano piano piano piano piano: Piano: Piano: Moderato: Moderato: Moderato: piano: piano piano: forte forte forte: forte: forte: forte: Moderato:", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::polar(std::abs(x[i]), std::arg(x[i]));\n }\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n for (size_t j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::complex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "(x)", + "\n\n // Sort the vector x by the magnitude of its elements in ascending order.\n 
std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::complex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n // Sort the vector x by the magnitude of its elements in ascending order.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "serial", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + 
"\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + ":", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return abs(a) < abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "hip", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use AMD HIP to sort in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\n__global__ void sortComplexByMagnitude(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double mag = hipCrealf(x[tid]) * hipCrealf(x[tid]) + hipCrealf(x[tid]) * hipCrealf(x[tid]);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n double temp = x[tid].x;\n x[tid].x = x[i].x;\n x[i].x = temp;\n hipDoubleComplex temp_c = x[tid];\n x[tid] = x[i];\n x[i] = temp_c;\n }\n }\n }\n}\n", + "\n\n // Compute the magnitude of each complex number\n float *mag = new float[N];\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes\n std::sort(mag, mag + N);\n\n // Sort the complex numbers by their magnitudes\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (mag[i] == sqrt(x[j].x * x[j].x + x[j].y * x[j].y)) {\n x[i] = x[j];\n break;\n }\n }\n }\n\n delete[] mag;\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Get the block index\n int bid = hipBlockIdx_x;\n\n // Calculate the index of the element to be sorted\n int i = bid * hipBlockDim_x + tid;\n\n // Check if the index is within the bounds of the array\n if (i < N) {\n // Get the magnitude of the complex number\n double magnitude = hip_abs(x[i]);\n\n // Sort the complex number by its magnitude\n for (int j = i + 1; j < N; j++) {\n double magnitude2 = hip_abs(x[j]);\n if (magnitude > magnitude2) {\n // Swap the 
complex numbers\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n\n // Update the magnitude of the complex number\n magnitude = magnitude2;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double mag = hip_abs(x[gid]);\n x[gid] = make_hipDoubleComplex(mag, 0.0);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double magnitude = hip_abs(x[tid]);\n x[tid] = make_hipDoubleComplex(magnitude, 0.0);\n }\n __syncthreads();\n hipSort(x, x + N, hipDoubleComplexGreater);\n __syncthreads();\n if (tid < N) {\n double real = hip_real(x[tid]);\n double imag = hip_imag(x[tid]);\n x[tid] = make_hipDoubleComplex(real, imag);\n }\n}\n", + "\n\n // Get thread and block indices\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n\n // Compute the index of the element to be processed by this thread\n int index = bid * bid_size + tid;\n\n // If the index is out of range, exit\n if (index >= N) {\n return;\n }\n\n // Get the current element\n hipDoubleComplex current_element = x[index];\n\n // Compute the magnitude of the current element\n double magnitude = hip_abs(current_element);\n\n // Find the index of the element with the smallest magnitude so far\n int min_index = index;\n for (int i = index + 1; i < N; i++) {\n double current_magnitude = hip_abs(x[i]);\n if (current_magnitude < magnitude) {\n min_index = i;\n magnitude = current_magnitude;\n }\n }\n\n // Swap the current element with the element with the smallest magnitude so far\n hipDoubleComplex temp = x[index];\n x[index] = x[min_index];\n x[min_index] = temp;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = hipMag(x[tid]);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n hipSort(x, x + N, hipDoubleComplexCompare);\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n hipDoubleComplex c = x[tid];\n x[tid] = hipDoubleComplex{sqrt(c.x * c.x + c.y * c.y), 0.0};\n }\n}\n", + "\n\n // Sort the array using the radix sort algorithm\n for (int i = 0; i < N; i++) {\n int key = abs(x[i]);\n int j = i - 1;\n\n while (j >= 0 && abs(x[j]) > key) {\n x[j + 1] = x[j];\n j--;\n }\n x[j + 1] = hipDoubleComplex{key, 0};\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Sort the local part of the array\n std::sort(x + tid, x + tid + hipBlockDim_x);\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted parts of the array\n for (int stride = hipBlockDim_x; stride < N; stride *= 2) {\n for (int i = 0; i < N; i += 2 * stride) {\n if (hipAbs(x[i]) > hipAbs(x[i + stride])) {\n std::swap(x[i], x[i + stride]);\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Get the block index\n int bid = hipBlockIdx_x;\n\n // Calculate the index of the element to be sorted\n int index = bid * hipBlockDim_x + tid;\n\n // Check if the index is within the bounds of the array\n if (index < N) {\n // Get the magnitude of the complex number\n double magnitude = hipMag(x[index]);\n\n // Sort the complex number by 
its magnitude\n for (int i = index + 1; i < N; i++) {\n double magnitude_i = hipMag(x[i]);\n if (magnitude > magnitude_i) {\n // Swap the complex numbers\n hipDoubleComplex temp = x[index];\n x[index] = x[i];\n x[i] = temp;\n\n // Update the magnitude of the complex number\n magnitude = magnitude_i;\n }\n }\n }\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Sort the local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + hipBlockDim_x; j < N; j += hipBlockDim_x) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n // Compute the magnitude of each complex number\n float *mag = new float[N];\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes in ascending order\n std::sort(mag, mag + N);\n\n // Sort the complex numbers by their magnitude\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (mag[i] == sqrt(x[j].x * x[j].x + x[j].y * x[j].y)) {\n x[i] = x[j];\n break;\n }\n }\n }\n\n delete[] mag;\n}\n", + "\n\n // Calculate the magnitude of each complex number\n float *mag = (float *)malloc(N * sizeof(float));\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrtf(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes using the hipSort function\n hipSort(mag, N, hipFloat);\n\n // Sort the complex numbers based on the sorted magnitudes\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (mag[i] > mag[j]) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n free(mag);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double magnitude = hipMagnitude(x[tid]);\n x[tid].x = magnitude;\n }\n __syncthreads();\n if (tid < N) {\n hipSort(x, N, sizeof(hipDoubleComplex), hipDoubleComplexMagnitudeCompare);\n }\n}\n", + "\n\n // Sort the vector x by magnitude in ascending order.\n // Use AMD HIP to sort in parallel. 
The kernel is launched with at least as many threads as elements in x.\n // Example:\n //\n // input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n // output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n\n // Get the thread index.\n int tid = hipThreadIdx_x;\n\n // Get the block index.\n int bid = hipBlockIdx_x;\n\n // Get the block size.\n int bsize = hipBlockDim_x;\n\n // Get the grid size.\n int gsize = hipGridDim_x;\n\n // Calculate the starting index for this thread.\n int start = tid + bid * bsize;\n\n // Calculate the ending index for this thread.\n int end = min(start + bsize, N);\n\n // Sort the elements in this thread's range.\n sort(x + start, x + end, [](hipDoubleComplex a, hipDoubleComplex b) { return abs(a) < abs(b); });\n\n // Synchronize all threads.\n hipDeviceSynchronize();\n\n // If this is the first thread in the grid, merge the sorted ranges.\n if (tid == 0) {\n for (int i = 1; i < gsize; i++) {\n // Get the starting index for this thread.\n int start = i * bsize;\n\n // Get the ending index for this thread.\n int end = min(start + bsize, N);\n\n // Merge the sorted ranges.\n merge(x, x + start, x + start, x + end, x);\n }\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Get the block ID\n int bid = hipBlockIdx_x;\n\n // Get the block size\n int bsize = hipBlockDim_x;\n\n // Get the number of blocks\n int nblocks = hipGridDim_x;\n\n // Calculate the starting index for this thread\n int start = tid + bid * bsize;\n\n // Calculate the ending index for this thread\n int end = min(start + bsize, N);\n\n // Sort the local portion of the array\n sort(x + start, x + end, [](hipDoubleComplex a, hipDoubleComplex b) {\n return abs(a) < abs(b);\n });\n\n // Synchronize all threads\n hipDeviceSynchronize();\n\n // Merge the sorted local portions of the array\n for (int i = 1; i < nblocks; i *= 2) {\n if (bid % (2 * i) == 0) {\n int partner = bid + i;\n if (partner < nblocks) {\n int start = i * bsize;\n int end = min(start + 2 * bsize, N);\n merge(x + start, x + start + bsize, x + start + bsize, x + end,\n x + start);\n }\n }\n hipDeviceSynchronize();\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Sort the local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted local elements\n for (int s = 1; s < N; s *= 2) {\n for (int i = tid; i < N; i += 2 * s) {\n int j = i + s;\n if (j < N && hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ hipDoubleComplex block[1024];\n block[threadIdx.x] = x[threadIdx.x];\n __syncthreads();\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(block[i]) > hipAbs(block[j])) {\n hipDoubleComplex temp = block[i];\n block[i] = block[j];\n block[j] = temp;\n }\n }\n }\n __syncthreads();\n\n // Copy the sorted elements back to the global memory\n x[threadIdx.x] = block[threadIdx.x];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use MPI to sort in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0798249894, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.127103956, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered;\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, store the gathered vector in x\n if (rank == 0) {\n x = gathered;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the size and rank of the current process\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the local vector of complex numbers\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0797401276, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0803959542, + "num_procs": 1 + }, + 
{ + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.079734164, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0804615316, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector to rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.079872767, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1276068082, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vectors on rank 0\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vectors from rank 0 to all ranks\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + 
"best_sequential_runtime": 0.0803489998, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1274666808, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0802614688, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1266554154, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0807387772, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1287697832, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 
MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.080070105, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1283421064, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::sort(x.begin(), x.end(), [&magnitudes](std::complex a, std::complex b) {\n return magnitudes[a] < magnitudes[b];\n });\n\n // Gather the sorted vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector> recv;\n MPI::COMM_WORLD.Recv(recv, i, 0);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI::COMM_WORLD.Send(x, 0, 0);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Collect the sorted vectors on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(&recv, x.size(), MPI_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0799216108, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.080185534, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = std::move(sorted_x);\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": 
false, + "best_sequential_runtime": 0.0800969434, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0803318732, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector to rank 0\n MPI_Scatter(gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0807336698, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1299729236, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Replace the original vector on rank 0 with the sorted vector\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.079743133, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0805165108, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Get the size and rank of the communicator\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + 
"best_sequential_runtime": 0.0796872514, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.080389171, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted;\n if (rank == 0) {\n sorted.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, comm);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(sorted.begin(), sorted.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(sorted.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, comm);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.080225445, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1298910354, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Get the size and rank of the communicator\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0798140288, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0803808412, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::vector> sorted_x(n);\n for (int i = 0; i < n; i++) {\n for (int j = 0; j < n; j++) {\n if (std::abs(x[j]) == magnitudes[i]) {\n sorted_x[i] = x[j];\n break;\n }\n }\n }\n\n // Gather the sorted vector on rank 0\n MPI_Gather(&sorted_x[0], n, MPI_DOUBLE_COMPLEX, &x[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n // Get the magnitude of the complex number\n double magnitude = std::abs(x[i]);\n // Store the magnitude in the vector\n mag[i] = magnitude;\n }\n\n // Sort the vector of magnitudes\n std::sort(mag.begin(), mag.end());\n\n // Sort the vector of complex numbers by magnitude\n std::sort(x.begin(), x.end(), [&](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::vector> sorted_x(n);\n for (int i = 0; i < n; i++) {\n for (int j = 0; j < n; j++) {\n if (std::abs(x[j]) == magnitudes[i]) {\n sorted_x[i] = x[j];\n break;\n }\n }\n }\n\n // Gather the sorted vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector> recv_x(n);\n MPI::COMM_WORLD.Recv(recv_x.data(), n, MPI::DOUBLE, i, 0);\n sorted_x.insert(sorted_x.end(), recv_x.begin(), recv_x.end());\n }\n } else {\n MPI::COMM_WORLD.Send(sorted_x.data(), n, MPI::DOUBLE, 0, 0);\n }\n\n // Update the vector x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n x = sorted_x;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "cuda", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use CUDA to sort in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\n__global__ void sortComplexByMagnitude(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(mag, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n cuDoubleComplex tmp = x[tid];\n int i = tid;\n while (i > 0 && cuCabs(x[i - 1]) > cuCabs(tmp)) {\n x[i] = x[i - 1];\n i--;\n }\n x[i] = tmp;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid].x = x[tid].y;\n x[tid].y = 0.0;\n }\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ cuDoubleComplex block_data[256];\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int block_size = blockDim.x;\n int start = bid * block_size;\n int end = min(start + block_size, N);\n for (int i = start; i < end; i++) {\n block_data[tid] = x[i];\n __syncthreads();\n for (int j = tid + 1; j < block_size; j++) {\n if (abs(block_data[tid]) > abs(block_data[j])) {\n cuDoubleComplex temp = block_data[tid];\n block_data[tid] = block_data[j];\n block_data[j] = temp;\n }\n }\n __syncthreads();\n }\n\n // Copy the sorted elements back to the global memory\n for (int i = start; i < end; i++) {\n x[i] = block_data[tid];\n __syncthreads();\n }\n}\n", + "\n\n // Get the thread ID\n int tid = threadIdx.x;\n\n // Sort the local part of the array\n thrust::sort(x + tid, x + tid + min(N - tid, blockDim.x),\n [](cuDoubleComplex a, cuDoubleComplex b) { return cuCabs(a) < cuCabs(b); });\n\n // Synchronize threads\n __syncthreads();\n\n // Merge the sorted parts of the array\n for (int s = blockDim.x; s <= N; s *= 2) {\n if (tid < N - s) {\n cuDoubleComplex a = x[tid];\n cuDoubleComplex b = x[tid + s];\n if (cuCabs(a) > cuCabs(b)) {\n x[tid] = b;\n x[tid + s] = a;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = cuCabs(x[tid]);\n x[tid] = cuCmul(x[tid], cuCdiv(1.0, magnitude));\n }\n}\n", + "\n\n // Get the thread index\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n\n // If the thread index is within the bounds of the array, sort the element\n if (tid < N) {\n // Calculate the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n\n // Sort the element by magnitude\n for (int i = tid + 1; i < N; i++) {\n double otherMagnitude = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n if (magnitude > otherMagnitude) {\n // Swap the elements\n cuDoubleComplex temp = x[tid];\n x[tid] = x[i];\n x[i] = temp;\n\n // Update the magnitude of the current element\n magnitude = otherMagnitude;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = mag;\n x[tid].y = 0.0;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n cuDoubleComplex 
tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid].x = creal(x[tid]);\n x[tid].y = cimag(x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n while (i < N) {\n cuDoubleComplex temp;\n for (int j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = magnitude;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n double temp = x[tid].x;\n x[tid].x = x[i].x;\n x[i].x = temp;\n cuDoubleComplex temp2 = x[tid];\n x[tid] = x[i];\n x[i] = temp2;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n for (int i = tid + 1; i < N; i++) {\n double mag_i = cuCabs(x[i]);\n if (mag > mag_i) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n mag = mag_i;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= N) return;\n \n // Compute the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n \n // Store the magnitude in shared memory\n __shared__ double magnitudes[N];\n magnitudes[tid] = magnitude;\n \n // Synchronize threads\n __syncthreads();\n \n // Sort the magnitudes in shared memory\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (magnitudes[i] > magnitudes[j]) {\n double temp = magnitudes[i];\n magnitudes[i] = magnitudes[j];\n magnitudes[j] = temp;\n }\n }\n }\n \n // Synchronize threads\n __syncthreads();\n \n // Find the index of the complex number with the given magnitude\n int index = -1;\n for (int i = 0; i < N; i++) {\n if (magnitudes[i] == magnitude) {\n index = i;\n break;\n }\n }\n \n // Swap the complex numbers\n cuDoubleComplex temp = x[tid];\n x[tid] = x[index];\n x[index] = temp;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= N) return;\n \n double magnitude = cuCabs(x[tid]);\n x[tid] = cuCmul(x[tid], cuCdiv(1.0, magnitude));\n \n __shared__ double shared_magnitude[blockDim.x];\n shared_magnitude[threadIdx.x] = magnitude;\n __syncthreads();\n \n for (int i = 0; i < blockDim.x; i++) {\n if (tid + i + 1 < N && shared_magnitude[tid] > shared_magnitude[tid + i + 1]) {\n double temp = shared_magnitude[tid];\n shared_magnitude[tid] = shared_magnitude[tid + i + 1];\n shared_magnitude[tid + i + 1] = temp;\n \n cuDoubleComplex temp_complex = x[tid];\n x[tid] = x[tid + i + 1];\n x[tid + i + 1] = temp_complex;\n }\n }\n \n}\n", + "\n\n // Each thread sorts a single element of x.\n int tid = threadIdx.x;\n if (tid < N) {\n // Compute the magnitude of the complex number.\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n // Store the magnitude in the complex number.\n x[tid].x = magnitude;\n }\n // Sort the complex numbers by their magnitude.\n thrust::sort(x, x + N, thrust::less());\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ cuDoubleComplex block[32];\n block[threadIdx.x] = x[blockIdx.x * blockDim.x + threadIdx.x];\n __syncthreads();\n for (int i = 0; i < blockDim.x; i++) {\n for (int j = i + 1; j < blockDim.x; j++) {\n if (cuCabsf(block[i]) > cuCabsf(block[j])) {\n cuDoubleComplex temp = block[i];\n block[i] = 
block[j];\n block[j] = temp;\n }\n }\n }\n __syncthreads();\n\n // Copy the sorted elements back to the global memory\n x[blockIdx.x * blockDim.x + threadIdx.x] = block[threadIdx.x];\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n cuDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n // Each thread sorts a single element\n int tid = threadIdx.x;\n if (tid < N) {\n // Compute the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n // Store the magnitude in shared memory\n __shared__ double magnitudes[N];\n magnitudes[tid] = magnitude;\n // Synchronize threads\n __syncthreads();\n // Sort the magnitudes in shared memory\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (magnitudes[i] > magnitudes[j]) {\n double temp = magnitudes[i];\n magnitudes[i] = magnitudes[j];\n magnitudes[j] = temp;\n }\n }\n }\n // Synchronize threads\n __syncthreads();\n // Find the index of the sorted magnitude in the original array\n int index = -1;\n for (int i = 0; i < N; i++) {\n if (magnitudes[tid] == sqrt(x[i].x * x[i].x + x[i].y * x[i].y)) {\n index = i;\n break;\n }\n }\n // Swap the complex number at the current index with the complex number at the sorted index\n if (index != -1) {\n cuDoubleComplex temp = x[tid];\n x[tid] = x[index];\n x[index] = temp;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(magnitude, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n x[tid] = make_cuDoubleComplex(creal(x[tid]), cimag(x[tid]));\n }\n}\n", + "\n\n // Get the index of the current element\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n\n // If the index is out of bounds, return\n if (idx >= N) {\n return;\n }\n\n // Get the magnitude of the current element\n double magnitude = abs(x[idx]);\n\n // Find the index of the element with the smallest magnitude\n int minIdx = idx;\n for (int i = idx + 1; i < N; i++) {\n double otherMagnitude = abs(x[i]);\n if (otherMagnitude < magnitude) {\n minIdx = i;\n magnitude = otherMagnitude;\n }\n }\n\n // Swap the current element with the element with the smallest magnitude\n if (minIdx != idx) {\n double temp = x[idx];\n x[idx] = x[minIdx];\n x[minIdx] = temp;\n }\n}\n", + "\n\n // Sort the complex numbers by their magnitude in ascending order.\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n cuDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(mag, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n int i = tid;\n while (i > 0 && cuCreal(x[i]) < cuCreal(x[i - 1])) {\n swap(x[i], x[i - 1]);\n i--;\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid] = make_cuDoubleComplex(cuCreal(x[tid]), cuCimag(x[tid]));\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use MPI and OpenMP to sort in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector x on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector of complex numbers.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(size * x.size());\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector x on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector x.\n if (rank != 0) {\n x.clear();\n }\n\n // Copy the sorted vector to x on rank 0.\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Sort the vector x on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vector x from all ranks on rank 0\n std::vector> sorted_x(x.size() * nranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_PRECISION, sorted_x.data(), x.size(), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector x on rank 0\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Scatter the sorted vector x from rank 0 to all ranks\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_PRECISION, x.data(), x.size(), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n 
int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and 
store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the local vector.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all other ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int nprocs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * nprocs);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vector and store it in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Sort the vector x on each rank by magnitude in ascending order.\n 
std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> gathered_x(x.size() * MPI::world().size());\n MPI::world().all_gather(x.data(), x.size(), MPI::DOUBLE_REAL, gathered_x.data(), x.size(), MPI::DOUBLE_REAL);\n\n // Replace the vector x on rank 0 with the gathered sorted vectors.\n if (MPI::world().rank() == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, copy the sorted vector back to x.\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector x.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ii++) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); jj++) {\n if (input(ii, jj) == 1) { num_neighbors++; }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++num_ones;\n }\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ii++) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); jj++) {\n if (ii == i && jj == j) continue;\n num_ones += (input(ii, jj) == 1);\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++num_ones;\n }\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int numNeighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && input(ii, jj) == 1) {\n ++numNeighbors;\n }\n }\n }\n output(i, j) = (numNeighbors == 1) ? 
1 : 0;\n });\n}\n", + "Kokkoss_t_t", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy<>(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++count;\n }\n }\n }\n output(i, j) = (count == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const size_t i, const size_t j) {\n int sum = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input(ii, jj);\n }\n }\n }\n output(i, j) = (sum == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& j) {\n int count = 0;\n if (i > 0) count += input(i - 1, j);\n if (i < N - 1) count += input(i + 1, j);\n if (j > 0) count += input(i, j - 1);\n if (j < N - 1) count += input(i, j + 1);\n output(i, j) = (count == 1);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { sum += input(ii, jj); }\n }\n }\n output(i, j) = (sum == 1);\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (ii == i && jj == j) continue;\n num_neighbors += input(ii, jj);\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && input(ii, jj) == 1) {\n num_neighbors++;\n }\n }\n }\n output(i, j) = (num_neighbors == 1) ? 1 : 0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cells\", Kokkos::RangePolicy>(N, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++count;\n }\n }\n }\n output(i, j) = (count == 1);\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1][j];\n if (i < N - 1) num_ones += input[i + 1][j];\n if (j > 0) num_ones += input[i][j - 1];\n if (j < N - 1) num_ones += input[i][j + 1];\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) {\n count += input[i - 1][j];\n }\n if (i < N - 1) {\n count += input[i + 1][j];\n }\n if (j > 0) {\n count += 
input[i][j - 1];\n }\n if (j < N - 1) {\n count += input[i][j + 1];\n }\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma opm simd\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0 && input[i - 1][j] == 1) count++;\n if (i < N - 1 && input[i + 1][j] == 1) count++;\n if (j > 0 && input[i][j - 1] == 1) count++;\n if (j < N - 1 && input[i][j + 1] == 1) count++;\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n 
#pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1 + j * N];\n if (i < N - 1) num_neighbors += input[i + 1 + j * N];\n if (j > 0) num_neighbors += input[i + (j - 1) * N];\n if (j < N - 1) num_neighbors += input[i + (j + 1) * N];\n output[i + j * N] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j] == 1) ++num_neighbors;\n if (j > 0 && input[i][j - 1] == 1) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1] == 1) ++num_neighbors;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1][j];\n if (i < N - 1) num_ones += input[i + 1][j];\n if (j > 0) num_ones += input[i][j - 1];\n if (j < N - 1) num_ones += input[i][j + 1];\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n size_t num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "serial", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. 
Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0 && input[i - 1][j] == 1) ++count;\n if (i < N - 1 && input[i + 1][j] == 1) ++count;\n if (j > 0 && input[i][j - 1] == 1) ++count;\n if (j < N - 1 && input[i][j + 1] == 1) ++count;\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (i + di >= 0 && i + di < N && j + dj >= 0 && j + dj < N) {\n num_ones += input[(i + di) * N + j + dj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n 
}\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (size_t k = i - 1; k <= i + 1; k++) {\n for (size_t l = j - 1; l <= j + 1; l++) {\n if (k >= 0 && k < N && l >= 0 && l < N && (k != i || l != j)) {\n num_neighbors += input[k * N + l];\n }\n }\n }\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j] == 1) ++num_neighbors;\n if (j > 0 && input[i][j - 1] == 1) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1] == 1) ++num_neighbors;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; 
j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i][j] = (num_neighbors == 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "hip", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\n__global__ void cellsXOR(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) { sum += input[ny * N + nx]; }\n }\n }\n output[y * N + x] = (sum == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = count == 1 ? 
1 : 0;\n }\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n if (x > 0) num_neighbors += input[y * N + x - 1];\n if (x < N - 1) num_neighbors += input[y * N + x + 1];\n if (y > 0) num_neighbors += input[(y - 1) * N + x];\n if (y < N - 1) num_neighbors += input[(y + 1) * N + x];\n output[y * N + x] = (num_neighbors == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int count = 0;\n if (i > 0) count += input[i - 1 + j * N];\n if (i < N - 1) count += input[i + 1 + j * N];\n if (j > 0) count += input[i + (j - 1) * N];\n if (j < N - 1) count += input[i + (j + 1) * N];\n output[i + j * N] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1 + j * N];\n if (i < N - 1) num_ones += input[i + 1 + j * N];\n if (j > 0) num_ones += input[i + (j - 1) * N];\n if (j < N - 1) num_ones += input[i + (j + 1) * N];\n output[i + j * N] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int num_ones = 0;\n if (x - 1 >= 0) num_ones += input[(x - 1) * N + y];\n if (x + 1 < N) num_ones += input[(x + 1) * N + y];\n if (y - 1 >= 0) num_ones += input[x * N + y - 1];\n if (y + 1 < N) num_ones += input[x 
* N + y + 1];\n output[x * N + y] = (num_ones == 1);\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int sum = 0;\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n int idy = blockIdx.y * blockDim.y + threadIdx.y;\n if (idx < N && idy < N) {\n int left = idx > 0 ? input[idx - 1 + idy * N] : 0;\n int right = idx < N - 1 ? input[idx + 1 + idy * N] : 0;\n int up = idy > 0 ? input[idx + (idy - 1) * N] : 0;\n int down = idy < N - 1 ? input[idx + (idy + 1) * N] : 0;\n sum = left + right + up + down;\n }\n output[idx + idy * N] = input[idx + idy * N] ? (sum == 1 ? 1 : 0) : 0;\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0)\n continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n if (x > 0) {\n count += input[y * N + x - 1];\n }\n if (x < N - 1) {\n count += input[y * N + x + 1];\n }\n if (y > 0) {\n count += input[(y - 1) * N + x];\n }\n if (y < N - 1) {\n count += input[(y + 1) * N + x];\n }\n output[y * N + x] = (count == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t num_rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * num_rows_per_rank;\n size_t end_row = std::min((rank + 1) * num_rows_per_rank, N);\n\n // Each rank computes its subset of the output.\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks.\n MPI_Gather(output.data() + start_row * N, (end_row - start_row) * N, MPI_INT,\n output.data(), (end_row - start_row) * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n std::vector rowXOR(N, 0);\n for (size_t j = 0; j < N; ++j) {\n if (input[rank * N + j] == 1) {\n rowXOR[j] = 1;\n }\n }\n\n // Reduce the XORs of all rows to rank 0\n std::vector allRowXORs(N, 0);\n MPI_Reduce(rowXOR.data(), allRowXORs.data(), N, MPI_INT, MPI_BXOR, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the final output\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int numNeighbors = 0;\n if (i > 0 && allRowXORs[i - 1] == 1) {\n ++numNeighbors;\n }\n if (i < N - 1 && allRowXORs[i + 1] == 1) {\n ++numNeighbors;\n }\n if (j > 0 && input[i * N + j - 1] == 1) {\n ++numNeighbors;\n }\n if (j < N - 1 && input[i * N + j + 1] == 1) {\n ++numNeighbors;\n }\n output[i * N + j] = (numNeighbors == 1);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += 
input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv(N * N);\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] ^= recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, (rank + 1) * nRowsPerRank);\n std::vector localInput(nRowsPerRank * N);\n std::vector localOutput(nRowsPerRank * N);\n MPI_Scatter(input.data(), N * N, MPI_INT, localInput.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int numNeighbors = 0;\n if (i > 0) numNeighbors += localInput[(i - 1) * N + j];\n if (i < N - 1) numNeighbors += localInput[(i + 1) * N + j];\n if (j > 0) numNeighbors += localInput[i * N + j - 1];\n if (j < N - 1) numNeighbors += localInput[i * N + j + 1];\n localOutput[i * N + j] = (numNeighbors == 1);\n }\n }\n MPI_Gather(localOutput.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Every rank computes the XOR of its own cells\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N 
+ j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (my_rank == 0) {\n for (int i = 1; i < num_ranks; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] |= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n // Check the neighbors\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j 
- 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] ^= recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n std::vector local_output(N, 0);\n for (size_t i = 0; i < N; ++i) {\n int count = 0;\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n continue;\n }\n count += input[i * N + j];\n }\n local_output[i] = count == 1 ? 1 : 0;\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_output.data(), N, MPI_INT, output.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of the output\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Each rank computes its subset of output.\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += 
input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int rows_per_rank = (N + size - 1) / size;\n int start_row = i * rows_per_rank;\n int end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min((rank + 1) * chunk_size, N);\n\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min((i + 1) * chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Gather all the results to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 
1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] ^= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own rows\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] ^= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Compute the number of neighbors that are 1.\n std::vector num_neighbors(N * N, 0);\n MPI_Allreduce(MPI_IN_PLACE, num_neighbors.data(), 
num_neighbors.size(), MPI_INT, MPI_SUM, comm);\n\n // Set the output to 1 if there is exactly one neighbor that is 1.\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n output[i * N + j] = (num_neighbors[i * N + j] == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes its local part of the output.\n for (size_t i = my_rank; i < N; i += num_procs) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; i++) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] |= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Each rank computes its subset of the output.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks onto rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "cuda", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\n__global__ void cellsXOR(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0 && input[y * N + x - 1]) num_ones++;\n if (x < N - 1 && input[y * N + x + 1]) num_ones++;\n if (y > 0 && input[(y - 1) * N + x]) num_ones++;\n if (y < N - 1 && input[(y + 1) * N + x]) num_ones++;\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int sum = 0;\n if (x > 0) {\n sum += input[y * N + x - 1];\n }\n if (x < N - 1) {\n sum += input[y * N + x + 1];\n }\n if (y > 0) {\n sum += input[(y - 1) * N + x];\n }\n if (y < N - 1) {\n sum += input[(y + 1) * N + x];\n }\n output[y * N + x] = (sum == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[(y * N) + (x - 1)];\n if (x < N - 1) sum += input[(y * N) + (x + 1)];\n if (y > 0) sum += input[((y - 1) * N) + x];\n if (y < N - 1) sum += input[((y + 1) * N) + x];\n output[(y * N) + x] = (sum == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[(y - 1) * N + x - 1];\n if (x < N - 1) sum += input[(y - 1) * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n if (y > 0 && x > 0) sum += input[(y - 1) * N + x - 1];\n if (y > 0 && x < N - 1) sum += input[(y - 1) * 
N + x + 1];\n if (y < N - 1 && x > 0) sum += input[(y + 1) * N + x - 1];\n if (y < N - 1 && x < N - 1) sum += input[(y + 1) * N + x + 1];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n sum += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n int num_ones = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n num_ones += input[r * N + c];\n }\n }\n }\n output[row * N + col] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1] == 1) count++;\n if (x < N - 1 && input[y * N + x + 1] == 1) count++;\n if (y > 0 && input[(y - 1) * N + x] == 1) count++;\n if (y < N - 1 && input[(y + 1) * N + x] == 1) count++;\n output[y * N + x] = (count == 1) ? 
1 : 0;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1]) count++;\n if (x < N - 1 && input[y * N + x + 1]) count++;\n if (y > 0 && input[(y - 1) * N + x]) count++;\n if (y < N - 1 && input[(y + 1) * N + x]) count++;\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1]) count++;\n if (x < N - 1 && input[y * N + x + 1]) count++;\n if (y > 0 && input[(y - 1) * N + x]) count++;\n if (y < N - 1 && input[(y + 1) * N + x]) count++;\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int count = 0;\n if (i > 0) count += input[i - 1 + j * N];\n if (i < N - 1) count += input[i + 1 + j * N];\n if (j > 0) count += input[i + (j - 1) * N];\n if (j < N - 1) count += input[i + (j + 1) * N];\n output[i + j * N] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t start = rank * N / size;\n size_t end = (rank + 1) * N / size;\n\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n // Check the neighbors of input_{i,j}.\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (count == 1);\n }\n }\n\n // Gather the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start = i * N / size;\n size_t end = (i + 1) * N / size;\n for (size_t j = start; j < end; ++j) {\n for (size_t k = 0; k < N; ++k) {\n output[j * N + k] += output[i * N * N + j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n int startRow = rank * nRows;\n int endRow = startRow + nRows;\n if (rank == size - 1) {\n endRow = N;\n }\n int nCols = N;\n int nNeighbors = 0;\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < nCols; j++) {\n nNeighbors = 0;\n if (i - 1 >= 0) {\n nNeighbors += input[(i - 1) * N + j];\n }\n if (i + 1 < N) {\n nNeighbors += input[(i + 1) * N + j];\n }\n if (j - 1 >= 0) {\n nNeighbors += input[i * N + j - 1];\n }\n if (j + 1 < N) {\n nNeighbors += input[i * N + j + 1];\n }\n if (nNeighbors == 1) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int nRowsRecv = N / size;\n int startRowRecv = i * nRowsRecv;\n int endRowRecv = startRowRecv + nRowsRecv;\n if (i == 
size - 1) {\n endRowRecv = N;\n }\n for (int j = startRowRecv; j < endRowRecv; j++) {\n MPI_Recv(&output[j * N], N * sizeof(int), MPI_INT, i, 0, MPI_STATUS_IGNORE);\n }\n }\n } else {\n MPI_Send(&output[startRow * N], (endRow - startRow) * N * sizeof(int), MPI_INT, 0, 0, MPI_STATUS_IGNORE);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nThreads = omp_get_max_threads();\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, startRow + nRowsPerRank);\n std::vector localOutput(N * nRowsPerRank);\n #pragma omp parallel num_threads(nThreads)\n {\n #pragma omp for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int numNeighbors = 0;\n if (i > 0) numNeighbors += input[i - 1][j];\n if (i < N - 1) numNeighbors += input[i + 1][j];\n if (j > 0) numNeighbors += input[i][j - 1];\n if (j < N - 1) numNeighbors += input[i][j + 1];\n localOutput[i * N + j] = (numNeighbors == 1);\n }\n }\n }\n MPI_Gather(&localOutput[0], N * nRowsPerRank, MPI_INT, &output[0], N * nRowsPerRank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + num_procs - 1) / num_procs;\n size_t start_row = my_rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of rows in parallel\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check neighbors\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row 
- start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its own part of the output\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += nprocs) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n std::vector recv(N * N);\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Each rank computes its subset of the output in parallel\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the XOR of its own rows\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += nprocs) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n MPI_Recv(&output[i * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different part of the output\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[(i - 1) * N + j];\n if (i < N - 1) num_neighbors += input[(i + 1) * N + j];\n if (j > 0) num_neighbors += input[i * N + j - 1];\n if (j < N - 1) num_neighbors += input[i * N + j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all other ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N], N * N, 
MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = (N + num_ranks - 1) / num_ranks;\n int start_row = my_rank * rows_per_rank;\n int end_row = std::min(N, (my_rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (my_rank == 0) {\n for (int i = 1; i < num_ranks; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = N / num_ranks;\n int start_row = my_rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j]) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j]) ++num_neighbors;\n if (j > 0 && input[i][j - 1]) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1]) ++num_neighbors;\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all other ranks.\n if (my_rank == 0) {\n for (int rank = 1; rank < num_ranks; ++rank) {\n int 
rows_per_rank = N / num_ranks;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row][0], rows_per_rank * N, MPI_INT, rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n // Other ranks send their results to rank 0.\n MPI_Send(&output[start_row][0], rows_per_rank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local part of the output.\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Each rank computes its subset of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows.\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of the rows in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all the other ranks.\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * 
rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n std::vector local_input(input.begin() + start_row * N, input.begin() + end_row * N);\n std::vector local_output(rows_per_rank * N, 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < rows_per_rank; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += local_input[(i - 1) * N + j];\n if (i < rows_per_rank - 1) count += local_input[(i + 1) * N + j];\n if (j > 0) count += local_input[i * N + j - 1];\n if (j < N - 1) count += local_input[i * N + j + 1];\n local_output[i * N + j] = (count == 1);\n }\n }\n\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows in output.\n size_t rows_per_rank = (N + num_procs - 1) / num_procs;\n size_t start_row = my_rank * rows_per_rank;\n size_t end_row = std::min(N, (my_rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of rows in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks into output on rank 0.\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // TODO: Implement cellsXOR\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows of output\n int rows_per_rank = N / num_procs;\n int start_row = my_rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes its subset of the rows of output in parallel\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. 
Otherwise set it to 0.\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) continue;\n num_neighbors += input(ii, jj);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output(i, j) = 0;\n else\n output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { count += input(ii, jj); }\n }\n }\n count -= input(i, j);\n if (input(i, j) == 1) {\n output(i, j) = (count == 2 || count == 3) ? 
1 : 0;\n } else {\n output(i, j) = (count == 3) ? 1 : 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int numNeighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n numNeighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (numNeighbors < 2 || numNeighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (numNeighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n count += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (count == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n 
}\n } else {\n if (count == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && !(ii == i && jj == j)) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (num_neighbors == 3) { output(i, j) = 1; } else { output(i, j) = 0; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) { num_neighbors += input(x, y); }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) output(i, j) = 0;\n else output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n else output(i, j) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (num_neighbors == 3) { output(i, j) 
= 1; } else { output(i, j) = 0; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n num_neighbors += input(x, y);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output(i, j) = 0;\n else\n output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x < 0 || x >= N || y < 0 || y >= N) continue;\n num_neighbors += input(x, y);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) { count += input(ii, jj); }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 
3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (count == 3) { output(i, j) = 1; } else { output(i, j) = 0; }\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n count += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n count += input[(i + ii) * N + j + jj];\n }\n }\n }\n count -= input[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (int)i + di;\n int jj = (int)j + dj;\n if (ii < 0 || ii >= (int)N || jj < 0 || jj >= (int)N) continue;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = num_neighbors == 2 || num_neighbors == 3;\n } else {\n output[i * N + j] = num_neighbors == 3;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } 
else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n neighbors += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = (i + dx + N) % N;\n int y = (j + dy + N) % N;\n num_neighbors += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "serial", + "prompt": "/* Simulate one generation of Game of Life on `input`. 
Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (size_t di = -1; di <= 1; di++) {\n for (size_t dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ni = i + di;\n int nj = j + dj;\n if (ni < 0 || ni >= N || nj < 0 || nj >= N) {\n continue;\n }\n num_neighbors += input[ni * N + nj];\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int x = -1; x <= 1; ++x) {\n for (int y = -1; y <= 1; ++y) {\n if (x == 0 && y == 0) continue;\n int nx = i + x;\n int ny = j + y;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[nx * N + ny];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if 
(ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = num_neighbors == 2 || num_neighbors == 3 ? 1 : 0;\n } else {\n output[i * N + j] = num_neighbors == 3 ? 1 : 0;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t x = i - 1; x <= i + 1; ++x) {\n for (size_t y = j - 1; y <= j + 1; ++y) {\n if (x >= 0 
&& x < N && y >= 0 && y < N && (x != i || y != j)) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n neighbors += input[(i + ii) * N + j + jj];\n }\n }\n }\n neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n size_t num_neighbors = 0;\n for (size_t x = i - 1; x <= i + 1; ++x) {\n for (size_t y = j - 1; y <= j + 1; ++y) {\n if (x >= 0 && x < N && y >= 0 && y < N && (x != i || y != j)) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int x = i + di;\n int y = j + dj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "hip", + "prompt": "/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\n__global__ void gameOfLife(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n num_neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && 
neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) return;\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n count -= input[y * N + x];\n if (input[y * N + x] == 1) {\n output[y * 
N + x] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[y * N + x] = (count == 3) ? 1 : 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0)\n continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output[y * N + x] = 0;\n else\n output[y * N + x] = 1;\n } else {\n if (num_neighbors == 3)\n output[y * N + x] = 1;\n else\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipThreadIdx().x;\n int y = hipThreadIdx().y;\n int neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = (x + i + N) % N;\n int neighbor_y = (y + j + N) % N;\n neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n if (input[y * N + x] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny 
< N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n if (x + i >= 0 && x + i < N && y + j >= 0 && y + j < N) {\n count += input[(x + i) * N + y + j];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (count < 2 || count > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (count == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. 
Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n_rows = N / size;\n int n_cols = N;\n int n_elements = n_rows * n_cols;\n std::vector local_input(n_elements);\n std::vector local_output(n_elements);\n MPI_Scatter(input.data(), n_elements, MPI_INT, local_input.data(), n_elements, MPI_INT, 0, MPI_COMM_WORLD);\n for (int i = 0; i < n_rows; i++) {\n for (int j = 0; j < n_cols; j++) {\n int n_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int ii_global = i + ii;\n int jj_global = j + jj;\n if (ii_global >= 0 && ii_global < n_rows && jj_global >= 0 && jj_global < n_cols) {\n n_neighbors += local_input[ii_global * n_cols + jj_global];\n }\n }\n }\n if (local_input[i * n_cols + j] == 1) {\n if (n_neighbors < 2 || n_neighbors > 3) {\n local_output[i * n_cols + j] = 0;\n } else {\n local_output[i * n_cols + j] = 1;\n }\n } else {\n if (n_neighbors == 3) {\n local_output[i * n_cols + j] = 1;\n } else {\n local_output[i * n_cols + j] = 0;\n }\n }\n }\n }\n MPI_Gather(local_output.data(), n_elements, MPI_INT, output.data(), n_elements, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.106104154, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.15309105, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Count the number of live neighbors for each cell.\n std::vector counts(N * N, 0);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n counts[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n }\n\n // Update the output based on the counts.\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = counts[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 
1 : 0;\n }\n }\n }\n\n // Gather the results on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recv_buf(N * N);\n MPI::COMM_WORLD.Recv(recv_buf.data(), N * N, MPI::INT, i, 0);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(output.data(), N * N, MPI::INT, 0, 0);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1161649162, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1545331292, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks.\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1099649796, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.200592722, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1070878932, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0565101756, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0393334344, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10291891, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1977296978, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1093876025, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0576671193, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04275921065, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0212283677, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010772855225, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0072344228375, + "num_procs": 64 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00449039508281, + "num_procs": 128 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00384514285469, + "num_procs": 256 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00394877109531, + "num_procs": 512 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Compute the next generation for the subset of rows assigned to this rank\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = static_cast(i) + di;\n int jj = static_cast(j) + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = N / size;\n\n // Compute the start and end rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = (rank + 1) * rows_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(N * rows_per_rank);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n\n // Check the neighbors of the current cell\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n\n // Check if the neighbor is within the grid\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), N * rows_per_rank, MPI_INT,\n output.data(), N * rows_per_rank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1093078048, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1490148076, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of rows\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes the next generation for its subset of rows\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N && input[ni * N + nj]) {\n num_neighbors++;\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j]) {\n if 
(num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.106947588, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.164556675, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0931645844, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05350356685, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.037029850025, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0199934664625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010712390375, + "num_procs": 32 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1055796858, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2063234958, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1109342186, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0561357192, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03029316495, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01872820115, + "num_procs": 16 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_input(input.begin() + rank * N * N, input.begin() + (rank + 1) * N * N);\n std::vector local_output(N * N, 0);\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += local_input[ii * N + jj];\n }\n }\n }\n if (local_input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * 
N + j] = 1;\n }\n } else {\n if (count == 3) {\n local_output[i * N + j] = 1;\n }\n }\n }\n }\n\n MPI_Gather(local_output.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1135140492, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2667780352, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output.\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the 8 neighbors of the cell.\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life.\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks to rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1185942422, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.206768528, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of the rows.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = static_cast(i) + di;\n int neighbor_j = static_cast(j) + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n 
MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1111375202, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2118800882, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1152888164, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05876827605, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03262889385, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01782733835, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01296802655, + "num_procs": 32 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n if (rank != 0) {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n MPI_Recv(&output[start_row * N], rows_per_rank * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1012937794, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1390154826, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.081648667, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04466171725, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.024387302525, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0150194186875, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0121525374, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0102421709, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = 
std::min((rank + 1) * rows_per_rank, N);\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * N);\n\n // Compute the local results\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n\n // Check the neighbors of the current cell\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n\n // Check if the neighbor is within the grid\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT, output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (size_t ii = std::max(0, (int)i - 1); ii <= std::min((int)N - 1, (int)i + 1); ii++) {\n for (size_t jj = std::max(0, (int)j - 1); jj <= std::min((int)N - 1, (int)j + 1); jj++) {\n if (input[ii * N + jj] == 1) {\n num_neighbors++;\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n // Compute the number of live neighbors for each cell.\n std::vector neighbor_counts(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t di = -1; di <= 1; di++) {\n for (size_t dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n neighbor_counts[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Update the output based on the neighbor counts.\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbor_counts[i * N 
+ j] == 2 || neighbor_counts[i * N + j] == 3);\n } else {\n output[i * N + j] = (neighbor_counts[i * N + j] == 3);\n }\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbors == 2 || neighbors == 3);\n } else {\n output[i * N + j] = (neighbors == 3);\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.1097092736, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1737437794, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n std::vector next_gen(N * N);\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j]) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of the output\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = N / size;\n\n // Compute the starting and ending rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = (rank + 1) * rows_per_rank;\n\n // Compute the number of columns each rank is responsible for\n int cols_per_rank = N / size;\n\n // Compute the starting and ending columns for this rank\n int start_col = rank * cols_per_rank;\n int end_col = (rank + 1) * cols_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * N, 0);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = start_col; j < end_col; j++) {\n int num_neighbors = 0;\n // Check the neighbors of the current cell\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n\n // Apply the rules of the game of life\n if (input[i][j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT, output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, 
+ "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "cuda", + "prompt": "/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\n__global__ void gameOfLife(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n 
num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if 
(num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = threadIdx.x;\n int y = threadIdx.y;\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = (x + i + N) % N;\n int neighbor_y = (y + j + N) % N;\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "_", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n num_neighbors -= input[x * N + y];\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = 
blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) {\n return;\n }\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": 
"54_stencil_game_of_life", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the rank of the current process.\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the number of rows that each process will handle.\n int rows_per_process = N / MPI_Comm_size(MPI_COMM_WORLD);\n\n // Compute the starting and ending rows for the current process.\n int start_row = rank * rows_per_process;\n int end_row = start_row + rows_per_process;\n\n // Compute the number of columns in the input grid.\n int num_cols = input.size() / N;\n\n // Create a buffer to store the results for the current process.\n std::vector buffer(N * num_cols);\n\n // Iterate over the rows assigned to the current process.\n for (int i = start_row; i < end_row; ++i) {\n // Iterate over the columns in the input grid.\n for (int j = 0; j < num_cols; ++j) {\n // Compute the number of live neighbors for the current cell.\n int num_live_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n // Skip the current cell.\n if (ii == i && jj == j) {\n continue;\n }\n\n // Check if the neighbor cell is alive.\n if (input[ii * num_cols + jj] == 1) {\n ++num_live_neighbors;\n }\n }\n }\n\n // Apply the rules of the Game of Life to the current cell.\n if (input[i * num_cols + j] == 1) {\n if (num_live_neighbors < 2 || num_live_neighbors > 3) {\n buffer[i * num_cols + j] = 0;\n } else {\n buffer[i * num_cols + j] = 1;\n }\n } else {\n if (num_live_neighbors == 3) {\n buffer[i * num_cols + j] = 1;\n } else {\n buffer[i * num_cols + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all processes.\n MPI_Gather(buffer.data(), N * num_cols, MPI_INT, output.data(), N * num_cols, MPI_INT, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = (N + size - 1) / size;\n int start = rank * local_size;\n int end = std::min(start + local_size, N);\n\n std::vector local_input(input.begin() + start * N, input.begin() + end * N);\n std::vector local_output(local_input.size());\n\n #pragma omp parallel for\n for (int i = 0; i < local_input.size(); i++) {\n int x = i / N;\n int y = i % N;\n int count = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int nx = x + dx;\n int ny = y + dy;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += local_input[nx * N + ny];\n }\n }\n }\n if 
(local_input[i] == 1) {\n if (count < 2 || count > 3) {\n local_output[i] = 0;\n } else {\n local_output[i] = 1;\n }\n } else {\n if (count == 3) {\n local_output[i] = 1;\n } else {\n local_output[i] = 0;\n }\n }\n }\n\n MPI_Gather(local_output.data(), local_output.size(), MPI_INT, output.data(), local_output.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of the rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n std::vector next_gen(N * N, 0);\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Compute the next generation for the subset of rows\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n // Count the number of live neighbors\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N && input[x * N + y]) count++;\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j]) {\n if (count < 2 || count > 3) output[i * N + j] = 0;\n else output[i * N + j] = 1;\n } else {\n if (count == 3) output[i * N + j] = 1;\n else output[i * N + j] = 0;\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a chunk of the output\n size_t chunk_size = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min((rank + 1) * chunk_size, N);\n\n // Compute the chunk of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t ii = std::max(0, (int)i - 1); ii <= std::min(N - 1, (int)i + 1); ++ii) {\n for (size_t jj = std::max(0, (int)j - 1); jj <= std::min(N - 1, (int)j + 1); ++jj) {\n num_neighbors += input[ii * N + jj];\n }\n }\n num_neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min((i + 1) * chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of live neighbors for each cell.\n std::vector neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Update the state of each cell.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (input[i * N + j] == 1) {\n if (neighbors[i * N + j] < 2 || neighbors[i * N + j] > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if 
(neighbors[i * N + j] == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = N / nprocs;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Compute the next generation for the subset of rows\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize output to 0\n output.assign(N * N, 0);\n\n // Get rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Divide the grid among the processes\n int rows_per_proc = N / nprocs;\n int start_row = rank * rows_per_proc;\n int end_row = start_row + rows_per_proc;\n\n // Compute the next generation for each row in parallel\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n // Count the number of live neighbors\n int num_neighbors = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n if (k == 0 && l == 0) continue;\n int neighbor_row = i + k;\n int neighbor_col = j + l;\n if (neighbor_row >= 0 && 
neighbor_row < N && neighbor_col >= 0 && neighbor_col < N) {\n num_neighbors += input[neighbor_row * N + neighbor_col];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n }\n }\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[start_row * N], rows_per_proc * N, MPI_INT,\n &output[0], rows_per_proc * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n #pragma omp parallel for\n for (int i = 0; i < N; ++i) {\n for (int j = 0; j < N; ++j) {\n int count = 0;\n // Count the number of live neighbors\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input[x * N + y];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its subgrid's neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbors == 2 || neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (neighbors == 3) ? 
1 : 0;\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n\n std::vector localInput(nRows * N);\n std::vector localOutput(nRows * N);\n\n MPI_Scatter(input.data(), nRows * N, MPI_INT, localInput.data(), nRows * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < nRows; i++) {\n for (int j = 0; j < N; j++) {\n int count = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int ni = i + ii;\n int nj = j + jj;\n if (ni >= 0 && ni < nRows && nj >= 0 && nj < N) {\n count += localInput[ni * N + nj];\n }\n }\n }\n if (localInput[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n localOutput[i * N + j] = 0;\n } else {\n localOutput[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n localOutput[i * N + j] = 1;\n } else {\n localOutput[i * N + j] = 0;\n }\n }\n }\n }\n\n MPI_Gather(localOutput.data(), nRows * N, MPI_INT, output.data(), nRows * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes the number of live neighbors for each cell in its subset of rows\n std::vector neighbor_counts(N * N, 0);\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n // Count the number of live neighbors for cell (i, j)\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n count += input[ii * N + jj];\n }\n }\n }\n neighbor_counts[i * N + j] = count;\n }\n }\n\n // Each rank updates its subset of rows of the output based on the neighbor counts\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int count = neighbor_counts[i * N + j];\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks into the output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows assigned to this rank\n std::vector next_gen(N * N);\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = static_cast(i) + di;\n int 
neighbor_j = static_cast(j) + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks into the output on rank 0\n MPI_Gather(next_gen.data(), N * rows_per_rank, MPI_INT,\n output.data(), N * rows_per_rank, MPI_INT,\n 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its portion of the grid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Count the number of live neighbors for cell (i, j).\n int neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n neighbors += input[ii * N + jj];\n }\n }\n }\n local_neighbors[i * N + j] = neighbors;\n }\n }\n\n // Each rank sends its portion of the neighbor counts to rank 0.\n std::vector global_neighbors(N * N);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell based on the number of live neighbors.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n", + "_t", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a chunk of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Compute the next generation for the assigned rows\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank != 0) {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row 
= std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the next generation for the subset of rows\n std::vector next_gen(N * N, 0);\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n next_gen[i * N + j] = 0;\n } else {\n next_gen[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n next_gen[i * N + j] = 1;\n } else {\n next_gen[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its local grid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Reduce(local_neighbors.data(), global_neighbors.data(), N * N, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(Kokkos::View &input, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n KokConfigService::instance().set_num_teams(1);\n KokConfigService::instance().set_num_nodes(1);\n KokConfigService::instance().set_num_ranks_per_node(1);\n KokConfigService::instance().set_num_devices_per_rank(1);\n KokConfigService::instance().set_device_type(\"CPU\");\n\n Kok\u30b3\u30af::TeamPolicy policy(Kok\u30b3\u30af::TeamThreadLayout::RowMajor);\n Kok\u30b3\u30af::Team team(policy);\n\n team.execute_task([&](Kok\u30b3\u30af::TeamMember rank) {\n const int num_elements = input.extent(0);\n for (int i = 1; i < num_elements - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n });\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n using Kokkos::TeamPolicy;\n using Kokkos::WorkSpace;\n\n const int width = input.extent(0);\n const int height = input.extent(1);\n\n Kokkos::TeamPolicy policy(Kokkos::RangePolicy(0, width * height));\n Kokkos::WorkSpace ws;\n\n parallel_for(policy, KOKKOS_LAMBDA(const int i) {\n const int x = i % width;\n const int y = i / width;\n\n if (x == 0 || x == width - 1 || y == 0 || y == height - 1) {\n output(x, y) = input(x, y);\n } else {\n output(x, y) = (input(x - 1, y) + input(x + 1, y) + input(x, y - 1) + input(x, y + 1)) / 4.0;\n }\n }, ws);\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kok c\u00f3c::parallel_for(\"jacobi1D\", 0, output.extent(0), KOK c\u00f3c::RangePolicy>(KOK c\u00f3c::get_num_threads()),\n KOK c\u00f3c::KOK c\u00f3c_LAMBDA(int64_t i) {\n if (i == 0) {\n output[i] = input[i] / 3;\n } else if (i == output.extent(0) - 1) {\n output[i] = input[i] / 3;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n const auto N = input.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == N - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy<>(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 
2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n const auto N = input.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = 0;\n } else if (i == N - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", input.size(), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n const auto exec_space = Kokkos::DefaultExecutionSpace();\n const auto exec_policy = Kokkos::RangePolicy(0, input.extent(0));\n Kokkos::parallel_for(exec_policy, KOKKOS_LAMBDA(const int &i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(input.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n double left = i > 0 ? input(i - 1) : 0;\n double center = input(i);\n double right = i < input.extent(0) - 1 ? input(i + 1) : 0;\n output(i) = (left + center + right) / 3;\n });\n}\n", + "Kokkoss vector space.", + "\n\n Kokkos::parallel_for(input.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n const int num_elements = input.extent(0);\n Kokkos::parallel_for(num_elements, KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == num_elements - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n });\n}\n", + "\n\n const int num_elements = input.extent(0);\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::Range(0, num_elements), [=](int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == num_elements - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", 
Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < static_cast(input.size()) - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n output[0] = (input[0] + input[1]) / 2;\n output[input.size() - 1] = (input[input.size() - 2] + input[input.size() - 1]) / 2;\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp 
parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < static_cast(input.size()) - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "serial", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 0; i < input.size(); i++) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i + 1 < input.size() ? input[i + 1] : 0);\n output[i] = output[i] / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n output[0] = (input[0] + input[1]) / 2;\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n output[input.size() - 1] = (input[input.size() - 2] + input[input.size() - 1]) / 2;\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < input.size() - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? 
input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = (i > 0) ? input[i - 1] : 0;\n double right = (i < input.size() - 1) ? input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "hip", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\n__global__ void jacobi1D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n output[idx] = (idx > 0 ? input[idx - 1] : 0) + input[idx] + (idx < N - 1 ? input[idx + 1] : 0);\n output[idx] /= 3;\n }\n}\n", + "\n int i = hipThreadIdx_x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int tid = hipGetGlobalThreadIdx().x;\n if (tid < N) {\n output[tid] = (input[tid - 1] + input[tid] + input[tid + 1]) / 3;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n\n if (i < N) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < N - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] +\n (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3.0;\n }\n}\n", + "\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = (tid > 0 ? input[tid - 1] : 0) + input[tid] +\n (tid < N - 1 ? input[tid + 1] : 0);\n output[tid] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? 
input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\nfor (size_t i = 1; i < N - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_output(local_n);\n\n for (int i = start; i < end; i++) {\n if (i == 0) {\n local_output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == n - 1) {\n local_output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n local_output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.02401819, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0544823156, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = 
(local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0429129656, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.4343321002, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0239790648, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.061718405, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0262698524, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0750950896, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * local_size], local_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], local_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + 
"are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0239239908, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0239073336, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.018190894, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02563433555, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01240458665, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0125909359625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01580973461875, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01634916360313, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0259558716, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0702897812, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0246827056, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.061411985, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0253481986, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.086181041, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n output.resize(n);\n\n MPI_Datatype vec_type;\n MPI_Type_vector(n, 1, 1, MPI_DOUBLE, &vec_type);\n MPI_Type_commit(&vec_type);\n\n MPI_Allgather(&input[0], 1, vec_type, &output[0], 1, vec_type, MPI_COMM_WORLD);\n\n for (int i = 1; i < n - 1; i++) {\n output[i] = (output[i - 1] + output[i] + output[i + 1]) / 3;\n }\n\n MPI_Type_free(&vec_type);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int remainder = input.size() % size;\n\n std::vector local_input(local_size + (rank < remainder ? 1 : 0));\n std::vector local_output(local_size);\n\n MPI_Scatter(input.data(), local_size + (rank < remainder ? 1 : 0), MPI_DOUBLE, local_input.data(), local_size + (rank < remainder ? 1 : 0), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n for (int i = 0; i < local_size; i++) {\n local_output[i] = (local_input[std::max(0, i - 1)] + local_input[std::min(i + 1, local_size - 1)] + local_input[i]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0269736294, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1061680224, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * chunk], chunk, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0240845046, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0240798838, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.0181991374, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02435830455, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0109787922, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01380220795, + "num_procs": 16 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n for (int i = start; i < end; i++) {\n double left = (i > 0) ? input[i - 1] : 0;\n double right = (i < n - 1) ? input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * local_n;\n int end = start + local_n;\n MPI_Recv(&output[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], local_n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0251034192, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.025200051, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0184288549, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0283650903, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0186086118, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0139131408, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0138870059375, + "num_procs": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.017463432675, + "num_procs": 64 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 128 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = (n + size - 1) / size;\n int start = rank * chunk_size;\n int end = std::min(start + chunk_size, n);\n\n for (int i = start; i < end; i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double right = i < n - 1 ? 
input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * chunk_size;\n int end = std::min(start + chunk_size, n);\n MPI_Recv(&output[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0240171666, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0247582782, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0189031818, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.016684566, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01913276275, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0096055051625, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0144461978375, + "num_procs": 32 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0262983322, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.064612669, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_size = n / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0264694702, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0618510842, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * 
local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0240237792, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0618573218, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n output.resize(n);\n\n // Compute the local part of the output\n for (int i = 1; i < n - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n\n // Send the local part of the output to rank 0\n MPI_Gather(&output[1], n - 2, MPI_DOUBLE, &output[1], n - 2, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Replace the local part of the output with 0 on all ranks except rank 0\n if (rank != 0) {\n for (int i = 1; i < n - 1; i++) {\n output[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0248702046, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0152364022, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; ++i) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0637809942, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.4355353356, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n local_output[0] = (local_input[0] + local_input[1]) / 2;\n local_output[local_n - 1] = (local_input[local_n - 2] + local_input[local_n - 1]) / 2;\n\n MPI_Gather(local_output.data(), 
local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0240036774, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.061911604, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0239477144, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0617142146, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "cuda", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\n__global__ void jacobi1D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n } else {\n output[i] = 0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? 
input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = (tid > 0 ? input[tid - 1] : 0) + input[tid] + (tid < N - 1 ? input[tid + 1] : 0);\n output[tid] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n #pragma omp parallel for\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n std::vector local_output(local_n);\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (input[start + i - 1] + input[start + i] + input[start + i + 1]) / 3.0;\n }\n MPI_Gather(&local_output[0], local_n, MPI_DOUBLE, &output[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n\n std::vector local_output(chunk_size);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n local_output[i] = (input[i] + input[i + 1]) / 3;\n } else if (i == n - 1) {\n local_output[i] = (input[i - 1] + input[i]) / 3;\n } else {\n local_output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(local_output.data(), chunk_size, MPI_DOUBLE, output.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n\n #pragma parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == input.size() - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), chunk_size, MPI_DOUBLE, output.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per rank\n int elements_per_rank = input.size() / size;\n\n // Compute the start and end indices for this rank\n int start_index = rank * elements_per_rank;\n int end_index = 
(rank + 1) * elements_per_rank;\n\n // Compute the local output\n std::vector local_output(elements_per_rank);\n #pragma omp parallel for\n for (int i = 0; i < elements_per_rank; i++) {\n int global_index = start_index + i;\n if (global_index == 0 || global_index == input.size() - 1) {\n local_output[i] = 0;\n } else {\n local_output[i] = (input[global_index - 1] + input[global_index] + input[global_index + 1]) / 3;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(input.size());\n for (int i = 0; i < size; i++) {\n MPI_Recv(&output[i * elements_per_rank], elements_per_rank, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], elements_per_rank, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n std::vector local_output(local_n);\n\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (input[start + i - 1] + input[start + i] + input[start + i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n // Compute the local part of the output\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n // Compute the local part of the output\n #pragma omp parallel for\n for (int i = start; i < end; ++i) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n 
MPI_Gather(output.data() + start, chunk, MPI_DOUBLE, output.data(), chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = (n + size - 1) / size;\n int start = rank * local_n;\n int end = std::min(start + local_n, n);\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "kokkos", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images.\n Store the output of the computation in imageOut.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(Kokkos::View &imageIn, Kokkos::View &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(N, N), [=](int i, int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn(ii, jj) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::min(255, std::max(0, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_clamped = std::clamp(i + ii, 0, N - 1);\n int jj_clamped = std::clamp(j + jj, 0, N - 1);\n sum += imageIn(ii_clamped, jj_clamped) * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut(i, j) = std::clamp(sum, -128, 127);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_in = i + ii;\n int jj_in = j + jj;\n if (ii_in >= 0 && ii_in < N && jj_in >= 0 && jj_in < N) {\n sum += imageIn(ii_in, jj_in) * edgeKernel[ii + 1][jj + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n int val = 0;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n val = imageIn(ii, jj);\n }\n sum += edgeKernel[ki + 1][kj + 1] * val;\n }\n }\n imageOut(i, j) = std::max(0, std::min(sum, 255));\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::min(255, std::max(0, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_clamped = std::clamp(i + ii, 0, N - 1);\n int jj_clamped = std::clamp(j + jj, 0, N - 1);\n sum += imageIn(ii_clamped, jj_clamped) * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut(i, j) = std::clamp(sum, -128, 127);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n });\n}\n", + "\n\n Kok\uf09e::parallel_for(Kok\uf09e::Range(0, N), [&](const Kok\uf09e::Range &r) {\n for (size_t i = r.begin(); i < r.end(); ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n size_t ii = i + k - 1;\n size_t jj = j + l - 1;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn(ii, jj) * edge[k][l];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(0, N, [&](size_t i) {\n Kok\u30b3\u30af::parallel_for(0, N, [&](size_t j) {\n int sum = 0;\n for (int k = 0; k < 3; k++) {\n for (int l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edge[k][l];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "omp", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. 
Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "_", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n if (i + k < 0 || i + k >= N || j + l < 0 || j + l >= N) {\n sum += 0;\n } else {\n sum += imageIn[(i + k) * N + (j + l)] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x >= 0 && x < static_cast(N) && y >= 0 && y < static_cast(N)) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x >= 0 && x < static_cast(N) && y >= 0 && y < static_cast(N)) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += imageIn[(i + k) * N + (j + l)] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma opm parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + (j + l)] * 
edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "serial", + "prompt": "const int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (int i = 1; i < N - 1; 
i++) {\n for (int j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n sum += imageIn[(i + ii) * N + (j + jj)] * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = (int)i + k - 1;\n int y = (int)j + l - 1;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t x = 0; x < 3; x++) {\n for (size_t y = 0; y < 3; y++) {\n int ii = i + x - 1;\n int jj = j + y - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[x][y];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t y = 0; y < N; ++y) {\n for (size_t x = 0; x < N; ++x) {\n int sum = 0;\n for (int ky = -1; ky <= 1; ++ky) {\n for (int kx = -1; kx <= 1; ++kx) {\n int pixelX = x + kx;\n int pixelY = y + ky;\n if (pixelX >= 0 && pixelX < N && pixelY >= 0 && pixelY < N) {\n sum += imageIn[pixelY * N + pixelX] * edgeKernel[ky + 1][kx + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = (int)i + k - 1;\n int y = (int)j + l - 1;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t y = 0; y < N; ++y) {\n for (size_t x 
= 0; x < N; ++x) {\n int sum = 0;\n for (int ky = -1; ky <= 1; ++ky) {\n for (int kx = -1; kx <= 1; ++kx) {\n int inX = x + kx;\n int inY = y + ky;\n if (inX >= 0 && inX < N && inY >= 0 && inY < N) {\n sum += imageIn[inY * N + inX] * edgeKernel[ky + 1][kx + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii < 0 || ii >= static_cast(N) || jj < 0 || jj >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "hip", + "prompt": "__constant__ int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. 
Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\n__global__ void convolveKernel(const int *imageIn, int *imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\nint sum = 0;\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = blockIdx.x + i;\n int y = blockIdx.y + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n sum = max(0, min(255, sum));\n imageOut[blockIdx.y * N + blockIdx.x] = sum;\n}\n", + "\nint sum = 0;\nint index = hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x;\nint x = index % N;\nint y = index / N;\nfor (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (x + i >= 0 && x + i < N && y + j >= 0 && y + j < N) {\n sum += imageIn[(y + j) * N + x + i] * edgeKernel[i + 1][j + 1];\n }\n }\n}\nimageOut[index] = __clamp(sum, 0, 255);\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 
1; j++) {\n int x2 = x + i;\n int y2 = y + j;\n if (x2 >= 0 && x2 < N && y2 >= 0 && y2 < N) {\n sum += imageIn[y2 * N + x2] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = min(255, max(0, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX + neighborY * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = std::max(0, std::min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ < 0 || x_ >= N || y_ < 0 || y_ >= N)\n sum += 0;\n else\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < 
N && y < N) {\n int dotProduct = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n dotProduct += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = dotProduct > 255 ? 255 : (dotProduct < 0 ? 0 : dotProduct);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = 0; i < 3; i++) {\n for (int j = 0; j < 3; j++) {\n int x_offset = x + i - 1;\n int y_offset = y + j - 1;\n if (x_offset >= 0 && x_offset < N && y_offset >= 0 && y_offset < N) {\n sum += imageIn[y_offset * N + x_offset] * edge[i][j];\n }\n }\n }\n imageOut[y * N + x] = __clamp(sum, 0, 255);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_index = x + i;\n int y_index = y + j;\n if (x_index >= 0 && x_index < N && y_index >= 0 && y_index < N) {\n sum += imageIn[y_index * N + x_index] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[x_idx + y_idx * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = max(0, min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX + neighborY * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = hipMin(hipMax(sum, 0), 255);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "mpi", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of imageIn. 
The result is stored in imageOut on rank 0.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image.\n size_t chunkSize = N / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = (rank + 1) * chunkSize;\n\n // Compute the convolution for the assigned rows.\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = (i + 1) * chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.2391381632, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2288183562, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1246687594, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06429650865, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n size_t chunkSize = N / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = (rank + 1) * chunkSize;\n\n // Compute the convolution for the assigned rows\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = (i + 1) * chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.237729086, + "runs": [ + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.2589156916, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1266799058, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06415257425, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows and columns in the image\n size_t numRows = imageIn.size() / N;\n size_t numCols = N;\n\n // Compute the number of rows and columns in the output image\n size_t outNumRows = numRows - 2;\n size_t outNumCols = numCols - 2;\n\n // Compute the number of rows and columns in the output image for each rank\n size_t outNumRowsPerRank = outNumRows / size;\n size_t outNumColsPerRank = outNumCols;\n\n // Compute the starting row and column for each rank\n size_t startRow = rank * outNumRowsPerRank;\n size_t startCol = 0;\n\n // Compute the ending row and column for each rank\n size_t endRow = startRow + outNumRowsPerRank;\n size_t endCol = startCol + outNumColsPerRank;\n\n // Allocate memory for the output image\n std::vector outImage(outNumRowsPerRank * outNumColsPerRank);\n\n // Compute the convolution for each pixel in the output image\n for (size_t i = startRow; i < endRow; i++) {\n for (size_t j = startCol; j < endCol; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int row = i + k - 1;\n int col = j + l - 1;\n if (row >= 0 && row < numRows && col >= 0 && col < numCols) {\n sum += imageIn[row * numCols + col] * edgeKernel[k][l];\n }\n }\n }\n outImage[i * outNumColsPerRank + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the output images from all ranks\n MPI_Gather(outImage.data(), outNumRowsPerRank * outNumColsPerRank, MPI_INT,\n imageOut.data(), outNumRowsPerRank * outNumColsPerRank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": null, + "runtime": null, + "num_procs": 1 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the output image.\n std::vector localImageOut(N * N / size);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the local results into the output image on rank 0.\n if (rank == 0) {\n imageOut.resize(N * N);\n MPI_Gather(localImageOut.data(), N * N / size, MPI_INT, imageOut.data(), N * N / size, MPI_INT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(localImageOut.data(), N * N / size, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + 
"are_all_valid": false, + "best_sequential_runtime": 0.250008713, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2523579318, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localN = N / size;\n int localStart = rank * localN;\n int localEnd = localStart + localN;\n\n std::vector localImageOut(localN * N);\n\n for (int i = localStart; i < localEnd; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), localN * N, MPI_INT, imageOut.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.2334264484, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1830430374, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows in imageOut\n size_t rowsPerRank = N / size;\n size_t startRow = rank * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks and store them in imageOut on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n for (size_t j = startRow; j < endRow; ++j) {\n MPI_Recv(&imageOut[j * N], N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n } else {\n MPI_Send(&imageOut[startRow * N], N * rowsPerRank, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.251499149, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2469419, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image\n int rowsPerRank = (N + size - 1) / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min(startRow + rowsPerRank, N);\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int 
k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = std::min(startRow + rowsPerRank, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the assigned rows\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.2346938098, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1667621804, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0928797605, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0479211125, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "{", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n int chunkSize = N / size;\n int startRow = rank * chunkSize;\n int endRow = startRow + chunkSize;\n\n // Pad the input image with zeros to handle edge cases\n std::vector paddedImage(N * N);\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n paddedImage[i * N + j] = imageIn[i * N + j];\n }\n }\n\n // Compute the convolution for the assigned rows\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n 
int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int neighborRow = i + k;\n int neighborCol = j + l;\n if (neighborRow >= 0 && neighborRow < N && neighborCol >= 0 && neighborCol < N) {\n sum += paddedImage[neighborRow * N + neighborCol] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * chunkSize;\n int endRow = startRow + chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.2362442732, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1854198336, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0969399735, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.11601647835, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel's neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = i + k;\n int neighborY = j + l;\n // Handle pixels outside the image boundaries\n if (neighborX < 0 || neighborX >= N || neighborY < 0 || neighborY >= N) {\n sum += 0;\n } else {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.2248481702, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0721618382, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0363042985, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02362893145, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of 
the rows of imageOut\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks into imageOut on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.2427045198, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1881943896, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0934823644, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04933746005, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.028436259775, + "num_procs": 8 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 16 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localN = N / size;\n int localStart = rank * localN;\n int localEnd = localStart + localN;\n\n std::vector localImageIn(localN * N);\n std::vector localImageOut(localN * N);\n\n MPI_Scatter(imageIn.data(), N * N, MPI_INT, localImageIn.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n for (int i = localStart; i < localEnd; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += localImageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.2542410714, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2190345204, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localSize = N / size;\n int remainder = N % size;\n int startIndex = rank * localSize;\n int endIndex = startIndex + localSize;\n if (rank == size - 1) {\n endIndex += remainder;\n }\n\n std::vector localImageIn(localSize * N);\n std::vector localImageOut(localSize * N);\n\n MPI_Scatter(imageIn.data(), N * N, MPI_INT, localImageIn.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n for (int i 
= startIndex; i < endIndex; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += localImageIn[(x + y * N)];\n }\n }\n }\n localImageOut[i * N + j] = std::min(255, std::max(0, sum * edgeKernel[k + 1][l + 1]));\n }\n }\n\n MPI_Gather(localImageOut.data(), localSize * N, MPI_INT, imageOut.data(), localSize * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of pixels per process\n int pixelsPerProcess = N * N / size;\n\n // Compute the starting and ending indices for this process\n int startIndex = rank * pixelsPerProcess;\n int stopIndex = (rank + 1) * pixelsPerProcess;\n\n // Allocate memory for the local image\n std::vector localImage(pixelsPerProcess);\n\n // Scatter the image data to each process\n MPI_Scatter(&imageIn[0], pixelsPerProcess, MPI_INT, &localImage[0], pixelsPerProcess, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Convolve the local image with the edge kernel\n for (int i = 1; i < N - 1; i++) {\n for (int j = 1; j < N - 1; j++) {\n int dotProduct = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n dotProduct += localImage[i + k][j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::clip(dotProduct, -128, 127);\n }\n }\n\n // Gather the results from each process\n MPI_Gather(&imageOut[startIndex], pixelsPerProcess, MPI_INT, &imageOut[0], pixelsPerProcess, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n size_t chunkSize = (N + size - 1) / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n\n // Compute the convolution for the subset of the image\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + 
"best_sequential_runtime": 0.2313711366, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2524050194, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.131772708, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06977745405, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.037620715925, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0223178831875, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0144770569, + "num_procs": 32 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n int localN = N / size;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local portion of the image\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.2359792174, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1660573276, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0930656573, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0518611397, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n size_t localN = (N + size - 1) / size;\n size_t startRow = rank * localN;\n size_t endRow = std::min(startRow + localN, N);\n\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * localN;\n size_t endRow = std::min(startRow + localN, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": 
true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.231195132, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2645515872, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1332575596, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.066598215, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03833815195, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0233835800875, + "num_procs": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01357306035625, + "num_procs": 32 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 64 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n // Compute the dot product of the pixel and its neighbors with the edge kernel\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.243968407, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2434320168, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Calculate the number of rows per process\n int rowsPerProcess = N / size;\n\n // Calculate the starting row for this process\n int startRow = rank * rowsPerProcess;\n\n // Calculate the ending row for this process\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Loop over the rows assigned to this process\n for (int i = startRow; i < endRow; i++) {\n // Loop over the columns in the row\n for (int j = 0; j < N; j++) {\n // Calculate the dot product of the pixel and its neighbors with the edge kernel\n int dotProduct = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n // Check if the pixel is within the image boundaries\n if (i + k >= 0 && i + k < N && j + l >= 0 && j + l < N) {\n // Calculate the dot product\n dotProduct += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::max(0, std::min(255, dotProduct));\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(imageOut.data() + startRow * N, rowsPerProcess * N, MPI_INT,\n imageOut.data(), rowsPerProcess * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", 
+ "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.2475763664, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1392548366, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0798406711, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04522172555, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02729003915, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01691587065, + "num_procs": 16 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "cuda", + "prompt": "__constant__ int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\n__global__ void convolveKernel(const int *imageIn, int *imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x;\n int idy = threadIdx.y + blockIdx.y * blockDim.y;\n if (idx < N && idy < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = idx + i;\n int y = idy + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x + y * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[idx + idy * N] = clamp(sum, 0, 255);\n }\n}\n", + "\nint sum = 0;\n for (int i = 0; i < 3; i++) {\n for (int j = 0; j < 3; j++) {\n int x = blockIdx.x * blockDim.x + i - 1;\n int y = blockIdx.y * blockDim.y + j - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i][j];\n }\n }\n }\n imageOut[blockIdx.y * N + blockIdx.x] = max(0, min(255, sum));\n}\n", + "\nint x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = 
blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = sum > 255 ? 255 : (sum < 0 ? 0 : sum);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) {\n return;\n }\n\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_offset = x + i;\n int y_offset = y + j;\n if (x_offset < 0 || x_offset >= N || y_offset < 0 || y_offset >= N) {\n sum += 0;\n } else {\n sum += imageIn[y_offset * N + x_offset] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x2 = x + i;\n int y2 = y + j;\n if (x2 < 0 || x2 >= N || y2 < 0 || y2 >= N) {\n sum += 0;\n } else {\n sum += edgeKernel[i + 1][j + 1] * imageIn[y2 * N + x2];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\n int x = blockIdx.x * blockDim.x + 
threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_offset = x + i;\n int y_offset = y + j;\n if (x_offset >= 0 && x_offset < N && y_offset >= 0 && y_offset < N) {\n sum += imageIn[y_offset * N + x_offset] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_coord = x + i;\n int y_coord = y + j;\n if (x_coord >= 0 && x_coord < N && y_coord >= 0 && y_coord < N) {\n sum += imageIn[y_coord * N + x_coord] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x + (threadIdx.y + blockIdx.y * blockDim.y) * N;\n if (idx < N * N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = threadIdx.x + i;\n int y = threadIdx.y + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[idx] = max(0, min(255, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": 
"51_stencil_edge_kernel", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of imageIn. The result is stored in imageOut on rank 0.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n int localN = N / size;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local subset of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, (rank + 1) * nRowsPerRank);\n\n std::vector localImageOut(nRowsPerRank * N);\n\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = 0; k < 3; k++) {\n for (int l = 0; l < 3; l++) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), nRowsPerRank * N, MPI_INT,\n imageOut.data(), nRowsPerRank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize imageOut to 0\n imageOut.assign(N * N, 0);\n\n // Each rank computes its own part of the output\n #pragma opm parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel and its neighbors\n for (int ki = 0; ki < 3; ki++) {\n for (int kj = 0; kj < 3; kj++) {\n int ii = i + ki - 1;\n int jj = j + kj - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] 
* edgeKernel[ki][kj];\n }\n }\n }\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(imageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n size_t chunkSize = (N + size - 1) / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n\n // Compute the convolution for the assigned rows\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < static_cast(N) && jj >= 0 && jj < static_cast(N)) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image\n size_t rowsPerRank = (N + size - 1) / size;\n size_t startRow = rank * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n // Each rank computes its subset of the output image in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < static_cast(N) && jj >= 0 && jj < static_cast(N)) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a portion of the output image\n int localN = (N + nprocs - 1) / nprocs;\n int startRow = rank * localN;\n int endRow = std::min(startRow + localN, N);\n\n // Compute the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n 
// Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n int startRow = i * localN;\n int endRow = std::min(startRow + localN, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the assigned rows\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel's neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = i + k;\n int neighborY = j + l;\n // Check if the neighbor is within the image boundaries\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a portion of the output image\n int localN = N / nprocs;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n 
MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the image\n int rowsPerRank = (N + size - 1) / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min((rank + 1) * rowsPerRank, N);\n\n // Allocate memory for the rank's portion of the output image\n std::vector rankImageOut(rowsPerRank * N);\n\n // Compute the convolution for the rank's subset of rows\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = 0; k < 3; ++k) {\n for (int l = 0; l < 3; ++l) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n rankImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(rankImageOut.data(), rowsPerRank * N, MPI_INT,\n imageOut.data(), rowsPerRank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the dot product of the kernel and the image for each pixel\n #pragma opm parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int dotProduct = 0;\n for (int ki = 0; ki < 3; ki++) {\n for (int kj = 0; kj < 3; kj++) {\n dotProduct += edgeKernel[ki][kj] * imageIn[(i + ki - 1) * N + (j + kj - 1)];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, dotProduct));\n }\n }\n\n // Set the boundaries to 0\n for (size_t i = 0; i < N; i++) {\n imageOut[i] = 0;\n imageOut[i * N] = 0;\n imageOut[i * N + N - 1] = 0;\n }\n for (size_t j = 0; j < N; j++) {\n imageOut[j * N + N - 1] = 0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows\n int rowsPerRank = N / nprocs;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Pad the image with zeros\n std::vector paddedImage(N + 2 * rowsPerRank, 0);\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n paddedImage[i + rowsPerRank + 1][j + rowsPerRank + 1] = imageIn[i * N + j];\n }\n }\n\n // Convolve the kernel with the padded image\n #pragma omp parallel for\n for (int i = startRow + 1; i < endRow + 1; i++) {\n for (int j = 1; j < N + 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += edgeKernel[k + 1][l + 1] * paddedImage[i + k][j + l];\n }\n }\n imageOut[(i - 
rowsPerRank - 1) * N + j - 1] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Rank 0 gathers the results from all other ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min((rank + 1) * rowsPerRank, N);\n\n // Each rank computes its subset of the output image\n std::vector localImageOut(N * N);\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(localImageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int localN = N / nprocs;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n std::vector localImageIn(localN * N);\n std::vector localImageOut(localN * N);\n\n MPI_Scatter(imageIn.data(), localN * N, MPI_INT, localImageIn.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < localN; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= localN || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += localImageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), localN * N, MPI_INT, imageOut.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x < 0 || x >= static_cast(N) || y < 0 || y >= static_cast(N)) {\n sum += 0;\n } else {\n sum 
+= imageIn[x * N + y] * edge[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image.\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank.\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Each rank computes a portion of the output image\n size_t myStartRow = myRank * N / numRanks;\n size_t myEndRow = (myRank + 1) * N / numRanks;\n\n // Compute the convolution for each pixel in the assigned rows\n #pragma omp parallel for\n for (size_t i = myStartRow; i < myEndRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n\n // Apply the kernel to the pixel and its neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = static_cast(i) + k;\n int neighborY = static_cast(j) + l;\n\n // Handle pixels outside the image boundaries\n if (neighborX < 0 || neighborX >= static_cast(N) || neighborY < 0 || neighborY >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the partial results from all ranks on rank 0\n if (myRank == 0) {\n for (int i = 1; i < numRanks; ++i) {\n size_t startRow = i * N / numRanks;\n size_t endRow = (i + 1) * N / numRanks;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[myStartRow * N], (myEndRow - myStartRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each process computes a subset of the rows\n int rowsPerProcess = N / nprocs;\n int startRow = rank * rowsPerProcess;\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Allocate memory for the local portion of the output image\n std::vector localImageOut(rowsPerProcess * N);\n\n // Perform the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * 
N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the local results into the global output image\n MPI_Gather(localImageOut.data(), rowsPerProcess * N, MPI_INT,\n imageOut.data(), rowsPerProcess * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i, const int &j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (i + dx >= 0 && i + dx < N && j + dy >= 0 && j + dy < N) {\n sum += input(i + dx, j + dy);\n }\n }\n }\n output(i, j) = sum / 5;\n });\n}\n", + "Kokkosspecific", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t& i) {\n 
Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t& j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n sum += input(i + ii, j + jj);\n }\n }\n }\n output(i, j) = sum / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy>(N, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n using team_member = KokKOS::TeamMember;\n using team_view = KokKOS::TeamView;\n using team_view_const = KokKOS::TeamView;\n using view_type = KokKOS::View;\n using view_type_const = KokKOS::View;\n\n auto policy = KokKOS::TeamStriped<>(team_member(0, N), team_member(0, N));\n\n team_view output_team(\"output_team\", output, policy);\n team_view_const input_team(\"input_team\", input, policy);\n\n auto lambda = KOKKOS_LAMBDA(team_member member) {\n const size_t i = member.team_member_rank(0);\n const size_t j = member.team_member_rank(1);\n if (i == 0 || j == 0 || i == N - 1 || j == N - 1) {\n output_team(i, j) = input_team(i, j);\n } else {\n output_team(i, j) = (input_team(i, j - 1) + input_team(i, j + 1) + input_team(i - 1, j) + input_team(i + 1, j) + input_team(i, j)) / 5.0;\n }\n };\n\n auto result = KokKOS::parallel_reduce(\n output_team, lambda, KokKOS::PerTeamReduce(std::plus(), 0.0));\n output_team.wait_until_done();\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), [=](const int i) {\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), [=](const int j) {\n double left = (i == 0) ? 0 : input(i, j - 1);\n double right = (i == N - 1) ? 0 : input(i, j + 1);\n double up = (j == 0) ? 0 : input(i - 1, j);\n double down = (j == N - 1) ? 
0 : input(i + 1, j);\n output(i, j) = (left + right + up + down + input(i, j)) / 5.0;\n });\n });\n}\n", + "\nfor (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy>({0, N}, {0, N}), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n sum += (i == 0 || j == 0) ? 0 : input(i - 1, j - 1);\n sum += (i == 0) ? 0 : input(i - 1, j);\n sum += (i == 0 || j == N - 1) ? 0 : input(i - 1, j + 1);\n sum += (j == 0) ? 0 : input(i, j - 1);\n sum += input(i, j);\n sum += (j == N - 1) ? 0 : input(i, j + 1);\n sum += (i == N - 1 || j == 0) ? 0 : input(i + 1, j - 1);\n sum += (i == N - 1) ? 0 : input(i + 1, j);\n sum += (i == N - 1 || j == N - 1) ? 0 : input(i + 1, j + 1);\n output(i, j) = sum / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = 0;\n if (i > 0) output(i, j) += input(i - 1, j);\n if (i < N - 1) output(i, j) += input(i + 1, j);\n if (j > 0) output(i, j) += input(i, j - 1);\n if (j < N - 1) output(i, j) += input(i, j + 1);\n output(i, j) += input(i, j);\n output(i, j) /= 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. 
output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n output.resize(input.size());\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i - 1][j] + input[i + 1][j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5.0;\n }\n }\n}\n", + "\n\n 
#pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "serial", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. 
output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < N - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n output[i * N + j] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) 
* N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t idx = i * N + j;\n if (i > 0) sum += input[idx - N];\n if (i < N - 1) sum += input[idx + N];\n if (j > 0) sum += input[idx - 1];\n if (j < N - 1) sum += input[idx + 1];\n sum += input[idx];\n output[idx] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t idx = i * N + j;\n if (i > 0) sum += input[idx - N];\n if (i < N - 1) sum += input[idx + N];\n if (j > 0) sum += input[idx - 1];\n if (j < N - 1) sum += input[idx + 1];\n sum += input[idx];\n output[idx] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += 
input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "hip", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\n__global__ void jacobi2D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\nint i = blockIdx.x * blockDim.x + threadIdx.x;\nint j = blockIdx.y * blockDim.y + threadIdx.y;\nif (i < N && j < N) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) /\n 5;\n}\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n output[y * N + x] =\n (input[y * N + (x - 1 < 0 ? 0 : x - 1)] +\n input[y * N + (x + 1 >= N ? N - 1 : x + 1)] +\n input[(y - 1 < 0 ? 0 : y - 1) * N + x] +\n input[(y + 1 >= N ? N - 1 : y + 1) * N + x] + input[y * N + x]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = i > 0 ? input[i - 1 + j * N] : 0;\n double right = i < N - 1 ? input[i + 1 + j * N] : 0;\n double up = j > 0 ? input[i + (j - 1) * N] : 0;\n double down = j < N - 1 ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? 
input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * blockDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.y;\n\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (i < N && j < N) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n double left = (x > 0) ? input[y * N + x - 1] : 0;\n double right = (x < N - 1) ? input[y * N + x + 1] : 0;\n double up = (y > 0) ? input[(y - 1) * N + x] : 0;\n double down = (y < N - 1) ? input[(y + 1) * N + x] : 0;\n double center = input[y * N + x];\n output[y * N + x] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i >= N || j >= N) {\n return;\n }\n double sum = 0;\n if (i > 0) {\n sum += input[i - 1 + j * N];\n }\n if (i < N - 1) {\n sum += input[i + 1 + j * N];\n }\n if (j > 0) {\n sum += input[i + (j - 1) * N];\n }\n if (j < N - 1) {\n sum += input[i + (j + 1) * N];\n }\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? 
input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i < N && j < N) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i >= N || j >= N) return;\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) /\n 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + ",", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x >= 0 && x < N && y >= 0 && y < N) {\n double left = (x > 0) ? input[y * N + x - 1] : 0;\n double right = (x < N - 1) ? input[y * N + x + 1] : 0;\n double up = (y > 0) ? input[(y - 1) * N + x] : 0;\n double down = (y < N - 1) ? input[(y + 1) * N + x] : 0;\n double center = input[y * N + x];\n output[y * N + x] = (left + right + up + down + center) / 5;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t local_N = N / size;\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n if (rank != 0) {\n MPI_Send(&output[start * N], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N * N], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0244938512, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0247270694, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0187823879, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0256144436, + "num_procs": 4 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 8 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n int nCols = N;\n std::vector localInput(nRows * nCols);\n std::vector localOutput(nRows * nCols);\n MPI_Scatter(input.data(), nRows * nCols, MPI_DOUBLE, localInput.data(), nRows * nCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (int i = 0; i < nRows; i++) {\n for (int j = 0; j < nCols; j++) {\n double left = (j == 0) ? 0 : localInput[i * nCols + j - 1];\n double right = (j == nCols - 1) ? 0 : localInput[i * nCols + j + 1];\n double up = (i == 0) ? 0 : localInput[(i - 1) * nCols + j];\n double down = (i == nRows - 1) ? 
0 : localInput[(i + 1) * nCols + j];\n localOutput[i * nCols + j] = (left + right + up + down + localInput[i * nCols + j]) / 5;\n }\n }\n MPI_Gather(localOutput.data(), nRows * nCols, MPI_DOUBLE, output.data(), nRows * nCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0246711686, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.084077288, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = (rank + 1) * local_N;\n\n // Compute the local output grid\n std::vector local_output(local_N * N, 0);\n\n // Compute the local jacobi stencil\n for (size_t i = 1; i < local_N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n local_output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local output grids into the global output grid\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.024827911, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0449926822, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the average in the local output\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + 
"generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the offset of the local grid\n size_t offset = rank * local_N;\n\n // Compute the local grid\n std::vector local_input(local_N * N);\n std::vector local_output(local_N * N);\n\n // Copy the relevant part of the input grid to the local grid\n MPI_Scatter(input.data(), local_N * N, MPI_DOUBLE, local_input.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local jacobi iteration\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the output grid on rank 0\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.032125194, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.087257323, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 
0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.024812007, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0245013446, + "num_procs": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0193952594, + "num_procs": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0159972313, + "num_procs": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.017601446475, + "num_procs": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.011475816225, + "num_procs": 16 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 32 + } + ] + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min((rank + 1) * local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N * N], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * 
rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output grid\n std::vector local_output(local_N * N, 0);\n\n // Iterate over the local grid\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output grid\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local output grids into the global output grid\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Iterate over the local grid\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the average 
in the output grid\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(&output[start][0], local_N * N, MPI_DOUBLE, &output[0][0], local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = (rank + 1) * local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N, 0);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min(start_row + local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N);\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += 
input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min((rank + 1) * local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < N - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n\n // Store the average in the output\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": false, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0306260452, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0536257316, + "num_procs": 1 + }, + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_procs": 2 + } + ] + }, + { + "generated_output": "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows and columns per rank\n int rows_per_rank = N / size;\n int cols_per_rank = N;\n\n // Compute the starting row and column for this rank\n int start_row = rank * rows_per_rank;\n int start_col = 0;\n\n // Compute the ending row and column for this rank\n int end_row = start_row + rows_per_rank;\n int end_col = start_col + cols_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * cols_per_rank);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = start_col; j < end_col; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local results\n local_output[(i - start_row) * cols_per_rank + (j - start_col)] = avg;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), rows_per_rank * cols_per_rank, MPI_DOUBLE, output.data(), rows_per_rank * cols_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + 
"generated_output": "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_size = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_size;\n size_t end = (rank + 1) * local_size;\n\n // Compute the local output\n std::vector local_output(local_size * N);\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_size * N, MPI_DOUBLE, output.data(), local_size * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Get the rank and number of processes\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the number of rows per process\n int rows_per_proc = N / num_procs;\n\n // Compute the starting and ending rows for this process\n int start_row = rank * rows_per_proc;\n int end_row = (rank + 1) * rows_per_proc;\n\n // Iterate over the rows assigned to this process\n for (int i = start_row; i < end_row; ++i) {\n // Iterate over the columns\n for (int j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the output\n output[i * N + j] = avg;\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), N * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n MPI_Gather(local_output.data(), rows_per_rank * N, 
MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "cuda", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\n__global__ void jacobi2D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] +\n input[i * N + j + N] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i == 0) ? 0 : input[i - 1 + j * N];\n double right = (i == N - 1) ? 0 : input[i + 1 + j * N];\n double up = (j == 0) ? 0 : input[i + (j - 1) * N];\n double down = (j == N - 1) ? 0 : input[i + (j + 1) * N];\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i == 0) ? 0 : input[i - 1 + j * N];\n double right = (i == N - 1) ? 0 : input[i + 1 + j * N];\n double up = (j == 0) ? 0 : input[i + (j - 1) * N];\n double down = (j == N - 1) ? 
0 : input[i + (j + 1) * N];\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5.0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i >= N || j >= N) return;\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n}\n", + "\nint i = blockIdx.x + blockIdx.y * blockDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] +\n input[i * N + j - 1] + input[i * N + j + 1] +\n input[i * N + j]) /\n 5;\n }\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i >= N || j >= N) return;\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + (j - 1 + N) % N] + input[i * N + (j + 1) % N] +\n input[(i - 1 + N) % N * N + j] + input[(i + 1) % N * N + j] +\n input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * gridDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.x;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? 
input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n output[y * N + x] = (input[y * N + x - 1] + input[y * N + x + 1] + input[(y - 1) * N + x] + input[(y + 1) * N + x] + input[y * N + x]) / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n double sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n sum += input[y * N + x];\n output[y * N + x] = sum / 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockDim.x * gridDim.x;\n int j = blockIdx.y + blockDim.y * gridDim.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different portion of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the result in the output\n output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local grid\n std::vector local_input(local_N * N);\n std::vector local_output(local_N * N);\n for (size_t i = 0; i < local_N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_input[i * N + j] = input[(start + i) * N + j];\n }\n }\n\n // Compute the local jacobi stencil\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n_local = N / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n for (int j = 0; j < N; j++) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * n_local * N], n_local * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start * N], n_local * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local grid\n std::vector local_input(input.begin() + start * N, input.begin() + end * N);\n std::vector local_output(local_N * N);\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] +\n local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] +\n local_input[i * N + j]) /\n 5.0;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min(start_row + local_N, N);\n\n // Create a local copy of the input\n std::vector local_input(local_N * N);\n MPI_Scatter(input.data(), N * N, MPI_DOUBLE, local_input.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Create a local copy of the output\n std::vector local_output(local_N * N);\n\n // Compute the local part of the jacobi stencil\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < end_row - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n size_t start_row = rank * local_N;\n size_t end_row = start_row + local_N;\n\n // Compute the local part of the output\n std::vector local_output(local_N * N, 0);\n #pragma omp parallel for\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global 
output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local portion of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n // Check if we are on the boundary\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n // If we are on the boundary, set the value to 0\n local_output[i * N + j] = 0;\n } else {\n // Otherwise, compute the average of the neighbors\n sum += input[(i - 1) * N + j];\n sum += input[(i + 1) * N + j];\n sum += input[i * N + (j - 1)];\n sum += input[i * N + (j + 1)];\n sum += input[i * N + j];\n local_output[i * N + j] = sum / 5;\n }\n }\n }\n\n // Gather the local results into the output on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the jacobi stencil for the subset of rows assigned to this rank\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the local subset of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int 
rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local size of the grid\n size_t local_N = N / nprocs;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n // Check if we are at the boundary\n if (i == 0) {\n sum += 0;\n } else {\n sum += input[i - 1][j];\n }\n if (i == N - 1) {\n sum += 0;\n } else {\n sum += input[i + 1][j];\n }\n if (j == 0) {\n sum += 0;\n } else {\n sum += input[i][j - 1];\n }\n if (j == N - 1) {\n sum += 0;\n } else {\n sum += input[i][j + 1];\n }\n sum += input[i][j];\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[start][0], local_N * N, MPI_DOUBLE, &output[0][0], local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the local subset of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n local_output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output on rank 0\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE, output.data(), output.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local size of the grid\n size_t local_size = N / nprocs;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_size;\n size_t end = (rank + 1) * local_size;\n\n // Compute the local grid\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size * local_size);\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = 1; i < local_size - 1; i++) {\n for (size_t j = 1; j < local_size - 1; j++) {\n local_output[i * local_size + j] = (local_input[i * local_size + j - 1] + local_input[i * local_size + j + 1] +\n local_input[(i - 1) * local_size + j] + local_input[(i + 1) * local_size + j] +\n local_input[i * local_size + j]) /\n 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_size * local_size, MPI_DOUBLE, output.data(), local_size * local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local bounds\n size_t local_N = N / size;\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute local jacobi2D\n #pragma omp parallel for\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i][j] = 
sum / 5;\n }\n }\n\n // Gather results to rank 0\n if (rank != 0) {\n MPI_Send(&output[start][0], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N][0], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_size = (N + size - 1) / size;\n size_t start = rank * local_size;\n size_t end = std::min((rank + 1) * local_size, N);\n\n // Compute the local part of the output\n std::vector local_output(local_size * N);\n #pragma omp parallel for\n for (size_t i = 0; i < local_size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < local_size - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_size * N, MPI_DOUBLE, output.data(), local_size * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N;\n int nCols = N;\n int nRowsPerRank = nRows / size;\n int startRow = rank * nRowsPerRank;\n int endRow = (rank + 1) * nRowsPerRank;\n if (rank == size - 1) {\n endRow = nRows;\n }\n #pragma omp parallel for collapse(2)\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < nCols; j++) {\n double left = (j == 0) ? 0 : input[i * nCols + j - 1];\n double right = (j == nCols - 1) ? 0 : input[i * nCols + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * nCols + j];\n double down = (i == nRows - 1) ? 
0 : input[(i + 1) * nCols + j];\n output[i * nCols + j] = (left + right + up + down + input[i * nCols + j]) / 5;\n }\n }\n if (rank != 0) {\n MPI_Send(&output[startRow * nCols], (endRow - startRow) * nCols, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * nRowsPerRank;\n int endRow = (i + 1) * nRowsPerRank;\n if (i == size - 1) {\n endRow = nRows;\n }\n MPI_Recv(&output[startRow * nCols], (endRow - startRow) * nCols, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) { sum += input[i - 1][j]; }\n if (i < N - 1) { sum += input[i + 1][j]; }\n if (j > 0) { sum += input[i][j - 1]; }\n if (j < N - 1) { sum += input[i][j + 1]; }\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the local portion of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = N / nprocs;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = start_row + rows_per_rank;\n\n // Create a buffer to store the results for this rank\n std::vector local_output(N * rows_per_rank);\n\n // Compute the Jacobi stencil for this rank\n #pragma omp parallel for\n for (size_t i 
= start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n MPI_Gather(local_output.data(), N * rows_per_rank, MPI_DOUBLE, output.data(), N * rows_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + } +] \ No newline at end of file diff --git a/results/015cff6f/gemini-pro_temp0.2/omp.json b/results/015cff6f/gemini-pro_temp0.2/omp.json new file mode 100644 index 0000000..f6e510a --- /dev/null +++ b/results/015cff6f/gemini-pro_temp0.2/omp.json @@ -0,0 +1,53932 @@ +[ + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); j++) {\n for (int k = j+1; k < points.extent(0); k++) {\n double area = triangleArea(points(i), points(j), points(k));\n minArea = std::min(minArea, area);\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy>(\n 0, points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const Kokkos::IndexType i, double &minArea) {\n for (Kokkos::IndexType j = i + 1; j < points.extent(0); ++j) {\n for (Kokkos::IndexType k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n },\n minArea);\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "0.0)", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n double area = 
triangleArea(points(i), points(j), points(k));\n minArea = std::min(minArea, area);\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(\"smallestArea\", points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); j++) {\n for (int k = j+1; k < points.extent(0); k++) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using exec_space = Kokkos::DefaultExecutionSpace;\n using view_type = Kokkos::View;\n\n const int num_points = points.extent(0);\n view_type areas(\"areas\", num_points * num_points * num_points);\n\n parallel_for(policy_type(0, num_points * num_points * num_points), KOKKOS_LAMBDA(const int i) {\n const int i1 = i / (num_points * num_points);\n const int i2 = (i / num_points) % num_points;\n const int i3 = i % num_points;\n if (i1 != i2 && i1 != i3 && i2 != i3) {\n areas(i) = triangleArea(points(i1), points(i2), points(i3));\n } else {\n areas(i) = std::numeric_limits::max();\n }\n });\n\n return *min_element(areas.data(), areas.data() + areas.extent(0));\n}\n", + "\n\n double area = 0;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return area;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(\"smallestArea\", policy, KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n \"smallestArea\",\n policy_type(0, points.extent(0) * points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const int& i) {\n const int i1 = i / (points.extent(0) * points.extent(0));\n const int i2 = (i / points.extent(0)) % points.extent(0);\n const int i3 = i % points.extent(0);\n if (i1 != i2 && i2 != i3 && i1 != i3) {\n minArea = std::min(minArea, triangleArea(points(i1), points(i2), points(i3)));\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), 
points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n policy_type(0, points.extent(0) - 2),\n KOKKOS_LAMBDA(const int i) {\n for (int j = i + 1; j < points.extent(0) - 1; ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(\"smallestArea\", points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const int& i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n \"smallestArea\",\n policy_type(0, points.extent(0) * points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const int i) {\n const int i1 = i / (points.extent(0) * points.extent(0));\n const int i2 = (i / points.extent(0)) % points.extent(0);\n const int i3 = i % points.extent(0);\n if (i1 != i2 && i1 != i3 && i2 != i3) {\n minArea = std::min(minArea, triangleArea(points(i1), points(i2), points(i3)));\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) 
{\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00045294240117, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00048757120967, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00043559763581, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00027485322207, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016035344452, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 8.538737893e-05, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 4.783216864e-05, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 3.788731992e-05, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0004507901147, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00048574209213, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00042603779584, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00027502160519, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016638115048, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 8.6264126e-05, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 5.410220474e-05, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 3.740657121e-05, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00045379791409, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00048221368343, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00042649861425, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00027415584773, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015732608736, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 9.345207363e-05, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 4.831291735e-05, 
+ "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 3.806184977e-05, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00045480374247, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00047194771469, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00039878450334, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00027296580374, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015324614942, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 8.152499795e-05, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 4.547555e-05, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 3.783889115e-05, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00045210905373, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00048929285258, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00042408816516, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00029430612922, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015777125955, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 9.227152914e-05, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 5.527045578e-05, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 3.769248724e-05, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0004520688206, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004943607375, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.00043111890554, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00027738176286, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016119517386, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00010871849954, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 5.009416491e-05, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 3.701597452e-05, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n#pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0004551563412, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00047650095075, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00041619352996, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00027452632785, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00017452426255, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00010588541627, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 5.520619452e-05, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 4.172045738e-05, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00045361611992, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00048028342426, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004169665277, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00028179083019, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015695933253, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 8.9392066e-05, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 4.867371172e-05, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 3.678742796e-05, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return 
minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00045372378081, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00049322452396, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004370322451, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00027620773762, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016650352627, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 9.270869195e-05, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 4.748553038e-05, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020201839507, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00045115463436, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00048759356141, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00043048225343, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00028818454593, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00015923958272, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 9.678620845e-05, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 5.139485002e-05, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 3.705155104e-05, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00045350566506, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00048732142895, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00043268799782, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00028378050774, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00020569320768, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 8.534099907e-05, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 5.513783544e-05, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 5.738232285e-05, + "num_threads": 
64 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00045526046306, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00049310587347, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00042780898511, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002751076594, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016591995955, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 8.614771068e-05, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 4.757959396e-05, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 3.692936152e-05, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00045291204005, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005003573373, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00043703280389, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002743082121, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016667377204, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00013222098351, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 4.760753363e-05, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 4.069488496e-05, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00045625101775, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00048634335399, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00042077358812, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.00027674268931, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00022658854723, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 8.533224463e-05, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 4.977099597e-05, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 3.832392395e-05, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double total_area = 0;\n #pragma omp parallel for reduction(+: total_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n total_area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return total_area;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double area = 0;\n#pragma omp parallel for reduction(+:area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return area;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00045300666243, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00049132034183, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00041899662465, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00027706101537, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016658175737, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 8.848421276e-05, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 4.767552018e-05, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 3.750808537e-05, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min:minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00044976044446, + "runs": [ + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.00048136599362, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00042619016021, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00027563050389, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016900263727, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 9.748358279e-05, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 5.534999073e-05, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 4.10752371e-05, + "num_threads": 64 + } + ] + }, + { + "generated_output": "", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00045225676149, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000488685444, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00043399874121, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00028772950172, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00016342718154, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00010006017983, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 4.925671965e-05, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 3.765486181e-05, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n 
double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "<", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t 
j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n if (n < 3) {\n return 0;\n }\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double area = 0;\n for (size_t i = 0; i < points.size(); ++i) {\n area += ::Area(points[i], points[(i + 1) % points.size()], points[(i + 2) % points.size()]);\n }\n return area;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * fabs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Compute the area of the smallest triangle that can be formed by any 3 points.\n Return the result in area.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\n__global__ void smallestArea(const Point *points, size_t N, double *area) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid >= N) return;\n\n double minArea = DBL_MAX;\n\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double area = triangleArea(points[gid], points[i], points[j]);\n if (area < minArea) minArea = area;\n }\n }\n\n if (tid == 0) *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double a = triangleArea(points[i], points[j], points[k]);\n minArea = min(minArea, a);\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = 1e100;\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n for (int k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid >= N) {\n return;\n }\n double minArea = triangleArea(points[tid], points[(tid + 1) % N], points[(tid + 2) % N]);\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n minArea = fmin(minArea, triangleArea(points[tid], points[i], points[j]));\n }\n }\n area[tid] = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for 
(size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = triangleArea(points[0], points[1], points[2]);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n int tid = get_global_id(0);\n if (tid < N) {\n area[tid] = ::Area(points[tid], points[(tid + 1) % N], points[(tid + 2) % N]);\n }\n}\n", + "\n\n Point A = points[threadIdx.x];\n Point B = points[threadIdx.y];\n Point C = points[threadIdx.z];\n double t = triangleArea(A, B, C);\n atomicMin(area, t);\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Each thread computes the area of the triangle formed by itself and the next two points.\n // The smallest area is stored in the shared memory.\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n int tid = threadIdx.x;\n if (tid < N - 2) {\n double area = triangleArea(points[tid], points[tid + 1], points[tid + 2]);\n if (area < minArea) {\n minArea = area;\n }\n }\n\n // Reduce the shared memory to find the smallest area.\n __syncthreads();\n if (tid == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n // Return the result in area.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n __shared__ Point minA, minB, minC;\n\n double localArea = triangleArea(points[threadIdx.x], points[threadIdx.y], points[threadIdx.z]);\n if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {\n minArea = localArea;\n minA = points[threadIdx.x];\n minB = points[threadIdx.y];\n minC = points[threadIdx.z];\n }\n\n __syncthreads();\n\n if (localArea < minArea) {\n minArea = localArea;\n minA = points[threadIdx.x];\n minB = points[threadIdx.y];\n minC = points[threadIdx.z];\n }\n\n __syncthreads();\n\n if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t 
j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Store the result in the output array.\n *area = minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the areas to find the minimum\n double minArea;\n MPI_Reduce(&areas[0], &minArea, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the minimum area from process 0\n if (rank == 0) {\n return minArea;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n minArea = std::min(minArea, area);\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int myRank, numProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numProcs);\n\n // Each process computes the area of all possible triangles\n std::vector localAreas;\n for (size_t i = 0; i < points.size(); ++i) {\n for 
(size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n localAreas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all areas to the root process\n std::vector globalAreas;\n MPI_Gather(localAreas.data(), localAreas.size(), MPI_DOUBLE,\n globalAreas.data(), localAreas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double smallestArea = std::numeric_limits::max();\n if (myRank == 0) {\n for (double area : globalAreas) {\n smallestArea = std::min(smallestArea, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&smallestArea, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return smallestArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all areas to the root process\n std::vector global_areas;\n MPI_Gather(local_areas.data(), local_areas.size(), MPI_DOUBLE,\n global_areas.data(), local_areas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double smallest_area = std::numeric_limits::max();\n if (rank == 0) {\n for (double area : global_areas) {\n smallest_area = std::min(smallest_area, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&smallest_area, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return smallest_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of points per process\n int points_per_process = points.size() / size;\n\n // Compute the starting and ending indices for this process\n int start = rank * points_per_process;\n int end = (rank + 1) * points_per_process;\n\n // Compute the smallest area for this process\n double min_area = std::numeric_limits::max();\n for (int i = start; i < end; ++i) {\n for (int j = i + 1; j < end; ++j) {\n for (int k = j + 1; k < end; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < min_area) {\n min_area = area;\n }\n }\n }\n }\n\n // Reduce the minimum areas from all processes\n double global_min_area;\n MPI_Allreduce(&min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN);\n\n return global_min_area;\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n 
// Reduce the areas to find the smallest one\n double smallest;\n MPI_Allreduce(areas.data(), &smallest, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return smallest;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the global minimum\n double global_min;\n MPI_Allreduce(local_areas.data(), &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n // Find the minimum area triangle\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Return the minimum area\n return minArea;\n}\n\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(&local_areas[0], &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area from process 0\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1; // Other processes return -1 to avoid printing multiple times\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the areas to find the smallest one\n double smallestArea;\n MPI_Allreduce(areas.data(), &smallestArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return smallestArea;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each process computes the area of the smallest triangle formed by any 3 of its points\n double min_area = std::numeric_limits::max();\n for (size_t i = my_rank; i < points.size(); i += num_procs) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n min_area = std::min(min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the minimum area across all processes\n double global_min_area;\n MPI_Allreduce(&min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < 
points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all local areas into a single vector on process 0\n std::vector global_areas;\n if (rank == 0) {\n global_areas.resize(local_areas.size() * size);\n }\n MPI_Gather(local_areas.data(), local_areas.size(), MPI_DOUBLE,\n global_areas.data(), local_areas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double min_area = std::numeric_limits::max();\n if (rank == 0) {\n for (double area : global_areas) {\n min_area = std::min(min_area, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&min_area, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n for (size_t i = rank; i < points.size(); i += size) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "&" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * fabs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Compute the area of the smallest triangle that can be formed by any 3 points.\n Return the result in area.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\n__global__ void smallestArea(const Point *points, size_t N, double *area) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Each thread computes the area of a triangle formed by itself and the next two points.\n // The minimum area is stored in the shared memory.\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N - 2) {\n double a = triangleArea(points[tid], points[tid + 1], points[tid + 2]);\n if (a < minArea) {\n minArea = a;\n }\n }\n __syncthreads();\n\n // The thread with the minimum area writes the result to the output pointer.\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid >= N) return;\n\n double minArea = DBL_MAX;\n\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double area = triangleArea(points[gid], points[i], points[j]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[gid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n double minArea = triangleArea(points[0], points[1], points[2]);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) 
{\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Each thread computes the area of a triangle formed by 3 points.\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[tid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n // Compute the area of the smallest triangle formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Store the result in the output array.\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n __shared__ Point minA, minB, minC;\n\n // Compute the area of all triangles\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n minA = points[i];\n minB = points[j];\n minC = points[k];\n }\n }\n }\n }\n\n // Reduce the minimum area across all threads\n __syncthreads();\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n // Return the result in area.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[tid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < minArea) {\n minArea = a;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Initialize the area to a large value\n *area = DBL_MAX;\n\n // Loop over all possible combinations of 3 points\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n // Compute the area of the triangle formed by the 3 points\n double triangle_area = triangleArea(points[i], points[j], points[k]);\n\n // Update the area if the triangle area is smaller than the current minimum\n if (triangle_area < *area) {\n *area = triangle_area;\n }\n }\n }\n }\n}\n", + "\n\n *area = DBL_MAX;\n for 
(size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < minArea) {\n minArea = a;\n }\n }\n }\n }\n *area = minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int num_points = points.size();\n int points_per_proc = num_points / num_procs;\n int start_index = my_rank * points_per_proc;\n int end_index = (my_rank + 1) * points_per_proc;\n if (my_rank == num_procs - 1) {\n end_index = num_points;\n }\n\n double local_area = 0.0;\n for (int i = start_index; i < end_index; i++) {\n for (int j = i + 1; j < num_points; j++) {\n for (int k = j + 1; k < num_points; k++) {\n local_area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n\n double global_area = 0.0;\n MPI_Reduce(&local_area, &global_area, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_area;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n // Use OpenMP to parallelize the loop over all possible triples of points\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], 
points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(&local_areas[0], &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1;\n }\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Allreduce(local_areas.data(), &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector localAreas;\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n localAreas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double globalMinArea;\n MPI_Allreduce(localAreas.data(), &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = points.size();\n double local_min = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: 
local_min)\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = points.size();\n double local_min = std::numeric_limits::max();\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n local_min = std::min(local_min, area);\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = points.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each process calculates the area of all possible triangles\n std::vector local_areas(n * (n - 1) * (n - 2) / 6);\n int count = 0;\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n local_areas[count++] = triangleArea(points[i], points[j], points[k]);\n }\n }\n }\n\n // Reduce the local areas to find the global minimum\n double global_min;\n MPI_Allreduce(local_areas.data(), &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = rank; i < points.size(); i += size) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(local_areas.data(), &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n int n = points.size();\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n minArea = std::min(minArea, area);\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN);\n return globalMinArea;\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int num_points = points.size();\n double min_area = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: min_area)\n for (int i = 0; i < num_points; i++) {\n for (int j = i + 1; j < num_points; j++) {\n for (int k = j + 1; k < num_points; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n min_area = std::min(min_area, area);\n }\n }\n }\n\n return min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &minArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "kokkos", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Create a team policy for parallel execution\n TeamPolicy policy(x.extent(0));\n\n // Define the kernel lambda\n auto kernel = KOKKOS_LAMBDA(const TeamPolicy<> &policy, int i) {\n double min_dist = std::numeric_limits::max();\n for (int j = i + 1; j < x.extent(0); ++j) {\n min_dist = std::min(min_dist, distance(x(i), x(j)));\n }\n return min_dist;\n };\n\n // Execute the kernel in parallel\n auto min_dists = parallel_reduce(policy, kernel);\n\n // Find the minimum distance among all pairs\n double min_dist = std::numeric_limits::max();\n for (int i = 0; i < min_dists.extent(0); ++i) {\n min_dist = std::min(min_dist, min_dists(i));\n }\n\n return min_dist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist[0] = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist[0]) {\n\t\t\t\tmin_dist[0] = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist[0];\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double &min_dist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist(0);\n}\n", + "\n\n\tconst int n = x.extent(0);\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(\"closestPair\", Kokkos::RangePolicy(0, n - 1), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing policy = Kok\u043a\u043e\u0441::Range\u653f\u7b56;\n\tusing view_type = Kok\u043a\u043e\u0441::View;\n\n\tview_type min_dist(\"min_dist\", std::numeric_limits::max());\n\n\tauto lambda = KOKKOS_LAMBDA(const double x1, const double x2) {\n\t\tdouble dist = ::abs(x1 - x2);\n\t\tif (dist < min_dist()) {\n\t\t\tmin_dist() = dist;\n\t\t}\n\t};\n\n\t// Iterate over all possible un-sorted pair of elements\n\t// and find the minimum distance between them.\n\t// Note that we are iterating over all the elements in the vector\n\t// multiple times. 
This is not efficient.\n\t// A more efficient approach would be to sort the vector first\n\t// and then only consider the minimum distance between each element\n\t// and its immediate neighbor.\n\t// However, sorting is not supported by Kok\u043a\u043e\u0441.\n\t// So, we are using this simple approach for now.\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < x.size(); ++j) {\n\t\t\tlambda(x[i], x[j]);\n\t\t}\n\t}\n\n\treturn min_dist();\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int i, double &min_dist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist(0);\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\tKok\u30b3\u30af::TaskGraph tg;\n\tKok\u30b3\u30af::Task task([&](Kok\u30b3\u30af::TaskContext ctx) -> double {\n\t\tdouble min = std::abs(x[0] - x[1]);\n\t\tfor (size_t i = 0; i < x.size(); i++) {\n\t\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\t\tmin = std::min(min, distance(x[i], x[j]))\n\t\t\t}\n\t\t}\n\t\treturn min;\n\t});\n\ttg.run(task);\n\treturn task.get();\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::RangePolicy;\n\tusing tag_type = Kokkos::pair;\n\n\tauto min_distance = Kokkos::parallel_reduce(\n\t\tpolicy_type(0, x.size() * x.size()),\n\t\tKOKKOS_LAMBDA(const tag_type &idx, double &min_distance) {\n\t\t\tint i = idx.first / x.size();\n\t\t\tint j = idx.first % x.size();\n\t\t\tif (i != j) {\n\t\t\t\tmin_distance = std::min(min_distance, distance(x(i), x(j)));\n\t\t\t}\n\t\t},\n\t\tmin_distance);\n\n\treturn min_distance;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in x.\n\tKokkos::View distances(\"distances\", x.size() * x.size());\n\tKokkos::parallel_for(Kokkos::RangePolicy(0, x.size() * x.size()), KOKKOS_LAMBDA(const int i) {\n\t\tdistances(i) = distance(x(i / x.size()), x(i % x.size()));\n\t});\n\n\t// Find the minimum distance.\n\tdouble minDistance = Kokkos::min_reduce(distances, Kokkos::Min());\n\n\treturn minDistance;\n}\n", + "\n\n\tKokkos::View min_dist(\"min_dist\", 1);\n\tmin_dist[0] = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) 
{\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist[0]) {\n\t\t\t\tmin_dist[0] = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist[0];\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist(0) = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tdouble d = distance(x(i), x(j));\n\t\t\tif (d < min_dist(0)) {\n\t\t\t\tmin_dist(0) = d;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist(0);\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int &i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\n\treturn minDist;\n}\n", + "\n\n\tconst int n = x.extent(0);\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tauto min_distance = Kokkos::parallel_reduce(\n\t\tx.extent(0), KOKKOS_LAMBDA(const int i, double min) {\n\t\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\t\tmin = std::min(min, distance(x(i), x(j)));\n\t\t\t}\n\t\t\treturn min;\n\t\t},\n\t\tstd::numeric_limits::max());\n\n\treturn min_distance;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist(0) = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tdouble dist = distance(x(i), x(j));\n\t\t\tif (dist < min_dist(0)) {\n\t\t\t\tmin_dist(0) = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist(0);\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "omp", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use OpenMP to compute in parallel.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0143133547157, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01451471131295, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01076485477388, + "num_threads": 2 + }, + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.00626860111952, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00335537251085, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00173005573452, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088640805334, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00046870838851, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01431481577456, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01450840961188, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01082590557635, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00633253734559, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00341067872941, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00172877945006, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088687520474, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00046496316791, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01432139389217, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01453036498278, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01083327736706, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00635646022856, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00335172563791, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00175203308463, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088814701885, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00047028362751, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01439051553607, + "runs": [ 
+ { + "did_run": true, + "is_valid": true, + "runtime": 0.01442704629153, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01084918528795, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00624948292971, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00339047703892, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017438441515, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088466275483, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00046805068851, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tdouble min_distance = std::numeric_limits::max();\n\tint n = x.size();\n\t#pragma omp parallel for reduction(min: min_distance)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_distance = std::min(min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01429777145386, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01432571820915, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01076784227043, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00625150296837, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00334069505334, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00173470079899, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088602751493, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00046407133341, + "num_threads": 64 + } + ] + }, + { + "generated_output": "&", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01425898820162, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01453563291579, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01091692484915, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00630020890385, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00337010417134, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00172341335565, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088908914477, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00047523267567, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint n = 
x.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01426997315139, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01447823494673, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01083415877074, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00628028493375, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00333672966808, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00172397829592, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088878665119, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00047053974122, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0142877029255, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01439841296524, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01075577568263, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00626808796078, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00337251480669, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00172189455479, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088657215238, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00047225039452, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tdouble min_distance = std::abs(x[0] - x[1]);\n#pragma openmp parallel for reduction(min:min_distance)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_distance = std::min(min_distance, distance(x[i], x[j]);\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, 
+ "are_all_valid": true, + "best_sequential_runtime": 0.01431028321385, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01430233400315, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01073806826025, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00624832715839, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00334948189557, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00175801217556, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088937357068, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00046716872603, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01434578001499, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01446308661252, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01090738493949, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00623773280531, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00335157569498, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00172469988465, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00089473631233, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00046435017139, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01432624477893, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01436133701354, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01076578870416, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00631427876651, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00335058737546, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00172203890979, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088539011776, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00046992450953, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, 
distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01444690003991, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0207574961707, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01552156656981, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00891410782933, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00485540125519, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00245981104672, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00126192271709, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00065753348172, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tdouble min_dist = std::numeric_limits::max();\n#pragma omp parallel for reduction(min: min_dist)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01440348308533, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02042760122567, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01553280986845, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00890542529523, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00482963696122, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00251097958535, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00125701576471, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00065373834223, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01432145349681, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01455542836338, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0107859686017, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00622251424938, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00336196310818, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00172880161554, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088788866997, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00046678017825, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint n = 
x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0143670681864, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01450698506087, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01080294046551, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00636216308922, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00339205358177, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00172286778688, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088984090835, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00046739559621, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n - 1; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01436209250242, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01448888722807, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0108867879957, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00632406137884, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00343812163919, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017532935366, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088596548885, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00046708304435, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01425650157034, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01433868072927, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01067881435156, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00631591044366, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00333373360336, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00172696821392, + 
"num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088570043445, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00046977605671, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01437862999737, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0145294951275, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01081063039601, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00632534008473, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00338433086872, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00173078905791, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088580120355, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00046514738351, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "serial", + "prompt": "double distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 1; i < x.size(); ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i - 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn -1;\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = distance(x[0], x[1]);\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn 
std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tminDist = std::min(minDist, distance(x[i], x[i + 1]));\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = distance(x[0], x[1]);\n\tfor (size_t i = 1; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn -1;\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble minDist = distance(x[0], x[1]);\n\tfor (int i = 1; i < x.size() - 1; ++i) {\n\t\tminDist = std::min(minDist, distance(x[i], x[i + 1]));\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 1; i < x.size(); ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i - 1]));\n\t}\n\treturn min_dist;\n}\n", + "(", + "\n\n double min_dist = std::numeric_limits::max();\n for (size_t i = 0; i < 
x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n min_dist = std::min(min_dist, distance(x[i], x[j]));\n }\n }\n return min_dist;\n}\n", + "\n\n double min_distance = std::numeric_limits::max();\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n min_distance = std::min(min_distance, distance(x[i], x[j]));\n }\n }\n return min_distance;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "hip", + "prompt": "__device__ double distanceBetweenPoints(double x1, double x2) {\n\treturn fabs(x1 - x2);\n}\n\n/* Compute the distance between the closest two elements in the vector x.\n Store the result in distance.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\n__global__ void closestPair(const double *x, size_t N, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tdouble minDist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tfor (int i = tid + 2; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\tdistance[tid] = minDist;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= N) {\n\t\treturn;\n\t}\n\n\tdouble minDist = distanceBetweenPoints(x[gid], x[gid + 1]);\n\tfor (int i = gid + 2; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[gid], x[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "losest", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tdouble minDistance = DBL_MAX;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDistance) {\n\t\t\t\tminDistance = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDistance;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint grid_size = gridDim.x;\n\tint i = bid * bid_size + tid;\n\tif (i >= N) {\n\t\treturn;\n\t}\n\tdouble min_dist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t}\n\tdistance[tid] = 
min_dist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\nfor (size_t i = 0; i < N; ++i) {\n\t\tdistance[i] = distance(x[i], x[i + 1]);\n\t}\n}\n", + "\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint bid_size = hipBlockDim_x;\n\n\tdouble min_dist = DBL_MAX;\n\n\tfor (int i = bid * bid_size + tid; i < N; i += bid_size) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tif (tid == 0 || dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble min_dist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = min_dist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "losest", + "\n\n\tint tid = threadIdx.x;\n\tint blockDim = blockDim.x;\n\tint blockIdx = blockIdx.x;\n\tint i = blockIdx * blockDim + tid;\n\n\tdouble min_dist = DBL_MAX;\n\twhile (i < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[i + 1]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t\ti += blockDim * gridDim.x;\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble min_dist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = min_dist;\n}\n" + ] + 
}, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "mpi", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process finds the closest pair in its local part of the vector.\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j += size) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between its elements\n\tfor (int i = my_rank; i < n; i += num_procs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = n / size;\n\tint start = rank * chunk_size;\n\tint end = start + chunk_size;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn globalMinDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = (n + size - 1) / size;\n\tint start = rank * chunk_size;\n\tint end = std::min(start + chunk_size, n);\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tstd::vector local_x(x.begin() + start, x.begin() + end);\n\tstd::sort(local_x.begin(), local_x.end());\n\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < local_n - 1; i++) {\n\t\tlocal_min_dist = std::min(local_min_dist, distance(local_x[i], local_x[i + 1]));\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tif (rank == size - 1) {\n\t\tend = x.size();\n\t}\n\tdouble local_min_dist = 
std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\tint n = x.size();\n\tint local_n = n / num_procs;\n\tint start = my_rank * local_n;\n\tint end = (my_rank + 1) * local_n;\n\tif (my_rank == num_procs - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between its elements\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn globalMinDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the distance between its elements and the elements of the next process.\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = (rank + 1) * local_size;\n\tif (rank == size - 1) {\n\t\tend = x.size();\n\t}\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble 
global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = n / size;\n\tint start = rank * chunk_size;\n\tint end = (rank + 1) * chunk_size;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the distance between its elements\n\tstd::vector local_distances;\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tfor (size_t j = i + 1; j < x.size(); j += size) {\n\t\t\tlocal_distances.push_back(distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Gather all distances to the root process\n\tstd::vector global_distances;\n\tMPI_Gather(local_distances.data(), local_distances.size(), MPI_DOUBLE,\n\t\tglobal_distances.data(), local_distances.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the minimum distance\n\tdouble min_distance = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double d : global_distances) {\n\t\t\tmin_distance = std::min(min_distance, d);\n\t\t}\n\t}\n\n\t// Broadcast the minimum distance to all processes\n\tMPI_Bcast(&min_distance, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn min_distance;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "cuda", + "prompt": "__device__ double distanceBetweenPoints(double x1, double x2) {\n\treturn fabs(x1 - x2);\n}\n\n/* Compute the distance between the closest two elements in the vector x.\n Store the result in distance.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\n__global__ void closestPair(const double *x, size_t N, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= N) return;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = bid * blockDim.x + tid;\n\n\tif (i >= N) {\n\t\treturn;\n\t}\n\n\tdouble min_dist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint blockId = blockIdx.x;\n\tint blockDim = blockDim.x;\n\tint gridDim = gridDim.x;\n\n\t// Each thread computes the distance between its element and the next element.\n\tif (tid < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tif (tid == 0 || dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n\n\t// Each block reduces the distances computed by its threads.\n\t__shared__ double blockDistance;\n\tif (tid == 0) {\n\t\tblockDistance = *distance;\n\t}\n\t__syncthreads();\n\tif (tid < blockDim - 1) {\n\t\tdouble dist = distanceBetweenPoints(blockDistance, x[blockId * blockDim + tid + 1]);\n\t\tif (dist < blockDistance) {\n\t\t\tblockDistance = dist;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*distance = blockDistance;\n\t}\n\n\t// The block with the smallest distance broadcasts it to all other blocks.\n\t__shared__ double gridDistance;\n\tif (blockId == 0) {\n\t\tgridDistance = *distance;\n\t}\n\t__syncthreads();\n\tif (blockId < gridDim - 1) {\n\t\tdouble dist = distanceBetweenPoints(gridDistance, x[(blockId + 1) * blockDim]);\n\t\tif (dist < gridDistance) {\n\t\t\tgridDistance = dist;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (blockId == 0) {\n\t\t*distance = gridDistance;\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\t__shared__ double minDistance;\n\tminDistance = DBL_MAX;\n\n\tfor (size_t i = threadIdx.x; i < N - 1; i += blockDim.x) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDistance) {\n\t\t\t\tminDistance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (threadIdx.x == 0) {\n\t\t*distance = minDistance;\n\t}\n}\n", + "\n\n\t// Each thread computes the distance between its element and all subsequent elements.\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\t// Compute the distance between the current element and all subsequent elements.\n\tfor (int i = tid + bid * stride; i < N - 1; i += stride) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[i + 1]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = 
DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\tdouble min_dist = DBL_MAX;\n\twhile (i < N) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t\ti += blockDim.x * gridDim.x;\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = min_dist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x * gridDim.x;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tdouble minDist = DBL_MAX;\n\twhile (i < N) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\ti += blockDim.x * gridDim.x;\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (i == tid + 1 || dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= N) {\n\t\treturn;\n\t}\n\tdouble minDist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) 
{\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\tdistance[tid] = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i < N - 1) {\n\t\tdistance[i] = distance(x[i], x[i + 1]);\n\t}\n}\n", + "\n\n\t// Each thread computes the distance between its element and all elements to its right.\n\tint tid = threadIdx.x;\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min : local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tint size;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = my_rank * local_n;\n\tint end = (my_rank + 1) * local_n;\n\tif (my_rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min : local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between any two elements in its local subvector.\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all processes to find the global minimum distance.\n\tMPI_Allreduce(&min_dist, 
&min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\t// Each process computes the distance between all pairs of elements in its local subvector.\n\tstd::vector local_min_distances(n / num_procs);\n\t#pragma omp parallel for\n\tfor (int i = my_rank * (n / num_procs); i < (my_rank + 1) * (n / num_procs); i++) {\n\t\tfor (int j = i + 1; j < (my_rank + 1) * (n / num_procs); j++) {\n\t\t\tlocal_min_distances[i - my_rank * (n / num_procs)] = std::min(local_min_distances[i - my_rank * (n / num_procs)], distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Each process sends its local minimum distance to the root process.\n\tstd::vector global_min_distances(num_procs);\n\tMPI_Gather(local_min_distances.data(), n / num_procs, MPI_DOUBLE, global_min_distances.data(), n / num_procs, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// The root process finds the minimum distance among all the local minimum distances.\n\tdouble global_min_distance = std::numeric_limits::max();\n\tif (my_rank == 0) {\n\t\tfor (int i = 0; i < num_procs; i++) {\n\t\t\tglobal_min_distance = std::min(global_min_distance, global_min_distances[i]);\n\t\t}\n\t}\n\n\t// Return the global minimum distance.\n\treturn global_min_distance;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tint n = x.size();\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint my_rank, nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the closest pair in its local subvector\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = my_rank; i < n; i += nprocs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, 
&min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min_dist)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\t// Get the number of processes and the rank of the current process\n\tint num_procs, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\t// Divide the vector into chunks and assign each chunk to a process\n\tint chunk_size = x.size() / num_procs;\n\tstd::vector local_x(x.begin() + rank * chunk_size, x.begin() + (rank + 1) * chunk_size);\n\n\t// Find the closest pair in the local chunk\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min_dist)\n\tfor (int i = 0; i < local_x.size(); i++) {\n\t\tfor (int j = i + 1; j < local_x.size(); j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(local_x[i], local_x[j]));\n\t\t}\n\t}\n\n\t// Gather the local minimum distances from all processes\n\tstd::vector global_min_dists(num_procs);\n\tMPI_Gather(&local_min_dist, 1, MPI_DOUBLE, global_min_dists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the smallest global minimum distance\n\tdouble global_min_dist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double dist : global_min_dists) {\n\t\t\tglobal_min_dist = std::min(global_min_dist, dist);\n\t\t}\n\t}\n\n\t// Broadcast the global minimum distance to all processes\n\tMPI_Bcast(&global_min_dist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tif (n <= 1) {\n\t\treturn -1;\n\t}\n\n\t// Sort the vector in ascending order\n\tstd::sort(x.begin(), x.end());\n\n\t// Find the closest pair in each half of the vector\n\tdouble minDist = distance(x[0], x[1]);\n\t#pragma omp parallel\n\t{\n\t\tint tid = omp_get_thread_num();\n\t\tint numThreads = omp_get_num_threads();\n\t\tint start = tid * n / numThreads;\n\t\tint end = (tid + 1) * n / numThreads;\n\t\tdouble localMinDist = minDist;\n\t\tfor (int i = start; i < end; i++) {\n\t\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\t\tif (dist < localMinDist) {\n\t\t\t\t\tlocalMinDist = dist;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t#pragma omp critical\n\t\t{\n\t\t\tif (localMinDist < minDist) {\n\t\t\t\tminDist = localMinDist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint my_rank, 
nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\t// Each process computes the closest pair in its local subvector.\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = my_rank; i < n; i += nprocs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum.\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the closest pair in its local subvector\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint n = x.size();\n\tint n_local = n / omp_get_num_threads();\n\tint tid = omp_get_thread_num();\n\tdouble min_local = std::numeric_limits::max();\n\t\n\t#pragma omp parallel reduction(min:min_local)\n\tfor (int i = tid * n_local; i < (tid + 1) * n_local; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_local = std::min(min_local, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble min_global;\n\tMPI_Allreduce(&min_local, &min_global, 1, MPI_DOUBLE, MPI_MIN);\n\treturn min_global;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Use OpenMP to parallelize the computation of distances between pairs of elements\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Reduce(&local_min_dist, 1, MPI_DOUBLE, &global_min_dist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance.\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t}\n\treturn -1;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n double min_dist = std::numeric_limits::max();\n\n // Compute the distance between each pair of points\n parallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int& i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n double dist = distance(points(i), points(j));\n if (dist < min_dist) {\n min_dist = dist;\n }\n }\n });\n\n return min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View distances(\"Distances\", points.size());\n\tKokkos::parallel_for(\"Compute distances\", points.size(), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tdistances[i] = std::min(distances[i], distance(points[i], points[j]));\n\t\t}\n\t});\n\treturn *Kokkos::min_element(distances);\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(\"Closest Pair\", points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tparallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tparallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int j) {\n\t\t\tif (i != j) {\n\t\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\t\tif (dist < minDist) {\n\t\t\t\t\tminDist = dist;\n\t\t\t\t}\n\t\t\t}\n\t\t});\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn 
minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tminDist = std::min(minDist, dist);\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing TeamPolicy = Kokkos::TeamPolicy>;\n\tusing TeamMember = Kokkos::TeamMember;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const TeamMember& team) {\n\t\tfor (int i = team.league_rank(); i < points.extent(0); i += team.league_size()) {\n\t\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\t\tminDist = std::min(minDist, distance(points(i), points(j)));\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing namespace std;\n\n\t// Initialize the minimum distance to a large value\n\tdouble minDist = numeric_limits::max();\n\n\t// Create a team policy for parallel execution\n\tTeamPolicy policy(Kokkos::DefaultExecutionSpace(), points.extent(0));\n\n\t// Define the kernel function to compute the distance between each pair of points\n\tKokkos::parallel_for(policy, KOKKOS_LAMBDA(const int& i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::RangePolicy;\n\tusing view_type = Kokkos::View;\n\n\tconst int num_points = points.extent(0);\n\tview_type distances(\"Distances\", num_points * num_points);\n\n\tKokkos::parallel_for(policy_type(0, num_points * num_points), KOKKOS_LAMBDA(const int i) {\n\t\tconst int p1_idx = i / num_points;\n\t\tconst int p2_idx = i % num_points;\n\t\tif (p1_idx != p2_idx) {\n\t\t\tdistances(i) = distance(points(p1_idx), points(p2_idx));\n\t\t}\n\t\telse {\n\t\t\tdistances(i) = std::numeric_limits::max();\n\t\t}\n\t});\n\n\tdouble min_distance = std::numeric_limits::max();\n\tfor (int i = 0; i < num_points * num_points; i++) {\n\t\tmin_distance = std::min(min_distance, distances(i));\n\t}\n\n\treturn min_distance;\n}\n", + "\n\n\tKok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::make_span(points.begin(), points.end()), 0.0,\n\t\t[](double d, const Point* p) {\n\t\t\treturn d + distance(*p, {0, 0});\n\t\t},\n\t\t[](double a, double b) { return a + b; });\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Initialize the minimum distance to a large value\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\t// Calculate the distance between the two points\n\t\t\tdouble dist = distance(points(i), points(j));\n\n\t\t\t// Update the minimum distance if necessary\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\t// Return the minimum distance\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all points\n\tfor (size_t i 
= 0; i < points.size(); ++i) {\n\t\t// Iterate over all points after i\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\t// Calculate the distance between the two points\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\t// Check if the distance is less than the current minimum distance\n\t\t\tif (dist < minDist) {\n\t\t\t\t// Update the minimum distance\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Return the minimum distance\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View distances(\"Distances\", points.extent(0));\n\tKokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdistances(i) = std::min(distances(i), distance(points(i), points(j)));\n\t\t}\n\t});\n\treturn Kokkos::min_reduce(Kokkos::RangePolicy(0, points.extent(0)), distances, distances(0));\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use OpenMP to compute in parallel.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.09224855899811, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.09197684433311, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06963100079447, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04027865342796, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02165943589061, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01102076042444, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00558904409409, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.00284176822752, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.09216480404139, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.09255802966654, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06897752918303, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04030187558383, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0216333406046, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01097704619169, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00560379363596, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00284257158637, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.09170821476728, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.09170673377812, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06943201664835, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0404388371855, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02166818417609, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01100330762565, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00560461748391, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00285903271288, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.09241650532931, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0926095565781, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06928606573492, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, 
+ "runtime": 0.04042027816176, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02147913668305, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01100830305368, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00560300499201, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00286361929029, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.09221560135484, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.09246512725949, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06942148245871, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04037367235869, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02150429543108, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01102180853486, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00559324026108, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00311284232885, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.09233946464956, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.09261419158429, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06947236824781, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04030044898391, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02148224990815, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01102169454098, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00558385867625, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00285111144185, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": 
true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.09201812371612, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.09270152579993, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06916255690157, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04043374340981, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02151112388819, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01098486687988, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00560931637883, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00285082906485, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.09209516849369, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.09272886142135, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06926062963903, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.040341155231, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02135079316795, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01094391159713, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00558841712773, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00284625068307, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.09252633843571, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0925095949322, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06948566548526, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0405186271295, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02156564109027, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01100232563913, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00560242738575, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028480194509, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tdouble minDist = 
std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.09214722458273, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.09244349058717, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06930039245635, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04027867186815, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0213150376454, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0109663117677, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00560167189687, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00285087879747, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.09192021563649, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.09209512677044, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06911205034703, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04054115377367, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0215364439413, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01099075004458, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00560566335917, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00286774691194, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tint n = points.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.09217518549412, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.09204929042608, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06900628712028, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04032539371401, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.02135759126395, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01062765233219, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00585178826004, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00304753892124, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0920216826722, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0925194889307, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06954778861254, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04056500252336, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02144998535514, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01100851930678, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00558314491063, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00285321362317, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.09218799713999, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.09264664053917, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06903960537165, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04018952995539, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02157590854913, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0110638577491, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00558574199677, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00284541528672, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.09208411201835, + "runs": [ + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.09289750475436, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.069306650199, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04014326073229, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02153203357011, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01103275306523, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00559669062495, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00284997336566, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.09234290514141, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.09271266646683, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0691040366888, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04042943362147, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02173931952566, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01102551743388, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00559107363224, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00284206233919, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tdouble min_distance = std::numeric_limits::max();\n#pragma omp parallel for reduction(min:min_distance)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tmin_distance = std::min(min_distance, dist);\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.09234141018242, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.09244635365903, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06944587640464, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0405292769894, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02172096930444, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0109777437523, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0055989023298, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00284564327449, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\t// Check if the vector is empty or has only one point\n\tif (points.empty() || points.size() == 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\t// Sort the points by x-coordinate\n\tstd::sort(points.begin(), points.end(), 
[](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Initialize the minimum distance to the maximum possible value\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.09224069677293, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.09220218993723, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06912994924933, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.040613758564, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02166932392865, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01100927870721, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00559604354203, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00284656472504, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.09249125216156, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.09256527721882, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06913079768419, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0404050335288, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02150161080062, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01100448966026, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00559743773192, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00286074597389, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble 
distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn -1;\n\t}\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = 
std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = 
std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tauto compareX = [](Point const& p1, Point const& p2) { return p1.x < p2.x; };\n\tauto compareY = [](Point const& p1, Point const& p2) { return p1.y < p2.y; };\n\n\tstd::sort(points.begin(), points.end(), compareX);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tauto compareX = [](Point const& p1, Point const& p2) { return p1.x < p2.x; };\n\tauto compareY = [](Point const& p1, Point const& p2) { return p1.y < p2.y; };\n\tstd::sort(points.begin(), points.end(), compareX);\n\tdouble minDist = distance(points[0], points[1]);\n\tstd::sort(points.begin(), points.end(), compareY);\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tif (points[j].y - points[i].y >= minDist) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tif (points.size() <= 1) {\n\t\treturn 0.0;\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = distance(points[0], points[1]);\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = 
dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distanceBetweenPoints(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the distance between the closest two points in the vector points.\n Store the result in distance.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\n__global__ void closestPair(const Point *points, size_t numPoints, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Initialize the distance to a large value\n\t*distance = DBL_MAX;\n\n\t// Iterate over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid < numPoints) {\n\t\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < num) {\n\t\tdistance[i] = distance(points[i], points[(i + 1) % num]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < num) {\n\t\tdistance[idx] = distance(points[idx], points[(idx + 1) % num]);\n\t}\n}\n", + "\n\n\t// Initialize the distance to a large value\n\t*distance = DBL_MAX;\n\n\t// Loop over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) return;\n\tdouble minDist = distanceBetweenPoints(points[tid], points[(tid+1)%numPoints]);\n\tfor (int i = tid+1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\tatomicMin(distance, minDist);\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * 
hipBlockDim_x + tid;\n\n\tif (gid < numPoints) {\n\t\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = gid; i < numPoints; i += hipBlockDim_x) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint idx = hipGetGlobalIndex();\n\tif (idx < num) {\n\t distance[idx] = distance(points[idx], points[(idx + 1) % num]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\t// Initialize the distance to a large value\n\t*distance = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "(1.21,12.12.1222.12.12.", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = distanceBetweenPoints(points[gid], points[(gid + 1) % numPoints]);\n\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Compute the distance between each pair of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tdouble minDist = distanceBetweenPoints(points[tid], points[(tid + 1) % numPoints]);\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\t__shared__ double minDist;\n\t__shared__ Point p1, p2;\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < numPoints) 
{\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t\tp1 = points[tid];\n\t\t\t\tp2 = points[i];\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local copy of points.\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0.\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\t// Find the closest pair of points on this rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_distance = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); ++i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < 
min_distance) {\n\t\t\t\tmin_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_distance;\n\tMPI_Allreduce(&min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_distance;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tmin_dist = std::min(min_dist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t}\n\n\treturn -1;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\tif (rank == 0) {\n\t\treturn min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the 
local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the closest pair on this rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_distance) {\n\t\t\t\tlocal_min_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_distance;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its subset of points\n\tstd::vector> closestPairs;\n\tfor (size_t i = rank; i < points.size(); i += size) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tclosestPairs.push_back({points[i], points[j]});\n\t\t}\n\t}\n\n\t// Each rank sends its closest pairs to rank 0\n\tstd::vector> allClosestPairs;\n\tMPI_Gather(&closestPairs, closestPairs.size(), MPI_DOUBLE, &allClosestPairs, allClosestPairs.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 finds the closest pair of points among all the closest pairs\n\tdouble minDist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (auto const& pair : allClosestPairs) {\n\t\t\tdouble dist = distance(pair.first, pair.second);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Rank 0 broadcasts the minimum distance to all other ranks\n\tMPI_Bcast(&minDist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn minDist;\n}\n", 
+ "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its subset of points.\n\tstd::vector local_points = points;\n\tstd::sort(local_points.begin(), local_points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (size_t i = 0; i < local_points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < local_points.size(); ++j) {\n\t\t\tdouble dist = distance(local_points[i], local_points[j]);\n\t\t\tif (dist < local_min_distance) {\n\t\t\t\tlocal_min_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Each rank sends its local minimum distance to rank 0.\n\tMPI_Gather(&local_min_distance, 1, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 computes the global minimum distance.\n\tdouble global_min_distance = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (int i = 0; i < size; ++i) {\n\t\t\tdouble recv_min_distance;\n\t\t\tMPI_Gather(&recv_min_distance, 1, MPI_DOUBLE, &recv_min_distance, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\t\t\tif (recv_min_distance < global_min_distance) {\n\t\t\t\tglobal_min_distance = recv_min_distance;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the closest pair on each rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); 
++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Reduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distanceBetweenPoints(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the distance between the closest two points in the vector points.\n Store the result in distance.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\n__global__ void closestPair(const Point *points, size_t numPoints, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) return;\n\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint num_blocks = gridDim.x;\n\n\tdouble min_dist = DBL_MAX;\n\tfor (int i = tid; i < numPoints; i += bid_size * num_blocks) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = min_dist;\n}\n", + "\n\n\tint idx = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (idx >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDistance = DBL_MAX;\n\tfor (int i = 
idx + 1; i < numPoints; i++) {\n\t\tdouble distance = distanceBetweenPoints(points[idx], points[i]);\n\t\tif (distance < minDistance) {\n\t\t\tminDistance = distance;\n\t\t}\n\t}\n\n\tif (idx == 0) {\n\t\t*distance = minDistance;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tdouble minDist = DBL_MAX;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int j = i + 1; j < numPoints; j++) {\n\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tdouble minDist = DBL_MAX;\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\tdistance[tid] = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + 
"\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int j = i + 1; j < numPoints; j++) {\n\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all ranks\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_distance = DBL_MAX;\n\t#pragma omp parallel for reduction(min: min_distance)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t for (size_t j = i + 1; j < points.size(); ++j) {\n\t min_distance = std::min(min_distance, distance(points[i], points[j]));\n\t }\n\t}\n\n\tif (rank == 0) {\n\t return min_distance;\n\t}\n\treturn 0.0;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for 
reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Gather all the local minimum distances on rank 0\n\tstd::vector allMinDists(size);\n\tMPI_Gather(&minDist, 1, MPI_DOUBLE, allMinDists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the global minimum distance\n\tif (rank == 0) {\n\t\tfor (double dist : allMinDists) {\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocal_min_dist = std::min(local_min_dist, dist);\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all ranks\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local subvector\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (int i = 0; i < points.size(); ++i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tlocalMinDist = std::min(localMinDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t} else 
{\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the closest pair on this rank\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN);\n\n\t// Return the global minimum distance\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma opm parallel for reduction(min:min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i+1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\treturn min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\telse {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t} else {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocalMinDist = std::min(localMinDist, dist);\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = 
std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min_dist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocal_min_dist = std::min(local_min_dist, dist);\n\t\t}\n\t}\n\n\t// Gather all local minima on rank 0\n\tstd::vector all_min_dists(size);\n\tMPI_Gather(&local_min_dist, 1, MPI_DOUBLE, all_min_dists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 finds the global minimum\n\tdouble global_min_dist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double dist : all_min_dists) {\n\t\t\tglobal_min_dist = std::min(global_min_dist, dist);\n\t\t}\n\t}\n\n\t// Return the global minimum on rank 0\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "(1.41421, 1.41421)", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tlocalMinDist = std::min(localMinDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t}\n\treturn -1;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(Kokkos::View &points, Kokkos::View &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Sort points by x-coordinate\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return points_sorted(i)->x < points_sorted(j)->x;\n });\n\n // Find the upper and lower hulls\n auto upper_hull = hull.slice(0, hull.extent(0) / 2);\n auto lower_hull = hull.slice(hull.extent(0) / 2, hull.extent(0));\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n while (upper_hull_size >= 2 &&\n (upper_hull(upper_hull_size - 1)->y - upper_hull(upper_hull_size - 2)->y) *\n (points_sorted(i)->x - upper_hull(upper_hull_size - 1)->x) <\n (upper_hull(upper_hull_size - 1)->x - upper_hull(upper_hull_size - 2)->x) *\n (points_sorted(i)->y - upper_hull(upper_hull_size - 1)->y)) {\n upper_hull_size--;\n }\n upper_hull(upper_hull_size++) = points_sorted(i);\n }\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = points_sorted.extent(0) - 1; i >= 0; i--) {\n while (lower_hull_size >= 2 &&\n (lower_hull(lower_hull_size - 1)->y - lower_hull(lower_hull_size - 2)->y) *\n (points_sorted(i)->x - lower_hull(lower_hull_size - 1)->x) <\n (lower_hull(lower_hull_size - 1)->x - lower_hull(lower_hull_size - 2)->x) *\n (points_sorted(i)->y - lower_hull(lower_hull_size - 1)->y)) {\n lower_hull_size--;\n }\n lower_hull(lower_hull_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_hull_size + lower_hull_size - 1;\n for (int i = 0; i < lower_hull_size - 1; i++) {\n hull(hull_size--) = lower_hull(i);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int &i) {\n points_sorted(i) = points(i);\n });\n\n // Find the upper and lower hulls\n std::vector upper_hull;\n std::vector lower_hull;\n\n for (int i = 0; i < points_sorted.extent(0); i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upper_hull.size() >= 2 &&\n (upper_hull[upper_hull.size() - 2]->y - upper_hull[upper_hull.size() - 1]->y) *\n (points_sorted(i)->x - upper_hull[upper_hull.size() - 1]->x) <=\n (upper_hull[upper_hull.size() - 1]->y - 
points_sorted(i)->y) *\n (upper_hull[upper_hull.size() - 2]->x - upper_hull[upper_hull.size() - 1]->x)) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lower_hull.size() >= 2 &&\n (lower_hull[lower_hull.size() - 2]->y - lower_hull[lower_hull.size() - 1]->y) *\n (points_sorted(i)->x - lower_hull[lower_hull.size() - 1]->x) >=\n (lower_hull[lower_hull.size() - 1]->y - points_sorted(i)->y) *\n (lower_hull[lower_hull.size() - 2]->x - lower_hull[lower_hull.size() - 1]->x)) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 2);\n int j = 0;\n for (int i = 0; i < upper_hull.size() - 1; i++) {\n hull(j++) = upper_hull[i];\n }\n for (int i = lower_hull.size() - 2; i >= 0; i--) {\n hull(j++) = lower_hull[i];\n }\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_dimensions = points.extent(1);\n\n // Find the point with the minimum y-coordinate.\n int min_y_index = 0;\n for (int i = 1; i < num_points; ++i) {\n if (points(i, 1) < points(min_y_index, 1)) {\n min_y_index = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points(0, 0);\n points(0, 0) = points(min_y_index, 0);\n points(min_y_index, 0) = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double angle_a = atan2(a.y - points(0, 1), a.x - points(0, 0));\n double angle_b = atan2(b.y - points(0, 1), b.x - points(0, 0));\n return angle_a < angle_b;\n };\n parallel_sort(Kokkos::RangePolicy(1, num_points), points, compare_polar_angle);\n\n // Find the convex hull using Graham's scan.\n int hull_size = 0;\n for (int i = 0; i < num_points; ++i) {\n while (hull_size >= 2 && (points(hull_size - 1, 0) - points(hull_size - 2, 0)) * (points(i, 1) - points(hull_size - 2, 1)) - (points(hull_size - 1, 1) - points(hull_size - 2, 1)) * (points(i, 0) - points(hull_size - 2, 0)) < 0) {\n hull_size--;\n }\n hull(hull_size, 0) = points(i, 0);\n hull_size++;\n }\n\n // Resize the hull to its final size.\n hull.resize(hull_size, num_dimensions);\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the upper and lower hulls\n Kokkos::View upperHull(\"Upper Hull\", points.extent(0));\n Kokkos::View lowerHull(\"Lower Hull\", points.extent(0));\n int upperHullSize = 0;\n int lowerHullSize = 0;\n for (int i = 0; i < points.extent(0); i++) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHullSize >= 2 &&\n (upperHull(upperHullSize - 1)->y - upperHull(upperHullSize - 2)->y) *\n (points(i)->x - upperHull(upperHullSize - 2)->x) <\n (upperHull(upperHullSize - 1)->x - upperHull(upperHullSize - 2)->x) *\n (points(i)->y - upperHull(upperHullSize - 2)->y)) {\n upperHullSize--;\n }\n upperHull(upperHullSize++) = points(i);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while (lowerHullSize >= 2 &&\n (lowerHull(lowerHullSize - 1)->y - lowerHull(lowerHullSize - 2)->y) *\n (points(i)->x - lowerHull(lowerHullSize - 2)->x) >\n (lowerHull(lowerHullSize - 1)->x - lowerHull(lowerHullSize - 2)->x) *\n 
(points(i)->y - lowerHull(lowerHullSize - 2)->y)) {\n lowerHullSize--;\n }\n lowerHull(lowerHullSize++) = points(i);\n }\n\n // Merge the upper and lower hulls into the convex hull\n int hullSize = upperHullSize + lowerHullSize - 2;\n for (int i = 0; i < upperHullSize; i++) {\n hull(i) = upperHull(i);\n }\n for (int i = upperHullSize; i < hullSize; i++) {\n hull(i) = lowerHull(i - upperHullSize + 1);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the upper and lower hulls\n Kokkos::View upperHull(\"upperHull\", points.extent(0));\n Kokkos::View lowerHull(\"lowerHull\", points.extent(0));\n\n int upperHullSize = 0;\n int lowerHullSize = 0;\n\n for (int i = 0; i < points.extent(0); i++) {\n // Add the point to the upper hull if it is to the right of the current rightmost point\n while (upperHullSize >= 2 &&\n (upperHull(upperHullSize - 1)->y - upperHull(upperHullSize - 2)->y) *\n (points(i)->x - upperHull(upperHullSize - 2)->x) <=\n (upperHull(upperHullSize - 1)->x - upperHull(upperHullSize - 2)->x) *\n (points(i)->y - upperHull(upperHullSize - 2)->y)) {\n upperHullSize--;\n }\n upperHull(upperHullSize++) = points(i);\n\n // Add the point to the lower hull if it is to the left of the current leftmost point\n while (lowerHullSize >= 2 &&\n (lowerHull(lowerHullSize - 1)->y - lowerHull(lowerHullSize - 2)->y) *\n (points(i)->x - lowerHull(lowerHullSize - 2)->x) >=\n (lowerHull(lowerHullSize - 1)->x - lowerHull(lowerHullSize - 2)->x) *\n (points(i)->y - lowerHull(lowerHullSize - 2)->y)) {\n lowerHullSize--;\n }\n lowerHull(lowerHullSize++) = points(i);\n }\n\n // Merge the upper and lower hulls\n int hullSize = upperHullSize + lowerHullSize - 2;\n for (int i = 0; i < upperHullSize; i++) {\n hull(i) = upperHull(i);\n }\n for (int i = upperHullSize; i < hullSize; i++) {\n hull(i) = lowerHull(i - upperHullSize + 1);\n }\n\n // Return the hull\n hull.resize(hullSize);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points_sorted.extent(0)), points_sorted, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n while (upper_hull_size >= 2 && cross(upper_hull[upper_hull_size - 2], upper_hull[upper_hull_size - 1], points_sorted[i]) <= 0) {\n upper_hull_size--;\n }\n upper_hull[upper_hull_size++] = points_sorted[i];\n }\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = points_sorted.extent(0) - 1; i >= 0; i--) {\n while (lower_hull_size >= 2 && cross(lower_hull[lower_hull_size - 2], lower_hull[lower_hull_size - 1], points_sorted[i]) <= 0) {\n lower_hull_size--;\n }\n lower_hull[lower_hull_size++] = points_sorted[i];\n }\n\n // Merge the upper and lower hulls\n int hull_size = 0;\n for (int i = 0; i < upper_hull_size; i++) {\n hull[hull_size++] = upper_hull[i];\n }\n for (int i = lower_hull_size - 2; i >= 0; i--) {\n hull[hull_size++] = lower_hull[i];\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_size);\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_hull_points = 4;\n\n // Find the leftmost point\n int leftmost_index = 0;\n for (int i = 
1; i < num_points; ++i) {\n if (points(i).x < points(leftmost_index).x) {\n leftmost_index = i;\n }\n }\n\n // Sort the points by their polar angle with respect to the leftmost point\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double dx1 = a.x - points(leftmost_index).x;\n double dy1 = a.y - points(leftmost_index).y;\n double dx2 = b.x - points(leftmost_index).x;\n double dy2 = b.y - points(leftmost_index).y;\n return (atan2(dy1, dx1) < atan2(dy2, dx2));\n };\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, num_points), points_sorted, compare_polar_angle);\n\n // Find the convex hull using Graham's scan\n int hull_index = 0;\n for (int i = 0; i < num_points; ++i) {\n while (hull_index >= 2 && (points_sorted(i).x - hull(hull_index - 1).x) * (hull(hull_index - 2).y - hull(hull_index - 1).y) - (points_sorted(i).y - hull(hull_index - 1).y) * (hull(hull_index - 2).x - hull(hull_index - 1).x) < 0) {\n hull_index--;\n }\n hull(hull_index++) = points_sorted(i);\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_index);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, points_sorted.extent(0)),\n KOKKOS_LAMBDA(const int &i, const auto &upper_hull) {\n auto &hull = upper_hull.access();\n if (i == 0) {\n hull[0] = points_sorted(i);\n return hull;\n }\n while (hull.extent(0) >= 2 &&\n (hull(hull.extent(0) - 2).y - hull(hull.extent(0) - 1).y) *\n (points_sorted(i).x - hull(hull.extent(0) - 1).x) <\n (hull(hull.extent(0) - 1).y - points_sorted(i).y) *\n (hull(hull.extent(0) - 2).x - hull(hull.extent(0) - 1).x)) {\n hull.resize(hull.extent(0) - 1);\n }\n hull.resize(hull.extent(0) + 1);\n hull(hull.extent(0) - 1) = points_sorted(i);\n return hull;\n },\n Kokkos::View(nullptr, 0));\n\n auto lower_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1),\n KOKKOS_LAMBDA(const int &i, const auto &lower_hull) {\n auto &hull = lower_hull.access();\n if (i == points_sorted.extent(0) - 1) {\n hull[0] = points_sorted(i);\n return hull;\n }\n while (hull.extent(0) >= 2 &&\n (hull(hull.extent(0) - 2).y - hull(hull.extent(0) - 1).y) *\n (points_sorted(i).x - hull(hull.extent(0) - 1).x) <\n (hull(hull.extent(0) - 1).y - points_sorted(i).y) *\n (hull(hull.extent(0) - 2).x - hull(hull.extent(0) - 1).x)) {\n hull.resize(hull.extent(0) - 1);\n }\n hull.resize(hull.extent(0) + 1);\n hull(hull.extent(0) - 1) = points_sorted(i);\n return hull;\n },\n Kokkos::View(nullptr, 0));\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.extent(0) + lower_hull.extent(0) - 2);\n Kokkos::parallel_for(\n Kokkos::RangePolicy(0, upper_hull.extent(0)),\n KOKKOS_LAMBDA(const int &i) { hull(i) = upper_hull(i); });\n Kokkos::parallel_for(\n Kokkos::RangePolicy(0, lower_hull.extent(0) - 1),\n KOKKOS_LAMBDA(const int &i) { hull(upper_hull.extent(0) + i) = lower_hull(i); });\n}\n", + "\n\n using Kok\u30b3\u30af::parallel_for;\n using Kok\u30b3\u30af::sort;\n\n // Sort the points by their x-coordinates.\n sort(Kok\u30b3\u30af::begin(points), Kok\u30b3\u30af::end(points), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper hull.\n int upperHullSize = 0;\n for (int i = 0; i < points.size(); ++i) {\n while (upperHullSize >= 2 && Kok\u30b3\u30af::cross(hull[upperHullSize - 
2], hull[upperHullSize - 1], points[i]) <= 0) {\n --upperHullSize;\n }\n hull[upperHullSize++] = points[i];\n }\n\n // Find the lower hull.\n int lowerHullSize = 0;\n for (int i = points.size() - 1; i >= 0; --i) {\n while (lowerHullSize >= 2 && Kok\u30b3\u30af::cross(hull[lowerHullSize - 2], hull[lowerHullSize - 1], points[i]) <= 0) {\n --lowerHullSize;\n }\n hull[lowerHullSize++] = points[i];\n }\n\n // Remove the duplicate points from the lower hull.\n while (lowerHullSize > 0 && hull[lowerHullSize - 1] == hull[lowerHullSize - 2]) {\n --lowerHullSize;\n }\n\n // Combine the upper and lower hulls.\n hull.resize(upperHullSize + lowerHullSize - 1);\n std::copy(hull.begin(), hull.begin() + upperHullSize, hull.begin());\n std::copy(hull.begin() + upperHullSize, hull.begin() + upperHullSize + lowerHullSize - 1, hull.begin() + upperHullSize);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points_sorted.extent(0)), KOKKOS_LAMBDA(const int &i, auto &upper_hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n upper_hull.push_back(points_sorted(i));\n } else {\n while (upper_hull.size() >= 2 && cross(upper_hull[upper_hull.size() - 2], upper_hull[upper_hull.size() - 1], points_sorted(i)) <= 0) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n }\n });\n\n auto lower_hull = Kokkos::parallel_reduce(Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1), KOKKOS_LAMBDA(const int &i, auto &lower_hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n lower_hull.push_back(points_sorted(i));\n } else {\n while (lower_hull.size() >= 2 && cross(lower_hull[lower_hull.size() - 2], lower_hull[lower_hull.size() - 1], points_sorted(i)) <= 0) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n });\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 2);\n std::copy(upper_hull.begin(), upper_hull.end() - 1, hull.begin());\n std::copy(lower_hull.begin() + 1, lower_hull.end(), hull.begin() + upper_hull.size() - 1);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points_sorted(i)->x < points_sorted(j)->x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n int upper_hull_size = 0;\n int lower_hull_size = 0;\n\n // Add the first point to the upper hull\n upper_hull(upper_hull_size++) = points_sorted(0);\n\n // Add the last point to the lower hull\n lower_hull(lower_hull_size++) = points_sorted(points.extent(0) - 1);\n\n // Iterate over the remaining points\n for (int i = 1; i < points.extent(0) - 1; i++) {\n // Add the point to the upper hull\n while (upper_hull_size >= 2 &&\n (points_sorted(i)->y - upper_hull(upper_hull_size - 2)->y) *\n (upper_hull(upper_hull_size - 1)->x - upper_hull(upper_hull_size - 2)->x) <=\n (upper_hull(upper_hull_size - 1)->y - upper_hull(upper_hull_size - 2)->y) *\n (points_sorted(i)->x - upper_hull(upper_hull_size - 2)->x)) {\n upper_hull_size--;\n }\n upper_hull(upper_hull_size++) = points_sorted(i);\n\n // Add the point to the lower hull\n while (lower_hull_size >= 2 &&\n (points_sorted(i)->y - lower_hull(lower_hull_size - 
2)->y) *\n (lower_hull(lower_hull_size - 1)->x - lower_hull(lower_hull_size - 2)->x) >=\n (lower_hull(lower_hull_size - 1)->y - lower_hull(lower_hull_size - 2)->y) *\n (points_sorted(i)->x - lower_hull(lower_hull_size - 2)->x)) {\n lower_hull_size--;\n }\n lower_hull(lower_hull_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_hull_size + lower_hull_size - 2;\n for (int i = 0; i < upper_hull_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_hull_size; i < hull_size; i++) {\n hull(i) = lower_hull(i - upper_hull_size + 1);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinate\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int &i) {\n points_sorted(i) = points(i);\n });\n\n // Find the upper and lower hulls\n std::vector upper_hull, lower_hull;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upper_hull.size() >= 2 && cross(upper_hull[upper_hull.size() - 2], upper_hull[upper_hull.size() - 1], points_sorted(i)) <= 0) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lower_hull.size() >= 2 && cross(lower_hull[lower_hull.size() - 2], lower_hull[lower_hull.size() - 1], points_sorted(i)) >= 0) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n\n // Merge the upper and lower hulls\n for (int i = lower_hull.size() - 2; i >= 0; i--) {\n upper_hull.push_back(lower_hull[i]);\n }\n\n // Copy the convex hull to the output view\n Kokkos::parallel_for(Kokkos::RangePolicy(0, upper_hull.size()), KOKKOS_LAMBDA(const int &i) {\n hull(i) = upper_hull[i];\n });\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, points_sorted.extent(0)),\n KOKKOS_LAMBDA(const int &i, auto &hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n hull.push_back(points_sorted(i));\n } else {\n while (hull.size() >= 2 &&\n (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points_sorted(i).x - hull[hull.size() - 1].x) <\n (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points_sorted(i).y - hull[hull.size() - 1].y)) {\n hull.pop_back();\n }\n hull.push_back(points_sorted(i));\n }\n },\n hull);\n\n auto lower_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1),\n KOKKOS_LAMBDA(const int &i, auto &hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n hull.push_back(points_sorted(i));\n } else {\n while (hull.size() >= 2 &&\n (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points_sorted(i).x - hull[hull.size() - 1].x) <\n (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points_sorted(i).y - hull[hull.size() - 1].y)) {\n hull.pop_back();\n }\n hull.push_back(points_sorted(i));\n }\n },\n hull);\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 1);\n std::copy(upper_hull.begin(), upper_hull.end(), hull.begin());\n std::copy(lower_hull.begin() + 1, lower_hull.end(), hull.begin() + upper_hull.size());\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int 
num_dimensions = points.extent(1);\n const int num_hull_points = 0;\n\n // Find the point with the minimum y-coordinate.\n int min_y_index = 0;\n for (int i = 1; i < num_points; i++) {\n if (points(i, 1) < points(min_y_index, 1)) {\n min_y_index = i;\n }\n }\n\n // Swap the point with the minimum y-coordinate to the first position.\n Point temp = points(0, 0);\n points(0, 0) = points(min_y_index, 0);\n points(min_y_index, 0) = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double angle_a = atan2(a.y - points(0, 1), a.x - points(0, 0));\n double angle_b = atan2(b.y - points(0, 1), b.x - points(0, 0));\n return angle_a < angle_b;\n };\n Kokkos::sort(Kokkos::RangePolicy(1, num_points), points, compare_polar_angle);\n\n // Find the convex hull using Graham's scan.\n int hull_size = 0;\n for (int i = 0; i < num_points; i++) {\n while (hull_size >= 2 && (points(hull[hull_size - 2], 0) - points(hull[hull_size - 1], 0)) * (points(i, 1) - points(hull[hull_size - 1], 1)) - (points(i, 0) - points(hull[hull_size - 1], 0)) * (points(hull[hull_size - 2], 1) - points(hull[hull_size - 1], 1)) < 0) {\n hull_size--;\n }\n hull[hull_size++] = i;\n }\n\n // Resize the hull to the correct size.\n hull = Kokkos::View(hull, hull_size);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)), points_sorted, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n int upper_size = 0;\n int lower_size = 0;\n\n // Add the first point to the upper hull\n upper_hull(upper_size++) = points_sorted(0);\n\n // Add the last point to the lower hull\n lower_hull(lower_size++) = points_sorted(points_sorted.extent(0) - 1);\n\n // Iterate over the remaining points\n for (int i = 1; i < points_sorted.extent(0) - 1; i++) {\n // Add the point to the upper hull\n while (upper_size >= 2 && (upper_hull(upper_size - 2).y - upper_hull(upper_size - 1).y) * (points_sorted(i).x - upper_hull(upper_size - 1).x) < (upper_hull(upper_size - 1).y - points_sorted(i).y) * (upper_hull(upper_size - 2).x - upper_hull(upper_size - 1).x)) {\n upper_size--;\n }\n upper_hull(upper_size++) = points_sorted(i);\n\n // Add the point to the lower hull\n while (lower_size >= 2 && (lower_hull(lower_size - 2).y - lower_hull(lower_size - 1).y) * (points_sorted(i).x - lower_hull(lower_size - 1).x) < (lower_hull(lower_size - 1).y - points_sorted(i).y) * (lower_hull(lower_size - 2).x - lower_hull(lower_size - 1).x)) {\n lower_size--;\n }\n lower_hull(lower_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_size + lower_size - 2;\n for (int i = 0; i < upper_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_size; i < hull_size; i++) {\n hull(i) = lower_hull(lower_size - (i - upper_size + 1));\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_size);\n}\n", + "\n\n // Find the leftmost point\n auto leftmost = points.reduce(\n KOKKOS_LAMBDA(const Point &a, const Point &b) { return a.x < b.x ? 
a : b; },\n points.extent(0));\n\n // Sort the points by the angle they make with the leftmost point\n Kokkos::View sorted_points(\"sorted_points\", points.extent(0));\n Kokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i) {\n sorted_points[i] = &points[i];\n });\n\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n sorted_points,\n KOKKOS_LAMBDA(const Point *&a, const Point *&b) {\n return std::atan2(a->y - leftmost->y, a->x - leftmost->x) <\n std::atan2(b->y - leftmost->y, b->x - leftmost->x);\n });\n\n // Find the convex hull using Graham scan\n Kokkos::View stack(\"stack\", points.extent(0));\n int top = 0;\n for (int i = 0; i < points.extent(0); i++) {\n Point *p = sorted_points[i];\n while (top >= 2 &&\n (stack[top - 1]->y - stack[top - 2]->y) * (p->x - stack[top - 2]->x) <\n (stack[top - 1]->x - stack[top - 2]->x) * (p->y - stack[top - 2]->y))\n top--;\n stack[top++] = p;\n }\n\n // Copy the convex hull to the output view\n Kokkos::parallel_for(Kokkos::RangePolicy(0, top),\n KOKKOS_LAMBDA(const int &i) { hull[i] = stack[i]; });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int N = points.extent(0);\n const int HULL_SIZE = 2 * N;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n parallel_sort(Kokkos::Cuda(), points_sorted, [=](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the lower and upper hulls\n auto lower_hull = hull.slice(0, N);\n auto upper_hull = hull.slice(N, HULL_SIZE);\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = 0; i < N; i++) {\n while (lower_hull_size >= 2 && cross(lower_hull[lower_hull_size - 2], lower_hull[lower_hull_size - 1], points_sorted[i]) <= 0) {\n lower_hull_size--;\n }\n lower_hull[lower_hull_size++] = points_sorted[i];\n }\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = N - 1; i >= 0; i--) {\n while (upper_hull_size >= 2 && cross(upper_hull[upper_hull_size - 2], upper_hull[upper_hull_size - 1], points_sorted[i]) <= 0) {\n upper_hull_size--;\n }\n upper_hull[upper_hull_size++] = points_sorted[i];\n }\n\n // Merge the lower and upper hulls\n int hull_size = lower_hull_size + upper_hull_size - 1;\n for (int i = 0; i < hull_size; i++) {\n hull[i] = (i < lower_hull_size) ? 
lower_hull[i] : upper_hull[i - lower_hull_size];\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort points by x-coordinate\n auto points_view = Kokkos::subview(points, Kokkos::ALL(), Kokkos::ALL(), Kokkos::Rank<2>{0, 1});\n auto sorted_points = points_view;\n Kokkos::sort(Kokkos::RangePolicy(0, points_view.extent(0)),\n KOKKOS_LAMBDA(const int i, const int j) {\n return sorted_points(i, 0).x < sorted_points(j, 0).x;\n });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::View(Kokkos::ALL(), \"Upper Hull\");\n auto lower_hull = Kokkos::View(Kokkos::ALL(), \"Lower Hull\");\n int upper_size = 0, lower_size = 0;\n\n // Find the upper hull\n for (int i = 0; i < points_view.extent(0); i++) {\n while (upper_size >= 2 &&\n (upper_hull(upper_size - 1).y - upper_hull(upper_size - 2).y) *\n (sorted_points(i, 0).x - upper_hull(upper_size - 2).x) <\n (upper_hull(upper_size - 1).x - upper_hull(upper_size - 2).x) *\n (sorted_points(i, 0).y - upper_hull(upper_size - 2).y)) {\n upper_size--;\n }\n upper_hull(upper_size++) = sorted_points(i, 0);\n }\n\n // Find the lower hull\n for (int i = points_view.extent(0) - 1; i >= 0; i--) {\n while (lower_size >= 2 &&\n (lower_hull(lower_size - 1).y - lower_hull(lower_size - 2).y) *\n (sorted_points(i, 0).x - lower_hull(lower_size - 2).x) <\n (lower_hull(lower_size - 1).x - lower_hull(lower_size - 2).x) *\n (sorted_points(i, 0).y - lower_hull(lower_size - 2).y)) {\n lower_size--;\n }\n lower_hull(lower_size++) = sorted_points(i, 0);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_size + lower_size - 2;\n hull = Kokkos::View(hull_size, \"Convex Hull\");\n for (int i = 0; i < upper_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_size; i < hull_size; i++) {\n hull(i) = lower_hull(i - upper_size + lower_size - 1);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n const int N = points.extent(0);\n const int M = hull.extent(0);\n\n auto points_d = create_mirror_view(points);\n auto hull_d = create_mirror_view(hull);\n\n parallel_for(N, KOKKOS_LAMBDA(const int i) {\n points_d(i) = points(i);\n });\n\n parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &min_index) {\n if (points_d(i).y < points_d(min_index).y || (points_d(i).y == points_d(min_index).y && points_d(i).x < points_d(min_index).x)) {\n min_index = i;\n }\n }, 0);\n\n int p = 0;\n hull_d(p) = points_d(min_index);\n\n int q;\n do {\n q = (p + 1) % N;\n for (int i = 0; i < N; i++) {\n if (orientation(points_d(p), points_d(i), points_d(q)) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != min_index);\n\n int count = 0;\n do {\n hull_d(count) = points_d(p);\n count++;\n q = (p + 1) % N;\n for (int i = 0; i < N; i++) {\n if (orientation(points_d(p), points_d(i), points_d(q)) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != min_index);\n\n parallel_for(count, KOKKOS_LAMBDA(const int i) {\n hull(i) = hull_d(i);\n });\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the leftmost and rightmost points\n const Point *leftmost = points(0);\n const Point *rightmost = points(points.extent(0) - 1);\n\n // Create a stack to store the points on the hull\n std::stack stack;\n\n // Add the leftmost and rightmost points to the stack\n stack.push(leftmost);\n stack.push(rightmost);\n\n // Iterate over the remaining points\n for (int i = 1; i < points.extent(0) - 1; i++) {\n const Point *p = points(i);\n\n // 
Check if the point is to the left of the line defined by the last two points on the stack\n if (orientation(stack.top(), stack.top()->next, p) == LEFT) {\n // If it is, pop the last point from the stack\n stack.pop();\n // And add the new point to the stack\n stack.push(p);\n }\n }\n\n // Copy the points from the stack to the hull\n int hull_size = stack.size();\n for (int i = 0; i < hull_size; i++) {\n hull(i) = stack.top();\n stack.pop();\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Find the point with the smallest y-coordinate.\n int min_index = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].y < points[min_index].y) {\n min_index = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n std::swap(points[0], points[min_index]);\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points.begin() + 1, points.end(), [&](const Point& a, const Point& b) {\n return (atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x));\n });\n\n // Initialize the convex hull with the first three points.\n hull.push_back(points[0]);\n hull.push_back(points[1]);\n hull.push_back(points[2]);\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < points.size(); i++) {\n while (hull.size() >= 2 && (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y) - (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) >= 0) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Find the first point in the upper hull that is to the right of the current point\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Find the first point in the lower hull that is to the left of the current point\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >=\n 
(lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the leftmost point\n Point leftmost = points[0];\n for (const Point& p : points) {\n if (p.x < leftmost.x) {\n leftmost = p;\n }\n }\n\n // Sort the points by their angle with the leftmost point\n std::sort(points.begin(), points.end(), [&leftmost](const Point& a, const Point& b) {\n return std::atan2(a.y - leftmost.y, a.x - leftmost.x) < std::atan2(b.y - leftmost.y, b.x - leftmost.x);\n });\n\n // Find the convex hull using Graham's scan\n std::stack convex_hull;\n convex_hull.push(leftmost);\n convex_hull.push(points[1]);\n for (size_t i = 2; i < points.size(); i++) {\n Point top = convex_hull.top();\n while (convex_hull.size() >= 2 && (top.x - convex_hull.at(convex_hull.size() - 2).x) * (points[i].y - top.y) - (top.y - convex_hull.at(convex_hull.size() - 2).y) * (points[i].x - top.x) < 0) {\n convex_hull.pop();\n top = convex_hull.top();\n }\n convex_hull.push(points[i]);\n }\n\n // Copy the convex hull to the output vector\n while (!convex_hull.empty()) {\n hull.push_back(convex_hull.top());\n convex_hull.pop();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": ".", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n 
hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the leftmost and rightmost points\n Point leftmost = points[0];\n Point rightmost = points[points.size() - 1];\n\n // Find the points that are on the upper and lower hulls\n std::vector upperHull;\n std::vector lowerHull;\n\n #pragma omp parallel sections\n {\n #pragma omp section\n {\n upperHull.push_back(leftmost);\n for (int i = 1; i < points.size(); i++) {\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n }\n }\n\n #pragma omp section\n {\n lowerHull.push_back(rightmost);\n for (int i = points.size() - 2; i >= 0; i--) {\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) <= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n }\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size());\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.begin(), lowerHull.end());\n\n // Remove the duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the lower hull\n for (const Point& point : points) {\n while (hull.size() >= 2 && (hull.back() - hull[hull.size() - 2]).cross(point - hull.back()) <= 0) {\n hull.pop_back();\n }\n hull.push_back(point);\n }\n\n // Find the upper hull\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() >= 2 && (hull.back() - hull[hull.size() - 2]).cross(points[i] - hull.back()) <= 0) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = points.size();\n if (n < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n int l = 0, r = 0;\n hull.push_back(points[l]);\n for (int i = 1; i < n; i++) {\n while (l >= 1 && (points[i].y - hull[l - 1].y) * (hull[l].x - hull[l - 1].x) <= (points[i].x - hull[l - 1].x) * (hull[l].y - hull[l - 1].y))\n l--;\n while (r >= 1 && (points[i].y - hull[r - 1].y) * (hull[r].x - hull[r - 1].x) <= (points[i].x - hull[r - 1].x) * (hull[r].y - hull[r - 1].y))\n r--;\n 
hull.push_back(points[i]);\n l++;\n r++;\n }\n\n hull.push_back(points[n - 1]);\n for (int i = n - 2; i >= 0; i--) {\n while (l >= 1 && (points[i].y - hull[l - 1].y) * (hull[l].x - hull[l - 1].x) <= (points[i].x - hull[l - 1].x) * (hull[l].y - hull[l - 1].y))\n l--;\n while (r >= 1 && (points[i].y - hull[r - 1].y) * (hull[r].x - hull[r - 1].x) <= (points[i].x - hull[r - 1].x) * (hull[r].y - hull[r - 1].y))\n r--;\n hull.push_back(points[i]);\n l++;\n r++;\n }\n\n hull.pop_back();\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n hull.pop_back();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the leftmost point\n int l = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[l].x) {\n l = i;\n }\n }\n\n int p = l, q;\n do {\n hull.push_back(points[p]);\n q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != l);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y 
- point.y) *\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) *\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Combine the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), 
lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the leftmost point\n int l = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[l].x) {\n l = i;\n }\n }\n\n int p = l;\n int q;\n do {\n hull.push_back(points[p]);\n q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == -1) {\n q = i;\n }\n }\n p = q;\n } while (p != l);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the leftmost point\n int leftMost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftMost].x) {\n leftMost = i;\n }\n }\n\n // Start from leftmost point, keep moving counterclockwise until we reach the start point again\n int p = leftMost;\n do {\n // Add current point to result\n hull.push_back(points[p]);\n\n // Search for a point 'q' such that the vector 'p-q' has the smallest positive orientation.\n int q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == 2) {\n q = i;\n }\n }\n\n // If 'q' is not 'p', then update 'p' and continue\n p = q;\n\n } while (p != leftMost);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n 
std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (point.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - point.x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (point.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - point.x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n Point p = points[leftmost];\n hull.push_back(p);\n points.erase(points.begin() + leftmost);\n\n while (points.size() > 0) {\n int next = -1;\n for (int i = 0; i < points.size(); i++) {\n if (next == -1 || (orientation(p, points[i], points[next]) == 2)) {\n next = i;\n }\n }\n p = points[next];\n hull.push_back(p);\n points.erase(points.begin() + next);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) *\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) *\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n 
hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "serial", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. 
Store the result in `hull`.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Find the leftmost point\n int leftMost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftMost].x) {\n leftMost = i;\n }\n }\n\n // Start from the leftmost point and keep moving counter-clockwise until we reach the leftmost point again\n Point p = points[leftMost];\n do {\n hull.push_back(p);\n int next = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i] != p && orientation(p, points[i], points[next]) == 2) {\n next = i;\n }\n }\n p = points[next];\n } while (p != points[leftMost]);\n}\n", + "\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n int n = points.size();\n std::vector up(n), down(n);\n int m = 0, k = 0;\n\n for (int i = 0; i < n; i++) {\n if ((i == 0) || ((i == 1) && (points[i].x == points[i - 1].x) && (points[i].y == points[i - 1].y)))\n continue;\n down[m++] = points[i];\n }\n\n for (int i = n - 1; i >= 0; i--) {\n if ((i == n - 1) || ((i == n - 2) && (points[i].x == points[i + 1].x) && (points[i].y == points[i + 1].y)))\n continue;\n up[k++] = points[i];\n }\n\n hull.clear();\n int t[m + k + 1];\n int q = 0;\n for (int i = 0; i < m; i++)\n t[q++] = i;\n for (int i = k - 1; i >= 0; i--)\n t[q++] = i + m;\n t[q] = t[0];\n t[q + 1] = t[1];\n\n for (int i = 0; i <= q; i++) {\n int a = t[i], b = t[i + 1];\n if (a == b)\n continue;\n int c = t[i + 2];\n if (a != c) {\n hull.push_back(down[a]);\n continue;\n }\n int x1 = down[a].x, y1 = down[a].y;\n int x2 = down[b].x, y2 = down[b].y;\n int x3 = down[c].x, y3 = down[c].y;\n int dx1 = x2 - x1, dy1 = y2 - y1;\n int dx2 = x3 - x2, dy2 = y3 - y2;\n int cross = dx1 * dy2 - dx2 * dy1;\n if (cross < 0)\n hull.push_back(down[a]);\n else if (cross > 0)\n hull.push_back(down[b]);\n else {\n hull.push_back(down[a]);\n hull.push_back(down[b]);\n }\n }\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n int n = points.size();\n std::vector upper(n);\n std::vector lower(n);\n\n int m = 0;\n for (int i = 0; i < n; i++) {\n while (m >= 2 && (upper[m - 1].y - upper[m - 2].y) * (points[i].x - upper[m - 2].x) <=\n (upper[m - 1].x - upper[m - 2].x) * (points[i].y - upper[m - 2].y)) {\n m--;\n }\n upper[m++] = points[i];\n }\n\n m = 0;\n for (int i = n - 1; i >= 0; i--) {\n while (m >= 2 && (lower[m - 1].y - lower[m - 2].y) * (points[i].x - lower[m - 2].x) <=\n (lower[m - 1].x - lower[m - 2].x) * (points[i].y - lower[m - 2].y)) {\n m--;\n }\n lower[m++] = points[i];\n }\n\n for (int i = 0; i < m; i++) {\n hull.push_back(upper[i]);\n }\n for (int i = m - 2; i >= 0; i--) {\n hull.push_back(lower[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x;\n });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && 
crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Find the first point in the upper hull that is to the right of the current point\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x) *\n (point.y - upperHull[upperHull.size() - 1].y) -\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Find the first point in the lower hull that is to the left of the current point\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x) *\n (point.y - lowerHull[lowerHull.size() - 1].y) -\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n hull.pop_back();\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n hull.push_back(points[0]);\n hull.push_back(points[1]);\n\n for (size_t i = 2; i < points.size(); i++) {\n while (hull.size() >= 2 &&\n (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <\n (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n int t = hull.size();\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() > t &&\n (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <\n (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n hull.pop_back();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower 
hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n std::vector upper;\n for (auto const& point : points) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper.back(), point) <= 0) {\n upper.pop_back();\n }\n upper.push_back(point);\n }\n upper.pop_back();\n\n std::vector lower;\n for (auto const& point : points) {\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower.back(), point) >= 0) {\n lower.pop_back();\n }\n lower.push_back(point);\n }\n lower.pop_back();\n\n hull.insert(hull.end(), upper.begin(), upper.end());\n hull.insert(hull.end(), lower.rbegin(), lower.rend());\n}\n", + "\n\n if (points.size() <= 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n std::vector upper;\n for (const Point& p : points) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], p) <= 0)\n upper.pop_back();\n upper.push_back(p);\n }\n upper.pop_back();\n\n std::vector lower;\n for (int i = points.size() - 1; i >= 0; i--) {\n const Point& p = points[i];\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], p) <= 0)\n lower.pop_back();\n lower.push_back(p);\n }\n lower.pop_back();\n\n hull.insert(hull.end(), upper.begin(), upper.end());\n hull.insert(hull.end(), lower.rbegin(), lower.rend());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) == -1) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) == 1) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it 
is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Combine the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n int n = points.size();\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n std::vector upper;\n for (int i = 0; i < n; i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], points[i]) <= 0)\n upper.pop_back();\n upper.push_back(points[i]);\n }\n\n std::vector lower;\n for (int i = n - 1; i >= 0; i--) {\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], points[i]) <= 0)\n lower.pop_back();\n lower.push_back(points[i]);\n }\n\n for (int i = 1; i < upper.size() - 1; i++)\n hull.push_back(upper[i]);\n for (int i = lower.size() - 2; i >= 1; i--)\n hull.push_back(lower[i]);\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 
0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull.back(), point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull.back(), point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper hull\n for (const Point& point : points) {\n while (hull.size() >= 2 && (hull.back().y - hull[hull.size() - 2].y) * (point.x - hull[hull.size() - 2].x) <=\n (hull.back().x - hull[hull.size() - 2].x) * (point.y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(point);\n }\n\n // Find the lower hull\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() >= 2 && (hull.back().y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) <=\n (hull.back().x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinate\n std::sort(points.begin(), points.end(), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper convex hull\n for (const Point &p : points) {\n while (hull.size() >= 2 && cross(hull[hull.size() - 2], hull[hull.size() - 1], p) <= 0) {\n hull.pop_back();\n }\n hull.push_back(p);\n }\n\n // Find the lower convex hull\n std::reverse(points.begin(), points.end());\n for (const Point &p : points) {\n while (hull.size() >= 2 && cross(hull[hull.size() - 2], hull[hull.size() - 1], p) <= 0) {\n hull.pop_back();\n }\n hull.push_back(p);\n }\n\n // Remove the last point from the lower convex hull\n hull.pop_back();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x;\n });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], 
upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point a, Point b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point p : points) {\n // Build the lower hull\n while (lowerHull.size() >= 2 &&\n orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], p) <= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(p);\n\n // Build the upper hull\n while (upperHull.size() >= 2 &&\n orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], p) >= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(p);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), lowerHull.begin(), lowerHull.end());\n for (int i = upperHull.size() - 2; i >= 0; i--) {\n hull.push_back(upperHull[i]);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "hip", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\n__global__ void convexHull(const Point *points, size_t numPoints, Point *hull, size_t hullSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int numThreads = hipGetNumThreads();\n int threadIdx = hipGetThreadIdx();\n int blockIdx = hipGetBlockIdx();\n int blockDim = hipGetBlockDim();\n int gridDim = hipGetGridDim();\n\n // Each thread computes the convex hull of a subset of points\n int start = threadIdx.x + blockIdx.x * blockDim.x;\n int end = fmin(start + numThreads, numPoints);\n\n // Compute the convex hull of the subset of points\n vector subset(end - start);\n for (int i = start; i < end; i++) {\n subset[i - start] = points[i];\n }\n vector hullSubset = convexHull(subset);\n\n // Merge the convex hull of the subset of points with the overall convex hull\n for (int i = 0; i < hullSubset.size(); i++) {\n hull[hullSize + i] = hullSubset[i];\n }\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize > 2 && crossProduct(hull[hullSize - 2], hull[hullSize - 1], points[i]) <= 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size() - 2;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[i] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[upperHull.size() + i - 1] = lowerHull[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by polar angle with respect to the 
leftmost point\n std::sort(points, points + numPoints, [leftmost](const Point &a, const Point &b) {\n return atan2(a.y - points[leftmost].y, a.x - points[leftmost].x) < atan2(b.y - points[leftmost].y, b.x - points[leftmost].x);\n });\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last two points in the hull, add it to the hull\n while (hullSize >= 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < numPoints) {\n hull[gid] = points[gid];\n }\n\n __syncthreads();\n\n if (tid == 0) {\n std::sort(hull, hull + numPoints, [](const Point &a, const Point &b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n\n hullSize = m;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < numPoints) {\n hull[gid] = points[gid];\n }\n __syncthreads();\n\n if (tid == 0) {\n sort(hull, hull + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n for (int i = numPoints - 2, t = m + 1; i >= 0; i--) {\n while (t >= m + 1 && (hull[t - 1].y - hull[t - 2].y) * (hull[i].x - hull[t - 1].x) <\n (hull[t - 1].x - hull[t - 2].x) * (hull[i].y - hull[t - 1].y)) {\n t--;\n }\n hull[t++] = hull[i];\n }\n hullSize = t - m;\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n thrust::convex_hull(thrust::device, points, points + numPoints,\n std::back_inserter(upperHull), std::back_inserter(lowerHull));\n\n // Merge the upper and lower hulls into the final hull\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n\n // Return the size of the hull\n *hullSize = upperHull.size() + lowerHull.size();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while 
(lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Combine the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size() - 2;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[i] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[i + upperHull.size() - 1] = lowerHull[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the minimum point with the first point.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the remaining points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [&](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize > 2 && crossProduct(hull[hullSize - 2], hull[hullSize - 1], points[i]) <= 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by their polar angle with respect to the leftmost point\n auto cmp = [&](const Point &a, const Point &b) {\n return (a.y - points[leftmost].y) * (b.x - points[leftmost].x) - (a.x - points[leftmost].x) * (b.y - points[leftmost].y);\n };\n std::sort(points + leftmost + 1, points + numPoints, cmp);\n\n // Build the convex hull using Graham's scan\n int hullIndex = 0;\n for (int i = leftmost + 1; i < numPoints; i++) {\n while (hullIndex >= 2 && cmp(hull[hullIndex - 1], hull[hullIndex - 2], points[i])) {\n hullIndex--;\n }\n hull[hullIndex++] = points[i];\n }\n\n // Copy the convex hull to the output buffer\n std::copy(hull, hull + hullIndex, hull);\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((hull[hullSize - 1].x - points[i].x) * (hull[hullSize - 1].y - points[leftmost].y) -\n (hull[hullSize - 1].y - points[i].y) * (hull[hullSize - 1].x - points[leftmost].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n\n // If the current point is to the right of the last point in the hull, remove the last point from the hull\n else if ((hull[hullSize - 1].x - points[i].x) * (hull[hullSize - 1].y - points[leftmost].y) -\n (hull[hullSize - 1].y - points[i].y) * (hull[hullSize - 1].x - points[leftmost].x) >\n 0) {\n hullSize--;\n }\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n 
}\n\n // Initialize the hull\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the point is to the left of the line defined by the last two points in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hull[hullSize++] = points[i];\n }\n }\n\n // If the last point is not the same as the first point, add it to the hull\n if (hull[hullSize - 1] != hull[0]) {\n hull[hullSize++] = hull[0];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n\n // Find the convex hull using Graham's scan\n int hullSize = 1;\n for (int i = 1; i < numPoints; i++) {\n // Find the next point that is not on the same line as the previous two points\n while (hullSize > 1 &&\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 2].x - hull[hullSize - 1].x) <=\n (points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 2].y - hull[hullSize - 1].y)) {\n hullSize--;\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n }\n\n // If the last point in the hull is not the leftmost point, add it to the hull\n if (hull[hullSize - 1] != points[leftmost]) {\n hull[hullSize] = points[leftmost];\n hullSize++;\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Find the convex hull using Graham's scan\n int p = leftmost;\n do {\n // Find the next point that is not on the same line as the previous two points\n int q = (p + 1) % numPoints;\n for (int i = 0; i < numPoints; i++) {\n if (i != p && i != q &&\n (points[i].y - points[p].y) * (points[q].x - points[p].x) >\n (points[i].x - points[p].x) * (points[q].y - points[p].y)) {\n q = i;\n }\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[q];\n p = q;\n\n } while (p != leftmost);\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the remaining points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [&](const Point &a, const Point &b) {\n double 
angleA = atan2(a.y - points[0].y, a.x - points[0].x);\n double angleB = atan2(b.y - points[0].y, b.x - points[0].x);\n return angleA < angleB;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // TODO: Implement the convex hull algorithm\n // The kernel is launched with at least as many threads as points.\n // Store the result in `hull`.\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hullSize = 0;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[hullSize++] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[hullSize++] = lowerHull[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the minimum point with the first point.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize > 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n 
points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of points. The final hull is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n 
convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Send the leftmost point to all other ranks\n MPI_Bcast(&leftmost, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the convex hull on each rank\n std::vector localHull;\n grahamScan(points, localHull, leftmost);\n\n // Gather all the local hulls on rank 0\n if (rank == 0) {\n hull = localHull;\n for (int i = 1; i < size; i++) {\n std::vector recvHull;\n MPI_Recv(recvHull.data(), recvHull.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n hull.insert(hull.end(), recvHull.begin(), recvHull.end());\n }\n } else {\n MPI_Send(localHull.data(), localHull.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n\n // Remove duplicate points from the hull\n std::sort(hull.begin(), hull.end());\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Find the leftmost point\n int 
leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull.push_back(points[leftmost]);\n\n // Find the convex hull using Graham's scan\n int p = leftmost;\n do {\n // Find the next point that is not on the same line as the previous two points\n int q = (p + 1) % points.size();\n while (orientation(points[p], points[q], points[(q + 1) % points.size()]) == 0) {\n q = (q + 1) % points.size();\n }\n\n // Add the next point to the hull\n hull.push_back(points[q]);\n\n // Update the current point\n p = q;\n } while (p != leftmost);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the points by x-coordinate\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n\n // Return the hull on rank 0\n if (rank == 0) {\n return hull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n if (rank == 0) {\n hull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, hull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(hull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Start from the leftmost point, keep moving counter-clockwise\n // until we reach the start point again\n int p = leftmost;\n do {\n hull.push_back(points[p]);\n int q = (p + 1) % points.size();\n\n // Find the most counter-clockwise point from p\n for (int i = 0; i < points.size(); i++) {\n // Skip if i is the same as p or q\n if (i == p || i == q) {\n continue;\n }\n\n // Calculate the cross product of (q - p) x (i - p)\n double cross = (points[q].x - points[p].x) * (points[i].y - points[p].y) -\n (points[q].y - points[p].y) * (points[i].x - points[p].x);\n\n // If the cross product is positive, then i is more counter-clockwise than q\n if (cross > 0) {\n q = i;\n }\n }\n\n // Update p to be q\n p = q;\n } while (p != leftmost);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n 
MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the leftmost point\n Point leftmost = points[0];\n for (size_t i = 1; i < points.size(); i++) {\n if (points[i].x < leftmost.x) {\n leftmost = points[i];\n }\n }\n\n // Broadcast the leftmost point to all ranks\n MPI_Bcast(&leftmost, sizeof(Point), MPI_BYTE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull on each rank\n std::vector localHull;\n convexHull(points, leftmost, localHull);\n\n // Gather the local convex hull from all ranks\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "cuda", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\n__global__ void convexHull(const Point *points, size_t numPoints, Point *hull, size_t hullSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n if (tid == 0) {\n qsort(hull, numPoints, sizeof(Point), [](const void *a, const void *b) {\n Point *pa = (Point *)a;\n Point *pb = (Point *)b;\n if (pa->x != pb->x) {\n return pa->x < pb->x ? -1 : 1;\n }\n return pa->y < pb->y ? 
-1 : 1;\n });\n int m = 0;\n for (int i = 0; i < numPoints; ++i) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m].y - hull[m - 1].y) * (hull[m - 1].x - hull[m - 2].x)) {\n --m;\n }\n hull[m++] = hull[i];\n }\n hullSize = m;\n }\n __syncthreads();\n if (tid < hullSize) {\n hull[tid] = hull[tid];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(b.y - points[0].y, b.x - points[0].x) < atan2(a.y - points[0].y, a.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHull(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.begin(), lowerHull.end(), hull + upperHull.size());\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the point with the 
minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n double angleA = atan2(a.y - points[0].y, a.x - points[0].x);\n double angleB = atan2(b.y - points[0].y, b.x - points[0].x);\n return angleA < angleB;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n\n if (numPoints <= 1) {\n return;\n }\n\n // Sort the points by their x-coordinates\n thrust::sort(hull, hull + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the leftmost and rightmost points\n Point leftmost = hull[0];\n Point rightmost = hull[numPoints - 1];\n\n // Find the convex hull of the points to the left of the leftmost point\n int leftHullSize = 0;\n convexHull(hull, numPoints - 1, hull, leftHullSize);\n\n // Find the convex hull of the points to the right of the rightmost point\n int rightHullSize = 0;\n convexHull(hull + 1, numPoints - 1, hull + leftHullSize, rightHullSize);\n\n // Merge the two convex hulls\n int hullIndex = 0;\n int leftIndex = 0;\n int rightIndex = 0;\n while (leftIndex < leftHullSize && rightIndex < rightHullSize) {\n if (hull[leftIndex].y < hull[rightIndex].y) {\n hull[hullIndex++] = hull[leftIndex++];\n } else {\n hull[hullIndex++] = hull[rightIndex++];\n }\n }\n while (leftIndex < leftHullSize) {\n hull[hullIndex++] = hull[leftIndex++];\n }\n while (rightIndex < rightHullSize) {\n hull[hullIndex++] = hull[rightIndex++];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device_ptr(points), thrust::device_ptr(points + numPoints),\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(thrust::device_ptr(points), numPoints, &upperHull, &lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), thrust::device_ptr(hull));\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), thrust::device_ptr(hull + upperHull.size()));\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first 
point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current convex hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n double angle1 = atan2(a.y - points[0].y, a.x - points[0].x);\n double angle2 = atan2(b.y - points[0].y, b.x - points[0].x);\n return angle1 < angle2;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Start from the leftmost point and keep moving counter-clockwise\n int hullSize = 0;\n int current = leftmost;\n do {\n // Add the current point to the hull\n hull[hullSize] = points[current];\n hullSize++;\n\n // Find the next point that is most counter-clockwise from the current point\n int next = (current + 1) % numPoints;\n for (int i = 0; i < numPoints; i++) {\n if (orientation(points[current], points[next], points[i]) == 2) {\n next = i;\n }\n }\n\n // Set the current point to the next point\n current = next;\n } while (current != leftmost);\n\n // Return the size of the hull\n return hullSize;\n}\n", + "\n\n // Sort the points by their x-coordinate\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, 
points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), hull);\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= numPoints) {\n return;\n }\n\n // Find the point with the smallest y-coordinate\n int minIdx = idx;\n for (int i = idx + 1; i < numPoints; i++) {\n if (points[i].y < points[minIdx].y) {\n minIdx = i;\n }\n }\n\n // Swap the point with the smallest y-coordinate to the front of the array\n Point temp = points[idx];\n points[idx] = points[minIdx];\n points[minIdx] = temp;\n\n // Sort the points by their angle with respect to the point with the smallest y-coordinate\n std::sort(points + idx + 1, points + numPoints, [points, idx](const Point &a, const Point &b) {\n return (a.x - points[idx].x) * (b.y - points[idx].y) > (b.x - points[idx].x) * (a.y - points[idx].y);\n });\n\n // Initialize the hull with the first three points\n hull[0] = points[idx];\n hull[1] = points[idx + 1];\n hull[2] = points[idx + 2];\n int hullSize = 3;\n\n // Iterate over the remaining points\n for (int i = idx + 3; i < numPoints; i++) {\n // Check if the current point is inside the hull\n while (hullSize > 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) <=\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x)) {\n hullSize--;\n }\n\n // Add the current point to the hull\n hull[hullSize] = points[i];\n hullSize++;\n }\n\n // Return the hull\n hullSize = hullSize;\n}\n\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device_ptr(points), thrust::device_ptr(points + numPoints),\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), thrust::device_ptr(hull));\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), thrust::device_ptr(hull + upperHull.size()));\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n\n // Find the convex hull using Graham's scan\n int hullSize = 1;\n for (int i = 1; i < numPoints; i++) {\n // Find the next point that is not on the same line as the previous two points\n while (hullSize >= 2 &&\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 2].x - hull[hullSize - 1].x) 
<=\n (points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 2].y - hull[hullSize - 1].y)) {\n hullSize--;\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates.\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the leftmost point.\n Point leftmostPoint = points[0];\n\n // Find the rightmost point.\n Point rightmostPoint = points[numPoints - 1];\n\n // Find the points that are above the line defined by the leftmost and rightmost points.\n thrust::device_vector pointsAboveLine;\n thrust::copy_if(thrust::device, points, points + numPoints,\n thrust::make_device_vector(pointsAboveLine),\n [leftmostPoint, rightmostPoint](const Point &point) {\n return point.y > leftmostPoint.y + (rightmostPoint.y - leftmostPoint.y) *\n (point.x - leftmostPoint.x) /\n (rightmostPoint.x - leftmostPoint.x);\n });\n\n // Find the points that are below the line defined by the leftmost and rightmost points.\n thrust::device_vector pointsBelowLine;\n thrust::copy_if(thrust::device, points, points + numPoints,\n thrust::make_device_vector(pointsBelowLine),\n [leftmostPoint, rightmostPoint](const Point &point) {\n return point.y < leftmostPoint.y + (rightmostPoint.y - leftmostPoint.y) *\n (point.x - leftmostPoint.x) /\n (rightmostPoint.x - leftmostPoint.x);\n });\n\n // Find the convex hull of the points above the line.\n thrust::device_vector hullAboveLine;\n convexHull(thrust::device_ptr(pointsAboveLine.data()), pointsAboveLine.size(),\n thrust::device_ptr(hullAboveLine.data()), hullAboveLine.size());\n\n // Find the convex hull of the points below the line.\n thrust::device_vector hullBelowLine;\n convexHull(thrust::device_ptr(pointsBelowLine.data()), pointsBelowLine.size(),\n thrust::device_ptr(hullBelowLine.data()), hullBelowLine.size());\n\n // Merge the two convex hulls.\n thrust::copy(thrust::device, hullAboveLine.begin(), hullAboveLine.end(), hull);\n thrust::copy(thrust::device, hullBelowLine.begin(), hullBelowLine.end(), hull + hullAboveLine.size());\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(hull, hull + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <= (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n hullSize = m;\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < 
numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n }\n\n // If the hull has more than 2 points, remove the last point\n if (hullSize > 2) {\n hullSize--;\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of points. The final hull is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n\n // Return the hull on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return hull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = points.size();\n std::vector localHull;\n // Find the convex hull of the local points\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n std::vector upper, lower;\n for (int i = 0; i < n; i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], points[i]) <= 0)\n upper.pop_back();\n upper.push_back(points[i]);\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], points[i]) >= 0)\n lower.pop_back();\n lower.push_back(points[i]);\n }\n localHull.insert(localHull.end(), upper.begin(), upper.end());\n for (int i = lower.size() - 2; i >= 0; i--)\n localHull.push_back(lower[i]);\n }\n }\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n // Find the convex hull of the global points\n if (rank == 0) {\n std::sort(globalHull.begin(), globalHull.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n std::vector upper, lower;\n for (int i = 0; i < globalHull.size(); i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], globalHull[i]) <= 0)\n upper.pop_back();\n upper.push_back(globalHull[i]);\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], globalHull[i]) >= 0)\n lower.pop_back();\n lower.push_back(globalHull[i]);\n }\n hull.insert(hull.end(), upper.begin(), upper.end());\n for (int i = lower.size() - 2; i >= 0; i--)\n hull.push_back(lower[i]);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of 
the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by polar angle with respect to the leftmost point\n std::sort(points.begin(), points.end(), [leftmost](Point a, Point b) {\n double dx1 = a.x - points[leftmost].x;\n double dy1 = a.y - points[leftmost].y;\n double dx2 = b.x - points[leftmost].x;\n double dy2 = b.y - points[leftmost].y;\n return (atan2(dy1, dx1) < atan2(dy2, dx2));\n });\n\n // Use Graham's scan to find the convex hull\n hull.push_back(points[leftmost]);\n int m = 1;\n for (int i = 1; i < points.size(); i++) {\n while (m >= 2 && (points[i].x - hull[m - 1].x) * (hull[m].y - hull[m - 1].y) - (points[i].y - hull[m - 1].y) * (hull[m].x - hull[m - 1].x) <= 0) {\n hull.pop_back();\n m--;\n }\n hull.push_back(points[i]);\n m++;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local 
hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Reduce the local hulls to a single global hull\n std::vector globalHull;\n MPI_Reduce(&localHull[0], &globalHull[0], localHull.size() * sizeof(Point), MPI_BYTE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // If this is rank 0, copy the global hull to the output hull\n if (rank == 0) {\n hull = globalHull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(points.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Find the convex hull of the points on each rank\n std::vector local_hull;\n convexHull(points, local_hull);\n\n // Gather all the local hulls on rank 0\n std::vector global_hull;\n if (my_rank == 0) {\n global_hull.resize(local_hull.size() * num_procs);\n }\n MPI_Gather(local_hull.data(), local_hull.size(), MPI_DOUBLE, global_hull.data(), local_hull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (my_rank == 0) {\n convexHull(global_hull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n if (rank == 
0) {\n hull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, hull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(hull, hull);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(*convexHull[i], *convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n\t\tKOKKOS_LAMBDA(const int& i, const int& j) {\n\t\t\treturn points(i)->x < points(j)->x;\n\t\t});\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\thull.push_back(points(0));\n\thull.push_back(points(1));\n\tfor (int i = 2; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(hull[hull.size() - 2]->x - hull[hull.size() - 1]->x) * (points(i)->y - hull[hull.size() - 1]->y) -\n\t\t\t(hull[hull.size() - 2]->y - hull[hull.size() - 1]->y) * (points(i)->x - hull[hull.size() - 1]->x) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::TeamPolicy>;\n\tusing team_member_type = typename policy_type::member_type;\n\n\tconst size_t num_points = points.extent(0);\n\tconst size_t num_teams = (num_points + policy_type::team_size - 1) / policy_type::team_size;\n\n\t// Find the leftmost point\n\tteam_member_type team_member;\n\tPoint leftmost_point = points(0);\n\tfor (size_t i = 1; i < num_points; ++i) {\n\t\tif (points(i).x < leftmost_point.x) {\n\t\t\tleftmost_point = points(i);\n\t\t}\n\t}\n\n\t// Sort the points by polar angle with respect to the leftmost point\n\tstd::vector sorted_points(num_points);\n\tfor (size_t i = 0; i < num_points; ++i) {\n\t\tsorted_points[i] = points(i);\n\t}\n\tstd::sort(sorted_points.begin(), sorted_points.end(), 
[leftmost_point](Point const& p1, Point const& p2) {\n\t\tdouble angle1 = std::atan2(p1.y - leftmost_point.y, p1.x - leftmost_point.x);\n\t\tdouble angle2 = std::atan2(p2.y - leftmost_point.y, p2.x - leftmost_point.x);\n\t\treturn angle1 < angle2;\n\t});\n\n\t// Compute the perimeter of the convex hull using Graham's scan\n\tstd::vector convex_hull;\n\tconvex_hull.push_back(sorted_points[0]);\n\tconvex_hull.push_back(sorted_points[1]);\n\tfor (size_t i = 2; i < num_points; ++i) {\n\t\twhile (convex_hull.size() >= 2 &&\n\t\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t (sorted_points[i].y - convex_hull[convex_hull.size() - 1].y) -\n\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) *\n\t\t\t\t (sorted_points[i].x - convex_hull[convex_hull.size() - 1].x) < 0) {\n\t\t\tconvex_hull.pop_back();\n\t\t}\n\t\tconvex_hull.push_back(sorted_points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convex_hull.size(); ++i) {\n\t\tperimeter += distance(convex_hull[i], convex_hull[(i + 1) % convex_hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.data(), points.data() + points.extent(0), [](const Point* p1, const Point* p2) { return p1->x < p2->x; });\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector hull;\n\tfor (int i = 0; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tconst Point* p1 = hull[i];\n\t\t\tconst Point* p2 = hull[i + 1];\n\t\t\tconst Point* p3 = points[i];\n\t\t\tdouble crossProduct = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\tsum += crossProduct;\n\t\t}, Kokkos::Sum(0.0)) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tfor (int i = points.extent(0) - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tconst Point* p1 = hull[i];\n\t\t\tconst Point* p2 = hull[i + 1];\n\t\t\tconst Point* p3 = points[i];\n\t\t\tdouble crossProduct = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\tsum += crossProduct;\n\t\t}, Kokkos::Sum(0.0)) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points\n\tstd::vector hull;\n\tconvex_hull(points.data(), points.size(), std::back_inserter(hull));\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n\t\tKOKKOS_LAMBDA(const int& i, const int& j) {\n\t\t\treturn points(i)->x < points(j)->x;\n\t\t});\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points(0));\n\tconvexHull.push_back(points(points.extent(0) - 1));\n\tfor (int i = 1; i < points.extent(0) - 
1; i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2]->y - convexHull[convexHull.size() - 1]->y) *\n\t\t\t(convexHull[convexHull.size() - 1]->x - points(i)->x) <\n\t\t\t(convexHull[convexHull.size() - 1]->y - points(i)->y) *\n\t\t\t(convexHull[convexHull.size() - 2]->x - convexHull[convexHull.size() - 1]->x)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(*convexHull[i], *convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tgrahamScan(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Find the convex hull of the points\n\tstd::vector hull_points;\n\tconvex_hull(points, hull_points);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull_points.size(); ++i) {\n\t\tsize_t j = (i + 1) % hull_points.size();\n\t\tperimeter += distance(hull_points[i], hull_points[j]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points\n\tstd::vector hull;\n\tconvex_hull(points.data(), points.extent(0), std::back_inserter(hull));\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.extent(0));\n\tif (points.extent(0) < 3) {\n\t\tfor (int i = 0; i < points.extent(0); ++i) {\n\t\t\tconvexHull.push_back(points[i]);\n\t\t}\n\t} else {\n\t\t// Find the leftmost point.\n\t\tint leftmost = 0;\n\t\tfor (int i = 1; i < points.extent(0); ++i) {\n\t\t\tif (points[i].x < points[leftmost].x) {\n\t\t\t\tleftmost = i;\n\t\t\t}\n\t\t}\n\n\t\t// Start from the leftmost point and keep moving clockwise until we reach the leftmost point again.\n\t\tint current = leftmost;\n\t\tdo {\n\t\t\tconvexHull.push_back(points[current]);\n\t\t\tint next = (current + 1) % points.extent(0);\n\t\t\tfor (int i = 0; i < points.extent(0); ++i) {\n\t\t\t\tif (orientation(points[current], points[next], points[i]) == 2) {\n\t\t\t\t\tnext = i;\n\t\t\t\t}\n\t\t\t}\n\t\t\tcurrent = next;\n\t\t} while (current != 
leftmost);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int& i, const int& j) {\n\t\treturn points(i)->x < points(j)->x;\n\t});\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\tfor (int i = 0; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tPoint* p1 = hull[i];\n\t\t\tPoint* p2 = hull[i + 1];\n\t\t\tPoint* p3 = points(i);\n\t\t\tdouble cross = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\treturn sum + cross;\n\t\t}, 0.0) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "omp", + "prompt": 
"#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[points.size() - 1]);\n\n\tfor (int i = 1; i < points.size() - 1; i++) {\n\t\tPoint const& p = points[i];\n\t\tif (convexHull.size() < 2 || (convexHull.back().x - convexHull[convexHull.size() - 2].x) * (p.y - convexHull.back().y) - (convexHull.back().y - convexHull[convexHull.size() - 2].y) * (p.x - convexHull.back().x) > 0) {\n\t\t\tconvexHull.push_back(p);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && crossProduct(convexHull[convexHull.size() - 2], 
convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && (upperHull.back().y - upperHull[upperHull.size() - 2].y) * (point.x - upperHull.back().x) < (upperHull.back().x - upperHull[upperHull.size() - 2].x) * (point.y - upperHull.back().y)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\t\twhile (lowerHull.size() >= 2 && (lowerHull.back().y - lowerHull[lowerHull.size() - 2].y) * (point.x - lowerHull.back().x) > (lowerHull.back().x - lowerHull[lowerHull.size() - 2].x) * (point.y - lowerHull.back().y)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 2; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + 
"generated_output": "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\tfor (int i = 2; i < n; i++) {\n\t\twhile (hull.size() >= 2 && crossProduct(hull[hull.size() - 2], hull[hull.size() - 1], points[i]) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t(points[i].y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t(points[i].x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + 
"source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tstd::vector hull;\n\tstd::vector visited(points.size(), false);\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\tint current = leftmost;\n\tdo {\n\t\thull.push_back(points[current]);\n\t\tvisited[current] = true;\n\n\t\t// Find the next point on the hull\n\t\tint next = -1;\n\t\tfor (int i = 0; i < points.size(); i++) {\n\t\t\tif (!visited[i] && (next == -1 || points[i].x * hull.back().y - points[i].y * hull.back().x > points[next].x * hull.back().y - points[next].y * hull.back().x)) {\n\t\t\t\tnext = i;\n\t\t\t}\n\t\t}\n\t\tcurrent = next;\n\t} while (current != leftmost);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[i + 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (point.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) >\n\t\t\t\t (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - point.x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (point.y - lowerHull[lowerHull.size() - 2].y) * 
(lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) <\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - point.x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n\t\t\t\t (point.x - upperHull[upperHull.size() - 1].x) <=\n\t\t\t\t (upperHull[upperHull.size() - 1].y - point.y) *\n\t\t\t\t\t (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n\t\t\t\t (point.x - lowerHull[lowerHull.size() - 1].x) >=\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - point.y) *\n\t\t\t\t\t (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint n = points.size();\n\tif (n < 3) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::vector visited(n, false);\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\tint p = leftmost;\n\tdo {\n\t\thull.push_back(points[p]);\n\t\tvisited[p] = true;\n\t\tint q = (p + 1) % n;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\tif (!visited[i] && (orientation(points[p], points[i], points[q]) == 2)) {\n\t\t\t\tq = i;\n\t\t\t}\n\t\t}\n\t\tp = q;\n\t} while (p != leftmost);\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": 
"\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (Point const& p : points) {\n\t\twhile (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], p) <= 0)\n\t\t\tupperHull.pop_back();\n\t\tupperHull.push_back(p);\n\n\t\twhile (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], p) >= 0)\n\t\t\tlowerHull.pop_back();\n\t\tlowerHull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::vector visited(n, false);\n\tint p = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x < points[p].x) {\n\t\t\tp = i;\n\t\t}\n\t}\n\tint q = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x > points[q].x) {\n\t\t\tq = i;\n\t\t}\n\t}\n\thull.push_back(points[p]);\n\thull.push_back(points[q]);\n\tvisited[p] = true;\n\tvisited[q] = true;\n\twhile (p != q) {\n\t\tint next = -1;\n\t\tdouble max_dist = -1;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\tif (!visited[i]) {\n\t\t\t\tdouble dist = distance(points[i], hull.back());\n\t\t\t\tif (dist > max_dist) {\n\t\t\t\t\tmax_dist = dist;\n\t\t\t\t\tnext = i;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\thull.push_back(points[next]);\n\t\tvisited[next] = true;\n\t\tp = q;\n\t\tq = next;\n\t}\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && crossProduct(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + 
"did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Sort the points by their x-coordinate\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tupperHull.push_back(points[0]);\n\tupperHull.push_back(points[1]);\n\tlowerHull.push_back(points[0]);\n\tlowerHull.push_back(points[n - 1]);\n\n\tfor (int i = 2; i < n; i++) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - upperHull[upperHull.size() - 1].x) <=\n\t\t\t\t (upperHull[upperHull.size() - 1].y - points[i].y) *\n\t\t\t\t\t (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - lowerHull[lowerHull.size() - 1].x) >=\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - points[i].y) *\n\t\t\t\t\t (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (int i = 0; i < lowerHull.size() - 1; i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tperimeter += distance(upperHull[upperHull.size() - 1], lowerHull[lowerHull.size() - 1]);\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tint n = points.size();\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(points[hull[i]], points[hull[(i + 1) % hull.size()]]);\n\t}\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the lower and upper hulls\n\tstd::vector lowerHull, upperHull;\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) == -1) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) == 1) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < lowerHull.size() - 1; ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tfor (size_t i = upperHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (auto& point : points) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t (point.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t\t (hull.back().y - hull[hull.size() - 2].y) * (point.x - hull.back().x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(point);\n\t}\n\n\tint t = hull.size() - 1;\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() > t &&\n\t\t\t (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t\t (hull.back().y - hull[hull.size() - 2].y) * (points[i].x - hull.back().x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) 
% hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <= (points[i].x - hull.back().x) * (hull[hull.size() - 2].y - hull.back().y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (p.x - hull[hull.size() - 2].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan algorithm\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && orientation(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.empty()) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t(points[i].y - hull.back().y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 
0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() < 3) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n\t\treturn a.x < b.x;\n\t});\n\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n\t\treturn a.y < b.y;\n\t});\n\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (p.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (p.y - hull[hull.size() - 2].y) * (hull.back().x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (p.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (p.y - hull[hull.size() - 2].y) * (hull.back().x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 && (points[i].y - 
hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (points[i].x - hull.back().x) * (hull[hull.size() - 2].y - hull.back().y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the lower convex hull\n\tfor (Point const& point : points) {\n\t\twhile (hull.size() >= 2 && (point.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) > (point.y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - point.x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(point);\n\t}\n\n\t// Find the upper convex hull\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) > (points[i].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\n\tfor (Point const& p : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t(p.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) >\n\t\t\t(upperHull[upperHull.size() - 1].y - upperHull[upperHull.size() - 2].y) * (p.x - upperHull[upperHull.size() - 2].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(p);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t(p.y - lowerHull[lowerHull.size() - 2].y) 
* (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) <\n\t\t\t(lowerHull[lowerHull.size() - 1].y - lowerHull[lowerHull.size() - 2].y) * (p.x - lowerHull[lowerHull.size() - 2].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(p);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; i--) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y) - (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the leftmost and rightmost points\n\tPoint leftmost = points[0];\n\tPoint rightmost = points[points.size() - 1];\n\n\t// Add the leftmost and rightmost points to the hull\n\thull.push_back(leftmost);\n\thull.push_back(rightmost);\n\n\t// Find the upper and lower tangents of the convex hull\n\tstd::vector upperTangent;\n\tstd::vector lowerTangent;\n\n\t// Find the upper tangent\n\tfor (int i = 1; i < points.size(); i++) {\n\t\twhile (upperTangent.size() >= 2 && crossProduct(upperTangent[upperTangent.size() - 2], upperTangent[upperTangent.size() - 1], points[i]) <= 0) {\n\t\t\tupperTangent.pop_back();\n\t\t}\n\t\tupperTangent.push_back(points[i]);\n\t}\n\n\t// Find the lower tangent\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (lowerTangent.size() >= 2 && crossProduct(lowerTangent[lowerTangent.size() - 2], lowerTangent[lowerTangent.size() - 1], points[i]) <= 0) {\n\t\t\tlowerTangent.pop_back();\n\t\t}\n\t\tlowerTangent.push_back(points[i]);\n\t}\n\n\t// Merge the upper and lower tangents into the hull\n\tfor (int i = 0; i < lowerTangent.size(); i++) {\n\t\thull.push_back(lowerTangent[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y) - (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - 
hull[hull.size() - 2].x) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(points[i].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the lower hull\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(p.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(hull[hull.size() - 1].y - p.y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Find the upper hull\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(p.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(hull[hull.size() - 1].y - p.y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) 
{\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) -\n\t\t\t(points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distance(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Store the result in perimeter.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\n__global__ void convexHullPerimeter(const Point *points, size_t numPoints, double *perimeter) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points.\n\tsize_t numVertices;\n\tPoint *vertices;\n\tconvexHull(points, numPoints, vertices, numVertices);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0.0;\n\tfor (size_t i = 0; i < numVertices; i++) {\n\t\t*perimeter += distance(vertices[i], vertices[i + 1]);\n\t}\n}\n", + "\n\n\tPoint p0 = points[0];\n\tdouble minX = p0.x;\n\tfor (size_t i = 1; i < numPoints; i++) {\n\t\tif (points[i].x < minX) {\n\t\t\tminX = points[i].x;\n\t\t\tp0 = points[i];\n\t\t}\n\t}\n\n\tstd::vector hull;\n\thull.push_back(p0);\n\n\tPoint next = p0;\n\tdo {\n\t\tPoint bestPoint = points[0];\n\t\tdouble maxAngle = -1;\n\t\tfor (size_t i = 1; i < numPoints; i++) {\n\t\t\tif (points[i] == next) continue;\n\t\t\tdouble angle = atan2(points[i].y - next.y, points[i].x - next.x);\n\t\t\tif (angle > maxAngle) {\n\t\t\t\tmaxAngle = angle;\n\t\t\t\tbestPoint = points[i];\n\t\t\t}\n\t\t}\n\t\thull.push_back(bestPoint);\n\t\tnext = bestPoint;\n\t} while (next != p0);\n\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tstd::vector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tPoint p0 = points[0];\n\tdouble minX = p0.x, maxX = p0.x;\n\tdouble minY = p0.y, maxY = p0.y;\n\tfor (size_t i = 1; i < numPoints; i++) {\n\t\tPoint p = points[i];\n\t\tminX = min(minX, p.x);\n\t\tmaxX = max(maxX, p.x);\n\t\tminY = min(minY, p.y);\n\t\tmaxY = max(maxY, p.y);\n\t}\n\n\tvector upper, lower;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tPoint p = points[i];\n\t\tif (p.y == minY) {\n\t\t\tlower.push_back(p);\n\t\t}\n\t\tif (p.y == maxY) {\n\t\t\tupper.push_back(p);\n\t\t}\n\t}\n\n\tsort(lower.begin(), lower.end(), [](Point const& p1, Point const& p2) { return p1.x < 
p2.x; });\n\tsort(upper.begin(), upper.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\tvector ch;\n\tch.push_back(lower[0]);\n\tfor (size_t i = 1; i < lower.size(); i++) {\n\t\twhile (ch.size() >= 2 && (ch.back().y - ch[ch.size() - 2].y) * (lower[i].x - ch.back().x) < (ch.back().x - ch[ch.size() - 2].x) * (lower[i].y - ch.back().y)) {\n\t\t\tch.pop_back();\n\t\t}\n\t\tch.push_back(lower[i]);\n\t}\n\n\tfor (size_t i = upper.size() - 1; i >= 0; i--) {\n\t\twhile (ch.size() >= 2 && (ch.back().y - ch[ch.size() - 2].y) * (upper[i].x - ch.back().x) < (ch.back().x - ch[ch.size() - 2].x) * (upper[i].y - ch.back().y)) {\n\t\t\tch.pop_back();\n\t\t}\n\t\tch.push_back(upper[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < ch.size(); i++) {\n\t\tperimeter += distance(ch[i], ch[(i + 1) % ch.size()]);\n\t}\n\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint numBlocks = hipGridDim_x;\n\tint numThreads = hipBlockDim_x;\n\n\tPoint p1 = points[tid];\n\tdouble minDist = DBL_MAX;\n\tPoint p2;\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tif (i == tid) continue;\n\t\tPoint p3 = points[i];\n\t\tdouble dist = distance(p1, p3);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tp2 = p3;\n\t\t}\n\t}\n\n\t__shared__ double sharedMinDist[numBlocks];\n\t__shared__ Point sharedP2[numBlocks];\n\n\tsharedMinDist[bid] = minDist;\n\tsharedP2[bid] = p2;\n\n\t__syncthreads();\n\n\tdouble globalMinDist = DBL_MAX;\n\tPoint globalP2;\n\tfor (int i = 0; i < numBlocks; i++) {\n\t\tif (sharedMinDist[i] < globalMinDist) {\n\t\t\tglobalMinDist = sharedMinDist[i];\n\t\t\tglobalP2 = sharedP2[i];\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*perimeter = 2 * globalMinDist + distance(p1, globalP2);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < numPoints; ++i) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\t// Initialize the convex hull with the leftmost point\n\tstd::vector hull;\n\thull.push_back(points[leftmost]);\n\n\t// Find the convex hull using Graham's scan\n\tint p = leftmost;\n\tdo {\n\t\tint q = (p + 1) % numPoints;\n\t\tfor (int i = 0; i < numPoints; ++i) {\n\t\t\tif (i == p || i == q) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tif (orientation(points[p], points[q], points[i]) == -1) {\n\t\t\t\tq = i;\n\t\t\t}\n\t\t}\n\t\thull.push_back(points[q]);\n\t\tp = q;\n\t} while (p != leftmost);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (int i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < numPoints; i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\t// Initialize the 
convex hull with the leftmost point\n\tvector hull = { points[leftmost] };\n\n\t// Find the next point to add to the convex hull\n\twhile (true) {\n\t\t// Find the point that is farthest from the line connecting the last two points in the convex hull\n\t\tint farthest = -1;\n\t\tdouble maxDistance = 0;\n\t\tfor (int i = 0; i < numPoints; i++) {\n\t\t\t// Skip points that are already in the convex hull\n\t\t\tif (find(hull.begin(), hull.end(), points[i]) != hull.end()) {\n\t\t\t\tcontinue;\n\t\t\t}\n\n\t\t\t// Compute the distance from the point to the line connecting the last two points in the convex hull\n\t\t\tPoint p1 = hull[hull.size() - 2];\n\t\t\tPoint p2 = hull[hull.size() - 1];\n\t\t\tdouble distance = fabs((p2.y - p1.y) * points[i].x - (p2.x - p1.x) * points[i].y + p2.x * p1.y - p2.y * p1.x) / sqrt(pow(p2.y - p1.y, 2) + pow(p2.x - p1.x, 2));\n\n\t\t\t// If the distance is greater than the maximum distance so far, update the farthest point and the maximum distance\n\t\t\tif (distance > maxDistance) {\n\t\t\t\tfarthest = i;\n\t\t\t\tmaxDistance = distance;\n\t\t\t}\n\t\t}\n\n\t\t// If no point is farther from the line connecting the last two points in the convex hull, the convex hull is complete\n\t\tif (farthest == -1) {\n\t\t\tbreak;\n\t\t}\n\n\t\t// Add the farthest point to the convex hull\n\t\thull.push_back(points[farthest]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[i + 1]);\n\t}\n\n\t// Store the perimeter in the output variable\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points, points + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && (upperHull[upperHull.size() - 1].y - upperHull[upperHull.size() - 2].y) * (points[i].x - upperHull[upperHull.size() - 2].x) < (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) * (points[i].y - upperHull[upperHull.size() - 2].y)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && (lowerHull[lowerHull.size() - 1].y - lowerHull[lowerHull.size() - 2].y) * (points[i].x - lowerHull[lowerHull.size() - 2].x) > (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) * (points[i].y - lowerHull[lowerHull.size() - 2].y)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\t*perimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\t*perimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the 
points using the Graham scan algorithm.\n\tvector hull;\n\tfor (size_t i = 0; i < numPoints; ++i) {\n\t Point p = points[i];\n\t while (hull.size() >= 2 && crossProduct(hull[hull.size() - 2], hull.back(), p) <= 0) {\n\t hull.pop_back();\n\t }\n\t hull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinate\n\tstd::sort(points, points + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\t*perimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\t*perimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points\n\tPoint *hull = new Point[numPoints];\n\tint hullSize = convexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hullSize; i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hullSize]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 
2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[points.size() - 1]);\n\tfor (int i = 1; i < points.size() - 1; i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::vector hullIndices;\n\thullIndices.reserve(points.size());\n\tgrahamScan(points, convexHull, hullIndices);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < 
convexHull.size(); i++) {\n\t\tint nextIndex = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[nextIndex]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2) {\n\t\t\tPoint p1 = convexHull[convexHull.size() - 2];\n\t\t\tPoint p2 = convexHull[convexHull.size() - 1];\n\t\t\tPoint p3 = points[i];\n\t\t\tdouble cross = (p2.x - p1.x) * (p3.y - p2.y) - (p2.y - p1.y) * (p3.x - p2.x);\n\t\t\tif (cross > 0) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint left = 0, right = points.size() - 1;\n\twhile (left < right) {\n\t\tconvexHull.push_back(points[left++]);\n\t\twhile (left < right && (convexHull.back().y - points[left].y) * (convexHull.back().x - points[right].x) <= (convexHull.back().y - points[right].y) * (convexHull.back().x - points[left].x)) {\n\t\t\tright--;\n\t\t}\n\t\tconvexHull.push_back(points[right--]);\n\t\twhile (left < right && (convexHull.back().y - points[left].y) * (convexHull.back().x - points[right].x) <= (convexHull.back().y - points[right].y) * (convexHull.back().x - points[left].x)) {\n\t\t\tleft++;\n\t\t}\n\t}\n\tconvexHull.push_back(points[left]);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y 
!= convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Send the convex hull to rank 0\n\tstd::vector globalConvexHull;\n\tMPI_Gather(&convexHull[0], convexHull.size(), MPI_DOUBLE, &globalConvexHull[0], globalConvexHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Compute the perimeter of the convex hull on rank 0\n\tif (rank == 0) {\n\t\tdouble perimeter = 0;\n\t\tfor (size_t i = 0; i < globalConvexHull.size(); i++) {\n\t\t\tperimeter += distance(globalConvexHull[i], globalConvexHull[(i + 1) % globalConvexHull.size()]);\n\t\t}\n\t\treturn perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint const rank = MPI::COMM_WORLD.Get_rank();\n\tint const size = MPI::COMM_WORLD.Get_size();\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tconvexHull.push_back(points[0]);\n\tfor (size_t i = 1; i < points.size(); ++i) {\n\t\t// Find the first point in the convex hull that is to the left of the line through the last two points\n\t\tsize_t j = convexHull.size() - 1;\n\t\twhile (j > 0 && (convexHull[j].x - convexHull[j-1].x) * (points[i].y - convexHull[j].y) -\n\t\t\t(convexHull[j].y - convexHull[j-1].y) * (points[i].x - convexHull[j].x) < 0) {\n\t\t\t--j;\n\t\t}\n\t\t// Add the point to the convex hull\n\t\tconvexHull.insert(convexHull.begin() + j + 1, points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI::COMM_WORLD.Reduce(&perimeter, &globalPerimeter, 1, MPI::DOUBLE, MPI::SUM, 0);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeters from all ranks to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the global perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the total perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, 
size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (auto const& point : points) {\n\t\tif (convexHull.empty()) {\n\t\t\tconvexHull.push_back(point);\n\t\t} else {\n\t\t\twhile (convexHull.size() >= 2 &&\n\t\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t\t(point.y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t(point.x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\t\tconvexHull.pop_back();\n\t\t\t}\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2 && (points[next] - convexHull.back()).cross(points[i] - convexHull.back()) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || convexHull.back() != point) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t\twhile (convexHull.size() >= 3 &&\n\t\t\t (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 3].x) *\n\t\t\t\t\t (point.y - convexHull[convexHull.size() - 2].y) -\n\t\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 3].y) *\n\t\t\t\t\t (point.x - convexHull[convexHull.size() - 2].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) 
{\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Find the total perimeter of the convex hulls on all ranks\n\tdouble totalPerimeter = 0;\n\tfor (int i = 0; i < size; i++) {\n\t\tif (i == rank) {\n\t\t\tfor (int j = 0; j < convexHull.size(); j++) {\n\t\t\t\ttotalPerimeter += distance(convexHull[j], convexHull[(j + 1) % convexHull.size()]);\n\t\t\t}\n\t\t}\n\t\tMPI_Reduce(&totalPerimeter, &totalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\t}\n\n\t// Return the total perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn totalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (auto const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Reduce the convex hulls from all ranks to rank 0\n\tstd::vector globalConvexHull;\n\tMPI_Reduce(&convexHull[0], &globalConvexHull[0], convexHull.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Compute the perimeter of the global convex hull on rank 0\n\tif (rank == 0) {\n\t\tdouble perimeter = 0;\n\t\tfor (size_t i = 0; i < globalConvexHull.size(); i++) {\n\t\t\tperimeter += distance(globalConvexHull[i], globalConvexHull[(i + 1) % globalConvexHull.size()]);\n\t\t}\n\t\treturn perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (auto it = points.begin(); it != points.end(); ++it) {\n\t\twhile (convexHull.size() >= 2 && (it->y - convexHull[convexHull.size() - 2].y) * (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) < (it->x - convexHull[convexHull.size() - 2].x) * (convexHull[convexHull.size() - 1].y - convexHull[convexHull.size() - 2].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(*it);\n\t}\n\tconvexHull.pop_back();\n\tfor (auto it = points.rbegin(); it != points.rend(); ++it) {\n\t\twhile (convexHull.size() >= 2 && (it->y - convexHull[convexHull.size() - 2].y) * (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) < (it->x - convexHull[convexHull.size() - 2].x) * (convexHull[convexHull.size() - 1].y - convexHull[convexHull.size() - 2].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(*it);\n\t}\n\tconvexHull.pop_back();\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, 
&globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2 && (points[next] - convexHull.back()).cross(points[i] - convexHull.back()) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& p : points) {\n\t}\n\n\t// Compute the total perimeter\n\tdouble totalPerimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t}\n\n\t// Reduce the total perimeter to rank 0\n\tdouble finalPerimeter;\n\tMPI_Reduce(&totalPerimeter, &finalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the final perimeter on rank 0\n\tif (rank == 0) {\n\t}\n}\n", + "Hull: 1. 
Synag", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || convexHull.back() != point) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t\twhile (convexHull.size() >= 3 &&\n\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 3].x) *\n\t\t\t\t(point.y - convexHull[convexHull.size() - 2].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 3].y) *\n\t\t\t\t(point.x - convexHull[convexHull.size() - 2].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distance(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Store the result in perimeter.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\n__global__ void convexHullPerimeter(const Point *points, size_t numPoints, double *perimeter) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint numBlocks = gridDim.x;\n\tint numThreads = blockDim.x;\n\tint start = tid + bid * numThreads;\n\tdouble localPerimeter = 0;\n\tfor (int i = start; i < numPoints - 1; i++) {\n\t\tlocalPerimeter += distance(points[i], points[i + 1]);\n\t}\n\tlocalPerimeter += distance(points[numPoints - 1], points[0]);\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tdouble globalPerimeter = 0;\n\t\tfor (int i = 0; i < numBlocks; i++) {\n\t\t\tdouble blockPerimeter;\n\t\t\tcudaMemcpy(&blockPerimeter, &localPerimeter, sizeof(double), cudaMemcpyDeviceToHost);\n\t\t\tglobalPerimeter += blockPerimeter;\n\t\t}\n\t\t*perimeter = globalPerimeter;\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\tPoint *convexHull;\n\tsize_t convexHullSize;\n\tconvexHull2(points, numPoints, &convexHull, &convexHullSize);\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < convexHullSize; i++) {\n\t\t*perimeter += distance(convexHull[i], convexHull[(i + 1) % convexHullSize]);\n\t}\n\tfree(convexHull);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tPoint p = points[tid];\n\tdouble minDist = DBL_MAX;\n\tPoint closestPoint;\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tif (i == tid) {\n\t\t\tcontinue;\n\t\t}\n\t\tPoint q = points[i];\n\t\tdouble dist = distance(p, q);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tclosestPoint = q;\n\t\t}\n\t}\n\t*perimeter += minDist;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % 
hull.size()]);\n\t}\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tPoint *sortedPoints = new Point[numPoints];\n\tthrust::copy(points, points + numPoints, sortedPoints);\n\tthrust::sort(sortedPoints, sortedPoints + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\twhile (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], sortedPoints[i]) <= 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(sortedPoints[i]);\n\t\twhile (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], sortedPoints[i]) >= 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(sortedPoints[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= numPoints) return;\n\n\tPoint p = points[i];\n\tdouble minDist = DBL_MAX;\n\tPoint minPoint;\n\tfor (int j = 0; j < numPoints; j++) {\n\t\tif (i == j) continue;\n\t\tPoint q = points[j];\n\t\tdouble dist = distance(p, q);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tminPoint = q;\n\t\t}\n\t}\n\t*perimeter += minDist;\n}\n", + "Hull.2)", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tPoint *sortedPoints = new Point[numPoints];\n\tmemcpy(sortedPoints, points, numPoints * sizeof(Point));\n\tstd::sort(sortedPoints, sortedPoints + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], sortedPoints[i]) <= 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(sortedPoints[i]);\n\t\twhile (lowerHull.size() >= 2 && 
crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], sortedPoints[i]) >= 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(sortedPoints[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size() - 1; i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tperimeter += distance(upperHull[upperHull.size() - 1], lowerHull[lowerHull.size() - 1]);\n\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tPoint p = points[tid];\n\t\tdouble minDist = DBL_MAX;\n\t\tPoint closestPoint;\n\t\tfor (int i = 0; i < numPoints; i++) {\n\t\t\tif (i == tid) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tPoint q = points[i];\n\t\t\tdouble dist = distance(p, q);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t\tclosestPoint = q;\n\t\t\t}\n\t\t}\n\t\tatomicAdd(perimeter, minDist);\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. 
The final result is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (int i = 0; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\tint next = i + 1;\n\t\tif (next == points.size()) {\n\t\t\tnext = 0;\n\t\t}\n\t\tint prev = i - 1;\n\t\tif (prev == -1) {\n\t\t\tprev = points.size() - 1;\n\t\t}\n\t\tdouble area = (points[next].x - points[prev].x) * (points[i].y - points[prev].y) - (points[next].x - points[prev].x) * (points[i].y - points[next].y);\n\t\tif (area >= 0) {\n\t\t\tconvexHull.push_back(points[i]);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = i + 1;\n\t\tif (next == convexHull.size()) {\n\t\t\tnext = 0;\n\t\t}\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < 
p2.x; });\n\tfor (int i = 0; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint n = points.size();\n\tif (n <= 2) {\n\t\tconvexHull = points;\n\t} else {\n\t\tint m = 0;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\twhile (m >= 2 && (points[i].x - convexHull[m-1].x) * (convexHull[m-2].y - convexHull[m-1].y) - (points[i].y - convexHull[m-1].y) * (convexHull[m-2].x - convexHull[m-1].x) < 0) {\n\t\t\t\tm--;\n\t\t\t}\n\t\t\tconvexHull[m++] = points[i];\n\t\t}\n\t\tfor (int i = n-2, t = m+1; i >= 0; i--) {\n\t\t\twhile (m >= t && (points[i].x - convexHull[m-1].x) * (convexHull[m-2].y - convexHull[m-1].y) - (points[i].y - convexHull[m-1].y) * (convexHull[m-2].x - convexHull[m-1].x) < 0) {\n\t\t\t\tm--;\n\t\t\t}\n\t\t\tconvexHull[m++] = points[i];\n\t\t}\n\t\tconvexHull.resize(m);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham scan algorithm\n\tstd::vector convexHull = grahamScan(points);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else 
{\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t\t (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) *\n\t\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t(convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham scan\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y); });\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size()-2].y - hull[hull.size()-1].y) * (p.x - hull[hull.size()-1].x) < (hull[hull.size()-1].x - hull[hull.size()-2].x) * (p.y - hull[hull.size()-1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\tint n = hull.size();\n\tfor (int i = n-2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size()-2].y - hull[hull.size()-1].y) * (p.x - hull[hull.size()-1].x) < (hull[hull.size()-1].x - hull[hull.size()-2].x) * (p.y - hull[hull.size()-1].y)) 
{\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < n; i++) {\n\t\tperimeter += distance(hull[i], hull[(i+1)%n]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the hull in parallel\n\tdouble perimeter = 0.0;\n\t#pragma omp parallel for reduction(+:perimeter)\n\tfor (int i = 0; i < hull.size(); ++i) {\n\t\tint next = (i + 1) % hull.size();\n\t\tperimeter += distance(hull[i], hull[next]);\n\t}\n\n\t// Return the perimeter\n\treturn perimeter;\n}\n\n", + "\n\n\tint const rank = 0;\n\tint const size = MPI::COMM_WORLD.Get_size();\n\tint const points_per_rank = points.size() / size;\n\tint const remainder = points.size() % size;\n\n\tstd::vector local_points = points;\n\tlocal_points.resize(points_per_rank + (rank < remainder));\n\n\tstd::vector convex_hull;\n\tconvexHull(local_points, convex_hull);\n\n\tdouble local_perimeter = 0;\n\tfor (size_t i = 0; i < convex_hull.size(); ++i) {\n\t\tlocal_perimeter += distance(convex_hull[i], convex_hull[(i+1)%convex_hull.size()]);\n\t}\n\n\tdouble global_perimeter;\n\tMPI::COMM_WORLD.Reduce(&local_perimeter, &global_perimeter, 1, MPI_DOUBLE, MPI_SUM, rank);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint n = points.size();\n\tfor (int i = 0; i < n; i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <= (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = n - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <= (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of 
the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint left = 0, right = 0;\n\twhile (left < points.size()) {\n\t\twhile (right < points.size() && points[right].x == points[left].x) {\n\t\t\tright++;\n\t\t}\n\t\tint n = right - left;\n\t\tif (n > 1) {\n\t\t\tstd::sort(points.begin() + left, points.begin() + right, [](Point const& p1, Point const& p2) { return p1.y < p2.y; });\n\t\t\tfor (int i = 0; i < n; i++) {\n\t\t\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[left + i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) * (points[left + i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\t\t\tconvexHull.pop_back();\n\t\t\t\t}\n\t\t\t\tconvexHull.push_back(points[left + i]);\n\t\t\t}\n\t\t}\n\t\tleft = right;\n\t}\n\tconvexHull.push_back(convexHull[0]);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size() - 1; i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[i + 1]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = points.size();\n\tstd::vector hull;\n\n\t// Find the convex hull of the points using Graham's scan.\n\tfor (int i = 0; i < n; i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points[i].x - hull[hull.size() - 1].x) <\n\t\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tint m = hull.size();\n\tfor (int i = n - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= m &&\n\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points[i].x - hull[hull.size() - 1].x) <\n\t\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\thull.pop_back();\n\n\t// Compute the perimeter of the convex hull in parallel.\n\tdouble perimeter = 0;\n\t#pragma omp parallel for reduction(+ : perimeter)\n\tfor (int i = 0; i < m; i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % m]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble global_perimeter;\n\tMPI_Reduce(&perimeter, &global_perimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 
-1;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (Point const& p : points) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size()-2].y - convexHull[convexHull.size()-1].y) * (p.x - convexHull[convexHull.size()-1].x) < (convexHull[convexHull.size()-1].x - convexHull[convexHull.size()-2].x) * (p.y - convexHull[convexHull.size()-1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(p);\n\t}\n\tconvexHull.pop_back();\n\tfor (int i = points.size()-2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size()-2].y - convexHull[convexHull.size()-1].y) * (points[i].x - convexHull[convexHull.size()-1].x) < (convexHull[convexHull.size()-1].x - convexHull[convexHull.size()-2].x) * (points[i].y - convexHull[convexHull.size()-1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tconvexHull.pop_back();\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t\t (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) *\n\t\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 
0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint const rank = MPI::COMM_WORLD.Get_rank();\n\tint const size = MPI::COMM_WORLD.Get_size();\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convex_hull;\n\tconvex_hull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tconvex_hull.push_back(points[0]);\n\tconvex_hull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (convex_hull.size() >= 2 &&\n\t\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t\t (convex_hull[convex_hull.size() - 1].y - points[i].y) -\n\t\t\t\t\t (convex_hull[convex_hull.size() - 1].x - points[i].x) *\n\t\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) <\n\t\t\t\t 0) {\n\t\t\tconvex_hull.pop_back();\n\t\t}\n\t\tconvex_hull.push_back(points[i]);\n\t}\n\tint const convex_hull_size = convex_hull.size();\n\twhile (convex_hull.size() >= 2 &&\n\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t (convex_hull[convex_hull.size() - 1].y - convex_hull[0].y) -\n\t\t\t\t (convex_hull[convex_hull.size() - 1].x - convex_hull[0].x) *\n\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) <\n\t\t\t 0) {\n\t\tconvex_hull.pop_back();\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convex_hull_size; ++i) {\n\t\tperimeter += distance(convex_hull[i], convex_hull[(i + 1) % convex_hull_size]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble global_perimeter;\n\tMPI::COMM_WORLD.Allreduce(&perimeter, &global_perimeter, 1, MPI::DOUBLE, MPI::SUM);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. 
A is an undirected graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &size) {\n if (visited(i) == 0) {\n size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n });\n }\n }, size);\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices.\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices.\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0.\n int largestComponentSize = 0;\n\n // Iterate over all vertices.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a depth-first search to find the size of its component.\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n\n // Update the largest component size if necessary.\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n // Return the largest component size.\n return largestComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int componentSize = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &componentSize) {\n if (visited[i] == 0) {\n componentSize++;\n visited[i] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[i][j] == 1 && visited[j] == 0) {\n visited[j] = 1;\n }\n }\n }\n });\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n return maxComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n depthFirstSearch(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a 
depth-first search to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n\n // Update the largest component size if necessary\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited(i) == 0) {\n component_size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n });\n }\n }, component_size);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, false);\n int largest = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int size = 0;\n std::stack s;\n s.push(i);\n while (!s.empty()) {\n size_t v = s.top();\n s.pop();\n if (!visited[v]) {\n visited[v] = true;\n size++;\n for (size_t j = 0; j < N; j++) {\n if (A(i, j) && !visited[j]) {\n s.push(j);\n }\n }\n }\n }\n largest = std::max(largest, size);\n }\n }\n return largest;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited[i] == 0) {\n component_size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n visited[j] = 1;\n }\n });\n }\n }, component_size);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n using namespace Kokkos;\n\n // Initialize the visited array to false for all vertices\n View visited(\"visited\", N);\n visited() = false;\n\n // Initialize the component size array to 0 for all vertices\n View componentSize(\"componentSize\", N);\n componentSize() = 0;\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component containing it\n if (!visited(i)) {\n int componentSize = 0;\n DFS(A, i, visited, componentSize);\n\n // Update the largest component size if necessary\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n return largestComponentSize;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n int max_component = 0;\n Kokkos::parallel_reduce(\n \"largestComponent\",\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_component) {\n // Perform a depth-first search to count the number of vertices in the\n // connected component containing vertex i.\n std::vector visited(N, false);\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n int v = stack.top();\n stack.pop();\n if (!visited[v]) {\n visited[v] = true;\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A(i, j) && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n },\n max_component);\n return 
max_component;\n}\n", + "\n\n // Initialize the visited array to false\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n\n // Initialize the component size array to 0\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize() = 0;\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited(i)) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n\n // Update the largest component size\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n return largestComponentSize;\n}\n", + "Kokok", + "\n\n // Create a vector to store the size of each component.\n Kokkos::View component_sizes(\"component_sizes\", N);\n\n // Initialize the component sizes to 1.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n component_sizes[i] = 1;\n });\n\n // Iterate over the adjacency matrix.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) {\n // If there is an edge between i and j, then they are in the same component.\n if (A(i, j) == 1) {\n // Find the smaller component size.\n int smaller_component_size = std::min(component_sizes[i], component_sizes[j]);\n\n // Update the component sizes.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int k) {\n if (component_sizes[k] == smaller_component_size) {\n component_sizes[k] = std::max(component_sizes[i], component_sizes[j]);\n }\n });\n }\n }\n });\n\n // Find the largest component size.\n int largest_component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &largest_component_size) {\n largest_component_size = std::max(largest_component_size, component_sizes[i]);\n });\n\n return largest_component_size;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component_size = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int &j, int &local_component_size) {\n if (A(i, j) == 1) {\n local_component_size += 1;\n }\n }, component_size);\n max_component_size = std::max(max_component_size, component_size);\n visited(i) = 1;\n }\n });\n return max_component_size;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t j, int &component_size) {\n if (visited(j) == 0 && A(i, j) == 1) {\n component_size++;\n visited(j) = 1;\n }\n });\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n int max_component = 0;\n Kokkos::parallel_reduce(\n \"Largest Component\",\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_component) {\n int component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n int current = q.front();\n q.pop();\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A(current, j) && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n },\n max_component);\n\n return max_component;\n}\n", + 
"\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited(i) == 0) {\n component_size += 1;\n visited(i) = 1;\n for (size_t j = 0; j < N; ++j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n }\n }\n });\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::TeamPolicy;\n using Kokkos::Work;\n\n // Initialize the visited array\n Kokkos::View visited(\"visited\", N, N);\n parallel_for(TeamPolicy(Kokkos::DefaultExecutionSpace(), N), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) { visited(i, j) = false; }\n });\n\n // Perform DFS on each unvisited node\n int max_component = 0;\n parallel_for(TeamPolicy(Kokkos::DefaultExecutionSpace(), N), KOKKOS_LAMBDA(const int i) {\n if (!visited(i, i)) {\n int component_size = 0;\n dfs(i, i, visited, A, component_size);\n max_component = std::max(max_component, component_size);\n }\n });\n\n return max_component;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::View componentSize(\"componentSize\", N);\n Kokkos::View componentId(\"componentId\", N);\n Kokkos::View queue(\"queue\", N);\n\n int maxComponentSize = 0;\n int componentCount = 0;\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (visited(i) == 0) {\n int start = 0;\n int end = 0;\n queue(end++) = i;\n visited(i) = 1;\n componentId(i) = componentCount;\n while (start != end) {\n int current = queue(start++);\n componentSize(componentId(current))++;\n for (int j = 0; j < N; j++) {\n if (A(current, j) == 1 && visited(j) == 0) {\n queue(end++) = j;\n visited(j) = 1;\n componentId(j) = componentCount;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize(componentCount));\n componentCount++;\n }\n });\n\n return maxComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n int max_component_size = 0;\n\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(Kokkos::View &A, size_t i, Kokkos::View &visited, int &component_size) {\n visited[i] = true;\n component_size++;\n\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(i, j) && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n\n // If the vertex has not been visited, perform a depth-first search to find the size of its component\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit in the stack\n while (!stack.empty()) {\n // Pop the next vertex from the stack\n size_t vertex = stack.top();\n stack.pop();\n\n // If the vertex has not been visited, mark it as visited and add it to the component\n if (!visited[vertex]) {\n visited[vertex] = true;\n componentSize++;\n\n // Push all of the vertex's neighbors onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the largest component size if the current component is larger\n if (componentSize > largestComponentSize) {\n largestComponentSize = componentSize;\n }\n\n // Update the component size array with the size of the current component\n componentSize[i] = componentSize;\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_component_size = 0;\n int component_size = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n #pragma omp parallel for reduction(max:max_component_size)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n delete[] visited;\n return max_component_size;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + 
"runs": null + }, + { + "generated_output": "\n\n // Create a vector to store the visited nodes.\n std::vector visited(N, false);\n\n // Create a vector to store the component sizes.\n std::vector componentSizes(N, 0);\n\n // Create a parallel region.\n #pragma omp parallel\n {\n // Loop over the nodes.\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, perform a depth-first search to find all the nodes in its component.\n if (! visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n while (! stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (! visited[node]) {\n componentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[node * N + j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n componentSizes[i] = componentSize;\n }\n }\n }\n\n // Find the largest component size.\n int largestComponentSize = *std::max_element(componentSizes.begin(), componentSizes.end());\n\n // Return the largest component size.\n return largestComponentSize;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, 0);\n int largest_component_size = 0;\n #pragma omp parallel\n {\n int local_largest_component_size = 0;\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n ++component_size;\n visited[vertex] = 1;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n local_largest_component_size = std::max(local_largest_component_size, component_size);\n }\n }\n #pragma omp critical\n largest_component_size = std::max(largest_component_size, local_largest_component_size);\n }\n return largest_component_size;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int* visited = new int[N];\n int* componentSize = new int[N];\n int maxComponentSize = 0;\n int componentCount = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) 
{\n int size = 0;\n dfs(A, i, visited, componentSize, size);\n componentCount++;\n maxComponentSize = std::max(maxComponentSize, size);\n }\n }\n delete[] visited;\n delete[] componentSize;\n return maxComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_size = 0;\n int max_component = -1;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = dfs(A, i, visited);\n if (size > max_size) {\n max_size = size;\n max_component = i;\n }\n }\n }\n delete[] visited;\n return max_size;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_component = 0;\n int visited[N];\n memset(visited, 0, sizeof(visited));\n\n #pragma omp parallel for reduction(max:max_component)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n\n return max_component;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": 
null, + "runs": null + }, + { + "generated_output": "\n\n int N = A.size();\n std::vector component(N, -1);\n int maxComponent = 0;\n int currentComponent = 0;\n #pragma omp parallel for reduction(max:maxComponent) reduction(+:currentComponent)\n for (int i = 0; i < N; i++) {\n if (component[i] == -1) {\n currentComponent = 0;\n dfs(A, i, component, currentComponent);\n maxComponent = std::max(maxComponent, currentComponent);\n }\n }\n return maxComponent;\n}\n\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_component_size = 0;\n int component_size = 0;\n int visited[N];\n memset(visited, 0, sizeof(visited));\n\n #pragma omp parallel for reduction(max: max_component_size)\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0) {\n component_size = 0;\n dfs(i, A, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(int i, std::vector const& A, int visited[], int& component_size) {\n visited[i] = 1;\n component_size++;\n for (int j = 0; j < A.size(); j++) {\n if (A[i][j] == 1 && visited[j] == 0) {\n dfs(j, A, visited, component_size);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n int max_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n visited[i] = true;\n component_size++;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for 
all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component containing the vertex\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // While the stack is not empty, pop a vertex from the stack and visit all its unvisited neighbors\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n componentSize++;\n\n // Iterate over all neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is unvisited and there is an edge between the vertex and the neighbor, push the neighbor onto the stack and mark it as visited\n if (!visited[j] && A[vertex][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Update the largest component size if the current component size is larger\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a vector to store the size of each component.\n std::vector componentSize(N, 0);\n\n // Create a vector to store the component ID for each vertex.\n std::vector componentID(N, -1);\n\n // Initialize the component ID for each vertex.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n componentID[i] = i;\n }\n\n // Iterate over the adjacency matrix.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n // If there is an edge between vertex i and vertex j, then they are in the same component.\n if (A[i][j] == 1) {\n // Find the component ID for vertex i.\n int iComponentID = componentID[i];\n\n // Find the component ID for vertex j.\n int jComponentID = componentID[j];\n\n // If the component IDs are different, then merge the two components.\n if (iComponentID != jComponentID) {\n // Find the smaller component ID.\n int smallerComponentID = std::min(iComponentID, jComponentID);\n\n // Find the larger component ID.\n int largerComponentID = std::max(iComponentID, jComponentID);\n\n // Update the component ID for each vertex in the smaller component.\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n if (componentID[k] == smallerComponentID) {\n componentID[k] = largerComponentID;\n }\n }\n\n // Update the size of the larger component.\n componentSize[largerComponentID] += componentSize[smallerComponentID];\n }\n }\n }\n }\n\n // Find the largest component.\n int largestComponentSize = 0;\n for (size_t i = 0; i < N; i++) {\n largestComponentSize = std::max(largestComponentSize, componentSize[i]);\n }\n\n // Return the size of the largest component.\n return largestComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the 
component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component containing the vertex\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // While there are vertices in the stack, pop a vertex and visit all its unvisited neighbors\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n componentSize++;\n\n // Iterate over all neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is unvisited and there is an edge between the vertex and the neighbor, push the neighbor onto the stack and mark it as visited\n if (!visited[j] && A[vertex][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Update the largest component size if the current component size is larger\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int nComponents = 0;\n std::vector visited(N, 0);\n std::vector componentSize(N, 0);\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int size = dfs(A, i, visited, componentSize);\n nComponents++;\n if (size > componentSize[0]) {\n componentSize[0] = size;\n }\n }\n }\n return componentSize[0];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "serial", + "prompt": "/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector parent(N);\n std::vector size(N);\n for (size_t i = 0; i < N; ++i) {\n parent[i] = i;\n size[i] = 1;\n }\n\n auto find = [&](int x) {\n if (parent[x] != x) {\n parent[x] = find(parent[x]);\n }\n return parent[x];\n };\n\n auto union_ = [&](int x, int y) {\n int rootX = find(x);\n int rootY = find(y);\n if (rootX != rootY) {\n if (size[rootX] < size[rootY]) {\n parent[rootX] = rootY;\n size[rootY] += size[rootX];\n } else {\n parent[rootY] = rootX;\n size[rootX] += size[rootY];\n }\n }\n };\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n if (A[i][j] == 1) {\n union_(i, j);\n }\n }\n }\n\n int maxSize = 0;\n for (size_t i = 0; i < N; ++i) {\n maxSize = std::max(maxSize, size[find(i)]);\n }\n\n return maxSize;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n std::vector parent(N);\n std::vector size(N);\n for (size_t i = 0; i < N; i++) {\n parent[i] = i;\n size[i] = 1;\n }\n\n auto find = [&](int x) {\n if (parent[x] != x) {\n parent[x] = find(parent[x]);\n }\n return parent[x];\n };\n\n auto union_ = [&](int a, int b) {\n int rootA = find(a);\n int rootB = find(b);\n if (rootA != rootB) {\n if (size[rootA] > size[rootB]) {\n parent[rootB] = rootA;\n size[rootA] += size[rootB];\n } else {\n parent[rootA] = rootB;\n size[rootB] += size[rootA];\n }\n }\n };\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i][j] == 1) {\n union_(i, j);\n }\n }\n }\n\n int maxSize = 0;\n for (size_t i = 0; i < N; i++) {\n maxSize = std::max(maxSize, size[find(i)]);\n }\n\n return maxSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int max_component_size = 0;\n\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n\n visited[i] = true;\n component_size++;\n\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; i++) {\n int size = 0;\n std::queue q;\n if (visited[i] == 0) {\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 0) {\n visited[v] = 1;\n size++;\n for (size_t j = 0; j < N; j++) {\n if (A[v][j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int 
component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int max_component_size = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component containing it\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\n// Perform a DFS to find the size of the component containing vertex i\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n // Mark the vertex as visited\n visited[i] = true;\n component_size++;\n\n // Iterate over all adjacent vertices\n for (size_t j = 0; j < A.size(); j++) {\n // If there is an edge between vertex i and vertex j and vertex j has not been visited, perform a DFS on vertex j\n if (A[i][j] == 1 && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = dfs(i, A, visited);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n\nint dfs(size_t i, std::vector const& A, std::vector& visited) {\n if (visited[i] != 0) {\n return 0;\n }\n visited[i] = 1;\n int component_size = 1;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n component_size += dfs(j, A, visited);\n }\n }\n return component_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int largest = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int size = dfs(A, i, visited);\n largest = std::max(largest, size);\n }\n }\n\n return largest;\n}\n", + "\n\n // Create a vector to store the size of each component.\n std::vector componentSize(N, 0);\n\n // Create a vector to store the component ID for each vertex.\n std::vector componentID(N, -1);\n\n // Initialize the component ID for each vertex.\n for (size_t i = 0; i < N; ++i) {\n componentID[i] = i;\n }\n\n // Iterate over each vertex.\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, perform a DFS to find all vertices in the same component.\n if (componentID[i] == i) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit in the stack, perform a DFS.\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n\n // If the vertex has not been visited, mark it as visited and add it to the stack.\n if (componentID[vertex] == i) {\n componentSize++;\n componentID[vertex] = i;\n\n // Iterate over the neighbors of the vertex.\n for (size_t j = 0; j < N; ++j) {\n // If the neighbor is connected to the vertex and has not been visited, add it to the stack.\n if (A[i][j] == 1 && componentID[j] == i) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the size of the component.\n componentSize[i] 
= componentSize;\n }\n }\n\n // Return the size of the largest component.\n return *std::max_element(componentSize.begin(), componentSize.end());\n}\n", + "\n\n std::vector visited(N, 0);\n int largest = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = 0;\n dfs(A, visited, i, size);\n largest = std::max(largest, size);\n }\n }\n return largest;\n}\n\nvoid dfs(std::vector const& A, std::vector& visited, size_t i, int& size) {\n if (visited[i] == 1) {\n return;\n }\n visited[i] = 1;\n size++;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] == 1) {\n dfs(A, visited, j, size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n component_size++;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n component_size++;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n dfs(A, i, visited, size);\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited,\n int& size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i * A.size() + j] == 1) {\n dfs(A, j, visited, size);\n }\n }\n}\n", + "\n\n // Perform DFS on each vertex to find all connected vertices\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++component_size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) {\n continue;\n }\n visited[u] = 1;\n size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (int i = 0; i < N; i++) {\n int size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 1) continue;\n visited[v] = 1;\n size++;\n for (int j = 0; j < N; j++) {\n if (A[v][j] == 1 && visited[j] == 0) q.push(j);\n }\n }\n max_size = std::max(max_size, size);\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n if (visited[current] == 0) {\n visited[current] = 1;\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1) {\n q.push(j);\n }\n }\n }\n 
}\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n ++size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++component_size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "hip", + "prompt": "/* Compute the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n Store the result in largestComponentSize.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void largestComponent(const int *A, size_t N, int *largestComponentSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n int count = 0;\n if (tid == 0) {\n count = 1;\n for (int i = 1; i < N; i++) {\n if (A[bid * N + i] == 1) {\n count++;\n }\n }\n }\n __syncthreads();\n if (tid == 0) {\n *largestComponentSize = count;\n }\n}\n", + "\n\n\tint tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint local_max = 0;\n\tint global_max = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[tid_y] == 0) {\n\t\tint count = 0;\n\t\tvisited[tid_y] = 1;\n\t\tcount++;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] != 0 && visited[i] == 0) {\n\t\t\t\tcount += dfs(A, N, i, visited);\n\t\t\t}\n\t\t}\n\t\tlocal_max = count;\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (local_max > global_max) {\n\t\t\t\tglobal_max = local_max;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = global_max;\n\t}\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tidy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tif (tidx >= N || tidy >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[tidx] == 0) {\n\t\tcount++;\n\t\tvisited[tidx] = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tidx * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcount += bfs(A, N, i, visited);\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tidx == 0 && tidy == 0) {\n\t\t*largestComponentSize = count;\n\t}\n}\n", + "\n\n\tint tidx = blockIdx.x + blockIdx.y * blockDim.x;\n\tint tidx_x = tidx % N;\n\tint tidx_y = tidx / N;\n\tint componentSize = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tidx * N + i] == 1) {\n\t\t\tcomponentSize++;\n\t\t}\n\t}\n\tatomicAdd(largestComponentSize, componentSize);\n}\n", + "\n\n int localComponentSize = 0;\n int globalComponentSize = 0;\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int start = tid;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (int i = start; i < N; i += stride) {\n if (A[i * N + i] == 0) {\n continue;\n }\n localComponentSize++;\n for (int j = i + 1; j < N; j++) {\n if (A[i * N + j] == 1) {\n localComponentSize++;\n }\n }\n }\n hipAtomicMax(&globalComponentSize, &localComponentSize);\n *largestComponentSize = globalComponentSize;\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tidy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tint size = 0;\n\tif (tidx < N && tidy < N) {\n\t\tif (A[tidx * N + tidy] == 1) {\n\t\t\tsize = 1;\n\t\t\tint stack[N];\n\t\t\tint top = 0;\n\t\t\tstack[top++] = tidx;\n\t\t\twhile (top > 0) {\n\t\t\t\tint v = stack[--top];\n\t\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\t\tif (A[v * N + i] == 1 && i != v) {\n\t\t\t\t\t\tif (i == tidy) {\n\t\t\t\t\t\t\tsize++;\n\t\t\t\t\t\t} else {\n\t\t\t\t\t\t\tstack[top++] = i;\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tint max = 0;\n\thipReduceBlockMax(&max, &size, 1);\n\tif (tidx == 0 && tidy == 0) 
{\n\t\t*largestComponentSize = max;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint blockId = hipBlockIdx_x;\n\tint numBlocks = hipGridDim_x;\n\tint numThreads = hipBlockDim_x * hipGridDim_x;\n\tint visited[N];\n\tint componentSize[N];\n\tint maxComponentSize = 0;\n\tint maxComponentId = -1;\n\tint componentId = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t\tcomponentSize[i] = 0;\n\t}\n\twhile (componentId < N) {\n\t\tint startVertex = -1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (!visited[i]) {\n\t\t\t\tstartVertex = i;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tif (startVertex == -1) {\n\t\t\tbreak;\n\t\t}\n\t\tint queue[N];\n\t\tint head = 0;\n\t\tint tail = 0;\n\t\tqueue[tail++] = startVertex;\n\t\tvisited[startVertex] = 1;\n\t\tcomponentSize[componentId] = 1;\n\t\twhile (head != tail) {\n\t\t\tint vertex = queue[head++];\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tif (A[vertex * N + i] && !visited[i]) {\n\t\t\t\t\tqueue[tail++] = i;\n\t\t\t\t\tvisited[i] = 1;\n\t\t\t\t\tcomponentSize[componentId]++;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tif (componentSize[componentId] > maxComponentSize) {\n\t\t\tmaxComponentSize = componentSize[componentId];\n\t\t\tmaxComponentId = componentId;\n\t\t}\n\t\tcomponentId++;\n\t}\n\t*largestComponentSize = maxComponentSize;\n}\n", + "\n\n int local_max = 0;\n int global_max = 0;\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (A[i * N + j] == 1) {\n local_max = 1;\n for (int k = 0; k < N; k++) {\n if (A[i * N + k] == 1) {\n local_max++;\n }\n if (A[k * N + j] == 1) {\n local_max++;\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx.x == 0 && threadIdx.y == 0) {\n for (int k = 0; k < N * N; k++) {\n if (local_max > global_max) {\n global_max = local_max;\n }\n local_max = 0;\n }\n *largestComponentSize = global_max;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint numThreads = hipGridDim_x * hipBlockDim_x;\n\tint visited[N];\n\tint componentSize = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[tid] == 0) {\n\t\tvisited[tid] = 1;\n\t\tcomponentSize = 1;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tif (A[tid * N + j] == 1 && visited[j] == 0) {\n\t\t\t\tcomponentSize += largestComponent(A, N, &visited[j]);\n\t\t\t}\n\t\t}\n\t}\n\tint maxComponentSize = 0;\n\thipMemcpy(&maxComponentSize, largestComponentSize, sizeof(int), hipMemcpyDeviceToHost);\n\tif (componentSize > maxComponentSize) {\n\t\t*largestComponentSize = componentSize;\n\t}\n\treturn;\n}\n", + "\n\n int *visited = (int *)malloc(N * N * sizeof(int));\n int *componentSize = (int *)malloc(N * N * sizeof(int));\n int maxComponentSize = 0;\n int componentId = 0;\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n visited[i * N + j] = 0;\n componentSize[i * N + j] = 0;\n }\n }\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (visited[i * N + j] == 0 && A[i * N + j] == 1) {\n componentId++;\n int currentComponentSize = 0;\n __syncthreads();\n dfs(A, i, j, N, visited, componentSize, ¤tComponentSize, componentId);\n __syncthreads();\n maxComponentSize = max(maxComponentSize, currentComponentSize);\n }\n }\n }\n\n *largestComponentSize = maxComponentSize;\n free(visited);\n free(componentSize);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) 
{\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = max(*largestComponentSize, count);\n\t}\n}\n", + "\n\n int myRow = blockIdx.x * blockDim.x + threadIdx.x;\n int myCol = blockIdx.y * blockDim.y + threadIdx.y;\n int myComponentSize = 0;\n if (myRow < N && myCol < N) {\n if (A[myRow * N + myCol] == 1) {\n myComponentSize = 1;\n for (int i = 0; i < N; i++) {\n if (A[myRow * N + i] == 1) {\n myComponentSize++;\n }\n if (A[i * N + myCol] == 1) {\n myComponentSize++;\n }\n }\n }\n }\n atomicMax(largestComponentSize, myComponentSize);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == 0) {\n *largestComponentSize = 0;\n }\n __shared__ int shared_component_size;\n shared_component_size = 0;\n bool visited = false;\n int component_size = 0;\n while (!visited) {\n visited = true;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (A[bid * N + i] == 1) {\n visited = false;\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n A[i * N + j] = 0;\n A[j * N + i] = 0;\n }\n }\n }\n }\n __syncthreads();\n }\n atomicAdd(largestComponentSize, component_size);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i, j, k, count = 0;\n\tif (tid < N) {\n\t\tbool visited[N] = {false};\n\t\tvisited[tid] = true;\n\t\tfor (i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1 && !visited[i]) {\n\t\t\t\tcount++;\n\t\t\t\tfor (j = 0; j < N; j++) {\n\t\t\t\t\tif (A[i * N + j] == 1) {\n\t\t\t\t\t\tfor (k = 0; k < N; k++) {\n\t\t\t\t\t\t\tif (A[j * N + k] == 1 && !visited[k]) {\n\t\t\t\t\t\t\t\tcount++;\n\t\t\t\t\t\t\t\tvisited[k] = true;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == 0) {\n *largestComponentSize = 0;\n }\n __syncthreads();\n int localComponentSize = 0;\n for (int i = 0; i < N; i++) {\n if (A[tid + i * N] == 1) {\n localComponentSize++;\n }\n }\n __syncthreads();\n if (tid == 0) {\n if (localComponentSize > *largestComponentSize) {\n *largestComponentSize = localComponentSize;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int size = 0;\n if (tid < N && tid_y < N) {\n if (A[tid * N + tid_y] == 1) {\n size = 1;\n int *visited = new int[N];\n for (int i = 0; i < N; i++) {\n visited[i] = 0;\n }\n visited[tid] = 1;\n int *stack = new int[N];\n stack[0] = tid;\n while (stack[0] != -1) {\n int v = stack[0];\n stack[0] = -1;\n for (int i = 0; i < N; i++) {\n if (A[v * N + i] == 1 && visited[i] == 0) {\n visited[i] = 1;\n size++;\n stack[0] = i;\n }\n }\n }\n delete[] visited;\n delete[] stack;\n }\n }\n atomicAdd(largestComponentSize, size);\n}\n", + "\n\n\tint myComponentSize = 0;\n\tint myId = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (A[N * myId + myId] == 1) {\n\t\tmyComponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) 
{\n\t\t\tif (A[N * myId + i] == 1 && i != myId) {\n\t\t\t\tmyComponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(&*largestComponentSize, myComponentSize);\n}\n", + "(", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_x;\n\tint componentSize = 0;\n\tif (tid == 0) {\n\t\tcomponentSize = 1;\n\t\tlargestComponentSize[bid] = 0;\n\t}\n\t__syncthreads();\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tif (A[bid * N + i] == 1) {\n\t\t\tatomicAdd(&componentSize, 1);\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tif (componentSize > largestComponentSize[bid]) {\n\t\t\tlargestComponentSize[bid] = componentSize;\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Perform a breadth-first search (BFS) from each vertex to find the connected components.\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the max_component_size across all ranks to find the global maximum.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "_t.", + "\n\n // Perform a breadth-first search from each vertex to find the size of each component.\n std::vector visited(N, false);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++componentSize;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Allreduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n std::queue q;\n int componentSize = 0;\n for (size_t i = 0; i < N; i++) {\n 
if (visited[i] == 0) {\n q.push(i);\n visited[i] = 1;\n componentSize = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize++;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest one\n int globalComponentSize;\n MPI_Reduce(&componentSize, &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(my_rank);\n visited[my_rank] = true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n local_component_size++;\n for (size_t v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return global_component_size;\n }\n return -1;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector component_size(num_ranks, 0);\n\n // Perform a DFS from each rank\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int local_component_size = 0;\n std::stack s;\n s.push(i);\n visited[i] = true;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n local_component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n visited[j] = true;\n }\n }\n }\n component_size[my_rank] = std::max(component_size[my_rank], local_component_size);\n }\n }\n\n // Reduce the component size array to find the largest component\n int global_component_size = 0;\n MPI_Reduce(&component_size[0], &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_max = 0;\n int global_max = 0;\n\n // Perform a DFS on the graph to find the 
size of the largest component\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n local_max = std::max(local_max, component_size);\n }\n }\n\n // Reduce the local maximum values to find the global maximum\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = true;\n my_component_size = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n ++my_component_size;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&my_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int max_component = 0;\n int my_component = 0;\n int* visited = new int[N];\n for (size_t i = 0; i < N; i++) {\n visited[i] = 0;\n }\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n my_component = 0;\n dfs(i, A, N, visited, my_component);\n max_component = std::max(max_component, my_component);\n }\n }\n delete[] visited;\n int global_max;\n MPI_Allreduce(&max_component, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n // Perform a breadth-first search from each vertex to find the connected components.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 0) {\n visited[v] = 1;\n componentSize++;\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Allreduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(rank);\n visited[rank] = true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n local_component_size++;\n for (int v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 
MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of vertices in its component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n local_component_size = std::max(local_component_size, dfs(A, i, visited));\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n\n// Depth-first search to compute the number of vertices in a component\nint dfs(std::vector const& A, size_t i, std::vector& visited) {\n if (visited[i]) {\n return 0;\n }\n\n visited[i] = true;\n int component_size = 1;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n component_size += dfs(A, j, visited);\n }\n }\n\n return component_size;\n}\n", + "_t.", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n int local_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_component_size = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n ++local_component_size;\n }\n }\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n int local_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_component_size = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n local_component_size++;\n }\n }\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its component\n std::vector visited(N, 0);\n std::queue q;\n int componentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n q.push(i);\n visited[i] = 1;\n componentSize = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize++;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest one\n int globalComponentSize;\n MPI_Reduce(&componentSize, 
&globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, false);\n int largest_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t v = stack.top();\n stack.pop();\n if (!visited[v]) {\n ++component_size;\n visited[v] = true;\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n largest_component_size = std::max(largest_component_size, component_size);\n }\n }\n\n // Reduce the largest component size across all ranks.\n int global_largest_component_size;\n MPI_Allreduce(&largest_component_size, &global_largest_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the largest component size\n int largestComponentSize = 0;\n\n // Loop over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "cuda", + "prompt": "/* Compute the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n Store the result in largestComponentSize.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void largestComponent(const int *A, size_t N, int *largestComponentSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myComponentSize = 0;\n int myId = blockIdx.x * blockDim.x + threadIdx.x;\n if (A[myId * N + myId] == 0) {\n *largestComponentSize = 0;\n return;\n }\n int *visited = new int[N];\n for (int i = 0; i < N; i++) { visited[i] = 0; }\n visited[myId] = 1;\n myComponentSize++;\n for (int i = 0; i < N; i++) {\n if (A[myId * N + i] == 1 && visited[i] == 0) {\n myComponentSize += dfs(A, N, visited, i);\n }\n }\n *largestComponentSize = max(*largestComponentSize, myComponentSize);\n}\n", + "\n\n int localComponentSize = 0;\n int globalComponentSize = 0;\n int threadIdx = threadIdx.x + blockIdx.x * blockDim.x;\n int threadIdy = threadIdx.y + blockIdx.y * blockDim.y;\n if (threadIdx < N && threadIdy < N) {\n if (A[threadIdx * N + threadIdy] == 1) {\n localComponentSize++;\n for (int i = 0; i < N; i++) {\n if (A[threadIdx * N + i] == 1) {\n localComponentSize++;\n }\n if (A[i * N + threadIdy] == 1) {\n localComponentSize++;\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx == 0 && threadIdy == 0) {\n globalComponentSize = localComponentSize;\n }\n for (int i = 1; i < N; i++) {\n if (localComponentSize > globalComponentSize) {\n globalComponentSize = localComponentSize;\n }\n localComponentSize = 0;\n if (threadIdx < N && threadIdy + i < N) {\n if (A[threadIdx * N + threadIdy + i] == 1) {\n localComponentSize++;\n for (int j = 0; j < N; j++) {\n if (A[threadIdx * N + j] == 1) {\n localComponentSize++;\n }\n if (A[(threadIdy + i) * N + j] == 1) {\n localComponentSize++;\n }\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx == 0 && threadIdy == 0) {\n if (localComponentSize > globalComponentSize) {\n globalComponentSize = localComponentSize;\n }\n *largestComponentSize = globalComponentSize;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int tid_row = tid / N;\n int tid_col = tid % N;\n int local_max = 0;\n int global_max = 0;\n\n // Count the number of adjacent vertices\n for (int i = 0; i < N; i++) {\n if (A[tid_row * N + i] == 1) {\n local_max++;\n }\n }\n\n // Find the maximum number of adjacent vertices across all threads\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (local_max > global_max) {\n global_max = local_max;\n }\n }\n\n *largestComponentSize = global_max;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tint count = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n // Each thread computes the size of its component\n int myComponentSize = 0;\n for (int i = 0; i < N; i++) {\n if (A[i * N + blockIdx.x] == 1) {\n myComponentSize++;\n }\n }\n\n // Reduce the component sizes across all threads\n __shared__ int sharedComponentSize[32];\n sharedComponentSize[threadIdx.x] = myComponentSize;\n __syncthreads();\n for (int i = 1; i < 32; i *= 2) {\n if (threadIdx.x % (2 * i) == 0) {\n sharedComponentSize[threadIdx.x] += sharedComponentSize[threadIdx.x + i];\n }\n __syncthreads();\n }\n\n // Store the largest component size in the output variable\n if (threadIdx.x == 0) {\n *largestComponentSize = sharedComponentSize[0];\n 
}\n}\n", + "\n\n\t// Initialize the largest component size to 0\n\t*largestComponentSize = 0;\n\n\t// Iterate over each vertex\n\tfor (int i = 0; i < N; i++) {\n\t\t// If the vertex is not visited, perform a depth-first search to find the size of the component\n\t\tif (A[i * N + i] == 0) {\n\t\t\tint componentSize = 0;\n\t\t\tdepthFirstSearch(A, N, i, &componentSize);\n\n\t\t\t// Update the largest component size if necessary\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\n\tint count = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcount = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*largestComponentSize = max(*largestComponentSize, count);\n\t}\n}\n", + "\n\n // Each thread checks if it is in the largest component.\n int myComponentSize = 0;\n for (int i = blockIdx.x; i < N; i += blockDim.x) {\n for (int j = blockIdx.y; j < N; j += blockDim.y) {\n if (A[i * N + j] == 1) {\n myComponentSize++;\n }\n }\n }\n\n // Each thread writes its component size to shared memory.\n __shared__ int sharedComponentSize[blockDim.x * blockDim.y];\n sharedComponentSize[threadIdx.x * blockDim.y + threadIdx.y] = myComponentSize;\n\n // Each thread block finds the maximum component size in shared memory.\n __syncthreads();\n int maxComponentSize = 0;\n for (int i = 0; i < blockDim.x * blockDim.y; i++) {\n maxComponentSize = max(maxComponentSize, sharedComponentSize[i]);\n }\n\n // The thread with the maximum component size writes it to global memory.\n if (threadIdx.x == 0 && threadIdx.y == 0) {\n *largestComponentSize = maxComponentSize;\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 0) {\n\t\treturn;\n\t}\n\tint componentSize = 0;\n\tint visited[N];\n\tfor (int k = 0; k < N; k++) {\n\t\tvisited[k] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[i] == 0) {\n\t\tcomponentSize = dfs(A, N, i, visited);\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tif (componentSize > *largestComponentSize) {\n\t\t\t*largestComponentSize = componentSize;\n\t\t}\n\t}\n}\n", + "\n\n int myComponentSize = 0;\n int myComponentId = threadIdx.x + blockIdx.x * blockDim.x;\n if (myComponentId < N) {\n bool visited[N];\n for (int i = 0; i < N; i++) { visited[i] = false; }\n visited[myComponentId] = true;\n for (int i = 0; i < N; i++) {\n if (A[myComponentId * N + i] && !visited[i]) {\n myComponentSize += 1;\n dfs(A, N, visited, i);\n }\n }\n }\n __syncthreads();\n int maxComponentSize = 0;\n for (int i = 0; i < N; i++) {\n maxComponentSize = max(maxComponentSize, myComponentSize);\n }\n *largestComponentSize = maxComponentSize;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\n\tint row = tid / N;\n\tint col = tid % N;\n\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\n\tint componentSize = 1;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\n\tvisited[row] = 1;\n\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] == 1 && visited[i] == 0) {\n\t\t\tcomponentSize += dfs(A, N, i, visited);\n\t\t}\n\t}\n\n\tif (componentSize > *largestComponentSize) {\n\t\t*largestComponentSize 
= componentSize;\n\t}\n}\n", + "\n\n\tint myRow = blockIdx.x * blockDim.x + threadIdx.x;\n\tint myCol = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (myRow < N && myCol < N) {\n\t\tif (A[myRow * N + myCol] == 1) {\n\t\t\tint componentSize = 0;\n\t\t\tint visited[N];\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tvisited[i] = 0;\n\t\t\t}\n\t\t\tdfs(myRow, myCol, A, N, visited, &componentSize);\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint componentSize = 0;\n\tint maxComponentSize = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tfor (int i = 0; i < N; i++) {\n\t\tif (visited[i] == 0) {\n\t\t\tcomponentSize = 0;\n\t\t\tdfs(A, i, visited, &componentSize);\n\t\t\tmaxComponentSize = max(maxComponentSize, componentSize);\n\t\t}\n\t}\n\t*largestComponentSize = maxComponentSize;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid_row == tid_col) {\n\t\tint componentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_row * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = max(*largestComponentSize, componentSize);\n\t}\n}\n", + "\n\n\tint myComponentSize = 0;\n\tint myId = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (A[N * myId + myId] == 1) {\n\t\tmyComponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[N * myId + i] == 1 && i != myId) {\n\t\t\t\tmyComponentSize += 1;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, myComponentSize);\n}\n", + "\n\n\tint threadIdx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint threadIdy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (threadIdx < N && threadIdy < N) {\n\t\tif (A[threadIdx * N + threadIdy] == 1) {\n\t\t\t// Found a 1 in the adjacency matrix.\n\t\t\t// Perform a breadth-first search to find the size of the component.\n\t\t\tint componentSize = 0;\n\t\t\tstd::queue q;\n\t\t\tq.push(threadIdx);\n\t\t\twhile (!q.empty()) {\n\t\t\t\tint v = q.front();\n\t\t\t\tq.pop();\n\t\t\t\tcomponentSize++;\n\t\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\t\tif (A[v * N + i] == 1 && i != v) {\n\t\t\t\t\t\tq.push(i);\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t\t// Update the largest component size if necessary.\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, componentSize);\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, componentSize);\n}\n", + "\n\n\tint localLargestComponentSize = 0;\n\tint globalLargestComponentSize = 0;\n\tint myRow = blockIdx.y * blockDim.y + threadIdx.y;\n\tint myCol = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (myRow < N && 
myCol < N) {\n\t\tif (A[myRow * N + myCol] == 1) {\n\t\t\tlocalLargestComponentSize++;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (threadIdx.x == 0 && threadIdx.y == 0) {\n\t\tfor (int i = 1; i < gridDim.x * gridDim.y; i++) {\n\t\t\tif (localLargestComponentSize < globalLargestComponentSize) {\n\t\t\t\tlocalLargestComponentSize = globalLargestComponentSize;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = localLargestComponentSize;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\t*largestComponentSize = max(*largestComponentSize, componentSize);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce to find the largest component size\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform DFS on each rank\n std::vector visited(N, false);\n int max_component_size = 0;\n #pragma omp parallel\n {\n int local_max_component_size = 0;\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n std::stack s;\n s.push(i);\n int component_size = 0;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n if (!visited[u]) {\n visited[u] = true;\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n }\n }\n }\n }\n local_max_component_size = std::max(local_max_component_size, component_size);\n }\n }\n #pragma omp critical\n {\n max_component_size = std::max(max_component_size, local_max_component_size);\n }\n }\n\n // Gather the maximum component size from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int recv_max_component_size;\n 
MPI_Recv(&recv_max_component_size, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n max_component_size = std::max(max_component_size, recv_max_component_size);\n }\n } else {\n MPI_Send(&max_component_size, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n\n return max_component_size;\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int largest_component_size;\n MPI_Reduce(&my_component_size, &largest_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find the connected components.\n std::vector visited(N, 0);\n std::vector component_size(N, 0);\n int max_component_size = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n int component_size = 0;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n component_size[i] = component_size;\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n MPI_Allreduce(&max_component_size, &max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n 
std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int largest_component_size;\n MPI_Reduce(&my_component_size, &largest_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the size of its own component\n int local_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_size = std::max(local_size, size);\n }\n }\n\n // Reduce the local sizes to find the global maximum\n int global_size;\n MPI_Reduce(&local_size, &global_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n // Perform a breadth-first search from each vertex to find all the vertices in the same component.\n int max_component_size = 0;\n #pragma omp parallel for reduction(max : max_component_size)\n for (size_t i = 0; i < N; ++i) {\n if (A[i][i] == 0) {\n continue;\n }\n\n // Initialize the queue with the current vertex.\n std::queue queue;\n queue.push(i);\n\n // Mark the current vertex as visited.\n A[i][i] = 0;\n\n // Perform a breadth-first search to find all the vertices in the same component.\n int component_size = 0;\n while (!queue.empty()) {\n size_t current_vertex = queue.front();\n queue.pop();\n component_size++;\n\n // Add all the unvisited neighbors of the current vertex to the queue.\n for (size_t j = 0; j < N; ++j) {\n if (A[current_vertex][j] == 1 && A[j][j] != 0) {\n queue.push(j);\n A[j][j] = 0;\n }\n }\n }\n\n max_component_size = std::max(max_component_size, component_size);\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n componentSize++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across 
all ranks.\n int globalMaxComponentSize;\n MPI_Reduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (rank == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(size, 0);\n\n // Perform a DFS on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int localComponentSize = 0;\n std::stack s;\n s.push(i);\n visited[i] = true;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n localComponentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n visited[j] = true;\n }\n }\n }\n componentSize[rank] = std::max(componentSize[rank], localComponentSize);\n }\n }\n\n // Reduce the component size array to find the largest component\n int globalComponentSize;\n MPI_Reduce(&componentSize[0], &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform a breadth-first search (BFS) from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n componentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Reduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (rank == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n // Perform a breadth-first search to find the largest component.\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the max_component_size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the result on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the number of vertices in its largest component\n int local_max = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int count = 
0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n count++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_max = std::max(local_max, count);\n }\n }\n\n // Reduce the local maxima to find the global maximum\n int global_max;\n MPI_Reduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the global maximum on rank 0\n if (my_rank == 0) {\n return global_max;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size\n int largestComponentSize = 0;\n\n // Loop over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n\n // Update the largest component size\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Reduce the largest component size to rank 0\n int globalLargestComponentSize;\n MPI_Reduce(&largestComponentSize, &globalLargestComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size\n return globalLargestComponentSize;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(N, 0);\n\n // Initialize the maximum component size\n int maxComponentSize = 0;\n\n // Loop over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited\n if (!visited[i]) {\n // Perform a depth-first search to find the size of the component containing vertex i\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n componentSize++;\n // Push all adjacent vertices onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the maximum component size\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Return the maximum component size\n return maxComponentSize;\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int max_component_size = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int 
my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int global_component_size;\n MPI_Reduce(&my_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n std::vector componentSize(N, 0);\n int largest = 0;\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n componentSize[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize[i]++;\n }\n }\n }\n largest = std::max(largest, componentSize[i]);\n }\n }\n\n int globalLargest = 0;\n MPI_Reduce(&largest, &globalLargest, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalLargest;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector component_size(num_ranks, 0);\n\n // Perform a parallel depth-first search to find the largest component\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int local_component_size = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n ++local_component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex * N + j] && !visited[j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n component_size[my_rank] = std::max(component_size[my_rank], local_component_size);\n }\n }\n\n // Reduce the component size array to find the largest 
component\n int global_component_size = 0;\n MPI_Reduce(component_size.data(), &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n\t Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n if (visited[i] == 0) {\n visited[i] = 1;\n Kokkos::View stack(\"stack\", N);\n size_t stack_size = 0;\n stack[stack_size++] = i;\n while (stack_size > 0) {\n size_t j = stack[--stack_size];\n for (size_t k = 0; k < N; k++) {\n if (A(j, k) == 1 && visited[k] == 0) {\n visited[k] = 1;\n stack[stack_size++] = k;\n }\n }\n }\n }\n });\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n count += (visited[i] == 1);\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n componentCount(A, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n dfs(A, visited, j, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::atomic_fetch_add(&visited(j), 1);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t k) {\n if (A(j, k) == 1 && visited(k) == 0) {\n Kokkos::atomic_fetch_add(&visited(k), 1);\n }\n });\n }\n });\n }\n }\n return count;\n}\n", + "\n\n KokConfigService::getSingleton().setNumThreads(N);\n KokConfigService::getSingleton().setParallelMode(KokConfigService::ParallelMode::OMP);\n KokConfigService::getSingleton().setOmpNumThreads(N);\n\n Kok::View counts(N);\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) { counts[i] = i; });\n\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) {\n for (size_t j = 0; j < N; j++) {\n if (A(i, j) == 1 && i != j) { counts[j] = counts[i]; }\n }\n });\n\n Kok::View unique_counts(N);\n 
Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) { unique_counts[i] = counts[i]; });\n Kok::sort(unique_counts);\n int num_components = 0;\n for (size_t i = 0; i < N; i++) {\n if (i == 0 || unique_counts[i] != unique_counts[i - 1]) { num_components++; }\n }\n return num_components;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n count++;\n dfs(A, i, visited);\n }\n }\n\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n visit(A, i, visited);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited[k] == 0) { visited[k] = 1; }\n });\n visited[j] = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n ++count;\n dfs(A, visited, i, count);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View labels(\"labels\", N);\n int num_components = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (labels(i) == 0) {\n int new_label = ++num_components;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &j) {\n if (A(i, j) == 1 && labels(j) == 0) {\n labels(j) = new_label;\n }\n });\n }\n });\n return num_components;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n dfs(A, visited, j, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n dfs(A, visited, i, N);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited[k] == 0) { visited[k] = 1; }\n });\n visited[j] = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View 
visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::atomic_fetch_add(&visited(j), 1);\n componentCount(A, j);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n componentCount(A, N, j);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1) visited(k) = 1;\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int nComponents = 0;\n std::vector visited(N, 0);\n\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n nComponents++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t curr = q.front();\n q.pop();\n if (visited[curr] == 0) {\n visited[curr] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[curr][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return nComponents;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int num_components = 0;\n std::vector visited(N, 0);\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) continue;\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.08931767921895, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.144359401986, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] 
+ }, + { + "generated_output": "\n\n int count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n count++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.08949036058038, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.10019383933395, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n bool visited = false;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n if (!visited) {\n count++;\n visited = true;\n }\n break;\n }\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t curr = q.front();\n q.pop();\n if (visited[curr] == 0) {\n visited[curr] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[curr][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize the visited array to false for all nodes\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all nodes\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all nodes in the component\n dfs(A, i, visited);\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int* visited = new int[N];\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n delete[] visited;\n return count;\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.08955382201821, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01709460448474, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int nComponents = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n nComponents++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n delete[] visited;\n return nComponents;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n count++;\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.08972018510103, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.10034086629748, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int num_components = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n visited[current] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[current * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n delete[] visited;\n return num_components;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int num_components = 0;\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n bool is_new_component = true;\n for (size_t j = 0; j < i; j++) {\n if (A[i * N + j] == 1) {\n is_new_component = false;\n break;\n }\n }\n if (is_new_component) { num_components++; }\n }\n return num_components;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 
1 + } + ] + }, + { + "generated_output": "\n\n int num_components = 0;\n std::vector visited(N, 0);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) { q.push(j); }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.08923038467765, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.14020254351199, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t v = stack.top();\n stack.pop();\n if (!visited[v]) {\n visited[v] = true;\n // Push all the unvisited neighbors of v onto the stack\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n num_components++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.08951319530606, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.09943021275103, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int num_components = 0;\n std::vector visited(N, 0);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n num_components++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": 
true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0892290905118, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.09972659870982, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Initialize the visited array to false for all nodes\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all nodes\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over all neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the number of connected components\n return componentCount;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "_", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.08963461536914, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.14084874503314, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int num_components = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n#pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t cur = q.front();\n q.pop();\n if (visited[cur] == 0) {\n visited[cur] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[cur][j] == 1) { q.push(j); }\n }\n }\n }\n }\n }\n delete[] visited;\n return num_components;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int count = 0;\n std::vector visited(N, false);\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n dfs(A, visited, i);\n count++;\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "serial", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(i, A, visited);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(size_t i, std::vector const& A, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(j, A, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, visited, i, N);\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, visited, i);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, std::vector& visited, size_t i) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, visited, j);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + ":", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, 
false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(i, A, visited);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(size_t i, std::vector const& A, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(j, A, visited);\n }\n }\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, visited, i);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n dfs(A, i, visited);\n count++;\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "hip", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void componentCount(const int *A, size_t N, int *numComponents) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tif (count == 0) {\n\t\tcount = 1;\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n\tint tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\tif (tid >= N || tid_y >= N) {\n\t\treturn;\n\t}\n\tint component_id = tid;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tcomponent_id = min(component_id, i);\n\t\t}\n\t}\n\tatomicAdd(&numComponents[component_id], 1);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tint root = tid;\n\twhile (A[root * N + bid] != root) {\n\t\troot = A[root * N + bid];\n\t}\n\tif (tid == 0) {\n\t\tA[bid * N + bid] = root;\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint numThreads = hipGridDim_x * hipBlockDim_x;\n\tint label = tid;\n\tint numLabels = numThreads;\n\tbool changed = true;\n\twhile (changed) {\n\t\tchanged = false;\n\t\tfor (int i = tid; i < N; i += numThreads) {\n\t\t\tfor (int j = 0; j < N; j++) {\n\t\t\t\tif (A[i * N + j] && label > j) {\n\t\t\t\t\tlabel = j;\n\t\t\t\t\tchanged = true;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t\tif (tid == 0) {\n\t\t\tif (label < numLabels) {\n\t\t\t\tnumLabels = label;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n\t*numComponents = numLabels;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] != 0) {\n count++;\n for (int j = i + 1; j < N; j++) {\n if (A[tid * N + j] != 0) {\n count++;\n }\n }\n break;\n }\n }\n atomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint component = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcomponent = 1;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tatomicAdd(numComponents, component);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\tif (A[i * N + j] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N || tid_y >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tbool visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = false;\n\t}\n\tvisited[tid] = true;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N 
+ i] == 1 && !visited[i]) {\n\t\t\tcount++;\n\t\t\t__syncblocks();\n\t\t\tDFS(A, i, visited, N);\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx.x + hipBlockIdx.x * hipBlockDim.x;\n\tint bid = hipBlockIdx.y + hipBlockIdx.z * hipGridDim.z;\n\tint i = tid + bid * hipBlockDim.x * hipGridDim.x;\n\tint count = 0;\n\twhile (i < N * N) {\n\t\tif (A[i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t\ti += N * N;\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint id = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (id >= N) return;\n\tint count = 0;\n\tbool visited[N];\n\tmemset(visited, 0, sizeof(visited));\n\tfor (int i = 0; i < N; i++) {\n\t if (visited[i]) continue;\n\t visited[i] = true;\n\t count++;\n\t for (int j = 0; j < N; j++) {\n\t if (A[i * N + j] == 0) continue;\n\t visited[j] = true;\n\t }\n\t}\n\tnumComponents[id] = count;\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tint count = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tint n = hipBlockDim_x * hipGridDim_x;\n\tint count = 0;\n\tif (tid == 0) {\n\t\tcount = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*numComponents = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The component count is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n int global_count = 0;\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n local_count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM);\n if (rank == 0) {\n return global_count;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its own subset of rows.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N];\n std::fill(visited, visited + N, false);\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && !visited[j]) {\n ++localCount;\n std::queue q;\n q.push(j);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n for (size_t v = 0; v < N; ++v) {\n if (A[u * N + v] && !visited[v]) {\n q.push(v);\n }\n }\n }\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int component_count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n component_count++;\n }\n }\n\n int global_component_count;\n MPI_Reduce(&component_count, &global_component_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_component_count;\n }\n return 0;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit\n while (!stack.empty()) {\n // Pop the top vertex from the stack\n size_t vertex = stack.top();\n stack.pop();\n\n // Mark the vertex as visited\n visited[vertex] = true;\n\n // Iterate over all adjacent vertices\n for (size_t j = 0; j < N; j++) {\n // If the adjacent vertex is connected and has not been visited, push it onto the stack\n if (A[vertex][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the 
visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n count++;\n std::queue q;\n q.push(i);\n\n // Perform a breadth-first search to visit all nodes in the component\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n\n // Iterate over the neighbors of u\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all the unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of connected components in its portion of the graph.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n localCount++;\n break;\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n 
MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n count++;\n while (!q.empty()) {\n int cur = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[cur * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) return global_count;\n return -1;\n}\n", + "\n\n int n_components = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n n_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n\n int global_n_components;\n MPI_Allreduce(&n_components, &global_n_components, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_n_components;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n\n int globalCount;\n MPI_Reduce(&count, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then push the neighbor onto the stack\n if (!visited[j] && A[i][j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Count the number of connected 
components\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) continue;\n visited[u] = true;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its local rows\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited = false;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] && !visited) {\n localCount++;\n visited = true;\n }\n }\n }\n\n // Reduce the local counts to get the global count\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t u = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[u] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then push the neighbor onto the stack\n if (!visited[j] && A[i][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n // Iterate over the neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its local part of the graph\n int local_count = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N] = {false};\n for (size_t j = 0; j < N; j++) {\n if (!visited[j] && A[i][j] == 1) {\n // Found a new component\n local_count++;\n std::queue q;\n q.push(j);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n for (size_t v = 0; v < N; v++) {\n if (!visited[v] && A[u][v] == 1) {\n q.push(v);\n }\n }\n }\n }\n }\n }\n\n // Reduce the local counts to get the global count\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is connected and has not been visited, then add it to the stack\n if (A[node][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of connected components in its own subset of rows.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N] = {false};\n for (size_t j = 0; j < N; j++) {\n if (!visited[j] && A[i][j] == 1) {\n localCount++;\n dfs(A, visited, j, 
N);\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "cuda", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void componentCount(const int *A, size_t N, int *numComponents) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tint row = tid / N;\n\tint col = tid % N;\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\tatomicAdd(numComponents, 1);\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] != 0) {\n\t\t\tA[row * N + i] = 0;\n\t\t\tA[i * N + row] = 0;\n\t\t\tcomponentCount<<<1, N>>>(A, N, numComponents);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = -1;\n\t\t} else {\n\t\t\tnumComponents[tid] = tid;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && numComponents[tid] != -1) {\n\t\t\tnumComponents[k] = numComponents[tid];\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid < N && tid_y < N) {\n\t if (A[tid * N + tid_y] == 1) {\n\t atomicAdd(numComponents, 1);\n\t }\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tint row = tid / N;\n\tint col = tid % N;\n\tif (A[tid] == 0) {\n\t\tnumComponents[tid] = 0;\n\t\treturn;\n\t}\n\tint component = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] == 1) {\n\t\t\tcomponent = i;\n\t\t\tbreak;\n\t\t}\n\t}\n\tnumComponents[tid] = component;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tbool visited = false;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tvisited = true;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tif (visited) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\t// Each thread checks if the current element is non-zero.\n\t// If so, it increments the component count.\n\tint row = blockIdx.x;\n\tint col = blockIdx.y;\n\tif (A[row * N + col] != 0) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) 
{\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint bid = blockIdx.y * blockDim.y + blockIdx.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[bid] == 0) {\n\t\tvisited[bid] = 1;\n\t\t(*numComponents)++;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcomponentCount(A, N, numComponents);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = 0;\n\t\t} else {\n\t\t\tnumComponents[tid] = 1;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tint numComponents_local = numComponents[tid];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && numComponents_local != -1) {\n\t\t\tnumComponents[k] = numComponents_local;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (i == 0) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (numComponents[k] != -1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t*numComponents = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint bid = blockIdx.y * blockDim.y + blockIdx.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[bid] == 0) {\n\t\tvisited[bid] = 1;\n\t\t*numComponents += 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcomponentCount(A, N, numComponents);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tidx = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tidy = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tidx >= N || tidy >= N) {\n\t\treturn;\n\t}\n\tif (A[tidx * N + tidy] == 0) {\n\t\treturn;\n\t}\n\tatomicAdd(numComponents, 1);\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[tid] == 0) {\n\t\tvisited[tid] = 1;\n\t\t__syncthreads();\n\t\tint count = 1;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tif (A[tid * N + j] == 1 && visited[j] == 0) {\n\t\t\t\tvisited[j] = 1;\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t\tatomicAdd(numComponents, count);\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\tint component = tid;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tcomponent = min(component, i);\n\t\t}\n\t}\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tA[tid_y * N + i] = component;\n\t\t}\n\t}\n\tif (tid == component) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid 
% N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = 0;\n\t\t} else {\n\t\t\tnumComponents[tid] = 1;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tint numComponents_local = numComponents[tid];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1) {\n\t\t\tnumComponents_local = min(numComponents_local, numComponents[k * N + j]);\n\t\t}\n\t}\n\tnumComponents[tid] = numComponents_local;\n\t__syncthreads();\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tcount += (numComponents[k * N + j] == numComponents_local);\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid_row == tid_col) {\n\t\tif (A[tid] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j && A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tidx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tidy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (tidx >= N || tidx >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tidx * N + i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The component count is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n local_count++;\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+: count)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top vertex from the stack\n size_t v = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and is connected to the vertex, add it to the stack\n if (!visited[j] && A[i][j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n count++;\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a visited array to keep track of which nodes have been visited\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over all the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then add the neighbor to the stack\n if (!visited[j] && A[node * N + j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) 
{\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int count = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (visited[u] == 0) {\n count++;\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) { q.push(j); }\n }\n }\n }\n local_count += count > 0;\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) { return global_count; }\n return -1;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If the neighbor has not been visited and is connected to the node, then visit the neighbor\n if (!visited[j] && A[i][j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n count++;\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n 
visited[node] = true;\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Visit all the unvisited neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && !visited[j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n MPI_Reduce(&componentCount, &componentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a DFS to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // Perform a DFS to visit all connected nodes\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the 
neighbor has not been visited, push the neighbor onto the stack and mark it as visited\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n while (!stack.empty()) {\n // Get the current node\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the current node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and is connected to the current node, then add it to the stack\n if (!visited[j] && A[i][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n MPI_Reduce(&componentCount, nullptr, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return componentCount;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n int global_count = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n int my_start = rank * (N / size);\n int my_end = (rank + 1) * (N / size);\n if (rank == size - 1) {\n my_end = N;\n }\n #pragma omp parallel for reduction(+ : local_count)\n for (int i = my_start; i < my_end; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_count++;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n delete[] visited;\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[j] = true;\n\n // Push all the unvisited neighbors of the node onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[i * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not 
been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all the neighbors of the node onto the stack\n for (size_t j = 0; j < N; ++j) {\n if (A[node * N + j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n // Push all the unvisited neighbors of the vertex onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a 
depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n count++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; k++) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count across all ranks\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the connected components for a subset of the rows\n int start_row = rank * (N / size);\n int end_row = (rank + 1) * (N / size);\n if (rank == size - 1) {\n end_row = N;\n }\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Count the number of connected components\n int count = 0;\n for (int i = start_row; i < end_row; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n\n // Reduce the count across all ranks\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "&" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A represents a directed graph.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) { count += A(i, j); });\n });\n return count;\n}\n", + "\n\n int edge_count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { edge_count += A(i, j); }\n });\n return edge_count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(\"edgeCount\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) {\n count += A(i, j);\n }\n });\n return count;\n}\n", + "\n\n Kokkos::View edgeCount(\"edgeCount\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n edgeCount(i) = 0;\n for (size_t j = 0; j < N; ++j) { edgeCount(i) += A(i, j); }\n });\n int totalEdgeCount = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &total) { total += edgeCount(i); }, totalEdgeCount);\n return totalEdgeCount;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, int &edgeCount) {\n for (size_t j = 0; j < N; ++j) {\n edgeCount += A(i, j);\n }\n },\n edgeCount);\n return edgeCount;\n}\n", + "\n\n int edge_count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n if (A(i, j) > 0) {\n ++edge_count;\n }\n }\n });\n return edge_count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n 
Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) { count += A(i, j); });\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n count += A(i, j);\n }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA (const int i) {\n for (int j = 0; j < N; j++) {\n if (A(i, j) != 0) {\n count++;\n }\n }\n });\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00423349943012, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00416376385838, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00292459595948, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027071505785, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00276780296117, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00217941123992, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00213479977101, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00344218779355, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00429894179106, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00422119256109, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00291081834584, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00271411202848, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00274999327958, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00215680114925, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00216404087842, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00343047361821, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n 
}\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00426848717034, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00421437695622, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00293980631977, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00273877978325, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00277874562889, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00215497333556, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00215052962303, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00430681873113, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) { count += A[i * N + j] > 0; }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00448003727943, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00427159648389, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00299758110195, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00273199919611, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00277120918036, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00213718637824, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00213058330119, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00347100384533, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00429163854569, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00421193446964, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00291666686535, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027300035581, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00276776459068, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00214414298534, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00298492144793, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00339787695557, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00429684408009, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00414554141462, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00290999598801, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027257617563, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00277486220002, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00214554984123, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00215701963753, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00342730619013, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00432369280607, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00416288599372, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00289887990803, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00272390972823, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00275621134788, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00214624349028, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021581735462, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00336375441402, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int edge_count = 0;\n#pragma omp parallel for reduction(+ : edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n edge_count += A[i * N + j];\n }\n }\n return edge_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00423235110939, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0041534345597, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00290258470923, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00272550135851, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00277947634459, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021658398211, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00216392222792, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00361082907766, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n#pragma opm parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": 
true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00423574503511, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00451296102256, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00449684262276, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00449953749776, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0044874412939, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00450438447297, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00453400686383, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00449059065431, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00429217386991, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00418770518154, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00289191864431, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00271398741752, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00277380403131, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00218392368406, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00215140450746, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00362733211368, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00431434530765, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00417644586414, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00293902885169, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00273536983877, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00277463011444, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00214805044234, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00215413197875, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00338949710131, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 
0.00431520044804, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00415521617979, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029127875343, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027299458161, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00278591308743, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00214461591095, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00215598121285, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00342103466392, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0042575025931, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00420114826411, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00292729828507, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00271924361587, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00277035553008, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00218303650618, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00215348303318, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00336193479598, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00426961090416, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00416847672313, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00293571650982, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00272772181779, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00280453991145, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00213359687477, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00215147286654, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00346750579774, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00429778620601, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0041976775974, + 
"num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00292332153767, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00271519422531, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00277492441237, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00218011643738, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00215441249311, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00338593218476, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i][j] > 0) {\n count++;\n }\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00430511813611, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00418523233384, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00292744729668, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00271134693176, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00276079773903, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00215847212821, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00216012150049, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00342394579202, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00430247727782, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00422976389527, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00288438051939, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027429100126, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00277064628899, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00221510231495, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00216936841607, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00335274375975, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n 
count += A[i * N + j];\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00428424850106, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00417304169387, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00295206587762, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00268736742437, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00277533885092, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00219602305442, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00218629036099, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00346850119531, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00430796314031, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00424095708877, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00289333891124, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00271685291082, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00277206692845, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00218219254166, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00216908026487, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00333102103323, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "serial", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A represents a directed graph.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": 
"cpp", + "name": "15_graph_edge_count", + "parallelism_model": "hip", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n Store the result in numEdges. A represents a directed graph.\n A is an NxN adjacency matrix stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void edgeCount(const int *A, size_t N, int *numEdges) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int count = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n hipAtomicAdd(numEdges, count);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[i * N + j];\n }\n }\n __syncthreads();\n atomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint numEdgesLocal = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tnumEdgesLocal += A[tid * N + i];\n\t}\n\thipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[i * N + j] > 0;\n }\n }\n atomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n int tid = threadIdx.x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += (A[tid * N + i] != 0);\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n for (int j = 0; j < N; j++) {\n if (A[tid * N + j] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t j = 0; j 
< N; j++) {\n numEdgesPerThread += A[tid * N + j];\n }\n hipAtomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int row = tid;\n int col = tid;\n int count = 0;\n while (row < N) {\n count += A[row * N + col];\n row += hipBlockDim_x;\n col += hipBlockDim_x;\n }\n hipAtomicAdd(numEdges, count);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for 
(size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + ", \u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2 Saxe\u51a2 Saxe\u51a2\u51a2\u51a2 Saxe\u51a2\u51a2\u51a2\u51a2", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return 
global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] != 0) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "cuda", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n Store the result in numEdges. A represents a directed graph.\n A is an NxN adjacency matrix stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void edgeCount(const int *A, size_t N, int *numEdges) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x;\n int count = 0;\n for (int col = 0; col < N; col++) {\n count += A[row * N + col];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int rowStart = blockIdx.x * blockDim.x + threadIdx.x;\n if (rowStart >= N) {\n return;\n }\n int rowEnd = rowStart + blockDim.x;\n if (rowEnd > N) {\n rowEnd = N;\n }\n int count = 0;\n for (int i = rowStart; i < rowEnd; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] != 0) {\n count++;\n }\n }\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int j = 0; j < N; j++) {\n if (A[tid * N + j] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\nint rowStart = blockIdx.x * blockDim.x + threadIdx.x;\nif (rowStart >= N) {\n return;\n}\nint rowEnd = rowStart + blockDim.x;\nif (rowEnd > N) {\n rowEnd = N;\n}\nint count = 0;\nfor (int i = rowStart; i < rowEnd; ++i) {\n for (int j 
= 0; j < N; ++j) {\n count += A[i * N + j];\n }\n}\natomicAdd(numEdges, count);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; ++col) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row < N && col < N) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] == 1) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row >= N) {\n return;\n }\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; ++col) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma parallel for reduction (+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Allgather(&local_count, 1, MPI_INT, &global_count, 1, MPI_INT, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n int nthreads;\n omp_set_num_threads(nthreads);\n int local_count = 0;\n int global_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (myrank == 0) {\n return global_count;\n } else {\n return 0;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n 
local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n int global_count;\n\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "\n\n int local_edge_count = 0;\n int global_edge_count;\n #pragma omp parallel for reduction(+: local_edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_edge_count += A[i * N + j];\n }\n }\n MPI_Reduce(&local_edge_count, &global_edge_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_edge_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_edge_count = 0;\n #pragma omp parallel for reduction(+: local_edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_edge_count += A[i * N + j];\n }\n }\n int global_edge_count;\n MPI_Reduce(&local_edge_count, &global_edge_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_edge_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n 
int global_count = 0;\n #pragma omp parallel for reduction(+: local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max = 0;\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; ++i) {\n // Iterate over the columns of the adjacency matrix\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between node i and node j, increment the degree of node i\n if (A(i, j) == 1) {\n ++max;\n }\n }\n }\n // Return the maximum degree\n return max;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n Kokkos::View maxDegree(\"maxDegree\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n maxDegree(i) = degree;\n });\n return *Kokkos::max_element(maxDegree);\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &max_degree) {\n int degree = 0;\n for (int j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n max_degree 
= Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A(i, j);\n }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &maxDegree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n maxDegree = Kokkos::max(maxDegree, degree);\n });\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::View;\n\n int max_degree = 0;\n Kokkos::parallel_reduce(\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_degree) {\n int degree = 0;\n for (int j = 0; j < N; j++) {\n degree += A(i, j);\n }\n max_degree = std::max(max_degree, degree);\n },\n max_degree);\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n if (degree > max_degree) { max_degree = degree; }\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n Kokkos::View maxDegree(\"maxDegree\", N);\n Kokkos::parallel_for(\"maxDegree\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A(i, j);\n }\n maxDegree(i) = degree;\n });\n int max = 0;\n for (size_t i = 0; i < N; i++) {\n if (maxDegree(i) > max) {\n max = maxDegree(i);\n }\n }\n return max;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00412927679718, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00412902273238, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00281859058887, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026433667168, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00272382367402, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00216896608472, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00213350690901, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00334603507072, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00399009287357, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00413371920586, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00283229798079, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00266654230654, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00273865368217, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00216748230159, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00215195585042, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00332680102438, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00401993729174, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00409957133234, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00284758862108, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.00264323428273, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00273580662906, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00219526104629, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002165132761, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032213781029, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00412070099264, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00414710268378, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00287401434034, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00265420675278, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00279228258878, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00218792110682, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00217876266688, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00327330045402, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00401461292058, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00414881370962, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00283449627459, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00265109948814, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00271850787103, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00218609198928, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021405024454, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00326416529715, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00411307234317, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00417701881379, + "num_threads": 1 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.00287111513317, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00267060156912, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00272989552468, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00217838156968, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00217087958008, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00333302654326, + "num_threads": 64 + } + ] + }, + { + "generated_output": "_t>", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00420012399554, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00419129580259, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00284748263657, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00265456400812, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00273920893669, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00218545868993, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00216424297541, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00327341835946, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00410019606352, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00417603272945, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00286766663194, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026662318036, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00274531859905, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00217591095716, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00214891247451, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00337371546775, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n 
max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00417388919741, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00417595487088, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00288706943393, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026616929099, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00273708086461, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00217423606664, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00213819164783, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00348344761878, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00416679792106, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00418065171689, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00287675783038, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00266832560301, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00273584648967, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00218318067491, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00216208696365, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.003369140625, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00405589044094, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00428075809032, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00285700671375, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00267317648977, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00273903440684, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00216225124896, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00214324537665, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00334743596613, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n#pragma omp parallel for 
reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00423501636833, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00423833243549, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029171237722, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00267740450799, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00275036003441, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00215827990323, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00213396772742, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00321407690644, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00424102302641, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00433239806443, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00288438815624, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026725076139, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00274106450379, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00216942429543, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00214556548744, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032675312832, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00402926281095, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00423122905195, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00286222659051, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00265824049711, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00272749718279, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00217298101634, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00216333139688, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.0032699983567, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00420486908406, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00420260485262, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00288643594831, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00264678150415, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00274016503245, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00217918083072, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00215568672866, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00344577617943, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00413952246308, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00413979068398, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00284016728401, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00265624113381, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00274337101728, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00220981072634, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00215761847794, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00374460332096, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00405959002674, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00416175443679, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00283620730042, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00267280675471, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00273290332407, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00216493252665, + "num_threads": 16 + }, + { 
+ "did_run": true, + "is_valid": true, + "runtime": 0.00213869288564, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00340706594288, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00416233707219, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00416388195008, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00287369601429, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00264763664454, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00273730531335, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00218426454812, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00218190215528, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00366863794625, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00415628980845, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00426331907511, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00284635461867, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027023056522, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00273496247828, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021799987182, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00215870086104, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00343301482499, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "serial", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_deg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n max_deg = std::max(max_deg, deg);\n }\n return max_deg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) { deg += A[i * N + j]; }\n maxDeg = std::max(maxDeg, deg);\n }\n return maxDeg;\n}\n", + "\n\n int max_deg = 0;\n for (int i = 0; i < N; i++) {\n int deg = 0;\n for (int j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n max_deg = std::max(max_deg, deg);\n }\n return max_deg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i][j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int 
max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) { deg += A[i * N + j]; }\n maxDeg = std::max(maxDeg, deg);\n }\n return maxDeg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "hip", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Store the result in maxDegree.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void maxDegree(const int *A, size_t N, int *maxDegree) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n localMax = max(localMax, *maxDegree);\n if (threadIdx.x == 0) {\n *maxDegree = localMax;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n atomicMax(&(*maxDegree), max);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n hipAtomicMax(&(*maxDegree), degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n hipAtomicMax(maxDegree, degree);\n}\n", + "\n\n int myDegree = 0;\n for (int i = blockIdx.x * blockDim.x + 
threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; ++i) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint row = bid * hipBlockDim_x + tid;\n\tint degree = 0;\n\tif (row < N) {\n\t\tfor (int col = 0; col < N; col++) {\n\t\t\tdegree += A[row * N + col];\n\t\t}\n\t}\n\thipAtomicMax(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n *maxDegree = hipAtomicMax(*maxDegree, degree);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n }\n int max_degree;\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "\n\n int my_max = 0;\n for (size_t i = 0; i < N; i++) {\n int row_sum = 0;\n for (size_t j = 0; j < N; j++) {\n row_sum += A[i * N + j];\n }\n my_max = std::max(my_max, row_sum);\n }\n\n int global_max;\n MPI_Allreduce(&my_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int rowSum = 0;\n for (size_t j = 0; j < N; j++) { rowSum += A[i * N + j]; }\n myDegree = std::max(myDegree, rowSum);\n }\n\n int globalDegree;\n MPI_Allreduce(&myDegree, &globalDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] != 0) {\n myDegree++;\n }\n }\n }\n int maxDegree = 0;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n maxDeg = std::max(maxDeg, deg);\n }\n int globalMaxDeg;\n MPI_Allreduce(&maxDeg, &globalMaxDeg, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDeg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n myDegree++;\n }\n }\n }\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int local_max = 0;\n for (size_t i = 0; i < N; i++) {\n int local_degree = 0;\n for (size_t j = 0; j < N; j++) {\n local_degree += A[i * N + j];\n }\n local_max = std::max(local_max, local_degree);\n }\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n if (degree > maxDegree) {\n maxDegree = degree;\n 
}\n }\n return maxDegree;\n}\n", + "\n\n int local_max = 0;\n for (size_t i = 0; i < N; i++) {\n int row_sum = 0;\n for (size_t j = 0; j < N; j++) {\n row_sum += A[i * N + j];\n }\n local_max = std::max(local_max, row_sum);\n }\n\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int my_max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n my_max_degree = std::max(my_max_degree, degree);\n }\n\n int max_degree;\n MPI_Allreduce(&my_max_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "\n\n int my_degree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n\n int max_degree;\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "Degree)", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n myDegree += A[i * N + j];\n }\n }\n\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n\n int globalMaxDegree;\n MPI_Allreduce(&myDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n myDegree += A[i * N + j];\n }\n }\n\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n degree++;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, °ree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n }\n return maxDegree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "cuda", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph. \n Store the result in maxDegree.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void maxDegree(const int *A, size_t N, int *maxDegree) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n if (localMax > *maxDegree) {\n *maxDegree = localMax;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n if (tid == 0) {\n *maxDegree = max;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, myDegree);\n}\n", + "\n\n int tid = threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n maxDegree[tid] = max;\n}\n", + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = max(*maxDegree, localMax);\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint max = 0;\n\tif (tid < N) {\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tmax += A[tid * N + i];\n\t\t}\n\t}\n\tmaxDegree[tid] = max;\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tint max_degree = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tmax_degree = max(max_degree, maxDegree[i]);\n\t\t}\n\t\t*maxDegree = max_degree;\n\t}\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, 
myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = myDegree;\n for (size_t i = 1; i < gridDim.x; i++) {\n int otherDegree;\n cudaMemcpy(&otherDegree, &A[i * blockDim.x * N], sizeof(int), cudaMemcpyDeviceToHost);\n *maxDegree = max(*maxDegree, otherDegree);\n }\n }\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = max(*maxDegree, myDegree);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "Degree,", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(&*maxDegree, myDegree);\n}\n", + "\n\n\tint myMaxDegree = 0;\n\tint rowStart = blockIdx.x * blockDim.x + threadIdx.x;\n\tfor (int i = rowStart; i < N; i += blockDim.x * gridDim.x) {\n\t\tint degree = 0;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tdegree += A[i * N + j];\n\t\t}\n\t\tmyMaxDegree = max(myMaxDegree, degree);\n\t}\n\tatomicAdd(maxDegree, myMaxDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n myDegree++;\n }\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n int my_degree = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk_size = N / nprocs;\n int start_row = rank * chunk_size;\n int end_row = start_row + chunk_size;\n if (rank == nprocs - 1) {\n end_row = N;\n }\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_degree;\n }\n return -1;\n}\n", + "\n\n int max_degree = 0;\n int my_max_degree = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel for reduction(max : my_max_degree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_max_degree += A[i * N + j];\n }\n }\n\n MPI_Allreduce(&my_max_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return max_degree;\n } else {\n return -1;\n }\n}\n", + "\n\n int maxDegree = 0;\n int localMaxDegree = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel for reduction(max : localMaxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n localMaxDegree += A[i * N + j];\n }\n }\n\n MPI_Allreduce(&localMaxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int local_max_degree = 0;\n int global_max_degree = 0;\n\n // Compute the local maximum degree\n #pragma omp parallel for reduction(max : local_max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n local_max_degree = std::max(local_max_degree, degree);\n }\n\n // Reduce the local maximum degrees to find the global maximum degree\n MPI_Allreduce(&local_max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max_degree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n\n #pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, my_degree);\n my_degree = 0;\n }\n\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int localMaxDegree = 0;\n #pragma omp parallel for reduction(max:localMaxDegree)\n for (size_t i = 0; i < N; i++) {\n 
int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n localMaxDegree = std::max(localMaxDegree, degree);\n }\n MPI_Reduce(&localMaxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int myDegree = 0;\n #pragma omp parallel for reduction(max:myDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n myDegree = std::max(myDegree, degree);\n }\n\n int globalDegree;\n MPI_Reduce(&myDegree, &globalDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return globalDegree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n #pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int local_max = 0;\n for (int i = 0; i < N; i++) {\n int local_count = 0;\n for (int j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n local_max = std::max(local_max, local_count);\n }\n\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n #pragma omp parallel for reduction(max: maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n #pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, my_degree);\n my_degree = 0;\n }\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n MPI_Reduce(&maxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n int max_degree = 
0;\n #pragma opm parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int num_threads;\n omp_set_num_threads(4);\n #pragma omp parallel\n {\n #pragma omp for reduction(+:my_degree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n }\n MPI_Reduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(Kokkos::View &A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n visited[i] = 0;\n });\n\n dist[source] = 0;\n visited[source] = 1;\n\n int minDist = INT_MAX;\n\n while (visited[dest] == 0) {\n int minDistNode = -1;\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0 && (minDistNode == -1 || dist[i] < dist[minDistNode])) {\n minDistNode = i;\n }\n }\n\n if (minDistNode == -1) {\n break;\n }\n\n visited[minDistNode] = 1;\n\n for (int i = 0; i < N; i++) {\n if (A(minDistNode, i) == 1 && visited[i] == 0) {\n if (dist[i] == -1 || dist[i] > dist[minDistNode] + 1) {\n dist[i] = dist[minDistNode] + 1;\n }\n }\n }\n }\n\n if (dist[dest] == -1) {\n return -1;\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n Kokkos::View visited(\"visited\", N);\n Kokkos::parallel_for(\"bfs\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n visited[i] = false;\n });\n visited[source] = true;\n Kokkos::View queue(\"queue\", N);\n int head = 0;\n int tail = 0;\n queue[tail++] = source;\n while (head < tail) {\n int u = queue[head++];\n for (int v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n queue[tail++] = v;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { distances[i] = std::numeric_limits::max(); });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int current = q.front();\n q.pop();\n\n // Iterate over all neighbors of the current node\n for (size_t i = 0; i < N; i++) {\n if (A(current, i) == 1 && distances[i] == std::numeric_limits::max()) {\n distances[i] = distances[current] + 1;\n q.push(i);\n }\n }\n }\n\n // Return the distance to the destination\n return distances[dest];\n}\n", + "\n\n 
Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n dist() = -1;\n dist(source) = 0;\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n visited(source) = true;\n Kokkos::View queue(\"queue\", N);\n queue() = -1;\n int head = 0;\n int tail = 0;\n queue(tail++) = source;\n while (head != tail) {\n int u = queue(head++);\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited(v)) {\n dist(v) = dist(u) + 1;\n visited(v) = true;\n queue(tail++) = v;\n }\n }\n }\n return dist(dest);\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) { distances[i] = INT_MAX; });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Queue of nodes to visit\n std::queue q;\n q.push(source);\n\n // While there are nodes to visit\n while (!q.empty()) {\n // Get the current node\n int current = q.front();\n q.pop();\n\n // If the current node is the destination, return the distance\n if (current == dest) {\n return distances[current];\n }\n\n // Visit all neighbors of the current node\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A(current, i) == 1) {\n // If the distance to the neighbor is greater than the distance to the current node plus 1\n if (distances[i] > distances[current] + 1) {\n // Update the distance to the neighbor\n distances[i] = distances[current] + 1;\n // Add the neighbor to the queue\n q.push(i);\n }\n }\n }\n }\n\n // If there is no path from the source to the destination, return -1\n return -1;\n}\n", + "\n\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = Kokkos::TeamMember;\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::View distance(\"distance\", N);\n Kokkos::View queue(\"queue\", N);\n\n visited[source] = 1;\n distance[source] = 0;\n queue[0] = source;\n\n int head = 0;\n int tail = 1;\n\n while (head < tail) {\n int u = queue[head++];\n for (int v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n visited[v] = 1;\n distance[v] = distance[u] + 1;\n queue[tail++] = v;\n }\n }\n }\n\n return distance[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) { distances[i] = std::numeric_limits::max(); });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Initialize the queue with the source\n std::queue q;\n q.push(source);\n\n // While the queue is not empty\n while (!q.empty()) {\n // Get the current vertex\n int current = q.front();\n q.pop();\n\n // If the current vertex is the destination, return the distance\n if (current == dest) {\n return distances[current];\n }\n\n // For each neighbor of the current vertex\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current vertex and the neighbor\n if (A(current, i) == 1) {\n // If the distance to the neighbor is greater than the distance to the current vertex plus 1\n if (distances[i] > distances[current] + 1) {\n // Update the distance to the neighbor\n distances[i] = distances[current] + 1;\n\n // Add the neighbor to the queue\n 
q.push(i);\n }\n }\n }\n }\n\n // If the destination is not reachable, return -1\n return -1;\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n size_t head = 0;\n size_t tail = 0;\n queue[tail++] = source;\n\n while (head != tail) {\n int u = queue[head++];\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n queue[tail++] = v;\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist(i) = -1;\n visited(i) = false;\n queue(i) = false;\n });\n\n dist(source) = 0;\n queue(source) = true;\n\n while (true) {\n int min_dist = -1;\n int min_idx = -1;\n for (int i = 0; i < N; i++) {\n if (queue(i) && (min_dist == -1 || dist(i) < min_dist)) {\n min_dist = dist(i);\n min_idx = i;\n }\n }\n\n if (min_idx == -1) {\n break;\n }\n\n queue(min_idx) = false;\n visited(min_idx) = true;\n\n for (int i = 0; i < N; i++) {\n if (A(min_idx, i) > 0 && !visited(i)) {\n if (dist(i) == -1 || dist(i) > dist(min_idx) + 1) {\n dist(i) = dist(min_idx) + 1;\n queue(i) = true;\n }\n }\n }\n }\n\n return dist(dest);\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t i = 0; i < N; i++) {\n if (A(u, i) && dist[i] == -1) {\n dist[i] = dist[u] + 1;\n q.push(i);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View distance = Kokkos::View(\"distance\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) { distance[i] = -1; });\n distance[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && distance[v] == -1) {\n distance[v] = distance[u] + 1;\n q.push(v);\n }\n }\n }\n return distance[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = INT_MAX;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n q.push(v);\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && dist[v] == -1) {\n 
dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n Kokkos::View queue_head(\"queue_head\", 1);\n Kokkos::View queue_tail(\"queue_tail\", 1);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n queue_head[0] = 0;\n queue_tail[0] = 1;\n queue[0] = source;\n\n while (queue_head[0] < queue_tail[0]) {\n int u = queue[queue_head[0]];\n queue_head[0]++;\n for (int v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n visited[v] = true;\n dist[v] = dist[u] + 1;\n queue[queue_tail[0]] = v;\n queue_tail[0]++;\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n dist() = -1;\n dist(source) = 0;\n\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n visited(source) = true;\n\n Kokkos::View queue(\"queue\", N);\n queue() = -1;\n int head = 0;\n int tail = 0;\n queue(tail++) = source;\n\n while (head != tail) {\n int u = queue(head++);\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && !visited(v)) {\n dist(v) = dist(u) + 1;\n visited(v) = true;\n queue(tail++) = v;\n }\n }\n }\n\n return dist(dest);\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n distances() = std::numeric_limits::max();\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n distances(source) = 0;\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t i = 0; i < N; i++) {\n if (A(u, i) == 1 && distances(i) == std::numeric_limits::max()) {\n q.push(i);\n distances(i) = distances(u) + 1;\n }\n }\n }\n\n // Return distance to destination\n return distances(dest);\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n distances() = std::numeric_limits::max();\n\n // Initialize queue with source node\n std::queue queue;\n queue.push(source);\n distances(source) = 0;\n\n // Perform BFS\n while (!queue.empty()) {\n int current = queue.front();\n queue.pop();\n\n // Iterate over neighbors of current node\n for (size_t i = 0; i < N; i++) {\n if (A(current, i) == 1 && distances(i) == std::numeric_limits::max()) {\n distances(i) = distances(current) + 1;\n queue.push(i);\n }\n }\n }\n\n // Return distance to destination\n return distances(dest);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n std::vector dist(N, -1);\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[u] + 1 < dist[v]) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] 
+ 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] > dist[u] + 1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n 
while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize the distance to all nodes to infinity\n std::vector distance(N, INT_MAX);\n\n // Set the distance to the source node to 0\n distance[source] = 0;\n\n // Create a queue to store the nodes to be visited\n std::queue queue;\n\n // Enqueue the source node\n queue.push(source);\n\n // While there are still nodes to be visited\n while (!queue.empty()) {\n // Dequeue the next node\n int current = queue.front();\n queue.pop();\n\n // Visit all the neighbors of the current node\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A[current][i] > 0) {\n // If the distance to the neighbor is greater than the distance to the current node plus the weight of the edge\n if (distance[i] > distance[current] + A[current][i]) {\n // Update the distance to the neighbor\n distance[i] = distance[current] + A[current][i];\n\n // Enqueue the neighbor\n queue.push(i);\n }\n }\n }\n }\n\n // Return the distance to the destination node\n return distance[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int dist[N];\n std::fill(dist, dist + N, -1);\n dist[source] = 0;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n if (u == dest) 
{\n return dist[u];\n }\n\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00773967951536, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01934999935329, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01939830575138, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01936084609479, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01936767827719, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0193960102275, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01932603027672, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01947210673243, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int min_dist = INT_MAX;\n std::vector dist(N, INT_MAX);\n std::vector visited(N, false);\n\n dist[source] = 0;\n visited[source] = true;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n if (A[source][i] == 1) {\n dist[i] = 1;\n visited[i] = true;\n }\n }\n\n #pragma omp for\n for (size_t k = 0; k < N; k++) {\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i][k] == 1 && A[k][j] == 1 && dist[i] != INT_MAX && dist[j] > dist[i] + 1) {\n dist[j] = dist[i] + 1;\n visited[j] = true;\n }\n }\n }\n }\n }\n\n min_dist = dist[dest];\n return min_dist == INT_MAX ? -1 : min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // 
Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "serial", + "prompt": "/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, 0);\n std::queue> q;\n q.push({source, 0});\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n if (visited[node] == 1) {\n continue;\n }\n visited[node] = 1;\n for (size_t i = 0; i < N; i++) {\n if (A[node][i] == 1) {\n q.push({i, dist + 1});\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n visited[source] = true;\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n for (int i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n visited[i] = true;\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n visited[source] = true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n dist[v] = dist[u] + 1;\n 
visited[v] = true;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) return dist;\n if (visited[node]) continue;\n visited[node] = true;\n for (size_t i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n if (v == dest) {\n return dist[v];\n }\n }\n }\n }\n return -1;\n}\n", + "=", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n q.push(v);\n dist[v] = dist[u] + 1;\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == -1) {\n q.push(v);\n dist[v] = dist[u] + 1;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push(std::make_pair(source, 0));\n while (!q.empty()) {\n std::pair p = q.front();\n q.pop();\n if (p.first == dest) {\n return p.second;\n }\n if (visited[p.first]) {\n continue;\n }\n visited[p.first] = true;\n for (size_t i = 0; i < N; ++i) {\n if (A[p.first][i] == 1) {\n q.push(std::make_pair(i, p.second + 1));\n }\n }\n }\n return -1;\n}\n", + "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n visited[source] = true;\n\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n for (int i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n visited[i] = true;\n }\n }\n }\n\n return -1;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "hip", + "prompt": "/* Compute the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Store the result in pathLength.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\n__global__ void shortestPathLength(const int *A, size_t N, int source, int dest, int *pathLength) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\n if (tid == source) {\n pathLength[bid] = 0;\n } else {\n pathLength[bid] = -1;\n }\n\n __syncthreads();\n\n for (int i = 0; i < N; i++) {\n if (pathLength[bid] >= 0 && A[tid * N + i] == 1) {\n if (i == dest) {\n pathLength[bid] = pathLength[bid] + 1;\n break;\n } else if (pathLength[i * N + dest] == -1 ||\n pathLength[i * N + dest] > pathLength[bid] + 1) {\n pathLength[i * N + dest] = pathLength[bid] + 1;\n }\n }\n }\n}\n", + "\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint distance = INT_MAX;\n\tif (tid == source) {\n\t\tdistance = 0;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[tid * N + k] == 1) {\n\t\t\tif (distance + 1 < pathLength[k]) {\n\t\t\t\tpathLength[k] = distance + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n\tif (tid == dest) {\n\t\t*pathLength = distance;\n\t}\n}\n", + "\n\n int n = hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x;\n int m = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x;\n if (n == source) {\n pathLength[n] = 0;\n __syncthreads();\n for (int i = 0; i < N; ++i) {\n if (A[n * N + i] == 1) {\n pathLength[i] = 1;\n }\n }\n } else {\n __syncthreads();\n for (int i = 0; i < N; ++i) {\n if (A[n * N + i] == 1) {\n if (pathLength[i] != -1) {\n pathLength[n] = pathLength[i] + 1;\n }\n }\n }\n }\n __syncthreads();\n if (n == dest) {\n *pathLength = pathLength[n];\n }\n}\n", + "\n\n int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int k = 0; k < N; k++) {\n if (A[tid * N + k] == 1 && tid != k) {\n if (pathLength[tid] == -1 || pathLength[k] + 1 < pathLength[tid]) {\n pathLength[tid] = pathLength[k] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n if (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n pathLength[i] = pathLength[tid] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n if (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n pathLength[i] = pathLength[tid] + 1;\n }\n }\n __syncthreads();\n }\n}\n", 
+ "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == source) {\n pathLength[bid] = 0;\n } else {\n pathLength[bid] = -1;\n }\n __syncthreads();\n int minPath = pathLength[bid];\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] && pathLength[i] != -1) {\n minPath = min(minPath, pathLength[i] + 1);\n }\n }\n pathLength[bid] = minPath;\n __syncthreads();\n if (tid == dest) {\n *pathLength = minPath;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tif (tid == source && bid == 0) {\n\t\t*pathLength = 0;\n\t} else if (tid == dest && bid == N - 1) {\n\t\t*pathLength = -1;\n\t} else {\n\t\t*pathLength = INT_MAX;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tif (bid + 1 < N) {\n\t\t\t\tint next = i + (bid + 1) * N;\n\t\t\t\tatomicMin(&pathLength[next], *pathLength + 1);\n\t\t\t}\n\t\t\tif (bid - 1 >= 0) {\n\t\t\t\tint next = i + (bid - 1) * N;\n\t\t\t\tatomicMin(&pathLength[next], *pathLength + 1);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] == 1 && tid != i) {\n if (pathLength[tid] == -1) {\n pathLength[tid] = pathLength[i] + 1;\n } else {\n pathLength[tid] = min(pathLength[tid], pathLength[i] + 1);\n }\n }\n }\n __syncthreads();\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int i = tid / N;\n int j = tid % N;\n if (i == source) {\n pathLength[j] = 1;\n } else {\n pathLength[j] = -1;\n }\n __syncthreads();\n for (int k = 0; k < N; ++k) {\n if (pathLength[k] == -1) {\n continue;\n }\n if (A[i * N + k] == 1) {\n if (pathLength[j] == -1 || pathLength[j] > pathLength[k] + 1) {\n pathLength[j] = pathLength[k] + 1;\n }\n }\n }\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tridy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tint i = tidx;\n\tint j = tridy;\n\tif (i == source) {\n\t\tpathLength[i] = 0;\n\t} else {\n\t\tpathLength[i] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (tid == source && bid == dest) {\n *pathLength = 0;\n } else if (tid == dest && bid == source) {\n 
*pathLength = 0;\n } else if (A[bid * N + tid] == 1) {\n *pathLength = 1;\n } else {\n *pathLength = -1;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int distance = INT_MAX;\n if (tid == source) {\n distance = 0;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] && distance + 1 < pathLength[i]) {\n pathLength[i] = distance + 1;\n }\n }\n __syncthreads();\n if (tid == dest) {\n *pathLength = distance;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int distance = INT_MAX;\n if (tid == source) {\n distance = 0;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] == 1 && tid != i) {\n if (distance + 1 < distance) {\n distance = distance + 1;\n }\n }\n __syncthreads();\n }\n if (tid == dest) {\n *pathLength = distance;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "= Synagoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesogues Synag Synag Synag Synagoguesoguesoguesoguesogues", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The shortest path length is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform relaxation\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector gathered_dist(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, gathered_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, gathered_dist[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return shortest path length on rank 0\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 
1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length from source to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distance to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // If we have not reached the destination, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // If u is the destination, return the distance\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // If no path exists, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int dist[N];\n for (int i = 0; i < N; i++) {\n dist[i] = INT_MAX;\n }\n dist[source] = 0;\n\n int next[N];\n for (int i = 0; i < N; i++) {\n next[i] = -1;\n }\n\n int visited[N];\n for (int i = 0; i < N; i++) {\n visited[i] = false;\n }\n\n int u;\n while (true) {\n u = -1;\n for (int i = 0; i < N; i++) {\n if (!visited[i] && (u == -1 || dist[i] < dist[u])) {\n u = i;\n }\n }\n if (u == -1 || dist[u] == INT_MAX) {\n break;\n }\n visited[u] = true;\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] != 0 && dist[u] + 1 < dist[v]) {\n dist[v] = dist[u] + 1;\n next[v] = u;\n }\n }\n }\n\n if (rank == 0) {\n return dist[dest];\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n 
while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == std::numeric_limits::max()) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * num_ranks);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (my_rank == 0) {\n for (int i = 0; i < N * num_ranks; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector dist(N, -1);\n dist[source] = 0;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int curr = q.front();\n q.pop();\n\n if (curr == dest) {\n return dist[curr];\n }\n\n for (int i = 0; i < N; i++) {\n if (A[curr][i] == 1 && dist[i] == -1) {\n dist[i] = dist[curr] + 1;\n q.push(i);\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) 
{\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return shortest path length if destination is reachable\n if (dist[dest] != INT_MAX) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Iterate over all vertices\n for (int i = 0; i < N; ++i) {\n // Each rank computes the shortest path from source to all other vertices\n for (int j = 0; j < N; ++j) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n\n // Gather the distances from all ranks\n std::vector all_dist(N * num_ranks);\n MPI_Allgather(dist.data(), N, MPI_INT, all_dist.data(), N, MPI_INT, MPI_COMM_WORLD);\n\n // Update the distances on each rank\n for (int j = 0; j < N; ++j) {\n for (int k = 0; k < num_ranks; ++k) {\n if (all_dist[k * N + j] < dist[j]) {\n dist[j] = all_dist[k * N + j];\n }\n }\n }\n }\n\n // Return the shortest path length on rank 0\n if (my_rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == std::numeric_limits::max()) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "cuda", + "prompt": "/* Compute the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Store the result in pathLength.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\n__global__ void shortestPathLength(const int *A, size_t N, int source, int dest, int *pathLength) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\n\tint count = 0;\n\twhile (pathLength[dest] == -1) {\n\t\t__syncthreads();\n\t\tif (pathLength[tid] == count) {\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tif (A[tid * N + i] && pathLength[i] == -1) {\n\t\t\t\t\tpathLength[i] = count + 1;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tcount++;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tif (pathLength[j] == -1) {\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] && pathLength[k] != -1) {\n\t\t\t\tpathLength[j] = pathLength[k] + 1;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "_t = 0, 1, 2, 3", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\n\tif (i == source && j == dest) {\n\t\t*pathLength = 0;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 1;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = (A[i * N + j] == 1) ? 
1 : INT_MAX;\n\t}\n\telse {\n\t\tpathLength[j] = INT_MAX;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t}\n\telse if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t}\n\telse {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] != 0) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid_row == source) {\n\t\tpathLength[tid_col] = 1;\n\t} else {\n\t\tpathLength[tid_col] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid_row] > 0 && A[tid_row * N + i] > 0) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid_row] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_row * N + i] == 1 && tid_row != tid_col) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid_row] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1) {\n\t\t\tif (A[tid * N + i] == 1 && pathLength[i] == -1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = (A[i * N + j] == 1) ? 
1 : INT_MAX;\n\t} else {\n\t\tpathLength[j] = INT_MAX;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] != INT_MAX && A[k * N + j] == 1) {\n\t\t\tpathLength[j] = min(pathLength[j], pathLength[k] + 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\n\t__syncthreads();\n\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_row * N + i] == 1 && pathLength[tid_row] != -1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid_row] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\n\t__syncthreads();\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint distance = 0;\n\tif (tid == source) {\n\t\tdistance = 0;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tif (distance + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = distance + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == dest) {\n\t\t*pathLength = distance;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] == -1) {\n\t\t\tcontinue;\n\t\t}\n\t\tif (A[k * N + j] == 1) {\n\t\t\tif (pathLength[j] == -1 || pathLength[j] > pathLength[k] + 1) {\n\t\t\t\tpathLength[j] = pathLength[k] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid_row == source) {\n\t\tpathLength[tid_col] = 1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[tid_row * N + k] && pathLength[tid_row] > 0) {\n\t\t\tpathLength[tid_col] = min(pathLength[tid_col], pathLength[tid_row] + 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tidx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tidy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (tidx == source && tidx == tidy) {\n\t\tpathLength[tidx] = 0;\n\t} else if (tidx == tidy) {\n\t\tpathLength[tidx] = -1;\n\t} else if (A[tidx * N + tidy] == 1) {\n\t\tpathLength[tidx] = 1;\n\t} else {\n\t\tpathLength[tidx] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[tidx] == -1 || pathLength[tidy] == -1) {\n\t\t\tcontinue;\n\t\t}\n\t\tif (pathLength[tidx] + pathLength[tidy] + 1 < pathLength[k]) {\n\t\t\tpathLength[k] = pathLength[tidx] + pathLength[tidy] + 1;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = A[i * N + j];\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tint minPath = pathLength[j];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] >= 0 && A[j * N + k] > 0) {\n\t\t\tint newPath = pathLength[k] + A[j * N + k];\n\t\t\tif (newPath < minPath) {\n\t\t\t\tminPath = newPath;\n\t\t\t}\n\t\t}\n\t}\n\tpathLength[j] = minPath;\n\t__syncthreads();\n\tif (j == dest) {\n\t\t*pathLength = pathLength[j];\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in 
row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The shortest path length is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the distance array to all infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize the queue with the source node\n std::queue q;\n q.push(source);\n\n // Perform a breadth-first search to find the shortest path\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to the destination\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Use OpenMP for parallel BFS\n #pragma omp parallel\n {\n while (true) {\n bool updated = false;\n\n // Relax all edges in parallel\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n updated = true;\n }\n }\n }\n\n // Check if any rank has updated distances\n int global_updated;\n MPI_Allreduce(&updated, &global_updated, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n // If no rank has updated distances, break\n if (!global_updated) {\n break;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n 
#pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length on rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector recv_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &recv_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return recv_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Return the shortest path length\n return shortest_path_length;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n for (int i = tid; i < N; i += num_threads) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Use OpenMP for parallelization within each rank\n #pragma omp parallel\n {\n // Perform BFS using a queue\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // For each neighbor of u\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n }\n\n // Gather the distances from all ranks to rank 0\n std::vector all_dist(N * MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Allgather(&dist[0], N, MPI::INT, &all_dist[0], N, MPI::INT);\n\n // Find the minimum distance to dest\n int min_dist = INT_MAX;\n for (int i = 0; i < N * MPI::COMM_WORLD.Get_size(); ++i) {\n if (all_dist[i] != INT_MAX && i / N == dest) {\n min_dist = std::min(min_dist, all_dist[i]);\n }\n }\n\n // Return the minimum distance\n return min_dist;\n}\n", + "\n\n // Initialize the distance to the source to 0 and all other distances to infinity\n std::vector distance(N, std::numeric_limits::max());\n distance[source] = 0;\n\n // Initialize the queue with the source\n std::queue queue;\n queue.push(source);\n\n // While the queue is not empty\n while (!queue.empty()) {\n // Dequeue the next node\n int current = queue.front();\n queue.pop();\n\n // If the current node is the destination, return the distance\n if (current == dest) {\n return distance[current];\n }\n\n // For each neighbor of the current node\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A[current * N + i] == 1) {\n // If the distance to the neighbor is greater than the distance to the current node plus 1\n if (distance[i] > distance[current] + 1) {\n // Set the distance to the neighbor to the distance to the current node plus 1\n distance[i] = distance[current] + 1;\n // Enqueue the neighbor\n queue.push(i);\n }\n }\n }\n }\n\n // If the destination is not reachable, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n 
std::vector global_dist(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i++) {\n if (i % N == dest) {\n shortest_path_length = std::min(shortest_path_length, global_dist[i]);\n }\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n for (int i = tid; i < N; i += num_threads) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances from all ranks to rank 0\n std::vector global_dist(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length on rank 0\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, global_dist[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int curr = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (curr == dest) {\n return dist[curr];\n }\n\n // Relax neighbors\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n if (A[curr * N + i] && dist[i] > dist[curr] + 1) {\n dist[i] = dist[curr] + 1;\n q.push(i);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n // Initialize the distance array.\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation.\n for (int i = 0; i < N - 1; i++) {\n#pragma omp parallel for\n for (int j = 0; j < N; j++) {\n for (int k = 0; k < N; k++) {\n if (A[j * N + k] != 0 && dist[j] + A[j * N + 
k] < dist[k]) {\n dist[k] = dist[j] + A[j * N + k];\n }\n }\n }\n }\n\n // Check for negative-weight cycles.\n for (int i = 0; i < N - 1; i++) {\n#pragma omp parallel for\n for (int j = 0; j < N; j++) {\n for (int k = 0; k < N; k++) {\n if (A[j * N + k] != 0 && dist[j] + A[j * N + k] < dist[k]) {\n return -1;\n }\n }\n }\n }\n\n // Return the shortest path length.\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n if (A[source][i] == 1) {\n dist[i] = 1;\n }\n }\n\n #pragma omp barrier\n\n for (int j = 1; j < N; j++) {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n if (dist[i] == j) {\n #pragma omp for\n for (int k = 0; k < N; k++) {\n if (A[i][k] == 1 && dist[k] > j + 1) {\n dist[k] = j + 1;\n }\n }\n }\n }\n\n #pragma omp barrier\n }\n }\n\n // Gather distances on rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length on rank 0\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n int min_odd = INT_MAX;\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n\n return min_odd;\n}\n", + "&", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n },\n min_odd);\n return min_odd;\n}\n", + "\n\n int result = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& result) {\n if (x(i) % 2 == 1 && x(i) < result) {\n result = x(i);\n }\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n Kokkos::View y(\"y\", x.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n y(i) = (x(i) % 2 == 1) ? 
x(i) : std::numeric_limits::max();\n });\n int min = std::numeric_limits::max();\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, int& min) {\n min = std::min(min, y(i));\n });\n return min;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int n = x.extent(0);\n View> y(\"y\", n);\n\n parallel_for(\"smallestOdd\", n, KOKKOS_LAMBDA(const int i) {\n if (i % 2 == 1) {\n y(i) = x(i);\n } else {\n y(i) = std::numeric_limits::max();\n }\n });\n\n int min_odd = std::numeric_limits::max();\n for (int i = 0; i < n; ++i) {\n min_odd = std::min(min_odd, y(i));\n }\n\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n\n int min_odd = INT_MAX;\n\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n\n return min_odd;\n}\n", + "\n\n int min_odd = Kokkos::parallel_reduce(Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1) {\n min_odd = std::min(min_odd, x(i));\n }\n }, min_odd);\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto min_odd = INT_MAX;\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int smallest = Kokkos::Max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& smallest) {\n if (x(i) % 2 == 1 && x(i) < smallest) {\n smallest = x(i);\n }\n });\n return smallest;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Policy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) { min_odd = x(i); }\n }, min_odd);\n return min_odd;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Policy = Kokkos::RangePolicy;\n\n int smallest = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(Policy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& smallest) {\n if (x(i) % 2 == 1 && x(i) < smallest) {\n smallest = x(i);\n }\n });\n\n return smallest;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n int min_odd = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(\n policy_t(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n },\n min_odd);\n\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + 
"do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.08245674762875, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08253907244653, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04149597641081, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02078810390085, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01075964905322, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005234737508, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00285157430917, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00312397107482, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min:min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n min_odd = std::min(min_odd, x[i]);\n }\n }\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.08217712510377, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08339495323598, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04125677384436, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02111003380269, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01076206956059, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00545285008848, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00288807880133, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00325432382524, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.08226508069783, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08325901348144, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.041416747123, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02092622984201, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01073780599982, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00541283860803, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00290675647557, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00303464308381, + "num_threads": 64 
+ } + ] + }, + { + "generated_output": "\n\n int smallest = INT_MAX;\n#pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.08212518952787, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08300320990384, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04141468573362, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0208316758275, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01055022068322, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00549575258046, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00274150352925, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00321766156703, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.08177555389702, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08274895362556, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04133278504014, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0207355748862, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01070779729635, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00545504800975, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00291950870305, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00325661618263, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.08173109572381, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08228021953255, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04122479408979, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02073770686984, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01056593861431, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00553480833769, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00280427690595, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00331280194223, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n 
int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.08193993903697, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08306590393186, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04159296508878, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02083916328847, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01069872379303, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00532216560096, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00294425468892, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00345393996686, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int smallest = INT_MAX;\n #pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.08209414295852, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08262513019145, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04158773329109, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02059330306947, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010537612997, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00537928380072, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00285639744252, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00327100176364, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int minOdd = INT_MAX;\n #pragma omp parallel for reduction(min:minOdd)\n for (int i = 0; i < static_cast(x.size()); ++i) {\n if (x[i] % 2 != 0 && x[i] < minOdd) {\n minOdd = x[i];\n }\n }\n return minOdd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.08198929391801, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.07214931193739, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03598860818893, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01786279976368, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00909967888147, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00490394458175, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00256569832563, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00301832985133, + "num_threads": 64 + } + ] + }, + { + "generated_output": ": 1", + "source_write_success": true, 
+ "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.08211368583143, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08308558929712, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04134155791253, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0208712041378, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01061800066382, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00537951737642, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00293504185975, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00326102916151, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n min_odd = std::min(min_odd, x[i]);\n }\n }\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.08190676234663, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0835298711434, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04190575554967, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02103046420962, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01069491319358, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00533628482372, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00280912294984, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00319805033505, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int min_odd = INT_MAX;\n\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.08178833350539, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08292167037725, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04176552779973, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02092892713845, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01059592124075, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00541854333133, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, 
+ "runtime": 0.00295936521143, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00338533464819, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.08177149742842, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08321365434676, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04145168717951, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02087582089007, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01069049071521, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00535374153405, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00287067592144, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033208090812, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int smallest = INT_MAX;\n#pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.08200612738729, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0829158719629, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04155776463449, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02091241814196, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0105312647298, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00532670989633, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00291603188962, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00341959968209, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0822194211185, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08325182404369, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04176569543779, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02086600866169, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0107768734917, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00546544119716, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00291338469833, + 
"num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00315380915999, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int smallest = INT_MAX;\n #pragma omp parallel for reduction(min:smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.08216379005462, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08311325293034, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04131075385958, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02088322490454, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0107838684693, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00551930498332, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00297348983586, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00322886034846, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.08176616355777, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08281450457871, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04145141262561, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02089007776231, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01036348361522, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00543050169945, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00283414721489, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00323242098093, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.08166091032326, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08278448246419, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04155702702701, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02091894354671, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01038615237921, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00540695972741, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029287673533, + "num_threads": 32 + }, + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.00318074468523, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min:min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.08199425712228, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08329165652394, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04150945600122, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0209117166698, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01052715759724, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00529514271766, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00298111010343, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00320375729352, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "serial", + "prompt": "/* Return the value of the smallest odd number in the vector x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 != 0) {\n return x[i];\n }\n }\n return -1;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = x[0];\n for (int i = 1; i < 
x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest_odd) {\n smallest_odd = x[i];\n }\n }\n return smallest_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "smallest&", + "\n\n int smallest = -1;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && (smallest == -1 || x[i] < smallest)) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "hip", + "prompt": "/* Find the value of the smallest odd number in the vector x. Store it in smallest.\n Use AMD HIP to compute in parallel. The kernel is launched with the same number of threads as elements in x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\n__global__ void smallestOdd(const int *x, size_t N, int *smallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 
&& x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "(", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n __syncthreads();\n if (smallest_local % 2 == 0) {\n return;\n }\n if (smallest_local < *smallest) {\n atomicMin(smallest, smallest_local);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n\n // Find the smallest odd number in the local workgroup\n __shared__ int smallest_shared;\n 
for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n // Reduce the local smallest values to a single global smallest value\n __syncthreads();\n if (tid == 0) {\n smallest_shared = smallest_local;\n for (int i = 1; i < hipBlockDim_x; i++) {\n if (smallest_shared > smallest_local) {\n smallest_shared = smallest_local;\n }\n }\n *smallest = smallest_shared;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = std::numeric_limits::max();\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < my_smallest) {\n my_smallest = x[i];\n }\n }\n\n int smallest;\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return 
global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (auto v : x) {\n if (v % 2 == 1) {\n my_smallest = std::min(my_smallest, v);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int min_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int n = x.size();\n int smallest = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "cuda", + "prompt": "/* Find the value of the smallest odd number in the vector x. Store it in smallest.\n Use CUDA to compute in parallel. 
The kernel is launched with the same number of threads as elements in x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\n__global__ void smallestOdd(const int *x, size_t N, int *smallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (threadIdx.x == 0) {\n *smallest = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "(", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (threadIdx.x == 0) {\n *smallest = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n atomicMin(smallest, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = start + block_size;\n int min = INT_MAX;\n for (int i = start; i < end && i < N; i++) {\n if (x[i] % 2 == 1 && x[i] < min) {\n min = x[i];\n }\n }\n smallest[tid] = min;\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < block_size; i++) {\n if (smallest[i] < smallest[0]) {\n smallest[0] = smallest[i];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] < smallest_local && x[i] % 2) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n if (tid == 0 || x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = block.thread.x;\n if (tid >= N) {\n return;\n }\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 
2 == 1) {\n atomicMin(smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1 && (x[tid] < *smallest || *smallest == -1)) {\n *smallest = x[tid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n #pragma omp parallel for reduction(min: my_smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n\n int smallest;\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on this rank\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int result = INT_MAX;\n #pragma omp parallel for reduction(min:result)\n 
for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < result) {\n result = x[i];\n }\n }\n return result;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel\n {\n int local_smallest = INT_MAX;\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_smallest) {\n local_smallest = x[i];\n }\n }\n #pragma omp critical\n if (local_smallest < smallest) {\n smallest = local_smallest;\n }\n }\n return smallest;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on this rank\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Reduce to find the global minimum\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n // Find the smallest odd number on each rank\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int smallest_odd = INT_MAX;\n #pragma omp parallel for reduction(min: smallest_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest_odd) {\n smallest_odd = x[i];\n }\n }\n return smallest_odd;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on each rank\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n #pragma omp parallel for reduction(min:my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = std::numeric_limits::max();\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 != 0) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // 
Find the smallest odd number on this rank\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min : local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n #pragma omp parallel for reduction(min: my_smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n int my_smallest = INT_MAX;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n #pragma omp parallel for reduction(min:my_smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return smallest;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use Kokkos to compute product in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n double product = 1.0;\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (i % 2 == 0) {\n product *= x(i);\n } else {\n product /= x(i);\n }\n });\n return product;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n\n double product = 1.0;\n\n parallel_reduce(\n RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n\n return product;\n}\n", + "\n\n // Create a Kokkos view for the output\n Kokkos::View product = Kokkos::View(1);\n\n // Initialize the product to 1\n Kokkos::parallel_for(\"initialize_product\", Kokkos::RangePolicy(0, 1), KOKKOS_LAMBDA(const int& i) {\n product[i] = 1.0;\n });\n\n // Compute the product in parallel\n Kokkos::parallel_for(\"compute_product\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n if (i % 2 == 1) {\n product[0] *= 1.0 / x[i];\n } else {\n product[0] *= x[i];\n }\n });\n\n // Return the product\n return product[0];\n}\n", + "\n\n using namespace Kokkos;\n\n double product = 1.0;\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double product = 1.0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& result) {\n if (i % 2 == 1) {\n result *= 1.0 / x[i];\n } else {\n result *= x[i];\n }\n });\n\n return result;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, 
double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n },\n product);\n return product;\n}\n", + "\n\n double result = 1.0;\n for (int i = 0; i < x.extent(0); i += 2) {\n result *= x(i);\n }\n for (int i = 1; i < x.extent(0); i += 2) {\n result /= x(i);\n }\n return result;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x(i);\n } else {\n product *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n return product;\n}\n", + "_", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n },\n product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x(i);\n } else {\n product *= x(i);\n }\n }, product);\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. 
x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use OpenMP to compute product in parallel.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03060380425304, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00826232917607, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00434889812022, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00285664778203, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027999414131, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00199331156909, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019541233778, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00337195191532, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0304581137374, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00816559512168, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00434132814407, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00296299643815, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028083588928, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00204594004899, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00195832308382, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00319498684257, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0317016100511, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00828822553158, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00437586884946, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028532974422, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00280195903033, + "num_threads": 8 + }, + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.002034143731, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00198194161057, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00304326340556, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03183747734874, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00815231204033, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00437210574746, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00280089490116, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00279182605445, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00203782767057, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00197215359658, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031256005168, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n int n = x.size();\n #pragma omp parallel for reduction(*: product)\n for (int i = 0; i < n; i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03177660312504, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00843609850854, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00482867918909, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00303812753409, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00284285414964, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00202803462744, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00201213825494, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00317914690822, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03030845634639, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00836245343089, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00438571143895, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00281671900302, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00278589315712, + "num_threads": 8 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.00201009698212, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019925698638, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00315774846822, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*: product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03308180198073, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02074348181486, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01036230884492, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00529561843723, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00312250014395, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00215245690197, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00204433761537, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00321823433042, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double product = 1;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03192435912788, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00832166299224, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00433223266155, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00282208919525, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00281415060163, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00199147574604, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00200049467385, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00333986133337, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0301083702594, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00822003949434, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0043685734272, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00306694190949, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00282582174987, + "num_threads": 8 + }, + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.00201194155961, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00197338238358, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00591117404401, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n int n = x.size();\n #pragma omp parallel for reduction(*:product)\n for (int i = 0; i < n; i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02978197354823, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00823867674917, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00437085740268, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00308360531926, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00282297637314, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00202793050557, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00199370551854, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00311064552516, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03124024011195, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00819997787476, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00434690844268, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00285131968558, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00279492028058, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00205128844827, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00199225116521, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00357505604625, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1.0 / x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03124002739787, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081302665174, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042895950377, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00279324408621, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00281939618289, + "num_threads": 8 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.00205161143094, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00199234969914, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00304877348244, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03195772618055, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00817231163383, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00433943066746, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00289959311485, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00280233770609, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00204537697136, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00199305284768, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00316227898002, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03198401164263, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00814506392926, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00433542840183, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00281597785652, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00283262263983, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00203039664775, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00199137739837, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00329191517085, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03229746352881, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00825246814638, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00436236150563, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00300278682262, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00281410608441, + "num_threads": 8 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.00203825999051, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00199257768691, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00324225779623, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03167409021407, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00820226799697, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00430938787758, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00281080137938, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00280446019024, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00204348117113, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00211181435734, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00311134513468, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03180936314166, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00824622884393, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00435441285372, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00281978081912, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00283899474889, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00202800091356, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00196634028107, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00302905626595, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i += 2) {\n product *= x[i] / x[i + 1];\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03202196918428, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00688618104905, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00375848505646, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00270891599357, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00278767775744, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.00202955137938, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00199708212167, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00309345945716, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03215029016137, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00824872348458, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00436407439411, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00292214956135, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00282687228173, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021930411458, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020067486912, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00302198696882, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double product = 1;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03217529524118, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00822471845895, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00430957060307, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00283654015511, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00283648185432, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00207069851458, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0019889190793, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031696991995, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "serial", + "prompt": "/* Return the product of the vector x with every odd indexed element inverted.\n i.e. 
x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i += 2) {\n product = product * x[i] / (x[i + 1] + 0.000000001);\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n for (size_t i = 1; i < x.size(); i += 2) {\n product /= x[i];\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "&", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return 
product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "hip", + "prompt": "/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Store the result in product.\n Use AMD HIP to compute product in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\n__global__ void productWithInverses(const double *x, size_t N, double *product) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 1) {\n product[0] *= 1.0 / x[tid];\n } else {\n product[0] *= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (idx % 2 == 0) {\n product[0] *= x[idx];\n } else {\n product[0] /= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n atomicAdd(product, x[gid]);\n } else {\n atomicAdd(product, 1.0 / x[gid]);\n }\n }\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n product[tid] = x[tid];\n } else {\n product[tid] = 1 / x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 0) {\n product[0] *= x[tid];\n } else {\n product[0] /= x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double prod = 1.0;\n if (gid % 2 == 0) {\n prod *= x[gid];\n } else {\n prod /= x[gid];\n }\n\n hipAtomicAdd(product, prod);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n 
atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 1) {\n product[0] *= 1.0 / x[tid];\n } else {\n product[0] *= x[tid];\n }\n }\n}\n", + ".", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 0) {\n atomicAdd(product, x[tid]);\n } else {\n atomicAdd(product, 1.0 / x[tid]);\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use MPI to compute product in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the product on all ranks.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a new vector to store the product\n std::vector product(n);\n\n // Initialize the product to 1\n product[0] = 1;\n\n // Compute the product in parallel\n for (int i = 1; i < n; i++) {\n if (i % 2 == 1) {\n product[i] = product[i - 1] * (1 / x[i]);\n } else {\n product[i] = product[i - 1] * x[i];\n }\n }\n\n // Return the product\n return product[n - 1];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD,\n MPI_COMM_WORLD);\n 
return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n 
for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Reduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 0, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "cuda", + "prompt": "/* Compute the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Store the result in product.\n Use CUDA to compute product in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\n__global__ void productWithInverses(const double *x, size_t N, double *product) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n *product *= x[gid];\n } else {\n *product /= x[gid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n product[0] *= x[gid];\n } else {\n product[0] /= x[gid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n product[0] *= x[gid];\n } else {\n product[0] /= x[gid];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n product[idx] = 1.0;\n for (int i = 0; i < N; i += 2) {\n product[idx] *= x[i] / x[i + 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n product[0] *= x[idx];\n } else {\n product[0] /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n if (i % 2 == 1) {\n product[0] *= 1.0 / x[i];\n } else {\n product[0] *= x[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int 
tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n *product *= x[gid];\n } else {\n *product /= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use MPI and OpenMP to compute product in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the product on all ranks.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*: local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n#pragma omp parallel for reduction(* : local_product)\n for (int i = start_index; i < end_index; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Reduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 0, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 
MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*: local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size();\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < local_size; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n #pragma omp parallel for reduction(* : local_product)\n for (int i = start_index; i < end_index; i++) {\n if ((i + 1) % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n double product = 1.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the product of the even indexed elements\n #pragma omp parallel for reduction(*: product)\n for (int i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n\n // Compute the product of the odd indexed elements\n double oddProduct = 1.0;\n #pragma omp parallel for reduction(*: oddProduct)\n for (int i = 1; i < x.size(); i += 2) {\n oddProduct *= 1.0 / x[i];\n }\n\n // Reduce the odd product across all ranks\n double globalOddProduct;\n MPI_Allreduce(&oddProduct, &globalOddProduct, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n // Multiply the even and odd products to get the final result\n product *= globalOddProduct;\n\n return product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for reduction(*:local_product)\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size();\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < local_size; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n double product = 1.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the product of the even indexed elements\n #pragma omp parallel for 
reduction(*:product)\n for (int i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n\n // Compute the product of the odd indexed elements\n double oddProduct = 1.0;\n #pragma omp parallel for reduction(*:oddProduct)\n for (int i = 1; i < x.size(); i += 2) {\n oddProduct *= 1.0 / x[i];\n }\n\n // Reduce the odd product across all ranks\n double globalOddProduct;\n MPI_Allreduce(&oddProduct, &globalOddProduct, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n // Multiply the even and odd products\n product *= globalOddProduct;\n\n return product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for reduction(*:local_product)\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use Kokkos to sum in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(Kokkos::View const& x, Kokkos::View const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Create a Kokkos view to store the minimum values at each index\n Kokkos::View min_values(\"min_values\", x.extent(0));\n\n // Compute the minimum values at each index in parallel\n Kokkos::parallel_for(\"min_values_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n min_values[i] = std::min(x[i], y[i]);\n });\n\n // Reduce the minimum values in parallel to get the sum\n double sum = Kokkos::parallel_reduce(\"sum_min_values_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += min_values[i];\n }, 0.0);\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Min;\n\n auto min_values = Kokkos::parallel_reduce(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, Reducer& red) {\n red.init(std::min(x(i), y(i)));\n });\n\n return min_values.get_value();\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMemberFunction = Kokkos::TeamMemberFunction;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::RangePolicy>(0, x.extent(0))),\n TeamMemberFunction([&](const int& i) -> double { return std::min(x(i), y(i)); }),\n 0.0);\n return sum;\n}\n", + "\n\n using execution_space = Kokkos::DefaultExecutionSpace;\n using l_double = Kokkos::LayoutRight;\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n\n Kokkos::View min_values(\"min_values\", x.extent(0));\n\n Kokkos::parallel_scan(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update = std::min(x(i), y(i));\n }, min_values);\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), 
KOKKOS_LAMBDA(const int& i, double& update) {\n update += min_values(i);\n }, sum);\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_left;\n using Reducer = Kokkos::Min;\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, Reducer& sum) {\n sum += std::min(x(i), y(i));\n });\n\n return sum.template get<0>();\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_left;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = typename TeamPolicy::member_type;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::DefaultExecutionSpace(), x.extent(0)), KOKKOS_LAMBDA(const TeamMember& team, double& sum) {\n const auto i = team.league_rank();\n sum += std::min(x(i), y(i));\n });\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = Kokkos::TeamMember;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::RangePolicy>(0, x.extent(0))),\n KOKKOS_LAMBDA(TeamMember, sum, x, y) {\n sum += std::min(x(TeamMember::team_rank()), y(TeamMember::team_rank()));\n });\n return sum;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n const auto min_xy = parallel_reduce(\n RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& sum) {\n sum += std::min(x(i), y(i));\n },\n 0.0);\n\n return min_xy;\n}\n", + "\n\n Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::make_span(x.data(), x.size()), 0.0,\n [](double a, double b) { return std::min(a, b); },\n [](double a, double b) { return a + b; });\n}\n", + "\n\n using namespace Kokkos;\n double sum = 0;\n auto min_lambda = KOKKOS_LAMBDA(const double& a, const double& b) { return std::min(a, b); };\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += min_lambda(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n // Create a Kokkos view of the sum\n Kokkos::View sum(\"sum\", x.extent(0));\n\n // Parallel loop to compute the sum of the minimum values\n Kokkos::parallel_for(\"sum_of_minimum_elements\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n sum(i) = std::min(x(i), y(i));\n });\n\n // Reduce the sum in parallel\n double sum_value = Kokkos::parallel_reduce(\n \"sum_of_minimum_elements_reduce\", Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& sum) { sum += sum(i); }, 0.0);\n\n return sum_value;\n}\n", + "Kokkoss", + "\n\n const int numElements = x.extent(0);\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::Range(0, numElements), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n auto min_xy = Kokkos::min(x, y);\n auto sum = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, min_xy.extent(0)),\n KOKKOS_LAMBDA(const int& i) -> double { return min_xy(i); },\n (double)0);\n return sum;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_lambda = 
KOKKOS_LAMBDA(const double& x, const double& y) { return std::min(x, y); };\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += min_lambda(x(i), y(i));\n }, sum);\n\n return sum;\n}\n", + "\n\n const int N = x.extent(0);\n double sum = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n auto min_values = min(x, y);\n double sum = 0;\n Kokkos::parallel_reduce(policy_type(0, min_values.extent(0)), KOKKOS_LAMBDA(int i, double& sum) {\n sum += min_values(i);\n }, sum);\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use OpenMP to sum in parallel.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0407811595127, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01938863229007, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01096884459257, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0092360092327, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00932764783502, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00907140579075, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00837958753109, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00892463959754, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0406195089221, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01953864842653, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01098547223955, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0093969065696, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00939859747887, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00914688389748, + "num_threads": 16 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.00822273269296, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00898688267916, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.04042596835643, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01936332490295, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01095431521535, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00923077967018, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00926715396345, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00899829026312, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00813940595835, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00894599575549, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.04053226653486, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01937995348126, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0109319511801, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00920719876885, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00932284034789, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00901327840984, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00819899775088, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00893752165139, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.04089895095676, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01934373583645, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01093180049211, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00925637967885, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00934133976698, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00895629692823, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085853330791, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00899816956371, + "num_threads": 64 
+ } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.04052487332374, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01925945878029, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0109599718824, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0092394400388, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00928092226386, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00895711462945, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00835657268763, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00912040807307, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n double sum = 0;\n\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.04086195006967, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01944644544274, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01101026274264, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00923150703311, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00944067481905, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00904505029321, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0082628281787, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00890563409775, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0405536795035, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01937165688723, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01093423757702, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00921080615371, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00936957746744, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00896627586335, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00831451769918, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00872349701822, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], 
y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.04074028078467, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01929815001786, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01097035352141, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00922828689218, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00934094320983, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00895864143968, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00847019236535, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00922274105251, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.04050431102514, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01936430670321, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01096669752151, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00921481363475, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00931961517781, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0090281104669, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0082477517426, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00834835097194, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.04053653925657, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01938372086734, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01089882086962, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0092306599021, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00929641537368, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00905261244625, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0083044545725, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00834417156875, + "num_threads": 64 + } + ] + }, + { + "generated_output": ".", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for 
reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.04069088771939, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01929337307811, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01100013963878, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00931297894567, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00933660473675, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00897863544524, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00840959437191, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00853041149676, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.04063003342599, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01949500627816, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.010955119133, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00922227595001, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00928238090128, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00906242914498, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0082281915471, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00831147599965, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma openmp parallel for reduction (+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.04177056215703, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01944561973214, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01942647118121, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01935525331646, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01942658554763, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0193248078227, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01938699148595, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01934912595898, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.04062697589397, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01935709249228, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01093887221068, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00933824982494, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00935322009027, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00904981940985, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00839202757925, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00890525355935, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.04060386456549, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01930042244494, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01095879953355, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00922089312226, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00932966805995, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00903969202191, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00823325179517, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00850991941988, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.04068840313703, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01935571860522, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01087830010802, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00933362152427, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00931086670607, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00905780587345, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00832468438894, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00908301752061, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.04060155004263, + "runs": [ + { 
+ "did_run": true, + "is_valid": true, + "runtime": 0.01932502482086, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01100100539625, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00919615700841, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00930956192315, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00898324213922, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00826726611704, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091354124248, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.04063205309212, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0193730706349, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01091398522258, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0092504946515, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00933345332742, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00905284956098, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00835055503994, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00889988336712, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "serial", + "prompt": "/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. 
sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "hip", + "prompt": "/* Compute the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Store the result in sum.\n Use AMD HIP to sum in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\n__global__ void sumOfMinimumElements(const double *x, const double *y, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipGetGlobalIndex();\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n sum[tid] = min(x[gid], y[gid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n sum[gid] = min(x[gid], y[gid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n sum[tid] = min(x[tid], y[tid]);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use MPI to sum in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Return the sum on all ranks.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; 
i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double sum = 0;\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n 
MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "cuda", + "prompt": "/* Compute the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Store the result in sum.\n Use CUDA to sum in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\n__global__ void sumOfMinimumElements(const double *x, const double *y, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ", Synagogues_", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[0] += fmin(x[tid], y[tid]);\n }\n}\n", + "{", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n sum[tid] = fmin(x[tid], y[tid]);\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n sum[tid] = fmin(x[i], y[i]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use MPI and OpenMP to sum in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Return the sum on all ranks.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+ : local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n double sum = 0.0;\n int n = x.size();\n #pragma opm parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n double sum = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int chunk = n / nprocs;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == nprocs - 1) {\n end = n;\n }\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+ : local_sum)\n for (int i = start; i < end; i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, 
MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n int n = x.size();\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < n; i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n double sum = 0.0;\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int num_elements = x.size();\n int elements_per_rank = num_elements / num_ranks;\n int start_index = my_rank * elements_per_rank;\n int end_index = start_index + elements_per_rank;\n\n for (int i = start_index; i < end_index; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n double sum = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double sum;\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel 
for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::sum;\n\n const int N = x.extent(0);\n\n double sum_x = 0.0;\n parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) { sum += x(i); }, sum_x);\n\n return sum_x / N;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n double sum = 0;\n parallel_reduce(RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); }, sum);\n\n double count = x.extent(0);\n return sum / count;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n using namespace Kokkos;\n\n double sum = 0;\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n auto sum = Kokkos::parallel_reduce(Policy(0, x.extent(0)), Reducer(0.0),\n KOKKOS_LAMBDA(const int& i, Reducer& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n using namespace Kokkos;\n double sum = 0;\n auto policy = Kokkos::RangePolicy<>(0, x.size());\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.size();\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n using l_double = Kokkos::LDDouble;\n\n auto sum = Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, l_double& sum) {\n sum += x(i);\n },\n l_double(0.0));\n\n return static_cast(sum) / x.extent(0);\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n policy_type policy(x.extent(0));\n\n double sum = 0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n const auto sum = 
Kokkos::parallel_reduce(Policy(0, x.size()), Reducer(0.0),\n KOKKOS_LAMBDA(const int& i, const Reducer& sum) {\n return sum + x(i);\n });\n\n return sum / x.size();\n}\n", + "\n\n typedef Kokkos::RangePolicy policy_type;\n policy_type policy(x.extent(0));\n\n double sum = 0.0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n return Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::Kok\u30b3\u30af_EXEC_SPACE, x, 0.0, Kok\u30b3\u30af::plus(), Kok\u30b3\u30af::divide(x.size()));\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n auto sum = parallel_reduce(\n RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); }, 0.0);\n\n return sum / x.size();\n}\n", + "\n\n using execution_space = typename Kokkos::View::execution_space;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto sum = Kokkos::parallel_reduce(\n policy_type(x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n },\n (double)0.0, tag_type());\n\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.size();\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n double sum = 0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(int i, double& sum) { sum += x(i); }, sum);\n return sum / x.size();\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using reduction_type = Kokkos::Sum;\n\n auto sum = Kokkos::parallel_reduce(policy_type(0, x.size()), reduction_type(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); });\n\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use OpenMP to compute in parallel.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00837271939963, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01619282569736, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00828132312745, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.00495018772781, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0050294669345, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00463762376457, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00424333196133, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00516664143652, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00829673390836, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01612595804036, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00819191876799, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0049353023991, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00503375139087, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00464749224484, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00430409815162, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00513818003237, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00836777184159, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01614620033652, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00831942651421, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00495765544474, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00500604324043, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00465347506106, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00424521937966, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00520756859332, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00832045022398, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01615089122206, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00835225619376, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00495862159878, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00501671098173, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.00464460477233, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00428585782647, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00514037963003, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00833627078682, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01614217869937, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00825858078897, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004963080585, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00501394253224, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00466149877757, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00419710688293, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0051681118086, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00837026014924, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01603382639587, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0082780122757, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0049412086606, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00508478730917, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004646416381, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00421239323914, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0051634028554, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00833342298865, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01604887079448, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00837911888957, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00498445257545, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0050332121551, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00464316625148, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00428731441498, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.00512753371149, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00837854780257, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01609854437411, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00835283566266, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00498355068266, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0050239790231, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00464251134545, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00423027575016, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00512506086379, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00833706185222, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01613913662732, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00835719350725, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00495362691581, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0050033243373, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00461766645312, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00421149972826, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00518060401082, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00837851986289, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01612701434642, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00830902718008, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00496491361409, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00502134021372, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00464513991028, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00427200552076, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00516000911593, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n 
}\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00839877016842, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01611902099103, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00825262907892, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00494368746877, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00503693502396, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00464897863567, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00424233544618, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00515116807073, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00839809309691, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01607329063118, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00831007119268, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00495272055268, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00501339305192, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00463991630822, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00427876655012, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00516430847347, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00836407449096, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01606170106679, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00837200172246, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00495613105595, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00503208432347, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00460799001157, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00423003658652, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00517214592546, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + 
"are_all_valid": true, + "best_sequential_runtime": 0.00835394561291, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01607368774712, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00828284211457, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00494600199163, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00502735134214, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00464679524302, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00420442838222, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00516075603664, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00839853361249, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01601594313979, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00826155021787, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00496314205229, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00501626394689, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00462901573628, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00423300471157, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00515058301389, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00840946640819, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01611080374569, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00841644536704, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00497297290713, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00499640479684, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00466399919242, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00423756353557, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00519741550088, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00836707241833, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01609043516219, + "num_threads": 1 + }, + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.00827527977526, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00496275927871, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0050186753273, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00464415792376, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00421043578535, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00515680257231, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00839058421552, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01609258111566, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00835240408778, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00493321996182, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00503424983472, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0046580100432, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00423519667238, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00516071207821, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00836392827332, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01611225586385, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00827074293047, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00497549567372, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00503856614232, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00460279230028, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00422378480434, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00521421693265, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00834185313433, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01600514501333, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00824379827827, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00495032463223, + "num_threads": 4 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.00502145811915, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00462732817978, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00569785311818, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00516283418983, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "serial", + "prompt": "/* Return the average of the vector x.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (double n : x) {\n sum += n;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double num : x) {\n sum += num;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "hip", + "prompt": "/* Compute the average of the vector x. Store the result in average.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\n__global__ void average(const double *x, size_t N, double *average) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (size_t i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double shared_sum;\n shared_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = shared_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (int i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double block_sum;\n block_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = block_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n atomicAdd(average, x[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n double sum = 0;\n size_t idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum += x[idx];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (int i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double sharedSum;\n sharedSum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = sharedSum / N;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n average[0] += x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = block.x;\n if (tid == 0) {\n *x = 0;\n for (int i = 0; i < N; i++) {\n *x += x[i];\n }\n *x = *x / N;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n double sum = 0;\n for (int i = tid; i < N; i += blockDim.x) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n" + ] + }, + { + "problem_type": 
"reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the average on all ranks.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg = sum / x.size();\n\n double global_avg;\n MPI_Allreduce(&avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n\n return global_avg;\n}\n", + "\n\n double sum = 0.0;\n for (double xi : x) {\n sum += xi;\n }\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0.0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / size;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (auto xi : x) {\n sum += xi;\n }\n double avg = sum / x.size();\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (auto v : x) {\n sum += v;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return 
avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg = sum / x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / x.size();\n double global_avg;\n MPI_Allreduce(&avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n return global_avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double local_sum = 0;\n for (double xi : x) {\n local_sum += xi;\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "cuda", + "prompt": "/* Compute the average of the vector x. Store the result in average.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Examples:\n \n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\n__global__ void average(const double *x, size_t N, double *average) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n average[0] += x[idx];\n }\n}\n", + "\n\n double sum = 0;\n for (size_t i = block_idx.x * block_dim.x + thread_idx.x; i < N; i += block_dim.x * grid_dim.x) {\n sum += x[i];\n }\n * average = sum / N;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n __shared__ double sum;\n sum = 0;\n if (i < N) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + 
threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n\n double sum = 0;\n for (int i = tid; i < N; i += bid_size * grid_size) {\n sum += x[i];\n }\n\n __shared__ double shared_sum;\n shared_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = shared_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n double sum = 0;\n for (int i = tid + bid * bid_size; i < N; i += grid_size * bid_size) {\n sum += x[i];\n }\n double avg = sum / N;\n *average = avg;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n __shared__ double sum;\n if (i < N) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n double sum = 0;\n if (tid < N) {\n sum = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n double sum_all = 0;\n for (int i = 0; i < gridDim.x; i++) {\n sum_all += sum;\n }\n *average = sum_all / N;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the average of the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the average on all ranks.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / size;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_avg = 0.0;\n #pragma omp parallel for reduction(+:local_avg)\n for (int i = 0; i < x.size(); ++i) {\n local_avg += x[i];\n }\n local_avg /= x.size();\n\n double global_avg;\n MPI_Allreduce(&local_avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n\n return global_avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a sum variable to store the sum of the elements in the vector\n double sum = 0.0;\n\n // Use OpenMP to parallelize the loop over the elements in the vector\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n // Add the current element to the sum\n sum += x[i];\n }\n\n // Get the average by dividing the sum by the number of elements\n double avg = sum / n;\n\n // Return the average\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n double local_avg = 0;\n int local_size = x.size();\n double sum = 0;\n\n#pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < local_size; i++) {\n sum += x[i];\n }\n\n local_avg = sum / local_size;\n\n double global_avg = 0;\n MPI_Allreduce(&local_avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= MPI_Comm_size(MPI_COMM_WORLD);\n\n return global_avg;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n", + "\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / x.size();\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n double global_sum = 0;\n\n #pragma omp parallel for reduction(+: local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n return sum / n;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use Kokkos to reduce in 
parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n auto xor_reduce = reduce(x, (bool)false, [](bool a, bool b) { return a ^ b; });\n return xor_reduce;\n}\n", + "\n\n // Create a Kokkos::TeamPolicy for the reduction.\n Kokkos::TeamPolicy policy(x.extent(0));\n\n // Define the reduction functor.\n struct XORFunctor {\n bool result = false;\n\n KOKKOS_INLINE_FUNCTION\n void operator()(const bool& x) const { result ^= x; }\n };\n\n // Create a Kokkos::ParallelReduce object to perform the reduction.\n Kokkos::ParallelReduce reduce(result);\n\n // Invoke the reduction.\n Kokkos::parallel_for(policy, reduce);\n\n // Return the result.\n return reduce.result;\n}\n", + "\n\n bool result = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& result) { result ^= x(i); }, result);\n return result;\n}\n", + "\n\n using namespace Kokkos;\n\n bool result = false;\n auto policy = Kokkos::RangePolicy(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n }, result);\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy>;\n using TeamMemberType = typename Policy::member_type;\n\n bool result = false;\n\n Kokkos::parallel_reduce(Policy(1), KOKKOS_LAMBDA(const TeamMemberType& team) {\n bool local_result = false;\n const bool* x_data = x.data();\n for (int i = team.league_rank(); i < x.size(); i += team.league_size()) {\n local_result ^= x_data[i];\n }\n team.team_reduce(KOKKOS_LAMBDA(const bool& a, const bool& b) { return a ^ b; }, local_result);\n }, result);\n\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x[i];\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x[i];\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy;\n using TeamMemberType = typename Policy::member_type;\n\n auto reduce = KOKKOS_LAMBDA(TeamMemberType team) {\n bool local_result = false;\n for (int i = team.league_rank(); i < x.extent(0); i += team.league_size()) {\n local_result ^= x(i);\n }\n return local_result;\n };\n\n bool global_result = Kokkos::parallel_reduce(Policy(x.extent(0)), reduce);\n return global_result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(x), Kokkos::Scan);\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i) {\n result ^= x(i);\n });\n return result;\n}\n", 
+ "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(false), x);\n}\n", + "\n\n using namespace Kokkos;\n bool result = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& result) {\n if (x(i)) {\n result = !result;\n }\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy;\n using Reducer = Kokkos::LogicalXor;\n\n return parallel_reduce(Policy(x.extent(0)), Reducer(false), x);\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(false),\n KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n}\n", + "\n\n using namespace Kokkos;\n\n auto reduce_functor = KOKKOS_LAMBDA(const bool& a, const bool& b) {\n return a ^ b;\n };\n\n auto reduce_result = reduce(x, reduce_functor, false);\n\n return reduce_result;\n}\n", + "\n\n bool result = x(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(1, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x(i);\n });\n return result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use OpenMP to reduce in parallel.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01300624851137, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01318771224469, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00662083011121, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00349043887109, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00193272512406, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011212863028, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00062911994755, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00042882300913, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0131170835346, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01306237243116, + "num_threads": 1 + }, + 
{ + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01309708803892, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0132047124207, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n {\n for (auto const& xi : x) {\n result ^= xi;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01299874670804, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01253568939865, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": ")", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01306640449911, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01309945434332, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01291813515127, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01295716855675, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n #pragma omp parallel reduction(+:result)\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01306360531598, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01255580969155, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n 
return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01315719261765, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01313328575343, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01311347670853, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01303360257298, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0130087018013, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01309121865779, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01299403626472, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01303257029504, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0130185553804, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0130080986768, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0064965872094, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00337359737605, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00197764001787, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00110433660448, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00071128048003, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00042620785534, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); 
++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01303241979331, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01311494093388, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01311688497663, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01309398356825, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n #pragma omp parallel for reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01306749563664, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01308416537941, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00659037884325, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00336880031973, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0018234077841, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00097896233201, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00061332471669, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00042461454868, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01314185149968, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01312501765788, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01305044349283, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01314639840275, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00664411839098, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00336637608707, + 
"num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00186146497726, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00117177311331, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00072151981294, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00037491768599, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01305109113455, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01313986815512, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01301862578839, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01304860990494, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "serial", + "prompt": "/* Return the logical XOR reduction of the vector of bools x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "(", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + ")", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= 
xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "hip", + "prompt": "/* Compute the logical XOR reduction of the vector of bools x. Store the result in output.\n Use AMD HIP to reduce in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\n__global__ void reduceLogicalXOR(const bool *x, size_t N, bool *output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n bool local_result = x[gid];\n __syncthreads();\n while (blockDim.x <= N) {\n if (tid == 0) {\n local_result ^= x[gid + blockDim.x];\n }\n __syncthreads();\n blockDim.x *= 2;\n __syncthreads();\n }\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int size = blockDim.x;\n bool local_result = x[bid * size + tid];\n __syncthreads();\n for (int i = size / 2; i > 0; i /= 2) {\n if (tid < i) {\n local_result ^= x[bid * size + tid + i];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int warp_size = warpSize;\n int num_warps = bid_size / warp_size;\n int warp_id = tid / warp_size;\n int lane_id = tid % warp_size;\n bool warp_result = x[bid * bid_size + tid];\n for (int i = 1; i < warp_size; i++) {\n warp_result ^= x[bid * bid_size + tid + i];\n }\n __shared__ bool shared_result[num_warps];\n shared_result[warp_id] = warp_result;\n __syncthreads();\n if (warp_id == 0) {\n bool result = shared_result[0];\n for (int i = 1; i < num_warps; i++) {\n result ^= shared_result[i];\n }\n *output = result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n bool local_result = x[tid];\n __syncthreads();\n while (blockDim.x <= N) {\n if (tid < N) { local_result ^= x[tid + blockDim.x]; }\n __syncthreads();\n blockDim.x *= 2;\n __syncthreads();\n }\n if (tid == 0) { *output = local_result; }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n bool local_result = x[gid];\n\n __syncthreads();\n\n while (bid > 0) {\n if (tid == 0) {\n local_result ^= x[gid + hipBlockDim_x];\n }\n bid >>= 1;\n gid >>= 1;\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n bool local_result = x[gid];\n\n while (gid < N) {\n local_result ^= x[gid];\n gid += hipBlockDim_x;\n }\n\n output[tid] = 
local_result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n bool result = false;\n while (gid < N) {\n result ^= x[gid];\n gid += hipBlockDim_x * hipGridDim_x;\n }\n output[tid] = result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n output[0] ^= output[i];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use MPI to reduce in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = false;\n for (bool b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int rank;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool result = x[rank];\n MPI_Allreduce(&result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n MPI_Allxor(&result, &result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = x[0];\n for (size_t i = 1; i < x.size(); i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "cuda", + "prompt": "/* Compute the logical XOR reduction of the vector of bools x. Store the result in output.\n Use CUDA to reduce in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\n__global__ void reduceLogicalXOR(const bool *x, size_t N, bool *output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Each thread reduces a single element of the output.\n int tid = threadIdx.x;\n output[tid] = x[tid];\n __syncthreads();\n\n // Reduce the output in place.\n while (tid < N) {\n output[tid] ^= output[tid + blockDim.x];\n tid += blockDim.x * 2;\n __syncthreads();\n }\n}\n", + "\n\n // Each thread reduces a single element of the output\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the output in place\n for (int stride = 1; stride < N; stride *= 2) {\n for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < N; tid += 2 * stride) {\n output[tid] ^= output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n // Each thread reduces a subset of the input.\n int tid = threadIdx.x;\n int stride = blockDim.x;\n bool local_result = false;\n for (int i = tid; i < N; i += stride) {\n local_result ^= x[i];\n }\n\n // Reduce the local results using a shared memory reduction.\n __shared__ bool shared_result[blockDim.x];\n shared_result[tid] = local_result;\n __syncthreads();\n\n // The first thread writes the final result to output.\n if (tid == 0) {\n bool final_result = false;\n for (int i = 0; i < blockDim.x; ++i) {\n final_result ^= shared_result[i];\n }\n *output = final_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n // Each thread reduces a single element of the output\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the output in place\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid] ^= output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncbarriers();\n while (N > 1) {\n if (tid < N / 2) {\n output[tid] ^= output[tid + N / 2];\n }\n __syncbarriers();\n N /= 2;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n bool result = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n result ^= x[i];\n }\n output[bid] = result;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n bool result = x[bid * stride + tid];\n __syncthreads();\n while (stride > 1) {\n if (tid < stride) {\n result ^= x[bid * stride + tid + stride];\n }\n stride /= 2;\n __syncthreads();\n }\n if (tid == 0) {\n *output = result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int num_elements = N;\n int num_blocks = (num_elements + bid_size - 1) / bid_size;\n\n bool local_result = x[tid];\n __syncthreads();\n while (bid_size > 1) {\n if (tid < bid_size / 2) {\n local_result ^= x[tid + bid_size / 2];\n }\n bid_size /= 2;\n __syncthreads();\n }\n if (bid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n 
__syncthreads();\n}\n", + "\n\n // Each thread reduces a single element of the vector\n int tid = threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the vector in parallel\n while (N > 1) {\n if (tid < N / 2) {\n output[tid] ^= output[tid + N / 2];\n }\n N /= 2;\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = blockDim.x; i < N; i += blockDim.x) {\n output[0] ^= output[i];\n }\n }\n}\n", + "\n\n // Each thread reduces a single element of the output.\n int idx = threadIdx.x;\n output[idx] = x[idx];\n\n // Each thread reduces a pair of elements of the output.\n while (idx < N) {\n output[idx] ^= x[idx + 1];\n idx += 2;\n }\n\n // Reduce the remaining elements of the output.\n for (int i = 1; i < N; i *= 2) {\n for (int j = 0; j < N; j += 2 * i) {\n output[j] ^= output[j + i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n bool local_result = x[tid];\n\n for (int i = tid + stride; i < N; i += stride) {\n local_result ^= x[i];\n }\n\n output[bid] = local_result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int num_blocks = gridDim.x;\n\n bool local_result = x[tid];\n\n __syncthreads();\n\n while (bid_size > 1) {\n if (tid < bid_size / 2) {\n local_result = local_result ^ x[tid + bid_size / 2];\n }\n bid_size /= 2;\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n bool result = x[bid * stride + tid];\n\n __syncthreads();\n\n for (int offset = stride; offset < N; offset *= 2) {\n if (tid < offset) {\n result ^= x[bid * stride + tid + offset];\n }\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x * gridDim.x;\n\n bool local_result = x[tid + bid * stride];\n\n for (int i = 2 * stride; i <= N; i *= 2) {\n if (tid < N / i) {\n local_result ^= x[tid + bid * i];\n }\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use MPI and OpenMP to reduce in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on all ranks.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(^: local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int nranks;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n\n bool local_result = false;\n for (auto b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Reduce on each rank\n bool local_result = false;\n #pragma omp parallel for reduction(^:local_result)\n for (size_t i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n // Reduce across all ranks\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n\n int nranks;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n\n bool local_result = false;\n for (bool b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR,\n MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i 
< x.size(); ++i) {\n local_result ^= x[i];\n }\n }\n\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int nranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(+:local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n #pragma omp parallel for reduction(^:result)\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool local_result = x[my_rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + ".", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool local_result = x[my_rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n\n bool result;\n MPI_Reduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, 0, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n bool result = x[0];\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (int i = 1; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = false;\n #pragma omp parallel for reduction(^:local_result)\n for (size_t i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n // Reduce the vector of bools using MPI_Reduce.\n bool result;\n MPI_Allreduce(MPI_IN_PLACE, &x[0], x.size(), MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n // Return the result.\n return result;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the array x into output.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(Kokkos::View const& x, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA (const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int i) {\n output(i) = 0;\n for (int j = i; j >= 0; j--) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n output(i) = 0;\n for (int j = i; j < n; ++j) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::View temp = output.slice(0, output.size() - 1);\n Kok\u30b3\u30af::View temp2 = output.slice(1, output.size());\n Kok\u30b3\u30af::View x_rev = x.reversed();\n Kok\u30b3\u30af::View temp3 = x_rev.slice(1, x_rev.size());\n Kok\u30b3\u30af::View temp4 = x_rev.slice(0, x_rev.size() - 1);\n Kok\u30b3\u30af::View temp5 = temp4 + temp3;\n temp = temp5;\n temp2 = temp5 + temp;\n\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n output(i) = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, i),\n KOKKOS_LAMBDA(const int& j, const int& val) { return val + x(i - j - 1); },\n 0);\n });\n}\n", + "(", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(\"reversePrefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i, int& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = x(i) + output(i - 1);\n }\n },\n output(n - 1));\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i) { output(i) = output(n - 1) - output(i); });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i, int& sum) {\n sum = (i == 0) ? 
x(i) : sum + x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n // Compute the reverse prefix sum\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n Kokkos::ScanSum(Kokkos::Reverse));\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n output(N - 1));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n output(i) = x(x.extent(0) - 1 - i);\n if (i > 0) output(i) += output(i - 1);\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA (const int i) {\n output(i) = 0;\n for (int j = n - 1; j >= i; j--) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n output(i) = (i == 0) ? x(i) : x(i) + output(i - 1);\n });\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Tag = Kokkos::TagType;\n\n Kokkos::parallel_for(Policy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n output(i) = x(x.extent(0) - 1 - i);\n if (i > 0) output(i) += output(i - 1);\n }, Tag());\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n 0);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use OpenMP to compute in parallel.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + 
"did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n output[i] = x[i];\n } else {\n output[i] = output[i + 1] + x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int 
i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 
+ } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n output[i] = 0;\n for (int j = n - 1; j >= i; --j) {\n output[i] += x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n output[i] = 0;\n for (int j = i; j < n; j++) {\n output[i] += x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "serial", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n 
output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output.back() = x.back();\n for (int i = x.size() - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n for (int i = x.size() - 1; i >= 0; --i) {\n if (i + 1 < x.size()) {\n output[i] = x[i] + output[i + 1];\n } else {\n output[i] = x[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output.back() = x.back();\n for (int i = x.size() - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\n__global__ void reversePrefixSum(const int *x, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n if (tid > 0) {\n output[tid] += output[tid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n output[gid] = x[N - 1 - gid];\n if (gid > 0) { output[gid] += output[gid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[N - 1 - gid] = x[gid];\n }\n __syncthreads();\n for (int i = N - 2; i >= 0; i--) {\n if (gid < N) {\n output[i] = output[i + 1] + x[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n int i = tid + bid * blockSize;\n if (i < N) {\n output[i] = 0;\n for (int j = N - 1; j >= i; j--) { output[i] += x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = blockDim.x / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int offset = bid * hipBlockDim_x + tid;\n if (offset < N) {\n output[offset] = x[N - 1 - offset];\n if (offset > 0) {\n output[offset] += output[offset - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n 
__syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum in reverse order\n std::vector prefix_sum(x.size());\n prefix_sum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n prefix_sum[i] = prefix_sum[i + 1] + x[i];\n }\n\n // Gather the prefix sums from all ranks\n std::vector gathered_prefix_sums(x.size() * size);\n MPI_Gather(prefix_sum.data(), x.size(), MPI_INT, gathered_prefix_sums.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gathered_prefix_sums[i * size + size - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_sum = 0;\n for (int i = n - 1; i >= 0; i--) {\n local_sum += x[i];\n if (rank == 0) {\n output[i] = local_sum;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n, 0);\n std::vector global_sums(n, 0);\n\n // Compute the partial sums on each rank\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n if (i > 0) {\n partial_sums[i] += partial_sums[i - 1];\n }\n }\n\n // Gather the partial sums from all ranks\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n global_sums[i] += global_sums[i + 1];\n }\n output[i] = global_sums[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.reserve(n);\n std::iota(std::execution::par, output.begin(), output.end(), 0);\n std::transform(std::execution::par, x.rbegin(), x.rend(), output.rbegin(),\n [](int a, int b) { return a + b; });\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n 
}\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n std::vector prefix_sum(n);\n MPI_Allreduce(x.data(), prefix_sum.data(), n, MPI_INT, MPI_SUM, comm);\n\n if (rank == 0) {\n output.resize(n);\n for (int i = 0; i < n; i++) {\n output[i] = prefix_sum[n - 1 - i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n local_output[i] = x[i];\n } else {\n local_output[i] = x[i] + local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n for (int i = 0; i < n; i++) {\n if (i == 0) {\n output[i] = global_output[i];\n } else {\n output[i] = global_output[i] - global_output[i - 1];\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n for (int i = n - 1; i >= 0; i--) {\n if (rank == 0) {\n output[i] = local_x[i];\n }\n MPI_Allreduce(MPI_IN_PLACE, &local_x[i], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n local_output[i] = output[i];\n }\n }\n if (rank == 0) {\n output = local_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_sum = 0;\n for (int i = x.size() - 1; i >= 0; i--) {\n local_sum += x[i];\n if (rank == 0) {\n output[i] = local_sum;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector global_sums(n);\n\n // Compute the partial sums on each rank\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n if (i > 0) {\n partial_sums[i] += partial_sums[i - 1];\n }\n }\n\n // Gather the partial sums from all ranks\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n global_sums[i] += global_sums[i + 1];\n }\n output[i] = global_sums[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector partial_sums(n);\n std::partial_sum(x.rbegin(), x.rend(), partial_sums.rbegin());\n\n std::vector global_sums(n);\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n local_output[i] = x[i];\n } else {\n local_output[i] = x[i] + local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n 
MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Reverse the global prefix sum\n for (int i = 0; i < n / 2; i++) {\n std::swap(global_output[i], global_output[n - i - 1]);\n }\n\n // Store the result in output on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the reverse prefix sum\n std::vector reverse_prefix_sum(x.size());\n reverse_prefix_sum[x.size() - 1] = prefix_sum[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] - x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n output = reverse_prefix_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefix_sum(n);\n std::vector reverse_prefix_sum(n);\n\n // Compute the prefix sum\n prefix_sum[0] = x[0];\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the reverse prefix sum\n reverse_prefix_sum[n - 1] = prefix_sum[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] + prefix_sum[i];\n }\n\n // Gather the reverse prefix sum on rank 0\n MPI_Gather(&reverse_prefix_sum[0], n, MPI_INT, &output[0], n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n local_output[i] = x[i];\n if (i + 1 < n) {\n local_output[i] += local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i + 1 < n) {\n global_output[i] -= global_output[i + 1];\n }\n }\n\n // Store the result on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector global_sums(n);\n\n // Compute the partial sums on each rank\n partial_sums[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n partial_sums[i] = partial_sums[i + 1] + x[i];\n }\n\n // Gather the partial sums into the global_sums vector on rank 0\n MPI_Gather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n output[0] = global_sums[0];\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + global_sums[n - i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x in reverse order\n std::vector reversePrefixSum(x.size());\n reversePrefixSum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reversePrefixSum[i] = reversePrefixSum[i + 1] + x[i];\n }\n\n // Gather the reverse prefix sums from all ranks\n std::vector 
gatheredReversePrefixSum(x.size() * size);\n MPI_Gather(reversePrefixSum.data(), x.size(), MPI_INT, gatheredReversePrefixSum.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the final reverse prefix sum\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gatheredReversePrefixSum[i * size];\n }\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Compute the prefix sum of the vector x in reverse order.\n std::vector reverse_prefix_sum(x.size());\n reverse_prefix_sum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] + x[i];\n }\n\n // Gather the reverse prefix sums from all ranks.\n std::vector gathered_reverse_prefix_sums(x.size() * num_ranks);\n MPI_Gather(reverse_prefix_sum.data(), x.size(), MPI_INT, gathered_reverse_prefix_sums.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the output vector.\n if (my_rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gathered_reverse_prefix_sums[i * num_ranks + my_rank];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n for (int i = 0; i < n; i++) {\n local_output[i] = x[n - i - 1];\n }\n\n std::vector global_output(n);\n MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector recvbuf(n);\n std::vector sendbuf(n);\n for (int i = 0; i < n; i++) {\n sendbuf[i] = x[n - i - 1];\n }\n MPI_Allgather(sendbuf.data(), n, MPI_INT, recvbuf.data(), n, MPI_INT, MPI_COMM_WORLD);\n int sum = 0;\n for (int i = 0; i < n; i++) {\n sum += recvbuf[i];\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\n__global__ void reversePrefixSum(const int *x, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (int j = block_size / 2; j > 0; j /= 2) {\n if (tid < j) {\n output[tid] += output[tid + j];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n for (int i = tid + 1; i < N; i++) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n int offset = bid * stride;\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i + offset];\n output[i + offset] = sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N - 1; i >= 0; i--) {\n if (tid == i) {\n output[i] += output[i + 1];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid == 0) {\n output[0] = x[0];\n } else {\n output[tid] = output[tid - 1] + x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int i = blockIdx.x * blockDim.x + tid;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (int j = i + blockDim.x; j < N; j += blockDim.x) {\n output[j] = output[j - blockDim.x] + x[j];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int sum = 0;\n for (int i = tid; i < N; i += stride) {\n sum += x[i];\n output[N - 1 - i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n output[i] = 0;\n for (int j = i + 1; j < N; j++) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n int offset = bid * stride;\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i + offset];\n output[i + offset] = sum;\n }\n}\n", + "\n\n 
int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n if (i > 0) { output[i] += output[i - 1]; }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n int offset = block * stride + tid;\n\n if (offset < N) {\n output[offset] = x[offset];\n }\n\n __syncthreads();\n\n for (int i = offset + stride; i < N; i += stride) {\n output[i] = output[i - stride] + x[i];\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n if (tid == 0) {\n output[0] = x[0];\n } else {\n output[tid] = output[tid - 1] + x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_size * block_id + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int stride = blockDim * gridDim;\n int start = tid + blockId * blockDim;\n int end = min(start + stride, N);\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector reverse_partial_sums(n);\n\n // Compute the partial sums in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n partial_sums[i] = x[i];\n for (int j = i + 1; j < n; ++j) {\n partial_sums[j] += partial_sums[j - 1];\n }\n }\n\n // Compute the reverse partial sums in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; --i) {\n reverse_partial_sums[i] = partial_sums[i];\n if (i > 0) {\n reverse_partial_sums[i] += reverse_partial_sums[i - 1];\n }\n }\n\n // Gather the reverse partial sums on rank 0\n MPI_Gather(&reverse_partial_sums[0], n, MPI_INT, &output[0], n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local prefix sum\n std::vector local_prefix_sum(n);\n local_prefix_sum[0] = x[0];\n for (int i = 1; i < n; i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n\n // Compute the global prefix sum\n std::vector global_prefix_sum(n);\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n if (rank == 0) {\n output.resize(n);\n output[n - 1] = global_prefix_sum[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = global_prefix_sum[i] - global_prefix_sum[i + 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n 
std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_output[i] = local_x[n - 1 - i];\n }\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n std::vector local_prefix_sum(x.size());\n local_prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n\n // Compute the global prefix sum\n std::vector global_prefix_sum(x.size());\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), x.size(), MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = global_prefix_sum[x.size() - 1] - global_prefix_sum[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int n = x.size();\n std::vector partial_sums(n);\n\n // Compute the partial sums in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n partial_sums[j] += partial_sums[j - 1];\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sums(n);\n MPI_Reduce(partial_sums.data(), global_sums.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum on rank 0\n if (my_rank == 0) {\n output.resize(n);\n output[n - 1] = global_sums[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = global_sums[i] - global_sums[i + 1];\n }\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local reverse prefix sum\n std::vector local_output(n);\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (my_rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n std::vector global_output(n);\n MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the prefix sum in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_output[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n local_output[i] += x[j];\n }\n }\n\n // Reduce the local prefix sums to the global prefix sum on rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Reverse the prefix sum on rank 0\n if (rank == 0) {\n for (int i = 0; i < n / 2; i++) {\n std::swap(output[i], output[n - i - 1]);\n }\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n prefix_sum[i] += prefix_sum[i + 1];\n }\n }\n\n // Gather the results on rank 0\n if (my_rank == 0) {\n output = prefix_sum;\n }\n}\n", + "\n\n int n = x.size();\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &n_procs);\n int local_n = n / n_procs;\n int remainder = n % n_procs;\n if (my_rank == 0) {\n output.resize(n);\n }\n std::vector local_x(local_n + (my_rank < remainder ? 1 : 0));\n MPI_Scatter(x.data(), local_n, MPI_INT, local_x.data(), local_n, MPI_INT, 0,\n MPI_COMM_WORLD);\n std::vector local_output(local_n + (my_rank < remainder ? 1 : 0));\n local_output[local_n - 1] = local_x[local_n - 1];\n #pragma omp parallel for\n for (int i = local_n - 2; i >= 0; --i) {\n local_output[i] = local_x[i] + local_output[i + 1];\n }\n MPI_Gather(local_output.data(), local_n, MPI_INT, output.data(), local_n, MPI_INT,\n 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Reverse the prefix sum in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n output[i] = prefix_sum[n - 1 - i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nranks; i++) {\n std::vector recv_buf(n);\n MPI_Recv(recv_buf.data(), n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n output[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; 
i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n prefix_sum[i] -= prefix_sum[i + 1];\n }\n output[i] = prefix_sum[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = x.size() - 1; i >= 0; i--) {\n if (i < x.size() - 1) {\n prefix_sum[i] -= prefix_sum[i + 1];\n }\n output[i] = prefix_sum[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n local_output[i] = local_x[i];\n if (i + 1 < n) {\n local_output[i] += local_output[i + 1];\n }\n }\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of the vector x\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum of the vector x\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n output[i] = prefix_sum[x.size() - 1] - prefix_sum[i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(&output[0], x.size(), MPI_INT, &output[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + 
"parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n using namespace Kokkos;\n\n const size_t n = x.extent(0);\n View y(\"y\", n);\n\n parallel_for(n, KOKKOS_LAMBDA(const size_t i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return sum(y);\n}\n", + "\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n using l_d_type = Kokkos::LayoutLeft;\n using l_d_view_type = Kokkos::View;\n\n const size_t n = x.extent(0);\n l_d_view_type x_copy(\"x_copy\", n);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::parallel_scan(policy_type(0, n), KOKKOS_LAMBDA(const size_t& i, l_d_view_type& x_copy) {\n if (i > 0) { x_copy(i) += x_copy(i - 1); }\n }, tag_type());\n double sum = 0;\n for (size_t i = 0; i < n; ++i) { sum += x_copy(i); }\n return sum;\n}\n", + "\n using namespace Kokkos;\n\n const int N = x.extent(0);\n View y(\"y\", N);\n\n parallel_for(N, KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return reduce(N, KOKKOS_LAMBDA(int i, double sum) { return sum + y(i); }, 0.0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_type;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = typename TeamPolicy::member_type;\n\n double sum = 0.0;\n Kokkos::parallel_reduce(TeamPolicy(Kokkos::Experimental::partition_by_coordinates(x)),\n KOKKOS_LAMBDA(TeamMember& team) {\n double local_sum = 0.0;\n for (auto i = team.league_rank(); i < x.extent(0);\n i += team.league_size()) {\n local_sum += x(i);\n }\n team.team_reduce(KOKKOS_LAMBDA(double& tsum,\n double& lsum) { tsum += lsum; });\n if (team.team_rank() == 0) { sum = team.team_reduce(); }\n });\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using execution_space = Kokkos::DefaultExecutionSpace;\n using policy_type = Kokkos::RangePolicy;\n\n // Compute the prefix sum array\n View prefix_sum(\"prefix_sum\", x.size());\n parallel_for(policy_type(0, x.size()), KOKKOS_LAMBDA(const int i) {\n prefix_sum(i) = (i == 0) ? 
x(i) : x(i) + prefix_sum(i - 1);\n });\n\n // Compute the sum of the prefix sum array\n double sum = 0;\n parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int i, double& sum) {\n sum += prefix_sum(i);\n }, sum);\n\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n using execution_space = Kokkos::DefaultExecutionSpace;\n\n const int n = x.extent(0);\n auto y = Kokkos::View(\"y\", n);\n\n Kokkos::parallel_for(\n \"prefix_sum\", policy_t(0, n), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return Kokkos::parallel_reduce(\n \"sum_prefix_sum\", policy_t(0, n), KOKKOS_LAMBDA(const double& a,\n const double& b) {\n return a + b;\n },\n y);\n}\n", + "\n using namespace Kokkos;\n\n using policy_type = Kokkos::TeamPolicy;\n policy_type policy(x.extent(0));\n\n double sum = 0.0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n double prefix_sum = 0.0;\n for (int j = 0; j <= i; ++j) { prefix_sum += x(j); }\n sum += prefix_sum;\n });\n\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n auto prefix_sum = Kokkos::scan(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n },\n Reducer(0));\n\n return Kokkos::parallel_reduce(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += prefix_sum(i);\n },\n Reducer(0));\n}\n", + "\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::TeamPolicy;\n\n double sum = 0;\n Kokkos::parallel_reduce(\n TeamPolicy<>(x.extent(0), Kokkos::AUTO),\n KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n sum += update;\n },\n sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a prefix sum view\n View prefix_sum = create_mirror_view(x);\n\n // Compute the prefix sum\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = prefix_sum(i - 1) + x(i);\n }\n prefix_sum(i) = sum;\n });\n\n // Compute the sum of the prefix sum array\n double sum_of_prefix_sum = 0.0;\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += prefix_sum(i);\n }, sum_of_prefix_sum);\n\n return sum_of_prefix_sum;\n}\n", + "\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using ViewType = Kokkos::View;\n\n const size_t n = x.extent(0);\n ViewType prefix_sum(\"prefix_sum\", n);\n\n parallel_for(\"prefix_sum\", Policy(0, n), KOKKOS_LAMBDA(const size_t& i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n\n return prefix_sum(n - 1);\n}\n", + "\n Kokkos::View prefixSum = Kokkos::View(x.extent(0));\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = prefixSum(i - 1) + x(i);\n }\n }, prefixSum);\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n update += prefixSum(i);\n }, sum);\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.extent(0));\n Kokkos::parallel_for(\"prefix_sum_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n return Kokkos::parallel_reduce(\"sum_prefix_sum\", 
Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double& sum) {\n sum += prefix_sum(i);\n }, 0.0);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto prefix_sum = Kokkos::scan(policy_type(0, x.size()), tag_type(), x,\n [=](const double& a, const double& b) { return a + b; });\n\n return Kokkos::parallel_reduce(policy_type(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += prefix_sum(i); },\n 0.0);\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n prefix_sum(i) = (i == 0) ? x(i) : x(i) + prefix_sum(i - 1);\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n double sum = 0;\n Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n sum += update;\n },\n tag_type());\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = prefix_sum(i - 1) + x(i);\n }\n prefix_sum(i) = sum;\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reduction = Kokkos::Sum;\n\n // Compute the prefix sum array\n View y = Kokkos::scan(Policy(0, x.size()), Reduction(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n // Compute the sum of the prefix sum array\n double sum = Kokkos::reduce(Policy(0, y.size()), Reduction(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += y(i);\n });\n\n return sum;\n}\n", + "\n using Kokkos::parallel_reduce;\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n }, sum);\n\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n double sum = 0;\n std::vector prefixSum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01318165026605, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01864813119173, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n std::vector prefixSum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n sum += prefixSum[i];\n }\n return 
sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01316073071212, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01855306811631, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01301139220595, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02269855849445, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01307557076216, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01856765896082, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = (i > 0 ? 
prefix_sum[i - 1] : 0) + x[i];\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01315082684159, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01855808906257, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01306140646338, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02260511405766, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector prefix_sum(n, 0);\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01301123388112, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02254385259002, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n sum += x[i];\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) prefix_sum[i] += prefix_sum[i - 1];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < n; i++) {\n sum += 
prefix_sum[i];\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01316588632762, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02269976176322, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n }\n for (double v : x) {\n sum += v;\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n }\n for (double v : x) {\n sum += v;\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n #pragma omp parallel for reduction(+: sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01295912563801, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02254348881543, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01298007685691, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01846898347139, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n double sum = 0.0;\n std::vector prefixSum(x.size(), 0.0);\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) { prefixSum[i] += prefixSum[i - 1]; }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01309428587556, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 
0.01857235021889, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01297080814838, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01842112559825, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Compute the prefix sum array\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sum array\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0131338423118, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02271802034229, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Create a vector to store the prefix sum\n std::vector prefix_sum(x.size());\n\n // Compute the prefix sum in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sum\n double sum = 0;\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01308438163251, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02270823009312, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = (i > 0 ? 
prefix_sum[i - 1] : 0) + x[i];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01293872762471, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02250320985913, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "serial", + "prompt": "/* Compute the prefix sum array of the vector x and return its sum.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (auto xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "(", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "hip", + "prompt": "/* Compute the prefix sum array 
of the vector x and compute its sum. Store the result in sum.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\n__global__ void sumOfPrefixSum(const double *x, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n sum[tid + stride] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n\n double sum_local = 0;\n for (int i = gid; i < N; i += bid_size) {\n sum_local += x[i];\n }\n\n double sum_global;\n hipReduce(&sum_global, &sum_local, 1, hipReduceSum, 0);\n *sum = sum_global;\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid < N) {\n sum[gid] = x[gid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < hipBlockDim_x; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n double prefix_sum = 0;\n for (int i = 0; i <= gid; i++) {\n prefix_sum += x[i];\n }\n sum[gid] = prefix_sum;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n sum[tid] += sum[tid + i];\n }\n __syncthreads();\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum_local = x[tid];\n __syncthreads();\n for (int i = 1; i < blockDim.x; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n sum_local += x[tid + i];\n }\n __syncthreads();\n }\n sum[tid] = sum_local;\n }\n __syncthreads();\n if (tid == 0) {\n double sum_global = 0;\n for (int i = 0; i < N; i++) {\n sum_global += sum[i];\n }\n *sum = sum_global;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x 
+ threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n sum[bid] += x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n size_t stride = blockDim.x * gridDim.x;\n double localSum = 0;\n for (size_t i = tid; i < N; i += stride) {\n localSum += x[i];\n }\n __syncthreads();\n double globalSum = localSum;\n for (size_t i = 1; i < blockDim.x; i++) {\n globalSum += localSum;\n }\n __syncthreads();\n *sum = globalSum;\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n sum[tid + stride] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n if (tid > 0) { sum[tid] += sum[tid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n __syncthreads();\n if (tid > 0) { sum[tid] += sum[tid - 1]; }\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 0; i < N; i++) { sum[0] += sum[i]; }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "1\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sum\n double sum = 0;\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), 
prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = std::accumulate(prefix_sum.begin(), prefix_sum.end(), 0.0);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n\n double prefix_sum = 0;\n MPI_Allreduce(&sum, &prefix_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return prefix_sum;\n } else {\n return 0;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefixSum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum array\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sum array\n double sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n MPI_Allreduce(&sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n }\n return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "cuda", + "prompt": "/* Compute the prefix sum array of the vector x and compute its sum. Store the result in sum.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\n__global__ void sumOfPrefixSum(const double *x, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n double sum_local = 0;\n for (int i = tid; i < N; i += stride) {\n sum_local += x[i];\n }\n __shared__ double sum_shared[blockDim.x];\n sum_shared[tid] = sum_local;\n __syncthreads();\n if (tid == 0) {\n double sum_block = 0;\n for (int i = 0; i < blockDim.x; i++) {\n sum_block += sum_shared[i];\n }\n atomicAdd(sum, sum_block);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n const int bid = blockIdx.x;\n const int bid_size = blockDim.x;\n const int gid = bid * bid_size + tid;\n\n if (gid < N) {\n double partial_sum = x[gid];\n if (gid > 0) {\n partial_sum += x[gid - 1];\n }\n sum[gid] = partial_sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = tid + bid * bid_size;\n if (i < N) {\n sum[i] = x[i];\n }\n __syncthreads();\n for (int s = bid_size / 2; s > 0; s >>= 1) {\n if (i < N && tid < s) {\n sum[i] += sum[i + s];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum_prefix = 0;\n for (int i = 0; i <= tid; i++) {\n sum_prefix += x[i];\n }\n double sum_suffix = 0;\n for (int i = tid + 1; i < N; i++) {\n sum_suffix += x[i];\n }\n sum[tid] = sum_prefix + sum_suffix;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * 
blockDim.x + threadIdx.x;\n if (tid < N) {\n double partial_sum = 0;\n for (int i = 0; i <= tid; i++) { partial_sum += x[i]; }\n sum[tid] = partial_sum;\n }\n __syncthreads();\n if (tid == 0) {\n double total_sum = 0;\n for (int i = 0; i < N; i++) { total_sum += sum[i]; }\n *sum = total_sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = 1; i < N; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double prefixSum = 0;\n for (int i = 0; i <= tid; i++) {\n prefixSum += x[i];\n }\n sum[tid] = prefixSum;\n }\n __syncthreads();\n if (tid == 0) {\n double totalSum = 0;\n for (int i = 0; i < N; i++) {\n totalSum += sum[i];\n }\n *sum = totalSum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = bid * bid_size + tid;\n\n double prefix_sum = 0;\n while (i < N) {\n prefix_sum += x[i];\n x[i] = prefix_sum;\n i += grid_size * bid_size;\n }\n\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) { *sum += x[i]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n double s = 0;\n for (int i = 0; i <= gid; i++) {\n s += x[i];\n }\n sum[gid] = s;\n if (gid > 0) {\n sum[gid] += sum[gid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(sum, x[tid]);\n }\n __syncthreads();\n if (tid == 0) {\n double sum_temp = *sum;\n atomicAdd(sum, sum_temp);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(prefixSum.data(), &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) return sum;\n else return 0.0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[0], &sum, x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefix_sum[x.size() - 1], 1, MPI_DOUBLE, &sum, 1, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n double global_sum = 0.0;\n\n // Compute the prefix sum of the local vector\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 1; i < x.size(); i++) {\n x[i] += x[i - 1];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n 
MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum of the global prefix sum on rank 0\n if (rank == 0) {\n return global_sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n double sum = 0.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); i++) {\n double local_sum = 0.0;\n for (int j = 0; j <= i; j++) {\n local_sum += x[j];\n }\n prefix_sum[i] = local_sum;\n }\n\n MPI_Reduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < prefixSum.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) prefix_sum[i] += prefix_sum[i - 1];\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) return sum;\n else return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefixSum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < n; ++i) {\n sum += prefixSum[i];\n }\n\n if (rank == 0) return sum;\n return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sumOfPrefixSum = 0.0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sumOfPrefixSum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sumOfPrefixSum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(prefix_sum.data(), &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < n; ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of each rank's portion of x\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Reduce the prefix sums across all ranks\n std::vector global_prefix_sum(x.size());\n MPI_Allreduce(prefix_sum.data(), global_prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the sum of the prefix sum array\n double sum = 0.0;\n for (int i = 0; i < x.size(); i++) {\n sum += global_prefix_sum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefixSum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n double sum = 0;\n MPI_Reduce(&prefixSum[n - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n 
std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < prefixSum.size(); i++) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector prefix_sum(n);\n std::vector global_prefix_sum(n);\n\n // Compute the prefix sum on each rank\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Reduce the prefix sums from all ranks\n MPI_Allreduce(prefix_sum.data(), global_prefix_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum of the prefix sum on rank 0\n if (rank == 0) {\n return global_prefix_sum[n - 1];\n } else {\n return 0.0;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace the i-th element of the array x with the minimum value from indices 0 through i.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 0; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n\n Kokkos::parallel_for(\n Kokkos::RangePolicy>(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n x(i) = x(i);\n } else {\n x(i) = Kokkos::min(x(i), x(i - 1));\n }\n },\n Layout());\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n 
}\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = Kokkos::min(x(i), Kokkos::min(x, i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n float min = x[0];\n for (int j = 1; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00063020400703, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00257178824395, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00065348595381, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00064288433641, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00066286604851, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00255403723568, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00065698642284, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00064799934626, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n float min = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0006404850632, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00064416024834, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for shared(min)\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00065289102495, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00063101910055, 
+ "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00063038598746, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00064412783831, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector mins(n);\n mins[0] = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n mins[i] = std::min(mins[i - 1], x[i]);\n }\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = mins[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00067559964955, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00308348555118, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0006410786882, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00260439347476, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00064060781151, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00257003307343, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00064428430051, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00258193910122, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00065237805247, + "runs": [ + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.00259266793728, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00065039563924, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00256804935634, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00064255967736, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00258149858564, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00064905378968, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00255181696266, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector min(n);\n min[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min[i] = std::min(min[i - 1], x[i]);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = min[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00063038039953, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00310826599598, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00063975583762, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00065609309822, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + 
"best_sequential_runtime": 0.00063031427562, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00256927963346, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "serial", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], *std::min_element(x.begin(), x.begin() + i));\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "<, <, \u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:>\u51a2\u51a2\u51a28\u51a2\u51a2:>\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:>:>", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + 
"s\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2cernecernecerne,\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "hip", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\n__global__ void partialMinimums(float *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n float min = x[0];\n for (int j = 1; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n if (x[i] < min) { min = x[i]; }\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int 
bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n x[i] = min;\n }\n if (rank == 0) {\n MPI_Reduce(&x[0], &x[0], x.size(), MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(&x[0], nullptr, x.size(), MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n } else {\n MPI_Send(&x[rank], 1, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = x[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < size; i++) {\n MPI_Send(&x[i - 1], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n float min;\n MPI_Recv(&min, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x[i - 1] = std::min(x[i - 1], min);\n }\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Bcast(&x[i - 1], 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks onto rank 0\n std::vector min_values(x.size());\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &min_values[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Update the values on rank 0\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], min_values[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = 
x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = x[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < size; i++) {\n MPI_Send(&x[i - 1], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n float min;\n MPI_Recv(&min, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x[i - 1] = std::min(x[i - 1], min);\n }\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Bcast(&x[i - 1], 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "cuda", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\n__global__ void partialMinimums(float *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n float min = x[0];\n for (size_t j = 1; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n __shared__ float min;\n\n if (tid == 0) {\n min = x[bid * bid_size];\n }\n\n __syncthreads();\n\n for (int i = bid * bid_size + tid; i < N; i += bid_size) {\n min = fminf(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < size) {\n float min = x[i];\n for (int j = i + 1; j < size; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, 
x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n if (tid == 0) {\n x[bid * bid_size] = x[bid * bid_size];\n } else {\n x[bid * bid_size + tid] = fminf(x[bid * bid_size + tid], x[bid * bid_size + tid - 1]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank], 1, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks to rank 0\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &x[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct partial minimums\n if (rank != 0) {\n x.clear();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks to rank 0\n MPI_Gather(&x[0], n, MPI_FLOAT, &x[0], n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete list of partial minimums\n if (rank == 0) {\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float *x_arr = &x[0];\n float *min_arr = new float[n];\n\n // Compute the minimums for each rank\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n float min = 
x_arr[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x_arr[j]);\n }\n min_arr[i] = min;\n }\n\n // Gather the minimums from all ranks\n MPI_Gather(min_arr, n, MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n x_arr[i] = min_arr[i];\n }\n }\n\n delete[] min_arr;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector min_values(n);\n#pragma omp parallel for\n for (int i = 0; i < n; i++) {\n min_values[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n min_values[i] = std::min(min_values[i], x[j]);\n }\n }\n MPI_Allgather(&min_values[0], n, MPI_FLOAT, &x[0], n, MPI_FLOAT, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min = x[0];\n\n #pragma omp parallel for schedule(static)\n for (int i = 1; i < n; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_FLOAT, x.data(), n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(x.data(), n, MPI_FLOAT, NULL, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min_val = x[0];\n\n #pragma omp parallel for reduction(min : min_val)\n for (int i = 1; i < n; i++) {\n min_val = std::min(min_val, x[i]);\n }\n\n MPI_Bcast(&min_val, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::min(x[i], min_val);\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_FLOAT, x.data(), n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(x.data(), n, MPI_FLOAT, NULL, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n std::vector globalMinimums(x.size());\n MPI_Allgather(x.data(), x.size(), MPI_FLOAT, globalMinimums.data(), x.size(), MPI_FLOAT, MPI_COMM_WORLD);\n\n // Update the minimums on rank 0\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], globalMinimums[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n std::vector global_min(n);\n\n // Compute local minimums\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n // Reduce local minimums to global minimums\n MPI_Allreduce(local_min.data(), global_min.data(), n, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n // Store global minimums on rank 0\n if (rank == 0) {\n x = global_min;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n MPI_Reduce(local_min.data(), x.data(), n, MPI_FLOAT, MPI_MIN, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector localMin(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n localMin[i] = x[i];\n for (int j = 0; j < i; j++) {\n localMin[i] = std::min(localMin[i], x[j]);\n }\n }\n\n MPI_Allgather(localMin.data(), n, MPI_FLOAT, x.data(), n, MPI_FLOAT, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n MPI_Reduce(local_min.data(), x.data(), n, MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &x[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank != 0) {\n x.clear();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float localMin = x[0];\n float globalMin;\n\n #pragma omp parallel for reduction(min : localMin)\n for (int i = 1; i < n; i++) {\n localMin = std::min(localMin, x[i]);\n }\n\n MPI_Allreduce(&localMin, &globalMin, 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n for (int i = 1; i < n; i++) {\n x[i] = globalMin;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of ranks\n int rank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the partial minimums in parallel\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n x[i] = min;\n }\n\n // Gather the partial minimums from all ranks\n std::vector partialMins(x.size() * numRanks);\n MPI_Gather(x.data(), x.size(), MPI_FLOAT, partialMins.data(), x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // If rank 0, store the result in x\n if (rank == 0) {\n x.clear();\n for (int i = 0; i < partialMins.size(); i += numRanks) {\n x.push_back(partialMins[i]);\n }\n }\n}\n", + "<, Synag", + "\n\n int n = x.size();\n int nthreads = omp_get_num_threads();\n int tid = omp_get_thread_num();\n int chunk = n / nthreads;\n int start = tid * chunk;\n int end = start + chunk;\n if (tid == nthreads - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n x[i] = std::min(x[i], *std::min_element(x.begin(), x.begin() + i));\n }\n omp_barrier();\n if (tid == 0) {\n for (int i = chunk; i < n; i += chunk) {\n for (int j = 0; j < chunk; j++) {\n x[i + j] = 
std::min(x[i + j], x[i + j - chunk]);\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector localMin(n);\n std::vector globalMin(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n localMin[i] = x[i];\n for (int j = 0; j <= i; j++) {\n localMin[i] = std::min(localMin[i], x[j]);\n }\n }\n\n MPI_Allreduce(localMin.data(), globalMin.data(), n, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n x = globalMin;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n std::vector min_values(x.size(), 0.0);\n MPI_Allgather(&x[0], x.size(), MPI_FLOAT, &min_values[0], x.size(), MPI_FLOAT, MPI_COMM_WORLD);\n\n // Update the values on rank 0\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::min(x[i], min_values[i]);\n }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n const int n = x.extent(0);\n View max_so_far = \"max_so_far\";\n View max_ending_here = \"max_ending_here\";\n Kokkos::parallel_for(\"max_subarray\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i) {\n if (i == 0) {\n max_ending_here[i] = x[i];\n max_so_far[i] = x[i];\n } else {\n max_ending_here[i] = std::max(x[i], max_ending_here[i - 1] + x[i]);\n max_so_far[i] = std::max(max_so_far[i - 1], max_ending_here[i]);\n }\n });\n return max_so_far[n - 1];\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::View dp = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n dp(i) = x(i);\n } else {\n dp(i) = std::max(dp(i - 1) + x(i), x(i));\n }\n });\n return *Kokkos::max_element(dp);\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n for (int i = 1; i < x.extent(0); ++i) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int max_sum = 0;\n int current_sum = 0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, int& max_sum) {\n current_sum = std::max(x[i], current_sum + x[i]);\n max_sum = std::max(max_sum, current_sum);\n }, max_sum, tag_type());\n return max_sum;\n}\n", + "\n\n using namespace Kokkos;\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n Kokkos::parallel_scan(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far, int& max_ending_here) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n });\n\n return 
max_so_far;\n}\n", + "\n\n // Initialize the maximum sum to the first element of the array.\n int max_sum = x(0);\n\n // Iterate over the remaining elements of the array.\n for (int i = 1; i < x.extent(0); i++) {\n // If the current element is greater than the sum of the previous element and\n // the maximum sum, then set the maximum sum to the current element.\n if (x(i) > max_sum + x(i - 1)) {\n max_sum = x(i);\n } else {\n // Otherwise, add the current element to the maximum sum.\n max_sum += x(i);\n }\n }\n\n // Return the maximum sum.\n return max_sum;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_for(\n \"maximumSubarray\", Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_so_far);\n\n return max_so_far;\n}\n", + "\n\n Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::Kok\u30b3\u30af_VectorRange(x.extent(0)),\n Kok\u30b3\u30af::plus(),\n Kok\u30b3\u30af::make_iota_view(x.extent(0)),\n [&](int i) {\n return Kok\u30b3\u30af::parallel_reduce(\n Kok\u30b3\u30af::Kok\u30b3\u30af_VectorRange(i, x.extent(0)),\n Kok\u30b3\u30af::plus(),\n x[i],\n [&](int j) { return x[j]; });\n });\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Max;\n using reduction_functor_type = Kokkos::Max;\n using reduction_policy_type = Kokkos::Scan;\n\n int max_so_far = 0;\n int max_ending_here = 0;\n\n Kokkos::parallel_reduce(\n reduction_policy_type(x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_so_far) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_so_far);\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_ending_here, int& max_so_far) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_ending_here, 
max_so_far);\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(int i, int& max_so_far, int& curr_max) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n },\n max_so_far, curr_max);\n\n return max_so_far;\n}\n", + "\n\n // Initialize the maximum sum to the first element of the array\n int max_sum = x(0);\n\n // Initialize the current sum to the first element of the array\n int current_sum = x(0);\n\n // Loop through the remaining elements of the array\n for (int i = 1; i < x.extent(0); i++) {\n // Update the current sum by adding the current element\n current_sum = std::max(x(i), current_sum + x(i));\n\n // Update the maximum sum if the current sum is greater\n max_sum = std::max(max_sum, current_sum);\n }\n\n // Return the maximum sum\n return max_sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. 
if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_sum = INT_MIN;\n int sum = 0;\n #pragma omp parallel for reduction(max:max_sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n max_sum = std::max(max_sum, sum);\n if (sum < 0) {\n sum = 0;\n }\n }\n return max_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n max_ending_here = std::max(x[i], max_ending_here + x[i]);\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + 
"are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n int n = x.size();\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n max_ending_here = std::max(x[i], max_ending_here + x[i]);\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + 
"num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (size_t i = 0; i < x.size(); ++i) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_so_far = 0, max_ending_here = 0;\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (size_t i = 0; i < x.size(); ++i) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_sum = 
INT_MIN;\n int current_sum = 0;\n #pragma omp parallel for reduction(max:max_sum)\n for (int i = 0; i < x.size(); ++i) {\n current_sum = std::max(x[i], current_sum + x[i]);\n max_sum = std::max(max_sum, current_sum);\n }\n return max_sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n int n = x.size();\n #pragma parallel for reduction(+:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "serial", + "prompt": "/* Return the largest sum of any contiguous subarray 
in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN;\n int max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) 
{\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "ontiguous, Synag", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Store the result in sum.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\n__global__ void maximumSubarray(const int *x, size_t N, int *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = hipMax(x[i], max_ending_here + x[i]);\n max_so_far = hipMax(max_so_far, max_ending_here);\n }\n hipAtomicMax(sum, &max_so_far);\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid >= N) return;\n\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = block.thread.x;\n int stride = block.thread.x + block.thread.y * block.dim.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int stride = hipBlockDim_x * hipGridDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int s = 0;\n int max = 0;\n for (size_t i = tid; i < N; i += blockDim.x * gridDim.x) {\n s = max(x[i], s + x[i]);\n max = max(max, s);\n }\n atomicAdd(sum, max);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int stride = bid_size * hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = start; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int tid_offset = tid + bid * bid_size;\n int sum_local = 0;\n int max_sum = 0;\n if (tid_offset < N) {\n sum_local = x[tid_offset];\n }\n __syncthreads();\n for (int i = tid_offset + bid_size; i < N; i += bid_size) {\n sum_local = max(sum_local + x[i], x[i]);\n max_sum = max(max_sum, 
sum_local);\n }\n if (tid == 0) {\n *sum = max_sum;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n __syncthreads();\n if (tid == 0) *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (i < N) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n i += hipBlockDim_x * hipGridDim_x;\n }\n *sum = max_so_far;\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int stride = blockDim.x * gridDim.x;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = tid; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bsize = hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int stride = bsize * hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N && i < start + stride; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n if (tid == 0) *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipBlockDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n hipAtomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int 
max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n hipAtomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int stride = bid_size * hipGridDim_x;\n\n int local_max = 0;\n int global_max = 0;\n\n for (int i = tid; i < N; i += stride) {\n local_max = max(x[i], local_max + x[i]);\n global_max = max(global_max, local_max);\n }\n\n if (tid == 0) {\n *sum = global_max;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n int max_so_far = 0, max_ending_here = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_max;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_sum = x[0];\n int sum = x[0];\n for (int i = 1; i < n; i++) {\n sum = std::max(x[i], sum + x[i]);\n max_sum = std::max(max_sum, sum);\n }\n return max_sum;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n int global_max;\n MPI_Allreduce(&max_so_far, &global_max, 
1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "(4, -1, 2, 1) = 6.", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n int maxSum = INT_MIN;\n int sum = 0;\n for (int i = myRank; i < x.size(); i += numRanks) {\n sum += x[i];\n maxSum = std::max(maxSum, sum);\n if (sum < 0) {\n sum = 0;\n }\n }\n\n MPI_Allreduce(&maxSum, &maxSum, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxSum;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", 
+ "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n\n return max_so_far;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Store the result in sum.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\n__global__ void maximumSubarray(const int *x, size_t N, int *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Each thread computes the sum of its subarray\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int partialSum = 0;\n for (int i = tid; i < N; i += blockDim.x * gridDim.x) {\n partialSum += x[i];\n }\n\n // Each thread writes its partial sum to global memory\n __shared__ int partialSums[blockDim.x];\n partialSums[threadIdx.x] = partialSum;\n __syncthreads();\n\n // Each thread computes the maximum of its partial sum and the partial sums of the previous threads\n for (int i = blockDim.x / 2; i > 0; i >>= 1) {\n if (threadIdx.x < i) {\n partialSums[threadIdx.x] = max(partialSums[threadIdx.x], partialSums[threadIdx.x + i]);\n }\n __syncthreads();\n }\n\n // The first thread writes the maximum sum to global memory\n if (threadIdx.x == 0) {\n *sum = partialSums[0];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n\n int maxSoFar = x[tid];\n int maxEndingHere = x[tid];\n\n for (int i = tid + blockDim; i < N; i += blockDim * gridDim) {\n maxEndingHere = maxEndingHere + x[i];\n if (maxSoFar < maxEndingHere) {\n maxSoFar = maxEndingHere;\n }\n if (maxEndingHere < 0) {\n maxEndingHere = 0;\n }\n }\n\n int maxOverall = maxSoFar;\n\n if (blockId == 0) {\n for (int i = 1; i < gridDim; i++) {\n int temp = maxOverall;\n maxOverall = max(maxOverall, maxSoFar);\n maxSoFar = max(temp, maxSoFar);\n }\n }\n\n if (blockId == 0) {\n *sum = maxOverall;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n int max_sum = INT_MIN;\n int current_sum = 0;\n for (int i = start; i < end; i++) {\n current_sum = max(x[i], current_sum + x[i]);\n max_sum = max(max_sum, current_sum);\n }\n sum[tid] = max_sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int block_start = block_id * block_size;\n int block_end = min(block_start + block_size, N);\n int max_so_far = x[block_start];\n int max_ending_here = x[block_start];\n for (int i = block_start + 1; i < block_end; i++) {\n max_ending_here = max(x[i], 
max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n __shared__ int shared_max[block_size];\n shared_max[tid] = max_so_far;\n __syncthreads();\n if (tid == 0) {\n int max_overall = shared_max[0];\n for (int i = 1; i < block_size; i++) {\n max_overall = max(max_overall, shared_max[i]);\n }\n *sum = max_overall;\n }\n}\n", + "\n\n int max_ending_here = x[0];\n int max_so_far = x[0];\n for (size_t i = 1; i < N; ++i) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n int start = tid + block * stride;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (start < N) {\n max_ending_here = max_ending_here + x[start];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n start += stride;\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int sum = 0;\n int maxSum = -2147483648;\n for (int i = tid; i < N; i += blockIdx.x * blockDim.x) {\n sum += x[i];\n maxSum = max(maxSum, sum);\n if (sum < 0) sum = 0;\n }\n sum = maxSum;\n *sum = maxSum;\n}\n", + "\n\n int my_sum = 0;\n int max_sum = 0;\n int max_sum_so_far = 0;\n int start = 0;\n int end = 0;\n int s = threadIdx.x;\n int e = threadIdx.x + blockDim.x - 1;\n if (e >= N) e = N - 1;\n for (int i = s; i <= e; i++) {\n my_sum += x[i];\n if (my_sum > max_sum_so_far) {\n max_sum_so_far = my_sum;\n start = s;\n end = i;\n }\n if (my_sum < 0) {\n my_sum = 0;\n }\n }\n if (max_sum_so_far > max_sum) {\n max_sum = max_sum_so_far;\n *sum = max_sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = tid; i < N; i += blockDim.x) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; ++i) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int stride = blockDim * gridDim;\n int start = tid + blockId * blockDim;\n int max_so_far = 0;\n int max_ending_here = 0;\n while 
(start < N) {\n max_ending_here = max_ending_here + x[start];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n start += stride;\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n }\n *sum = max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n for (size_t i = 1; i < N; ++i) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int sum_so_far = 0;\n int max_sum = 0;\n if (tid < N) {\n sum_so_far = x[tid];\n max_sum = x[tid];\n }\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n sum_so_far = fmax(x[i], sum_so_far + x[i]);\n max_sum = fmax(max_sum, sum_so_far);\n __syncthreads();\n }\n if (tid == 0) { *sum = max_sum; }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int max_so_far = 0;\n int max_ending_here = 0;\n\n for (int i = tid; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n sum[bid] = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n int chunk_size = n / num_ranks;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == num_ranks - 1) {\n end = n;\n }\n#pragma omp parallel for reduction(max : max_ending_here)\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n MPI_Allreduce(&max_so_far, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_so_far;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk = n / nprocs;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == nprocs - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int max_global;\n MPI_Reduce(&max_so_far, &max_global, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_global;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk_size = n / nprocs;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == nprocs - 1) {\n end = n;\n }\n#pragma omp parallel\n {\n int max_so_far_local = 0, max_ending_here_local = 0;\n#pragma omp for\n for (int i = start; i < end; i++) {\n max_ending_here_local = max_ending_here_local + x[i];\n if (max_so_far_local < max_ending_here_local) {\n max_so_far_local = max_ending_here_local;\n }\n if (max_ending_here_local < 0) {\n max_ending_here_local = 0;\n }\n }\n#pragma omp critical\n {\n if (max_so_far < max_so_far_local) {\n max_so_far = max_so_far_local;\n }\n }\n }\n MPI_Reduce(&max_so_far, &max_ending_here, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_ending_here;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallel for\n#pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // MPI reduction\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel\n {\n int local_max_so_far = INT_MIN, local_max_ending_here = 0;\n #pragma omp for\n for (int i = rank; i < n; i += nprocs) {\n local_max_ending_here = local_max_ending_here + x[i];\n if (local_max_so_far < 
local_max_ending_here)\n local_max_so_far = local_max_ending_here;\n if (local_max_ending_here < 0)\n local_max_ending_here = 0;\n }\n #pragma omp critical\n {\n if (max_so_far < local_max_so_far)\n max_so_far = local_max_so_far;\n }\n }\n return max_so_far;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int max_local = 0;\n int max_global = 0;\n\n #pragma omp parallel for reduction(max : max_local)\n for (int i = 0; i < n; i++) {\n max_local = std::max(x[i], max_local + x[i]);\n }\n\n MPI_Allreduce(&max_local, &max_global, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return max_global;\n } else {\n return 0;\n }\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel num_threads(4)\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n\n int global_max = max_so_far;\n MPI_Allreduce(&global_max, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallelization\n #pragma omp parallel\n {\n // Each thread computes the maximum subarray sum for its portion of the array\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n\n // MPI reduction to find the maximum subarray sum across all ranks\n int global_max;\n MPI_Allreduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n int chunk_size = n / num_threads;\n int start = tid * chunk_size;\n int end = start + chunk_size;\n if (tid == num_threads - 1) {\n end = n;\n }\n\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n int chunk_size = n / num_ranks;\n int start = my_rank * chunk_size;\n int end = start + chunk_size;\n if (my_rank == num_ranks - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if 
(max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // Reduce the maximum subarray sum across all ranks\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallel for loop\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // MPI reduce to get the maximum subarray sum\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_max;\n } else {\n return 0;\n }\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = std::min(start + chunk_size, n);\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n 
}\n\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int nthreads = omp_get_num_threads();\n int chunk = n / nthreads;\n int start = tid * chunk;\n int end = (tid + 1) * chunk;\n if (tid == nthreads - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_max;\n }\n return 0;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(Kokkos::View const& x, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n const int N = x.extent(0);\n parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = sum + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output);\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = output(i - 1) + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i, double& update) {\n update += x(i);\n output(i) = update;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = sum + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output(0));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) 
{\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n Kokkos::parallel_for(\"prefix_sum\", Kokkos::RangePolicy(0, output.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = x(i);\n } else {\n output(i) = output(i - 1) + x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output(0));\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n const size_t n = x.extent(0);\n output(0) = x(0);\n parallel_for(policy_type(1, n), KOKKOS_LAMBDA(const size_t i) {\n output(i) = output(i - 1) + x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"prefixSum\", output.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "Kok", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefix_sum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n const size_t size = x.extent(0);\n output(0) = x(0);\n parallel_for(policy_type(1, size), KOKKOS_LAMBDA(const size_t i) {\n output(i) = output(i - 1) + x(i);\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::Range(0, N), KOKKOS_LAMBDA (const int i) {\n if (i == 0) {\n output(i) = x(i);\n } else {\n output(i) = output(i - 1) + x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use OpenMP to compute in parallel.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00940737705678, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02850086651742, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i 
= 1; i < n; i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "<", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < 
x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n double sum = 0;\n for (int j = 0; j <= i; j++) {\n sum += x[j];\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, 
+ "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "serial", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (std::size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "<", + "<", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + 
x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\n__global__ void prefixSum(const double* x, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n if (tid > 0) { output[tid] += output[tid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n \n if (gid < N) {\n output[gid] = x[gid];\n }\n \n __syncthreads();\n \n for (int stride = 1; stride < N; stride *= 2) {\n if (gid < N && gid + stride < N) {\n output[gid + stride] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N 
- stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bsize = blockDim.x;\n int stride = bsize * bid;\n int i = stride + tid;\n if (i < N) {\n output[i] = x[i];\n if (i > 0) { output[i] += output[i - 1]; }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (gid % (2 * i) == 0 && gid + i < N) {\n output[gid + i] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n output[idx] = x[idx];\n if (idx > 0) {\n output[idx] += output[idx - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Store the result in output on rank 0.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n double global_sum = 0;\n\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n output.push_back(global_sum);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n MPI_Allreduce(&local_sum, &output[i], 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n 
MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partialSum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n partialSum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partialSums(size);\n MPI_Gather(&partialSum, 1, MPI_DOUBLE, partialSums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partialSums[0];\n for (size_t i = 1; i < size; ++i) {\n output[i] = output[i - 1] + partialSums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // 
Compute the prefix sum on each rank\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local sums to rank 0\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n for (int i = 1; i < size; ++i) {\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allgather(&local_sum, 1, MPI_DOUBLE, &global_sum, 1, MPI_DOUBLE, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector partial_sum(n);\n MPI_Allreduce(x.data(), partial_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n output = partial_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] - x[i - 1];\n }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\n__global__ void prefixSum(const double *x, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (gid % (2 * i) == 0 && gid + i < N) {\n output[gid] += output[gid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n output[idx] = 0;\n for (int i = 0; i <= idx; i++) {\n output[idx] += x[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 2; stride <= N; stride *= 2) {\n if (tid % stride == 0 && tid + stride / 2 < N) {\n output[tid] += output[tid + stride / 2];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n // Compute the prefix sum of the first bid_size elements in x\n double sum = 0;\n for (int i = tid; i < bid_size; i++) {\n sum += x[i];\n }\n\n // Store the prefix sum in the output array\n output[tid] = sum;\n\n // Synchronize the threads in the block\n __syncthreads();\n\n // Compute the prefix sum of the remaining elements in x\n for (int i = bid_size; i < N; i += bid_size) {\n // Load the prefix sum of the previous block\n sum = output[i - bid_size];\n\n // Compute the prefix sum of the current block\n for (int j = tid; j < bid_size; j++) {\n sum += x[i + j];\n }\n\n // Store the prefix sum in the output array\n output[i + tid] = sum;\n\n // Synchronize the threads in the block\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n // Compute the prefix sum\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n if (gid < N) {\n output[gid] = x[gid];\n }\n\n __syncthreads();\n\n for (int offset = 1; offset < N; offset *= 2) {\n if (gid + offset < N) {\n output[gid + offset] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n 
__syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n // Each thread computes the prefix sum of a subset of the elements in x.\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n // Compute the prefix sum of the current block\n double block_sum = 0;\n for (int i = tid; i < N; i += bid_size) {\n block_sum += x[i];\n output[i] = block_sum;\n }\n\n // Synchronize the threads in the block\n __syncthreads();\n\n // Compute the prefix sum of the previous blocks\n if (tid == 0) {\n for (int i = bid + 1; i < N; i += bid_size) {\n output[i] += output[i - 1];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "(", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int size = blockDim.x;\n int stride = size * gridDim.x;\n int start = tid + bid * stride;\n double sum = 0;\n for (int i = start; i < N; i += stride) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "_t", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the prefix sum of the vector x into output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Store the result in output on rank 0.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+: local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Gather the local prefix sums\n std::vector global_sum(x.size(), 0);\n MPI_Allreduce(output.data(), global_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM);\n\n // Store the global prefix sum on rank 0\n if (rank == 0) {\n output = global_sum;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n // Compute the local prefix sum\n std::vector local_prefix_sum(n);\n local_prefix_sum[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n // Compute the global prefix sum\n std::vector global_prefix_sum(n);\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n // Store the result on rank 0\n if (rank == 0) {\n output = global_prefix_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Reduce(partial_sum.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Bcast(output.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n #pragma omp parallel for\n for (int j = 1; j < n; j++) {\n output[j] += output[j - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes the sum of its local data\n double local_sum = 0;\n #pragma omp parallel for reduction(+: local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n // Gather the local sums into the output vector on rank 0\n MPI_Gather(&local_sum, 1, MPI_DOUBLE, output.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n \n // Rank 0 computes the prefix sum\n if (rank == 0) {\n for (size_t i = 1; i < size; ++i) {\n output[i] += output[i - 1];\n }\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the partial sum on each rank\n double partialSum = 0;\n #pragma omp parallel for reduction(+:partialSum)\n for (int i = 0; i < x.size(); i++) {\n partialSum += x[i];\n }\n\n // Gather the partial sums from all ranks\n std::vector partialSums(numRanks);\n MPI_Gather(&partialSum, 1, MPI_DOUBLE, partialSums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (myRank == 0) {\n output[0] = 0;\n for (int i = 1; i < numRanks; i++) {\n output[i] = output[i - 1] + partialSums[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce local prefix sums to get global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce local prefix sums to global prefix sum\n MPI_Reduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n, 0);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n if (i > 0) {\n partial_sum[i] += partial_sum[i - 1];\n }\n }\n\n std::vector global_sum(n, 0);\n MPI_Allreduce(partial_sum.data(), 
global_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0.0;\n #pragma omp parallel for reduction(+:partial_sum)\n for (int i = 0; i < x.size(); ++i) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; ++i) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n if (i > 0) {\n partial_sum[i] += partial_sum[i - 1];\n }\n }\n\n MPI_Reduce(partial_sum.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": 
"transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) *= -1;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n int &xi = x(i);\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n int &xi = x(i);\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n int val = x(i);\n if (val % 2 == 0) {\n x(i) = val / 2;\n } else {\n x(i) = -val;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) = x(i) / 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + 
"\n\n Kok koste::parallel_for(0, x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "omp", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00136124007404, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00139114633203, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007017346099, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053769629449, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00063929315656, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000862204656, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00098407771438, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00113425068557, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00136217586696, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00143064875156, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00071482528001, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00047834403813, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00052594188601, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00089047588408, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00101128537208, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00109858587384, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013653377071, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00138119962066, + "num_threads": 1 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.00070342402905, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00037916768342, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00055252369493, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00081810764968, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00094806216657, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00109944716096, + "num_threads": 64 + } + ] + }, + { + "generated_output": ":", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00135849900544, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00142216254026, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00070479121059, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00041223019361, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054639410228, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008372053504, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00097645781934, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00106690563262, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00137733183801, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00151549633592, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00071174558252, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00049198213965, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00051410403103, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00085594914854, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00094490032643, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00110019054264, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00137469116598, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00146475285292, + 
"num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00070873368531, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00045349765569, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00052235089242, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00085610337555, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00094665978104, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00111160390079, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013695795089, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00152814425528, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00076323989779, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00037504378706, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053655598313, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00083716586232, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00095174144953, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00110501367599, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00137368142605, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00142117086798, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00073402542621, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00037636049092, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053541343659, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008287800476, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00096541084349, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00109736733139, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00138699039817, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00157530289143, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00073013044894, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00045346170664, + "num_threads": 4 + }, + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.00053863339126, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00080988202244, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00096979923546, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00113806575537, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00138092692941, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00152576547116, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00076253898442, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00041120611131, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00051510948688, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00082582626492, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00096892360598, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00268227048218, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00137823224068, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00151493959129, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00073503963649, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00045470558107, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00050422698259, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00084132738411, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00097846761346, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00114291701466, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00138374436647, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00155353806913, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00071820169687, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00044351872057, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00073806531727, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00086419656873, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.00099043771625, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00111657138914, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00138949714601, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00149654485285, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00069527272135, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00043506268412, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053294934332, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088431015611, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010571969673, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00111852940172, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00136129371822, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00157149750739, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00070664994419, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00049491543323, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00052848625928, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00084196496755, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00098059512675, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00113514997065, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma\u00a0omp\u00a0parallel\u00a0for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00137045942247, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00149596184492, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00142598841339, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00140195079148, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00143803842366, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001397225447, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00139537453651, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00141033269465, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for 
(size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00137707740068, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00140662137419, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00073558054864, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00036731939763, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053435582668, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00083455462009, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00093977674842, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00115746650845, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] *= -1;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00138844121248, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00155763365328, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00071775857359, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00044946651906, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00050545558333, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00081693995744, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00100975614041, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00104930847883, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00137641057372, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015687180683, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00072519052774, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00048515703529, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054854862392, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00082787405699, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009811386466, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00105948802084, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": 
true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013575674966, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00156933777034, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00078116003424, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005051990971, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005257114768, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00081498399377, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00097856558859, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00107084121555, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "serial", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "negate, Synag", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (auto &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "negate, Synag", + "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n", + "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "hip", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\n__global__ void negateOddsAndHalveEvens(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipGetGlobalThreadIdx();\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0)\n x[tid] /= 2;\n else\n x[tid] = -x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n" + ] + }, + { + "problem_type": 
"transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "E", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0,\n MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0,\n MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[0], n / size, MPI_INT, &x[0], n / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Negate odd values\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n }\n }\n\n // Halve even values\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n }\n }\n\n // Gather results on rank 0\n if (rank == 0) {\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_INT, result.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n x = result;\n } else {\n MPI_Gather(nullptr, 0, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, 
MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), 
MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its portion of x.\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(x.data(), x.size(), MPI_INT, x.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data(), x.size(), MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its portion of the vector.\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "cuda", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\n__global__ void negateOddsAndHalveEvens(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0)\n x[tid] /= 2;\n else\n x[tid] = -x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int n_local = n / nprocs;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], n_local, MPI_INT, &x[0], n_local, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its part of the vector\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], end - start, MPI_INT, &x[0], end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int local_size = x.size() / size;\n int remain = x.size() % size;\n int local_start = rank * local_size;\n int local_end = local_start + local_size;\n if (rank < remain) {\n local_end++;\n }\n\n#pragma omp parallel for\n for (int i = local_start; i < local_end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(MPI_IN_PLACE, local_end - local_start, MPI_INT, x.data(), local_end - local_start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int myRank, nProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &nProcs);\n\n int n = x.size();\n int chunkSize = (n + nProcs - 1) / nProcs;\n int begin = myRank * chunkSize;\n int end = std::min(begin + chunkSize, n);\n\n #pragma omp parallel for\n for (int i = begin; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[begin], end - begin, MPI_INT, &x[0], end - begin, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] = x[i] / 2;\n }\n }\n MPI_Gather(&x[start], chunk, MPI_INT, &x[0], chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its part of the vector\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int local_n = n / nprocs;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if 
(x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, n_local, MPI_INT, x.data(), n_local, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], chunk, MPI_INT, &x[0], chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "kokkos", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(Kokkos::View const& x, Kokkos::View &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKok\u96f9::ParallelFor(Kok\u96f9::Range(x.size()), [&](int i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t});\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const int& i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t});\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", mask.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + ",", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "omp", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": 
"\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0045105651021, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00458110384643, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00227719172835, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001265341416, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00073305573314, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005483198911, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0045173259452, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00454779695719, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00223765727133, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00136658791453, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00083938185126, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00052767712623, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0045158110559, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00455544348806, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00227625574917, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00117332246155, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00070990566164, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00056315977126, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00452152229846, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00455779228359, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00227475911379, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00125108454376, + "num_threads": 4 + 
}, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00082302521914, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005405953154, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00453851576895, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00455997996032, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00227408520877, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00131055079401, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00066704154015, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00052523035556, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00448977146298, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00456479117274, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00235982518643, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00137199796736, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00065531358123, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00057323500514, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00450485441834, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00458865761757, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00227240864187, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012165537104, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00072905365378, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054053943604, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + 
"best_sequential_runtime": 0.00450517758727, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00458912327886, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00226762127131, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00124631319195, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00064444113523, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00050514433533, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00451689902693, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00453316792846, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022730153054, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00133158415556, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00082061011344, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00051466673613, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00449502039701, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00451212655753, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00230528265238, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00121656935662, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00070585068315, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053313951939, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00448122564703, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00449403375387, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00227319411933, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00125728845596, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00074009653181, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053954552859, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 32 + } + ] + 
}, + { + "generated_output": "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00448614489287, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00452924054116, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00225324500352, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00121195092797, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00069337505847, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00320879500359, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00451285522431, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00446652807295, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00224524568766, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00137090813369, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00084359757602, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00050013139844, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0044764528051, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00450520887971, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00228952467442, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00119100920856, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00076422840357, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00058464445174, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00451591536403, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00457626897842, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00226515941322, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.00139646269381, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00066788736731, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053586624563, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00447334889323, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00451380964369, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00226805694401, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00112650804222, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00077117308974, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054571516812, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00452748909593, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00454728417099, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00226889401674, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00121626909822, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00075892806053, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00052773319185, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00450324136764, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00456098038703, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00226932521909, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00114516522735, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00065499283373, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00046277269721, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + 
"are_all_valid": false, + "best_sequential_runtime": 0.00447811614722, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00451559685171, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00227412581444, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00130080468953, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00080987587571, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054421797395, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 32 + } + ] + }, + { + "generated_output": "\n\n#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00451427809894, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00454352702945, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00230254810303, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00125390216708, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00074731055647, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00052377581596, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 32 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "serial", + "prompt": "bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t mask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tmask.reserve(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.resize(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tmask.reserve(x.size());\n\tfor (int i = 0; i < x.size(); i++) 
{\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t mask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "hip", + "prompt": "__device__ bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\n__global__ void mapPowersOfTwo(const int *x, bool *mask, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = get_global_id(0);\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = 
isPowerOfTwo(x[idx]);\n\t}\n}\n", + ", 0, 9, 7, 15, 64, 3]", + ", 0, 9, 7, 15, 64, 3]", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "mpi", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored in mask on rank 0.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint chunk_size = (n + size - 1) / size;\n\tint start = rank * chunk_size;\n\tint end = std::min(start + chunk_size, n);\n\n\tstd::vector local_mask(end - start);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), end - start, MPI_BOOL, mask.data(), end - start, MPI_BOOL, 0);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tmask.resize(n);\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), n / size, MPI_BOOL, mask.data(), n / size, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_C_BOOL, mask.data(), local_n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tmask.resize(x.size());\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), 
mask.size(), MPI_C_BOOL, nullptr, 0, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_C_BOOL, mask.data(), local_n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tstd::vector localMask(x.size());\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tif (rank == 0) {\n\t\tmask = localMask;\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tstd::vector recvMask(x.size());\n\t\t\tMPI_Recv(&recvMask[0], x.size(), MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t\tfor (size_t j = 0; j < x.size(); j++) {\n\t\t\t\tmask[j] = mask[j] || recvMask[j];\n\t\t\t}\n\t\t}\n\t} else {\n\t\tMPI_Send(&localMask[0], x.size(), MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the result\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), mask.size(), MPI_BOOL, mask.data(), mask.size(), MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the mask\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t}\n\telse {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / 
size, MPI_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the mask\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_BOOL, mask.data(), local_n, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "cuda", + "prompt": "__device__ bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\n__global__ void mapPowersOfTwo(const int *x, bool *mask, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\twhile (i < N) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t\ti += blockDim.x * gridDim.x;\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored in mask on rank 0.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own mask\n\tstd::vector local_mask(x.size());\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the local masks into the final mask on rank 0\n\tif (rank == 0) {\n\t\tmask.resize(x.size());\n\t}\n\tMPI_Gather(local_mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < static_cast(x.size()); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) 
{\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own mask\n\tstd::vector localMask(x.size());\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Rank 0 gathers all the local masks\n\tif (rank == 0) {\n\t\tmask.resize(x.size());\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tstd::vector recvMask(x.size());\n\t\t\tMPI_Recv(recvMask.data(), x.size(), MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t\tfor (int j = 0; j < x.size(); j++) {\n\t\t\t\tmask[j] |= recvMask[j];\n\t\t\t}\n\t\t}\n\t} else {\n\t\tMPI_Send(localMask.data(), x.size(), MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint n = x.size();\n\tmask.resize(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), n, MPI_BOOL, mask.data(), n, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its local part of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < 
n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the result\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the result\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * mask.size() / size], mask.size() / size, MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * mask.size() / size], mask.size() / size, MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\tint n = x.size();\n\tstd::vector<bool> localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "kokkos", + "prompt": "#include <Kokkos_Core.hpp>\n\n/* Replace every element of the vector x with 1-1/x.\n Use Kokkos to compute in parallel.
Assume Kokkos has already been initialized.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00162638891488, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 
0.0016816008836, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008500598371, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00043575018644, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053226407617, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00085792075843, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009609926492, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00115679483861, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00164142344147, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00166629813612, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00085012186319, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00047684777528, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00056864041835, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008229482919, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00095375515521, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00116851516068, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00161117017269, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00165596250445, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00083083566278, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054789334536, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00057803411037, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00085586309433, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00094556938857, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00118075832725, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00164174418896, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00169552899897, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00083167310804, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00047872718424, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054146684706, + "num_threads": 8 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.00089680626988, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00097663588822, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00116791576147, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00163311790675, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00176369752735, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00085496716201, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00056771486998, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054685510695, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00143265854567, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00095887295902, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012321325019, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00162000674754, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00172152221203, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00084035992622, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00043093468994, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00058950167149, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00085299536586, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00099961943924, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00119835343212, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00162587631494, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00165310502052, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00082900878042, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00060105919838, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00057223867625, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00084912031889, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00096926223487, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011755881831, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < 
x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016279483214, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00172137375921, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00082251578569, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00049577299505, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00055268593132, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00082141757011, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00098979882896, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00114731751382, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00161829758435, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00166416410357, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00083874240518, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00050316303968, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054037049413, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00084681566805, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00099154803902, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00121587049216, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00162707827985, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00164015498012, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00082505885512, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00042476728559, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00056548435241, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00083211455494, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00094368364662, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00122069139034, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00162154734135, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00167530495673, + 
"num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00083927754313, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00046716127545, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054891519248, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00086363554001, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00095352437347, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00123816244304, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016463663429, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00166013427079, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00083812735975, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00046318937093, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00058943349868, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00084960293025, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009719889611, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00121998060495, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00163748972118, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00176790971309, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00085075683892, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00043850876391, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00050367172807, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00086852833629, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00093359686434, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00118486043066, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016384633258, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00173444598913, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00082500092685, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00061917137355, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053603891283, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.00080448836088, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00093015637249, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00434529613703, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00160864945501, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00168960951269, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00082658752799, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00055964328349, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00052713230252, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00083157476038, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00095644388348, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012821348384, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016376061365, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00168179757893, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00086094252765, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00051259491593, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00056141670793, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088554862887, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00099318102002, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00122157894075, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00163730923086, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016983481124, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00083530452102, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00049075204879, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00056985244155, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008460463956, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00104577802122, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00116463433951, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n 
}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00164728816599, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017092268914, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00083592534065, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00041544754058, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054266564548, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00089796595275, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00102324169129, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00117450859398, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00163458064198, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00171979106963, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00084626469761, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00052576791495, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00080165471882, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00081954263151, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00096176769584, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00112890060991, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00162572152913, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00174183603376, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00085613913834, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005046999082, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00052881520241, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00083647016436, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010076764971, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00119631122798, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "serial", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n 
}\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &element : x) {\n element = 1 - 1 / element;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &i : x) {\n i = 1 - 1 / i;\n }\n}\n", + "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "\n\n for (double &val : x) {\n val = 1 - 1 / val;\n }\n}\n", + "\n\n for (double &i : x) {\n i = 1 - 1 / i;\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "hip", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\n__global__ void oneMinusInverse(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n x[gid] = 1 - 1 / x[gid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n 
x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, gathered_x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the 
final result on rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], x.size() / size, MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = 1 - 1 / x[i];\n }\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n 
}\n\n // Gather the results from all ranks\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE, NULL, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 has the final result\n if (rank == 0) {\n x.resize(x.size() * size);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], x.size() / size,\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the inverse of its elements\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], x.size() / size,\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its part of the result\n for (size_t i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "cuda", + "prompt": "/* Replace every element of the vector x with 
1-1/x.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\n__global__ void oneMinusInverse(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) { x[tid] = 1.0 - 1.0 / x[tid]; }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int local_size = x.size() / num_procs;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n if (rank == num_procs - 1) {\n end = x.size();\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n MPI_Reduce(x.data(), x.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank == size - 1) ? 
n : start + chunk_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * chunk_size;\n int end = (i == size - 1) ? n : start + chunk_size;\n MPI_Recv(&x[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = global_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n const int n = x.size();\n const int chunk = (n + size - 1) / size;\n const int start = rank * chunk;\n const int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; ++i) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. 
Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00274859722704, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02280372008681, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01142510902137, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00583494976163, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00308292247355, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00180086176842, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00165342018008, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022171869874, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00274843908846, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02260177973658, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01136925984174, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00571905132383, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00305694099516, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00170829501003, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00163019206375, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00215416438878, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00272708851844, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02276390511543, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01143313162029, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00575215984136, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00309189092368, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00176289211959, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00172382388264, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021673688665, 
+ "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00275109708309, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02260062787682, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01143328398466, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00578137896955, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030367391184, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00169830434024, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00160884205252, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00218792967498, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00275897793472, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02269596103579, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01140734758228, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00590981058776, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00306124575436, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00168076958507, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001485844329, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00213717445731, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027425153181, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02267726268619, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01135703660548, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00569493211806, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00313791129738, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00169948451221, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00154533237219, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00211758166552, + "num_threads": 64 + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": 
"\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00270881410688, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02258782163262, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01130449753255, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00566137265414, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00294702239335, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00165077671409, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00159099046141, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00212514754385, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00271450113505, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02194037139416, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01112071946263, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00561884492636, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00296089630574, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00168138034642, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00159004274756, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00214461069554, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00271146334708, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02270640656352, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01149537246674, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005747464858, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00304915886372, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00181891936809, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00163153056055, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00210646614432, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 
0.00269645508379, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02260293997824, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01147162821144, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00578746013343, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00311218146235, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00180454030633, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0014959115535, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00223026499152, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00269116275012, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02172902338207, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01090723033994, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00553941931576, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00304918047041, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00162496585399, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00174422245473, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0022066893056, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00269244555384, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02270455304533, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01141399778426, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00578549057245, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00302056092769, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00175508912653, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00161241460592, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00210043918341, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00270094498992, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02273892983794, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01142482627183, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00588543061167, + "num_threads": 4 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.00294553749263, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00177760291845, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00172627344728, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00214163064957, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00269151777029, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02274944167584, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01143264845014, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00578941218555, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030959431082, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00172865130007, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00167206078768, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00221376065165, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00274503249675, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02200370430946, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01107001677155, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00570068527013, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00298840664327, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00169677138329, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00168165899813, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00211195070297, + "num_threads": 64 + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00275577232242, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0225231198594, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01138116214424, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00588977709413, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.00292813330889, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00180145651102, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00157250389457, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00214292667806, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00276202261448, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02191162277013, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01108631081879, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00561219640076, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00294897221029, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00178156848997, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015108473599, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00214327145368, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00276355911046, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02199518978596, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01103515494615, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00569121520966, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00305006057024, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00147889144719, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00152958855033, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00220597162843, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "serial", + "prompt": "/* Compute the ReLU function on every element of x. 
Elements less than zero become zero,\n while elements greater than zero stay the same.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "hip", + "prompt": "/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\n__global__ void relu(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = fmax(0.0, x[idx]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? 
x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + ": [-1.8, 24.0, 1.2, 0.0, 5.1, 0.2, 4.5]", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n std::vector recv_buf(x.size());\n for (int i = 1; i < MPI_COMM_WORLD.size(); i++) {\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector temp(x.size());\n MPI_Recv(temp.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += temp[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + 
"\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks to rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / 
size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk_size 
= n / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "cuda", + "prompt": "/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\n__global__ void relu(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = fmax(x[tid], 0.0);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = fmax(0, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? 
x[tid] : 0;\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = fmax(0, x[idx]);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = fmax(0.0, x[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] < 0 ? 0 : x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = fmax(0, x[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int index = blockIdx.x * blockDim.x + threadIdx.x;\n if (index < N) {\n x[index] = fmax(0, x[index]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int tid = threadId.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = fmax(0.0, x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma opm parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Reduce(&x[0], &x[0], x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int chunk_size = n / nprocs;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == nprocs - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, local_size, MPI_DOUBLE, x.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n #pragma omp parallel for\n for (int i = start_index; i < end_index; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start_index], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n if (rank == size - 1) {\n end = x.size();\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < 
x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n MPI_Gather(&x[start], chunk_size, MPI_DOUBLE, &x[0], chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the ReLU function on its portion of x\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each 
rank computes the ReLU function on its own copy of x\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks onto rank 0\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 has the final result\n if (rank == 0) {\n x.resize(x.size() * size);\n MPI_Gatherv(nullptr, 0, MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf;\n MPI_Recv(&recv_buf, x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv_buf.begin(), recv_buf.end());\n }\n } else {\n MPI_Send(&x, x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the ReLU function on every element of x\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < MPI_COMM_WORLD; i++) {\n std::vector recv_buf(n);\n MPI_Recv(recv_buf.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, x.extent(0)), KOK_LAMBDA(int i) {\n x[i] = x[i] * x[i];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00099394097924, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00111231915653, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00057394057512, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.00039850398898, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00055574607104, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00082074273378, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00096687562764, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00352505408227, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00109916646034, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00104916635901, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005519784987, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00037607196718, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053323544562, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00083779077977, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00098519399762, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00137391015887, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00100029613823, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00111274439842, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006012937054, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0004143036902, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053255241364, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00084632243961, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00094513297081, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00153692234308, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00100261475891, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00109268222004, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00058288350701, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00033495854586, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00055373162031, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00086234286427, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00095866210759, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, 
+ "runtime": 0.00126853901893, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00100302342325, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00124876219779, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00059225428849, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00038988199085, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053780246526, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00083002634346, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009947899729, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00124567430466, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00101339705288, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00126604419202, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00057007856667, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00037422645837, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053549781442, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00082729849964, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00099786408246, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00125488005579, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010667975992, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00115637239069, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054795742035, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00033232755959, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054266937077, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00086788982153, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00093879085034, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00128904189914, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + 
"best_sequential_runtime": 0.00100254267454, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00123913399875, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00055623687804, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00038744304329, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00055670384318, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00087875816971, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00099214501679, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00127483718097, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = x[i] * x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00100844819099, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00118173826486, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054538846016, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00037499386817, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00055950433016, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088595952839, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00106282196939, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00130735840648, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00108341109008, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00114873368293, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005655227229, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00035227872431, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00055561549962, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00083124600351, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00096786562353, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00127923730761, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00109513867646, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00114555768669, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00052850469947, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00037013646215, + "num_threads": 4 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.00053449776024, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00090361274779, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00094608645886, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00132104940712, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00100582297891, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00121530350298, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00057892296463, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00038662366569, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00052990932018, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000875836052, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009657073766, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00126599278301, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n #pragma simd\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00099052377045, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00108128283173, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00101533215493, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00102014951408, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00101344101131, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00099359955639, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00101429019123, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00102209746838, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00108819007874, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00103493425995, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00055035203695, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00037281308323, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00056073702872, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00087105706334, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00090703107417, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00124343782663, + "num_threads": 64 + } + ] + }, + { + 
"generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010017933324, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00124041028321, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00057939067483, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00037044715136, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00051650200039, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00080488864332, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009836755693, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00124655626714, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = x[i] * x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00107693448663, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00122336223722, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00055147632957, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00041574630886, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00055088885128, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00083130411804, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00107756592333, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00127256885171, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00099225752056, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00113650653511, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00056457854807, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00037087388337, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00056386515498, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00084367543459, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00094266552478, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00124827697873, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00109370034188, + "runs": [ + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.00123186204582, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00055811218917, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00034363884479, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00055419318378, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00085450597107, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00095977857709, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00126324724406, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00109514538199, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012701606378, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00055532734841, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00038166996092, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00051633808762, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00086463131011, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000931981951, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00125949252397, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00108046382666, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00112565644085, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00059150718153, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00037193410099, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00052406191826, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00082314703614, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00090686045587, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00122883599252, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "serial", + "prompt": "/* Replace every element of x with the square of its value.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + ":", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + 
"\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &xi : x) {\n xi = xi * xi;\n }\n}\n", + "\n for (int &i : x) {\n i *= i;\n }\n}\n", + "\n\n for (int &i : x) {\n i *= i;\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "hip", + "prompt": "/* Replace every element of x with the square of its value.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\n__global__ void squareEach(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use MPI to compute in 
parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data(), n / size, MPI_INT, x.data(), n / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_Comm_world, &rank);\n\n // Get the number of processes\n int size;\n MPI_Comm_size(MPI_Comm_world, &size);\n\n // Each process squares its own elements\n for (int i = rank; i < x.size(); i += size) {\n x[i] = x[i] * x[i];\n }\n\n // Gather all the squared elements on rank 0\n MPI_Allgather(MPI_IN_PLACE, 0, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_Comm_world);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank squares its elements\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the squared elements\n std::vector squared;\n if (rank == 0) {\n squared.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, squared.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Rank 0 stores the final result\n if (rank == 0) {\n x = squared;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Status status;\n MPI_Request request;\n MPI_Irecv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (int i = rank; i < x.size(); i += size) {\n x[i] *= x[i];\n }\n MPI_Gather(x.data(), x.size() / size, MPI_INT, x.data(), x.size() / size,\n MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Square each element of x\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Gather all the squared 
elements on rank 0\n std::vector squared_x;\n if (rank == 0) {\n squared_x.resize(x.size() * MPI::COMM_WORLD.Get_size());\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, squared_x.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Copy the squared elements back to x on rank 0\n if (rank == 0) {\n x = squared_x;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector temp(x.size());\n MPI_Recv(&temp[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += temp[j];\n }\n }\n }\n}\n", + "(", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n MPI_Gather(x.data(), n / size, MPI_INT, x.data(), n / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector tmp(n);\n MPI_Recv(tmp.data(), n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[0], n / size, MPI_INT, &x[0], n / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "cuda", + "prompt": "/* Replace every element of x with the square of its value.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\n__global__ void squareEach(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = thread_idx_1;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", 
+ "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace every element of x with the square of its value.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank squares its own elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(n);\n MPI_Recv(&recv[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, chunk, MPI_INT, x.data(), chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n int recv_buf[n];\n MPI_Recv(&recv_buf[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv_buf[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank squares its own elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector temp(n);\n MPI_Recv(&temp[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += temp[j];\n }\n }\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Each rank squares its elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < MPI_COMM_WORLD; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n 
MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(n);\n MPI_Recv(&recv[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Square each element of x in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Gather the squared elements from all ranks to rank 0\n std::vector squared_x(x.size());\n MPI_Gather(x.data(), x.size(), MPI_INT, squared_x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = squared_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_size, MPI_INT, &x[0], local_size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_size = x.size() / size;\n int start = my_rank * local_size;\n int end = start + local_size;\n if (my_rank == size - 1) {\n end = x.size();\n }\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n MPI_Gather(&x[start], local_size, MPI_INT, &x[0], local_size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use Kokkos to parallelize. 
Assume Kokkos has already been initialized.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, Kokkos::View const& A, Kokkos::View const& x, double beta, Kokkos::View &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(i); j < A.extent(i+1); ++j) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(i); j > 0; --j) {\n y(i) += alpha * A(i, j - 1).value * x(A(i, j - 1).column);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i, double &y_i) {\n y_i = beta * y_i;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n y_i += alpha * A(j).value * x(A(j).column);\n }\n }\n }, y);\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n auto spvmv_lambda = KOKKOS_LAMBDA (const COOElement *a, double *y) {\n y[a->row] += a->value * alpha * x[a->column];\n };\n\n KOKKOS_PARALLEL_EXEC(KOKKOS_RANGE_POLICY(0, A.size), spvmv_lambda, A.data, y);\n\n KOKKOS_PARALLEL_FOR(KOKKOS_RANGE_POLICY(0, M), KOKKOS_LAMBDA(size_t i) {\n y[i] = y[i] * alpha + 
x[i] * alpha;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", M, KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i, double &y_i) {\n y_i = beta * y_i;\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n y_i += alpha * A(j).value * x(A(j).column);\n }\n }\n }, y);\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.row_map(i); j < A.row_map(i + 1); ++j) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, M), [&](size_t i) {\n y[i] = alpha * Kok\u30b3\u30af::reduce(Kok\u30b3\u30af::Range(0, A.size()), 0.0, [&](size_t j, double sum) {\n if (A[j].row == i) {\n return sum + A[j].value * x[A[j].column];\n }\n return sum;\n }) + uyi;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use OpenMP to parallelize.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00134562049061, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.7952518414706, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.39825068023056, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.19750491864979, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10046047084033, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05046655703336, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0271602621302, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0207517016679, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00134281087667, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.79741716850549, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39814755432308, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.20023628100753, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10022322032601, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05057278685272, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02729215808213, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02106234431267, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00133347734809, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.79482115283608, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39855826552957, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.19982584007084, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10129885263741, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05054353009909, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02733550202101, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02066072244197, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00134570021182, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.79576745145023, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3980839105323, + 
"num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.19916196130216, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10037911776453, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05052661616355, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02738413140178, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0207844350487, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00133473221213, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.796193645522, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39900246914476, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.19913303293288, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10051081925631, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05043034926057, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02722816951573, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02075465135276, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00133773814887, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.7952016890049, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39762076288462, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.19868136867881, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10043429341167, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05050639770925, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0273759579286, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02061610855162, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00134073775262, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.79690810870379, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39669346380979, + "num_threads": 2 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.19999737516046, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10045210234821, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05024544931948, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02740749735385, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02061135545373, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00134275518358, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.79345450475812, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39799681473523, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.19979550931603, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10044328458607, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05032579209656, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0271528525278, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02090675123036, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00134378764778, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.79320801123977, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39651067759842, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.19856425654143, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10054246280342, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05021111834794, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02733790315688, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0219256894663, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00134556312114, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.79834331162274, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39784796256572, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.1992878369987, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10085381772369, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05019550863653, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02722606491297, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0208320710808, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013523535803, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.5315797964111, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.2656441649422, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.13265654370189, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06805511508137, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0346125241369, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01975509785116, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01757030393928, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00133604649454, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.79432489071041, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39732312299311, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.19939148630947, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10057912506163, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05018863342702, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02717159222811, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02099863253534, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00133539121598, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.79360933452845, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39808210749179, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.19808032233268, 
+ "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1004860535264, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05027214437723, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02750360909849, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02086257450283, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00134696308523, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.79462080392987, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39758577309549, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.19864941965789, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.09964131321758, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05040509924293, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02710251864046, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02079475875944, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013538101688, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.79675406720489, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39779854156077, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.19970180038363, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10213492065668, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05038517080247, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02719467971474, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02064462620765, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00133992955089, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.53064164631069, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.27280318811536, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.13370249457657, + 
"num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06766658332199, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03452194742858, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02046989742666, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02233245708048, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00133776813745, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.79471994247288, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39729401990771, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.19954848363996, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10056181233376, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05010175798088, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02728934194893, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02071802187711, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n#pragma omp parallel for schedule(static)\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00135511681437, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.79408136662096, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.3975281028077, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.19942755382508, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10053225625306, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05024614296854, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02725670225918, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02061276398599, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00132605172694, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.795084650442, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39888894371688, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.19995502550155, + "num_threads": 4 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.10069431755692, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05016383845359, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02741790357977, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.020830790326, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00133994761854, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.81317694559693, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.39847005736083, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.20011909771711, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10026370882988, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05022769495845, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0272951811552, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02078499644995, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n if (e.row < M && e.column < N) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] = y[i] * alpha + y[i] *beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) 
{\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& e : A) {\n if (e.row < M && e.column < N) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format with sizeA elements.\n A has dimensions MxN, x has N values, and y has M values.\n Use AMD HIP to parallelize. 
The kernel will be launched with at least sizeA threads.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\n__global__ void spmv(double alpha, const COOElement *A, size_t sizeA, const double *x, double beta, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n size_t tid = hipThreadIdx_x;\n if (tid < M) {\n atomicMul(&y[tid], beta);\n }\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < M) {\n double dot = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == tid) {\n dot += A[i].value * x[A[i].column];\n }\n }\n y[tid] = alpha * dot + \u3079\u305f * y[tid];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[blockIdx.x], beta);\n}\n", + "\n\n size_t i = hipThreadIdx_x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n atomicAdd(&y[row], alpha * val * x[col]);\n }\n atomicMul(&y[hipThreadIdx_x], beta);\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = beta * y[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += alpha * A[i].value * x[A[i].column];\n }\n }\n y[row] = sum;\n }\n}\n", + "\nfor (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {\n double dot = 0.0;\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == i) {\n dot += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * dot + \u3079\u305f * y[i];\n }\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + 
"\n\n int i = hipThreadIdx_x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[i], beta);\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double dot = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n dot += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * dot + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t block_size = hipBlockDim_x;\n size_t block_id = hipBlockIdx_x;\n size_t start = block_id * block_size + tid;\n size_t end = min(start + block_size, sizeA);\n\n for (size_t i = start; i < end; i++) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n\n y[tid] *= beta;\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use MPI to parallelize. Assume MPI has already been initialized.\n Every rank has a complete copy of A, x, and y. 
Store the result in y on rank 0.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the result\n for (size_t i = rank; i < M; i += size) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[i * M], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[rank * M], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per rank\n size_t elements_per_rank = A.size() / size;\n\n // Compute the starting index for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = beta * y[i];\n }\n\n // Compute the local portion of A*x\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local contribution to y\n for (size_t i = 0; i < A.size(); ++i) {\n if (A[i].row == rank) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(MPI_IN_PLACE, y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n for (size_t i = rank; i < M; i += size) {\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to the result on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; i++) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is 
responsible for\n size_t rows_per_rank = (M + size - 1) / size;\n\n // Compute the start and end rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, M);\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (auto const& element : A) {\n if (element.row >= start_row && element.row < end_row) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + ":", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the size of the local part of y\n size_t local_M = M / MPI::COMM_WORLD.Get_size();\n size_t start_row = local_M * MPI::COMM_WORLD.Get_rank();\n size_t end_row = start_row + local_M;\n\n // Compute the local part of y\n for (size_t i = start_row; i < end_row; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Gather the local parts of y to rank 0\n MPI::COMM_WORLD.Gather(&y[start_row], local_M, MPI::DOUBLE, &y[0], local_M, MPI::DOUBLE, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own partial sum\n std::vector partial_y(M, 0.0);\n for (auto const& element : A) {\n if (element.row < M / size * rank || element.row >= M / size * (rank + 1)) {\n continue;\n }\n partial_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce partial sums to rank 0\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 scales the result by beta\n if (rank == 0) {\n for (auto& value : y) {\n value *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (COOElement const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n}\n", + "\n\n // Compute the local portion of the result\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the results from all ranks to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Scale the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (auto& value : y) {\n value *= beta;\n }\n }\n}\n", + "\n\n // COO format:\n // A[i] = {row, column, value}\n // where A[i] is the i-th non-zero element in the matrix\n\n // y = alpha*A*x + y\n std::iota(y.begin(), y.end(), 0);\n std::transform(y.begin(), y.end(), x.begin(), y.begin(),\n [alpha, &A](double y, double x) {\n double sum = 0;\n for (auto const& a : A) {\n sum += a.value * (alpha * x);\n }\n return y + sum;\n });\n\n // y = y + (beta*y)\n std::transform(y.begin(), y.end(), y.begin(),\n [beta](double y) {\n return y + (beta * y);\n });\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local contribution to y\n std::vector local_y(M, 0.0);\n for (auto const& element : A) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Add beta*y to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] += ampi * y[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements each rank is responsible for\n size_t elements_per_rank = A.size() / size;\n\n // Compute the starting index of the elements for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index of the elements for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n for (size_t i = rank; i < M; i += size) {\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to the result on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Compute the number of elements per rank\n size_t num_elements_per_rank = A.size() / MPI::COMM_WORLD.Get_size();\n\n // Compute the starting index for this rank\n size_t start_index = num_elements_per_rank * MPI::COMM_WORLD.Get_rank();\n\n // Compute the ending index for this rank\n size_t end_index = start_index + num_elements_per_rank;\n\n // Initialize the local portion of y to beta*y\n std::vector local_y(M, beta * y[MPI::COMM_WORLD.Get_rank()]);\n\n // Perform the sparse matrix-vector multiplication for this rank\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI::COMM_WORLD.Reduce(local_y.data(), y.data(), M, MPI::DOUBLE, MPI::SUM, 0);\n}\n", + "\n\n // Broadcast alpha, beta, M, and N to all ranks\n MPI_Bcast(&alpha, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&beta, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&M, 1, MPI_INT, 0, MPI_COMM_WORLD);\n MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the number of elements per rank\n int num_elements_per_rank = A.size() / MPI_COMM_SIZE;\n\n // Scatter A to all ranks\n std::vector local_A(num_elements_per_rank);\n MPI_Scatter(A.data(), num_elements_per_rank, MPI_BYTE, local_A.data(), num_elements_per_rank, MPI_BYTE, 0, MPI_COMM_WORLD);\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (const auto& element : local_A) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather the local 
contributions to y\n std::vector global_y(M);\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, global_y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Apply beta to y\n if (beta != 0.0) {\n for (auto& element : global_y) {\n element += beta * element;\n }\n }\n\n // Replace y with the global result\n y = std::move(global_y);\n}\n", + "\n\n // Initialize y on all ranks to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute the local contribution of each rank to the result\n for (auto const& element : A) {\n if (element.row < M) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n }\n\n // Reduce the local contributions to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format with sizeA elements.\n A has dimensions MxN, x has N values, and y has M values.\n Use CUDA to parallelize. The kernel will be launched with at least sizeA threads.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\n__global__ void spmv(double alpha, const COOElement *A, size_t sizeA, const double *x, double beta, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n y[blockIdx.x] *= beta;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n y[tid] *= beta;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (gid < sizeA) {\n size_t row = A[gid].row;\n size_t column = A[gid].column;\n double value = A[gid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n\n if (gid < M) {\n atomicAdd(&y[gid], beta * y[gid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n const COOElement &e = A[tid];\n y[e.row] += alpha * e.value * x[e.column];\n }\n y[threadIdx.x] *= beta;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + 
threadIdx.x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n atomicMul(&y[i], beta);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[threadIdx.x], beta);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[blockIdx.x], beta);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (gid < sizeA) {\n const COOElement &a = A[gid];\n y[a.row] += alpha * a.value * x[a.column];\n }\n\n if (gid < M) {\n y[gid] *= beta;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n y[row] += alpha * value * x[column];\n }\n y[threadIdx.x] *= beta;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n y[row] += alpha * value * x[column];\n }\n y[blockIdx.x] *= beta;\n}\n", + "{" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x 
has N values, and y has M values.\n Use MPI and OpenMP to parallelize. Assume MPI has already been initialized.\n Every rank has a complete copy of A, x, and y. Store the result in y on rank 0.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its own partial sum\n std::vector partial_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n partial_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the partial sums to rank 0\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 scales the result by beta\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the result\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (auto const& element : A) {\n if (element.row == i) {\n sum += element.value * x[element.column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&y[0], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[0], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the sparse matrix that each rank is responsible for\n size_t num_elements_per_rank = A.size() / size;\n\n // Compute the starting index of the elements in the sparse matrix that each rank is responsible for\n size_t start_index = rank * num_elements_per_rank;\n\n // Compute the ending index of the elements in the sparse matrix that each rank is responsible for\n size_t end_index = (rank + 1) * num_elements_per_rank;\n\n // Compute the local portion of y that each rank is responsible for\n std::vector local_y(M);\n\n // Compute the local portion of y that each rank is responsible for\n #pragma omp parallel for\n for (size_t i = start_index; i < end_index; i++) {\n const COOElement& element = A[i];\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the local portions of y into the global y vector on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n y[e.row] += 
alpha * e.value * x[e.column];\n }\n}\n", + "\n\n // Initialize y on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::fill(y.begin(), y.end(), 0.0);\n }\n\n // Parallelize over rows of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n COOElement const& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n\n // Parallelize over elements of y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n\n // Gather y from all ranks to rank 0\n MPI::COMM_WORLD.Gather(y.data(), M, MPI::DOUBLE, y.data(), M, MPI::DOUBLE, 0);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of elements each rank is responsible for\n size_t elements_per_rank = A.size() / nprocs;\n\n // Compute the starting index for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local portion of y\n std::vector local_y(M, 0.0);\n\n // Perform the sparse matrix-vector multiplication in parallel\n #pragma omp parallel for\n for (size_t i = start_index; i < end_index; ++i) {\n COOElement element = A[i];\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Compute the local portion of the result\n std::vector local_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Calculate the number of rows per rank\n size_t num_rows_per_rank = (M + size - 1) / size;\n\n // Calculate the starting and ending rows for the current rank\n size_t start_row = rank * num_rows_per_rank;\n size_t end_row = std::min((rank + 1) * num_rows_per_rank, M);\n\n // Initialize the local portion of y to beta*y\n std::vector local_y(num_rows_per_rank, beta * y[rank]);\n\n // OpenMP parallelization\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n local_y[i - start_row] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n\n // Gather the local portions of y into the global y on rank 0\n MPI_Gather(local_y.data(), 
num_rows_per_rank, MPI_DOUBLE, y.data(), num_rows_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of rows per rank\n size_t num_rows_per_rank = (M + MPI_Comm_size(MPI_COMM_WORLD) - 1) / MPI_Comm_size(MPI_COMM_WORLD);\n\n // Compute the starting row for this rank\n size_t start_row = num_rows_per_rank * MPI_Comm_rank(MPI_COMM_WORLD);\n\n // Compute the ending row for this rank\n size_t end_row = std::min(num_rows_per_rank * (MPI_Comm_rank(MPI_COMM_WORLD) + 1), M);\n\n // Initialize the local portion of y to beta*y\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n y[i] = beta * y[i];\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n\n if (row >= start_row && row < end_row) {\n y[row] += alpha * value * x[column];\n }\n }\n\n // Reduce the results from all ranks to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(MPI_IN_PLACE, y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / nprocs; i < (rank + 1) * M / nprocs; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local contribution to y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double local_y = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n local_y += alpha * element.value * x[element.column];\n }\n }\n y[i] = local_y;\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&y[0], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[0], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const 
COOElement& element = A[i];\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather y from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_M = M / size;\n size_t start_row = rank * local_M;\n size_t end_row = start_row + local_M;\n\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[i * local_M], local_M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[start_row], local_M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(Kokkos::View &A, Kokkos::View &X, Kokkos::View &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; k++) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::Range(0, M), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.extent(0); ++l) {\n if (A(l).row == i && A(l).column == k) {\n for (size_t m = 0; m < X.extent(0); ++m) {\n if (X(m).row == k && X(m).column == j) {\n sum += A(l).value * X(m).value;\n }\n }\n }\n }\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n // Create a parallel reduction to sum up the sparse matrix multiplication.\n auto policy = KOKKOS_RANGE_POLICY(0, M);\n auto sum_lambda = KOKKOS_LAMBDA(const size_t i, double** sum) {\n for (size_t j = 0; j < N; ++j) {\n sum[i][j] = 0.0;\n }\n };\n auto reduce_lambda = KOKKOS_LAMBDA(const size_t i, const size_t j, const COOElement a, const COOElement b, double** sum) {\n sum[a.row][j] += a.value * b.value;\n };\n auto view = KokKOS::parallel_reduce(\n \"Sparse matrix multiplication\", policy, sum_lambda, reduce_lambda, A, X);\n view.wait();\n\n // Copy the result to the output matrix.\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = view(i, j);\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int &i) {\n for (size_t j = 0; j < N; j++) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; k++) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + ".", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, M), [&](size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i][j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i][j] += A[i][k].value * X[k][j].value;\n }\n }\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.07432552017272, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 6.39021373372525, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 3.37180301193148, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.86594071444124, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.1113728357479, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.73143759965897, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.55398747622967, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.45749644562602, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for 
(size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.07433881331235, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 6.35798720177263, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 3.36038388703018, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.86634984053671, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.1103791417554, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.73287181723863, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.55448569990695, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.45441435948014, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + 
"did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; k++) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": ",", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n sum += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10615218710154, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 6.31193086951971, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 3.37276771795005, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 1.85332006756216, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.11743667647243, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.7275649715215, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.55356078203768, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.45514313671738, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n sum += A[l].value * X[m].value;\n break;\n }\n }\n break;\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (auto const& a : A) {\n if (a.row == i && a.column == k) {\n for (auto const& x : X) {\n if (x.row == k && x.column == j) {\n sum += a.value * x.value;\n }\n }\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.07412092871964, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 6.3903048010543, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 3.36421448625624, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.86684588603675, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.11389633156359, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.72950983960181, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.55994623173028, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.46072372384369, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; k++) {\n for (size_t l = 0; l < A.size(); l++) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); m++) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.07401913348585, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 6.37318839132786, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 3.35096432100981, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.85549918860197, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 1.11225433275104, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.73210869859904, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.5554372549057, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.45879677105695, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Y.resize(M * N, 0);\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n for (auto const& e : X) {\n if (A_map.count(e.row) > 0) {\n for (auto const& [column, value] : A_map[e.row]) {\n Y[e.column * M + column] += e.value * value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> row_map;\n for (auto const& element : A) {\n row_map[element.row].push_back(element.value);\n }\n for (auto const& element : X) {\n if (row_map.count(element.column)) {\n for (size_t i = 0; i < row_map[element.column].size(); ++i) {\n Y[element.row * N + i] += row_map[element.column][i] * element.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto& e : A) {\n for (auto& f : X) {\n if (e.column == f.row) {\n Y[e.row * N + f.column] += e.value * f.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M*N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n Y.resize(M * N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot = 0;\n for (size_t k = 0; k < K; ++k) {\n dot += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = dot;\n }\n }\n}\n", + "\n\n Y.resize(M*N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.assign(M * N, 0);\n for (COOElement e : A) {\n for (COOElement f : X) {\n if (e.column == f.row) {\n Y[e.row * N + f.column] += e.value * f.value;\n 
}\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n std::unordered_map> A_row_major;\n for (auto const& e : A) {\n A_row_major[e.row][e.column] = e.value;\n }\n\n std::unordered_map> X_col_major;\n for (auto const& e : X) {\n X_col_major[e.column][e.row] = e.value;\n }\n\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot_product = 0;\n for (size_t k = 0; k < K; ++k) {\n dot_product += A_row_major[i][k] * X_col_major[j][k];\n }\n Y[i * N + j] = dot_product;\n }\n }\n}\n", + "\n\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format with sizeA elements.\n X is a sparse KxN matrix in COO format with sizeX elements. Y is a dense MxN matrix in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\n__global__ void spmm(const COOElement *A, size_t sizeA, const COOElement *X, size_t sizeX, double *Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t k = 0; k < K; k++) {\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == k) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n int i = tid + bid * bdim;\n if (i < sizeA) {\n size_t rowA = A[i].row;\n size_t colA = A[i].column;\n double valA = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == colA) {\n Y[rowA * N + X[j].row] += valA * X[j].value;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n int i = tid + bid * bdim;\n if (i < sizeA) {\n size_t rowA = A[i].row;\n size_t colA = A[i].column;\n double valA = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == colA) {\n Y[rowA * N + X[j].row] += valA * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipGridDim_x;\n int tid_per_block = hipBlockDim_x;\n\n for (int i = tid; i < sizeA; i += tid_per_block * bid_size) {\n int row = A[i].row;\n int col = A[i].column;\n double val = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column < K) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x * bid_size;\n\n for (size_t i = tid; i < sizeA; i += num_threads) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = 
blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n Y[row * N + col] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t a = 0; a < sizeA; ++a) {\n if (A[a].row == row && A[a].column == k) {\n for (size_t x = 0; x < sizeX; ++x) {\n if (X[x].row == k && X[x].column == col) {\n Y[row * N + col] += A[a].value * X[x].value;\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].column) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n for (int i = tid + bid * bdim; i < sizeA; i += gdim * bdim) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n int rowA = A[i].row;\n int colA = A[i].column;\n double valA = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n int rowX = X[j].row;\n int colX = X[j].column;\n double valX = X[j].value;\n if (colA == rowX) {\n atomicAdd(&Y[rowA * N + colX], valA * valX);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x;\n\n for (size_t i = tid; i < sizeA; i += num_threads * bid_size) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == column) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; 
j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and X. Store the result in Y on rank 0.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recv;\n MPI::COMM_WORLD.Recv(recv, M * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < M * N; ++j) {\n Y[j] += recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(Y.data(), M * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0.\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its local contribution to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the local contributions to Y on rank 0.\n if (rank == 0) {\n Y.resize(M * N);\n MPI_Gather(localY.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(localY.data(), M * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions 
to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ",", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the local portion of Y\n std::vector localY(M*N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n \n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M*N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double dot_product = 0;\n for (size_t k = 0; k < K; k++) {\n dot_product += A[i * K + k].value * X[k * N + j].value;\n }\n Y[i * N + j] = dot_product;\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buffer(M * N);\n MPI_Recv(recv_buffer.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n Y[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(Y.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in each row of A and X\n std::vector A_row_counts(M, 0);\n std::vector X_row_counts(K, 0);\n for 
(auto const& element : A) {\n A_row_counts[element.row]++;\n }\n for (auto const& element : X) {\n X_row_counts[element.row]++;\n }\n\n // Compute the offsets for each row of A and X\n std::vector A_offsets(M, 0);\n std::vector X_offsets(K, 0);\n for (size_t i = 1; i < M; i++) {\n A_offsets[i] = A_offsets[i - 1] + A_row_counts[i - 1];\n }\n for (size_t i = 1; i < K; i++) {\n X_offsets[i] = X_offsets[i - 1] + X_row_counts[i - 1];\n }\n\n // Create a buffer to store the local results\n std::vector local_Y(M * N, 0.0);\n\n // Compute the local results\n for (auto const& element : A) {\n for (size_t j = X_offsets[element.column]; j < X_offsets[element.column] + X_row_counts[element.column]; j++) {\n local_Y[element.row * N + X[j].column] += element.value * X[j].value;\n }\n }\n\n // Reduce the local results to the global result\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of elements in the resulting matrix\n size_t num_elements = M * N;\n\n // Initialize the result matrix to zero\n Y.resize(num_elements, 0.0);\n\n // Scatter the result matrix to all ranks\n MPI_Scatter(Y.data(), num_elements, MPI_DOUBLE, Y.data(), num_elements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local part of the result matrix\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local results to the result matrix on rank 0\n MPI_Reduce(Y.data(), num_elements, MPI_DOUBLE, Y.data(), num_elements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double dot_product = 0;\n for (size_t k = 0; k < K; k++) {\n dot_product += A[i * K + k].value * X[k * N + j].value;\n }\n Y[i * N + j] = dot_product;\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buffer(M * N);\n MPI_Recv(recv_buffer.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n Y[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(Y.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.assign(M * N, 0);\n\n // Compute the partial sum on each rank\n std::vector partial_sum(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n partial_sum[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sum(M * N, 0);\n MPI_Allreduce(partial_sum.data(), global_sum.data(), M * N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n Y = global_sum;\n }\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n 
// Compute the matrix multiplication in parallel\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement& a = A[i];\n for (size_t j = 0; j < X.size(); j++) {\n const COOElement& x = X[j];\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0.\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format with sizeA elements.\n X is a sparse KxN matrix in COO format with sizeX elements. Y is a dense MxN matrix in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\n__global__ void spmm(const COOElement *A, size_t sizeA, const COOElement *X, size_t sizeX, double *Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n atomicAdd(&Y[row * N + X[j].column], A[i].value * X[j].value);\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= M) return;\n for (int i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (int j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < K; k++) {\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == row && A[j].column == k) {\n for (size_t l = 0; l < sizeX; l++) {\n if (X[l].row == k && X[l].column == i) {\n Y[row * N + i] += A[j].value * X[l].value;\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int tx = threadIdx.x;\n int bx = blockIdx.x;\n int tid = tx + bx * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t i = 0; i < sizeX; i++) {\n if (X[i].column == column) {\n Y[row * N + X[i].row] += value * X[i].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == A[k].column) {\n Y[row * N + X[j].row] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) 
{\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t k = 0; k < K; ++k) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (X[j].column == k) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t k = 0; k < K; ++k) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row && A[i].column == k) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (X[j].row == k && X[j].column == j) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + k] = sum;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n size_t rowA = A[blockIdx.x].row;\n size_t columnA = A[blockIdx.x].column;\n double valueA = A[blockIdx.x].value;\n\n for (size_t i = 0; i < N; i++) {\n if (columnA == X[threadIdx.x].row) {\n Y[rowA * N + i] += valueA * X[threadIdx.x].value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n atomicAdd(&Y[row * N + X[j].column], A[i].value * X[j].value);\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and X. Store the result in Y on rank 0.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n for (size_t j = 0; j < X.size(); ++j) {\n if (X[j].row == column) {\n localY[row * N + X[j].column] += value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n if (rank == 0) {\n Y.resize(M * N);\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(localY.data(), nullptr, M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n}\n", + ",", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localY[i * N + j] += A[i * K + k].value * X[k * N + j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "_t,", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 
0.0);\n\n // Compute the matrix multiplication in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local row range for this rank\n size_t local_m = M / nprocs;\n size_t start_row = rank * local_m;\n size_t end_row = (rank == nprocs - 1) ? M : start_row + local_m;\n\n // Initialize the local part of Y to zero\n std::vector local_Y(local_m * N, 0.0);\n\n // Perform the matrix multiplication in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_Y[i * N + j] += A[i * K + k].value * X[k * N + j].value;\n }\n }\n }\n\n // Gather the local results into Y on rank 0\n MPI_Gather(local_Y.data(), local_m * N, MPI_DOUBLE, Y.data(), local_m * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector local_Y(M * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n local_Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of rows of A and X owned by each rank\n size_t nrows_A = A.size() / nprocs;\n size_t nrows_X = X.size() / nprocs;\n\n // Compute the starting row index of A and X owned by each rank\n size_t start_row_A = rank * nrows_A;\n size_t start_row_X = rank * nrows_X;\n\n // Compute the local portion of Y owned by each rank\n std::vector local_Y(M * N, 0.0);\n\n // Compute the local portion of Y owned by each rank\n #pragma omp parallel for\n for (size_t i = 0; i < nrows_A; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n if (A[start_row_A + 
i].column == X[start_row_X + k].row) {\n local_Y[A[start_row_A + i].row * N + j] += A[start_row_A + i].value * X[start_row_X + k].value;\n }\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix-matrix product in parallel\n #pragma mpi_for_schedule(static)\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& a = A[i];\n for (size_t j = 0; j < X.size(); ++j) {\n const COOElement& x = X[j];\n if (a.column == x.row) {\n #pragma openmp atomic\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(MPI_IN_PLACE, Y.size(), MPI_DOUBLE, Y.data(), Y.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the partial sum for each row of Y on each rank\n std::vector partial_Y(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n partial_Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_Y(M * N, 0.0);\n MPI_Reduce(partial_Y.data(), global_Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Store the result in Y on rank 0\n if (rank == 0) {\n Y = global_Y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t k = 0; k < K; ++k) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * 
X[m].value;\n }\n }\n }\n }\n }\n }\n }\n\n // Reduce the result to rank 0\n MPI_Reduce(MPI_IN_PLACE, Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n for (size_t j = 0; j < X.size(); j++) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(Kokkos::View &A, Kokkos::View &b, Kokkos::View &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View r(\"r\", N);\n Kokkos::View z(\"z\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n r[i] = b[i];\n z[i] = 0.0;\n });\n\n for (size_t iter = 0; iter < N; ++iter) {\n for (size_t i = 0; i < A.extent(0); ++i) {\n const auto &e = A[i];\n r[e.row] -= e.value * z[e.column];\n }\n for (size_t i = 0; i < A.extent(0); ++i) {\n const auto &e = A[i];\n z[e.column] += e.value * r[e.row];\n }\n }\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) { x[i] = r[i]; });\n}\n", + "\n\n Kokkos::parallel_for(\"solve_linear_system\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::View r(\"r\", N);\n Kokkos::View z(\"z\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n r[i] = b[i];\n z[i] = 0.0;\n });\n for (size_t k = 0; k < A.extent(0); ++k) {\n const COOElement &e = A[k];\n r[e.column] -= e.value * z[e.row];\n }\n for (size_t k = 0; k < A.extent(0); ++k) {\n const COOElement &e = A[k];\n z[e.row] += e.value * r[e.column];\n }\n Kokkos::parallel_for(\"copy\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = z[i];\n });\n}\n", + "\n\n Kok\uf09e::View A_h(A.data(), A.extent(0));\n Kok\uf09e::View b_h(b.data(), b.extent(0));\n Kok\uf09e::View x_h(x.data(), x.extent(0));\n\n Kok\uf09e::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n x_h[i] = b_h[i];\n for (size_t j = A_h(i, 0); j < A_h(i, 1); ++j) {\n x_h[i] -= A_h(i, 2 + j) * x_h[A_h(i, 2 + j)];\n }\n x_h[i] /= A_h(i, 1);\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_coo\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, 
column_indices);\n\n // Solve the linear system\n Kokkos::View x_host(\"x_host\", N);\n Kokkos::parallel_for(\"solve\", N, KOKKOS_LAMBDA(size_t i) {\n x_host(i) = 0.0;\n });\n Kokkos::deep_copy(x, x_host);\n Kokkos::sparse_gmres(A_sparse, x, b, 1e-6, 100);\n\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View A_sparse(\"A_sparse\", N, N);\n Kokkos::parallel_for(\"init_A\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n A_sparse(i, j) = 0.0;\n }\n });\n Kokkos::parallel_for(\"fill_A\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n A_sparse(A(i).row, A(i).column) = A(i).value;\n });\n\n // Solve the linear system\n Kokkos::View x_temp(\"x_temp\", N);\n Kokkos::sparse::cg(A_sparse, b, x_temp, 1e-6, 100);\n\n // Copy the solution to x\n Kokkos::parallel_for(\"copy_x\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x(i) = x_temp(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * b(A(j).column);\n }\n }\n x(i) = sum;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_COO\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::View A_sparse(\"A_sparse\", N, N);\n Kokkos::parallel_for(\"initialize_A\", N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; j++) {\n A_sparse(i, j) = 0.0;\n }\n });\n Kokkos::parallel_for(\"populate_A\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n A_sparse(row_indices(i), column_indices(i)) = values(i);\n });\n\n // Solve the linear system\n Kokkos::View x_temp(\"x_temp\", N);\n Kokkos::sparse::CrsMatrix A_crs(N, N, A_sparse);\n Kokkos::sparse::solve(\"CrsMatrix\", A_crs, b, x_temp);\n\n // Copy the result to x\n Kokkos::parallel_for(\"copy_result\", N, KOKKOS_LAMBDA(size_t i) {\n x(i) = x_temp(i);\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values = Kokkos::View(A.data(), A.extent(0));\n Kokkos::View row_indices = Kokkos::View(A.data() + 1, A.extent(0));\n Kokkos::View column_indices = Kokkos::View(A.data() + 2, A.extent(0));\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::SparseSolveCG(A_sparse, x, b, 1e-6, 1000);\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x(i) = b(i);\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x(A(j).column) -= A(j).value * x(i);\n }\n }\n x(i) /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View col_indices(\"col_indices\", A.extent(0));\n Kokkos::parallel_for(A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values[i] = A(i).value;\n 
row_indices[i] = A(i).row;\n col_indices[i] = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, col_indices);\n\n // Solve the linear system\n Kokkos::View x_copy(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::SparseSolveCG>(A_sparse, x_copy, b, 1e-6, 1000);\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(j).row == i) {\n x[A(j).column] -= A(j).value * x[i];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n using team_member = KokKOS::TeamMember;\n using team_view = KokKOS::TeamView;\n using team_scheduler = KokKOS::TeamScheduler;\n\n team_scheduler team(N);\n\n team.parallel_for(\n \"solve_linear_system\",\n KOKKOS_LAMBDA(const team_member member) {\n const size_t row = member.league_rank();\n double sum = 0;\n for (size_t i = 0; i < A.size(); ++i) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_coo\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::View x_copy(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::sparse_solve(A_sparse, x_copy, b);\n\n // Copy the solution back to x\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. 
x and b are dense vectors with N elements.\n Use OpenMP to compute in parallel.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": ".", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + 
"did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n COOElement const& e = A[i];\n x[e.column] -= e.value * x[e.row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t k = 0; k < A.size(); ++k) {\n const COOElement& e = A[k];\n x[e.column] -= e.value * x[e.row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i 
= 0; i < A.size(); i++) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] -= element.value * x[element.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] -= element.value * x[element.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if 
(e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize x to 0\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = 0;\n }\n\n // Iterate over the elements of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n // Add A[i].value * b[A[i].column] to x[A[i].row]\n x[A[i].row] += A[i].value * b[A[i].column];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. 
x and b are dense vectors with N elements.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector y(N, 0);\n for (auto const& e : A) {\n y[e.column] += e.value * b[e.row] / row_sums[e.row];\n }\n x = y;\n}\n", + "\n\n std::vector r(N, 0);\n std::vector row_start(N + 1, 0);\n for (auto const& e : A) {\n row_start[e.row + 1]++;\n }\n for (size_t i = 1; i <= N; i++) {\n row_start[i] += row_start[i - 1];\n }\n for (auto const& e : A) {\n r[row_start[e.row]++] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = row_start[i]; j < row_start[i + 1]; j++) {\n sum += r[j] * x[j];\n }\n x[i] = (b[i] - sum) / r[row_start[i]];\n }\n}\n", + "\n\n std::vector r(N, 0);\n for (auto const& e : A) {\n r[e.row] += e.value * x[e.column];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = (b[i] - r[i]) / A[i].value;\n }\n}\n", + "\n\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / rowSum[i];\n }\n}\n", + "\n\n std::vector r(N, 0);\n for (auto const& e : A) {\n r[e.row] += e.value * b[e.column];\n }\n x.resize(N);\n for (size_t i = 0; i < N; ++i) {\n x[i] = r[i] / A[i].value;\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (const COOElement& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector y(N, 0);\n for (const COOElement& e : A) {\n y[e.row] += e.value * b[e.column] / row_sum[e.row];\n }\n x = y;\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector diag(N, 0);\n std::vector offdiag(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] += e.value;\n } else {\n offdiag[e.row] += e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / (diag[i] - offdiag[i]);\n }\n for (auto const& e : A) {\n if (e.row != e.column) {\n x[e.column] -= e.value * x[e.row] / (diag[e.column] - offdiag[e.column]);\n }\n }\n}\n", + "\n\n std::vector> A_dense(N, std::vector(N, 0));\n for (auto const& e : A) {\n A_dense[e.row][e.column] = e.value;\n }\n std::vector L(N, 0), U(N, 0);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n L[i] += A_dense[i][j] * U[j];\n }\n U[i] = b[i] - L[i];\n }\n x[N - 1] = U[N - 1] / A_dense[N - 1][N - 1];\n for (size_t i = N - 2; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_dense[i][j] * x[j];\n }\n x[i] = (U[i] - sum) / A_dense[i][i];\n }\n}\n", + "\n\n std::vector diag(N, 0);\n std::vector lower(N, 0);\n std::vector upper(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n } else if (e.row < e.column) {\n lower[e.row] += e.value;\n } else {\n upper[e.row] += e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n double d = diag[i];\n if (d == 0) {\n throw std::runtime_error(\"Matrix is singular\");\n }\n x[i] = (b[i] - lower[i] - upper[i]) / d;\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sum[i];\n }\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector diag(N, 0);\n for 
(auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / diag[i];\n for (auto const& e : A) {\n if (e.row == i && e.column != i) {\n x[e.column] -= e.value * x[i] / row_sums[e.column];\n }\n }\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& element : A) {\n row_sum[element.row] += element.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sum[i];\n }\n}\n", + "\n\n std::vector diag(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n }\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n for (size_t j = e.column + 1; j < N; j++) {\n x[j] -= e.value * x[e.column];\n }\n }\n }\n x[i] /= diag[i];\n }\n for (size_t i = N - 1; i > 0; i--) {\n for (auto const& e : A) {\n if (e.column == i) {\n for (size_t j = 0; j < e.row; j++) {\n x[j] -= e.value * x[e.row];\n }\n }\n }\n }\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sums[i];\n }\n}\n", + "\n\n std::vector rowSum(N, 0.0);\n for (COOElement const& e : A) {\n rowSum[e.row] += e.value;\n }\n std::vector y(N, 0.0);\n for (COOElement const& e : A) {\n y[e.row] += e.value * b[e.column] / rowSum[e.row];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] - y[i];\n }\n}\n", + "\n\n std::vector> A_dense(N, std::vector(N, 0));\n for (auto const& e : A) {\n A_dense[e.row][e.column] = e.value;\n }\n std::vector x_dense(N, 0);\n solveLinearSystemDense(A_dense, b, x_dense);\n x = x_dense;\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n\n std::vector y(N, 0);\n for (auto const& e : A) {\n y[e.column] += e.value * b[e.row] / row_sums[e.row];\n }\n\n x = y;\n}\n", + "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector diag_inv(N, 0);\n for (size_t i = 0; i < N; ++i) {\n diag_inv[i] = 1.0 / row_sums[i];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] * diag_inv[i];\n for (auto const& e : A) {\n if (e.row == i) continue;\n x[e.row] -= e.value * x[i] * diag_inv[e.row];\n }\n }\n}\n", + "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector x_tmp(N, 0);\n for (auto const& e : A) {\n x_tmp[e.column] += e.value * b[e.row] / row_sum[e.row];\n }\n x = x_tmp;\n}\n", + "\n\n std::vector diag(N, 0);\n std::vector offdiag(N, 0);\n std::vector rowptr(N + 1, 0);\n for (auto const& e : A) {\n rowptr[e.row + 1]++;\n if (e.row == e.column)\n diag[e.row] += e.value;\n else\n offdiag[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n rowptr[i + 1] += rowptr[i];\n }\n std::vector L(rowptr[N]);\n std::vector U(rowptr[N]);\n for (auto const& e : A) {\n if (e.row == e.column)\n continue;\n if (e.row < e.column)\n L[rowptr[e.row]++] = {e.row, e.column, e.value};\n else\n U[rowptr[e.column]++] = {e.row, e.column, e.value};\n }\n std::vector y(N, 0);\n for (size_t i = 0; i < N; i++) {\n y[i] = b[i];\n for (size_t j = rowptr[i]; j < rowptr[i + 1]; j++) {\n y[i] -= L[j].value * x[L[j].column];\n }\n y[i] /= diag[i];\n }\n for (size_t i = N - 1; i < N; i--) {\n x[i] = y[i];\n for (size_t j = rowptr[i + 1]; j < rowptr[i + 2]; j++) {\n x[i] -= U[j].value * x[U[j].column];\n }\n x[i] /= diag[i];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", 
+ "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format with sizeA elements. x and b are dense vectors with N elements.\n Use AMD HIP to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\n__global__ void solveLinearSystem(const COOElement *A, size_t sizeA, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t col = A[tid].column;\n double val = A[tid].value;\n atomicAdd(&x[col], val * b[row]);\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n x[i] = b[i];\n }\n __syncthreads();\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].column == i) {\n x[A[j].row] -= A[j].value * x[i];\n }\n }\n __syncthreads();\n if (i < N) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = b[i];\n }\n __syncthreads();\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == i) {\n x[A[k].column] -= A[k].value * x[i];\n }\n }\n __syncthreads();\n if (i < N) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int tid = 
threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&x[column], value * b[row]);\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + ".", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = 0;\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == i) {\n x[i] += A[j].value * b[A[j].column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n \n // Create a CSR matrix from the COO format\n std::vector row_offsets(N + 1, 0);\n std::vector column_indices;\n std::vector values;\n for (auto const& element : A) {\n row_offsets[element.row + 1]++;\n column_indices.push_back(element.column);\n values.push_back(element.value);\n }\n for (size_t i = 1; i < N + 1; i++) {\n row_offsets[i] += row_offsets[i - 1];\n }\n \n // Create a dense vector for the solution\n x.resize(N);\n \n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(values[row_offsets[j] + i]) > std::abs(values[row_offsets[pivot_row] + i])) {\n pivot_row = j;\n }\n }\n \n // Swap the pivot row with the current row\n if (pivot_row != i) {\n std::swap(row_offsets[pivot_row], row_offsets[i]);\n std::swap(column_indices[row_offsets[pivot_row] - 1], column_indices[row_offsets[i] - 1]);\n std::swap(values[row_offsets[pivot_row] - 1], values[row_offsets[i] - 1]);\n }\n \n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = values[row_offsets[j] + i] / values[row_offsets[i] + i];\n for (size_t k = i; k < N; k++) {\n values[row_offsets[j] + k] -= factor * values[row_offsets[i] + k];\n }\n b[j] -= factor * b[i];\n }\n }\n \n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= values[row_offsets[i] + j] * x[j];\n }\n x[i] /= values[row_offsets[i] + i];\n }\n \n // Gather the solution from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, comm);\n }\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a vector object from the dense vector\n Eigen::VectorXd vector_b(N);\n for (size_t i = 0; i < N; ++i) {\n vector_b(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd vector_x = matrix.ldlt().solve(vector_b);\n\n // Copy the result to the output vector\n for (size_t i = 0; i < N; ++i) {\n x[i] = vector_x(i);\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a dense vector from b\n Eigen::VectorXd b_vector(N);\n for (size_t i = 0; i < N; ++i) {\n b_vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_vector = matrix.ldlt().solve(b_vector);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_vector(i);\n }\n }\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n\n // Create a vector object 
from the dense vector\n Eigen::VectorXd vector_b(N);\n for (size_t i = 0; i < N; ++i) {\n vector_b(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd vector_x = matrix.ldlt().solve(vector_b);\n\n // Copy the result to the output vector\n for (size_t i = 0; i < N; ++i) {\n x[i] = vector_x(i);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a sparse matrix object\n MPI_Sparse_matrix A_mpi;\n MPI_Sparse_matrix_create_coo(MPI_COMM_WORLD, N, N, A.size(), &A[0], &A_mpi);\n\n // Create a dense vector object for b\n MPI_Vector b_mpi;\n MPI_Vector_create(MPI_COMM_WORLD, N, &b_mpi);\n MPI_Vector_set_values(b_mpi, N, &b[0]);\n\n // Create a dense vector object for x\n MPI_Vector x_mpi;\n MPI_Vector_create(MPI_COMM_WORLD, N, &x_mpi);\n\n // Solve the linear system\n MPI_Sparse_solve(A_mpi, b_mpi, x_mpi);\n\n // Get the result from rank 0\n if (rank == 0) {\n MPI_Vector_get_values(x_mpi, N, &x[0]);\n }\n\n // Free the MPI objects\n MPI_Sparse_matrix_free(&A_mpi);\n MPI_Vector_free(&b_mpi);\n MPI_Vector_free(&x_mpi);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Scatter b to all ranks\n std::vector local_b(b.begin() + rank * b.size() / size, b.begin() + (rank + 1) * b.size() / size);\n\n // Solve the local system Ax=b for x\n std::vector local_x(N, 0.0);\n for (auto const& e : A) {\n local_x[e.row] += e.value * local_b[e.column];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_x(N, 0.0);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n \n int *rows = new int[A.size()];\n int *cols = new int[A.size()];\n double *vals = new double[A.size()];\n for (size_t i = 0; i < A.size(); i++) {\n rows[i] = A[i].row;\n cols[i] = A[i].column;\n vals[i] = A[i].value;\n }\n \n // Create a sparse matrix object\n int nrows = N;\n int ncols = N;\n int nnz = A.size();\n int *ia = new int[nrows + 1];\n int *ja = new int[nnz];\n double *a = new double[nnz];\n \n // Convert the COO format to CSR format\n ia[0] = 0;\n for (int i = 0; i < nnz; i++) {\n ja[i] = cols[i];\n a[i] = vals[i];\n }\n for (int i = 1; i < nrows + 1; i++) {\n ia[i] = ia[i - 1];\n for (int j = ia[i - 1]; j < nnz; j++) {\n if (ja[j] >= i) {\n ia[i] = j;\n break;\n }\n }\n }\n \n // Create a parallel sparse matrix object\n Mat A_csr(nrows, ncols, nnz, ia, ja, a);\n \n // Create a parallel dense vector for b\n Vec b_vec(nrows);\n for (int i = 0; i < nrows; i++) {\n b_vec[i] = b[i];\n }\n \n // Create a parallel dense vector for x\n Vec x_vec(nrows);\n \n // Solve the linear system\n A_csr.solve(b_vec, x_vec);\n \n // Gather the solution from all ranks to rank 0\n if (rank == 0) {\n for (int i = 0; i < nrows; i++) {\n x[i] = x_vec[i];\n }\n }\n \n // Clean up\n delete[] rows;\n delete[] cols;\n delete[] vals;\n delete[] ia;\n delete[] ja;\n 
delete[] a;\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the number of elements in each row of A\n std::vector rowCounts(N, 0);\n for (auto const& e : A) {\n rowCounts[e.row]++;\n }\n\n // Compute the offsets of each row in the CSR format\n std::vector rowOffsets(N, 0);\n for (size_t i = 1; i < N; i++) {\n rowOffsets[i] = rowOffsets[i - 1] + rowCounts[i - 1];\n }\n\n // Convert A to CSR format\n std::vector colIndices(A.size());\n std::vector values(A.size());\n for (auto const& e : A) {\n colIndices[rowOffsets[e.row] + rowCounts[e.row] - 1] = e.column;\n values[rowOffsets[e.row] + rowCounts[e.row] - 1] = e.value;\n rowCounts[e.row]--;\n }\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivotRow = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(values[rowOffsets[j] + rowCounts[j] - 1]) > std::abs(values[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1])) {\n pivotRow = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivotRow != i) {\n std::swap(rowOffsets[i], rowOffsets[pivotRow]);\n std::swap(rowCounts[i], rowCounts[pivotRow]);\n std::swap(values[rowOffsets[i] + rowCounts[i] - 1], values[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1]);\n std::swap(colIndices[rowOffsets[i] + rowCounts[i] - 1], colIndices[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = values[rowOffsets[j] + rowCounts[j] - 1] / values[rowOffsets[i] + rowCounts[i] - 1];\n for (size_t k = rowOffsets[j] + rowCounts[j] - 1; k >= rowOffsets[j]; k--) {\n values[k] -= factor * values[rowOffsets[i] + rowCounts[i] - 1 + k - rowOffsets[j]];\n colIndices[k] = colIndices[rowOffsets[i] + rowCounts[i] - 1 + k - rowOffsets[j]];\n }\n rowCounts[j] = rowCounts[i] - 1;\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = rowOffsets[i] + rowCounts[i] - 1; j < rowOffsets[i] + rowCounts[i]; j++) {\n x[i] -= values[j] * x[colIndices[j]];\n }\n x[i] /= values[rowOffsets[i] + rowCounts[i] - 1];\n }\n\n // Gather the solution on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N / size], N / size, MPI_DOUBLE, i, 0, comm, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N / size], N / size, MPI_DOUBLE, 0, 0, comm);\n }\n}\n", + "\n\n // Create a new MPI datatype for COOElement\n MPI_Datatype coo_element_type;\n MPI_Type_contiguous(3, MPI_DOUBLE, &coo_element_type);\n MPI_Type_commit(&coo_element_type);\n\n // Scatter the COO matrix and b vector to all ranks\n std::vector local_A(A.size() / MPI::COMM_WORLD.Get_size());\n std::vector local_b(b.size() / MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Scatter(A.data(), A.size(), coo_element_type, local_A.data(), local_A.size(), coo_element_type, 0);\n MPI::COMM_WORLD.Scatter(b.data(), b.size(), MPI_DOUBLE, local_b.data(), local_b.size(), MPI_DOUBLE, 0);\n\n // Solve the local linear system\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (auto& element : local_A) {\n if (element.row == i) {\n sum += element.value * local_b[element.column];\n }\n }\n local_x[i] = sum;\n }\n\n // Gather the local solutions to rank 0\n MPI::COMM_WORLD.Gather(local_x.data(), local_x.size(), MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, 
size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the number of elements per rank\n size_t num_elements_per_rank = A.size() / size;\n // Compute the starting index of the elements for this rank\n size_t start_index = rank * num_elements_per_rank;\n // Compute the ending index of the elements for this rank\n size_t end_index = (rank + 1) * num_elements_per_rank;\n\n // Create a local copy of the elements for this rank\n std::vector local_A(A.begin() + start_index, A.begin() + end_index);\n\n // Create a local copy of the vector b for this rank\n std::vector local_b(b.begin() + start_index, b.begin() + end_index);\n\n // Create a local copy of the vector x for this rank\n std::vector local_x(N);\n\n // Solve the local linear system\n for (size_t i = 0; i < local_A.size(); i++) {\n const COOElement& element = local_A[i];\n local_x[element.row] += element.value * local_b[element.column];\n }\n\n // Reduce the local solutions to the global solution on rank 0\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local row indices and values for this rank\n std::vector local_rows;\n std::vector local_values;\n for (auto const& element : A) {\n if (element.row / N == rank) {\n local_rows.push_back(element.row % N);\n local_values.push_back(element.value);\n }\n }\n\n // Create a sparse matrix in CSR format\n std::vector csr_row_offsets(size + 1);\n std::vector csr_column_indices(A.size());\n std::vector csr_values(A.size());\n MPI_Allgather(&local_rows[0], local_rows.size(), MPI_UNSIGNED_LONG, &csr_column_indices[0], local_rows.size(), MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n MPI_Allgather(&local_values[0], local_values.size(), MPI_DOUBLE, &csr_values[0], local_values.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n for (size_t i = 0; i < size; ++i) {\n csr_row_offsets[i + 1] = csr_row_offsets[i] + local_rows.size();\n }\n\n // Solve the linear system using the CSR matrix\n std::vector local_x(N);\n if (rank == 0) {\n x.resize(N);\n }\n sparse_solve(csr_row_offsets, csr_column_indices, csr_values, b, local_x);\n\n // Gather the solution from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the local number of rows and the local row indices\n size_t local_N = N / size;\n size_t start_row = rank * local_N;\n size_t end_row = (rank + 1) * local_N;\n\n // Compute the local contribution to the solution\n std::vector local_x(local_N);\n for (size_t i = start_row; i < end_row; ++i) {\n local_x[i - start_row] = b[i];\n for (COOElement const& e : A) {\n if (e.row == i) {\n local_x[i - start_row] -= e.value * x[e.column];\n }\n }\n local_x[i - start_row] /= A[i].value;\n }\n\n // Gather the local solutions into the global solution\n MPI_Gather(local_x.data(), local_N, MPI_DOUBLE, x.data(), local_N, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a vector object from the dense vector\n Eigen::VectorXd vector(N);\n for (size_t i = 0; i < N; ++i) {\n vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd result = 
matrix.ldlt().solve(vector);\n\n // Copy the result to the output vector\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = result(i);\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n int nrows = N / size + (rank < N % size);\n std::vector localA(A.begin() + rank * (N / size), A.begin() + rank * (N / size) + nrows);\n std::vector localb(b.begin() + rank * (N / size), b.begin() + rank * (N / size) + nrows);\n std::vector localx(nrows);\n \n // Solve the local system\n for (size_t i = 0; i < nrows; i++) {\n double sum = 0;\n for (size_t j = 0; j < localA.size(); j++) {\n if (localA[j].row == i) {\n sum += localA[j].value * localx[localA[j].column];\n }\n }\n localx[i] = (localb[i] - sum) / localA[i].value;\n }\n \n // Gather the local solutions\n MPI_Gather(localx.data(), nrows, MPI_DOUBLE, x.data(), nrows, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the local part of the matrix\n size_t local_size = A.size() / size;\n\n // Compute the local part of the matrix\n std::vector local_A(A.begin() + rank * local_size, A.begin() + (rank + 1) * local_size);\n\n // Compute the local part of the vector b\n std::vector local_b(b.begin() + rank * local_size, b.begin() + (rank + 1) * local_size);\n\n // Compute the local part of the solution vector\n std::vector local_x(N);\n\n // Solve the local linear system\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (auto& element : local_A) {\n if (element.column == i) {\n sum += element.value * local_x[element.row];\n }\n }\n local_x[i] = (local_b[i] - sum) / local_A[i].value;\n }\n\n // Gather the local solutions into the global solution vector\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the number of elements per rank\n size_t elementsPerRank = A.size() / size;\n \n // Compute the starting index of the elements for this rank\n size_t startIndex = rank * elementsPerRank;\n \n // Compute the ending index of the elements for this rank\n size_t endIndex = (rank + 1) * elementsPerRank;\n \n // Create a local copy of the elements for this rank\n std::vector localA(A.begin() + startIndex, A.begin() + endIndex);\n \n // Create a local copy of the vector b for this rank\n std::vector localb(b.begin() + startIndex, b.begin() + endIndex);\n \n // Create a local copy of the vector x for this rank\n std::vector localx(N);\n \n // Solve the local linear system\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < localA.size(); j++) {\n if (localA[j].column == i) {\n sum += localA[j].value * localb[j];\n }\n }\n localx[i] = sum;\n }\n \n // Gather the local solutions into the global solution\n MPI_Gather(localx.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_x(N, 0.0);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_x.data(), x.data(), N, 
MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the local part of the matrix\n size_t local_size = A.size() / size;\n size_t start_index = rank * local_size;\n size_t end_index = (rank + 1) * local_size;\n\n // Compute the local part of the matrix-vector product\n std::vector local_Ax(N, 0.0);\n for (size_t i = start_index; i < end_index; ++i) {\n local_Ax[A[i].column] += A[i].value * b[A[i].row];\n }\n\n // Reduce the local results to the global result\n MPI_Allreduce(local_Ax.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Rank 0 stores the final result\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Each rank computes its part of the result\n std::vector local_x(N);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n \n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format with sizeA elements. x and b are dense vectors with N elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\n__global__ void solveLinearSystem(const COOElement *A, size_t sizeA, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = tid / N;\n int col = tid % N;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column == col) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[tid].value;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n // Each thread computes one element of x.\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = A[row].column[col]; k < A[row].column[col + 1]; ++k) {\n sum += A[row].value[k] * x[A[row].column[k]];\n }\n b[row] = (b[row] - sum) / A[row].value[A[row].column[col + 1] - 1];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n 
x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n double sum = 0;\n for (int k = 0; k < size; ++k) {\n if (A[k].row == row && A[k].column == col) {\n sum += A[k].value * x[A[k].column];\n }\n }\n x[row] = (b[row] - sum) / A[size - 1].value;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int tid_offset = tid + bid * blockDim.x;\n if (tid_offset < sizeA) {\n size_t row = A[tid_offset].row;\n size_t column = A[tid_offset].column;\n double value = A[tid_offset].value;\n atomicAdd(&x[column], value * b[row]);\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = tid + bid * blockDim.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "&", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the products of A and b for each row\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value * b[A[i].column];\n }\n\n // Gather the row sums to rank 0\n std::vector globalRowSums(N);\n MPI_Allgather(rowSums.data(), N, MPI_DOUBLE, globalRowSums.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = globalRowSums[i];\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a dense vector for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; ++i) {\n b_eigen(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_eigen = matrix.ldlt().solve(b_eigen);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_eigen(i);\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix A_sparse(N, N);\n for (auto const& e : A) {\n A_sparse.insert(e.row, e.column) = e.value;\n }\n A_sparse.makeCompressed();\n\n // Create dense vectors for x and b\n Eigen::VectorXd x_dense(N), b_dense(N);\n for (size_t i = 0; i < N; ++i) {\n x_dense[i] = x[i];\n b_dense[i] = b[i];\n }\n\n // Solve the linear system using Eigen\n x_dense = A_sparse.colPivHouseholderQr().solve(b_dense);\n\n // Copy the result back to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_dense[i];\n }\n }\n}\n", + "\n\n // Get the number of processes and the rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per process\n int elementsPerProcess = N / size;\n\n // Create a buffer to store the local part of A and b\n std::vector localA(elementsPerProcess);\n std::vector localb(elementsPerProcess);\n\n // Scatter A and b to all processes\n MPI_Scatter(A.data(), elementsPerProcess, MPI_DOUBLE, localA.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(b.data(), elementsPerProcess, MPI_DOUBLE, localb.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Create a local copy of x\n std::vector localx(N, 0.0);\n\n // Perform the Gaussian elimination in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n // Find the pivot element\n int pivotRow = i;\n double maxPivot = std::abs(localA[i].value);\n for (int j = i + 1; j < elementsPerProcess; j++) {\n if (std::abs(localA[j].value) > maxPivot) {\n pivotRow = j;\n maxPivot = std::abs(localA[j].value);\n }\n }\n\n // Swap the pivot row with the first row\n std::swap(localA[i], localA[pivotRow]);\n std::swap(localb[i], localb[pivotRow]);\n\n // Eliminate the non-zero elements below the pivot\n for (int j = i + 1; j < elementsPerProcess; j++) {\n double factor = localA[j].value / localA[i].value;\n localA[j].value -= 
factor * localA[i].value;\n localb[j] -= factor * localb[i];\n }\n }\n\n // Gather the local solutions into a global solution\n MPI_Gather(localx.data(), elementsPerProcess, MPI_DOUBLE, x.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform the back substitution to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (int j = i + 1; j < N; j++) {\n sum += localA[j].value * x[j];\n }\n x[i] = (localb[i] - sum) / localA[i].value;\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the local contribution of each rank to the solution\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column];\n }\n\n // Reduce the local contributions to the global solution on rank 0\n MPI_Reduce(x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0);\n\n // Compute the sum of the products of A and x for each row\n std::vector rowSums(N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value * x[A[i].column];\n }\n\n // Gather the row sums to rank 0\n std::vector globalRowSums(N);\n MPI_Allgather(rowSums.data(), N, MPI_DOUBLE, globalRowSums.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Solve the system of equations on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = (b[i] - globalRowSums[i]) / A[i].value;\n }\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Iterate over the elements of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement& element = A[i];\n x[element.row] += element.value * b[element.column];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(&rowSums[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n\n // Gather x from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "<}", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Step 1: Distribute the rows of A and b to each rank\n std::vector localA;\n std::vector localb;\n distributeRows(A, b, localA, localb, rank, size, N);\n\n // Step 2: Solve the local system on each rank\n std::vector localx(N, 0.0);\n for (auto& element : localA) {\n localx[element.row] += element.value * localb[element.column];\n }\n\n // Step 3: Gather the local solutions to rank 0\n MPI_Gather(localx.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(rowSums.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if 
(MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n}\n", + "\n\n // Initialize x to 0\n x.resize(N);\n std::fill(x.begin(), x.end(), 0.0);\n\n // Perform Gauss-Seidel iteration\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n x[i] = (b[i] - sum) / A[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.resize(N);\n std::fill(x.begin(), x.end(), 0.0);\n\n // Perform Gauss-Jordan elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i].value) > std::abs(A[pivot_row * N + i].value)) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the ith row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the ith column\n double pivot = A[i * N + i].value;\n for (size_t j = 0; j < N; ++j) {\n if (j != i) {\n double factor = A[j * N + i].value / pivot;\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k].value -= factor * A[i * N + k].value;\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j].value * x[j];\n }\n x[i] /= A[i * N + i].value;\n }\n\n // Gather the solution on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n MPI::COMM_WORLD.Gather(&x[0], N, MPI::DOUBLE, &x[0], N, MPI::DOUBLE, 0);\n } else {\n MPI::COMM_WORLD.Gather(&x[0], N, MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n\n // Create a dense vector for b\n Eigen::VectorXd b_vector(N);\n for (size_t i = 0; i < N; ++i) {\n b_vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_vector = matrix.ldlt().solve(b_vector);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_vector(i);\n }\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0);\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] += element.value * b[element.column];\n }\n }\n }\n\n // Gather the results to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the local contribution of each rank to the solution\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n x[e.row] += e.value * b[e.column];\n }\n\n // Reduce the local contributions to the global solution on rank 0\n MPI_Reduce(x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix A_sparse(N, N);\n for (auto const& element : A) {\n A_sparse.insert(element.row, 
element.column) = element.value;\n }\n\n // Create dense vectors for x and b\n Eigen::VectorXd x_dense(N), b_dense(N);\n for (size_t i = 0; i < N; ++i) {\n x_dense[i] = x[i];\n b_dense[i] = b[i];\n }\n\n // Solve the linear system\n Eigen::SimplicialLDLT> solver;\n solver.compute(A_sparse);\n x_dense = solver.solve(b_dense);\n\n // Copy the result back to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_dense[i];\n }\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(rowSums.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n\n // Gather x from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Perform Gauss-Jordan elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i].value) > std::abs(A[pivot_row * N + i].value)) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j].value, A[pivot_row * N + j].value);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n double pivot = A[i * N + i].value;\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j].value /= pivot;\n }\n b[i] /= pivot;\n for (size_t j = 0; j < N; ++j) {\n if (j != i) {\n double factor = A[j * N + i].value;\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k].value -= factor * A[i * N + k].value;\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(Kokkos::View &A, Kokkos::View &L, Kokkos::View &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Iterate over the rows of A\n Kokkos::parallel_for(A.extent(0), KOKKOS_LAMBDA(size_t i) {\n // Get the row of A\n const COOElement* row = &A(i);\n\n // Iterate over the columns of the row\n for (size_t j = 0; j < row->column; ++j) {\n // Get the value of the element\n double value = row->value;\n\n // Update L and U\n for (size_t k = 0; k < i; ++k) {\n L(i, k) -= value * U(k, j);\n }\n U(i, j) = value;\n }\n });\n}\n", + "\n\n // Create a parallel for loop to iterate over the rows of A\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n // Initialize the ith row of L and U to the identity matrix\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n\n // Iterate over the non-zero elements in the ith row of A\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n // Update the ith row of L and U\n L(i, A(j).column) = A(j).value;\n U(i, A(j).column) = A(j).value;\n\n // Iterate over the rows below the ith row\n for (size_t k = i + 1; k < N; ++k) {\n // Update the kth row of L and U\n L(k, A(j).column) -= A(j).value * L(k, i);\n U(k, A(j).column) -= A(j).value * U(k, i);\n }\n }\n }\n });\n}\n", + "\n\n // Initialize L and U\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = A(i, i).value;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n L(i, k) = A(i, k).value / U(k, k);\n for (size_t j = k + 1; j < N; ++j) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Iterate over the rows of A\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n // Iterate over the columns of A\n for (size_t j = 0; j < N; j++) {\n // If A(i, j) is non-zero, then update L and U\n if (A(i, j).value != 0.0) {\n // Update L\n L(i, j) = A(i, j).value / U(j, j);\n\n // Update U\n for (size_t k = j + 1; k < N; k++) {\n U(j, k) -= L(i, j) * A(i, k).value;\n }\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 
0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy>(0, N), KOKKOS_LAMBDA(const int &i, const int &j) {\n if (i == j) {\n L(i, j) = 1.0;\n U(i, j) = A(i, j);\n } else if (i > j) {\n L(i, j) = A(i, j) / U(j, j);\n U(i, j) = 0.0;\n } else {\n U(i, j) = A(i, j) / L(i, i);\n L(i, j) = 0.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n L(j, i) -= L(j, k) * U(k, i);\n }\n L(j, i) /= U(i, i);\n }\n });\n}\n", + "\n\n // Create a copy of A\n auto Acopy = KokKOS::make_view(A);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n Acopy(j, i) -= Acopy(j, i) * Acopy(i, i);\n }\n }\n\n // Store L and U\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n L(i, j) = 1.0;\n } else if (i < j) {\n L(i, j) = Acopy(i, j);\n } else {\n U(i, j) = Acopy(i, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0.0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n L(i, j) -= L(i, k) * U(k, j);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n U(i, j) /= L(i, i);\n }\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n U(i, j) += A(i, j).value;\n } else {\n L(i, j) += A(i, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0;\n U(i, j) = 0;\n }\n L(i, i) = 1;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), 
KOKKOS_LAMBDA(const Kokkos::TeamPolicy::member_type &team) {\n const size_t i = team.league_rank();\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < i; k++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n if (i == j) {\n L(i, i) = 1 / U(i, i);\n } else {\n L(i, j) = -U(i, j) * L(j, j);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L(i, k) = A(i, k) / U(k, k);\n for (size_t j = k + 1; j < N; j++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA(const size_t k) {\n const auto &e = A(k);\n const double a_kk = U(e.row, e.row);\n const double l_ik = L(e.column, e.row);\n U(e.row, e.column) = e.value - l_ik * a_kk;\n for (size_t i = e.row + 1; i < N; i++) {\n const double l_ik = L(e.column, e.row);\n const double a_ki = U(e.row, i);\n L(i, e.column) = (e.value - a_ki * a_kk) / l_ik;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t row) {\n double sumL = 0.0;\n double sumU = 0.0;\n for (size_t i = 0; i < row; i++) {\n sumL += L(row, i) * U(i, row);\n sumU += L(i, row) * U(row, i);\n }\n L(row, row) = 1.0;\n U(row, row) = A(row, row) - sumU;\n for (size_t col = row + 1; col < N; col++) {\n L(row, col) = (A(row, col) - sumL) / U(row, row);\n U(row, col) = A(row, col) - sumU;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA(const int& k) {\n const COOElement& element = A(k);\n const size_t i = element.row;\n const size_t j = element.column;\n const double value = element.value;\n U(i, j) = value;\n for (size_t p = 0; p < i; p++) {\n L(i, p) += value * L(j, p);\n U(j, p) -= value * U(i, p);\n }\n L(i, i) -= value * L(j, i);\n U(j, j) += value * U(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; ++j) {\n double sum = 0;\n for (size_t k = 0; k < j; ++k) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n 
Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U using the k-th row of A\n for (size_t i = k + 1; i < N; i++) {\n L(i, k) = A(i, k) / U(k, k);\n for (size_t j = k + 1; j < N; j++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sumL = 0.0, sumU = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization in parallel\n #pragma omp parallel for\n 
for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k].value / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * U[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n U[i * N + j] = A[i * N + j] - L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n #pragma opm parallel for\n for (size_t k = 0; k < A.size(); ++k) {\n size_t row = A[k].row;\n size_t column = A[k].column;\n double value = A[k].value;\n #pragma opm atomic\n L[row * N + column] += value;\n #pragma opm atomic\n U[column * N + row] += value;\n }\n #pragma opm parallel for\n for (size_t i = 0; i < N; ++i) {\n #pragma opm atomic\n L[i * N + i] = 1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "<=====,", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n if (row == column)\n L[row * N + column] = 1;\n if (row < column)\n U[row * N + column] = value;\n if (row > column)\n L[row * N + column] = value;\n }\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = k + 1; j < N; ++j) {\n for 
(size_t i = 0; i < k; ++i) {\n U[k * N + j] -= L[k * N + i] * U[i * N + j];\n }\n }\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = 0; j < k; ++j) {\n L[i * N + k] -= U[i * N + j] * L[j * N + k];\n }\n L[i * N + k] /= U[k * N + k];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement &e = A[i];\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1.0;\n U[e.row * N + e.column] = e.value;\n } else if (e.row < e.column) {\n L[e.row * N + e.column] = e.value / U[e.row * N + e.row];\n U[e.column * N + e.row] = 0.0;\n } else {\n U[e.row * N + e.column] = e.value;\n L[e.column * N + e.row] = 0.0;\n }\n }\n\n #pragma omp parallel for\n for (size_t k = 1; k < N; k++) {\n for (size_t i = k; i < N; i++) {\n for (size_t j = 0; j < k; j++) {\n U[i * N + k] -= L[i * N + j] * U[j * N + k];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n L[k * N + j] -= L[k * N + k] * U[k * N + j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sumL = 0.0, sumU = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n U[i * N + j] = A[i * N + j] - L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L[i * N + k] * U[k * N + j];\n }\n if (i == j) {\n L[i * N + i] = 1;\n U[i * N + i] = A[i].value - sum;\n } else {\n L[i * N + j] = (A[i].value - sum) / U[j * N + j];\n U[i * N + j] = 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform the factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n }\n\n // Update U\n for (size_t j = k + 1; j < N; j++) {\n for (size_t i = k + 1; i < N; i++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) 
{\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n A[i * N + j] -= U[k * N + j] * A[i * N + k];\n L[i * N + j] = A[i * N + j] / L[k * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n double sum = 0;\n for (size_t k = 0; k < j; ++k) {\n sum += L[i][k] * U[k][j];\n }\n L[i][j] = (A[i][j] - sum) / U[j][j];\n }\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[i][k] * U[k][i];\n }\n U[i][i] = A[i][i] - sum;\n for (size_t j = i + 1; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[j][k] * U[k][j];\n }\n U[i][j] = (A[i][j] - sum) / L[i][i];\n }\n }\n\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n A[i * N + j] -= U[k * N + j] * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.emplace_back(e.row, e.column, 1);\n U.emplace_back(e.row, e.column, e.value);\n } else {\n L.emplace_back(e.row, e.column, e.value / rowSum[e.row]);\n U.emplace_back(e.row, e.column, 0);\n }\n }\n for (size_t i = 1; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n for (size_t k = 0; k < N; k++) {\n U[i * N + k] -= L[i * N + j] * U[j * N + k];\n }\n }\n }\n}\n", + "\n\n std::vector row(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1;\n U[e.row * N + e.column] = 
e.value;\n } else {\n row[e.column] = e.value / U[e.row * N + e.row];\n for (size_t i = e.column; i < N; ++i) {\n L[e.row * N + i] = row[i];\n U[e.row * N + i] -= row[i] * U[e.column * N + i];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& e : A) {\n L[e.row * N + e.column] = (e.row == e.column) ? 1 : 0;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = 0; i < k; ++i) {\n U[k * N + j] -= L[k * N + i] * U[i * N + j];\n }\n L[j * N + k] = U[j * N + k] / U[k * N + k];\n for (size_t i = k + 1; i < N; ++i) {\n U[j * N + i] -= L[j * N + k] * U[k * N + i];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& element : A) {\n L[element.row * N + element.column] = element.value;\n U[element.row * N + element.column] = element.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (L[i * N + j] != 0) {\n double factor = L[i * N + j] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= factor * L[i * N + k];\n U[i * N + k] -= factor * U[j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n L[i * N + j] = 1;\n } else if (i > j) {\n L[i * N + j] = L_temp[i][j] / L_temp[j][j];\n for (size_t k = 0; k < N; k++) {\n U_temp[i][k] -= L[i * N + j] * U_temp[j][k];\n }\n } else {\n U[i * N + j] = U_temp[i][j];\n }\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.push_back(1);\n U.push_back(e.value / rowSum[e.row]);\n } else if (e.row < e.column) {\n L.push_back(0);\n U.push_back(e.value / rowSum[e.row]);\n } else {\n L.push_back(e.value / rowSum[e.column]);\n U.push_back(0);\n }\n }\n}\n", + "\n\n // Initialize L and U to be identity matrices\n L.resize(N * N);\n U.resize(N * N);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n U[i * N + i] = 1;\n }\n\n // Iterate over the elements of A\n for (auto const& element : A) {\n // Update L and U\n for (size_t i = 0; i < element.row; i++) {\n L[element.row * N + i] -= L[element.column * N + i] * element.value;\n }\n for (size_t j = element.column + 1; j < N; j++) {\n U[element.row * N + j] -= U[element.column * N + j] * element.value;\n }\n }\n}\n", + "\n\n std::vector> row_ptr(N + 1);\n std::vector col_idx(A.size());\n std::vector values(A.size());\n for (size_t i = 0; i < A.size(); i++) {\n row_ptr[A[i].row + 1]++;\n col_idx[i] = A[i].column;\n values[i] = A[i].value;\n }\n for (size_t i = 1; i <= N; i++) {\n row_ptr[i] += row_ptr[i - 1];\n }\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < A.size(); i++) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double value = A[i].value;\n U[row_ptr[row] + col - 1] = value;\n for (size_t j = row_ptr[row]; j < row_ptr[row + 1]; j++) {\n if (col_idx[j] < col) {\n L[row_ptr[col] + col_idx[j] - 1] += value * U[row_ptr[row] + col_idx[j] - 1];\n } else if (col_idx[j] > col) {\n U[row_ptr[row] + col_idx[j] - 1] -= value * L[row_ptr[col] + col_idx[j] - 1];\n }\n }\n }\n for (size_t i = 0; i < N; i++) {\n L[row_ptr[i] + i - 1] = 1;\n 
}\n}\n", + "\n\n std::vector L_row, L_col, U_row, U_col;\n std::vector L_val, U_val;\n L_row.reserve(A.size());\n L_col.reserve(A.size());\n L_val.reserve(A.size());\n U_row.reserve(A.size());\n U_col.reserve(A.size());\n U_val.reserve(A.size());\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < A.size(); k++) {\n if (A[k].row == i && A[k].column == j) {\n sumU = A[k].value;\n U_row.push_back(i);\n U_col.push_back(j);\n U_val.push_back(sumU);\n }\n if (A[k].row == i && A[k].column < j) {\n sumL += A[k].value;\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(sumL);\n }\n if (A[k].row < i && A[k].column == j) {\n sumL += A[k].value;\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(sumL);\n }\n }\n if (sumU == 0) {\n U_row.push_back(i);\n U_col.push_back(j);\n U_val.push_back(0);\n }\n if (sumL == 0) {\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(0);\n }\n }\n }\n L.resize(L_row.size());\n U.resize(U_row.size());\n for (size_t i = 0; i < L_row.size(); i++) {\n L[i].row = L_row[i];\n L[i].column = L_col[i];\n L[i].value = L_val[i];\n }\n for (size_t i = 0; i < U_row.size(); i++) {\n U[i].row = U_row[i];\n U[i].column = U_col[i];\n U[i].value = U_val[i];\n }\n}\n", + "\n\n std::vector> L_matrix(N, std::vector(N, 0.0));\n std::vector> U_matrix(N, std::vector(N, 0.0));\n\n for (auto const& element : A) {\n L_matrix[element.row][element.column] = element.value;\n U_matrix[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n L_matrix[i][j] = 1.0;\n } else if (i > j) {\n L_matrix[i][j] = 0.0;\n } else {\n U_matrix[i][j] = 0.0;\n }\n }\n }\n\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n double factor = L_matrix[i][k] / L_matrix[k][k];\n for (size_t j = k; j < N; ++j) {\n L_matrix[i][j] -= factor * U_matrix[k][j];\n U_matrix[i][j] -= factor * L_matrix[k][j];\n }\n }\n }\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n L[i * N + j] = L_matrix[i][j];\n U[i * N + j] = U_matrix[i][j];\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n\n for (size_t i = 0; i < A.size(); i++) {\n L[A[i].row * N + A[i].column] = A[i].value;\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = L[i * N + k] / L[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] = U[i * N + j] - L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector row(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.emplace_back(COOElement{e.row, e.column, 1});\n row[e.column] = e.value;\n } else {\n U.emplace_back(COOElement{e.row, e.column, e.value});\n }\n }\n for (auto const& e : A) {\n if (e.row != e.column) {\n L.emplace_back(COOElement{e.row, e.column, e.value / row[e.column]});\n for (auto& u : U) {\n if (u.row == e.row && u.column == e.column) {\n u.value -= e.value * L.back().value;\n }\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n U[row * N + column] = value;\n for (size_t j = 0; j < row; ++j) {\n L[row * N + j] -= L[i * N + j] * value / U[j * N + j];\n }\n for 
(size_t j = row + 1; j < N; ++j) {\n U[row * N + j] -= L[i * N + j] * value / U[j * N + j];\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n for (size_t i = 0; i < N; ++i) {\n L_temp[i][i] = 1;\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n L_temp[i][k] = L_temp[i][k] / L_temp[k][k];\n for (size_t j = k + 1; j < N; ++j) {\n U_temp[k][j] = U_temp[k][j] - L_temp[i][k] * U_temp[i][j];\n }\n }\n }\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n L.push_back(L_temp[i][j]);\n U.push_back(U_temp[i][j]);\n }\n }\n}\n", + "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n rowSum[e.row] += e.value;\n }\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.push_back({e.row, e.column, 1});\n U.push_back({e.row, e.column, e.value / rowSum[e.row]});\n } else if (e.row < e.column) {\n L.push_back({e.row, e.column, e.value / rowSum[e.row]});\n U.push_back({e.row, e.column, 0});\n } else {\n L.push_back({e.row, e.column, 0});\n U.push_back({e.row, e.column, e.value});\n }\n }\n}\n", + "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n L_temp[i][j] = 1;\n } else if (i > j) {\n L_temp[i][j] = 0;\n }\n }\n }\n\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n double factor = U_temp[i][k] / U_temp[k][k];\n L_temp[i][k] = factor;\n for (size_t j = k; j < N; j++) {\n U_temp[i][j] -= factor * U_temp[k][j];\n }\n }\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n L.push_back(L_temp[i][j]);\n U.push_back(U_temp[i][j]);\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto &e : A) {\n L[e.row * N + e.column] = e.value;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (L[j * N + i] != 0) {\n double factor = L[j * N + i] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= factor * L[i * N + k];\n U[i * N + k] -= factor * U[j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n std::vector rowSum(N, 0);\n for (auto const& element : A) {\n rowSum[element.row] += element.value;\n }\n\n for (auto const& element : A) {\n if (element.row == element.column) {\n L.push_back(1);\n U.push_back(element.value / rowSum[element.row]);\n } else if (element.row < element.column) {\n L.push_back(0);\n U.push_back(element.value / rowSum[element.row]);\n } else {\n L.push_back(element.value / rowSum[element.column]);\n U.push_back(0);\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& element : A) {\n L[element.row * N + element.column] = element.value;\n U[element.row * N + element.column] = element.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = L[j * N + i] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= multiplier * L[i * N + k];\n U[i * N + k] -= multiplier * U[j * N + k];\n }\n }\n }\n}\n", + "\n\n L.resize(N * 
N, 0.0);\n U.resize(N * N, 0.0);\n for (auto const& e : A) {\n L[e.row * N + e.column] = e.value;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n L[j * N + i] = U[j * N + i] / U[i * N + i];\n for (size_t k = i; k < N; k++) {\n U[j * N + k] -= L[j * N + i] * U[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format with sizeA elements.\n Use AMD HIP to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\n__global__ void luFactorize(const COOElement *A, size_t sizeA, double *L, double *U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[row * N + col];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n COOElement element = A[tid];\n int row = element.row;\n int col = element.column;\n double val = element.value;\n\n if (row == col) {\n L[row * N + row] = 1.0;\n U[row * N + col] = val;\n } else if (row < col) {\n L[row * N + col] = val / U[col * N + col];\n U[row * N + col] = 0.0;\n } else {\n U[row * N + col] -= L[row * N + row] * val;\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x;\n int start = tid + bid * num_threads;\n for (int i = start; i < sizeA; i += bid_size * num_threads) {\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = A[i].value / U[A[i].column];\n U[i] = 0.0;\n }\n }\n __syncthreads();\n for (int i = start; i < sizeA; i += bid_size * num_threads) {\n if (A[i].row > A[i].column) {\n U[i] -= L[i] * U[A[i].column];\n } else {\n L[i] -= L[i] * U[A[i].column];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n COOElement e = A[i];\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1.0;\n U[e.row * N + e.column] = e.value;\n for (int k = e.row + 1; k < N; k++) {\n L[k * N + e.column] = A[i].value / e.value;\n U[e.row * N + k] -= L[k * N + e.column] * e.value;\n }\n }\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1;\n } else {\n U[row * N + column] = value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n for (size_t i = tid; i < sizeA; i += stride) {\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = 0.0;\n U[i] = 0.0;\n }\n }\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n for (size_t j = tid; j < sizeA; j += stride) {\n if (A[j].row == i && A[j].column == k) {\n L[j] = A[j].value / U[k];\n U[j] = 0.0;\n }\n }\n for (size_t j = tid; 
j < sizeA; j += stride) {\n if (A[j].row == i && A[j].column > k) {\n U[j] -= L[j] * U[A[j].column];\n }\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else {\n L[tid] = 0.0;\n U[tid] = 0.0;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t i = A[tid].row;\n size_t j = A[tid].column;\n double v = A[tid].value;\n\n if (i == j) {\n L[i * N + i] = 1.0;\n U[i * N + j] = v;\n } else if (i < j) {\n L[i * N + j] = v;\n } else {\n U[i * N + j] = v;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[col * N + row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n\n if (row == column) {\n L[row * N + row] = 1.0;\n U[row * N + column] = value;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n U[row * N + column] = 0.0;\n } else {\n U[row * N + column] -= L[row * N + row] * value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n if (A[tid].row == A[tid].column) {\n L[tid] = 1;\n U[tid] = A[tid].value;\n } else if (A[tid].row < A[tid].column) {\n L[tid] = 0;\n U[tid] = A[tid].value;\n } else {\n L[tid] = A[tid].value / U[A[tid].column];\n U[tid] = 0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n const COOElement &a = A[tid];\n if (a.row == a.column) {\n L[a.row * N + a.column] = 1.0;\n U[a.row * N + a.column] = a.value;\n } else if (a.row < a.column) {\n L[a.row * N + a.column] = 0.0;\n U[a.row * N + a.column] = a.value;\n } else {\n L[a.row * N + a.column] = a.value / U[a.column * N + a.column];\n U[a.row * N + a.column] = 0.0;\n for (int k = a.column + 1; k < a.row; k++) {\n U[a.row * N + k] -= L[a.row * N + a.column] * U[a.column * N + k];\n }\n }\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n size_t col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row].value;\n } else if (row < col) {\n L[row * N + col] = 0;\n U[row * N + col] = A[row].value;\n } else {\n L[row * N + col] = A[row].value / U[col * N + col];\n U[row * N + col] = 0;\n }\n }\n}\n", + "\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= sizeA) return;\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = 0.0;\n U[i] = 0.0;\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n U[row * N + col] = 0.0;\n }\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column == col) {\n U[row * N + col] = A[i].value;\n } else if (A[i].row == row) {\n L[row * N + col] -= A[i].value * U[A[i].column * N + col];\n } else if (A[i].column == col) {\n U[row * N + col] -= A[i].value * L[row * N + A[i].row];\n }\n }\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row 
== col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n }\n U[row * N + col] = A[row * N + col].value;\n }\n __syncthreads();\n for (int k = 0; k < row; k++) {\n if (U[k * N + col] != 0.0) {\n L[row * N + k] = A[row * N + k].value / U[k * N + col];\n for (int j = col; j < N; j++) {\n U[row * N + j] -= L[row * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n if (A[tid].row == A[tid].column) {\n L[tid * N + A[tid].column] = 1;\n U[tid * N + A[tid].column] = A[tid].value;\n } else {\n L[tid * N + A[tid].column] = 0;\n U[tid * N + A[tid].column] = A[tid].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int i = A[tid].row;\n int j = A[tid].column;\n double v = A[tid].value;\n if (i == j) {\n L[i * N + i] = 1;\n U[i * N + i] = v;\n } else if (i < j) {\n L[i * N + j] = v / U[j * N + j];\n U[i * N + j] = 0;\n } else {\n U[i * N + j] -= L[i * N + i] * v;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) {\n return;\n }\n const COOElement &a = A[tid];\n if (a.row == a.column) {\n L[a.row * N + a.column] = 1.0;\n U[a.row * N + a.column] = a.value;\n } else if (a.row > a.column) {\n L[a.row * N + a.column] = a.value / U[a.column * N + a.column];\n U[a.row * N + a.column] = 0.0;\n } else {\n U[a.row * N + a.column] = a.value;\n L[a.row * N + a.column] = 0.0;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else if (row < column) {\n L[tid] = 0.0;\n U[tid] = value;\n } else {\n L[tid] = value / U[column * N + column];\n U[tid] = 0.0;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
Store the result in L and U on rank 0.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Find the pivot element in column k\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k * N + k].value);\n for (size_t i = k + 1; i < N; ++i) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows k and pivot_row in L and U\n for (size_t j = 0; j < N; ++j) {\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n std::swap(U[k * N + j], U[pivot_row * N + j]);\n }\n\n // Update the remaining rows in column k\n for (size_t i = k + 1; i < N; ++i) {\n double multiplier = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j].value -= multiplier * A[k * N + j].value;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector L_recv(N * N, 0.0);\n std::vector U_recv(N * N, 0.0);\n MPI::COMM_WORLD.Recv(L_recv.data(), N * N, MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), N * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; ++j) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), N * N, MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector L_recv, U_recv;\n MPI::COMM_WORLD.Recv(L_recv.data(), L_recv.size(), MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), U_recv.size(), MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; j++) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), L.size(), MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), U.size(), MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Compute L and U in parallel\n for (size_t i = 0; i < N; ++i) {\n // Compute L(i, :)\n for (size_t j = 0; j < i; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < j; ++k) {\n sum += L[i * N + k] * U[k * N + j];\n }\n L[i * N + j] = (A[i].value - sum) / U[j * N + j];\n }\n\n // Compute U(i, :)\n for (size_t j = i; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[i * N + k] * U[k * N + j];\n }\n U[i * N + j] = A[i].value - sum;\n }\n }\n\n // Gather L and U on rank 0\n if (MPI::COMM_WORLD.Get_rank() 
== 0) {\n MPI::COMM_WORLD.Gather(&L[0], N * N, MPI::DOUBLE, &L[0], N * N, MPI::DOUBLE, 0);\n MPI::COMM_WORLD.Gather(&U[0], N * N, MPI::DOUBLE, &U[0], N * N, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j].value, A[pivot_row * N + j].value);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Eliminate the elements below the pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = factor;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= factor * A[k * N + j].value;\n }\n }\n }\n\n // Copy the upper triangular matrix to U\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i; j < N; j++) {\n U[i * N + j] = A[i * N + j].value;\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A and store the result in L\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k; j < N; ++j) {\n L[i * N + k] -= U[i * N + j] * U[k * N + j];\n }\n }\n\n // For each column j > k, subtract the k-th column of L from the j-th column of A and store the result in U\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = k; i < N; ++i) {\n U[k * N + j] -= L[i * N + k] * U[i * N + j];\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A\n for (size_t i = k + 1; i < N; i++) {\n double factor = U[i * N + k] / U[k * N + k];\n for (size_t j = k; j < N; j++) {\n U[i * N + j] -= factor * U[k * N + j];\n }\n L[i * N + k] = factor;\n }\n }\n\n // Gather the L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n std::vector L(A);\n std::vector U(A);\n\n // Perform Gaussian elimination\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the pivot row\n MPI_Bcast(&A[k], sizeof(COOElement) * A[k].size(), MPI_BYTE, k, 
MPI_COMM_WORLD);\n\n // Eliminate the pivot column in the remaining rows\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n double factor = A[i][k].value / A[k][k].value;\n for (size_t j = k; j < N; ++j) {\n A[i][j].value -= factor * A[k][j].value;\n }\n }\n\n // Update L and U\n L[k].value = 1.0;\n for (size_t j = k + 1; j < N; ++j) {\n L[j].value = A[j][k].value;\n U[k].value = A[k][j].value;\n }\n }\n\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A\n for (size_t i = k + 1; i < N; ++i) {\n double factor = U[i * N + k] / U[k * N + k];\n for (size_t j = k; j < N; ++j) {\n U[i * N + j] -= factor * U[k * N + j];\n }\n L[i * N + k] = factor;\n }\n }\n\n // Gather the L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot row\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (A[i * N + k] > A[pivot_row * N + k]) {\n pivot_row = i;\n }\n }\n\n // Swap the pivot row with the k-th row in L and U\n for (size_t j = 0; j < N; j++) {\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n std::swap(U[k * N + j], U[pivot_row * N + j]);\n }\n\n // Eliminate the k-th column in the remaining rows\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n L[i * N + k] = factor;\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(A.size());\n U.resize(A.size());\n for (size_t i = 0; i < A.size(); ++i) {\n L[i].row = A[i].row;\n L[i].column = A[i].column;\n L[i].value = 0;\n U[i].row = A[i].row;\n U[i].column = A[i].column;\n U[i].value = 
0;\n }\n\n // Perform LU factorization\n for (size_t i = 0; i < N; ++i) {\n // Update L\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i && A[j].column < i) {\n L[j].value = A[j].value / U[j].value;\n }\n }\n\n // Update U\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row < i && A[j].column == i) {\n U[j].value = A[j].value / L[j].value;\n }\n }\n\n // Update A\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row >= i && A[j].column >= i) {\n A[j].value -= L[j].value * U[j].value;\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recvL(A.size());\n std::vector recvU(A.size());\n MPI::COMM_WORLD.Recv(recvL.data(), A.size(), MPI_DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(recvU.data(), A.size(), MPI_DOUBLE, i, 0);\n for (size_t j = 0; j < A.size(); ++j) {\n L[j].value += recvL[j].value;\n U[j].value += recvU[j].value;\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), A.size(), MPI_DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), A.size(), MPI_DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < A.size(); i++) {\n if (std::abs(A[i].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i].value);\n }\n }\n\n // Swap the pivot row with the k-th row in A\n std::swap(A[k], A[pivot_row]);\n\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n double multiplier = A[i].value / A[k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; j++) {\n U[k * N + j] -= multiplier * A[i].value;\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n\n // Initialize the diagonal of L to 1\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Iterate over the rows of A\n for (size_t k = 0; k < A.size(); k++) {\n // Get the row, column, and value of the current element\n size_t row = A[k].row;\n size_t column = A[k].column;\n double value = A[k].value;\n\n // Update the elements of L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = value / U[k * N + k];\n U[i * N + column] -= L[i * N + k] * U[k * N + column];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i > k\n for (size_t i = k + 1; i < N; i++) {\n // Calculate the multiplier\n double multiplier = A[i].value / U[k * N + k];\n\n // Update the i-th row of L\n for (size_t j = 0; j < N; j++) {\n L[i * N + j] += multiplier * U[k * N + j];\n }\n\n // Update the i-th row of U\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= multiplier * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform 
LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = A[k].value;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > std::abs(pivot_value)) {\n pivot_row = i;\n pivot_value = A[i * N + k].value;\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j].value, A[pivot_row * N + j].value);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Eliminate the elements below the pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k].value / pivot_value;\n L[i * N + k] = factor;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= factor * A[k * N + j].value;\n U[i * N + j] = A[i * N + j].value;\n }\n }\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector L_recv(N * N, 0.0);\n std::vector U_recv(N * N, 0.0);\n MPI::COMM_WORLD.Recv(L_recv.data(), N * N, MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), N * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; j++) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), N * N, MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each rank, update the k-th row of L and the remaining rows of U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n\n // Gather the updated L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each rank, update the k-th row of L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "<,", + "_t* 0." + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format with sizeA elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\n__global__ void luFactorize(const COOElement *A, size_t sizeA, double *L, double *U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[row * N + col].value;\n for (int k = 0; k < row; k++) {\n U[row * N + col] -= L[row * N + k] * U[k * N + col];\n }\n L[row * N + col] = U[row * N + col] / U[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < i; j++) {\n sum += L[row * N + j] * U[j * N + A[i].column];\n }\n U[row * N + A[i].column] = A[i].value - sum;\n }\n if (A[i].column == row) {\n L[row * N + A[i].row] = A[i].value;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else if (row < column) {\n L[tid] = 0.0;\n U[tid] = value;\n } else {\n L[tid] = value / U[column * N + column];\n U[tid] = 0.0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + column] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) {\n return;\n }\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n }\n }\n}\n", + "\n\n int row = block.y * block.x + thread.y * N + thread.x;\n if (row < sizeA) {\n if (A[row].row == A[row].column) {\n L[A[row].row * N + A[row].column] = 1.0;\n U[A[row].row * N + A[row].column] = A[row].value;\n } else {\n L[A[row].row * N + A[row].column] = A[row].value / U[A[row].column * N + A[row].column];\n U[A[row].row * N + A[row].column] = 0.0;\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n double sum = 0.0;\n for (int k = col; k < N; k++) {\n sum += L[row * N + k] * U[k * N + col];\n }\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < 
sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row < col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n for (size_t k = col + 1; k < N; k++) {\n U[row * N + k] -= L[row * N + col] * U[col * N + k];\n }\n }\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n double sum = 0.0;\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row && A[k].column == col) {\n sum += A[k].value * U[col * N + A[k].column];\n }\n }\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row >= N || col >= N) return;\n\n double sum = 0;\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row && A[k].column < col) {\n sum += L[row * N + A[k].column] * U[A[k].column * N + col];\n }\n }\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row * N + col] - sum;\n } else {\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n U[row * N + col] = 0;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n }\n U[row * N + col] = A[row * N + col].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n } else if (row < column) {\n L[tid] = 0.0;\n } else {\n U[tid] = value / L[column * N + column];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0;\n U[row * N + col] = A[row * N + 
col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n U[row * N + column] = value;\n } else if (row < column) {\n L[row * N + column] = value;\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < sizeA; ++i) {\n const auto row = A[i].row;\n const auto column = A[i].column;\n const auto value = A[i].value;\n if (row == column) {\n L[row * N + column] = 1;\n } else if (row < column) {\n U[row * N + column] = value;\n } else {\n L[row * N + column] = value;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. Store the result in L and U on rank 0.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n 
// Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Initialize the diagonal elements of L to 1\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update the k-th row of L and U\n #pragma omp parallel for\n for (size_t j = k + 1; j < N; j++) {\n L[k * N + j] = A[k].value / U[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n U[i * N + j] -= L[k * N + j] * A[i].value;\n }\n }\n\n // Update the remaining rows of L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n for (size_t j = k + 1; j < N; j++) {\n L[i * N + j] -= L[i * N + k] * L[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Factorize A\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n 
L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n A[i].value -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Update U\n #pragma omp parallel for\n for (size_t j = k + 1; j < N; ++j) {\n U[k * N + j] = A[k].value / L[k * N + k];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; ++j) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform Gauss elimination\n for (size_t k = 0; k < N; ++k) {\n // Rank 0 broadcasts the k-th row of U to all other ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 updates the k-th row of L\n for (size_t j = k + 1; j < N; ++j) {\n L[k * N + j] = A[j].value / U[k * N + k];\n }\n\n // All ranks update 
the remaining rows of U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[k * N + j] * U[k * N + i];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k * N + k].value);\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n double multiplier = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= multiplier * A[k * N + j].value;\n U[k * N + j].value += multiplier * A[i * N + j].value;\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < N; ++i) {\n if (std::abs(A[i].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i].value);\n }\n }\n\n // Swap the pivot row with the k-th row in A, L, and U\n if (pivot_row != k) {\n std::swap(A[k], A[pivot_row]);\n for (size_t i = 0; i < N; ++i) {\n std::swap(L[k * N + i], L[pivot_row * N + i]);\n std::swap(U[k * N + i], U[pivot_row * N + i]);\n }\n }\n\n // Eliminate the elements below the pivot element\n for (size_t i = k + 1; i < N; ++i) {\n double multiplier = A[i].value / A[k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; ++j) {\n A[i].value -= multiplier * A[k].value;\n U[i * N + j] -= multiplier * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 
0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, Kokkos::View &x, Kokkos::View &y, Kokkos::View &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x[j].index == i) sum += alpha * x[j].value;\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y[j].index == i) sum += y[j].value;\n }\n z[i] = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA (const int i) {\n z[i] = alpha * x[i].value + y[i].value;\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(\"sparse_py\", 0, z.extent(0), KOKKOK_LAMBDA(size_t i) {\n z[i] = alpha * x[i].value + y[i].value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n sum += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n sum += y[j].value;\n }\n }\n z[i] = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) sum += alpha * x(j).value;\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) sum += y(j).value;\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n // Create a Kokkos kernel to perform the sparse axpy operation\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum 
= 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x(j).index == i) sum += alpha * x(j).value;\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y(j).index == i) sum += y(j).value;\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy<>(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i, double &sum) {\n sum += alpha * x(i).value + y(i).value;\n }, z(0));\n}\n", + "\n\n const size_t num_elements_x = x.extent(0);\n const size_t num_elements_y = y.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, num_elements_x + num_elements_y),\n KOKKOS_LAMBDA(const size_t i) {\n if (i < num_elements_x) {\n const size_t index = x(i).index;\n z(index) += alpha * x(i).value;\n } else {\n const size_t index = y(i - num_elements_x).index;\n z(index) += y(i - num_elements_x).value;\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i, double &sum) {\n sum += alpha * x(i).value + y(i).value;\n }, z(0));\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. 
Store the result in z.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + 
"runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n size_t n = x.size();\n size_t m = y.size();\n z.resize(std::max(n, m));\n #pragma omp parallel for\n for (size_t i = 0; i < std::max(n, m); i++) {\n z[i] = 0;\n }\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n z[x[i].index] += alpha * x[i].value;\n }\n #pragma omp parallel for\n for (size_t i = 0; i < m; i++) {\n z[y[i].index] += y[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + 
"is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + 
] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "serial", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. 
Store the result in z.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0, j = 0; i < x.size() || j < y.size();) {\n if (j == y.size() || (i < x.size() && x[i].index < y[j].index)) {\n z[x[i].index] += alpha * x[i].value;\n i++;\n } else if (i == x.size() || (j < y.size() && y[j].index < x[i].index)) {\n z[y[j].index] += y[j].value;\n j++;\n } else {\n z[x[i].index] += alpha * x[i].value + y[j].value;\n i++;\n j++;\n }\n }\n}\n", + "\n\n z.clear();\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z.push_back(alpha * x[i].value);\n i++;\n } else {\n z.push_back(y[j].value);\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(alpha * x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n z.clear();\n z.resize(x.size() + y.size());\n auto itx = x.begin();\n auto ity = y.begin();\n auto itz = z.begin();\n while (itx != x.end() && ity != y.end()) {\n if (itx->index < ity->index) {\n *itz = itx->value;\n ++itx;\n } else if (itx->index > ity->index) {\n *itz = ity->value;\n ++ity;\n } else {\n *itz = alpha * itx->value + ity->value;\n ++itx;\n ++ity;\n }\n ++itz;\n }\n while (itx != x.end()) {\n *itz = itx->value;\n ++itx;\n ++itz;\n }\n while (ity != y.end()) {\n *itz = ity->value;\n ++ity;\n ++itz;\n }\n}\n", + ".", + "\n\n // Initialize z with zeros\n z.assign(z.size(), 0);\n\n // Iterate over x and y\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n // If the indices match, add the values and increment both i and j\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n }\n // If the index of x is smaller, add the value to z and increment i\n else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n // If the index of y is smaller, add the value to z and increment j\n else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n\n // Add the remaining elements of x\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n\n // Add the remaining elements of y\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n z.clear();\n z.resize(x.size() + y.size());\n size_t i = 0, j = 0, k = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z[k].index = x[i].index;\n z[k].value = alpha * x[i].value;\n ++i;\n } else if (x[i].index > y[j].index) {\n z[k].index = y[j].index;\n z[k].value = y[j].value;\n ++j;\n } else {\n z[k].index = x[i].index;\n z[k].value = alpha * x[i].value + y[j].value;\n ++i;\n ++j;\n }\n ++k;\n }\n while (i < x.size()) {\n z[k].index = x[i].index;\n 
z[k].value = alpha * x[i].value;\n ++i;\n ++k;\n }\n while (j < y.size()) {\n z[k].index = y[j].index;\n z[k].value = y[j].value;\n ++j;\n ++k;\n }\n z.resize(k);\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n size_t m = z.size();\n for (size_t i = 0, j = 0, k = 0; i < m; ++i) {\n if (j < x.size() && x[j].index == i) {\n z[i] += alpha * x[j].value;\n ++j;\n }\n if (k < y.size() && y[k].index == i) {\n z[i] += y[k].value;\n ++k;\n }\n }\n}\n", + "\n\n for (size_t i = 0, j = 0; i < x.size() || j < y.size();) {\n if (j == y.size() || (i < x.size() && x[i].index < y[j].index)) {\n z[x[i].index] += alpha * x[i].value;\n ++i;\n } else if (i == x.size() || (j < y.size() && y[j].index < x[i].index)) {\n z[y[j].index] += y[j].value;\n ++j;\n } else {\n z[x[i].index] += alpha * x[i].value + y[j].value;\n ++i;\n ++j;\n }\n }\n}\n", + "\n\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n } else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n i++;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n j++;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n z.clear();\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z.push_back({x[i].index, alpha * x[i].value + y[j].value});\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z.push_back({x[i].index, alpha * x[i].value});\n i++;\n } else {\n z.push_back({y[j].index, y[j].value});\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back({x[i].index, alpha * x[i].value});\n i++;\n }\n while (j < y.size()) {\n z.push_back({y[j].index, y[j].value});\n j++;\n }\n}\n", + "\n\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n } else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "\n\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n i++;\n } else if (x[i].index > y[j].index) {\n 
z.push_back(y[j].value);\n j++;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "\n\n size_t x_size = x.size();\n size_t y_size = y.size();\n size_t z_size = 0;\n size_t x_index = 0;\n size_t y_index = 0;\n while (x_index < x_size && y_index < y_size) {\n if (x[x_index].index == y[y_index].index) {\n z.push_back(alpha * x[x_index].value + y[y_index].value);\n z_size++;\n x_index++;\n y_index++;\n } else if (x[x_index].index < y[y_index].index) {\n z.push_back(alpha * x[x_index].value);\n z_size++;\n x_index++;\n } else {\n z.push_back(y[y_index].value);\n z_size++;\n y_index++;\n }\n }\n while (x_index < x_size) {\n z.push_back(alpha * x[x_index].value);\n z_size++;\n x_index++;\n }\n while (y_index < y_size) {\n z.push_back(y[y_index].value);\n z_size++;\n y_index++;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "\n\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n}\n", + "\n\n z.clear();\n z.reserve(x.size() + y.size());\n auto itx = x.begin();\n auto ity = y.begin();\n while (itx != x.end() && ity != y.end()) {\n if (itx->index == ity->index) {\n z.push_back(alpha * itx->value + ity->value);\n ++itx;\n ++ity;\n } else if (itx->index < ity->index) {\n z.push_back(alpha * itx->value);\n ++itx;\n } else {\n z.push_back(ity->value);\n ++ity;\n }\n }\n while (itx != x.end()) {\n z.push_back(alpha * itx->value);\n ++itx;\n }\n while (ity != y.end()) {\n z.push_back(ity->value);\n ++ity;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "hip", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors of size Nx and Ny. Store the result in z.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x or y.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\n__global__ void sparseAxpy(double alpha, const Element *x, const Element *y, double *z, size_t Nx, size_t Ny, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < Nx) {\n size_t index = x[tid].index;\n z[index] = alpha * x[tid].value + y[index];\n } else if (tid < Nx + Ny) {\n size_t index = y[tid - Nx].index;\n z[index] = alpha * 0 + y[tid - Nx].value;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = 0;\n\t\tfor (size_t j = 0; j < Nx; j++) {\n\t\t\tif (x[j].index == i) {\n\t\t\t\tz[i] += alpha * x[j].value;\n\t\t\t}\n\t\t}\n\t\tfor (size_t j = 0; j < Ny; j++) {\n\t\t\tif (y[j].index == i) {\n\t\t\t\tz[i] += y[j].value;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n int i;\n for (i = tid; i < N; i += blockDim.x * gridDim.x) {\n if (i < Nx) {\n z[x[i].index] += alpha * x[i].value;\n }\n if (i < Ny) {\n z[y[i].index] += y[i].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double val = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) val += alpha * x[i].value;\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) val += y[i].value;\n }\n z[tid] = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double val = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n val += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n val += y[i].value;\n }\n }\n z[tid] = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = alpha * x[i].value + y[i].value;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < Nx) {\n\t\tz[x[tid].index] += alpha * x[tid].value;\n\t}\n\telse if (tid < Nx + Ny) {\n\t\tz[y[tid - Nx].index] += y[tid - Nx].value;\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = 
alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) z[i] += alpha * x[j].value;\n }\n for (size_t j = 0; j < N; j++) {\n if (y[j].index == i) z[i] += y[j].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n\tsize_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tdouble sum = 0;\n\t\tfor (size_t i = 0; i < Nx; i++) {\n\t\t\tif (x[i].index == tid) {\n\t\t\t\tsum += alpha * x[i].value;\n\t\t\t}\n\t\t}\n\t\tfor (size_t i = 0; i < Ny; i++) {\n\t\t\tif (y[i].index == tid) {\n\t\t\t\tsum += y[i].value;\n\t\t\t}\n\t\t}\n\t\tz[tid] = sum;\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n \n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& e : x) {\n z_size = std::max(z_size, e.index + 1);\n }\n for (auto const& e : y) {\n z_size = std::max(z_size, e.index + 1);\n }\n \n // Allocate memory for the output vector\n z.resize(z_size);\n \n // Compute the output vector on rank 0\n if (rank == 0) {\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n }\n \n // Broadcast the output vector to all other ranks\n MPI_Bcast(&z[0], z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the partial sums\n std::vector partial_sums(z_size);\n for (auto const& element : x) {\n partial_sums[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_sums[element.index] += element.value;\n }\n\n // Reduce the partial sums\n std::vector global_sums(z_size);\n MPI_Allreduce(partial_sums.data(), global_sums.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (my_rank == 0) {\n z = global_sums;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the size of the result vector\n size_t zSize = 0;\n for (auto const& element : x) {\n zSize = std::max(zSize, element.index + 1);\n }\n for (auto const& element : y) {\n zSize = std::max(zSize, element.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(zSize);\n\n // Compute the result vector\n for (auto const& element : x) {\n z[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n z[element.index] += element.value;\n }\n\n // Gather the result vector on rank 0\n if (myRank == 0) {\n for (int i = 1; i < numRanks; ++i) {\n std::vector recvBuffer;\n MPI_Recv(&recvBuffer[0], recvBuffer.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recvBuffer.size(); ++j) {\n z[j] += recvBuffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n std::vector local_x, local_y;\n \n // Scatter x and y to all ranks\n MPI_Scatter(&x[0], x.size(), MPI_DOUBLE, &local_x[0], local_x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(&y[0], y.size(), MPI_DOUBLE, &local_y[0], local_y.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n \n // Compute local z\n std::vector local_z(z.size(), 0);\n for (size_t i = 0; i < local_x.size(); ++i) {\n local_z[local_x[i].index] += alpha * 
local_x[i].value;\n }\n for (size_t i = 0; i < local_y.size(); ++i) {\n local_z[local_y[i].index] += local_y[i].value;\n }\n \n // Gather local z to rank 0\n MPI_Gather(&local_z[0], local_z.size(), MPI_DOUBLE, &z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of elements in the sparse vector\n size_t num_elements = x.size();\n\n // Create a vector of zeros to store the result\n z.resize(num_elements);\n\n // Loop over the elements in the sparse vector\n for (size_t i = 0; i < num_elements; i++) {\n // Get the index and value of the element\n size_t index = x[i].index;\n double value = x[i].value;\n\n // Add the value of the element to the result\n z[index] += alpha * value;\n }\n\n // Loop over the elements in the sparse vector\n for (size_t i = 0; i < num_elements; i++) {\n // Get the index and value of the element\n size_t index = y[i].index;\n double value = y[i].value;\n\n // Add the value of the element to the result\n z[index] += value;\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements = x.size();\n\n // Create a buffer to store the result on each rank\n std::vector local_z(num_elements, 0.0);\n\n // Compute the local result\n for (size_t i = 0; i < num_elements; ++i) {\n local_z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_z.data(), z.data(), num_elements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n std::vector result;\n\n // Iterate over the elements in x and y\n for (size_t i = 0; i < num_elements_x; i++) {\n // Get the index and value of the current element in x\n size_t index_x = x[i].index;\n double value_x = x[i].value;\n\n // Iterate over the elements in y\n for (size_t j = 0; j < num_elements_y; j++) {\n // Get the index and value of the current element in y\n size_t index_y = y[j].index;\n double value_y = y[j].value;\n\n // If the indices of the current elements in x and y are equal, then add the values of the elements to the result\n if (index_x == index_y) {\n result.push_back({index_x, alpha * value_x + value_y});\n }\n }\n }\n\n // Sort the result vector by index\n std::sort(result.begin(), result.end(), [](Element const& a, Element const& b) { return a.index < b.index; });\n\n // Store the result in z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(result.size());\n for (size_t i = 0; i < result.size(); i++) {\n z[i] = result[i].value;\n }\n }\n}\n", + "\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the output vector\n z.resize(z_size, 0.0);\n\n // Compute the partial sums\n std::vector partial_sums(z_size, 0.0);\n for (auto const& element : x) {\n partial_sums[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_sums[element.index] += element.value;\n }\n\n // Reduce the partial sums\n std::vector global_sums(z_size, 0.0);\n MPI_Allreduce(partial_sums.data(), global_sums.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result in z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t resultSize = 0;\n for (auto const& element : x) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n for (auto const& element : y) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n\n // Create a vector to store the result on rank 0\n std::vector result(resultSize, 0);\n\n // Compute the result on each rank\n for (auto const& elementX : x) {\n result[elementX.index] += alpha * elementX.value;\n }\n for (auto const& elementY : y) {\n result[elementY.index] += elementY.value;\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recvBuffer(resultSize);\n MPI_Recv(recvBuffer.data(), resultSize, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < resultSize; j++) {\n result[j] += recvBuffer[j];\n }\n }\n z = result;\n } else {\n MPI_Send(result.data(), resultSize, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t result_size = 0;\n for (auto const& element : x) {\n result_size = std::max(result_size, element.index + 1);\n }\n for (auto const& element : y) {\n result_size = std::max(result_size, element.index + 1);\n }\n\n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(result_size, 0);\n }\n\n // Compute the partial result on each rank\n std::vector partial_result(result_size, 0);\n for (auto const& element : x) {\n partial_result[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_result[element.index] += element.value;\n }\n\n // Reduce the partial results to rank 0\n MPI_Reduce(partial_result.data(), z.data(), result_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the partial sum on each rank\n for (auto const& elem : x) {\n z[elem.index] += alpha * elem.value;\n }\n for (auto const& elem : y) {\n z[elem.index] += elem.value;\n }\n\n // Reduce the partial sums to rank 0\n MPI_Reduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the number of elements in the result vector\n size_t num_elements = x.size() + y.size();\n \n // Allocate memory for the result vector\n z.resize(num_elements);\n \n // Compute the local indices of the elements in the result vector\n std::vector local_indices(num_elements);\n for (size_t i = 0; i < num_elements; ++i) {\n local_indices[i] = i;\n }\n \n // Compute the local values of the elements in the result vector\n std::vector local_values(num_elements);\n for (size_t i = 0; i < num_elements; ++i) {\n local_values[i] = 0.0;\n }\n \n // Compute the global indices of the elements in the result vector\n std::vector global_indices(num_elements);\n MPI_Allgatherv(&local_indices[0], local_indices.size(), MPI_UNSIGNED_LONG, &global_indices[0], nullptr, nullptr, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n 
\n // Compute the global values of the elements in the result vector\n std::vector global_values(num_elements);\n MPI_Allgatherv(&local_values[0], local_values.size(), MPI_DOUBLE, &global_values[0], nullptr, nullptr, MPI_DOUBLE, MPI_COMM_WORLD);\n \n // Compute the final values of the elements in the result vector\n for (size_t i = 0; i < num_elements; ++i) {\n z[global_indices[i]] = alpha * x[global_indices[i]].value + y[global_indices[i]].value;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n \n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(z_size, 0);\n }\n \n // Compute the partial sums on each rank\n for (auto const& element : x) {\n if (rank == 0) {\n z[element.index] += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (rank == 0) {\n z[element.index] += element.value;\n }\n }\n \n // Reduce the partial sums to rank 0\n MPI_Reduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t result_size = 0;\n for (auto const& element : x) {\n result_size = std::max(result_size, element.index + 1);\n }\n for (auto const& element : y) {\n result_size = std::max(result_size, element.index + 1);\n }\n\n // Create a vector to store the result on rank 0\n std::vector result(result_size, 0.0);\n\n // Compute the result on each rank\n for (auto const& element : x) {\n result[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n result[element.index] += element.value;\n }\n\n // Gather the results from all ranks on rank 0\n MPI_Gather(result.data(), result_size, MPI_DOUBLE, z.data(), result_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n int num_elements = x.size();\n\n // Create a buffer to store the results\n std::vector z(num_elements);\n\n // Compute the results in parallel\n #pragma omp parallel for\n for (int i = 0; i < num_elements; i++) {\n z[i] = alpha * x[i].value * y[i].value;\n }\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = x.size() + y.size();\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the local size of the output vector\n size_t local_z_size = z_size / size;\n\n // Compute the starting index of the local output vector\n size_t local_z_start = rank * local_z_size;\n\n // Compute the local output vector\n for (size_t i = local_z_start; i < local_z_start + local_z_size; ++i) {\n z[i] = 0;\n }\n\n // Compute the local contribution to the output vector\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value;\n }\n\n for (size_t i = 0; i < y.size(); ++i) {\n z[y[i].index] += y[i].value;\n }\n\n // Reduce the local output vectors to the global output vector\n MPI_Allreduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 stores the result\n if (rank == 0) {\n // Sort the output vector by index\n std::sort(z.begin(), z.end(), [](const double& a, const 
double& b) { return a.index < b.index; });\n\n // Remove duplicate elements from the output vector\n z.erase(std::unique(z.begin(), z.end()), z.end());\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the size of the result vector\n size_t resultSize = 0;\n for (auto const& element : x) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n for (auto const& element : y) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n\n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(resultSize, 0);\n }\n\n // Compute the partial sums on each rank\n for (auto const& element : x) {\n if (rank == 0) {\n z[element.index] += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (rank == 0) {\n z[element.index] += element.value;\n }\n }\n\n // Reduce the partial sums to rank 0\n MPI_Allreduce(MPI_IN_PLACE, z.data(), resultSize, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a new vector z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(&z[0], z.size(), MPI::DOUBLE, &z[0], z.size(), MPI::DOUBLE, 0);\n}\n", + "\n\n // Create a new vector z with the same size as x\n z.resize(x.size());\n\n // Loop over the elements of x and y and perform the operation z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[i].index = x[i].index;\n z[i].value = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks into rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a new vector z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(&z[0], z.size(), MPI::DOUBLE, &z[0], z.size(), MPI::DOUBLE, 0);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "cuda", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors of size Nx and Ny. Store the result in z.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x or y.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\n__global__ void sparseAxpy(double alpha, const Element *x, const Element *y, double *z, size_t Nx, size_t Ny, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) sum += alpha * x[j].value;\n }\n for (size_t j = 0; j < Ny; j++) {\n if (y[j].index == i) sum += y[j].value;\n }\n z[i] = sum;\n }\n}\n", + "sparseSparseArray(1,", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) { sum += alpha * x[i].value; }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) { sum += y[i].value; }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = alpha * x[i].value + y[i].value;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n\tsize_t i = threadIdx.x;\n\tif (i < Nx) {\n\t\tz[x[i].index] += alpha * x[i].value;\n\t}\n\tif (i < Ny) {\n\t\tz[y[i].index] += y[i].value;\n\t}\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tz[tid].value = alpha * x[tid].value + y[tid].value;\n\t}\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n size_t i = tid;\n double sum = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) {\n sum += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < Ny; j++) {\n if (y[j].index == i) {\n sum += y[j].value;\n }\n }\n z[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + 
"\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) sum += alpha * x[i].value;\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) sum += y[i].value;\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == idx) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == idx) {\n sum += y[i].value;\n }\n }\n z[idx] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < Nx) {\n atomicAdd(&z[x[tid].index], alpha * x[tid].value);\n } else if (tid < Nx + Ny) {\n atomicAdd(&z[y[tid - Nx].index], y[tid - Nx].value);\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Store the result in z on rank 0.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the output vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the corresponding element in x\n for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n break;\n }\n }\n\n // Find the corresponding element in y\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n break;\n }\n }\n\n // Compute the output value\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the output vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buffer;\n MPI_Recv(&recv_buffer[0], recv_buffer.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_buffer.size(); ++j) {\n z[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n 
for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n }\n }\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n }\n }\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the result vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buffer;\n MPI_Recv(&recv_buffer[0], z_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < z_size; ++j) {\n z[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n z.resize(num_elements_x + num_elements_y);\n\n // Loop over the elements in x and y and add them to the result\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x + num_elements_y; i++) {\n if (i < num_elements_x) {\n z[i] += alpha * x[i].value;\n }\n if (i < num_elements_y) {\n z[i] += y[i].value;\n }\n }\n\n // Reduce the result to rank 0\n MPI_Reduce(&z[0], z.size(), MPI_DOUBLE, &z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize z on rank 0\n if (rank == 0) {\n z.resize(x.size());\n }\n\n // Compute the result in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (rank == 0) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n }\n\n // Gather the results to rank 0\n MPI_Gather(z.data(), x.size(), MPI_DOUBLE, z.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n z.resize(num_elements_x + num_elements_y);\n\n // Perform the sparse axpy operation in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x + num_elements_y; ++i) {\n // Initialize the result to 0\n z[i] = 0.0;\n\n // Add the corresponding elements from x and y\n for (size_t j = 0; j < num_elements_x; ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < num_elements_y; ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Initialize the output vector\n z.resize(z_size, 0.0);\n\n // Compute the partial sums\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value;\n }\n\n #pragma omp parallel for\n for (size_t i = 0; i < y.size(); ++i) {\n z[y[i].index] += y[i].value;\n }\n\n // Reduce the partial sums\n MPI_Allreduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a temporary vector to store the partial results on each rank\n std::vector partial_z(z.size(), 0.0);\n\n // Compute the partial results in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n for (auto& element : x) {\n if (element.index == i) {\n partial_z[i] += 
alpha * element.value;\n }\n }\n for (auto& element : y) {\n if (element.index == i) {\n partial_z[i] += element.value;\n }\n }\n }\n\n // Reduce the partial results to rank 0 using MPI\n MPI_Reduce(partial_z.data(), z.data(), z.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = x.size() + y.size();\n\n // Allocate the output vector on rank 0\n if (rank == 0) {\n z.resize(z_size);\n }\n\n // Compute the local size of the output vector\n size_t local_z_size = (z_size + size - 1) / size;\n\n // Allocate the local output vector\n std::vector local_z(local_z_size);\n\n // Compute the local indices of the output vector\n std::vector local_z_indices(local_z_size);\n for (size_t i = 0; i < local_z_size; i++) {\n local_z_indices[i] = i + rank * local_z_size;\n }\n\n // Compute the local contributions to the output vector\n #pragma omp parallel for\n for (size_t i = 0; i < local_z_size; i++) {\n size_t index = local_z_indices[i];\n double value = 0.0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == index) {\n value += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == index) {\n value += y[j].value;\n }\n }\n local_z[i] = value;\n }\n\n // Gather the local contributions to the output vector on rank 0\n MPI_Gather(local_z.data(), local_z_size, MPI_DOUBLE, z.data(), local_z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements = x.size() + y.size();\n\n // Create a vector to store the result\n std::vector result(num_elements);\n\n // Merge the two sparse vectors into the result vector\n size_t i = 0, j = 0, k = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n result[k++] = x[i++];\n } else if (x[i].index > y[j].index) {\n result[k++] = y[j++];\n } else {\n result[k++] = {x[i].index, x[i++].value + y[j++].value};\n }\n }\n while (i < x.size()) {\n result[k++] = x[i++];\n }\n while (j < y.size()) {\n result[k++] = y[j++];\n }\n\n // Reduce the result vector to rank 0\n MPI_Reduce(result.data(), z.data(), num_elements, MPI_DOUBLE_INT, MPI_SUM, 0);\n}\n", + "\n\n // Initialize z to 0\n z.assign(x.size(), 0);\n\n // Compute z = alpha*x+y in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t z_size = x.size() + y.size();\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma openmp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n // Find the index of the element in x that matches the index of the current element in z\n auto it_x = std::find_if(x.begin(), x.end(), [&](Element const& element) { return element.index == i; });\n\n // Find the index of the element in y that matches the index of the current element in z\n auto it_y = std::find_if(y.begin(), y.end(), [&](Element const& element) { return element.index == i; });\n\n // Compute the value of the current element in z\n z[i] = (it_x != x.end() ? 
alpha * it_x->value : 0.0) + (it_y != y.end() ? it_y->value : 0.0);\n }\n\n // Gather the result vector on rank 0\n MPI_Gather(&z[0], z_size, MPI_DOUBLE, &z[0], z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vectors\n size_t n = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(n);\n\n // Compute the results in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results on rank 0\n MPI_Gather(temp.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z on all ranks\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0);\n}\n", + "\n\n // Check if the rank is 0\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n if (rank == 0) {\n // Initialize z with zeros\n z.resize(x.size());\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n }\n }\n\n // Perform sparse axpy operation in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (rank == 0) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t numElements = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(numElements);\n\n // Compute the result in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < numElements; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(temp.data(), numElements, MPI_DOUBLE, z.data(), numElements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t numElements = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(numElements);\n\n // Compute the result in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < numElements; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results on rank 0\n MPI_Gather(&temp[0], numElements, MPI_DOUBLE, &z[0], numElements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast alpha to all ranks\n MPI_Bcast(&alpha, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the output vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(z_size);\n }\n\n // Compute the output vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double value = 0.0;\n for (auto const& element : x) {\n if (element.index == i) {\n value += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (element.index == i) {\n value += element.value;\n }\n }\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z[i] = value;\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n 
for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the value of x at index i\n for (auto const& elem : x) {\n if (elem.index == i) {\n x_value = elem.value;\n break;\n }\n }\n\n // Find the value of y at index i\n for (auto const& elem : y) {\n if (elem.index == i) {\n y_value = elem.value;\n break;\n }\n }\n\n // Compute the value of z at index i\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the result vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::vector recv_buf;\n MPI_Recv(&recv_buf[0], recv_buf.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_buf.size(); ++j) {\n z[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the result vector\n z.resize(z_size, 0.0);\n\n // Compute the partial result on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the value of x at index i\n for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n break;\n }\n }\n\n // Find the value of y at index i\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n break;\n }\n }\n\n // Compute the partial result\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the partial results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::vector partial_result(z_size);\n MPI_Recv(partial_result.data(), z_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < z_size; ++j) {\n z[j] += partial_result[j];\n }\n }\n } else {\n MPI_Send(z.data(), z_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n std::vector result(num_elements_x);\n\n // Compute z = alpha*x+y in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x; i++) {\n result[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(result.data(), num_elements_x, MPI_DOUBLE, z.data(), num_elements_x, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(Kokkos::View const& s, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < s.extent(0); ++j) {\n if (s(j)[0] == 'a' + i) {\n ++bins[i];\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < s.extent(0); ++j) {\n if (s(j)[0] == 'a' + i) {\n ++bins[i];\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(0, s.extent(0), [&](size_t i) {\n char first = s(i)[0];\n size_t index = first - 'a';\n ++output(index);\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[s[i][0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins[s[i][0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "Counts, 0 0, 0,0,0, 0,0, ~:~:", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n int idx = s(i)[0] - 'a';\n if (idx >= 0 && idx < 26) {\n ++bins[idx];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s[i][0] - 'a']++;\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": 
"23_histogram_first_letter_counts", + "parallelism_model": "omp", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01348402556032, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0127909431234, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01366683337837, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01290844231844, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01322060637176, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01257089693099, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0135881414637, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01311405766755, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01338281910866, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01285212431103, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t 
i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01335120946169, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01306430343539, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0134936535731, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01267519313842, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01367176696658, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.012710772641, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01335861105472, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01274474188685, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01378692127764, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01271041948348, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < 26; ++i) {\n char c = 'a' + i;\n for (auto const& str : s) {\n if (str[0] == c) {\n ++counts[i];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01330205351114, + "runs": [ + { 
+ "did_run": true, + "is_valid": true, + "runtime": 0.0127156637609, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01326375920326, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01295603774488, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0134345985949, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01308532301337, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n size_t index = static_cast(s[i][0] - 'a');\n #pragma omp atomic\n bins[index]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01106947716326, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02310414724052, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05818121973425, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04998579546809, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10082879662514, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.12529863640666, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.14827340953052, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.21663809921592, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n size_t index = s[i][0] - 'a';\n ++ didik[index];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01361714024097, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01294186096638, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01375314220786, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01258816923946, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01293030548841, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01248416807503, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01601468045264, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 4 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01349994521588, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0127228166908, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "serial", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. 
Store the output in `bins` array.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n ++bins[str[0] - 'a'];\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n bins[str[0] - 'a']++;\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n ++bins[str[0] - 'a'];\n }\n}\n", + "\n\n for (std::string const& str : s) {\n if (str.empty()) {\n continue;\n }\n char first = str[0];\n if (first >= 'a' && first <= 'z') {\n bins[first - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n bins[str[0] - 'a']++;\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "hip", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\n__global__ void firstLetterCounts(const char **s, size_t N, size_t bins[26]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n char first = s[tid][0];\n atomicAdd(&bins[first - 'a'], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid < N) {\n char c = s[gid][0];\n atomicAdd(&bins[c - 'a'], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int index = s[gid][0] - 'a';\n atomicAdd(&bins[index], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n char first = s[idx][0];\n atomicAdd(&bins[first - 'a'], 1);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int bin = s[gid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n if (idx >= 0 && idx < 26) {\n atomicAdd(&bins[idx], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (size_t i = tid; i < N; i += stride) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n int bin = s[tid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n bins[s[gid][0] - 'a']++;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int index = s[gid][0] - 'a';\n atomicAdd(&bins[index], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) return;\n char first = s[idx][0];\n int offset = first - 'a';\n atomicAdd(&firstCounts[offset], 1);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int bin = s[gid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = 
bid * hipBlockDim_x + tid;\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of s. The result is stored in bins on rank 0.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ": \u51a2, Synag\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2 Saxe\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2zernezerne.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (const auto& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, 
MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet.\n std::array local_bins;\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n local_bins.fill(0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t n = s.size();\n std::array local_bins;\n local_bins.fill(0);\n for (size_t i = 0; i < n; i++) {\n if (rank == i % size) {\n local_bins[s[i][0] - 'a']++;\n }\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 
0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n std::fill(local_bins.begin(), local_bins.end(), 0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings starting with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) continue;\n ++local_bins[str[0] - 'a'];\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = s.size() / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = s.size();\n }\n std::array local_bins = {};\n for (int i = start; i < end; i++) {\n local_bins[s[i][0] - 
'a']++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "cuda", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\n__global__ void firstLetterCounts(const char **s, size_t N, size_t bins[26]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int firstLetter = s[idx][0] - 'a';\n atomicAdd(&bins[firstLetter], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int start = tid * 26 / N;\n int end = (tid + 1) * 26 / N;\n for (int i = start; i < end; i++) {\n bins[i] = 0;\n }\n for (int i = 0; i < N; i++) {\n int idx = s[i][0] - 'a';\n if (idx >= 0 && idx < 26) {\n atomicAdd(&bins[idx], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int firstLetter = s[idx][0] - 'a';\n atomicAdd(&bins[firstLetter], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n char firstLetter = s[idx][0];\n atomicAdd(&bins[firstLetter - 'a'], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int bin = s[tid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n 
}\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx >= N) {\n return;\n }\n bins[s[idx][0] - 'a']++;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int idx = s[tid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of s. The result is stored in bins on rank 0.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n local_bins.fill(0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the first letter of its own strings\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n #pragma omp parallel for\n for 
(size_t i = 0; i < s.size(); i++) {\n local_bins[s[i][0] - 'a']++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk = s.size() / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = s.size();\n }\n std::array local_bins;\n local_bins.fill(0);\n for (int i = start; i < end; i++) {\n local_bins[s[i][0] - 'a']++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins;\n local_bins.fill(0);\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n local_bins[s[i][0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank counts the first letter of its own strings\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = my_rank; i < s.size(); i += num_ranks) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the counts for a subset of the alphabet\n int start = rank * 26 / size;\n int end = (rank + 1) * 26 / size;\n\n std::array local_bins;\n local_bins.fill(0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n int index = s[i][0] - 'a';\n if (index >= start && index < end) {\n local_bins[index]++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the first letters in its portion of the vector\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = rank; i < s.size(); i += size) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce 
the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(std::begin( bins ), std::end( bins ), 0);\n\n // Count the number of words starting with each letter in parallel\n #pragma openmp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n ++ bins[s[i][0] - 'a'];\n }\n\n // Gather the results on rank 0\n MPI_Gather(MPI_IN_PLACE, 26, MPI_UNSIGNED_LONG, bins.data(), 26, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the first letter of the strings in its portion of the vector\n std::array local_bins = {};\n #pragma omp parallel for\n for (size_t i = rank * s.size() / nprocs; i < (rank + 1) * s.size() / nprocs; ++i) {\n local_bins[s[i][0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings in s that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins = {};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": 
"21_histogram_bin_0-100", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(Kokkos::View const& x, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(i / 10) += (x(i) < 100) ? 1 : 0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[x(i) / 10]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA(const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[x(i) / 10]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i / 10] += (x(i) >= 10 * i / 10 && x(i) < 10 * (i / 10 + 1));\n });\n}\n", + "\n\n Kokkos::parallel_for(bins.size(), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j] >= 10 * i && x[j] < 10 * (i + 1)) { bins[i] += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n const double val = x(i);\n update += (val >= 0 && val < 10) + (val >= 10 && val < 20) + (val >= 20 && val < 30) +\n (val >= 30 && val < 40) + (val >= 40 && val < 50) + (val >= 50 && val < 60) +\n (val >= 60 && val < 70) + (val >= 70 && val < 80) + (val >= 80 && val < 90) +\n (val >= 90 && val < 100);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n const double val = x(i);\n bins(val / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n 
Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n const double val = x(i);\n bins(val / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i / 10] += 1;\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01415950730443, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01354861911386, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01430722121149, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01365794856101, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01307024918497, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01375338435173, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01384149417281, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0136396586895, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n size_t idx = 
static_cast(x[i] / 10);\n ++counts[idx];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01365278735757, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01360556650907, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01334450915456, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01310958508402, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01434725616127, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01389760822058, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n size_t bin = static_cast(x[i] / 10);\n bins[bin]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01346474327147, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01394673604518, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01427461151034, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0142704455182, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)] += 1;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01370733771473, + "runs": [ 
+ { + "did_run": true, + "is_valid": true, + "runtime": 0.01314405761659, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01318642534316, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01297614574432, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01371613629162, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01370701733977, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01386242359877, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01419825796038, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01351074371487, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01362610086799, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01396030653268, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01344933081418, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01332189645618, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01354041174054, + "num_threads": 1 + }, + { + "did_run": 
true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01303749084473, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01315767541528, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01420866511762, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01435880623758, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.01404616143554, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01402168888599, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "serial", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... 
and store the counts in `bins`.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n std::fill(\u5803.begin(), \u5803.end(), 0);\n for (double const val : x) {\n \u5803[static_cast(val / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double val : x) {\n bins[static_cast(val / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n int i = std::min((int)(v - bins.front()), (int)bins.size() - 1);\n bins[i]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "hip", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use AMD HIP to compute in parallel. 
The kernel is initialized with at least as many threads as values in x.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\n__global__ void binsBy10Count(const double *x, size_t N, size_t bins[10]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipGridDim_x;\n for (size_t i = idx; i < N; i += stride) {\n bins[x[i] / 10] += 1;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipGridDim_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n int bin = (int)(x[i] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n int bin = (int)(x[i] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n int bin = x[i] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n int bin = (int)x[tid] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n bins[(int)(x[i] / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use MPI to compute in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. The result is stored in bins on rank 0.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n bins[bin]++;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv;\n MPI_Recv(recv.data(), recv.size(), MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < bins.size(); j++) {\n bins[j] += recv[j];\n }\n }\n } else {\n MPI_Send(bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of values in each bin\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n\n // Reduce the bins across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local range.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n bins[bin]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of values in each bin on each rank\n std::array local_count = {};\n for (auto v : x) {\n local_count[static_cast(v / 10)]++;\n }\n\n // Gather the local counts into bins on rank 0\n MPI_Gather(local_count.data(), 10, MPI_UNSIGNED_LONG, bins.data(), 10, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its subset of x\n std::array local_bins = {};\n for (double xi : x) {\n local_bins[static_cast(xi / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double xi : x) {\n int bin = static_cast(xi / 10);\n bins[bin]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), 
MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n for (double val : x) {\n int bin = static_cast(val / 10);\n if (bin < 10) {\n bins[bin]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array localBins{};\n for (double v : x) {\n localBins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(localBins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (auto v : x) {\n bins[v / 10]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] >= 0 && x[i] < 10) {\n bins[0]++;\n } else if (x[i] >= 10 && x[i] < 20) {\n bins[1]++;\n } else if (x[i] >= 20 && x[i] < 30) {\n bins[2]++;\n } else if (x[i] >= 30 && x[i] < 40) {\n bins[3]++;\n } else if (x[i] >= 40 && x[i] < 50) {\n bins[4]++;\n } else if (x[i] >= 50 && x[i] < 60) {\n bins[5]++;\n } else if (x[i] >= 60 && x[i] < 70) {\n bins[6]++;\n } else if (x[i] >= 70 && x[i] < 80) {\n bins[7]++;\n } else if (x[i] >= 80 && x[i] < 90) {\n bins[8]++;\n } else if (x[i] >= 90 && x[i] < 100) {\n bins[9]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local part of x.\n std::array localBins = {};\n for (double v : x) {\n localBins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts.\n MPI_Reduce(localBins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its subset of x\n std::array local_bins = {};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), recv_bins.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < bins.size(); j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), local_bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local range.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int 
rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n if (bin < 10) {\n bins[bin]++;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array tmp;\n MPI_Recv(tmp.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; j++) {\n bins[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its own bins.\n std::array local_bins = {};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), recv_bins.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_bins.size(); j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), local_bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "cuda", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use CUDA to compute in parallel. 
The kernel is initialized with at least as many threads as values in x.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\n__global__ void binsBy10Count(const double *x, size_t N, size_t bins[10]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x;\nif (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = (int)floor(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of values in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of values in its local portion of x\n std::array local_bins;\n std::fill(local_bins.begin(), local_bins.end(), 0);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce local bins to global bins\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins = {};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the bin size\n const double bin_size = (x.back() - x.front()) / bins.size();\n\n // Compute the local histogram\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n const double value = x[i];\n const int bin_index = std::min(static_cast(bins.size() - 1),\n static_cast(std::floor((value - x.front()) / bin_size)));\n ++local_bins[bin_index];\n }\n\n // Reduce the local histograms to the global histogram\n MPI_Reduce(MPI_IN_PLACE, local_bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ", 10]", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double value : x) {\n local_bins[static_cast(value / 10)]++;\n }\n\n // Reduce the local counts into the global bins array on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n int bin = static_cast(x[i] / 10);\n if (bin < 10) {\n #pragma omp atomic\n ++bins[bin];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::array local_bins = {};\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n int bin = static_cast(x[i] / 10);\n local_bins[bin]++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the counts for a subset of the bins.\n size_t start_bin = rank * bins.size() / nprocs;\n size_t end_bin = (rank + 1) * bins.size() / nprocs;\n\n std::fill(bins.begin() + start_bin, bins.begin() + end_bin, 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n size_t bin = static_cast(x[i] / 10);\n if (bin >= start_bin && bin < end_bin) {\n ++bins[bin];\n }\n }\n\n // Gather the counts from all ranks into bins on rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n int bin = static_cast(x[i]) / 10;\n #pragma omp atomic\n ++bins[bin];\n }\n\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::array recv;\n MPI_Recv(recv.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; ++j) {\n bins[j] += recv[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its portion of x\n std::array local_bins = {0};\n #pragma omp parallel for\n for (size_t i = rank * x.size() / size; i < (rank + 1) * x.size() / size; ++i) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the local counts into the global bins on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; ++i) {\n std::array recv_bins;\n 
MPI_Recv(recv_bins.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of values in its local portion of x\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins = {};\n for (size_t i = rank; i < x.size(); i += nprocs) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x\n std::array local_bins = {0};\n #pragma omp parallel for\n for (size_t i = rank; i < x.size(); i += nprocs) {\n int bin = static_cast(x[i] / 10);\n local_bins[bin]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use Kokkos to count in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(Kokkos::View const& points, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_for;\n using Kokkos::Functor;\n\n struct CountQuadrants {\n Kokkos::View bins;\n Kokkos::View points;\n\n KOKKOS_FUNCTION\n void operator()(const int i) const {\n const auto& point = points[i];\n if (point.x > 0) {\n if (point.y > 0) {\n ++bins[0];\n } else {\n ++bins[2];\n }\n } else {\n if (point.y > 0) {\n ++bins[1];\n } else {\n ++bins[3];\n }\n }\n }\n };\n\n parallel_for(Kokkos::RangePolicy(0, points.size()), CountQuadrants{bins, points});\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA (const size_t& i, size_t& update) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++update[0];\n } else {\n ++update[3];\n }\n } else {\n if (points(i)->y > 0) {\n ++update[1];\n } else {\n ++update[2];\n }\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.size(), KOKKOS_LAMBDA (const size_t i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x >= 0 && y >= 0) {\n ++bins(0);\n } else if (x < 0 && y >= 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "{x=1.5,y=1.1}", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x >= 0) {\n if (y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n const Point& p = points(i);\n if (p.x > 0) {\n if (p.y > 0) {\n ++update[0];\n } else {\n ++update[3];\n }\n } else {\n if (p.y > 0) {\n ++update[1];\n } else {\n ++update[2];\n }\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n double x = points[i]->x;\n double y = points[i]->y;\n if (x >= 0 && y >= 0) {\n ++update;\n } 
else if (x < 0 && y >= 0) {\n ++update;\n } else if (x < 0 && y < 0) {\n ++update;\n } else {\n ++update;\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = points(i);\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", Kokkos::RangePolicy<>(0, points.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x > 0 && y > 0) {\n ++bins(0);\n } else if (x < 0 && y > 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else if (x > 0 && y < 0) {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy<>(0, points.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x > 0 && y > 0) {\n ++bins(0);\n } else if (x < 0 && y > 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else if (x > 0 && y < 0) {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n size_t idx = 0;\n if (x >= 0) {\n idx += 1;\n }\n if (y >= 0) {\n idx += 2;\n }\n ++bins(idx);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n size_t bin = 0;\n if (x > 0) {\n bin += 1;\n }\n if (y > 0) {\n bin += 2;\n }\n Kokkos::atomic_fetch_add(&bins(bin), 1);\n });\n}\n", + "\n\n using exec_space = KokKos::DefaultExecutionSpace;\n using league_space = KokKos::TeamVectorRange;\n\n league_space range(exec_space{}, points.size());\n\n auto policy = KokKos::TeamVectorRange\u653f\u7b56(range);\n\n auto lambda = KOKKOS_LAMBDA (const Point point, size_t* count) {\n if (point.x > 0) {\n if (point.y > 0) {\n ++count[0];\n } else {\n ++count[3];\n }\n } else {\n if (point.y > 0) {\n ++count[1];\n } else {\n ++count[2];\n }\n }\n };\n\n exec_space exec;\n exec.parallel_reduce(\n \"count_quadrants\", policy, lambda, Rcpp::as(count));\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy>(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = *points(i);\n if (p.x > 0) {\n if (p.y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. 
The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use OpenMP to count in parallel.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const Point& p = points[i];\n if (p.x >= 0 && p.y >= 0) {\n ++bins[0];\n } else if (p.x < 0 && p.y >= 0) {\n ++bins[1];\n } else if (p.x < 0 && p.y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.04493943825364, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.04510019663721, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.04477594625205, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.04117695707828, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.04461720772088, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0413816543296, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.04483480546623, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0414297811687, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n 
bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.04484918639064, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.04126737918705, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.04462905433029, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.04112897440791, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); i++) {\n const auto& p = points[i];\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.04451444651932, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.04140028730035, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.04483395386487, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.04147281143814, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.04496442954987, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.04139189962298, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins[0];\n } else {\n 
++bins[3];\n }\n } else {\n if (p.y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.04456746783108, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.04158554766327, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.04481349885464, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.04131127577275, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.04458379261196, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.04130148924887, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.04481115862727, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.04097320735455, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++oient[0];\n } else {\n ++oient[3];\n }\n } else {\n if (points[i].y > 0) {\n ++oient[1];\n } else {\n ++oient[2];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.04458399526775, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.04118449874222, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n double x = points[i].x;\n double y = points[i].y;\n if (x > 0) {\n if (y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.04445321094245, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.04135941229761, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.04467317070812, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.04105713162571, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.04456084631383, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.04122763313353, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0447163105011, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.04138911087066, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.04459107033908, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.04144263584167, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "serial", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (const Point& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (const auto& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (point.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (Point const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (Point const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else if (p.x < 0) {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& point : 
points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else if (p.y < 0) {\n bins[3]++;\n }\n } else if (p.x < 0) {\n if (p.y > 0) {\n bins[1]++;\n } else if (p.y < 0) {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n for (auto const& pt : points) {\n if (pt.x > 0) {\n if (pt.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (pt.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "(", + "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "hip", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use AMD HIP to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\n__global__ void countQuadrants(const Point *points, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n while (idx < N) {\n if (points[idx].x >= 0) {\n if (points[idx].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[idx].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n idx += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int quadrant = 0;\n if (points[gid].x > 0) {\n quadrant += 1;\n }\n if (points[gid].y > 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int quadrant = 0;\n if (points[tid].x > 0) {\n quadrant += 1;\n }\n if (points[tid].y > 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid >= N) return;\n const Point &p = points[tid];\n int quadrant = 0;\n if (p.x > 0) quadrant |= 1;\n if (p.y > 0) quadrant |= 2;\n atomicAdd(&bins[quadrant], 1);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n const Point &p = points[gid];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n Point p = points[tid];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "(4)", + "(4)", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n if (idx >= N) {\n return;\n }\n\n const Point &p = points[idx];\n if 
(p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int blockId = hipBlockIdx_x;\n int blockDim = hipBlockDim_x;\n\n for (int i = tid; i < N; i += blockDim) {\n if (points[i].x > 0 && points[i].y > 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y > 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[i].x > 0 && points[i].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n const Point p = points[gid];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[i].x >= 0 && points[i].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n Point p = points[tid];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = tid + bid * hipBlockDim_x;\n if (idx >= N) {\n return;\n }\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. 
The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use MPI to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The result is stored in bins on rank 0.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its own quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (const auto& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0 && point.y >= 0) {\n local_bins[0]++;\n } else if (point.x < 0 && point.y >= 0) {\n local_bins[1]++;\n } else if (point.x >= 0 && point.y < 0) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts\n 
MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (auto const& p : points) {\n int quadrant = 0;\n if (p.x > 0) quadrant += 1;\n if (p.y > 0) quadrant += 2;\n bins[quadrant]++;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array tmp;\n MPI_Recv(tmp.data(), tmp.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(bins.data(), bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_bins[4] = {0};\n for (Point const& p : points) {\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y > 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y > 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant.\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array localBins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n localBins[0]++;\n } else {\n localBins[3]++;\n }\n } else {\n if (point.y > 0) {\n localBins[1]++;\n } else {\n localBins[2]++;\n }\n }\n }\n\n MPI_Reduce(localBins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "cuda", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use CUDA to count in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\n__global__ void countQuadrants(const Point *points, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int bin = 0;\n if (points[tid].x > 0) bin += 1;\n if (points[tid].y > 0) bin += 2;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int bin = 0;\n if (points[tid].x >= 0) {\n bin += 1;\n }\n if (points[tid].y >= 0) {\n bin += 2;\n }\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += gridDim.x * blockDim.x;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n while (idx < N) {\n if (points[idx].x >= 0 && points[idx].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[idx].x < 0 && points[idx].y 
>= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[idx].x < 0 && points[idx].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[idx].x >= 0 && points[idx].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n idx += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n int quadrant = 0;\n if (p.x > 0) {\n quadrant |= 1;\n }\n if (p.y > 0) {\n quadrant |= 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n Point p = points[idx];\n int quadrant = 0;\n if (p.x >= 0) {\n quadrant += 2;\n }\n if (p.y >= 0) {\n quadrant += 1;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx >= N) {\n return;\n }\n\n const Point &p = points[idx];\n int quadrant = 0;\n if (p.x >= 0) {\n quadrant += 1;\n }\n if (p.y >= 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant - 1], 1);\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[1], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n const Point p = points[tid];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n if (idx >= N) {\n return;\n }\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n}\n", + "\n\n int tid = 
threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n while (i < N) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[i].y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += gridDim.x * blockDim.x;\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int count = 0;\n for (int i = idx; i < N; i += blockDim.x) {\n if (points[i].x > 0 && points[i].y > 0) {\n count++;\n } else if (points[i].x < 0 && points[i].y > 0) {\n count++;\n } else if (points[i].x < 0 && points[i].y < 0) {\n count++;\n } else if (points[i].x > 0 && points[i].y < 0) {\n count++;\n }\n }\n bins[idx] = count;\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use MPI and OpenMP to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. 
The result is stored in bins on rank 0.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); i++) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to zero\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n ++bins[0];\n } else if (points[i].x < 0 && points[i].y >= 0) {\n ++bins[1];\n } else if (points[i].x < 0 && points[i].y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the vector\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for 
reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& point = points[i];\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+:local_bins)\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n 
MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n ++local_bins[0];\n } else if (points[i].x < 0 && points[i].y >= 0) {\n ++local_bins[1];\n } else if (points[i].x < 0 && points[i].y < 0) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n Point p = points[i];\n if (p.x >= 0 && p.y >= 0) {\n local_bins[0]++;\n } else if (p.x < 0 && p.y >= 0) {\n local_bins[1]++;\n } else if (p.x < 0 && p.y < 0) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to zero\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const Point& point = points[i];\n if (point.x >= 0 && point.y >= 0) {\n ++bins[0];\n } else if (point.x < 0 && point.y >= 0) {\n ++bins[1];\n } else if (point.x < 0 && point.y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local subset of points\n std::array local_bins = {0, 0, 0, 0};\n #pragma opm parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n if (points[i].x > 0 && points[i].y > 0) {\n ++local_bins[0];\n } else if (points[i].x < 0 && points[i].y > 0) {\n ++local_bins[1];\n } else if (points[i].x < 0 && points[i].y < 0) {\n ++local_bins[2];\n } else if (points[i].x > 0 && points[i].y < 0) {\n ++local_bins[3];\n }\n }\n\n // Gather the local counts into bins on rank 0\n MPI_Gather(local_bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel 
for reduction(+: bins)\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the points in its portion of the image\n #pragma omp parallel for reduction(+: bins)\n for (size_t i = rank; i < points.size(); i += nprocs) {\n const Point& point = points[i];\n if (point.x >= 0) {\n if (point.y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the array\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use Kokkos to compute in parallel. 
Assume that Kokkos has already been initialized.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(Kokkos::View const& x, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n const size_t num_elements = x.extent(0);\n Kokkos::parallel_for(\"countQuartiles\", Kokkos::RangePolicy(0, num_elements), KOKKOS_LAMBDA(const size_t i) {\n const double frac = x(i) - floor(x(i));\n if (frac < 0.25) {\n ++quart[0];\n } else if (frac < 0.5) {\n ++quart[1];\n } else if (frac < 0.75) {\n ++quart[2];\n } else {\n ++quart[3];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n\n parallel_for(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i) {\n double frac = x(i) - floor(x(i));\n if (frac < 0.25) {\n ++quart[0];\n } else if (frac < 0.5) {\n ++quart[1];\n } else if (frac < 0.75) {\n ++quart[2];\n } else {\n ++quart[3];\n }\n });\n}\n", + "\n\n const size_t n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n auto exec_space = Kokkos::DefaultExecutionSpace::instance();\n auto policy = Kokkos::RangePolicy<>(exec_space, 0, x.extent(0));\n\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const size_t& i) {\n double frac = 
std::modf(x(i), &x(i));\n if (frac < 0.25)\n ++bins(0);\n else if (frac < 0.5)\n ++bins(1);\n else if (frac < 0.75)\n ++bins(2);\n else\n ++bins(3);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double fract = std::fmod(x[i], 1.0);\n if (fract < 0.25) {\n ++bins[0];\n } else if (fract < 0.5) {\n ++bins[1];\n } else if (fract < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.05510808397084, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.07047943603247, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.05422520563006, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.07047689538449, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "<,\u51a2~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.05489319264889, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.07030420228839, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.05478364285082, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.070423267968, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0547004006803, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.07027096543461, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.05475545227528, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.07049102596939, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma simd\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = std::fmod(x[i], 1.0);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.05382937658578, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1525756534189, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.15215039793402, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.15233706291765, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1523341787979, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.15215307865292, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.15274430122226, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.15230626501143, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++(std::get<0>(std::ref(x)));\n } else if (frac < 0.5) {\n ++(std::get<1>(std::ref(x)));\n } else if (frac < 0.75) {\n ++(std::get<2>(std::ref(x)));\n } else {\n ++(std::get<3>(std::ref(x)));\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + 
"are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0545744586736, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.07040491942316, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.05544241461903, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.07043845336884, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.05462589152157, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.07032179292291, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = std::fmod(x[i], 1.0);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.05723098628223, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.15167824905366, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.05511397067457, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 
0.0703713843599, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.05482811033726, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0703723680228, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.05519667603076, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.07025995459408, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.05547583326697, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.06993765383959, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.05448270253837, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.07030423209071, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.05517085511237, + "runs": 
[ + { + "did_run": true, + "is_valid": true, + "runtime": 0.07032993007451, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.05469893310219, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.07069001402706, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "serial", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (double const& xi : x) {\n if (xi < 0.25) {\n ++\u0648\u064a\u0627\u062a[0];\n } else if (xi < 0.5) {\n ++\u0648\u064a\u0627\u062a[1];\n } else if (xi < 0.75) {\n ++\u0648\u064a\u0627\u062a[2];\n } else {\n ++\u0648\u064a\u0627\u062a[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (auto v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n 
double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n bins[0]++;\n else if (frac < 0.5)\n bins[1]++;\n else if (frac < 0.75)\n bins[2]++;\n else\n bins[3]++;\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n bins[0]++;\n else if (frac < 0.5)\n bins[1]++;\n else if (frac < 0.75)\n bins[2]++;\n else\n bins[3]++;\n }\n}\n", + "\n\n bins.fill(0);\n for (const auto &xi : x) {\n const auto frac = std::modf(xi, nullptr);\n if (frac >= 0.25 && frac < 0.5) {\n ++bins[0];\n } else if (frac >= 0.5 && frac < 0.75) {\n ++bins[1];\n } else if (frac >= 0.75 && frac < 1.0) {\n ++bins[2];\n } else if (frac == 0.0) {\n ++bins[3];\n }\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "\n\n for (auto const& v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "hip", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\n__global__ void countQuartiles(const double *x, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = hipGetGlobalIndex();\n if (idx >= N) return;\n\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&quartiles[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&quartiles[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&quartiles[2], 1);\n } else {\n atomicAdd(&quartiles[3], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n double frac = fmod(x[i], 1);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int 
stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n double f = fmod(x[idx], 1.0);\n if (f < 0.25)\n atomicAdd(&bins[0], 1);\n else if (f < 0.5)\n atomicAdd(&bins[1], 1);\n else if (f < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid >= N) {\n return;\n }\n\n double frac = x[gid] - floor(x[gid]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n 
atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use MPI to compute in parallel. Assume that MPI has already been initialized.\n Every rank has a complete copy of x. The result is stored in bins on rank 0.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double d : x) {\n double frac = fmod(d, 1.0);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Count the number of doubles in each quartile.\n for (double v : x) {\n if (v < 0.25) {\n ++bins[0];\n } else if (v < 0.5) {\n ++bins[1];\n } else if (v < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Gather the counts from all ranks on rank 0.\n MPI_Gather(bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each quartile.\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n for (auto v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, 
size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of the vector\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(&recv_bins, 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(&local_bins, 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each quartile\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the local bins\n std::array local_bins = {0, 0, 0, 0};\n\n // Count the number of doubles in each quartile\n for (double d : x) {\n double frac = fmod(d, 1.0);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local counts\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n 
MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_bins[4] = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n MPI_Reduce(local_bins, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n size_t n = x.size();\n std::array local_bins = {0, 0, 0, 0};\n\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n\n for (double val : x) {\n double frac = val - std::floor(val);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n 
MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (auto xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "cuda", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\n__global__ void countQuartiles(const double *x, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n double frac = x[idx] - floor(x[idx]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double frac = x[tid] - floor(x[tid]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n 
atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + ", 4]", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n for (size_t i = tid + bid * stride; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double f = fmod(x[idx], 1.0);\n if (f < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (f < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (f < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use MPI and OpenMP to compute in parallel. Assume that MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each quartile\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n size_t const rank = ::rank();\n size_t const size = x.size();\n std::fill(counts.begin(), counts.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = rank; i < size; i += ::size()) {\n double const value = x[i];\n if (value < 0.25) {\n ++counts[0];\n } else if (value < 0.5) {\n ++counts[1];\n } else if (value < 0.75) {\n ++counts[2];\n } else {\n ++counts[3];\n }\n }\n\n #pragma omp parallel sections\n {\n #pragma omp section\n {\n if (rank == 0) {\n std::fill(counts.begin(), \n counts.end(), \n 0);\n }\n }\n #pragma omp section\n {\n if (rank == 0) {\n std::transform(counts.begin(), \n counts.end(), \n counts.begin(), \n [](size_t const a, size_t const b) {\n return a + b;\n });\n }\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of doubles in its portion of x that have a fractional part in each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < x.size(); i += size) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Rank 0 gathers the local counts from all ranks and stores the result in bins\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n 
++bins[3];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each bin\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each quartile\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the counts to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of elements in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of 
elements in its local vector that fall into each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its local copy of x that fall into each quartile.\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+:local_bins)\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Rank 0 gathers the local counts from all ranks and stores the result in bins.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; ++i) {\n std::array recv_bins;\n MPI_Recv(&recv_bins, 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(&local_bins, 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n 
MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of elements in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the counts from all ranks to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local counts\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Gather the local counts into bins on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of ranks\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < nranks; ++i) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use Kokkos to count in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(Kokkos::View const& image, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.size(), KOKKOS_LAMBDA (const size_t i) {\n ++bins(image(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const size_t& i, size_t& count) {\n count += Kokkos::count(image, i);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n bins[image[i]]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256),\n KOKKOS_LAMBDA(const int& i, size_t& update) {\n update += Kokkos::count(Kokkos::View(image.data(), image.size()), i);\n },\n bins);\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA (const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA(const int& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA(const int& i) {\n ++bins[image(i)];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy<>(0, 256), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy<>(0, image.extent(0)), KOKKOS_LAMBDA (const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = Kokkos::atomic_fetch_add(&bins[image[i]], 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixel_count\", Kokkos::RangePolicy<>(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixel_count\", Kokkos::RangePolicy<>(0, image.size()), KOKKOS_LAMBDA(const int& 
i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i, size_t& sum) {\n sum += Kokkos::count(image, i);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA (const int& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA(const int& i) {\n ++bins(image(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n bins[image[i]]++;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const size_t& i, size_t& count) {\n count += Kokkos::count(Kokkos::ViewConstRange(image, image.extent(0)), i);\n }, bins);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use OpenMP to count in parallel.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00268247202039, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0032392680645, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00247338730842, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00286876838654, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00261707678437, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00287693049759, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 
0.00256553441286, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00310281347483, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00255967117846, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00303031243384, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02573935259134, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 4 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00296635422856, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00318685341626, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00261790305376, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030575081706, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02566732075065, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 4 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00270365215838, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00304490216076, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00244812555611, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00289189275354, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + 
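[Editor's note, not part of the benchmark data] Most of the Kokkos pixelCounts outputs earlier in this listing either update bins from concurrent iterations without synchronization or try to express the histogram as a reduction that Kokkos does not provide in that form. A minimal race-free sketch is shown below; the view element types (const int* for image, size_t* of length 256 for bins) and the function name are assumptions, since the template arguments were stripped from the recorded prompt.

#include <Kokkos_Core.hpp>

// Sketch only: race-free histogram over a grayscale image.
// Assumed view types; 'pixelCountsAtomicSketch' is a placeholder name.
void pixelCountsAtomicSketch(Kokkos::View<const int*> const& image,
                             Kokkos::View<size_t*>& bins) {
  Kokkos::deep_copy(bins, size_t(0));            // zero all 256 bins first
  Kokkos::parallel_for("pixelCountsAtomicSketch", image.extent(0),
      KOKKOS_LAMBDA(const size_t i) {
        // Plain ++bins(image(i)) races when iterations run concurrently;
        // the atomic increment is safe on any backend.
        Kokkos::atomic_increment(&bins(image(i)));
      });
  Kokkos::fence();
}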
"best_sequential_runtime": 0.0023852692917, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00322434641421, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00298536624759, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00295769274235, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00287557765841, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00313345510513, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00288025233895, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00316810216755, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00302357021719, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00287189669907, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00268970578909, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00283301379532, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00276623014361, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00313080176711, + "num_threads": 1 + }, + { + "did_run": true, + 
"is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00275176875293, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00312981046736, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00255785156041, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00295854359865, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02593098506331, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 4 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00307526756078, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00318607464433, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00278650801629, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00308713018894, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "serial", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int pixel : image) {\n ++Counts[pixel];\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (auto const& pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n 
bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "hip", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use AMD HIP to count in parallel. The kernel is launched with at least N threads.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n__global__ void pixelCounts(const int *image, size_t N, size_t bins[256]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx.x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = image[idx];\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx >= N) return;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n ++bins[image[idx]];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = image[idx];\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) { bins[image[idx]]++; }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (size_t i = idx; i < N; i += stride) {\n atomicAdd(&bins[image[i]], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": 
"20_histogram_pixel_histogram", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use MPI to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. The result is stored in bins on rank 0.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array localBins{};\n for (int pixel : image) {\n localBins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = localBins;\n for (int i = 1; i < size; i++) {\n std::array recvBins;\n MPI_Recv(recvBins.data(), 256, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recvBins[j];\n }\n }\n } else {\n MPI_Send(localBins.data(), 256, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n 
MPI_Send(local_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n for (int pixel : image) {\n bins[pixel]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array localBins;\n for (size_t i = 0; i < image.size(); ++i) {\n localBins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(localBins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = rank; i < image.size(); i += size) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t chunk_size = image.size() / size;\n size_t start = rank * chunk_size;\n size_t end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = image.size();\n }\n std::array local_bins;\n local_bins.fill(0);\n for (size_t i = start; i < end; i++) {\n local_bins[image[i]]++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion 
of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t n = image.size();\n size_t local_bins[256] = {0};\n for (size_t i = rank; i < n; i += size) {\n local_bins[image[i]]++;\n }\n MPI_Reduce(local_bins, bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = rank; i < image.size(); i += size) {\n bins[image[i]]++;\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "cuda", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use CUDA to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n__global__ void pixelCounts(const int *image, size_t N, size_t bins[256]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n ++bins[image[idx]];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use MPI and OpenMP to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. 
The result is stored in bins on rank 0.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the results from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (size_t i = rank; i < image.size(); i += nprocs) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t const n = image.size();\n size_t const chunk = (n + size - 1) / size;\n size_t const start = rank * chunk;\n size_t const end = std::min(start + chunk, n);\n std::array localBins = {};\n for (size_t i = start; i < end; ++i) {\n ++localBins[image[i]];\n }\n MPI_Reduce(localBins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels with each intensity in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int i = rank; i < image.size(); i += size) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n size_t local_bins[256] = {0};\n #pragma omp parallel for\n for (size_t i = rank * image.size() 
/ nprocs; i < (rank + 1) * image.size() / nprocs; ++i) {\n ++local_bins[image[i]];\n }\n\n // Gather the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins, bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n\n // Reduce the bins from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to all zeros\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of pixels in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the bins across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in each bin on each rank.\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++local_bins[image[i]];\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ".", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n // Use OpenMP to count in parallel.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int i = rank; i < image.size(); i += nprocs) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the results from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image\n std::array local_bins;\n 
local_bins.fill(0);\n #pragma omp parallel for\n for (size_t i = rank * image.size() / nprocs; i < (rank + 1) * image.size() / nprocs; ++i) {\n ++local_bins[image[i]];\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use Kokkos to parallelize the search. Assume Kokkos has already been initialized.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n const size_t n = x.extent(0);\n size_t result = n;\n Kokkos::parallel_reduce(\n RangePolicy<>(0, n),\n KOKKOS_LAMBDA(const size_t i) {\n if (x(i) % 2 == 0) {\n result = std::min(result, i);\n }\n },\n result);\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag, int>;\n auto result = Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, int& result) {\n if (x(i) % 2 == 0) {\n result = i;\n }\n }, result, tag_type());\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even(i) = x(i) % 2 == 0;\n });\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even(i)) {\n firstEven = i;\n }\n }, firstEven);\n return firstEven;\n}\n", + "_", + "\n\n size_t result = x.size();\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(size_t i, Kokkos::Experimental::Automic& atom) {\n if (x(i) % 2 == 0) {\n atom.fetch_min(result, i);\n }\n });\n return result;\n}\n", + "\n\n size_t result = -1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n Kokkos::fence();\n }\n },\n result);\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even(i) = (x(i) % 2 == 0);\n });\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even(i)) {\n firstEven = i;\n }\n }, firstEven);\n return firstEven;\n}\n", + "\n\n size_t result = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n Kokkos::parallel_scan(Kokkos::RangePolicy(i + 1, x.size()), KOKKOS_LAMBDA(const size_t& j, size_t& result) {\n result = 0;\n });\n }\n });\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t result = n;\n Kokkos::parallel_for(\"findFirstEven\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) { result = i; }\n });\n return result;\n}\n", + "\n\n Kokkos::parallel_for(\"findFirstEven\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n Kokkos::atomic_fetch_add(&findFirstEven_result, 1);\n Kokkos::fence();\n if (findFirstEven_result == 1) {\n findFirstEven_index = i;\n Kokkos::fence();\n 
Kokkos::parallel_scan(\"findFirstEven_break\", x.extent(0),\n KOKKOS_LAMBDA(const size_t& i, const size_t& my_result) {\n if (findFirstEven_index == i) {\n return my_result + 1;\n }\n return my_result;\n },\n 0);\n Kokkos::fence();\n Kokkos::parallel_for(\"findFirstEven_break\", x.extent(0),\n KOKKOS_LAMBDA(const size_t& i) {\n if (findFirstEven_index == i) {\n Kokkos::fence();\n Kokkos::break_();\n }\n });\n }\n }\n });\n return findFirstEven_index;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t result = n;\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n\n return result;\n}\n", + "&", + "\n\n size_t N = x.extent(0);\n size_t result = N;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n size_t result = Kokkos::parallel_reduce(\n policy_type(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n },\n x.size());\n return result;\n}\n", + "\n\n size_t N = x.size();\n size_t result = N;\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even[i] = (x[i] % 2 == 0);\n });\n\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even[i] && i < firstEven) {\n firstEven = i;\n }\n }, firstEven);\n\n return firstEven;\n}\n", + "\n\n size_t result = Kokkos::parallel_reduce(\n x.extent(0), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n },\n result);\n return result;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n result = Kokkos::atomic_fetch_add(&result, 1);\n }\n },\n result);\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t firstEven = n;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n firstEven = i;\n return;\n }\n });\n return firstEven;\n}\n", + "\n\n size_t N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n Kokkos::atomic_fetch_max(&findFirstEven_result, i);\n }\n });\n return findFirstEven_result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use OpenMP to parallelize the search.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t 
result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00024867206812, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00116624664515, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053849741817, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00034279711545, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00028254762292, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00037526022643, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00044453088194, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054511185735, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n#pragma omp cancel for\n }\n }\n return result;\n}\n", + 
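[Editor's note] The findFirstEven outputs above that use break inside the parallel for fail to build (did_build: false) because OpenMP forbids jumping out of a worksharing loop. The outputs in this block that do validate instead keep the minimum index under a critical section, along the lines of the sketch below; the element type int is assumed, since the vector's template argument was stripped from the recorded prompt, and the function name is a placeholder.

#include <cstddef>
#include <vector>

// Sketch only: minimum even index guarded by a critical section.
size_t findFirstEvenCritical(std::vector<int> const& x) {
  size_t first = x.size();                       // "not found" sentinel
  #pragma omp parallel for
  for (std::size_t i = 0; i < x.size(); ++i) {
    if (x[i] % 2 == 0) {
      #pragma omp critical
      {
        if (i < first) first = i;                // keep the smallest even index
      }
    }
  }
  return first;
}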
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n size_t n = x.size();\n size_t result = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0 && i < result) {\n result = i;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00025353319943, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00130875483155, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t n = x.size();\n size_t first_even = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n return first_even;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00024860594422, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00105784088373, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00052788760513, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00035690777004, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00031114351004, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00037762857974, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00046149455011, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053225401789, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t firstEven = -1;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (firstEven == -1) {\n firstEven = i;\n }\n }\n }\n return firstEven;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00024836547673, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00106974542141, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + 
"is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t n = x.size();\n size_t firstEven = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0 && i < firstEven) {\n#pragma omp critical\n firstEven = i;\n }\n }\n return firstEven;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00024988800287, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010860402137, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053916443139, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 4 + } + ] + }, + { + "generated_output": "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00025171153247, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00109668392688, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00059300605208, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00030840858817, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00026717465371, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00038948711008, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00046032834798, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053742770106, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t n = x.size();\n size_t firstEven = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < firstEven) {\n firstEven = i;\n }\n }\n }\n return firstEven;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00025017913431, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00106064621359, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054422002286, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00041850730777, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00030568931252, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00037976261228, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00047546904534, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005189165473, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t n = x.size();\n size_t first_even = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < first_even) {\n first_even = 
i;\n }\n }\n }\n return first_even;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00024862419814, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00105898864567, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005451740697, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00031468030065, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00028507448733, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00038610268384, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00045978333801, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00267745889723, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t index = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < index) {\n index = i;\n }\n }\n }\n return index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00024929661304, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00109925083816, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054625198245, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00029350630939, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00025749672204, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00037691555917, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00044689290226, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053697470576, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t n = x.size();\n size_t result = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00024923309684, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00108502171934, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006996428594, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00028659105301, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00027744024992, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00037795938551, + "num_threads": 
16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00048157442361, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00218243543059, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "serial", + "prompt": "/* Return the index of the first even number in the vector x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "hip", + "prompt": "/* Find the index of the first even number in the vector x. Store it in firstEvenIndex.\n Use AMD HIP to parallelize the search. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\n__global__ void findFirstEven(const int *x, size_t N, size_t *firstEvenIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int myIndex = hipGetGlobalIndex();\n if (myIndex < N && x[myIndex] % 2 == 0) {\n *firstEvenIndex = myIndex;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N && x[i] % 2 == 0) {\n *firstEvenIndex = i;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int blockId = hipBlockIdx_x;\n int blockDim = hipBlockDim_x;\n int gridDim = hipGridDim_x;\n\n *firstEvenIndex = N;\n\n if (x[tid] % 2 == 0) {\n atomicMin(firstEvenIndex, tid);\n }\n\n __syncthreads();\n\n if (tid == 0) {\n if (*firstEvenIndex == N) {\n *firstEvenIndex = -1;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = block.thread.x;\n if (tid == 0) {\n *firstEvenIndex = N;\n }\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = min(*firstEvenIndex, tid);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use MPI to parallelize the search. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t n = x.size();\n size_t result = n;\n for (size_t i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, &result, 1, MPI::INT, MPI::MINLOC);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = -1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result = -1;\n MPI_Allgather(&result, 1, MPI_INT, &global_result, 1, MPI_INT, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n for (size_t i = 0; i < x.size(); ++i) {\n if (global_result != -1 && global_result <= i) {\n return global_result;\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return n;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t n = x.size();\n size_t firstEven = n;\n for (size_t i = rank; i < n; i += size) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &firstEven, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t result = -1;\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n size_t firstEven = x.size();\n for (size_t i = myRank; i < x.size(); i += numRanks) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = -1;\n for (size_t i = my_rank; i < x.size(); i += my_size) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t firstEven = -1;\n for (size_t i = rank; i < 
x.size(); i += size) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_result;\n } else {\n return x.size();\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_first_even = -1;\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n local_first_even = i;\n break;\n }\n }\n int global_first_even = -1;\n MPI_Allreduce(&local_first_even, &global_first_even, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_first_even;\n}\n", + "\n\n size_t result = x.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n size_t n = x.size();\n size_t firstEven = n;\n for (size_t i = myRank; i < n; i += MPI::procs()) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_MINLOC, MPI_INT);\n return firstEven;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank searches for the first even number in its portion of the vector.\n size_t my_first_even = x.size();\n for (size_t i = my_rank; i < x.size(); i += num_ranks) {\n if (x[i] % 2 == 0) {\n my_first_even = i;\n break;\n }\n }\n\n // Gather the results from all ranks.\n std::vector first_evens(num_ranks);\n MPI_Gather(&my_first_even, 1, MPI_UNSIGNED_LONG, first_evens.data(), 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Find the smallest index among the gathered results.\n size_t first_even = x.size();\n if (my_rank == 0) {\n for (size_t i = 0; i < num_ranks; ++i) {\n if (first_evens[i] < first_even) {\n first_even = first_evens[i];\n }\n }\n }\n\n return first_even;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t n = x.size();\n size_t local_first_even = n;\n for (size_t i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n local_first_even = i;\n break;\n }\n }\n size_t first_even;\n MPI_Allreduce(&local_first_even, &first_even, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return first_even;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_result;\n } else {\n return x.size();\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = x.size();\n for (size_t i = my_rank; i < x.size(); i += my_size) {\n if (x[i] % 
2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t local_min = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n local_min = i;\n break;\n }\n }\n size_t global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_UNSIGNED_LONG, MPI_MIN,\n MPI_COMM_WORLD);\n return global_min;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "cuda", + "prompt": "/* Find the index of the first even number in the vector x. Store it in firstEvenIndex.\n Use CUDA to parallelize the search. The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\n__global__ void findFirstEven(const int *x, size_t N, size_t *firstEvenIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "(", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "mpi+omp", + "prompt": "#include 
\n#include \n\n/* Return the index of the first even number in the vector x.\n Use MPI and OpenMP to parallelize the search. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n #pragma openmp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma openmp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n int result = -1;\n #pragma omp parallel for reduction(min:result)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (i < result) {\n result = i;\n }\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_INT, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_result = -1;\n int global_result = -1;\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (my_result == -1) {\n my_result = i;\n }\n }\n }\n }\n\n MPI_Allreduce(&my_result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int my_rank, n_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);\n\n size_t first_even = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n }\n\n if (my_rank == 0) {\n MPI_Reduce(&first_even, &first_even, 1, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(&first_even, nullptr, 1, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n\n return first_even;\n}\n", + "\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int my_result = -1;\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n my_result = i;\n break;\n }\n }\n if (my_result != -1) {\n #pragma omp critical\n {\n if 
(result == -1 || my_result < result) {\n result = my_result;\n }\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t result = -1;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (result == -1) {\n result = i;\n }\n }\n break;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "_", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n #pragma omp cancel for\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n size_t first_even = x.size();\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n first_even = i;\n break;\n }\n }\n }\n\n MPI_Allreduce(&first_even, &first_even, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return first_even;\n } else {\n return x.size();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t firstEven = -1;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (firstEven == -1) {\n firstEven = i;\n }\n }\n }\n\n if (rank == 0) {\n MPI_Bcast(&firstEven, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel\n {\n size_t local_result = x.size();\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n local_result = i;\n break;\n }\n }\n #pragma omp critical\n result = std::min(result, local_result);\n }\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = x.size();\n #pragma omp parallel for reduction(min : result)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n\n if (my_rank == 0) {\n return result;\n } else {\n return x.size();\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": 
"cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use Kokkos to search in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(Kokkos::View const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto contains_lambda = KOKKOS_LAMBDA(const int& v) { return v == target; };\n return any(policy_t(x.extent(0)), contains_lambda, x);\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n auto contains = Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) {\n contains |= (x(i) == target);\n },\n false);\n return contains;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto contains_target = Kokkos::scan(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, const bool& contains_target) {\n return contains_target || (x(i) == target);\n },\n false, tag_type());\n\n return contains_target;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, false);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n Kokkos::Scan(policy_type(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& result) {\n if (x(i) == target) { result = 1; }\n }, tag_type());\n\n return Kokkos::Max(Kokkos::View(1)) != 0;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n auto contains_lambda = KOKKOS_LAMBDA(const int& value) { return value == target; };\n return 
Kokkos::any(policy_type(0, x.extent(0)), contains_lambda, x);\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto result = Kokkos::parallel_reduce(\n policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n },\n false, tag_type());\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto contains = Kokkos::scan(policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, const bool& contains_so_far) {\n return contains_so_far || (x(i) == target);\n }, false, tag_type());\n\n return contains;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(\n x.extent(0), KOKKOS_LAMBDA(const int& i, bool& found) {\n if (x(i) == target) { found = true; }\n },\n found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use OpenMP to search in parallel.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002403806895, + "runs": [ + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.00122363511473, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00062425266951, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00039373356849, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00029659233987, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003822112456, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00044797677547, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00060360021889, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002387188375, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00119231995195, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00061554666609, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003616405651, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00028269775212, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00038385074586, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00048834085464, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00061569288373, + "num_threads": 64 + } + ] + }, + { + 
"generated_output": "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool found = false;\n #pragma omp parallel for shared(found)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n #pragma omp critical\n found = true;\n }\n }\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00023706182837, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00148069262505, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00140994787216, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00140498206019, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00271753948182, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00322395320982, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00356007199734, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.004706745781, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00023662112653, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00124303791672, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006287811324, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00034420769662, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00028439462185, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00040619298816, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00045195650309, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00063168667257, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00037806741893, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00127254463732, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00065281242132, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00040043201298, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00029358845204, + "num_threads": 8 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.00039480105042, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00047746859491, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00062363799661, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00037814937532, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00127479117364, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00065364409238, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0003372149542, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00029294937849, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00036799479276, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00044104587287, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00062251966447, + 
"num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "serial", + "prompt": "/* Return true if the vector x contains the value `target`. Return false otherwise.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "<", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "&", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "hip", + "prompt": "/* Set `found` to true if the vector x contains the value `target`. Set it to false otherwise.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\n__global__ void contains(const int *x, size_t N, int target, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + ", 8, 2, 6, 4, 6", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int offset = bid * hipBlockDim_x + tid;\n if (offset < N && x[offset] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n bool found = false;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (found) {\n return true;\n }\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n found = true;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (found) {\n return true;\n }\n }\n }\n return false;\n}\n", + "\n\n int result = 0;\n for (int i = 0; i < x.size(); i++) {\n result |= (x[i] == target);\n }\n int global_result;\n MPI_Allreduce(&result, 1, MPI_INT, &global_result, 1, MPI_INT, MPI_BOR, MPI_COMM_WORLD);\n if (global_result) {\n return true;\n }\n return false;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int has_target = (std::find(x.begin(), x.end(), target) != x.end());\n int global_has_target;\n MPI_Allreduce(&has_target, &global_has_target, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n return global_has_target;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return global_found;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_contains = 0;\n for (int i = rank; i < n; i += size) {\n if (x[i] == target) {\n local_contains = 
1;\n break;\n }\n }\n\n int global_contains;\n MPI_Allreduce(&local_contains, &global_contains, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_contains;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n bool found = false;\n for (int i = rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n MPI::COMM_WORLD.Allreduce(MPI::IN_PLACE, &found, 1, MPI::BOOL, MPI::LOR);\n return found;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n bool local_contains = false;\n for (int i = start; i < end; i++) {\n if (x[i] == target) {\n local_contains = true;\n break;\n }\n }\n\n bool global_contains;\n MPI_Allreduce(&local_contains, &global_contains, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_contains;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n bool found = false;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool found = std::find(x.begin(), x.end(), target) != x.end();\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "cuda", + "prompt": "/* Set `found` to true if the vector x contains the value `target`. Set it to false otherwise.\n Use CUDA to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\n__global__ void contains(const int *x, size_t N, int target, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n found[0] = true;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n int local_result = 0;\n #pragma omp parallel for reduction(+:local_result)\n for (int i = rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n local_result = 1;\n }\n }\n int global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_result > 0;\n }\n return false;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "(", + "\n\n int rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel shared(result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n result = true;\n break;\n }\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n MPI_Reduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, 
MPI_COMM_WORLD);\n if (rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int found = 0;\n #pragma omp parallel for reduction(+:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = 1;\n break;\n }\n }\n int result;\n MPI_Allreduce(&found, &result, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int found = 0;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = 1;\n }\n }\n int result;\n MPI_Allreduce(&found, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n bool found = false;\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n }\n }\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_result = 0;\n #pragma omp parallel for reduction(||:local_result)\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n local_result = 1;\n }\n }\n int result;\n MPI_Reduce(&local_result, &result, 1, MPI_INT, MPI_LOR, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_size = n / 
MPI_COMM_SIZE;\n int start = my_rank * local_size;\n int end = start + local_size;\n bool found = false;\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_found;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use Kokkos to search in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(Kokkos::View const& x, Kokkos::View const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View found_x(\"found_x\", x.extent(0));\n Kokkos::View found_y(\"found_y\", y.extent(0));\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n found_x(i) = (x(i) == val);\n });\n\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(const int& i) {\n found_y(i) = (y(i) == val);\n });\n\n Kokkos::View found_both(\"found_both\", 1);\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& found_both) {\n found_both |= (found_x(i) && found_y(i));\n });\n\n return !found_both(0);\n}\n", + "\n\n return Kok \u86cb\u7cd5::parallel_reduce(Kok\u86cb\u7cd5::make_range(x.size()),\n Kok\u86cb\u7cd5::plus(),\n [x, y, val](int i) { return (x[i] == val) ^ (y[i] == val); },\n 0);\n}\n", + "\n\n // Create a Kokkos::View of the union of the two input vectors\n Kokkos::View union_xy(\"union_xy\", x.extent(0) + y.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(int i) {\n union_xy(i) = x(i);\n });\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(int i) {\n union_xy(x.extent(0) + i) = y(i);\n });\n\n // Create a Kokkos::View of the intersection of the two input vectors\n Kokkos::View intersection_xy(\"intersection_xy\", x.extent(0) + y.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(int i) {\n for (int j = 0; j < y.extent(0); j++) {\n if (x(i) == y(j)) {\n intersection_xy(i) = x(i);\n break;\n }\n }\n });\n\n // Check if the value is in the union but not in the intersection\n bool found_in_union = false;\n bool found_in_intersection = false;\n Kokkos::parallel_for(union_xy.extent(0), KOKKOS_LAMBDA(int i) {\n if (union_xy(i) == val) {\n found_in_union = true;\n }\n if (intersection_xy(i) == val) {\n found_in_intersection = true;\n }\n });\n\n return found_in_union && !found_in_intersection;\n}\n", + "\n\n using namespace Kokkos;\n const int N = x.extent(0);\n const int M = y.extent(0);\n\n auto lambda = KOKKOS_LAMBDA(const int i) {\n bool found_x = false;\n bool found_y = false;\n for (int j = 0; j < N; ++j) {\n if (x(j) == val) { found_x = true; }\n }\n for (int j = 0; j < M; ++j) {\n if (y(j) == val) { found_y = true; }\n }\n return found_x != found_y;\n };\n\n bool result = parallel_reduce(N, lambda, (bool)false);\n return result;\n}\n", + "\n\n using namespace Kokkos;\n\n auto lambda = KOKKOS_LAMBDA(int i) {\n bool found_x = false;\n bool found_y = false;\n for (int j = 0; j < x.extent(0); ++j) {\n if (x(j) == val) {\n found_x = true;\n }\n }\n for (int j = 0; j < y.extent(0); ++j) {\n if (y(j) == val) 
{\n found_y = true;\n }\n }\n return found_x != found_y;\n };\n\n return parallel_reduce(x.extent(0), lambda, (bool)false);\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = find(x, val) != x.end();\n auto y_contains = find(y, val) != y.end();\n return x_contains ^ y_contains;\n}\n", + "\n\n // Create a Kokkos::View of the result\n Kokkos::View result(\"result\", x.extent(0));\n\n // Use Kokkos to search for `val` in `x` and `y` in parallel\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n result(i) = (x(i) == val) ^ (y(i) == val);\n });\n\n // Reduce the result to a single boolean value\n bool xorContains = Kokkos::reduce(result, Kokkos::And(), true);\n\n return xorContains;\n}\n", + "\n\n Kokkos::View result(\"result\", x.size());\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int i) {\n result[i] = (x[i] == val) ^ (y[i] == val);\n });\n return Kokkos::any(result);\n}\n", + "\n bool found_x = false;\n bool found_y = false;\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (x(i) == val) {\n found_x = true;\n }\n });\n\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (y(i) == val) {\n found_y = true;\n }\n });\n\n return found_x != found_y;\n}\n", + "\n\n Kokkos::View result(\"result\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(int i) {\n result[i] = (x[i] == val) ^ (y[i] == val);\n });\n return Kokkos::any(result);\n}\n", + "\n\n Kokkos::View found_x(\"found_x\", x.extent(0));\n Kokkos::View found_y(\"found_y\", y.extent(0));\n\n Kokkos::parallel_for(\"xorContains_x\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n found_x(i) = (x(i) == val);\n });\n\n Kokkos::parallel_for(\"xorContains_y\", y.extent(0), KOKKOS_LAMBDA(const int i) {\n found_y(i) = (y(i) == val);\n });\n\n bool found_in_x = false;\n bool found_in_y = false;\n\n Kokkos::parallel_reduce(\"xorContains_reduce\", x.extent(0), KOKKOS_LAMBDA(const int i, bool& update) {\n update = update || found_x(i);\n }, found_in_x);\n\n Kokkos::parallel_reduce(\"xorContains_reduce\", y.extent(0), KOKKOS_LAMBDA(const int i, bool& update) {\n update = update || found_y(i);\n }, found_in_y);\n\n return (found_in_x != found_in_y);\n}\n", + "\n\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = Kokkos::parallel_reduce(\n policy_t(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) { contains |= (x(i) == val); },\n false);\n auto y_contains = Kokkos::parallel_reduce(\n policy_t(0, y.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) { contains |= (y(i) == val); },\n false);\n return x_contains ^ y_contains;\n}\n", + "\n\n using namespace Kokkos;\n\n auto lambda = KOKKOS_LAMBDA(const int& x_i, const int& y_i) {\n return (x_i == val) ^ (y_i == val);\n };\n\n auto result = reduce(x.extent(0), lambda, (bool)false);\n\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n auto x_contains = find(x, val, policy_type(0, x.extent(0)));\n auto y_contains = find(y, val, policy_type(0, y.extent(0)));\n return (x_contains != x.extent(0)) ^ (y_contains != y.extent(0));\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& contains) {\n contains |= (x(i) == val);\n }, (int)0);\n\n auto y_contains = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, y.extent(0)), KOKKOS_LAMBDA(int i, int& contains) {\n contains |= (y(i) == val);\n }, (int)0);\n\n return 
(x_contains && !y_contains) || (!x_contains && y_contains);\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int sum) {\n return sum + (x(i) == val);\n }, Kokkos::Sum(0));\n\n auto y_contains = Kokkos::parallel_reduce(y.extent(0), KOKKOS_LAMBDA(int i, int sum) {\n return sum + (y(i) == val);\n }, Kokkos::Sum(0));\n\n return x_contains != y_contains;\n}\n", + "\n\n // Create a Kokkos::View to store the results of the search\n Kokkos::View result(\"result\", x.extent(0));\n\n // Perform the search in parallel\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i) {\n result(i) = (x(i) == val) ^ (y(i) == val);\n });\n\n // Reduce the results to a single boolean value\n bool found = false;\n Kokkos::parallel_reduce(\"xorContains_reduce\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i, bool& found) {\n found |= result(i);\n }, found);\n\n return found;\n}\n", + "\n\n // Create a Kokkos view for the result\n Kokkos::View result(\"result\", x.extent(0));\n\n // Parallel for loop to check if val is in only one of x or y\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n bool found_in_x = false;\n bool found_in_y = false;\n for (int j = 0; j < x.extent(0); ++j) {\n if (x(j) == val) {\n found_in_x = true;\n }\n if (y(j) == val) {\n found_in_y = true;\n }\n }\n result(i) = (found_in_x != found_in_y);\n });\n\n // Reduce the result to a single boolean value\n bool final_result = Kokkos::reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(bool a, bool b) { return a || b; }, result);\n\n return final_result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = find(x, val, policy_t(0, x.extent(0)));\n auto y_contains = find(y, val, policy_t(0, y.extent(0)));\n return (x_contains != x.extent(0)) ^ (y_contains != y.extent(0));\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& contains) {\n if (x(i) == val) { contains = 1; }\n }, Kokkos::Scan::plus());\n\n auto y_contains = Kokkos::parallel_reduce(y.extent(0), KOKKOS_LAMBDA(int i, int& contains) {\n if (y(i) == val) { contains = 1; }\n }, Kokkos::Scan::plus());\n\n return (x_contains == 1) ^ (y_contains == 1);\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use OpenMP to search in parallel.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.46554783e-05, + "runs": [ + { 
+ "did_run": true, + "is_valid": true, + "runtime": 0.00270898398012, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00137419011444, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00079645048827, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00061575099826, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00079876799136, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00091771129519, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00124816317111, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n #pragma omp critical\n {\n found_x = true;\n }\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n #pragma omp critical\n {\n found_y = true;\n }\n }\n }\n }\n return found_x != found_y;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.671966165e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0023548796773, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011968145147, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00074580982327, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00063747558743, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00077366419137, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00095091741532, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00125670898706, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n #pragma omp critical\n found_x = true;\n }\n }\n\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n #pragma omp critical\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.589395106e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00254257135093, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013287646696, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00089059602469, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00064326431602, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00079202856869, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00089251417667, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012735048309, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 
val) {\n #pragma omp critical\n {\n found_x = true;\n }\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n #pragma omp critical\n {\n found_y = true;\n }\n }\n }\n }\n return found_x != found_y;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.47620216e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00240682568401, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00132267139852, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00072278436273, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00064419209957, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00078929495066, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00091490373015, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00132170300931, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.635495573e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00268099196255, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00143840126693, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00088741611689, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00060217566788, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00078766029328, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00096924640238, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00128921810538, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.419857144e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00273361690342, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00142349079251, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00076916366816, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00059476513416, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00077981557697, + "num_threads": 16 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.00090115983039, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00131358224899, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n return x_count != y_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return (x_count + y_count) == 1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.501422375e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00113154947758, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00063052847981, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054837986827, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00055147986859, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00076083540916, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00086546093225, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00130073595792, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.552142203e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00268071554601, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00143587011844, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00076420363039, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00064578764141, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00078089367598, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00091833248734, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00130474101752, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n bool found_x = 
false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.496206969e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00268887225538, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00140393637121, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00079961493611, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00059437081218, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00078304558992, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00093472693115, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00123729053885, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.362133771e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00271997731179, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00139091387391, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00080395713449, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.000637373887, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00080307554454, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00097256824374, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00127513594925, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.421682537e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00272520054132, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00133498068899, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00077018346637, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00070648603141, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.00077268257737, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00095620173961, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00130295585841, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n bool in_x = false;\n bool in_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n in_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n in_y = true;\n }\n }\n }\n\n return in_x ^ in_y;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.497585326e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00273624360561, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00139449052513, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00094220954925, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00057181827724, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00077590998262, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00095716360956, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00126342959702, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.452546567e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00269629061222, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00149090606719, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00077412258834, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00061679258943, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00076016429812, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00091543272138, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00129833128303, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return x_count != y_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n bool 
found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n } else {\n found = false;\n #pragma omp cancel for\n }\n }\n }\n }\n #pragma omp parallel for\n for (size_t i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n } else {\n found = false;\n #pragma omp cancel for\n }\n }\n }\n }\n return found;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (size_t i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return x_count != y_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n return x_count != y_count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.505222172e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00273069925606, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00145240388811, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00087703056633, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00060793794692, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00079517252743, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00094809252769, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00123648699373, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); 
++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.491606236e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00264477599412, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00140480771661, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007518529892, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00058573670685, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00075740795583, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00087325535715, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00121210664511, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "serial", + "prompt": "/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) !=\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) !=\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n bool in_x = false;\n bool in_y = false;\n for 
(int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n in_x = true;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n in_y = true;\n }\n }\n return in_x != in_y;\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "hip", + "prompt": "/* Set `found` to true if `val` is only in one of vectors x or y.\n Set it to false if it is in both or neither.\n Use AMD HIP to search in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\n__global__ void xorContains(const int *x, const int *y, size_t N, int val, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = tid + block * blockDim;\n int end = min(start + blockDim, N);\n bool found_x = false;\n bool found_y = false;\n for (int i = start; i < end; i++) {\n found_x |= (x[i] == val);\n found_y |= (y[i] == val);\n }\n found[tid] = found_x ^ found_y;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n *found = found_x ^ found_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipGridSize_x) {\n if (x[i] == val) found_x = true;\n if (y[i] == val) found_y = true;\n }\n found[tid] = found_x ^ found_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool x_contains = x[tid] == val;\n bool y_contains = y[tid] == val;\n found[tid] = x_contains ^ y_contains;\n }\n}\n", + ".", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool x_has = (x[tid] == val);\n bool y_has = (y[tid] == val);\n found[tid] = (x_has ^ y_has);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int block_size = hipBlockDim_x;\n int block_id = hipBlockIdx_x;\n int i = block_id * block_size + tid;\n\n if (i < N) {\n bool x_contains = x[i] == val;\n bool y_contains = y[i] == val;\n found[i] = x_contains ^ y_contains;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n *found = inX ^ inY;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n bool found_x = (x[i] == val);\n bool found_y = (y[i] == val);\n found[i] = (found_x != found_y);\n 
}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool found_x = (x[tid] == val);\n bool found_y = (y[tid] == val);\n found[tid] = (found_x ^ found_y);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int idx = tid + N;\n if (idx < 2 * N) {\n *found = (x[tid] == val) || (y[idx - N] == val);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n found[tid] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n *found = (x[gid] == val) ^ (y[gid] == val);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = (x_has_val != y_has_val);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool found_x = (x[tid] == val);\n bool found_y = (y[tid] == val);\n found[tid] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = (in_x != in_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n *found = (x[gid] == val) ^ (y[gid] == val);\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Return the result on rank 0.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int myRank, nRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &nRanks);\n\n bool xContains = std::find(x.begin(), x.end(), val) != x.end();\n bool yContains = std::find(y.begin(), y.end(), val) != y.end();\n\n int xContainsInt = xContains ? 1 : 0;\n int yContainsInt = yContains ? 
1 : 0;\n\n int xorContainsInt = xContainsInt ^ yContainsInt;\n\n int globalXorContainsInt;\n MPI_Reduce(&xorContainsInt, &globalXorContainsInt, 1, MPI_INT, MPI_LOR, 0, MPI_COMM_WORLD);\n\n bool globalXorContains = globalXorContainsInt != 0;\n\n if (myRank == 0) {\n return globalXorContains;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int sum;\n MPI_Allreduce(&x_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return sum == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int total_count;\n MPI_Allreduce(&x_count, &total_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &total_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return total_count == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_contains = std::find(x.begin(), x.end(), val) != x.end();\n int y_contains = std::find(y.begin(), y.end(), val) != y.end();\n\n int result = x_contains ^ y_contains;\n 
MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int sum;\n MPI_Allreduce(&x_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return sum == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n\n int global_x_count;\n int global_y_count;\n\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) x_count++;\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) y_count++;\n }\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (global_x_count == 0 && global_y_count > 0) return true;\n if (global_x_count > 0 && global_y_count == 0) return true;\n return false;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) x_count++;\n if (y[i] == val) y_count++;\n }\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n\n int x_count_sum;\n int y_count_sum;\n\n MPI_Allreduce(&x_count, &x_count_sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &y_count_sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (x_count_sum + y_count_sum) % 2 == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int global_x_count;\n int global_y_count;\n\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (global_x_count == 1 || global_y_count == 1);\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, 
&global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "cuda", + "prompt": "/* Set `found` to true if `val` is only in one of vectors x or y.\n Set it to false if it is in both or neither.\n Use CUDA to search in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\n__global__ void xorContains(const int *x, const int *y, size_t N, int val, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool x_has = false;\n bool y_has = false;\n for (int i = 0; i < N; i++) {\n if (x[i] == val) x_has = true;\n if (y[i] == val) y_has = true;\n }\n found[tid] = x_has != y_has;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *found = (*found) ^ (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n found[tid] = (inX ^ inY);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n *found = *found || (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *found = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = x_has_val ^ y_has_val;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + 
"=cernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecerne,cernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecerne)cerne)cerne)cerne)", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = x_has_val ^ y_has_val;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = (in_x ^ in_y);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = (x_has_val != y_has_val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = 0; i < N; i++) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n found[tid] = (found_x != found_y);\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Return the result on rank 0.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int xor_count = x_count + y_count;\n\n int result;\n MPI_Allreduce(&xor_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (global_x_count == 1 || global_y_count == 1) {\n return true;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count;\n int global_y_count;\n MPI_Reduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_x_count != global_y_count;\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int result;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < n; i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int j = 0; j < m; j++) {\n if (y[j] == val) {\n y_count++;\n }\n }\n }\n\n int result = x_count + y_count;\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return result == 1;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel for reduction(+:x_count)\n for (int i = 0; i < n; i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = 0; i < m; i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result = x_count + y_count;\n bool ans = (result == 1);\n\n 
MPI_Reduce(&ans, &ans, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n return ans;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int local_x_count = 0;\n int local_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n local_x_count++;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n local_y_count++;\n }\n }\n #pragma omp critical\n {\n x_count += local_x_count;\n y_count += local_y_count;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(MPI_IN_PLACE, &y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return (x_count == 1 || y_count == 1);\n }\n return false;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n#pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int nt = omp_get_num_threads();\n for (int i = tid; i < n; i += nt) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = tid; i < m; i += nt) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int x_count_global;\n int y_count_global;\n MPI_Allreduce(&x_count, &x_count_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &y_count_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (x_count_global == 1 || y_count_global == 1) {\n return true;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int n_threads = omp_get_num_threads();\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel for\n for (int i = rank; i < n; i += n_threads) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found_x, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n MPI_Allreduce(&found_y, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank searches for `val` in its portion of x and y.\n int my_start = my_rank * n / nprocs;\n int my_end = (my_rank + 1) * n / nprocs;\n bool found_in_x = false;\n bool found_in_y = false;\n #pragma omp parallel for\n for (int i = my_start; i < my_end; i++) {\n if (x[i] == val) {\n found_in_x = true;\n }\n if (y[i] == val) {\n found_in_y = true;\n }\n }\n\n // Reduce the results from all ranks to rank 0.\n bool result;\n MPI_Reduce(&found_in_x, &result, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n MPI_Reduce(&found_in_y, &result, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n\n // Rank 0 returns the final result.\n if (my_rank == 0) {\n return result != (found_in_x && found_in_y);\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n ++x_count;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n ++y_count;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n 
x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int count_x = 0;\n int count_y = 0;\n\n #pragma omp parallel for reduction(+:count_x)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n count_x++;\n }\n }\n\n #pragma omp parallel for reduction(+:count_y)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n count_y++;\n }\n }\n\n int result = count_x + count_y;\n bool ans = (result == 1);\n\n MPI_Allreduce(&ans, &ans, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n return ans;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n #pragma omp parallel\n {\n int my_x_count = 0, my_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) my_x_count++;\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) my_y_count++;\n }\n #pragma omp critical\n {\n x_count += my_x_count;\n y_count += my_y_count;\n }\n }\n int result;\n MPI_Reduce(&x_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (result == 1) return true;\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result = x_count + y_count;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result == 1;\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel reduction(+:x_count, y_count)\n {\n int tid = omp_get_thread_num();\n int nthreads = omp_get_num_threads();\n for (int i = tid; i < x.size(); i += nthreads) {\n if (x[i] == val) {\n x_count++;\n }\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count, global_y_count;\n MPI_Reduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return (global_x_count == 1 || global_y_count == 1);\n }\n return false;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n bool result = false;\n MPI_Allreduce(&found_x, &result, 1, MPI_C_BOOL, MPI_BXOR, MPI_COMM_WORLD);\n MPI_Allreduce(&found_y, &result, 1, MPI_C_BOOL, MPI_BXOR, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return result;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel for 
reduction(+:x_count)\n for (int i = rank; i < x.size(); i += nprocs) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = rank; i < y.size(); i += nprocs) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result;\n MPI_Reduce(&x_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result == 1;\n }\n return false;\n}\n", + "\n\n int x_contains = 0;\n int y_contains = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_contains)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_contains = 1;\n }\n }\n\n #pragma omp for reduction(+:y_contains)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_contains = 1;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_contains, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n MPI_Allreduce(&y_contains, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int local_x_count = 0;\n int local_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n local_x_count++;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n local_y_count++;\n }\n }\n #pragma omp critical\n {\n x_count += local_x_count;\n y_count += local_y_count;\n }\n }\n if (my_rank == 0) {\n return x_count != y_count;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Book {\n const char *title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use Kokkos to search in parallel. 
Assume Kokkos is already initialized.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(Kokkos::View const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using policy_t = Kokkos::RangePolicy;\n using range_t = Kokkos::Range;\n\n size_t result = books.size();\n Kokkos::parallel_reduce(range_t(0, books.size()), KOKKOS_LAMBDA(size_t i, size_t& result) {\n if (books(i)->pages < 100) {\n result = i;\n }\n });\n return result;\n}\n", + "\n\n size_t last_short_book_index = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_short_book_index = books.size() - 1;\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& last_short_book_index) {\n if (books[i]->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_for(books.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t result = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& result) {\n if (books(i)->pages < 100) {\n result = i;\n }\n });\n return result;\n}\n", + "\n\n size_t last_short_book = books.size() - 1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book) {\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n });\n return last_short_book;\n}\n", + "\n\n Kokkos::View lastShortBook(\"lastShortBook\", books.extent(0));\n Kokkos::parallel_for(\"findLastShortBook\", books.extent(0), KOKKOS_LAMBDA(const size_t i) {\n if (books(i).pages < 100) {\n lastShortBook(i) = i;\n }\n });\n size_t result = books.extent(0);\n for (size_t i = books.extent(0) - 1; i >= 0; i--) {\n if (lastShortBook(i) != 0) {\n result = lastShortBook(i);\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t last_short_book = books.size();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& last) {\n if (books[i].pages < 100) {\n last = i;\n }\n }, last_short_book);\n return last_short_book;\n}\n", + "\n\n size_t last_idx = books.size();\n Kokkos::parallel_for(books.size(), KOKKOS_LAMBDA(const size_t& i) {\n if (books[i]->pages < 100) {\n last_idx = i;\n }\n });\n return last_idx;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const int i, const size_t& last) {\n if (books(i).pages < 100) {\n return i;\n }\n return last;\n }, 0);\n}\n", + "\n\n size_t last_index = books.size() - 1;\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books(i).pages >= 100) {\n last_index = i - 1;\n }\n }\n return last_index;\n}\n", + "\n\n size_t last_short_book = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last) {\n if (books(i)->pages < 100) {\n last = i;\n }\n }, last_short_book);\n return last_short_book;\n}\n", + "\n\n size_t last_short_book = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& last_short_book) 
{\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n });\n return last_short_book;\n}\n", + "\n\n size_t last_short_book = books.size();\n\n Kokkos::parallel_reduce(books.extent(0), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book) {\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n }, last_short_book);\n\n return last_short_book;\n}\n", + "\n\n size_t lastShortBook = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& lastShortBook) {\n if (books[i]->pages < 100) {\n lastShortBook = i;\n }\n });\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBook = books.size() - 1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& lastShortBook) {\n if (books(i)->pages < 100) {\n lastShortBook = i;\n }\n });\n return lastShortBook;\n}\n", + "\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_index) {\n if (books(i)->pages < 100) {\n last_index = i;\n }\n });\n return last_index;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n using l_type = long;\n\n auto last_short_book = Kokkos::parallel_scan(\n policy_type(0, books.size()),\n KOKKOS_LAMBDA(const size_t& i, l_type& last) {\n if (books[i]->pages < 100) {\n last = i;\n }\n },\n last_short_book);\n\n return last_short_book;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_scan(policy_type(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n }, tag_type());\n\n return last_short_book_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use OpenMP to search in parallel.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00371935423464, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00541171301156, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00365782417357, + "num_threads": 2 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.00319920517504, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0036552330479, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00277190469205, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00263585653156, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00324710514396, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00355571173131, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00569860059768, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0035167362541, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00572201907635, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00340273845941, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 4 + } + ] + }, + { + "generated_output": "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00351908709854, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00569394156337, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t last_short_book_index = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n last_short_book_index = i;\n }\n }\n return last_short_book_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00355717279017, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00573497954756, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t last_short = books.size();\n #pragma omp parallel for reduction(max:last_short)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n last_short = i;\n }\n }\n return last_short;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + 
"did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n size_t lastShortBookIndex = 0;\n#pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00356697458774, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00497756786644, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00326190851629, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00289734452963, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0034204851836, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00255143214017, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00254486333579, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0031978962943, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00368796847761, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00590834356844, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00379025768489, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00519309528172, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00336849372834, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00337118282914, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00350324846804, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00269739441574, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00250470396131, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00325564295053, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "source_write_success": true, 
+ "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00347848460078, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00575140994042, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00334719140083, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 4 + } + ] + }, + { + "generated_output": "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00355874765664, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00502105988562, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00315961018205, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00294214319438, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00330869909376, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00268689673394, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00246251877397, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00312272328883, + "num_threads": 64 + } + ] + }, + { + "generated_output": "_", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t last_idx = books.size() - 1;\n#pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n last_idx = i - 1;\n break;\n }\n }\n return last_idx;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t last_short_book_index = books.size() - 1;\n#pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages >= 100) {\n#pragma omp critical\n last_short_book_index = i - 1;\n }\n }\n return last_short_book_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00467594191432, + "runs": [ + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.00623401254416, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00460396837443, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00695517156273, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00462419297546, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00621823426336, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00453075077385, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00706823803484, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00424236804247, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 4 + } + ] + }, + { + "generated_output": "\n\n size_t lastShortBookIndex = 0;\n#pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00412176083773, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00625923387706, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00429014638066, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00378242470324, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00392135828733, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00295211542398, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00291179995984, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00337925571948, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t lastShortBook = 0;\n #pragma omp 
parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00347863025963, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00597861595452, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "serial", + "prompt": "struct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) 
{\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "hip", + "prompt": "struct Book {\n const char* title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Store the result in lastShortBookIndex.\n Use AMD HIP to search in parallel. The kernel is launched with one thread for every book element.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\n__global__ void findLastShortBook(const Book *books, size_t N, size_t *lastShortBookIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n 
*lastShortBookIndex = tid;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use MPI to search in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of books. Return the result on rank 0.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return lastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "_", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = -1;\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return lastShortBook;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n lastShortBook = i - 1;\n break;\n }\n }\n\n size_t globalLastShortBook;\n MPI_Allreduce(&lastShortBook, &globalLastShortBook, 1, MPI_UNSIGNED_LONG, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalLastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n for (size_t i = result + 1; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n result = i - 1;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = 0;\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n 
}\n }\n int globalResult;\n MPI_Allreduce(&result, &globalResult, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalResult;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n if (result == books.size()) {\n result = -1;\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "cuda", + "prompt": "struct Book {\n const char* title;\n int pages;\n};\n\n/* Find the index of the last Book item in the vector books where Book.pages is less than 100.\n Store the result in lastShortBookIndex.\n Use CUDA to search in parallel. The kernel is launched with one thread for every book element.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\n__global__ void findLastShortBook(const Book *books, size_t N, size_t *lastShortBookIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n if (books[idx].pages < 100) {\n *lastShortBookIndex = idx;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid 
< N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid >= N) return;\n\tif (books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx >= N) {\n return;\n }\n if (books[idx].pages < 100) {\n *lastShortBookIndex = idx;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use MPI and OpenMP to search in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of books. Return the result on rank 0.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = 0;\n #pragma omp parallel for reduction(max:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = 0;\n #pragma omp parallel for reduction(max:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n if (rank == 0) {\n return lastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if 
(books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n result = i - 1;\n }\n }\n return result;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_UNSIGNED_LONG, MPI_MAX, MPI_COMM_WORLD);\n return lastShortBook;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n size_t local_result = books.size();\n\n #pragma omp parallel reduction(min: local_result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n local_result = i;\n }\n }\n\n MPI_Allreduce(&local_result, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min: result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n int myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n size_t result = books.size();\n #pragma omp parallel for reduction(min: result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n if (myRank == 0) {\n return result;\n }\n return books.size();\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n size_t rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n\n if (rank == 0) {\n return result;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "_", + "\n\n size_t result = books.size() - 1;\n size_t myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n #pragma openmp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n MPI_Allgather(&result, 1, MPI_UNSIGNED_LONG, &result, 1, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n if (myRank == 0) {\n result = books.size() - 1;\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the index of the value 
in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use Kokkos to search in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n auto min_distance = Kokkos::Max();\n size_t min_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n },\n min_distance, min_index);\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n size_t N = x.extent(0);\n size_t closest_idx = 0;\n double closest_diff = std::abs(x(0) - M_PI);\n Kokkos::parallel_for(\n policy_type(0, N), KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n });\n return closest_idx;\n}\n", + "\n\n const double pi = M_PI;\n Kokkos::View closest_distance = Kokkos::View(1);\n Kokkos::View closest_index = Kokkos::View(1);\n closest_distance[0] = std::numeric_limits::max();\n closest_index[0] = 0;\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double distance = std::abs(x[i] - pi);\n if (distance < closest_distance[0]) {\n closest_distance[0] = distance;\n closest_index[0] = i;\n }\n });\n\n return closest_index[0];\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n size_t closest_index = 0;\n double closest_distance = std::abs(x(0) - M_PI);\n Kokkos::parallel_reduce(policy_t(0, x.size()), KOKKOS_LAMBDA(size_t i, double& closest_distance, size_t& closest_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }, closest_distance, closest_index);\n return closest_index;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n const double pi = M_PI;\n size_t N = x.extent(0);\n size_t closest_idx = 0;\n double min_dist = std::abs(x(0) - pi);\n\n Kokkos::parallel_for(\n policy_t(0, N), KOKKOS_LAMBDA(const size_t& i) {\n double dist = std::abs(x(i) - pi);\n if (dist < min_dist) {\n closest_idx = i;\n min_dist = dist;\n }\n });\n\n return closest_idx;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::Functor;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n struct findToPiFunctor {\n Kokkos::View const x;\n double min_diff;\n int min_index;\n\n findToPiFunctor(Kokkos::View const x) : x(x), min_diff(std::numeric_limits::max()), min_index(-1) {}\n\n KOKKOS_INLINE_FUNCTION\n void operator()(const Kokkos::Work& work) const {\n const int i = work.league_rank();\n const double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n };\n\n findToPiFunctor f(x);\n\n Kokkos::parallel_for(RangePolicy<>(0, x.size()), f);\n\n return f.min_index;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View min_diff = Kokkos::View(1);\n Kokkos::View min_index = Kokkos::View(1);\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff(0) || (diff == min_diff(0) && x(i) > x(min_index(0)))) {\n min_diff(0) = diff;\n 
min_index(0) = i;\n }\n });\n\n return min_index(0);\n}\n", + "\n\n using namespace Kokkos;\n\n auto closest_idx = 0;\n auto min_diff = std::numeric_limits::max();\n auto pi = M_PI;\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& closest_idx) {\n auto diff = std::abs(x(i) - pi);\n if (diff < min_diff) {\n min_diff = diff;\n closest_idx = i;\n }\n }, min_diff, closest_idx);\n\n return closest_idx;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Policy = Kokkos::RangePolicy;\n\n auto min_distance = Kokkos::Max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(Policy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, double& min_distance_local) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < min_distance_local) {\n min_distance_local = distance;\n min_index = i;\n }\n }, min_distance);\n\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n using namespace std;\n\n const double pi = M_PI;\n const size_t n = x.extent(0);\n double min_diff = numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(const size_t& i, double& min_diff_local, size_t& min_idx_local) {\n double diff = abs(x(i) - pi);\n if (diff < min_diff_local) {\n min_diff_local = diff;\n min_idx_local = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n\n size_t N = x.extent(0);\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& min_idx) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n\n size_t N = x.extent(0);\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& min_idx) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x(0) - M_PI);\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < closest_diff) {\n closest_idx = i;\n closest_diff = diff;\n }\n });\n\n return closest_idx;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n auto min_distance = std::numeric_limits::max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(policy_t(0, x.size()), KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n }, min_distance, min_index);\n\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_distance = std::numeric_limits::max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n },\n min_distance, min_index);\n\n return min_index;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::value_type;\n\n size_t N = x.size();\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const 
size_t& i, value_type& min_diff_local, value_type& min_idx_local) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff_local) {\n min_diff_local = diff;\n min_idx_local = i;\n }\n },\n min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n size_t n = x.extent(0);\n size_t closest_index = 0;\n double closest_distance = std::abs(x(0) - M_PI);\n\n Kokkos::parallel_for(\n policy_type(0, n), KOKKOS_LAMBDA(const size_t& i) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n });\n\n return closest_index;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Min;\n\n auto min_dist = Reducer(x.data(), x.size());\n double min_val = min_dist.template reduce();\n\n auto dist_from_pi = Reducer(x.data(), x.size(),\n [min_val](double& a, const double& b) {\n a = std::min(a, std::abs(b - M_PI));\n });\n double closest_dist = dist_from_pi.template reduce();\n\n auto closest_idx = Reducer(x.data(), x.size(),\n [closest_dist, min_val](size_t& a, const double& b) {\n if (std::abs(b - M_PI) == closest_dist &&\n std::abs(b - M_PI) < std::abs(min_val - M_PI)) {\n a = a + 1;\n }\n });\n size_t idx = closest_idx.template reduce();\n\n return idx - 1;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_dist = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(size_t i, size_t& min_idx, double& min_dist) {\n double dist = std::abs(x(i) - M_PI);\n if (dist < min_dist) {\n min_dist = dist;\n min_idx = i;\n }\n }, min_idx, min_dist);\n\n return min_idx;\n}\n", + "\n\n auto closest = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(size_t i, size_t closest) {\n auto diff = std::abs(x(i) - M_PI);\n if (closest == -1 || diff < std::abs(x(closest) - M_PI)) {\n closest = i;\n }\n return closest;\n },\n -1);\n return closest;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use OpenMP to search in parallel.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n#pragma omp critical\n {\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00161691866815, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00211071465164, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00107918083668, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00067691616714, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.00055416487157, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00082969386131, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00101364031434, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00131623316556, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00175229813904, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00212574061006, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n #pragma omp parallel for reduction(min:min_diff) reduction(min:min_idx)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n\n return min_idx;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n size_t closest_index = 0;\n double closest_distance = std::numeric_limits::max();\n\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00167991165072, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00149578116834, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n#pragma omp critical\n {\n if (diff < closest_diff) {\n closest_idx = i;\n closest_diff = diff;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00163082517684, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00208470933139, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010376188904, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00055239554495, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054616183043, + 
"num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008577523753, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00095828119665, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00138835310936, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t closest_idx = 0;\n double closest_dist = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double dist = std::abs(x[i] - M_PI);\n if (dist < closest_dist) {\n #pragma omp critical\n {\n if (dist < closest_dist) {\n closest_dist = dist;\n closest_idx = i;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00164130479097, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020593976602, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00104763768613, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053307935596, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053277295083, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00084946267307, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00099415723234, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00136319771409, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t closest_index = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma for reduction(min:closest_diff)\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00164126101881, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00168526172638, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00167722441256, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00165870375931, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00165080465376, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00166410766542, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00164076145738, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00167572982609, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 
0.00164101421833, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00211026966572, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00107911489904, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00070438347757, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00056491903961, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00082876384258, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00099867600948, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00134643409401, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Find the index of the value in the vector x that is closest to the math constant PI\n // Use M_PI for the value of PI\n // Use OpenMP to search in parallel\n\n // Get the number of threads to use\n int num_threads = omp_get_num_threads();\n\n // Create a reduction variable to store the index of the closest value\n int closest_index = 0;\n\n // Create a reduction variable to store the smallest difference between a value in x and PI\n double smallest_difference = std::numeric_limits::max();\n\n // Parallel for loop to search for the closest value\n #pragma omp parallel for reduction(min : smallest_difference) reduction(min : closest_index)\n for (int i = 0; i < static_cast(x.size()); i++) {\n // Calculate the difference between the current value and PI\n double difference = std::abs(x[i] - M_PI);\n\n // If the difference is smaller than the smallest difference found so far, update the closest index and smallest difference\n if (difference < smallest_difference) {\n closest_index = i;\n smallest_difference = difference;\n }\n }\n\n // Return the index of the closest value\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n\n#pragma omp parallel for reduction(min : min_diff) reduction(min : min_index)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n\n return min_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n#pragma omp parallel for reduction(min : min_diff) reduction(min : min_idx)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { 
+ "generated_output": "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00162027943879, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00205038320273, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00106949303299, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00062719229609, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00057993140072, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00087082479149, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00105921775103, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00127488728613, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n #pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00164193976671, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00218419525772, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00112175699323, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00061187166721, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00054054111242, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00085914302617, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00105453506112, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00130803566426, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t closest_index = 0;\n double min_diff = std::abs(x[0] - M_PI);\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n min_diff = diff;\n closest_index = i;\n }\n }\n }\n }\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00163753163069, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00218359921128, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00108134876937, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, 
+ "runtime": 0.00057271011174, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00057115517557, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00080144032836, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00100679695606, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00132802333683, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00173201598227, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00213079862297, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n }\n }\n\n return min_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00164323039353, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0021393282339, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00105651486665, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006921665743, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00057292636484, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00082037393004, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00100186504424, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00135368984193, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00176220275462, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00214642584324, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < 
closest_diff) {\n #pragma omp critical\n {\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n }\n }\n return closest_idx;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00162896942347, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00206633079797, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00105003025383, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0005791541189, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00055026374757, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0008495086804, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00129994228482, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00130251292139, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00163407959044, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00216924268752, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00106424931437, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0006292425096, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00053277090192, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00083769410849, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00103011466563, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0012461649254, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n size_t closest_index = 0;\n double min_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n#pragma omp critical\n {\n if (diff < min_diff) {\n closest_index = i;\n min_diff = diff;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00163624547422, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00203669518232, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00105238035321, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00065753143281, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00056669488549, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00089627671987, + "num_threads": 16 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.00099808387458, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00141593515873, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "serial", + "prompt": "/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double minDiff = std::numeric_limits::max();\n size_t minDiffIndex = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n return minDiffIndex;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + ": 2", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_dist = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double dist = std::abs(x[i] - M_PI);\n if (dist < min_dist) {\n min_dist = dist;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n 
return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double closest = std::abs(x[0] - M_PI);\n size_t closest_index = 0;\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n return closest_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "hip", + "prompt": "/* Find the index of the value in the vector x that is closest to the math constant PI. Store the index in closestToPiIndex.\n Use M_PI for the value of PI.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\n__global__ void findClosestToPi(const double *x, size_t N, size_t *closestToPiIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n double minDiff = abs(x[tid] - M_PI);\n int minDiffIndex = tid;\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n double diff = abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n\n if (tid == 0) {\n *closestToPiIndex = minDiffIndex;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n double diff = fabs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < minDiff) {\n minDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "losest", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double diff = abs(x[gid] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = gid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n 
if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n double diff = abs(x[gid] - M_PI);\n if (gid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = gid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double diff = fabs(x[gid] - M_PI);\n if (gid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = gid;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n double closest = std::numeric_limits::max();\n size_t closest_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n\n // Reduce the closest values to rank 0\n MPI_Reduce(&closest_index, &closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return closest_index;\n } else {\n return 0;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n size_t closest_index = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_index = i;\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(size);\n MPI_Gather(&closest_index, 1, MPI_UNSIGNED_LONG, closest_indices.data(), 1, MPI_UNSIGNED_LONG, 0);\n\n // Find the overall closest index\n size_t overall_closest_index = 0;\n double overall_closest_diff = std::abs(x[closest_indices[0]] - M_PI);\n for (size_t i = 1; i < size; ++i) {\n double diff = std::abs(x[closest_indices[i]] - M_PI);\n if (diff < overall_closest_diff) {\n overall_closest_diff = diff;\n overall_closest_index = closest_indices[i];\n }\n }\n\n // Return the overall closest index\n return overall_closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n\n int global_min_diff_index;\n MPI_Allreduce(&min_diff_index, &global_min_diff_index, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n\n return global_min_diff_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - 
M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n size_t n = x.size();\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n for (size_t i = 1; i < n; ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t n = x.size();\n size_t closest = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < n; i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest = i;\n }\n }\n return closest;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n int result;\n MPI_Reduce(&min_index, &result, 1, MPI_INT, MPI_MINLOC, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Find the index of the value in the vector that is closest to PI on each rank\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n // Reduce the closest indices to find the global closest index\n size_t globalClosestIndex;\n MPI_Reduce(&closestIndex, &globalClosestIndex, 1, MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the global closest index on rank 0\n if (rank == 0) {\n return globalClosestIndex;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n return result;\n}\n", + "\n\n double closest = std::numeric_limits::max();\n size_t closest_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_idx = i;\n }\n }\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::numeric_limits::max();\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t closest = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest = i;\n }\n }\n return closest;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < 
x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n int global_min_idx;\n MPI_Allreduce(&min_idx, &global_min_idx, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return global_min_idx;\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n double closest = std::numeric_limits::max();\n size_t closest_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n\n // Reduce the closest values to rank 0\n MPI_Reduce(&closest_index, &closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return closest_index;\n } else {\n return 0;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "cuda", + "prompt": "/* Find the index of the value in the vector x that is closest to the math constant PI. Store the index in closestToPiIndex.\n Use M_PI for the value of PI.\n Use CUDA to search in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\n__global__ void findClosestToPi(const double *x, size_t N, size_t *closestToPiIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + 
"\n\n int tid = threadIdx.x;\n double minDiff = abs(x[tid] - M_PI);\n int minDiffIndex = tid;\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n double diff = abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n if (tid == 0) {\n *closestToPiIndex = minDiffIndex;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = fabs(x[i] - M_PI);\n if (i == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < minDiff) {\n minDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double diff = abs(x[idx] - M_PI);\n if (idx == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = idx;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestDistance = distance;\n closestIndex = i;\n }\n }\n\n int globalClosestIndex;\n MPI_Allreduce(&closestIndex, &globalClosestIndex, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return globalClosestIndex;\n}\n", + "\n\n // Find the closest value to PI in parallel\n size_t closest_index = 0;\n double min_diff = std::numeric_limits::max();\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest_index = i;\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Allgather(&closest_index, 1, MPI::UNSIGNED_LONG, &closest_indices[0], 1, MPI::UNSIGNED_LONG);\n\n // Find the global closest index\n size_t global_closest_index = 0;\n min_diff = std::numeric_limits::max();\n for (size_t i = 0; i < closest_indices.size(); ++i) {\n double diff = std::abs(x[closest_indices[i]] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n global_closest_index = closest_indices[i];\n }\n }\n\n return global_closest_index;\n}\n", + "\n\n // Find the closest value to PI on each rank\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_index = i;\n closest_distance = distance;\n }\n }\n\n // Reduce the closest index to rank 0\n size_t global_closest_index;\n MPI_Reduce(&closest_index, &global_closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the global closest index on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double min_diff = std::numeric_limits::max();\n int min_diff_index = -1;\n\n #pragma omp parallel for reduction(min : min_diff) reduction(min : min_diff_index)\n for (int i = 0; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n\n return min_diff_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_distance)\n for (size_t i = 0; i < x.size(); ++i) {\n double distance 
= std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n\n return result;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the index of the value in the vector x that is closest to the math constant PI.\n size_t index = 0;\n double min_diff = std::numeric_limits::max();\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n index = i;\n }\n }\n\n // Return the result on rank 0.\n if (rank == 0) {\n return index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n // Get the number of ranks\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Find the closest value to PI on this rank\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_index = i;\n closest_distance = distance;\n }\n }\n\n // Reduce the closest indices and distances to find the overall closest value\n size_t global_closest_index;\n double global_closest_distance;\n MPI_Reduce(&closest_index, &global_closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n MPI_Reduce(&closest_distance, &global_closest_distance, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_distance)\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double minDiff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < minDiff) {\n 
#pragma omp critical\n if (diff < minDiff) {\n closestIndex = i;\n minDiff = diff;\n }\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t closest_idx = 0;\n double min_diff = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Initialize the index of the closest value to -1\n size_t closest_index = -1;\n\n // Initialize the minimum distance to a large value\n double min_distance = std::numeric_limits::max();\n\n // Loop over the elements of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n // Calculate the distance between the current element and PI\n double distance = std::abs(x[i] - M_PI);\n\n // Update the closest index and minimum distance if the current distance is smaller\n #pragma omp critical\n {\n if (distance < min_distance) {\n closest_index = i;\n min_distance = distance;\n }\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(num_ranks);\n MPI_Gather(&closest_index, 1, MPI_UNSIGNED_LONG, closest_indices.data(), 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Find the index with the smallest distance\n if (rank == 0) {\n size_t global_closest_index = -1;\n double global_min_distance = std::numeric_limits::max();\n for (size_t i = 0; i < num_ranks; ++i) {\n if (closest_indices[i] != -1 && x[closest_indices[i]] < global_min_distance) {\n global_closest_index = closest_indices[i];\n global_min_distance = x[closest_indices[i]];\n }\n }\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t closest_idx = 0;\n double min_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n closest_idx = i;\n min_diff = diff;\n }\n }\n }\n }\n\n return closest_idx;\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(Kokkos::View*> const& x, Kokkos::View &r, Kokkos::View &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n // Compute the Fourier transform of x\n auto y = fft_transform(x);\n\n // Extract the real and imaginary parts of the result\n r = real(y);\n i = imag(y);\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& n) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double angle = 2 * M_PI * k * n / x.extent(0);\n real += x(k).real() * cos(angle) + x(k).imag() * sin(angle);\n imag += x(k).imag() * cos(angle) - x(k).real() * sin(angle);\n }\n r(n) = real;\n i(n) = imag;\n });\n}\n", + "\n\n // Compute the FFT of x\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2.0 * M_PI * j * i / x.extent(0);\n sum += x(j) * std::exp(-std::complex(0.0, angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n real += x(j).real() * cos(angle) - x(j).imag() * sin(angle);\n imag += x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA (const int& n) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double angle = 2 * M_PI * k * n / x.extent(0);\n real += x(k).real() * cos(angle) - x(k).imag() * sin(angle);\n imag += x(k).real() * sin(angle) + x(k).imag() * cos(angle);\n }\n r(n) = real;\n i(n) = imag;\n });\n}\n", + "\n\n // Compute the DFT of x\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& n) {\n double re = 0.0;\n double im = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double theta = 2 * M_PI * k * n / x.extent(0);\n re += x(k).real() * cos(theta) + x(k).imag() * sin(theta);\n im += x(k).real() * sin(theta) - x(k).imag() * cos(theta);\n }\n r(n) = re;\n i(n) = im;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double re = 0.0;\n double im = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n re += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n im += x(j).real() * 
sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = re;\n i(i) = im;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += -x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n\n const int n = x.extent(0);\n const int n_half = n / 2;\n\n // Create a plan for the FFT\n FFTPlan plan(n);\n\n // Create a view of the input and output arrays\n View x_real = create_view(x, 0);\n View x_imag = create_view(x, 1);\n View r_view = create_view(r);\n View i_view = create_view(i);\n\n // Execute the FFT\n plan.execute(x_real, x_imag, r_view, i_view);\n\n // Normalize the output\n r_view /= n;\n i_view /= n;\n}\n", + "\nKokkos::parallel_for(Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA (const int i) {\n r(i) = 0;\n i(i) = 0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n r(i) += x(j) * cos(angle);\n i(i) -= x(j) * sin(angle);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += -x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n\n // Create a plan for the FFT\n auto plan = FFTPlan(x.extent(0), true);\n\n // Create a workspace for the FFT\n auto workspace = plan.get_workspace();\n\n // Execute the FFT\n plan.execute(x, workspace);\n\n // Extract the real and imaginary parts of the results\n r = plan.get_output_real();\n i = plan.get_output_imag();\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n // Create a team policy for parallel execution\n TeamPolicy policy(x.extent(0));\n\n // Define the kernel lambda to compute the FFT\n KernelFunctor kernel;\n kernel.template run(policy, [=](const int& i) {\n 
std::complex sum = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sum += x(j) * std::exp(-std::complex(0, angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) - x[t].imag() * sin(angle);\n sumi += x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int N = x.size();\n r.resize(N);\n i.resize(N);\n\n #pragma omp parallel for\n for (int k = 0; k < N; ++k) {\n std::complex sum = 0;\n for (int n = 0; n < N; ++n) {\n double angle = 2 * M_PI * n * k / N;\n sum += x[n] * std::polar(1.0, -angle);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + 
"did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const size_t n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (size_t j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const size_t n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute the fourier transform of x.\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (size_t j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::exp(-std::complex(0, 2 * M_PI * k * t / n));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int num_threads = omp_get_max_threads();\n int chunk_size = n / num_threads;\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int start = tid * chunk_size;\n int end = start + chunk_size;\n if (tid == num_threads - 1) {\n end = n;\n }\n\n for (int j = start; j < end; j++) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < n; k++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[k].real() * cos(angle) + x[k].imag() * sin(angle);\n imag += -x[k].real() * sin(angle) + x[k].imag() * cos(angle);\n }\n r[j] = real;\n i[j] = imag;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + 
"generated_output": "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n std::complex omega = std::polar(1.0, -2 * M_PI * t * k / n);\n sum += x[t] * omega;\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, -angle);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumi += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute the DFT of x\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::exp(-2 * M_PI * 
std::complex(0, 1) * k * t / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::exp(std::complex(0, -angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + 
"runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; 
k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumi += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sumr += x[j].real() * cos(angle) + x[j].imag() * sin(angle);\n sumi += -x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int k = 0; k < n / 2; k++) {\n even[k] = x[2 * k];\n odd[k] = x[2 * k + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int k = 0; k < n / 2; k++) {\n std::complex t = w * odd[k];\n r[k] = r[k] + t.real();\n i[k] = i[k] + t.imag();\n r[k + n / 2] = r[k] - t.real();\n i[k + n / 2] = i[k] - t.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 0) {\n return;\n }\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2);\n std::vector> odd(n / 2);\n for (int k = 0; k < n / 2; k++) {\n even[k] = x[2 * k];\n odd[k] = x[2 * k + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double pi = std::acos(-1);\n for (int k = 0; k < n / 2; k++) {\n double t = r[k];\n r[k] = t + std::cos(2 * pi * k / n) * r[k + n / 2] - std::sin(2 * pi * k / n) * i[k + n / 2];\n i[k] = t - std::cos(2 * pi * k / n) * r[k + n / 2] + std::sin(2 
* pi * k / n) * i[k + n / 2];\n }\n}\n", + "\n\n int n = x.size();\n if (n == 0) {\n return;\n }\n r.resize(n);\n i.resize(n);\n for (int k = 0; k < n; k++) {\n double real = 0;\n double imag = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[j].real() * cos(angle) - x[j].imag() * sin(angle);\n imag += x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = real;\n i[k] = imag;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "fft, 4,", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double real = 0;\n double imag = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[j].real() * cos(angle) - x[j].imag() * sin(angle);\n imag += x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r.push_back(real);\n i.push_back(imag);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sumr += x[j].real() * cos(angle) + x[j].imag() * sin(angle);\n sumi += -x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n std::vector> X(n);\n for (int k = 0; k < n; ++k) {\n for (int j = 0; j < n; ++j) {\n X[k] += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n }\n for (int i = 0; i < n; ++i) {\n r[i] = X[i].real();\n i[i] = X[i].imag();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\n__global__ void fft(const hipDoubleComplex *x, double *r, double *i, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n hipDoubleComplex sum = {0, 0};\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n if (i < N) {\n hipDoubleComplex sum = hipCreateDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex rot = hipCreateDoubleComplex(cos(angle), sin(angle));\n sum = hipCadd(sum, hipCmul(x[j], rot));\n }\n r[i] = hipCreal(sum);\n i[i] = hipCimag(sum);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; k++) {\n double arg = -2 * M_PI * k * gid / N;\n hipDoubleComplex w = {cos(arg), sin(arg)};\n sum += x[k] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n}\n", + "fft:", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[i] = sum.x;\n i[i] = sum.y;\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n size_t N2 = N / 2;\n if (tid < N2) {\n r[tid] = x[tid].x + x[tid + N2].x;\n i[tid] = x[tid].y + x[tid + N2].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * gid / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[i] = sum.x;\n i[i] = sum.y;\n}\n", + "\n\n int id = blockIdx.x * blockDim.x + threadIdx.x;\n if (id < N) {\n r[id] = x[id].real();\n i[id] = x[id].imag();\n }\n __syncbarriers();\n for (size_t n = 2; n <= N; n *= 2) {\n double a = 2 * M_PI / n;\n for (size_t k = 0; k < N; k += n) {\n double c = cos(a * k);\n double s = sin(a * k);\n for (size_t j = 0; j < n / 2; j++) {\n size_t p = k + j;\n size_t q = p + n / 2;\n double t = r[q] * c - i[q] * s;\n i[q] = r[q] * s + i[q] * c;\n r[q] = r[p] - t;\n i[p] = i[p] - t;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * i / N;\n hipDoubleComplex w = hipMakeDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = 
hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n if (i < N) {\n r[i] = x[i].x;\n i[i] = x[i].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n r[gid] = x[gid].x;\n i[gid] = x[gid].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * gid * j / N});\n sum += x[j] * w;\n }\n r[gid] = hipCreal(sum);\n i[gid] = hipCimag(sum);\n }\n}\n", + "\n\n size_t idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx >= N) return;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * idx * k / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[k] * w;\n }\n r[idx] = sum.x;\n i[idx] = sum.y;\n}\n", + "\n\n size_t k = blockIdx.x * blockDim.x + threadIdx.x;\n if (k < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0, -2 * M_PI * j * k / N));\n sum += x[j] * w;\n }\n r[k] = hipCreal(sum);\n i[k] = hipCimag(sum);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx >= N) return;\n hipDoubleComplex sum = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * idx * k / N;\n hipDoubleComplex w = hipMakeDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[idx] = creal(sum);\n i[idx] = cimag(sum);\n}\n", + "\n\n // Compute the fourier transform of x.\n hipDoubleComplex *X = hipMalloc(N);\n hipMemcpy(X, x, N * sizeof(hipDoubleComplex), hipMemcpyHostToDevice);\n hipfftPlan plan;\n hipfftPlanCreate(&plan, N, HIPFFT_C2C, HIPFFT_FORWARD);\n hipfftExecC2C(plan, X, X, HIPFFT_FORWARD);\n hipfftDestroyPlan(plan);\n hipMemcpy(r, X, N * sizeof(double), hipMemcpyDeviceToHost);\n hipMemcpy(i, X + 1, N * sizeof(double), hipMemcpyDeviceToHost);\n hipFree(X);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n int stride = hipBlockDim_x * hipGridDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * k * gid / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[k] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n}\n", + "fft," + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y);\n\n // Gather the results on rank 0\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT of z on rank 0\n if (rank == 0) {\n fft_local(z, y);\n }\n\n // Scatter the results to all ranks\n MPI_Scatter(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the results\n for (int j = 0; j < n; j++) {\n r[j] = z[j].real();\n i[j] = z[j].imag();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n y[k] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Send the DFT of x to rank 0\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse DFT of z\n if (rank == 0) {\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n x[k] += z[j] * std::exp(2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n x[k] /= n;\n r[k] = x[k].real();\n i[k] = x[k].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the global result\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n 
i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector r_local(n), i_local(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y, r_local, i_local);\n\n // Gather the results on rank 0\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < n; i++) {\n r[i] = r_local[i];\n i[i] = i_local[i];\n }\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n), i_recv(n);\n MPI_Recv(r_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n r[j] += r_recv[j];\n i[j] += i_recv[j];\n }\n }\n } else {\n MPI_Send(r_local.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i_local.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_fft(n);\n fft_local(x, local_fft);\n\n // Gather local FFTs\n std::vector> global_fft(n * size);\n MPI_Gather(local_fft.data(), n, MPI_DOUBLE, global_fft.data(), n, MPI_DOUBLE, 0);\n\n // Compute global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter global FFT\n MPI_Scatter(global_fft.data(), n, MPI_DOUBLE, local_fft.data(), n, MPI_DOUBLE, 0);\n\n // Extract real and imaginary parts\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = local_fft[j].real();\n i[j] = local_fft[j].imag();\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n);\n std::vector> local_r(local_n), local_i(local_n);\n fft_recursive(local_x, local_r, local_i);\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].real();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n std::vector> Z(n);\n\n for (int j = 0; j < n; j++) {\n X[j] = x[j];\n }\n\n MPI_Alltoall(X.data(), 1, MPI_DOUBLE_INT, Y.data(), 1, MPI_DOUBLE_INT, MPI_COMM_WORLD);\n\n for (int j = 0; j < n; 
j++) {\n Z[j] = Y[j] * std::exp(-2.0 * M_PI * std::complex(0.0, 1.0) * j / n);\n }\n\n MPI_Alltoall(Z.data(), 1, MPI_DOUBLE_INT, Y.data(), 1, MPI_DOUBLE_INT, MPI_COMM_WORLD);\n\n for (int j = 0; j < n; j++) {\n std::complex sum = std::complex(0.0, 0.0);\n for (int k = 0; k < n; k++) {\n sum += Y[k] * std::exp(2.0 * M_PI * std::complex(0.0, 1.0) * j * k / n);\n }\n X[j] = sum;\n }\n\n for (int j = 0; j < n; j++) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n if (rank == 0) {\n for (int j = 1; j < size; j++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Status status;\n MPI_Request request;\n MPI_Irecv(r_recv.data(), n, MPI_DOUBLE, j, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n MPI_Irecv(i_recv.data(), n, MPI_DOUBLE, j, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_impl(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> r_all(x.size());\n std::vector> i_all(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_all.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_all.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r = std::vector(r_all.size());\n i = std::vector(i_all.size());\n for (size_t j = 0; j < r_all.size(); ++j) {\n r[j] = r_all[j].real();\n i[j] = r_all[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y);\n\n // Gather the results on rank 0\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < size; i++) {\n std::vector> y_i(n);\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &y_i[0], n, MPI_DOUBLE_COMPLEX, i, MPI_COMM_WORLD);\n for (int j = 0; j < n; j++) {\n r[j] += y_i[j].real();\n i[j] += y_i[j].imag();\n }\n }\n } else {\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); j++) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n if (n != (1 << (int)log2(n))) {\n throw std::invalid_argument(\"x must have a power of 2 length\");\n }\n\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = x[i];\n }\n\n for (int s = 1; s <= (int)log2(n); s++) {\n int m = 1 << s;\n int m2 = m / 2;\n double theta = 2 * M_PI / m;\n std::complex w(cos(theta), sin(theta));\n std::complex wn(1, 0);\n for (int j = 0; j < m2; j++) {\n for (int k = j; k < n; k += m) {\n std::complex t = wn * y[k + m2];\n y[k + m2] = y[k] - t;\n y[k] = y[k] + t;\n }\n wn = wn * w;\n }\n }\n\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n r[i] = y[i].real();\n i[i] = y[i].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_impl(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_r(n_local);\n std::vector> local_i(n_local);\n\n // Compute local FFT\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(n);\n std::vector> global_i(n);\n MPI_Gather(local_r.data(), n_local, MPI_DOUBLE, global_r.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n_local, MPI_DOUBLE, global_i.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n 
// Compute the local part of the transform\n std::vector> local_r(n), local_i(n);\n for (int j = 0; j < n; ++j) {\n std::complex sum_r(0, 0), sum_i(0, 0);\n for (int k = 0; k < n; ++k) {\n double angle = -2 * M_PI * j * k / n;\n sum_r += std::exp(std::complex(0, angle)) * x[k];\n sum_i += std::exp(std::complex(0, angle)) * x[k] * std::complex(0, 1);\n }\n local_r[j] = sum_r;\n local_i[j] = sum_i;\n }\n\n // Gather the local transforms into a global array\n std::vector> global_r(n), global_i(n);\n MPI_Allgather(local_r.data(), n, MPI_DOUBLE_COMPLEX, global_r.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n MPI_Allgather(local_i.data(), n, MPI_DOUBLE_COMPLEX, global_i.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the global transform\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * n / size, x.begin() + (rank + 1) * n / size);\n std::vector> local_r(n / size), local_i(n / size);\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(n), i_gathered(n);\n MPI_Gather(local_r.data(), n / size, MPI_DOUBLE, r_gathered.data(), n / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n / size, MPI_DOUBLE, i_gathered.data(), n / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(n);\n i = std::vector(n);\n for (int j = 0; j < n; j++) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n\n // Compute the local Fourier transform\n std::vector> local_x(n);\n std::vector> local_r(n);\n std::vector> local_i(n);\n for (int j = 0; j < n; ++j) {\n local_x[j] = x[j];\n }\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(n * size);\n std::vector> i_gathered(n * size);\n MPI_Gather(local_r.data(), n, MPI_DOUBLE_COMPLEX, r_gathered.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n, MPI_DOUBLE_COMPLEX, i_gathered.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n for (int j = 0; j < n; ++j) {\n r[j] = r_gathered[j].real();\n i[j] = i_gathered[j].real();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Allgather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), local_r.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n MPI_Allgather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), local_i.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].real();\n }\n }\n}\n" + ] + }, + { + 
"problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\n__global__ void fft(const cuDoubleComplex *x, double *r, double *i, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) {\n return;\n }\n double sum_r = 0.0;\n double sum_i = 0.0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * idx / N;\n sum_r += x[k].x * cos(angle) + x[k].y * sin(angle);\n sum_i += -x[k].x * sin(angle) + x[k].y * cos(angle);\n }\n r[idx] = sum_r;\n i[idx] = sum_i;\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t k = blockIdx.x * blockDim.x + threadIdx.x;\n if (k < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * k * j / N;\n cuDoubleComplex rot = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * rot;\n }\n r[k] = creal(sum);\n i[k] = cimag(sum);\n }\n}\n", + "\n\n // Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n // Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n // Example:\n\n // input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n // output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[gid] = creal(sum);\n i[gid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = block.thread.x + block.block.x * block.grid.x;\n if (tid >= N) {\n return;\n }\n double re = 0, im = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * tid * k / N;\n re += x[k] * cos(angle);\n im += x[k] * sin(angle);\n }\n r[tid] = re;\n i[tid] = im;\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i0 = tid + bid * blockDim.x;\n if (i0 < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i0 * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i0] = creal(sum);\n i[i0] = cimag(sum);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n int stride = blockDim.x * gridDim.x;\n for (int k = tid; k < N; k += stride) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * k / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(x[j], term));\n }\n r[k] = sum.x;\n i[k] = sum.y;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * k * tid / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx >= N) return;\n\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * idx * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[idx] = creal(sum);\n i[idx] = cimag(sum);\n}\n", + "\n\n size_t tid = 
threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = sum.x;\n i[tid] = sum.y;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n size_t offset = tid % N;\n double sumr = 0, sumi = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * offset / N;\n sumr += real(x[k]) * cos(angle) + imag(x[k]) * sin(angle);\n sumi += -real(x[k]) * sin(angle) + imag(x[k]) * cos(angle);\n }\n r[tid] = sumr;\n i[tid] = sumi;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid >= N) return;\n\n double re = 0, im = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n re += x[k].x * cos(angle) - x[k].y * sin(angle);\n im += x[k].x * sin(angle) + x[k].y * cos(angle);\n }\n r[tid] = re;\n i[tid] = im;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i0 = tid + bid * blockDim.x;\n if (i0 < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * i0 / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[i0] = creal(sum);\n i[i0] = cimag(sum);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> x_local(x.begin() + start, x.begin() + end);\n std::vector> y_local(n_local);\n std::vector> y(n);\n\n // Compute the DFT of the local data\n fft_local(x_local, y_local);\n\n // Gather the results from all ranks\n MPI_Allgather(y_local.data(), n_local, MPI_DOUBLE_COMPLEX, y.data(), n_local, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Compute the global DFT\n fft_global(y, n);\n\n // Extract the real and imaginary parts of the result\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = y[j].real();\n i[j] = y[j].imag();\n }\n\n // Send the result to rank 0\n if (rank != 0) {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n // Receive the results from all ranks\n for (int i = 1; i < size; i++) {\n MPI_Recv(r.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_fft(n);\n fftw_plan plan = fftw_plan_dft_1d(n, reinterpret_cast(&x[0]), reinterpret_cast(&local_fft[0]), FFTW_FORWARD, FFTW_ESTIMATE);\n fftw_execute(plan);\n fftw_destroy_plan(plan);\n\n // Gather local FFT results\n std::vector> global_fft(n * size);\n MPI_Gather(local_fft.data(), n, MPI_DOUBLE, global_fft.data(), n, MPI_DOUBLE, 0);\n\n // Compute global FFT\n fftw_plan plan2 = fftw_plan_dft_1d(n * size, reinterpret_cast(&global_fft[0]), reinterpret_cast(&global_fft[0]), FFTW_BACKWARD, FFTW_ESTIMATE);\n fftw_execute(plan2);\n fftw_destroy_plan(plan2);\n\n // Extract real and imaginary parts\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = real(global_fft[j] / n);\n i[j] = imag(global_fft[j] / n);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(local_n);\n std::vector> local_r(local_n);\n std::vector> local_i(local_n);\n std::copy(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n, local_x.begin());\n fft_serial(local_x, local_r, local_i);\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int n_local_padded = n_local * 2;\n\n std::vector> x_local(n_local);\n std::vector> x_local_padded(n_local_padded);\n std::vector> y_local(n_local_padded);\n std::vector> y(n);\n\n MPI_Scatter(x.data(), n_local, MPI_DOUBLE_COMPLEX, x_local.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Pad with zeros\n for (int i = 0; i < n_local; i++) {\n x_local_padded[i] = x_local[i];\n x_local_padded[i + n_local] = 0;\n }\n\n // Compute FFT in parallel\n fft_recursive(x_local_padded, y_local);\n\n // Gather results\n MPI_Gather(y_local.data(), n_local_padded, MPI_DOUBLE_COMPLEX, y.data(), n_local_padded, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < n; i++) {\n r[i] = y[i].real();\n i[i] = y[i].imag();\n }\n }\n}\n", + "\n\n // Compute the FFT of x.\n std::vector> X(x);\n fft_mpi(X);\n\n // Store the real and imaginary parts of the FFT in r and i.\n r.resize(X.size());\n i.resize(X.size());\n for (size_t j = 0; j < X.size(); ++j) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n // Gather the results on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector r_all(X.size() * MPI::COMM_WORLD.Get_size());\n std::vector i_all(X.size() * MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Gather(&r[0], X.size(), MPI::DOUBLE, &r_all[0], X.size(), MPI::DOUBLE, 0);\n MPI::COMM_WORLD.Gather(&i[0], X.size(), MPI::DOUBLE, &i_all[0], X.size(), MPI::DOUBLE, 0);\n r = r_all;\n i = i_all;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_all(x.size());\n std::vector> 
i_all(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_all.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_all.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); j++) {\n r[j] = r_all[j].real();\n i[j] = r_all[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n std::complex sum = std::complex(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n X[k] = sum;\n }\n\n // Compute the inverse DFT of X\n for (int k = 0; k < n; k++) {\n std::complex sum = std::complex(0, 0);\n for (int j = 0; j < n; j++) {\n sum += X[j] * std::exp(2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n Y[k] = sum / n;\n }\n\n // Store the real and imaginary parts of Y in r and i\n for (int k = 0; k < n; k++) {\n r[k] = Y[k].real();\n i[k] = Y[k].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Recv(&r_recv[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(&i_recv[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(&r[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(&i[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n y[k] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j] += y[k] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Store the real and imaginary parts of the DFT in r and i\n for (int j = 0; j < n; j++) {\n r[j] = z[j].real();\n i[j] = z[j].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_i(n);\n std::vector i_i(n);\n MPI_Recv(r_i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n r[j] += r_i[j];\n i[j] += i_i[j];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute local FFTs\n std::vector> local_r(n);\n std::vector> local_i(n);\n fft_local(x, local_r, local_i);\n\n // Gather local FFTs\n std::vector> global_r(n * size);\n std::vector> global_i(n * size);\n MPI_Gather(local_r.data(), n, MPI_DOUBLE_COMPLEX, global_r.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n, MPI_DOUBLE_COMPLEX, global_i.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute global FFT\n if (rank == 0) {\n 
fft_global(global_r, global_i);\n }\n\n // Scatter global FFT\n MPI_Scatter(global_r.data(), n, MPI_DOUBLE_COMPLEX, r.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Scatter(global_i.data(), n, MPI_DOUBLE_COMPLEX, i.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local partial sums\n std::vector> local_r(n, 0.0);\n std::vector> local_i(n, 0.0);\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n double angle = -2 * M_PI * j * k / n;\n local_r[k] += x[j] * std::cos(angle);\n local_i[k] += x[j] * std::sin(angle);\n }\n }\n\n // Reduce partial sums to rank 0\n std::vector> global_r(n, 0.0);\n std::vector> global_i(n, 0.0);\n MPI_Reduce(local_r.data(), n, MPI_DOUBLE, global_r.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Reduce(local_i.data(), n, MPI_DOUBLE, global_i.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts on rank 0\n if (rank == 0) {\n for (int k = 0; k < n; k++) {\n r[k] = global_r[k].real();\n i[k] = global_i[k].imag();\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n\n // Compute the FFT of x\n #pragma opm parallel for\n for (int j = 0; j < n; j++) {\n X[j] = x[j];\n }\n fft(X);\n\n // Compute the real and imaginary parts of the FFT\n #pragma opm parallel for\n for (int j = 0; j < n; j++) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int j = 1; j < size; j++) {\n std::vector r_j(n);\n std::vector i_j(n);\n MPI_Gather(&r[0], n, MPI_DOUBLE, &r_j[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&i[0], n, MPI_DOUBLE, &i_j[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (int k = 0; k < n; k++) {\n r[k] += r_j[k];\n i[k] += i_j[k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Allgather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n 
MPI_Allgather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_r(n_local);\n std::vector> local_i(n_local);\n\n // Compute the local FFT\n fft_local(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> r_gathered(n);\n std::vector> i_gathered(n);\n MPI_Gather(local_r.data(), n_local, MPI_DOUBLE, r_gathered.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n_local, MPI_DOUBLE, i_gathered.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r = std::vector(n);\n i = std::vector(n);\n for (int j = 0; j < n; j++) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(local_n);\n std::vector> local_r(local_n);\n std::vector> local_i(local_n);\n\n std::copy(x.begin() + local_start, x.begin() + local_end, local_x.begin());\n\n #pragma omp parallel for\n for (int j = 0; j < local_n; j++) {\n double sum_r = 0.0;\n double sum_i = 0.0;\n for (int k = 0; k < n; k++) {\n double theta = 2 * M_PI * k / n;\n sum_r += x[k] * cos(theta * j);\n sum_i += x[k] * sin(theta * j);\n }\n local_r[j] = sum_r;\n local_i[j] = sum_i;\n }\n\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> 
i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n y[k] = sum;\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j * n + k] = y[k * n + j];\n }\n }\n\n // Compute the DFT of z\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += z[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n y[k] = sum;\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j * n + k] = y[k * n + j];\n }\n }\n\n // Store the real and imaginary parts of the DFT in r and i\n for (int k = 0; k < n; k++) {\n r[k] = z[k].real();\n i[k] = z[k].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Recv(r_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const size_t n = x.extent(0);\n const size_t n_half = n / 2;\n\n // Compute the FFT of the first half of x.\n auto x_half = x.slice(0, n_half);\n fft(x_half);\n\n // Compute the FFT of the second half of x.\n auto x_half_plus = x.slice(n_half, n_half);\n fft(x_half_plus);\n\n // Compute the imaginary conjugate of the second half of x.\n for (size_t i = 0; i < n_half; ++i) {\n x_half_plus(i) = conj(x_half_plus(i));\n }\n\n // Swap the first and second halves of x.\n auto temp = x_half;\n x_half = x_half_plus;\n x_half_plus = temp;\n\n // Compute the inverse FFT of x.\n fft(x, Inverse);\n\n // Normalize the FFT by dividing by n.\n for (size_t i = 0; i < n; ++i) {\n x(i) /= n;\n }\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n 
Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n const size_t n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n std::vector> w(n);\n std::vector> u(n);\n std::vector> t(n);\n\n#pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n y[i] = x[i];\n }\n\n for (size_t i = 0; i < n; i++) {\n for (size_t j = 0; j < n; j++) {\n w[j] = std::exp(std::complex(0, -2 * M_PI * i * j / n));\n }\n for (size_t j = 0; j < n; j++) {\n z[j] = y[j] * w[j];\n }\n for (size_t j = 0; j < n; j++) {\n u[j] = z[j];\n }\n for (size_t j = 0; j < n / 2; j++) {\n t[j] = u[j] + std::conj(u[n - j - 1]);\n t[n - j - 1] = std::complex(0, 1) * (u[j] - std::conj(u[n - j - 1]));\n }\n for (size_t j = 0; j < n; j++) {\n y[j] = t[j];\n }\n }\n\n#pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n x[i] = y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Compute the FFT of x\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Compute the fourier transform of x in-place.\n fft(x);\n\n // Compute the imaginary conjugate of each value.\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n int size = 1 << log_n;\n std::vector> y(size);\n for (int i = 0; i < n; i++) {\n y[i] = x[i];\n }\n for (int i = n; i < size; i++) {\n y[i] = 0;\n }\n std::vector> roots(size / 2);\n for (int i = 0; i < size / 2; i++) {\n double angle = 2 * M_PI * i / size;\n roots[i] = std::complex(cos(angle), sin(angle));\n }\n for (int i = 0; i < log_n; i++) {\n int step = 1 << i;\n int half_step = step / 2;\n for (int j = 0; j < size; j += step) {\n for (int k = 0; k < half_step; k++) {\n std::complex u = y[j + k];\n std::complex v = y[j + k + half_step] * roots[size / (2 * step) * k];\n y[j + k] = u + v;\n y[j + k + half_step] = u - v;\n }\n }\n }\n for 
(int i = 0; i < size; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int n = x.size();\n const int numThreads = omp_get_max_threads();\n const int chunkSize = n / numThreads;\n\n #pragma omp parallel\n {\n #pragma omp for schedule(static, chunkSize)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int n = x.size();\n const int lg = __lg(n);\n int i0 = 0;\n for (int l = 2; l <= n; l <<= 1) {\n for (int i = 0; i < n; i += l) {\n for (int j = 0; j < l / 2; j++) {\n std::complex u = x[i0 + j];\n std::complex v = x[i0 + j + l / 2];\n x[i0 + j] = u + v;\n x[i0 + j + l / 2] = std::complex(u.real() - v.real(), -v.imag());\n }\n i0 += l;\n }\n i0 = 0;\n }\n for (int i = 0; i < n; i++) {\n int j = __builtin_reverse(i, lg);\n if (i < j) {\n std::swap(x[i], x[j]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n fft(y);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Compute the Fourier transform of x in-place.\n fft(x);\n\n // Compute the imaginary conjugate of each value.\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x\n fft(x, y);\n\n // Compute the conjugate of each value in y\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n\n // Copy the conjugate values back into x\n x = z;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int n = x.size();\n if (n == 1) return;\n std::vector> even(n / 2), odd(n / 2);\n#pragma omp parallel for\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * 
M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int n_threads = omp_get_max_threads();\n const int n_threads_per_group = 4;\n const int n_groups = n_threads / n_threads_per_group;\n\n std::vector> y(n);\n std::vector> z(n);\n\n #pragma omp parallel num_threads(n_threads)\n {\n const int tid = omp_get_thread_num();\n const int group_id = tid / n_threads_per_group;\n const int local_id = tid % n_threads_per_group;\n\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n y[rev] = x[i];\n }\n\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const double omega = 2 * M_PI / m;\n const std::complex w(cos(omega), sin(omega));\n\n #pragma omp for schedule(static)\n for (int k = 0; k < n; k += m) {\n std::complex wn(1, 0);\n for (int j = 0; j < m / 2; j++) {\n const std::complex t1 = y[k + j];\n const std::complex t2 = wn * y[k + j + m / 2];\n y[k + j] = t1 + t2;\n y[k + j + m / 2] = t1 - t2;\n wn *= w;\n }\n }\n }\n\n #pragma omp for schedule(static)\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n }\n\n x = z;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int n = x.size();\n const int n_half = n / 2;\n const double pi = 3.14159265358979323846;\n\n // Reverse-bit permutation\n for (int i = 0; i < n; i++) {\n int j = 0;\n for (int k = 0; k < n_half; k++) {\n if (i & (1 << k)) {\n j |= (1 << (n_half - k - 1));\n }\n }\n if (i < j) {\n std::swap(x[i], x[j]);\n }\n }\n\n // Compute the FFT in-place\n for (int s = 1; s <= n_half; s *= 2) {\n const double theta = pi / s;\n const std::complex wn(std::cos(theta), std::sin(theta));\n for (int i = 0; i < n; i += 2 * s) {\n std::complex w(1, 0);\n for (int j = 0; j < s; j++) {\n const std::complex u = x[i + j];\n const std::complex v = x[i + j + s] * w;\n x[i + j] = u + v;\n x[i + j + s] = u - v;\n w *= wn;\n }\n }\n }\n\n // Conjugate the imaginary part of each value\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x\n fft(x, y);\n\n // Compute the imaginary conjugate of each value in y\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n\n // Copy the values of z back into x\n x = 
z;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int n = x.size();\n const int num_threads = omp_get_max_threads();\n const int chunk_size = n / num_threads;\n\n #pragma omp parallel for schedule(static, chunk_size)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int nthreads = omp_get_max_threads();\n const int chunksize = n / nthreads;\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n if (i < rev) {\n std::swap(x[i], x[rev]);\n }\n }\n\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const double wn = std::polar(1.0, -2.0 * M_PI / m);\n const double w = 1.0;\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int j = 0; j < n; j += m) {\n double wr = w;\n double wi = 0.0;\n for (int k = 0; k < m / 2; k++) {\n const std::complex t = x[j + k + m / 2] * std::complex(wr, wi);\n x[j + k + m / 2] = x[j + k] - t;\n x[j + k] += t;\n const double newwr = wr * w.real() - wi * w.imag();\n const double newwi = wr * w.imag() + wi * w.real();\n wr = newwr;\n wi = newwi;\n }\n }\n }\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector> y(n);\n std::vector> w(n);\n for (int i = 0; i < n; i++) {\n w[i] = std::polar(1.0, -2 * M_PI * i / n);\n }\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * w[j * k % n];\n }\n y[k] = sum;\n }\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Compute the FFT of x\n fft(x);\n\n // Take the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int n_threads = omp_get_max_threads();\n\n // Reverse bits\n #pragma omp parallel for 
schedule(static)\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n if (i < rev) {\n std::swap(x[i], x[rev]);\n }\n }\n\n // Compute the FFT\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const int m2 = m >> 1;\n const double theta = 2 * M_PI / m;\n std::complex w(1, 0);\n std::complex wn(std::cos(theta), std::sin(theta));\n\n #pragma omp parallel for schedule(static)\n for (int k = 0; k < n; k += m) {\n for (int j = 0; j < m2; j++) {\n std::complex t = w * x[k + j + m2];\n x[k + j + m2] = x[k + j] - t;\n x[k + j] += t;\n }\n w *= wn;\n }\n }\n\n // Conjugate the imaginary part\n #pragma omp parallel for schedule(static)\n for (int i = 0; i < n; i++) {\n x[i].imag(-x[i].imag());\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Compute the FFT in-place\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int n = x.size();\n const int num_threads = omp_get_max_threads();\n const int chunk_size = n / num_threads;\n\n #pragma omp parallel for schedule(static, chunk_size)\n for (int i = 0; i < n; ++i) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x in-place. 
Return the imaginary conjugate of each value.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) return;\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::polar(1.0, ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n x[i][1] = -x[i][1];\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex tmp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(tmp);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n return;\n }\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n fft(y);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) return;\n std::vector> even = x;\n std::vector> odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const size_t n = x.size();\n if (n <= 1) return;\n\n std::vector> even = x[std::slice(0, n / 2, 2)];\n std::vector> odd = x[std::slice(1, n / 2, 2)];\n\n fftConjugate(even);\n fftConjugate(odd);\n\n for (size_t i = 0; i < n / 2; 
++i) {\n std::complex t = std::exp(std::complex(0, -2 * M_PI * i / n)) * odd[i];\n x[i] = even[i] + t;\n x[i + n / 2] = even[i] - t;\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n for (auto &v : x) {\n v = std::conj(v);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fftConjugate(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[N - 1 - i].x;\n x[i].y = -x[N - 1 - i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = hipThreadIdx.x;\n if (i < size) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[N - 1 - i].x;\n x[i].y = -x[N - 1 - i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * 
hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); ++i) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector> y_recv(x.size());\n MPI_Recv(y_recv.data(), y_recv.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < y_recv.size(); ++j) {\n y[j] += y_recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT of x in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int log2n = 0;\n while ((1 << log2n) < n) log2n++;\n\n std::vector> y(n);\n for (int i = 0; i < n; i++) y[i] = x[i];\n\n for (int s = 1; s <= log2n; s++) {\n int m = 1 << s;\n int m2 = m / 2;\n double theta = 2 * M_PI / m;\n std::complex w(cos(theta), sin(theta));\n std::complex wn(1, 0);\n for (int j = 0; j < m2; j++) {\n for (int k = j; k < n; k += m) {\n std::complex t = wn * y[k + m2];\n y[k + m2] = y[k] - t;\n y[k] = y[k] + t;\n }\n wn = wn * w;\n }\n }\n\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n }\n}\n", + ".", + "\n\n size_t n = x.size();\n std::transform(x.begin(), x.end(), x.begin(), [](std::compelex z) { return std::compelex(z.re, -z.im); });\n 
std::sort(x.begin(), x.end(), [](std::compelex a, std::compelex b) { return std::im(a) < std::im(b); });\n std::transform(x.begin(), x.end(), x.begin(), [](std::compelex z) { return std::compelex(z.re, -z.im); });\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the global FFT\n if (rank == 0) {\n fft(global_fft, x);\n }\n\n // Scatter the global FFT\n MPI_Scatter(x.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the conjugate of the local FFT\n for (size_t i = 0; i < local_fft.size(); i++) {\n local_fft[i] = std::conj(local_fft[i]);\n }\n\n // Gather the conjugate FFTs\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the inverse global FFT\n if (rank == 0) {\n ifft(global_fft, x);\n }\n\n // Scatter the inverse global FFT\n MPI_Scatter(x.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_fft = fft(x);\n\n // Gather the local Fourier transforms on rank 0\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (auto &value : global_fft) {\n value = std::conj(value);\n }\n }\n\n // Scatter the imaginary conjugate Fourier transforms to all ranks\n MPI_Scatter(global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse Fourier transform\n x = ifft(local_fft);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv(x.size());\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n y[j] += recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of the result\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results from all ranks\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, 
MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place on each rank\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x, y);\n\n // Conjugate the local FFT\n for (int i = 0; i < n_local; i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the local FFTs\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the result to the input vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place on each rank\n std::fft(x.begin(), x.end());\n\n // Conjugate the imaginary part of each value\n for (auto &v : x) {\n v.imag() = -v.imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, y.data(), x.size(), MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n x = std::move(y);\n } else {\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, nullptr, 0, MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft(x);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (auto &value : global_fft) {\n value = std::conj(value);\n }\n }\n\n // Scatter the global FFT back to each rank\n MPI_Scatter(global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the local FFT on rank 0\n if (rank == 0) {\n x = local_fft;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> y_recv(x.size());\n MPI_Recv(y_recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < y.size(); j++) {\n y[j] += y_recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, 
size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n MPI_FFT(x.data(), 1, x.size(), MPI_COMPLEX, MPI_FORWARD, MPI_COMM_WORLD, y.data(), &y.size());\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n x = y;\n for (int i = 1; i < size; i++) {\n std::vector> tmp(x.size());\n MPI_Recv(tmp.data(), tmp.size(), MPI_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(y.data(), y.size(), MPI_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv(x.size());\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n y[j] += recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT of x in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of the local data\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the local data to rank 0\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the gathered data to the output vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft(x);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (int i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the global FFT back to each rank\n MPI_Scatter(global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), 
MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = local_fft;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fftConjugate(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[N - i - 1].x;\n x[i].y = -x[N - i - 1].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n // Compute the FFT\n fft(x, N);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < N; i++) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "fft{4,0}", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid].x = x[tid].x;\n x[tid].y = -x[tid].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = cuConjf(x[i]);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[N - i - 1].x;\n x[i].y = -x[N - i - 1].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = make_cuDoubleComplex(creal(x[i]), -cimag(x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use MPI and OpenMP to compute in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data in parallel\n fft(x, start, end);\n\n // Conjugate the imaginary part of the local data\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i].imag() = -x[i].imag();\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[start], n_local, MPI_DOUBLE_COMPLEX, &x[0], n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + ".4.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x.begin() + rank * n_local, x.begin() + rank * n_local + n_local, y.begin());\n\n // Conjugate the local FFT\n for (int i = 0; i < n_local; i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the local FFTs\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the result to rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel using MPI and OpenMP\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag() = -x[i].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform local FFT on each rank\n std::vector> localX = x;\n fft(localX);\n\n // Gather all the local FFT results on rank 0\n std::vector> globalX(x.size());\n MPI_Gather(localX.data(), localX.size(), MPI_DOUBLE_COMPLEX, globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value on rank 0\n if (rank == 0) {\n for (auto &value : globalX) {\n value = std::conj(value);\n }\n }\n\n // Scatter the imaginary conjugate results back to each rank\n 
MPI_Scatter(globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform local FFT on each rank\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather all the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (int i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the imaginary conjugate FFTs back to each rank\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Perform inverse FFT on each rank\n ifft(local_fft, x);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of the global FFT\n if (rank == 0) {\n for (size_t i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the imaginary conjugate of the global FFT to all ranks\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT\n ifft(local_fft, x);\n\n // Gather the local FFTs on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n 
// Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results on rank 0\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the results to the input vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n MPI_Gather(y.data(), n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> localX = x;\n fft(localX);\n\n // Conjugate the local FFT\n for (auto &v : localX) {\n v = std::conj(v);\n }\n\n // Gather the local FFTs on rank 0\n std::vector> globalX(x.size());\n MPI_Gather(localX.data(), localX.size(), MPI_DOUBLE_COMPLEX, globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, store the global FFT in x\n if (rank == 0) {\n x = globalX;\n }\n}\n", + "\n\n // Compute the FFT in parallel using MPI and OpenMP\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(&recv, x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x, x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int 
i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT on each rank\n std::vector> y(x.size());\n fft(x, y);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < y.size(); i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n x.resize(y.size());\n MPI_Gather(y.data(), y.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the conjugate of the global FFT\n if (rank == 0) {\n for (size_t i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the conjugate global FFT\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT\n ifft(local_fft, x);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary parts\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(Kokkos::View*> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x[i] = x[i] / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x[i] = std::conj(x[i]);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / static_cast(x.extent(0));\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n // compute fourier transform\n fft(x);\n\n // compute inverse fourier transform\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", 
x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use OpenMP to compute in parallel.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n const int n = x.size();\n fft(x);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugated input\n fft(x);\n\n // compute the conjugate of the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02104937471449, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02108299750835, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02092218920588, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02085626348853, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02086520623416, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02076787091792, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02094234712422, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02096000220627, + "num_threads": 64 + } + ] + }, + { + "generated_output": "-", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated input\n fft(x);\n\n // Conjugate the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Divide the output by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02098687700927, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02107850089669, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0208678169176, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02096649669111, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02075089886785, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02076247446239, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02091193124652, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02092214953154, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // conjugate the complex numbers\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute forward FFT\n fft(x);\n\n // conjugate the complex numbers again\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide by the size of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02090995199978, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02095345556736, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02088256459683, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02103734146804, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02092637699097, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0208366619423, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02073216121644, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02098268717527, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n fft(x);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma openmp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n fft(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int n = x.size();\n fft(x);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": 
"\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // divide by n to normalize\n const double n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] /= n;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // compute the forward FFT\n fft(x);\n\n // compute the inverse FFT\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute the forward FFT again to get the inverse FFT\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // compute the conjugate of each element\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugated vector\n fft(x);\n\n // divide each element by the size of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // compute the conjugate of each element\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02091735582799, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02115769051015, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02069755438715, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02089214064181, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02085150051862, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02092849295586, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0210419267416, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02112306747586, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + 
"are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const size_t n = x.size();\n std::vector> y(n);\n std::copy(x.begin(), x.end(), y.begin());\n fft(y);\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n y[i] = std::conj(y[i]) / n;\n }\n std::reverse(y.begin(), y.end());\n fft(y);\n std::copy(y.begin(), y.end(), x.begin());\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n fft(x);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // compute forward fft\n fft(x);\n\n // compute inverse fft\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute forward fft again to get inverse fft\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // compute the conjugate of x\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugate of x\n fft(x);\n\n // compute the conjugate of x again\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide each element of x by the size of x\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02076137457043, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02106391601264, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02095362450927, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02098164707422, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02094263210893, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02054869644344, + "num_threads": 16 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.02079036049545, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02102394495159, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n fft(x);\n const double n = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the forward FFT\n fft(x);\n\n // Divide each element by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Compute the conjugate of the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02095137014985, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02089988123626, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02107565812767, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02093792688102, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02096001841128, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02091650217772, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02102164644748, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02119192238897, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "serial", + "prompt": "/* forward declare fft. 
computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n for (auto &xi : x) {\n xi /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v) / x.size();\n }\n std::reverse(x.begin(), x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi) / x.size();\n }\n}\n", + "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v = std::conj(v) / x.size();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "hip", + "prompt": "/* Compute the inverse fourier transform of x in-place.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\n__global__ void ifft(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i >= N) return;\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipDoubleComplex(cos(2 * M_PI * i * j / N),\n -sin(2 * M_PI * i * j / N));\n sum += w * x[j];\n }\n x[i] = sum / N;\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n hipDoubleComplex c = {cos(angle), sin(angle)};\n sum = hipCadd(sum, hipCmul(x[j], c));\n }\n x[i] = hipCdiv(sum, N);\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (int j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0.0, 0.0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex c = {cos(angle), sin(angle)};\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i >= N) return;\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex t = {cos(angle), sin(angle)};\n sum += x[j] * t;\n }\n x[i] = sum / N;\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n hipDoubleComplex w = hipDoubleComplex(cos(2 * M_PI * i * j / N),\n sin(2 * M_PI * i * j / N));\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0.0, 0.0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0.0, -2.0 * M_PI * i * j / N));\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 
* M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex t = {cos(angle), sin(angle)};\n sum = hipCadd(sum, hipCmul(x[j], t));\n }\n x[i] = hipCdiv(sum, N);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex term = hipExp(hipCreateDoubleComplex(0, -2 * M_PI * i * j / N));\n sum = hipAdd(sum, hipMul(x[j], term));\n }\n x[i] = hipDiv(sum, hipCreateDoubleComplex(N, 0));\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0, -2 * M_PI * i * j / N));\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex c = hipDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n size_t j = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (i < N && j < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double theta = 2 * M_PI * (i * k / N);\n sum.x += x[k].x * cos(theta) + x[k].y * sin(theta);\n sum.y += -x[k].x * sin(theta) + x[k].y * cos(theta);\n }\n sum.x /= N;\n sum.y /= N;\n x[i * N + j] = sum;\n }\n}\n", + "\n\n size_t i = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(recv_buffer.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&recv_buf[0], x.size(), 
MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, y.data(), x.size(),\n MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, nullptr, 0,\n MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the FFT\n fft(x);\n\n // compute the inverse FFT\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n\n // normalize the result\n int n = x.size();\n for (auto &xi : x) {\n xi /= n;\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(n);\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, y.data(), n, MPI::DOUBLE, 0);\n x = y;\n } else {\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, y.data(), x.size(),\n MPI::DOUBLE, 0);\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n 
std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= num_ranks;\n }\n\n // gather the results on rank 0\n if (my_rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n\n // normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> result(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, result.data(),\n x.size(), MPI_DOUBLE_COMPLEX, 0);\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&recv_buffer[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&x[0], 
x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute the inverse fft in parallel\n fft(x);\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fourier transform\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fourier transform\n fft(x);\n\n // normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(n);\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, y.data(), n, MPI::DOUBLE, 0);\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "cuda", + "prompt": "/* Compute the inverse fourier transform of x in-place.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\n__global__ void ifft(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n sum += x[j] * cuCexp(make_cuDoubleComplex(0, -2 * M_PI * i * j / N));\n }\n x[i] = sum / make_cuDoubleComplex(N, 0);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = cuCdiv(x[i], N);\n x[i] = cuCmulf(x[i], cuCexp(cuCmulf(cuCmul(cuCimagf(x[i]), -2 * M_PI), i / N)));\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= size) {\n return;\n }\n cuDoubleComplex result = cuCexp(cuCmul(cuDoubleComplex(0, -2 * M_PI * idx / size), x[idx]));\n x[idx] = result;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n cuDoubleComplex term = x[j] * cexp(-2 * M_PI * i * j / N);\n sum = cuCadd(sum, term);\n }\n x[i] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= N) return;\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double re = x[i].x;\n double im = x[i].y;\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n x[i].x += re * cos(angle) - im * sin(angle);\n x[i].y += re * 
sin(angle) + im * cos(angle);\n }\n x[i].x /= N;\n x[i].y /= N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = x[i] / N;\n x[i] = cuCmul(x[i], make_cuDoubleComplex(1.0, 0.0));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * term;\n }\n x[i] = sum / N;\n }\n}\n", + "_t_ = 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= N) return;\n cuDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = {cos(angle), sin(angle)};\n sum += c * x[j];\n }\n x[i] = sum / N;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform FFT on each rank\n fft(x);\n\n // Gather results from all ranks to rank 0\n std::vector> gathered_x;\n if (rank == 0) {\n gathered_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n // Compute the conjugate of x\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated x\n fft(x);\n\n // Normalize the result\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double factor = 1.0 / size;\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= factor;\n }\n\n // If rank 0, gather the results from all ranks\n if (rank == 0) {\n std::vector> recv_buffer(x.size() * size);\n MPI_Allgather(x.data(), x.size(), MPI_COMPLEX, recv_buffer.data(), x.size(), MPI_COMPLEX, MPI_COMM_WORLD);\n x = recv_buffer;\n }\n\n // Compute the inverse FFT of the normalized result\n fft(x);\n\n // Conjugate the result again to get the original values\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // compute the fourier transform\n fft(x);\n\n // compute the inverse fourier transform\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, y.data(), x.size(), MPI::DOUBLE, 0);\n x = y;\n } else {\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide the result among the ranks\n std::vector> recv(x.size() / size);\n MPI_Allgather(&x[rank * x.size() / size], x.size() / size, MPI_COMPLEX, &recv[0], x.size() / size, MPI_COMPLEX, MPI_COMM_WORLD);\n\n // only rank 0 has the complete result\n if (rank == 0) {\n x = recv;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide by the number of ranks to normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buffer[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the inverse fourier transform of x in-place.\n // Use MPI and OpenMP to compute in parallel.\n // Assume MPI has already been 
initialized.\n // Every rank has a complete copy of x.\n // The final result is stored on rank 0.\n\n // Get the number of ranks and the rank of the current process.\n int num_procs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local size of the vector.\n int local_size = x.size() / num_procs;\n\n // Compute the local inverse fourier transform.\n fft(x);\n\n // Reduce the local inverse fourier transforms to the root process.\n std::vector> reduced_x(x.size());\n MPI_Reduce(x.data(), reduced_x.data(), x.size(), MPI_COMPLEX, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // If the current process is the root process, store the reduced inverse fourier transform in the output vector.\n if (rank == 0) {\n x = reduced_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform\n fft(x);\n\n // divide by the number of ranks to normalize\n for (auto &xi : x) {\n xi /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // normalize the result\n std::transform(x.begin(), x.end(), x.begin(), [size](std::complex c) { return c / size; });\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the 
fft in-place\n fft(x);\n\n // compute the inverse fft in-place\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide the result by the number of ranks\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results from all ranks to rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += y[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n // Compute the conjugate of x\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated x\n fft(x);\n\n // Normalize the result\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double factor = 1.0 / size;\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= factor;\n }\n\n // If rank 0, gather the results from all ranks\n if (rank == 0) {\n std::vector> recv_buffer(x.size() * size);\n MPI_Allgather(x.data(), x.size(), MPI_COMPLEX, recv_buffer.data(), x.size(), MPI_COMPLEX, MPI_COMM_WORLD);\n x = recv_buffer;\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv(x.size() * size);\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, recv.data(), x.size(), MPI_DOUBLE_COMPLEX, 0);\n x = recv;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft of x\n fft(x);\n\n // compute the inverse fft of x\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) 
{\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT of the conjugated result\n fft(x);\n\n // Normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv(n);\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n MPI::COMM_WORLD.Recv(&recv[0], n, MPI::DOUBLE_COMPLEX, i, 0);\n for (int j = 0; j < n; ++j) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], n, MPI::DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the inverse Fourier transform in-place\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fourier transform\n fft(x);\n\n // compute the inverse fourier transform\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size() * size);\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = recv_buf;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(Kokkos::View &x, Kokkos::View*> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * std::exp(Kokkos::complex(0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * Kokkos::Math::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(std::cos(angle), std::sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const double PI = 3.14159265358979323846;\n\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n sum += x(j) * std::exp(-2 * PI * std::complex(0, 1) * i * j / N);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = -2 * Kokkos::Math::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(std::cos(angle), std::sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const double PI = 3.14159265358979323846;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * PI * j * i / N;\n sum += x[j] * Kokkos::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::Range(0, N), [=](const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * j * i / N;\n sum += x(j) * std::exp(std::complex(0.0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n const int n = x.size();\n Kokkos::parallel_for(\"DFT\", Kokkos::RangePolicy<>(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 
0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < N; j++) {\n sum += x(j) * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / N);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n const int N = x.extent(0);\n Kokkos::parallel_for(\"DFT\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const auto n = x.size();\n const auto pi = 3.14159265358979323846;\n Kok\u30b3\u30af::Range range(0, n);\n Kok\u30b3\u30af::ParallelFor(\"dft\", range, KOKKOK_LAMBDA(const int i) {\n Kok\u30b3\u30af::\u8907 sum(0.0, 0.0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2.0 * pi * i * j / n);\n }\n output[i] = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n const double pi = 3.14159265358979323846;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x(j) * Kokkos::complex(cos(-2 * pi * i * j / n), sin(-2 * pi * i * j / n));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * j * i / N;\n sum += x(j) * std::polar(1.0, angle);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "(121:1:1:1:1:1:1:", + "\n\n const int n = x.extent(0);\n const double two_pi = 2.0 * Kokkos::Pi;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < n; ++j) {\n sum += x(j) * std::exp(-two_pi * Kokkos::complex(0.0, 1.0) * i * j / n);\n }\n output(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. 
Store the result in output.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n const int n = x.size();\n output.resize(n);\n #pragma opm parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const size_t n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum(0, 
0);\n for (size_t j = 0; j < n; ++j) {\n const double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int n = x.size();\n output.resize(n);\n\n 
#pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.09828871991485, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.09854931607842, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04930046014488, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02467194236815, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01227719150484, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00606356114149, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00303738415241, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00161614380777, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n const size_t n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (size_t j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + 
"are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": ")", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.09843717273325, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.09426282458007, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04715942870826, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02349621504545, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01225909106433, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00579193010926, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00290458258241, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00147120486945, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "serial", + "prompt": "/* Compute the discrete fourier transform of x. 
Store the result in output.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "<", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "<", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::polar(1.0, -2 * M_PI * k * t / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, -angle);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::polar(1.0, -2 * M_PI * k * t / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int i = 0; i < n; ++i) {\n for (int j = 0; j < n; ++j) {\n output[i] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, angle);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n 
output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "hip", + "prompt": "/* Compute the discrete fourier transform of x. Store the result in output.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\n__global__ void dft(const double *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "(x,y)", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum_real = 0;\n double sum_imag = 0;\n\n for (size_t i = 0; i < N; i++) {\n double angle = 2 * M_PI * gid * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n\n output[gid] = make_cuDoubleComplex(sum_real, sum_imag);\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCadd(0, 0);\n for (int k = 0; k < N; k++) 
{\n double angle = -2 * M_PI * i * k / N;\n output[i] = cuCadd(output[i], cuCmulf(x[k], cuCexp(cuCmulf(0, angle))));\n }\n }\n}\n", + "\n\n int id = blockIdx.x * blockDim.x + threadIdx.x;\n if (id < N) {\n double sum_real = 0, sum_imag = 0;\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * id * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n output[id] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double real = 0;\n double imag = 0;\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n real += x[i] * cos(angle);\n imag += x[i] * sin(angle);\n }\n output[tid] = make_cuDoubleComplex(real, imag);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleReal sum_real = 0;\n cuDoubleReal sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplexf(sum_real, sum_imag);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid >= N) {\n return;\n }\n\n double real = 0;\n double imag = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * gid / N;\n real += x[k] * cos(angle);\n imag += x[k] * sin(angle);\n }\n\n output[gid] = make_cuDoubleComplex(real, imag);\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum_real = 0;\n double sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double real = 0;\n double imag = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * gid / 
N;\n real += x[k] * cos(angle);\n imag += x[k] * sin(angle);\n }\n output[gid] = make_cuDoubleComplex(real, imag);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n double angle = 2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[i] * term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n local_output[i] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / x.size());\n }\n }\n\n // Gather the local DFTs\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes a part of the output\n int start = rank * n / size;\n int end = (rank + 1) * n / size;\n\n for (int i = start; i < end; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * i * j / n;\n sum += std::complex(x[j], 0) * std::exp(std::complex(0, angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT for each rank\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n\n // Gather the results from all ranks\n std::vector> all_output(n * size);\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, all_output.data(), 
n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the gathered results to output\n if (rank == 0) {\n output = all_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / 
n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local contribution of each process\n for (int i = rank; i < n; i += size) {\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n output[j] += x[i] * std::complex(std::cos(angle), std::sin(angle));\n }\n }\n\n // Reduce the local contributions to the global result\n MPI_Allreduce(MPI_IN_PLACE, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local DFT\n for (int i = rank; i < n; i += size) {\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n output[i] += x[j] * std::complex(cos(angle), sin(angle));\n }\n }\n\n // Reduce the local DFTs to the global DFT\n MPI_Allreduce(MPI_IN_PLACE, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "cuda", + "prompt": "/* Compute the discrete fourier transform of x. Store the result in output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\n__global__ void dft(const double *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[i];\n sum += term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[i];\n sum += term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int k = 0; k < N; ++k) {\n double angle = -2 * M_PI * i * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t kBlockSize = 
256;\n const size_t kGridSize = (N + kBlockSize - 1) / kBlockSize;\n const size_t kSharedMemorySize = 2 * kBlockSize;\n\n dim3 blockDim(kBlockSize, 1, 1);\n dim3 gridDim(kGridSize, 1, 1);\n shared double sdata[2 * kBlockSize];\n\n for (size_t i = 0; i < N; ++i) {\n output[i] = make_cuDoubleComplex(0, 0);\n }\n\n for (size_t i = 0; i < N; ++i) {\n sdata[2 * threadIdx.x + 0] = x[i];\n sdata[2 * threadIdx.x + 1] = 0;\n }\n\n __syncthreads();\n\n for (size_t k = 0; k < N; ++k) {\n double e = -2 * M_PI * k / N;\n double c = cos(e);\n double s = sin(e);\n double t0 = 1;\n double t1 = 0;\n for (size_t i = 0; i < N / 2; ++i) {\n double a = sdata[2 * i + 0];\n double b = sdata[2 * i + 1];\n double ta = t0 * a - t1 * b;\n double tb = t0 * b + t1 * a;\n sdata[2 * i + 0] = ta;\n sdata[2 * i + 1] = tb;\n t0 = c;\n t1 = s;\n c = t0 * c - t1 * s;\n s = t0 * s + t1 * c;\n }\n __syncthreads();\n output[k] = make_cuDoubleComplex(sdata[0], sdata[1]);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = threadIdx.x;\n double sum_real = 0;\n double sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplex(sum_real, sum_imag);\n}\n", + "\n const int tid = threadIdx.x + blockIdx.x * blockDim.x;\n const int stride = blockDim.x * gridDim.x;\n for (size_t i = tid; i < N; i += stride) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n const double angle = 2 * M_PI * j * i / N;\n const cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(x[j], term));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n if (gid < N) {\n double sum_real = 0;\n double sum_imag = 0;\n for (size_t i = 0; i < N; i++) {\n double angle = -2 * M_PI * gid * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n output[gid] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += w * x[j];\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(c, make_cuDoubleComplex(x[j], 0)));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n 
cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(local_n);\n std::vector> local_output(local_n);\n std::copy(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n, local_x.begin());\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n sum += local_x[j] * std::exp(-2 * M_PI * i * j / local_n);\n }\n local_output[i] = sum;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_REAL, output.data(), local_n, MPI_DOUBLE_REAL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT for each frequency\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = rank; i < n; i += nprocs) {\n for (int j = 0; j < n; j++) {\n std::complex sum(0, 0);\n for (int k = 0; k < n; k++) {\n sum += x[k] * std::exp(-2 * M_PI * std::complex(0, 1) * i * k / n);\n }\n local_output[i] = sum;\n }\n }\n\n // Gather the local results into the output vector\n MPI_Allgather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the DFT of each element of x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex 
sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial DFT for this rank\n std::vector> partial_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n partial_output[i] = sum;\n }\n\n // Gather the partial results into the output vector\n MPI_Gather(partial_output.data(), n, MPI_DOUBLE_COMPLEX,\n output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Allgather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the DFT of each element in x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n MPI_Allgather(MPI_IN_PLACE, n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element of x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[0], n, MPI_DOUBLE_COMPLEX, &output[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n std::vector> global_output(n * size);\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is the root process, copy the results to the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX,\n output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store the result in output.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(Kokkos::View*> &x, Kokkos::View*> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n using namespace Kokkos::Impl;\n\n const int n = x.extent(0);\n const int n_half = n / 2;\n const int n_quarter = n / 4;\n const int n_eighth = n / 8;\n\n // Create a plan for the FFT\n auto plan = FFTPlan::create(n);\n\n // Execute the FFT\n plan->execute(x, output);\n\n // Reorder the output to be in the correct order\n parallel_for(n, KOKKOS_LAMBDA(int i) {\n if (i < n_half) {\n output(i) = output(i);\n } else {\n output(i) = output(n - i);\n }\n });\n\n // Compute the magnitude of the output\n parallel_for(n, KOKKOS_LAMBDA(int i) { output(i) = abs(output(i)); });\n\n // Normalize the output\n parallel_for(n, KOKKOS_LAMBDA(int i) { output(i) /= n; });\n}\n", + "\n\n int n = x.extent(0);\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n int m = 1 << log_n;\n Kokkos::View*> w(m);\n for (int i = 0; i < m; i++) {\n w(i) = Kokkos::complex(cos(2 * M_PI * i / m), sin(2 * M_PI * i / m));\n }\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int &i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < m; j++) {\n sum += x((i + j) % m) * w(j);\n }\n output(i) = sum;\n });\n}\n", + "\n using namespace Kokkos;\n using namespace std;\n const int N = x.extent(0);\n const int M = output.extent(0);\n const double pi = 3.14159265358979323846;\n parallel_for(\"Fourier Transform\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const int i) {\n output(i) = {0.0, 0.0};\n for (int j = 0; j < N; j++) {\n output(i) += x(j) * exp(complex(0, -2 * pi * i * j / N));\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n\n Kokkos::View*> temp(\"temp\", n);\n\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log_n; j++) {\n rev |= ((i >> j) & 1) << (log_n - j - 1);\n }\n temp(rev) = x(i);\n }\n\n for (int s = 1; s <= log_n; s++) {\n int m = 1 << s;\n int m2 = m >> 1;\n Kokkos::complex w(cos(2 * Kokkos::Pi / m), sin(2 * Kokkos::Pi / m));\n for (int k = 0; k < n; k += m) {\n Kokkos::complex wm(1, 0);\n for (int j = 0; j < m2; j++) {\n Kokkos::complex t = temp(k + j);\n Kokkos::complex u = temp(k + j + m2) * wm;\n temp(k + j) = t + u;\n temp(k + j + m2) = t - u;\n wm *= w;\n }\n }\n }\n\n output = temp;\n}\n", + "\n\n const int N = x.extent(0);\n const int logN = std::log2(N);\n const int halfN = N / 2;\n\n // Reverse bits\n Kokkos::View reversed = Kokkos::View(\"reversed\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) {\n int rev = 0;\n int bit = 1;\n for (int j = 0; j < logN; j++) {\n 
if (i & bit) {\n rev |= 1 << (logN - j - 1);\n }\n bit <<= 1;\n }\n reversed(i) = rev;\n });\n\n // Compute the FFT\n Kokkos::View*> y = Kokkos::View*>(\"y\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { y(i) = x(reversed(i)); });\n\n for (int s = 1; s <= logN; s++) {\n const int m = 1 << s;\n const int m2 = m / 2;\n const double theta = 2 * M_PI / m;\n Kokkos::complex w(std::cos(theta), std::sin(theta));\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) {\n if (i % m < m2) {\n const Kokkos::complex t = w * y(i + m2);\n y(i + m2) = y(i) - t;\n y(i) = y(i) + t;\n }\n });\n }\n\n // Copy the result to the output\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { output(i) = y(i); });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const int logN = std::log2(N);\n\n // Reverse bits\n Kokkos::View reversed(N);\n Kokkos::parallel_for(\"fft_reverse_bits\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int reversed_i = 0;\n int j = i;\n for (int k = 0; k < logN; ++k) {\n reversed_i |= ((j & 1) << (logN - k - 1));\n j >>= 1;\n }\n reversed(i) = reversed_i;\n });\n\n // Compute the FFT\n Kokkos::parallel_for(\"fft_compute\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int j = reversed(i);\n output(i) = x(j);\n });\n\n // Compute the twiddle factors\n Kokkos::View*> twiddle(N);\n Kokkos::parallel_for(\"fft_twiddle\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n double angle = -2 * M_PI * i / N;\n twiddle(i) = Kokkos::complex(std::cos(angle), std::sin(angle));\n });\n\n // Compute the FFT\n for (int s = 1; s <= logN; ++s) {\n const int m = 1 << s;\n const int m2 = m / 2;\n for (int k = 0; k < m2; ++k) {\n Kokkos::parallel_for(\"fft_compute_s\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int j = i & (m - 1);\n if (j < m2) {\n const Kokkos::complex t = twiddle(k * N / m) * output(i + m2);\n output(i + m2) = output(i) - t;\n output(i) += t;\n }\n });\n }\n }\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int N = x.extent(0);\n const int logN = Kokkos::ilog2(N);\n\n auto bit_reverse = [&](int i) {\n int r = 0;\n for (int j = 0; j < logN; ++j) {\n r |= ((i >> j) & 1) << (logN - 1 - j);\n }\n return r;\n };\n\n auto twiddle = [&](int k) {\n return Kokkos::complex(cos(2 * M_PI * k / N), sin(2 * M_PI * k / N));\n };\n\n auto fft_rec = [&](View*> x, int n) {\n if (n == 1) {\n return;\n }\n auto x_even = View*>(x, Range(0, n / 2));\n auto x_odd = View*>(x, Range(n / 2, n));\n fft_rec(x_even, n / 2);\n fft_rec(x_odd, n / 2);\n for (int k = 0; k < n / 2; ++k) {\n auto t = x_odd(k) * twiddle(k);\n x_odd(k) = x_even(k) - t;\n x_even(k) += t;\n }\n };\n\n auto x_bit_reversed = View*>(x.extent(0));\n for (int i = 0; i < N; ++i) {\n x_bit_reversed(bit_reverse(i)) = x(i);\n }\n fft_rec(x_bit_reversed, N);\n output = x_bit_reversed;\n}\n", + "\n\n int n = x.extent(0);\n output.resize(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n if (n == 1) {\n output(0) = x(0);\n return;\n }\n Kokkos::View*> even = Kokkos::subview(output, 
Kokkos::ALL(), Kokkos::Slice(0, n / 2));\n Kokkos::View*> odd = Kokkos::subview(output, Kokkos::ALL(), Kokkos::Slice(n / 2, n));\n Kokkos::parallel_for(n / 2, KOKKOS_LAMBDA(int i) {\n even(i) = x(2 * i);\n odd(i) = x(2 * i + 1);\n });\n fft(even, even);\n fft(odd, odd);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex w = Kokkos::complex(cos(2 * M_PI * i / n), sin(2 * M_PI * i / n));\n output(i) = even(i % (n / 2)) + w * odd(i % (n / 2));\n });\n}\n", + "\n\n // Compute the size of the input\n const int n = x.extent(0);\n\n // Create a scratchpad for the intermediate results\n Kokkos::View*> scratchpad(\"scratchpad\", n);\n\n // Compute the FFT in parallel\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n // Compute the FFT of the i-th element of x\n Kokkos::complex result = std::complex(0.0, 0.0);\n for (int j = 0; j < n; ++j) {\n result += x(j) * std::exp(-2.0 * M_PI * std::complex(0.0, 1.0) * i * j / n);\n }\n\n // Store the result in the scratchpad\n scratchpad(i) = result;\n });\n\n // Copy the results from the scratchpad to the output\n Kokkos::deep_copy(output, scratchpad);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n output(i) = Kokkos::complex(real, imag);\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n output[i] = Kokkos::complex(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n output[i] += x[j] * Kokkos::exp(Kokkos::complex(0.0, -2.0 * Kokkos::Pi * i * j / n));\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n output(0) = x(0);\n for (int k = 1; k < n; ++k) {\n Kokkos::complex w = std::polar(1.0, -2 * Kokkos::Math::pi * k / n);\n Kokkos::complex t = x(0);\n for (int j = 1; j < n; ++j) {\n t += w * x(j);\n w *= w;\n output(j) = t;\n }\n }\n}\n", + "\n\n int n = x.extent(0);\n output.resize(n);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * Kokkos::Impl::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * std::exp(Kokkos::complex(0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n // Get the size of the input\n int n = x.extent(0);\n\n // Create a Kokkos view for the output\n Kokkos::View*> output_view(\"output\", n);\n\n // Compute the FFT in parallel\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) {\n output_view(i) = fft_helper(x, i, n);\n });\n\n // Copy the output view to the output array\n Kokkos::deep_copy(output, output_view);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x(j) * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy<>(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * 
Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n output.resize(n);\n\n Kokkos::parallel_for(\"fft\", n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store the result in output.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n const int n = x.size();\n output.resize(n);\n #pragma openmp parallel for\n for (int i = 0; i < n; ++i) {\n double real = 0, imag = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n real += x[j].first * cos(angle) + x[j].second * sin(angle);\n imag += x[j].first * sin(angle) - x[j].second * cos(angle);\n }\n output[i] = {real, imag};\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex arg(2 * M_PI * i * j / n, 0);\n sum += x[j] * std::exp(-arg);\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "<", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n const int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::polar(1.0, angle);\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + 
"did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const size_t n = x.size();\n output.resize(n);\n const double pi = std::acos(-1);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = {0, 0};\n for (size_t t = 0; t < n; ++t) {\n double angle = 2 * pi * t * k / n;\n sum += x[t] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex omega = std::polar(1.0, -2 * M_PI * i * j / n);\n sum += x[j] * omega;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n std::complex arg(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * arg;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of x\n #pragma 
omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + 
"num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex omega = std::polar(1.0, -2 * M_PI * i * j / n);\n sum += x[j] * omega;\n }\n output[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x. 
Store the result in output.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double pi = std::acos(-1);\n std::complex w(1), wn(std::cos(2 * pi / n), std::sin(2 * pi / n));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(cos(angle), sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n 
int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2);\n std::vector> odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n 
double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int i = 0; i < n; ++i) {\n for (int j = 0; j < n; ++j) {\n output[i] += x[j] * std::exp(-2 * std::complex(0, 1) * M_PI * i * j / n);\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x. Store the result in output.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fft(const hipDoubleComplex *x, hipDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid >= N) {\n return;\n }\n double arg = -2 * M_PI * gid / N;\n hipDoubleComplex w = hipDoubleComplex(cos(arg), sin(arg));\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n sum += x[i] * pow(w, i * gid);\n }\n output[gid] = sum;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(omega), sin(omega)};\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(theta), sin(theta));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex wp = hipDoubleComplex(1, 0);\n for (size_t k = j; k < j + m; k++) {\n hipDoubleComplex t = wp * output[k + m];\n output[k + m] = output[k] - t;\n output[k] += t;\n wp *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = make_hipDoubleComplex(cos(theta), sin(theta));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex t = make_hipDoubleComplex(1.0, 0.0);\n for 
(size_t k = 0; k < m; k++) {\n hipDoubleComplex u = output[j + k];\n hipDoubleComplex v = output[j + k + m] * t;\n output[j + k] = u + v;\n output[j + k + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / s), sin(M_PI / s));\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex(1.0, 0.0);\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / m), sin(M_PI / m));\n hipDoubleComplex wm = hipDoubleComplex(1.0, 0.0);\n for (size_t j = 0; j < m; j++) {\n for (size_t k = j; k < N; k += s) {\n hipDoubleComplex t = wm * output[k + m];\n output[k + m] = output[k] - t;\n output[k] += t;\n }\n wm *= w;\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n hipDoubleComplex wm = hipDoubleComplex(1, 0);\n for (size_t k = 0; k < m; k++) {\n for (size_t j = 0; j < N / s; j++) {\n size_t index = j * s + k;\n hipDoubleComplex t = wm * output[index + m];\n output[index + m] = output[index] - t;\n output[index] += t;\n }\n wm *= w;\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t half_s = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / half_s), sin(M_PI / half_s));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t k = 0; k < half_s; k++) {\n hipDoubleComplex u = output[j + k];\n hipDoubleComplex v = output[j + k + half_s] * t;\n output[j + k] = u + v;\n output[j + k + half_s] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t tid = hipGetGlobalThreadIdx();\n const size_t N_2 = N / 2;\n const size_t N_4 = N / 4;\n const double arg = -2 * M_PI / N;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n hipDoubleComplex w = {cos(arg * k), sin(arg * k)};\n hipDoubleComplex term = hipCMul(x[k], w);\n sum = hipCAdd(sum, term);\n }\n\n const size_t index = tid;\n if (tid >= N_2) {\n index -= N_2;\n index = N_4 - index - 1;\n index += N_2;\n }\n\n output[index] = sum;\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n double omega = M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(omega), sin(omega)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n 
hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + s] * t;\n output[k + j] = u + v;\n output[k + j + s] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(theta), sin(theta)};\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 1; s <= N; s *= 2) {\n for (size_t k = 0; k < N; k += 2 * s) {\n for (size_t j = k; j < k + s; j++) {\n hipDoubleComplex twiddle = hipDoubleComplex{cos(M_PI * j / s),\n -sin(M_PI * j / s)};\n hipDoubleComplex temp = output[j + s] * twiddle;\n output[j + s] = output[j] - temp;\n output[j] += temp;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / m), sin(M_PI / m));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the 
fourier transform of x. Store the result in output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFT results\n std::vector> global_output(n);\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, global_output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the result in output\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_output(n_local);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), n_local, MPI_DOUBLE_COMPLEX, output.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(n / size);\n fft_local(x, local_fft);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(n);\n MPI_Gather(local_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, global_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter the global FFT to the local FFTs\n MPI_Scatter(global_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, local_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Convert the local FFTs to the output format\n output.resize(n);\n for (int i = 0; i < n; i++) {\n output[i] = local_fft[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = (rank + 1) * local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> 
local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i / x); });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the output vector\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFT results\n std::vector> global_output(n);\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, global_output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the global FFT result to the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(),\n [](std::complex const& x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) {\n return std::exp(std::complex(0, -2 * M_PI * x.real()));\n });\n\n // Gather the local FFTs into 
the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft_local(x, rank, size);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(x.size());\n MPI_Allgather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the global FFT in the output vector\n output = global_fft;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) {\n return std::exp(std::complex(0, -2 * M_PI * x.real()));\n });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform of x.\n std::vector> local_fft = fft(x, rank, size);\n\n // Gather the local Fourier transforms into output.\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX,\n output.data(), output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int n_padded = n_local + 2;\n std::vector> padded_x(n_padded, 0.0);\n std::copy(x.begin() + rank * n_local, x.begin() + (rank + 1) * n_local, padded_x.begin() + 1);\n\n std::vector> padded_output(n_padded, 0.0);\n fft(padded_x, padded_output);\n\n std::vector> output_local(n_local, 0.0);\n std::copy(padded_output.begin() + 1, padded_output.begin() + n_local + 1, output_local.begin());\n\n MPI_Allgather(output_local.data(), n_local, MPI_DOUBLE_PRECISION, output.data(), n_local, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft(local_x, local_output);\n\n // Gather the results\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, 
MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If we are not rank 0, we can return now\n if (rank != 0) {\n return;\n }\n\n // If we are rank 0, we need to combine the results\n for (int i = 1; i < size; i++) {\n int start = i * local_n;\n int end = start + local_n;\n for (int j = start; j < end; j++) {\n output[j] += output[j - start];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local FFT\n std::vector> local_fft(n / size);\n MPI_Scatter(x.data(), n / size, MPI_DOUBLE_COMPLEX, local_fft.data(), n / size, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n fft_local(local_fft);\n\n // Gather the local FFTs\n MPI_Gather(local_fft.data(), n / size, MPI_DOUBLE_COMPLEX, output.data(), n / size, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(output);\n }\n\n // Broadcast the global FFT\n MPI_Bcast(output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft_local(x, rank, size);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(x.size());\n MPI_Allgather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the global FFT in the output vector\n output = global_fft;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x. Store the result in output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fft(const cuDoubleComplex *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmul(2 * M_PI * i / N, cuCmulf(I, x[i])), -1));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(-2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmulf(2 * M_PI * i / N, cuCmulf(0, 1)), x[i]));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuDoubleComplex(0, -2 * M_PI * i / N), x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuDoubleComplex{0, -2 * M_PI * i / N}, x[i]));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmulf(2 * M_PI * i / N, cuDoubleComplex(0, 1)), x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(-2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex y = cuCexp(cuCmul(cuCbrt(-1.0), 2.0 * M_PI * i / N));\n output[i] = cuCmulf(x[i], y);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCmul(x[i], cuCexp(cuCmulf(-2 * M_PI * i / N, cuDoubleComplex{0, 1})));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + ":", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "mpi+omp", + "prompt": 
"#include \n#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local FFT results\n std::vector> global_output(x.size());\n MPI_Allgather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n global_output.data(), global_output.size(), MPI_DOUBLE_COMPLEX,\n MPI_COMM_WORLD);\n\n // Store the result in output\n output = global_output;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_C_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the global DFT into the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output.\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT of the gathered local DFTs.\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local results\n std::vector> global_output(x.size());\n MPI_Allgather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, global_output.data(), global_output.size(), 
MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the result in output\n output = global_output;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(n * size);\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is the root process, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n // Get the size of the input vector.\n int n = x.size();\n\n // Create a MPI communicator for the FFT.\n MPI_Comm fft_comm;\n MPI_Comm_split(MPI_COMM_WORLD, 0, MPI_THREAD_SINGLE, &fft_comm);\n\n // Create an OpenMP parallel region for the FFT.\n#pragma openmp parallel\n {\n // Get the rank of the current thread.\n int tid = openmp_get_thread_num();\n\n // Compute the local FFT of the input vector.\n std::vector> local_fft(n);\n fft(x, local_fft);\n\n // Gather the local FFTs into the output vector.\n#pragma openmp critical\n {\n for (int i = 0; i < n; ++i) {\n output[i] += local_fft[i];\n }\n }\n }\n\n // Free the MPI communicator.\n MPI_Comm_free(&fft_comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_output(n_local);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < n_local; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n_local; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), n_local, MPI_DOUBLE_COMPLEX, output.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = (rank + 1) * local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n std::complex w(cos(2 * M_PI * i * j / local_n), sin(2 * M_PI * i * j / local_n));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local FFT results\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // 
Compute the local DFT of x\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output.\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output\n MPI_Allgather(&local_output[0], n, MPI_DOUBLE_COMPLEX, &output[0], n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n double angle = 2 * M_PI * i * j / local_n;\n std::complex w(cos(angle), sin(angle));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX,\n output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft;\n fft_local(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX,\n global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, 0,\n MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter the global FFT\n MPI_ Scatter(global_fft.data(), global_fft.size() / size, MPI_DOUBLE_COMPLEX,\n local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0,\n MPI_COMM_WORLD);\n\n // Convert the local FFT to the output format\n output.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n output[i] = {local_fft[i].real(), local_fft[i].imag()};\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n local_output[i] = std::complex(0, 0);\n for (int j = 0; j < local_n; j++) {\n double angle = -2 * M_PI * i * j / local_n;\n local_output[i] += local_x[j] * std::exp(std::complex(0, angle));\n }\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), local_n, 
MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(n * size);\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is rank 0, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, store the global DFT in output\n if (rank == 0) {\n output = global_output;\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, Kokkos::View &x, Kokkos::View &y, Kokkos::View &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", Kokkos::RangePolicy<>(0, z.extent(0)), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.size(), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int &i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z[i] = alpha * x[i] + y[i];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int &i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"axpy\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. 
Store the result in z.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02137436624616, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02194353677332, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01768019814044, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01766828335822, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01789394598454, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01569646038115, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01403694935143, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01119016557932, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02047564275563, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02220324017107, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01803188174963, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01784702017903, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.017767752707, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01533585004508, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01405009627342, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01139142848551, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02105980850756, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02226959671825, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01784969270229, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01767200231552, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01785617079586, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.015459212102, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01376568861306, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01518744640052, + "num_threads": 64 + } + ] + }, + 
{ + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02107141371816, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02214281838387, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0178399162367, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01790784429759, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01780489254743, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0157470440492, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01412161085755, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01106294039637, + "num_threads": 64 + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02134139630944, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02243083659559, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01808047499508, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01769449505955, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01773711889982, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01528470199555, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01355348750949, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01110687069595, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02117972355336, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02246921062469, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01770280841738, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01794238742441, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01777498796582, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01538975536823, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01359867881984, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01123306043446, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i 
< x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02081463709474, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02205370124429, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01760870944709, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01786960251629, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01779382936656, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0153363244608, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01409021336585, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01102213934064, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02078745570034, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02220428586006, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0177459737286, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01773599684238, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01778500936925, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01529036592692, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01342380326241, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01103427056223, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02141734901816, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02241190895438, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01783207375556, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01774478722364, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01780739743263, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01550964880735, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01351963765919, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01181001644582, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma simd\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02221259381622, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 
0.02226062212139, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02231705002487, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02227670624852, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02225092221051, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02231102697551, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02222534380853, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02218185197562, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02076092343777, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02222317140549, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01774778328836, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01786759998649, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01788942553103, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01567351538688, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01366095747799, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01173541229218, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02119480408728, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02215160131454, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01746466848999, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01750683784485, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01794855277985, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01548261232674, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01359134651721, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01183322314173, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02093975450844, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02208202015609, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01793798301369, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01795112341642, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01774862650782, + "num_threads": 8 + }, + 
{ + "did_run": true, + "is_valid": true, + "runtime": 0.01537489313632, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01372018307447, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01080264616758, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02094893381, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02185585405678, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01764855850488, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0177437633276, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01774051748216, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0152638124302, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01374110821635, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01114352140576, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02132465839386, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02239399887621, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01781098004431, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01796987261623, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01784252114594, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01534778233618, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01411248911172, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01174055952579, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02107731942087, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02199772745371, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01749049481004, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01763620115817, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01783123612404, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01514543946832, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01347478944808, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01101408619434, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma 
omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02139483243227, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02238338701427, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01770461387932, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01783702708781, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01773197315633, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01527838520706, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01363529339433, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01053996756673, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02130632009357, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0223602835089, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01797670945525, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01768732983619, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01786826383322, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01535300388932, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01351536586881, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01101761311293, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02066582087427, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02193213012069, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01785307954997, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01791357081383, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01789752244949, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01544445808977, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01394428312778, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01173840742558, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "serial", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. 
Store the result in z.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (std::size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (std::size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "hip", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\n__global__ void axpy(double alpha, const double *x, const double *y, double *z, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "(x,y,z,t)", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n z[i] = alpha * x[i] + y[i];\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of z\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n z.resize(x.size());\n for (size_t i = 0; i < x.size(); i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Barrier(MPI_COMM_WORLD);\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n\n // Compute the local portion of z\n std::vector local_z(n);\n for (int i = 0; i < n; ++i) {\n local_z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the local portions of z into the global vector z on 
rank 0\n if (rank == 0) {\n z.resize(n);\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its part of the result\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of the vectors\n int local_size = x.size() / size;\n\n // Allocate memory for the local portion of the vectors\n std::vector local_x(local_size);\n std::vector local_y(local_size);\n std::vector local_z(local_size);\n\n // Scatter the vectors to the ranks\n MPI_Scatter(&x[0], local_size, MPI_DOUBLE, &local_x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(&y[0], local_size, MPI_DOUBLE, &local_y[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local portion of the result\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * local_x[i] + local_y[i];\n }\n\n // Gather the results from the ranks\n MPI_Gather(&local_z[0], local_size, MPI_DOUBLE, &z[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n int n = x.size();\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n \n if (rank != 0) {\n MPI_Send(&z[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector tmp(n);\n MPI_Recv(&tmp[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n z[j] += tmp[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of z\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n 
MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "cuda", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\n__global__ void axpy(double alpha, const double *x, const double *y, double *z, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "{,", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, n_threads;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n #pragma omp parallel\n {\n n_threads = omp_get_num_threads();\n }\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "(", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector local_z(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n local_z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n z.resize(n);\n\n#pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the vector\n int local_size = x.size() / size;\n\n // Compute the local start index\n int local_start = rank * local_size;\n\n // Compute the local end index\n int local_end = (rank + 1) * local_size;\n\n // Compute the local part of the result\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_z[i] = alpha * x[local_start + i] + y[local_start + i];\n }\n\n // Gather the local results into the global result\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank == 0) {\n z.resize(x.size());\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the local portion of x and y\n int local_size = x.size() / nprocs;\n\n // Compute the starting index of the local portion of x and y\n int start_index = rank * local_size;\n\n // Compute the local portion of z\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_z[i] = alpha * x[start_index + i] + y[start_index + i];\n }\n\n // Gather the local portions of z into the global vector z on rank 0\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, 
size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of the vector\n int local_size = x.size() / size;\n\n // Compute the starting index of the local portion of the vector\n int start_index = rank * local_size;\n\n // Compute the local portion of the vector\n std::vector local_x(x.begin() + start_index, x.begin() + start_index + local_size);\n std::vector local_y(y.begin() + start_index, y.begin() + start_index + local_size);\n\n // Compute the local portion of the result\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * local_x[i] + local_y[i];\n }\n\n // Gather the local portions of the result into the global result\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of x and y\n int local_size = x.size() / size;\n\n // Compute the starting index of the local portion of x and y\n int start_index = rank * local_size;\n\n // Compute the local portion of z\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * x[start_index + i] + y[start_index + i];\n }\n\n // Gather the local portions of z into the global vector z on rank 0\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of each block\n int block_size = x.size() / size;\n\n // Compute the starting index of each block\n int 
start_index = rank * block_size;\n\n // Compute the ending index of each block\n int end_index = start_index + block_size;\n\n // If the rank is the last rank, then the ending index is the size of the vector\n if (rank == size - 1) {\n end_index = x.size();\n }\n\n // Compute the local result\n std::vector local_z(block_size);\n for (int i = start_index; i < end_index; i++) {\n local_z[i - start_index] = alpha * x[i] + y[i];\n }\n\n // Reduce the local results to the global result\n MPI_Allreduce(local_z.data(), z.data(), block_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n z.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of each chunk of data that each rank will process\n int chunk_size = x.size() / size;\n\n // Compute the starting index of the data that each rank will process\n int start_index = rank * chunk_size;\n\n // Compute the ending index of the data that each rank will process\n int end_index = (rank + 1) * chunk_size;\n\n // If the rank is the last rank, then the ending index is the size of the data\n if (rank == size - 1) {\n end_index = x.size();\n }\n\n // Compute the local result for this rank\n std::vector local_result(chunk_size);\n #pragma omp parallel for\n for (int i = start_index; i < end_index; i++) {\n local_result[i - start_index] = alpha * x[i] + y[i];\n }\n\n // Gather the local results from all ranks into the global result on rank 0\n MPI_Gather(local_result.data(), chunk_size, MPI_DOUBLE, z.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix. x and b have N elements.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(Kokkos::View &A, Kokkos::View &b, Kokkos::View &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::Range(0, N), [=](const int i) {\n double sum = 0;\n for (int j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n KokConfigService config;\n config.set_num_threads(KokConfigService::num_hardware_threads());\n KokConfigService::set_configuration(config);\n KokConfigService::print_configuration();\n\n KokConfigService::set_configuration(config);\n KokConfigService::print_configuration();\n\n KokKok::parallel_for(N, KOK_LAMBDA(size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A[i][j] * x[j]; }\n x[i] = (b[i] - sum) / A[i][i];\n });\n}\n", + "\n\n // Create a Kokkos view for the matrix A.\n Kokkos::View A_view = Kokkos::View(\"A\", N, N);\n Kokkos::deep_copy(A_view, A);\n\n // Create a Kokkos view for the vector b.\n Kokkos::View b_view = Kokkos::View(\"b\", N);\n Kokkos::deep_copy(b_view, b);\n\n // Create a Kokkos view for the vector x.\n Kokkos::View x_view = Kokkos::View(\"x\", N);\n\n // Solve the linear system using the Kokkos parallel backend.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A_view(i, j) * x_view(j);\n }\n x_view(i) = (b_view(i) - sum) / A_view(i, i);\n });\n\n // Copy the solution vector x back to the output view.\n Kokkos::deep_copy(x, x_view);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution.\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define a Kokkos kernel functor to perform the computation.\n Kokkos::KernelFunctor kernelFunctor([&](const int i) {\n // Initialize the solution vector x.\n x(i) = 0.0;\n\n // Perform forward substitution to solve the linear system.\n for (int j = 0; j < i; ++j) {\n x(i) -= A(i, j) * x(j);\n }\n\n // Perform backward substitution to solve the linear system.\n for (int j = i + 1; j < N; ++j) {\n x(i) -= A(i, j) * x(j);\n }\n\n // Divide by the diagonal element to get the final solution.\n x(i) /= A(i, i);\n });\n\n // Execute the kernel functor using the Kokkos team policy.\n Kokkos::parallel_for(policy, kernelFunctor);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution.\n Kokkos::TeamPolicy policy(N);\n\n // Create a Kokkos parallel loop to solve the linear system.\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Solve the ith equation of the linear system.\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy 
policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int& i) {\n // Initialize the solution vector x\n x[i] = 0.0;\n\n // Compute the dot product of the ith row of A and x\n double dot_product = 0.0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A(i, j) * x[j];\n }\n\n // Update the solution vector x\n x[i] = (b[i] - dot_product) / A(i, i);\n });\n}\n", + "\n\n Kokkos::View A_copy = Kokkos::View(\"A_copy\", N, N);\n Kokkos::deep_copy(A_copy, A);\n Kokkos::View b_copy = Kokkos::View(\"b_copy\", N);\n Kokkos::deep_copy(b_copy, b);\n Kokkos::View x_copy = Kokkos::View(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n\n for (size_t i = 0; i < N; i++) {\n double max_val = std::abs(A_copy(i, i));\n size_t max_idx = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A_copy(j, i)) > max_val) {\n max_val = std::abs(A_copy(j, i));\n max_idx = j;\n }\n }\n if (max_val == 0) {\n std::cout << \"The matrix is singular and cannot be inverted.\" << std::endl;\n return;\n }\n for (size_t j = 0; j < N; j++) {\n std::swap(A_copy(i, j), A_copy(max_idx, j));\n }\n std::swap(b_copy(i), b_copy(max_idx));\n for (size_t j = 0; j < N; j++) {\n if (i != j) {\n double factor = A_copy(j, i) / A_copy(i, i);\n for (size_t k = 0; k < N; k++) {\n A_copy(j, k) -= factor * A_copy(i, k);\n }\n b_copy(j) -= factor * b_copy(i);\n }\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A_copy(i, j) * x_copy(j);\n }\n x_copy(i) = (b_copy(i) - sum) / A_copy(i, i);\n }\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int& i) {\n // Initialize the solution vector x\n x[i] = 0.0;\n\n // Iterate over the rows of the matrix A\n for (size_t j = 0; j < N; ++j) {\n // Compute the dot product of the ith row of A and the solution vector x\n double dot_product = 0.0;\n for (size_t k = 0; k < N; ++k) {\n dot_product += A(i, k) * x[k];\n }\n\n // Update the ith element of the solution vector x\n x[i] = (b[i] - dot_product) / A(i, i);\n }\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to be executed in parallel\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Initialize the solution vector x\n x(i) = 0.0;\n\n // Compute the dot product of the ith row of A and x\n for (int j = 0; j < N; ++j) {\n x(i) += A(i, j) * x(j);\n }\n\n // Subtract the dot product from b(i) to get the solution\n x(i) = b(i) - x(i);\n });\n}\n", + "\n\n // Create a copy of A for in-place LU decomposition\n auto A_copy = Kokkos::subview(A, Kokkos::ALL(), Kokkos::ALL(), Kokkos::NO_OFFSET, Kokkos::NO_OFFSET);\n\n // Perform LU decomposition\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n for (size_t j = 0; j < N; ++j) {\n A_copy(i, j) /= A_copy(i, i);\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = 0; j < N; ++j) {\n if (k > i) {\n A_copy(k, j) -= A_copy(k, i) * A_copy(i, j);\n } else if (k < i) {\n A_copy(k, j) -= A_copy(k, i) * A_copy(i, j);\n }\n }\n }\n });\n\n // Solve the system\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n x(i) = b(i) / A_copy(i, i);\n for (size_t j = 0; 
j < N; ++j) {\n if (j > i) {\n x(j) -= x(i) * A_copy(j, i);\n }\n }\n });\n\n // Backward substitution\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n for (size_t j = 0; j < N; ++j) {\n if (j < i) {\n x(j) -= x(i) * A_copy(i, j);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Initialize the solution vector x\n x(i) = 0.0;\n\n // Compute the dot product of the ith row of A and x\n double dot_product = 0.0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A(i, j) * x(j);\n }\n\n // Update the solution vector x\n x(i) = (b(i) - dot_product) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "_t", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. 
x and b have N elements.\n Use OpenMP to compute in parallel.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n #pragma omp parallel for\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Forward elimination\n for (size_t j = i + 1; j < N; j++) {\n double factor = A_copy[j][i] / A_copy[i][i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j][k] -= factor * A_copy[i][k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Back substitution\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A_copy[i][j] * x[j];\n }\n x[i] /= A_copy[i][i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Solve the system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n x[N - 1] = b[N - 1] / A[(N - 1) * N + (N - 1)];\n for (int i = N - 2; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Solve the system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double inv_aii = 1.0 / A[i * N + 
i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] *= inv_aii;\n }\n b[i] *= inv_aii;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n x[N - 1] = b[N - 1];\n for (int i = N - 2; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = b[i] - sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A_local[i * N + i];\n for (size_t j = i; j < N; ++j) {\n A_local[i * N + j] *= factor;\n }\n b_local[i] *= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A_local[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A_local[j * N + k] -= multiplier * A_local[i * N + k];\n }\n b_local[j] -= multiplier * b_local[i];\n }\n }\n\n // Perform back substitution in parallel\n #pragma omp parallel for\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_local[i * N + j] * x[j];\n }\n x[i] = (b_local[i] - sum) / A_local[i * N + i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = i; j < N; j++) {\n A_copy[i * N + j] *= factor;\n }\n b_copy[i] *= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A_copy[j * N + i];\n for (size_t k = i; k < N; k++) {\n A_copy[j * N + k] -= multiplier * A_copy[i * N + k];\n }\n b_copy[j] -= multiplier * b_copy[i];\n }\n }\n\n // Perform back substitution to solve for x\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (int j = i + 1; j < N; j++) {\n x[i] -= A_copy[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.10972965955734, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.12965337187052, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Solve the linear system Ax=b for x.\n 
// A is an NxN matrix in row-major. x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a copy of A and b for each thread.\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Solve the linear system for each thread.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Forward elimination.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_local[j][i] / A_local[i][i];\n for (size_t k = 0; k < N; ++k) {\n A_local[j][k] -= factor * A_local[i][k];\n }\n b_local[j] -= factor * b_local[i];\n }\n\n // Back substitution.\n x[i] = b_local[i] / A_local[i][i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A_local[i * N + j] * x[j];\n }\n x[i] = (b_local[i] - sum) / A_local[i * N + i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A_copy[i][i];\n for (size_t j = i; j < N; ++j) {\n A_copy[i][j] /= factor;\n }\n b_copy[i] /= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A_copy[j][i];\n for (size_t k = i; k < N; ++k) {\n A_copy[j][k] -= multiplier * A_copy[i][k];\n }\n b_copy[j] -= multiplier * b_copy[i];\n }\n }\n\n // Perform back substitution in parallel\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_copy[i][j] * x[j];\n }\n x[i] = (b_copy[i] - sum) 
/ A_copy[i][i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a copy of A and b to avoid modifying the input\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A_copy[i * N + j] *= factor;\n }\n b_copy[i] *= factor;\n\n // Subtract the ith row from all other rows\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_copy[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A_copy[j * N + k] -= factor * A_copy[i * N + k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Perform back substitution\n x.resize(N);\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_copy[i * N + j] * x[j];\n }\n x[i] = (b_copy[i] - sum) / A_copy[i * N + i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10791173484176, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.14052457679063, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0716469053179, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03864681404084, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02670485191047, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01969364862889, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02104785740376, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04283771533519, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Solve the linear system Ax=b for x.\n // A is an N*N matrix in row-major. 
x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a vector to store the solution.\n x.resize(N);\n\n // Create a vector to store the residual.\n std::vector r(N);\n\n // Initialize the residual.\n for (size_t i = 0; i < N; i++) {\n r[i] = b[i];\n }\n\n // Solve the linear system using the conjugate gradient method.\n for (size_t k = 0; k < N; k++) {\n // Compute the dot product of the residual and the residual.\n double r_dot_r = 0.0;\n #pragma omp parallel for reduction(+:r_dot_r)\n for (size_t i = 0; i < N; i++) {\n r_dot_r += r[i] * r[i];\n }\n\n // Check if the residual is small enough.\n if (r_dot_r < 1e-10) {\n break;\n }\n\n // Compute the dot product of the residual and the matrix-vector product.\n double r_dot_Ax = 0.0;\n #pragma omp parallel for reduction(+:r_dot_Ax)\n for (size_t i = 0; i < N; i++) {\n double tmp = 0.0;\n for (size_t j = 0; j < N; j++) {\n tmp += A[i * N + j] * x[j];\n }\n r_dot_Ax += r[i] * tmp;\n }\n\n // Compute the step size.\n double alpha = r_dot_r / r_dot_Ax;\n\n // Update the solution.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] += alpha * r[i];\n }\n\n // Update the residual.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n double tmp = 0.0;\n for (size_t j = 0; j < N; j++) {\n tmp += A[i * N + j] * x[j];\n }\n r[i] -= alpha * tmp;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Forward elimination\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_copy[j][i] / A_copy[i][i];\n for (size_t k = 0; k < N; ++k) {\n A_copy[j][k] -= factor * A_copy[i][k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Back substitution\n x.resize(N);\n for (int i = N - 1; i >= 0; --i) {\n x[i] = b_copy[i];\n for (int j = i + 1; j < N; ++j) {\n x[i] -= A_copy[i][j] * x[j];\n }\n x[i] /= A_copy[i][i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize x to 0\n x.assign(N, 0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n #pragma omp parallel for\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel\n {\n // Each thread works on a different row of A\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A_local[i][i];\n for (size_t j = i; j < N; ++j) {\n A_local[i][j] /= factor;\n }\n b_local[i] /= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_local[j][i];\n for (size_t k = i; k < N; ++k) {\n A_local[j][k] -= factor * A_local[i][k];\n }\n b_local[j] -= factor * b_local[i];\n }\n }\n }\n\n // Back substitution to solve for x\n for (int i = N - 1; i >= 0; --i) {\n x[i] = b_local[i];\n for (int j = i + 1; j < N; ++j) {\n x[i] -= A_local[i][j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Solve the linear system Ax=b for x.\n // A is an NxN matrix in row-major. x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a vector to store the solution.\n x.resize(N);\n\n // Create a temporary vector to store the intermediate results.\n std::vector tmp(N);\n\n // Solve the linear system in parallel.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Compute the ith row of the solution.\n tmp[i] = b[i];\n for (size_t j = 0; j < i; ++j) {\n tmp[i] -= A[i * N + j] * x[j];\n }\n tmp[i] /= A[i * N + i];\n }\n\n // Copy the solution to the output vector.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = tmp[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Create OpenMP parallel region\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Initialize x to 0.\n x.assign(N, 0);\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n 
for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Forward substitution\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < i; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Backward substitution\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] -= sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Create a copy of A and b for each thread\n std::vector> A_local(omp_get_num_threads(), std::vector(N * N));\n std::vector> b_local(omp_get_num_threads(), std::vector(N));\n\n // Copy A and b to the local arrays\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n A_local[tid][i * N + j] = A[i * N + j];\n }\n b_local[tid][i] = b[i];\n }\n }\n\n // Solve the linear system for each thread\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n std::vector x_local(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < i; j++) {\n sum += A_local[tid][i * N + j] * x_local[j];\n }\n x_local[i] = (b_local[tid][i] - sum) / A_local[tid][i * N + i];\n }\n\n // Copy the local solution to the global solution\n #pragma omp critical\n {\n for (size_t i = 0; i < N; i++) {\n x[i] += x_local[i];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "serial", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. 
x and b have N elements.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n x.resize(N);\n std::vector y(N);\n for (size_t i = 0; i < N; i++) {\n y[i] = b[i];\n }\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] -= sum;\n x[i] = y[i] / A[i * N + i];\n }\n for (size_t i = N - 1; i > 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] -= sum;\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(Ab[j * (N + 1) + i]) > std::abs(Ab[pivot_row * (N + 1) + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N + 1; ++j) {\n std::swap(Ab[i * (N + 1) + j], Ab[pivot_row * (N + 1) + j]);\n }\n }\n\n // Eliminate the current column below the pivot row\n for (size_t j = i + 1; j < N; ++j) {\n double factor = Ab[j * (N + 1) + i] / Ab[i * (N + 1) + i];\n for (size_t k = 0; k < N + 1; ++k) {\n Ab[j * (N + 1) + k] -= factor * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Check if the system is consistent\n for (size_t i = 0; i < N; ++i) {\n if (std::abs(Ab[i * (N + 1) + N]) < 1e-9) {\n 
throw std::runtime_error(\"The system is inconsistent.\");\n }\n }\n\n // Solve the system by back substitution\n x.resize(N);\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = Ab[i * (N + 1) + N];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= Ab[i * (N + 1) + j] * x[j];\n }\n x[i] /= Ab[i * (N + 1) + i];\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n std::vector a = A;\n std::vector bb = b;\n for (size_t i = 0; i < N; i++) {\n size_t pivot = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(a[j * N + i]) > std::abs(a[pivot * N + i])) {\n pivot = j;\n }\n }\n if (pivot != i) {\n std::swap(a[i * N + i], a[pivot * N + i]);\n std::swap(bb[i], bb[pivot]);\n }\n for (size_t j = i + 1; j < N; j++) {\n double factor = a[j * N + i] / a[i * N + i];\n for (size_t k = i; k < N; k++) {\n a[j * N + k] -= factor * a[i * N + k];\n }\n bb[j] -= factor * bb[i];\n }\n }\n for (size_t i = N - 1; i < N; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += a[i * N + j] * x[j];\n }\n x[i] = (bb[i] - sum) / a[i * N + i];\n }\n}\n", + "\n\n std::vector c(N);\n for (size_t i = 0; i < N; i++) {\n c[i] = b[i];\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double t = A[j][i] / A[i][i];\n for (size_t k = 0; k < N; k++) {\n A[j][k] -= t * A[i][k];\n }\n c[j] -= t * c[i];\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n double t = c[i];\n for (size_t j = i + 1; j < N; j++) {\n t -= A[i][j] * x[j];\n }\n x[i] = t / A[i][i];\n }\n}\n", + "\n\n // Create a copy of A\n std::vector A_copy(A);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A_copy[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A_copy[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j * N + k] -= factor * A_copy[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Perform back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A_copy[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A_copy[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find pivot\n size_t pivot = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot * N + i])) {\n pivot = j;\n }\n }\n // Swap rows\n if (pivot != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot * N + j]);\n }\n std::swap(b[i], b[pivot]);\n }\n // Eliminate\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] 
* x[j];\n }\n x[i] /= A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector> aug(N, std::vector(N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n aug[i][j] = A[i * N + j];\n }\n aug[i][N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double div = aug[i][i];\n for (size_t j = 0; j <= N; ++j) {\n aug[i][j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows\n for (size_t j = i + 1; j < N; ++j) {\n double mult = aug[j][i];\n for (size_t k = 0; k <= N; ++k) {\n aug[j][k] -= mult * aug[i][k];\n }\n }\n }\n\n // Extract the solution from the augmented matrix\n for (size_t i = 0; i < N; ++i) {\n x[i] = aug[i][N];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gauss-Jordan elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows to zero out the ith column\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; i++) {\n x[i] = Ab[i * (N + 1) + N];\n }\n}\n", + "\n\n std::vector A_copy = A;\n std::vector b_copy = b;\n for (size_t i = 0; i < N; i++) {\n double factor = A_copy[i][i];\n for (size_t j = 0; j < N; j++) {\n A_copy[i][j] /= factor;\n }\n b_copy[i] /= factor;\n for (size_t j = i + 1; j < N; j++) {\n double factor2 = A_copy[j][i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j][k] -= factor2 * A_copy[i][k];\n }\n b_copy[j] -= factor2 * b_copy[i];\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A_copy[i][j] * x[j];\n }\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = b[i] - sum;\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; i++) {\n x[i] = 
Ab[i * (N + 1) + N];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap pivot row with current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector augmentedMatrix(N * (N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n augmentedMatrix[i * (N + 1) + j] = A[i * N + j];\n }\n augmentedMatrix[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = augmentedMatrix[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; ++j) {\n augmentedMatrix[i * (N + 1) + j] /= factor;\n }\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = 0; j < N; ++j) {\n if (i == j) continue;\n double factor = augmentedMatrix[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; ++k) {\n augmentedMatrix[j * (N + 1) + k] -= factor * augmentedMatrix[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; ++i) {\n x[i] = augmentedMatrix[i * (N + 1) + N];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i 
* N + i];\n }\n}\n", + "\n\n // Forward elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from all other rows\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution from the augmented matrix\n for (size_t i = 0; i < N; i++) {\n x[i] = Ab[i * (N + 1) + N];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "hip", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\n__global__ void solveLinearSystem(const double *A, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] -= sum * A[row * N + col] / A[col * N + col];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x + hipGridDim_x * hipBlockIdx_y;\n if (tid < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[bid * N + i] * x[i];\n }\n x[tid] = (b[bid] - sum) / A[bid * N + tid];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n x[i] += A[i * N + j] * b[j];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n x[row] = b[row] / A[row * N + col];\n } else {\n x[row] -= A[row * N + col] * x[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n int col = hipBlockIdx_x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n 
sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n }\n __syncthreads();\n for (int k = row + 1; k < N; k++) {\n if (A[k * N + col] != 0) {\n double factor = A[k * N + col] / A[row * N + col];\n for (int j = 0; j < N; j++) {\n A[k * N + j] -= factor * A[row * N + j];\n }\n b[k] -= factor * b[row];\n }\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n }\n __syncthreads();\n for (int i = row + 1; i < N; i++) {\n double factor = A[i * N + col] / A[row * N + col];\n for (int k = 0; k < N; k++) {\n A[i * N + k] -= factor * A[row * N + k];\n }\n b[i] -= factor * b[row];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; ++k) {\n sum += A[row * N + k] * x[k];\n }\n A[row * N + col] -= A[col * N + col] * sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[col] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n x[i] += A[i * N + j] * b[j];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row 
= blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. Store the result in x on rank 0.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the linear system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solutions from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n // Create a matrix object\n Eigen::MatrixXd A_eigen(N, N);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n A_eigen(i, j) = A[i * N + j];\n }\n }\n\n // Create a vector object for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; i++) {\n b_eigen(i) = b[i];\n }\n\n // Solve the system\n Eigen::VectorXd x_eigen = A_eigen.colPivHouseholderQr().solve(b_eigen);\n\n // Copy the solution to x\n for (size_t i = 0; i < N; i++) {\n x[i] = x_eigen(i);\n }\n }\n\n // Broadcast x to all ranks\n MPI_Bcast(x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "_t*", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector local_x(N, 0);\n std::vector local_b(N, 0);\n for (size_t i = 0; i < N; i++) {\n local_b[i] = b[i];\n }\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_x[i] += A[i * N + j] * local_b[j];\n }\n }\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct solution\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves for a portion of x\n size_t chunk_size = (N + size - 
1) / size;\n size_t start_idx = rank * chunk_size;\n size_t end_idx = std::min(start_idx + chunk_size, N);\n\n std::vector local_x(chunk_size);\n for (size_t i = start_idx; i < end_idx; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n local_x[i - start_idx] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solutions from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Rank 0 gathers the solution\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n x.resize(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n }\n\n // Gather the solution from rank 0 to all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Solve the linear system on rank 0\n if (rank == 0) {\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i > 0; --i) {\n for (size_t j = i - 1; j >= 0; --j) {\n b[j] -= A[j * N + i] * b[i];\n }\n }\n\n // Store the solution in x\n x = b;\n }\n\n // Broadcast the solution to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system locally on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the solution\n std::vector local_x(N);\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on rank 0\n if (rank == 0) {\n // Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (int j = i + 1; j < N; j++) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n }\n\n // Broadcast the solution to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Each rank solves a part of the system\n for (size_t i = rank; i < N; i += size) {\n double factor = A[i * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Broadcast the updated row\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, rank, comm);\n MPI_Bcast(&b[i], 1, MPI_DOUBLE, rank, comm);\n\n // Update the remaining rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Gather the solution from all ranks\n MPI_Gather(&b[rank], 1, MPI_DOUBLE, &x[0], 1, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system locally\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions to rank 0\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system on rank 0\n if (rank == 0) {\n // Create a matrix object\n Eigen::MatrixXd A_eigen(N, N);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n 
A_eigen(i, j) = A[i * N + j];\n }\n }\n\n // Create a vector for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; i++) {\n b_eigen(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_eigen = A_eigen.colPivHouseholderQr().solve(b_eigen);\n\n // Copy the result to x\n for (size_t i = 0; i < N; i++) {\n x[i] = x_eigen(i);\n }\n }\n\n // Broadcast the result to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Check if the input is valid\n if (A.size() != N * N || b.size() != N || x.size() != N) {\n throw std::invalid_argument(\"Invalid input\");\n }\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (int j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "cuda", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\n__global__ void solveLinearSystem(const double *A, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + ":", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * 
N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n for (int k = 0; k < N; k++) {\n x[row] -= A[row * N + k] * b[k];\n }\n x[row] /= A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int i = 0; i < col; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n } else if (row > col) {\n double sum = 0;\n for (int i = 0; i < col; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n for (int i = col + 1; i < row; i++) {\n A[row * N + i] -= A[row * N + col] * A[i * N + col] / A[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n } else {\n x[row] -= A[row * N + col] * x[col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n x[row] = b[row] / A[row * N + col];\n }\n for (int i = col + 1; i < N; i++) {\n x[row] -= A[row * N + i] * x[i] / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Each rank solves for a portion of x\n #pragma omp parallel for\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data() + rank * N / size, N / size, MPI_DOUBLE, x.data(), N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Compute the inverse of A on rank 0.\n std::vector invA;\n if (my_rank == 0) {\n invA.resize(N * N);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n invA[i * N + j] = (i == j) ? 
1.0 : 0.0;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n double inv = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n invA[i * N + j] *= inv;\n A[i * N + j] *= inv;\n }\n b[i] *= inv;\n }\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n invA[j * N + k] -= factor * invA[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n for (size_t i = N - 1; i > 0; --i) {\n for (size_t j = i - 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n invA[j * N + k] -= factor * invA[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Broadcast the inverse of A to all ranks.\n MPI_Bcast(invA.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on all ranks.\n x.resize(N);\n for (size_t i = 0; i < N; ++i) {\n x[i] = 0.0;\n for (size_t j = 0; j < N; ++j) {\n x[i] += invA[i * N + j] * b[j];\n }\n }\n\n // Gather the results to rank 0.\n if (my_rank == 0) {\n for (size_t i = 1; i < num_ranks; ++i) {\n std::vector recv_x(N);\n MPI_Recv(recv_x.data(), N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N; ++j) {\n x[j] += recv_x[j];\n }\n }\n } else {\n MPI_Send(x.data(), N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Rank 0 gathers the solution from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the solution\n std::vector local_x(N);\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_x[i] += A[i * N + j] * b[j];\n }\n }\n\n // Gather the local solutions into the global solution\n MPI_Allgather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete solution\n if (rank == 0) {\n for (size_t i = 1; i < size; i++) {\n for (size_t j = 0; j < N; j++) {\n x[j] += local_x[i * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves the system for a subset of the rows\n size_t rowsPerRank = (N + size - 1) / size;\n size_t startRow = rank * 
rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n // Solve the system for the subset of rows\n std::vector localX(endRow - startRow);\n for (size_t i = startRow; i < endRow; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n localX[i - startRow] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(localX.data(), endRow - startRow, MPI_DOUBLE, x.data(), endRow - startRow, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Each rank computes its part of the solution\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n \n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system on each rank\n std::vector local_x(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row.\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row.\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n\n // Eliminate the current column.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n std::vector local_x(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Check if the input is valid\n if (A.size() != N * N || b.size() != N || x.size() != N) {\n throw std::invalid_argument(\"Invalid input\");\n }\n\n // Initialize x to 0\n std::fill(x.begin(), x.end(), 0.0);\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for 
(size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(x.data(), N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the local portion of the solution\n std::vector local_x(N, 0.0);\n for (size_t i = my_rank; i < N; i += num_procs) {\n for (size_t j = 0; j < N; ++j) {\n local_x[i] += A[i * N + j] * b[j];\n }\n }\n\n // Reduce the local solutions to get the global solution\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves for a subset of the rows of x.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Solve for the subset of rows of x on this rank.\n for (size_t i = start_row; i < end_row; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks.\n MPI_Gather(x.data() + start_row, end_row - start_row, MPI_DOUBLE,\n x.data(), end_row - start_row, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the matrix\n size_t local_N = (N + size - 1) / size;\n\n // Allocate memory for the local matrix and vector\n std::vector local_A(local_N * N);\n std::vector local_b(local_N);\n\n // Scatter the matrix and vector to the local arrays\n MPI_Scatter(A.data(), N * N, MPI_DOUBLE, local_A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(b.data(), N, MPI_DOUBLE, local_b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the local linear system\n std::vector 
local_x(local_N);\n for (size_t i = 0; i < local_N; i++) {\n double sum = 0.0;\n for (size_t j = 0; j < N; j++) {\n sum += local_A[i * N + j] * local_x[j];\n }\n local_x[i] = (local_b[i] - sum) / local_A[i * N + i];\n }\n\n // Gather the local solutions to the global vector\n MPI_Gather(local_x.data(), local_N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Normalize the global solution\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n x[i] /= size;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n local_x[i] = b[i];\n for (size_t j = 0; j < i; ++j) {\n local_x[i] -= A[i * N + j] * x[j];\n }\n }\n for (size_t i = N - 1; i >= 0; --i) {\n for (size_t j = i + 1; j < N; ++j) {\n local_x[i] -= A[i * N + j] * local_x[j];\n }\n local_x[i] /= A[i * N + i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the linear system in parallel\n #pragma omp parallel\n {\n // Each thread solves a part of the system\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row.\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row.\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n\n // Eliminate the current column.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(Kokkos::View &A, Kokkos::View &B, Kokkos::View &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; ++j) {\n double sum = 0.0;\n for (int k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k) * B(k, j);\n }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n 
});\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. 
The matrices are stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma opm parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00561453830451, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03556210249662, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03561602607369, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03594580870122, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03912249729037, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03672041837126, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03733915574849, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03710422478616, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00574740841985, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03602523989975, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01757946629077, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00870022792369, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00448008552194, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00246301535517, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00115001797676, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00080510303378, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00579092428088, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03532890602946, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01783296875656, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.00902357827872, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00441343188286, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00221814438701, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011256583035, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00077230129391, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0043426387012, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03546942006797, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01826125495136, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00986541230232, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00459253117442, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0024664260447, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00120529383421, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00075798854232, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00578133389354, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03543546851724, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01896361112595, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00877171643078, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00448528341949, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00235302038491, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011377595365, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00076416004449, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00579116102308, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03522263150662, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01786899641156, + "num_threads": 2 + }, + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.00937575716525, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00445649083704, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00231127161533, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00120615959167, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00077416468412, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00577335301787, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03558236490935, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01755942609161, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00888937562704, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00502435732633, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00219093915075, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0011276781559, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00077080037445, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00576935615391, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03535389136523, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01754159647971, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01007606945932, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00455731675029, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00234164893627, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00112436003983, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00080258473754, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0057891048491, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03541216272861, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.01772252675146, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00893631242216, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00469171367586, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00220122057945, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.001203863509, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00080943927169, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00579708199948, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.04322884138674, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01772110853344, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00871912334114, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00446371026337, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00219973158091, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00115671064705, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00076970849186, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00576880853623, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03788375202566, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01754069328308, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00953936055303, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00489740278572, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00233386084437, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00112553648651, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00074220299721, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00570927653462, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03490734603256, + "num_threads": 1 + }, + { 
+ "did_run": true, + "is_valid": true, + "runtime": 0.0181197527796, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00899225901812, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0044465906918, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00236339587718, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00115309227258, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00076770503074, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (int i = 0; i < M; ++i) {\n for (int j = 0; j < N; ++j) {\n C[i * N + j] = 0;\n for (int k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00560620874166, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.04687688108534, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0211905984208, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01034891586751, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00517143681645, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00276422835886, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00142830573022, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00084995999932, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n C.resize(M * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00578050259501, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.04271446894854, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02047369480133, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01043137349188, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00522648207843, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00283197797835, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00132316034287, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00082804635167, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00577922407538, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03552244156599, + "num_threads": 
1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01779150087386, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00892328750342, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00451911874115, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00244944598526, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00113950446248, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00075174532831, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n C.resize(M * N);\n omp_set_num_threads(omp_get_max_threads());\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00433515626937, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03535681869835, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0177223648876, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00887451991439, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00470400266349, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00218392182142, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00121340975165, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00077321343124, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00569305811077, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03789094593376, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01788830719888, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00958294123411, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00445362962782, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00218979921192, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00136546548456, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00078409388661, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 
0.00577348340303, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03550163470209, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01770209688693, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00895556602627, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00451168678701, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00221351571381, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00114343762398, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00078466665, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00577883534133, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03726682905108, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01918379962444, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00888276919723, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00449176356196, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00219067297876, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00120091792196, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00073363631964, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00577043369412, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03591333758086, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01935729123652, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00896457061172, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00478180125356, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00219830479473, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00123614221811, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0007659457624, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "serial", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. 
The matrices are stored in row-major.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n C[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A[i * K + k] * B[k * N + j]; }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 
0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "hip", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an MxN grid of threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\n__global__ void gemm(const double *A, const double *B, double *C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) { sum += A[row * K + k] * B[k * N + col]; }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n size_t row = blockIdx.x;\n size_t col = blockIdx.y;\n C[row * N + col] = 0;\n for (size_t k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = 
blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n C[row * N + col] = 0;\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0.0;\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n size_t col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and B. 
Store the result in C on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the local portion of C.\n std::vector local_C(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to the global C on rank 0.\n MPI_Reduce(local_C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of processes and the rank of the current process\n int num_procs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the number of rows each process will compute\n int rows_per_proc = M / num_procs;\n\n // Compute the starting row for each process\n int start_row = rank * rows_per_proc;\n\n // Compute the ending row for each process\n int end_row = start_row + rows_per_proc;\n\n // Initialize the result matrix on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n\n // Compute the partial result on each process\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n double dot_product = 0;\n for (int k = 0; k < K; ++k) {\n dot_product += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = dot_product;\n }\n }\n\n // Gather the partial results on rank 0\n if (rank != 0) {\n MPI_Gather(&C[start_row * N], rows_per_proc * N, MPI_DOUBLE, &C[0], rows_per_proc * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast M, K, and N to all ranks.\n MPI_Bcast(&M, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n MPI_Bcast(&K, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n MPI_Bcast(&N, 1, MPI_UNSIGNED_LONG, 0, 
MPI_COMM_WORLD);\n\n // Compute the local portion of C.\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Reduce the local portions of C to rank 0.\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast A and B to all ranks\n MPI_Bcast(&A[0], M * K, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&B[0], K * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute C on each rank\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Gather C from all ranks to rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local part of C\n std::vector local_C(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n MPI_Gather(local_C.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_C.data(), M * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and the number of processes\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the local size of the matrix C\n size_t local_M = M / num_procs;\n if (rank == num_procs - 1) {\n local_M += M % num_procs;\n }\n\n // Compute the local part of the matrix C\n std::vector local_C(local_M * N);\n for (size_t i = 0; i < local_M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N + j] = sum;\n }\n }\n\n // Gather the local parts of the matrix C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 0; i < num_procs; ++i) {\n size_t local_M = M / num_procs;\n if (i == num_procs - 1) {\n local_M += M % num_procs;\n }\n MPI_Recv(&C[i * local_M * N], local_M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_C[0], local_M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the local portion of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to the root rank.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N, 0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * 
B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(M * N);\n MPI_Recv(recv_buf.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n C[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(C.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C\n std::vector localC(M * N, 0);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Each rank has a complete copy of A and B.\n // Store the result in C on rank 0.\n if (rank == 0) {\n C.clear();\n C.reserve(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C.push_back(sum);\n }\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result matrix\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local results into the global result matrix\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the matrix C\n size_t local_M = M / size;\n size_t local_N = N;\n\n // Compute the local part of the matrix C\n std::vector local_C(local_M * local_N);\n for (size_t i = 0; i < local_M; ++i) {\n for (size_t j = 0; j < local_N; ++j) 
{\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * local_N + j] = sum;\n }\n }\n\n // Gather the local parts of the matrix C into the global matrix C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(local_M * local_N);\n MPI_Recv(recv_buf.data(), local_M * local_N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < local_M * local_N; ++j) {\n C[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(local_C.data(), local_M * local_N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and the number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N / size);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N / size + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "cuda", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an MxN grid of threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\n__global__ void gemm(const double *A, const double *B, double *C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t row = blockIdx.x;\n size_t col = blockIdx.y;\n size_t i = threadIdx.x;\n size_t j = threadIdx.y;\n C[row * N + col] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * 
N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n double sum = 0;\n for (k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "(", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.y * blockDim.y + threadIdx.y;\n int j = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M && j < N) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and B. 
Store the result in C on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C to rank 0\n MPI_Gather(localC.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to zero\n C.assign(M * N, 0.0);\n\n // Compute the product of A and B in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(C.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the product of A and B\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n // Initialize C to zero\n C.assign(M * N, 0.0);\n\n // Compute C using MPI and OpenMP\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce C to rank 0\n MPI_Reduce(C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the local part of C\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the partial sum for each element of C\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the partial sums from all ranks to rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.resize(M * N);\n std::fill(C.begin(), C.end(), 0);\n\n // Compute the local part of C\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the 
local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; i++) {\n std::vector recvC(M * N / size);\n MPI_Recv(recvC.data(), M * N / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N / size; j++) {\n C[j] += recvC[j];\n }\n }\n } else {\n MPI_Send(localC.data(), M * N / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; ++i) {\n std::vector recvC(M * N / size);\n MPI_Recv(recvC.data(), M * N / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n std::copy(recvC.begin(), recvC.end(), C.begin() + i * M * N / size);\n }\n } else {\n MPI_Send(localC.data(), M * N / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n // Initialize C to 0\n C.assign(M * N, 0.0);\n\n // Perform the matrix multiplication in parallel\n #pragma opm parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result matrix\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local results into the global result matrix\n MPI_Gather(&local_C[0], M * N / size, MPI_DOUBLE, &C[0], M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot_product = 0.0;\n for (size_t k = 0; k < K; ++k) {\n dot_product += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = dot_product;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local portion of C\n std::vector local_C(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to rank 0\n MPI_Reduce(local_C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n 
double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix, x has N elements, and y has M elements.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(Kokkos::View &A, Kokkos::View &x, Kokkos::View &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) { y[i] += A(i, j) * x[j]; }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M),\n KOKKOS_LAMBDA(const size_t i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "mv", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n double dot = 0.0;\n for (size_t j = 0; j < N; ++j) { dot += A(i, j) * x(j); }\n y(i) = dot;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", 
+ "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA (const int i) {\n y(i) = 0;\n for (int j = 0; j < N; ++j) {\n y(i) += A(i, j) * x(j);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03625917490572, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03572308383882, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01902837455273, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0133728094399, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0113072944805, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01083463653922, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00858558770269, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00992554165423, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03328222185373, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03561247978359, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01915444191545, + "num_threads": 2 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.01301617771387, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01125558055937, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00948182623833, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00863279458135, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00977097284049, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03543309029192, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03632892984897, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01956986170262, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01205580662936, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01108070556074, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00955899357796, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00858986787498, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01000432167202, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03399154096842, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03527732342482, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01837307959795, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0120185604319, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01098117250949, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.009424861148, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00852477159351, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00968067459762, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03516418058425, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03620761781931, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01975464969873, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01221435274929, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01089104842395, + "num_threads": 8 + 
}, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00995612908155, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00848888605833, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00981641449034, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03516584951431, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03632663842291, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01961258668453, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01225579157472, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01106099821627, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00959219653159, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00858684182167, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00990495681763, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03456901796162, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03552755024284, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01898897886276, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01241605244577, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01103432830423, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00992309935391, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00861651599407, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00973301138729, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03505796045065, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03541625831276, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01899238117039, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01240571904927, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01129737142473, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00955674480647, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.00854968465865, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00964923538268, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03515644781291, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0364795980975, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01933708973229, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01159889083356, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0113802049309, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00959331803024, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00871801301837, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00973707716912, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03437463641167, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03545835372061, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01915090978146, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01221668738872, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0113468343392, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00947123821825, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00808274988085, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00939559843391, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma openmp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03637422751635, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03631518669426, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03632013592869, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03645015023649, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03650086373091, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03636071588844, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03634487837553, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03633219879121, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // 
Initialize y to 0\n y.assign(M, 0);\n\n // Compute the matrix-vector product in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03490120898932, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03636908177286, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01978976987302, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01227873619646, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0114376494661, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00941133406013, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0087695678696, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0099800683558, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double dot_product = 0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A[i * N + j] * x[j];\n }\n y[i] = dot_product;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03376534003764, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03523382954299, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01850876696408, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01179398838431, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01089831646532, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00934152696282, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00846718549728, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01537220999599, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03455377910286, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03502897191793, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01942041311413, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01285005956888, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01104146018624, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0096497008577, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00845282431692, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00977639500052, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n 
double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03449251241982, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03527555447072, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01878261044621, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01149850971997, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01189397200942, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00941124595702, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00851639769971, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00982101727277, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0.0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03502050191164, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03533029481769, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01832641400397, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01218125112355, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01142609510571, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00945666115731, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0085092201829, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00990949738771, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03481428585947, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03554946128279, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01942788884044, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01216744091362, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01120010614395, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00930053945631, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.008499504067, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00977666303515, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "source_write_success": true, + 
"did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03447134792805, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03557502292097, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01844390854239, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01177097707987, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01104319747537, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00986288134009, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00828529056162, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01007876973599, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03495669942349, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03538976293057, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01947490293533, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01219667177647, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01103898119181, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00945918634534, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00848336908966, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00960991363972, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03491707276553, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03626404199749, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01904430892318, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01158322356641, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01066528595984, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0099225204438, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0086267195642, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00980325955898, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "serial", + "prompt": "/* Multiply the matrix A by the vector x. 
Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) { dot += A[i * N + j] * x[j]; }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) {\n dot += A[i * N + j] * x[j];\n }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) {\n dot += A[i * N + j] * x[j];\n }\n y[i] = dot;\n }\n}\n", + "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "hip", + "prompt": "/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least M threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\n__global__ void gemv(const double *A, const double *x, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int block_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = bid * block_size + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int i = tid / N;\n int j = tid % N;\n if (i < M && j < N) {\n y[i] += A[i * N + j] * x[j];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = bid * blockDim.x + tid;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; ++col) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) {\n y[tid] += A[tid * N + j] * x[j];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; ++col) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0.0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n int i = tid + bid * blockSize;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n size_t i = blockIdx.x * 
blockDim.x + threadIdx.x;\n if (i < M) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) { sum += A[i * N + j] * x[j]; }\n y[i] = sum;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n\n double sum = 0;\n for (int i = tid; i < M; i += blockSize) {\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n }\n y[tid] = sum;\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n\n y[tid] = 0;\n for (int j = 0; j < N; j++) {\n y[tid] += A[tid + bid * blockSize + j * M] * x[j];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and x. Store the result in y on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector multiplication\n std::vector local_y(M / size);\n for (size_t i = rank * (M / size); i < (rank + 1) * (M / size); ++i) {\n local_y[i - rank * (M / size)] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i - rank * (M / size)] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the local results into the global vector y on rank 0\n MPI_Gather(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial dot product on each rank\n std::vector partial_y(M, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n partial_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the partial dot products to get the final result\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank != 0) {\n y.clear();\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product.\n std::vector local_y(M);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the local results into the global vector y on rank 0.\n MPI_Gather(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, 
size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M / size);\n for (size_t i = 0; i < M / size; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Each rank computes a partial sum of the dot product\n std::vector partial_sums(M, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n partial_sums[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sums(M, 0.0);\n MPI_Reduce(partial_sums.data(), global_sums.data(), M, MPI_DOUBLE, MPI_SUM, 0, comm);\n\n // Store the result in y on rank 0\n if (rank == 0) {\n y = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n y = local_y;\n for 
(int i = 1; i < size; ++i) {\n std::vector recv_y(M);\n MPI_Recv(recv_y.data(), M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M; ++j) {\n y[j] += recv_y[j];\n }\n }\n } else {\n MPI_Send(local_y.data(), M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) { local_y[i] += A[i * N + j] * x[j]; }\n }\n\n MPI_Reduce(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the dot product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local dot products into the global vector y on rank 0\n MPI_Reduce(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "cuda", + "prompt": "/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least M threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\n__global__ void gemv(const double *A, const double *x, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n int tid = block.thread.x;\n int row = block.block.x;\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) {\n y[row] += A[row * N + col] * x[col];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int i = 0; i < N; i++) {\n y[tid] += A[tid * N + i] * x[i];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n for (int i = bid * bid_size + tid; i < M; i += bid_size) {\n double sum = 0;\n for (int j = 0; j < N; j++) { sum += A[i * N + j] * x[j]; }\n y[i] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n y[row] = 0;\n for (int k = 0; k < N; k++) {\n y[row] += A[row * N + k] * x[k];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n y[row] = 0;\n for (int i = 0; i < N; i++) {\n y[row] += A[row * N + i] * x[i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n\n double sum = 0;\n\n for (int i = bid * bid_size + tid; i < M; i += grid_size * bid_size) {\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n }\n\n y[tid] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * 
x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = bid * blockDim.x + tid;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and x. Store the result in y on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n\n // Gather the results on rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M, 0.0);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion 
of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root rank\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute local dot products\n std::vector local_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce local dot products to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n 
#pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n\n // Gather the results on rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize y to 0 on all ranks\n y.assign(M, 0.0);\n\n // Compute the local portion of y on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A(k, j) -= A(k, i) * A(i, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) -= A(j, i) * A(i, i);\n }\n });\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n // Iterate over the columns of the matrix\n for (size_t j = 0; j < N; ++j) {\n // Divide the elements of the current column by the diagonal element\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n A(i, j) /= A(j, j);\n });\n\n // Subtract the current column from the remaining columns\n for (size_t k = j + 1; k < N; ++k) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n A(i, k) -= A(i, j) * A(j, k);\n });\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A(i, j) /= A(i, i);\n } else if (i > j) {\n A(i, j) -= A(i, j) * A(j, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = 0; k < i; k++) {\n A(j, k) -= multiplier * A(i, k);\n }\n A(j, i) = multiplier;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = 
i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; ++k) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j][i] /= A[i][i];\n }\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::RangePolicy(i + 1, N),\n KOK_LAMBDA(const Kok\u30b3\u30af::IndexType &j) {\n for (size_t k = 0; k < i; k++) {\n A[j][i] -= A[j][k] * A[i][k];\n }\n });\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) = A(j, i) / A(i, i);\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A(k, j) = A(k, j) - A(k, i) * A(i, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n double m = A(j, i) / A(i, i);\n for (size_t k = i; k < N; k++) {\n A(j, k) -= m * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; ++k) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += A(i, k) * A(k, j);\n }\n A(i, j) -= sum;\n } else if (i > j) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A(i, k) * A(k, j);\n }\n A(i, j) = (A(i, j) - sum) / A(j, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. 
\n A is an NxN matrix stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10647086817771, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.14377219807357, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08171447310597, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0508720099926, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0373118462041, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03001806531101, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03083390053362, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04416332263499, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10595407728106, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.14446585178375, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08211160451174, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05109623204917, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03743104021996, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02981183547527, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03030999302864, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04418991804123, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n for (size_t k = 0; k < N; k++) {\n // Normalize the kth row\n double norm = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k; j < N; j++) {\n A[k * N + j] /= norm;\n }\n\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k];\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + 
"generated_output": "\n\n for (size_t k = 0; k < N; ++k) {\n // Normalize the k-th row\n double norm = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k; j < N; ++j) {\n A[k * N + j] /= norm;\n }\n\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n double factor = A[i * N + k];\n for (size_t j = k; j < N; ++j) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10627451203763, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.14495449103415, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08223926238716, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05112710818648, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03767890110612, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03017839882523, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0304221779108, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04412713237107, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10632687285542, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.14470308795571, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0817071614787, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05102055706084, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03735770620406, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02992495838553, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03035165928304, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04435026012361, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n double factor = A[i * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= A[i * N 
+ k] * factor;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n // Divide the current row by the diagonal element\n double factor = A[i * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n\n // Subtract multiples of the current row from all the rows below it\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10640165731311, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.14406972620636, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08190472684801, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05085238143802, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03750198744237, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0298488272354, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03071631155908, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04613370038569, + "num_threads": 64 + } + ] + }, + { + "generated_output": ".", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10579492580146, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.14427413661033, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08246899284422, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05102958846837, + "num_threads": 4 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.03760411050171, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02972364928573, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03032122626901, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0510215671733, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10648455992341, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.14456686917692, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08194717280567, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05094448253512, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0378043750301, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03006150685251, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03027233760804, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04272284321487, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1056836521253, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.14482433795929, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08087124750018, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05101422164589, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03770500365645, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0299270119518, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03110289480537, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0449711907655, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10609235540032, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1445896403864, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.08213164582849, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05052162837237, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03763978537172, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02985298652202, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03037669397891, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05251652635634, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n#pragma omp parallel for\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = i; j < N; j++) {\n#pragma omp parallel for\n for (size_t k = 0; k < i; k++) {\n A[j * N + i] -= A[j * N + k] * A[k * N + i];\n }\n A[j * N + i] /= A[i * N + i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n for (size_t k = 0; k < N; ++k) {\n // Compute the multiplier for row k\n double multiplier = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k + 1; j < N; ++j) {\n A[j * N + k] /= multiplier;\n }\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10615095384419, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.12586672417819, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06345736347139, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0336576430127, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02045360133052, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0152689633891, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01968936678022, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04527526162565, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k 
* N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10582993980497, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.14425305873156, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0811913644895, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05085579734296, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03760044127703, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02995080072433, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03033793307841, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04372779708356, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10638222489506, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.14461652692407, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08217069189996, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05110741090029, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03695183899254, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02974665686488, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03085270039737, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04441203270108, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "serial", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. 
\n A is an NxN matrix stored in row-major.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + i] * A[j * N + j];\n } else {\n A[i * N + j] /= A[j * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; ++k) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] = A[j * N + i] / A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] = A[j * N + k] - A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + j];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + j] * A[j * N + j];\n } else {\n A[i * N + j] /= A[j * N + j];\n A[i * N + j] *= -1;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) 
{\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i * N + k] * A[k * N + j];\n }\n A[i * N + j] = (A[i * N + j] - sum) / A[j * N + j];\n }\n for (size_t j = i; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += A[i * N + k] * A[k * N + j];\n }\n A[i * N + j] = A[i * N + j] - sum;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A[j][i] / A[i][i];\n for (size_t k = i; k < N; k++) {\n A[j][k] -= multiplier * A[i][k];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1;\n } else if (i > j) {\n A[i * N + j] = 0;\n }\n }\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "hip", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\n__global__ void luFactorize(double *A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x + blockIdx.y * gridDim.x;\n int col = threadIdx.x + threadIdx.y * blockDim.x;\n if (row < N && col < N) {\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double temp = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= temp * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = A[row * N + col] - sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x + blockIdx.y * gridDim.x;\n int col = threadIdx.x + threadIdx.y * blockDim.x;\n if (row < col) {\n A[row * N + col] = 0.0;\n } else if (row == col) {\n A[row * N + col] = 1.0;\n } else {\n A[row * N + col] = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= A[row * N + col] * A[col * N + k];\n }\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * gridDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.x;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double temp = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= temp * A[j * N + k];\n }\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double temp = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= temp * 
A[j * N + k];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[col * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[col * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int tid = blockIdx.x + blockIdx.y * gridDim.x;\n int i = tid / N;\n int j = tid % N;\n if (i < j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; ++k) { A[k * N + j] -= A[k * N + i] * A[i * N + j]; }\n } else if (i == j) {\n A[i * N + j] = 1;\n for (int k = i + 1; k < N; ++k) { A[k * N + j] /= A[i * N + i]; }\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n\n if (row == col) {\n double pivot = A[row * N + col];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] /= pivot;\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col + 1; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int k = row + 1; k < N; k++) {\n A[k * N + col] -= A[k * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; 
k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double factor = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= factor * A[j * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. Store the result in A on rank 0.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Forward elimination\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row to all other ranks\n MPI_Bcast(&A[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row below the k-th row\n for (size_t i = k + 1; i < N; i++) {\n // Compute the multiplier\n double multiplier = A[i * N + k] / A[k * N + k];\n\n // Subtract the k-th row from the i-th row\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= multiplier * A[k * N + j];\n }\n }\n }\n\n // Gather the result to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // For each column j in row i\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in the lower triangular matrix L\n if (i > j) {\n // Subtract the product of the ith row of L and the jth column of U from the ith row of A\n for (size_t k = 0; k < j; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n // If i <= j, then A[i][j] is in the upper triangular matrix U\n else {\n // Divide the ith row of A by the jth column of L\n for (size_t k = 0; k < j; k++) {\n A[i * N + j] /= A[j * N + k];\n }\n\n // Subtract the product of the ith row of L and the jth column of U from the ith row of A\n for (size_t k = j + 1; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations to eliminate elements below the diagonal\n for (size_t j = i + 1; j < N; j++) {\n double factor 
= A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // For each column j\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in the lower triangular matrix L\n if (i > j) {\n // Compute A[i][j] as A[i][j] - A[i][0] * A[0][j]\n A[i * N + j] -= A[i * N] * A[j];\n }\n // Otherwise, A[i][j] is in the upper triangular matrix U\n else {\n // Compute A[i][j] as A[i][j] - A[i][0] * A[0][j] / A[0][0]\n A[i * N + j] -= A[i * N] * A[j] / A[0];\n }\n }\n\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N + i], 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // For each column j\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in L\n if (i > j) {\n // Compute A[i][j]\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i][k] * A[k][j];\n }\n A[i][j] = (A[i][j] - sum) / A[j][j];\n }\n // If i <= j, then A[i][j] is in U\n else {\n // Compute A[i][j]\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i][k] * A[k][j];\n }\n A[i][j] = A[i][j] - sum;\n }\n }\n\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * 
N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization on rank 0\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n\n // Gather the result from rank 0 to all other ranks\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Normalize the k-th row of A\n double pivot = A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[k * N + j] /= pivot;\n }\n\n // Update the remaining rows of A\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all ranks\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix A from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Forward elimination\n for (size_t i = 0; i < N; i++) {\n // Broadcast the pivot row to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all other ranks\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 
0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // Rank 0 broadcasts the ith row of L to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each column j\n for (size_t j = i; j < N; j++) {\n // Compute A[i][j] -= A[i][k] * A[k][j] for k < i\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n\n // Divide A[i][j] by A[i][i]\n A[i * N + j] /= A[i * N + i];\n }\n }\n\n // Gather the result from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + j];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + j] * A[j * N + j];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i in A\n for (size_t i = 1; i < N; i++) {\n // If rank 0, compute the multiplier for row i\n double multiplier = 0;\n if (rank == 0) {\n multiplier = A[i * N] / A[0];\n }\n\n // Broadcast the multiplier to all ranks\n MPI_Bcast(&multiplier, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each element in row i, subtract the corresponding element in row 0 multiplied by the multiplier\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] -= multiplier * A[j];\n }\n }\n\n // If rank 0, divide each element in row 0 by A[0][0]\n if (rank == 0) {\n for (size_t j = 0; j < N; j++) {\n A[j] /= A[0];\n }\n }\n\n // For each row i in A\n for (size_t i = 1; i < N; i++) {\n // For each element in row i, divide by A[i][i]\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n\n // For each row k after row i, subtract the corresponding element in row i multiplied by A[k][i]\n for (size_t k = i + 1; k < N; k++) {\n double multiplier = A[k * N + i];\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= multiplier * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(A.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Each rank computes its own row of L and U\n for (size_t i = 1; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A[i * N + j] /= A[j * N + j];\n }\n for (size_t j = i; j < N; j++) {\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks into rank 0\n MPI_Gather(A.data(), N * N, MPI_DOUBLE, A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "cuda", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. 
\n A is an NxN matrix stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\n__global__ void luFactorize(double *A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double temp = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= temp * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row < col) {\n A[row * N + col] = A[row * N + col] / A[row * N + row];\n } else if (row == col) {\n for (int k = 0; k < row; k++) {\n A[row * N + col] -= A[row * N + k] * A[k * N + col];\n }\n } else {\n for (int k = 0; k < col; k++) {\n A[row * N + col] -= A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] /= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i == j) {\n A[i * N + i] /= A[i * N + j];\n for (int k = i + 
1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n for (int k = j; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "(", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) return;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else if (row > col) {\n double factor = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= factor * A[col * N + k];\n }\n A[row * N + col] = factor;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else {\n double factor = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= factor * A[col * N + k];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == 
col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col + 1; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
Store the result in A on rank 0.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "(5.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n // Divide the ith row by A[i][i]\n double div = A[i * N + i];\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= div;\n }\n\n // Subtract the ith row from all rows below it\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n }\n\n // Broadcast the result to all ranks\n MPI_Bcast(A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization using Gaussian elimination\n for (size_t k = 0; k < N; k++) {\n // Pivot if necessary\n size_t max_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > std::abs(A[max_row * N + k])) {\n max_row = i;\n }\n }\n if (max_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[max_row * N + j]);\n }\n }\n\n // Eliminate the k-th column below the k-th row\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&A[i * N * N], N * N, MPI::DOUBLE, i, 0);\n }\n } else {\n MPI::COMM_WORLD.Send(&A[0], N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // MPI rank\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // MPI size\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Number of rows per rank\n int rows_per_rank = N / size;\n\n // Local matrix\n std::vector local_A(rows_per_rank * N);\n\n // Scatter the matrix to all ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Local lu factorization\n for (size_t i = 0; i < rows_per_rank; i++) {\n for (size_t j = i + 1; j < N; j++) {\n local_A[i * N + j] /= local_A[i * N + i];\n }\n for (size_t k = i + 1; k < rows_per_rank; k++) {\n for (size_t j = i + 1; j < N; j++) {\n local_A[k * N + j] -= local_A[k * N + i] * local_A[i * N + j];\n }\n }\n }\n\n // Gather the results back to rank 0\n MPI_Gather(&local_A[0], rows_per_rank * N, MPI_DOUBLE, &A[0], rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n 
if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= multiplier * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the rank and number of threads\n int rank = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n // Compute the block size for each thread\n size_t block_size = N / num_threads;\n\n // Each thread factorizes its own block of rows\n for (size_t i = rank * block_size; i < (rank + 1) * block_size; i++) {\n // Pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap pivot row with current row\n if (pivot_row != i) {\n for (size_t k = 0; k < N; k++) {\n std::swap(A[i * N + k], A[pivot_row * N + k]);\n }\n }\n\n // Eliminate lower triangular elements\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n }\n\n // Gather the factorized matrix from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix A from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Divide the current row by the pivot element\n double pivot = A[i * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= pivot;\n }\n\n // Subtract multiples of the current row from the remaining rows\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (MPI_Comm_rank(MPI_COMM_WORLD) == 0) {\n for (int i = 1; i < 
MPI_Comm_size(MPI_COMM_WORLD); i++) {\n MPI_Recv(&A[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the rank and number of threads\n int rank = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n // Compute the block size for each thread\n size_t block_size = N / num_threads;\n\n // Each thread factorizes its own block of the matrix\n for (size_t i = rank * block_size; i < (rank + 1) * block_size; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n }\n\n // Gather the results from all ranks and store them in A on rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform Gaussian elimination\n for (size_t k = 0; k < N; k++) {\n // Pivot row\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > std::abs(A[pivot_row * N + k])) {\n pivot_row = i;\n }\n }\n\n // Swap pivot row with row k\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n }\n }\n\n // Eliminate non-zero elements below pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Get the number of processes\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes a part of the matrix\n size_t chunk_size = N / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = start_row + chunk_size;\n\n // Perform LU factorization on the assigned part of the matrix\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < i; j++) {\n double multiplier = A[i][j] / A[j][j];\n for (size_t k = j; k < N; k++) {\n A[i][k] -= multiplier * A[j][k];\n }\n }\n for (size_t j = i; j < N; j++) {\n A[i][j] /= A[i][i];\n }\n }\n\n // Gather the results from all processes\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * chunk_size * N], chunk_size * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[start_row * N], chunk_size * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows per process\n int 
rowsPerProcess = N / size;\n\n // Compute the starting row index for this process\n int startRow = rank * rowsPerProcess;\n\n // Compute the ending row index for this process\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Factorize the submatrix of A owned by this process\n for (int i = startRow; i < endRow; ++i) {\n // Compute the pivot element\n double pivot = A[i][i];\n\n // Broadcast the pivot element to all other processes\n MPI_Bcast(&pivot, 1, MPI_DOUBLE, rank, MPI_COMM_WORLD);\n\n // Eliminate the elements below the pivot element\n for (int j = i + 1; j < N; ++j) {\n A[j][i] -= pivot * A[j][i] / A[i][i];\n }\n }\n\n // Reduce the matrix A to upper triangular form\n for (int k = 0; k < N - 1; ++k) {\n // Compute the pivot element\n double pivot = A[k][k];\n\n // Broadcast the pivot element to all other processes\n MPI_Bcast(&pivot, 1, MPI_DOUBLE, rank, MPI_COMM_WORLD);\n\n // Eliminate the elements above the pivot element\n for (int i = 0; i < k; ++i) {\n A[i][k] -= pivot * A[i][k] / A[k][k];\n }\n }\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Broadcast the pivot row to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all ranks\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 performs the row reduction on the ith row\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int myRank, numProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numProcs);\n\n // Each rank computes its own part of the matrix\n #pragma omp parallel for\n for (size_t i = myRank; i < N; i += numProcs) {\n for (size_t j = 0; j < i; j++) {\n A[i * N + j] /= A[j * N + j];\n for (size_t k = j + 1; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n }\n\n // Gather the 
results to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize L and U\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n double max = std::abs(A[k * N + k]);\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > max) {\n max = std::abs(A[i * N + k]);\n pivot_row = i;\n }\n }\n\n // Swap the pivot row with the k-th row\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n }\n }\n\n // Divide the pivot row by the pivot element\n double pivot = A[k * N + k];\n for (size_t j = 0; j < N; j++) {\n A[k * N + j] /= pivot;\n }\n\n // Subtract multiples of the pivot row from the other rows\n for (size_t i = 0; i < N; i++) {\n if (i != k) {\n double factor = A[i * N + k];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n }\n\n // Gather the result to rank 0\n if (rank != 0) {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(N * N);\n MPI_Recv(&recv_buf[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n A[j] += recv_buf[j];\n }\n }\n }\n}\n", + "\n\n // Copy the matrix A to rank 0\n MPI_Allgather(MPI_IN_PLACE, 0, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_WORLD);\n\n // Perform LU factorization on rank 0\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] -= A[i * N + i] * A[j * N + i];\n }\n\n for (size_t k = i + 1; k < N; k++) {\n for (size_t l = i + 1; l < N; l++) {\n A[k * N + l] -= A[k * N + i] * A[l * N + i];\n }\n }\n }\n\n // Scatter the factorized matrix A from rank 0 to all other processes\n MPI_Scatter(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_WORLD);\n}\n", + "\n for (size_t k = 0; k < N; ++k) {\n // MPI_Bcast the k-th row of A to all ranks.\n MPI_Bcast(&A[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Update the k-th row of A on all ranks.\n for (size_t j = k + 1; j < N; ++j) {\n A[k * N + j] /= A[k * N + k];\n }\n\n // Update the remaining rows of A on all ranks.\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n\n // MPI_Reduce the updated A from all ranks to rank 0.\n MPI_Reduce(&A[0], &A[0], N * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* For each value in the array x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(Kokkos::View const& x, Kokkos::View &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n auto rank = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n auto sorted_x = x;\n Kokkos::sort(sorted_x.data(), sorted_x.extent(0));\n ranks(i) = Kokkos::find(sorted_x, x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::find(sorted_x, x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n using policy_t = Kokkos::RangePolicy;\n Kokkos::parallel_for(policy_t(x.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n auto rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using rank_functor = Kokkos::Functor>;\n\n Kokkos::parallel_for(\n \"ranks\", Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = Kokkos::rank(x, x(i), Kokkos::greater());\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n 
Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) > x(j)) { ranks(i) += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) { ranks(i) += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) {\n ranks(i) += 1;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x);\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n std::vector sorted_x(x.data(), x.data() + x.extent(0));\n std::sort(sorted_x.begin(), sorted_x.end());\n ranks(i) = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x(i)));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = x;\n std::sort(sorted_x.data(), sorted_x.data() + sorted_x.extent(0));\n ranks(i) = std::distance(sorted_x.data(),\n std::lower_bound(sorted_x.data(), sorted_x.data() + sorted_x.extent(0), x(i)));\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "omp", + "prompt": "#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n ranks.resize(x.size());\n std::vector> pairs;\n pairs.reserve(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs.push_back(std::make_pair(x[i], i));\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01229757945985, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01170807797462, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01163366027176, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01164313256741, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01207020170987, + "num_threads": 8 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.01233096458018, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01245493218303, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01312476061285, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j] < x[i]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n size_t n = x.size();\n ranks.resize(n);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n for (size_t j = 0; j < n; ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n ranks.resize(x.size());\n std::vector> y(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n y[i] = std::make_pair(x[i], i);\n }\n std::sort(y.begin(), y.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[y[i].second] = i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0123807484284, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01174377575517, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01175911445171, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01155739668757, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0120090348646, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01205601673573, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01211657878011, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01263041254133, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Create a copy of the input vector\n std::vector sorted_x = x;\n\n // Sort the copy\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, 
+ "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01260943673551, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02841194812208, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01948772966862, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.014922956191, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01276620756835, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01178304348141, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0112631611526, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01127082724124, + "num_threads": 64 + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t n = x.size();\n std::vector> pairs(n);\n for (size_t i = 0; i < n; ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n ranks.resize(n);\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01225805711001, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01192185487598, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01168893221766, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01170262452215, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01203812975436, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0120304170996, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01212052069604, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01259612347931, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n ranks.resize(x.size());\n std::vector> pairs(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01241418514401, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0117118595168, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01172895748168, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01164561081678, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0120034262538, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01202492676675, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01211051139981, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, 
+ "runtime": 0.01261853612959, + "num_threads": 64 + } + ] + }, + { + "generated_output": "ranks", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n size_t n = x.size();\n ranks.resize(n);\n std::vector idx(n);\n std::iota(idx.begin(), idx.end(), 0);\n std::sort(idx.begin(), idx.end(), [&x](size_t i1, size_t i2) { return x[i1] < x[i2]; });\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n ranks[i] = idx[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] == x[j]) {\n ranks[i] = j;\n break;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n ranks.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n ranks.resize(x.size());\n std::vector indices(x.size());\n std::iota(indices.begin(), indices.end(), 0);\n std::sort(indices.begin(), indices.end(), [&x](size_t i1, size_t i2) { return x[i1] < 
x[i2]; });\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = indices[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n ranks.resize(x.size());\n std::vector> pairs(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01235891077667, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0117727637291, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01180756371468, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01168421376497, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01205794904381, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0120900452137, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01216348931193, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01266484931111, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "serial", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n // Sort the vector x\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Create a map from the sorted values to their original indices\n std::map sorted_index_map;\n for (size_t i = 0; i < sorted_x.size(); ++i) {\n sorted_index_map[sorted_x[i]] = i;\n }\n\n // Iterate over the original vector and store the rank of each value\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = sorted_index_map[x[i]];\n }\n}\n", + "\n\n std::map value_to_rank;\n for (size_t i = 0; i < x.size(); ++i) {\n auto it = 
value_to_rank.find(x[i]);\n if (it == value_to_rank.end()) {\n value_to_rank[x[i]] = i;\n }\n }\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = value_to_rank[x[i]] + 1;\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks.push_back(std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i])));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(), std::find(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(), std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector x_sorted(x);\n std::sort(x_sorted.begin(), x_sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]) - x_sorted.begin();\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n std::iota(ranks.begin(), ranks.end(), 0);\n std::sort(ranks.begin(), ranks.end(), [&](size_t i, size_t j) { return x[i] < x[j]; });\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n 
ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "hip", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use AMD HIP to compute in parallel. The kernel will be launched with at least as many threads as elements in x.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\n__global__ void ranks(const float *x, size_t *ranks, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t tid = hipThreadIdx_x;\n if (tid < N) {\n float v = x[tid];\n size_t r = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < v) r++;\n }\n ranks[tid] = r;\n }\n}\n", + "\n\n // Get the index of the current thread\n const size_t tid = hipThreadIdx_x;\n\n // Compute the rank of the current element\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n float *ptr = x;\n size_t rank = 0;\n while (ptr < x + N && *ptr <= val) {\n if (*ptr == val) {\n rank++;\n }\n ptr++;\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float v = x[tid];\n int r = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < v) {\n ++r;\n }\n }\n ranks[tid] = r;\n }\n}\n", + "\n\n int idx = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (idx >= N) {\n return;\n }\n\n float my_value = x[idx];\n int my_rank = 1;\n for (int i = 0; i < N; i++) {\n if (x[i] < my_value) {\n my_rank++;\n }\n }\n ranks[idx] = my_rank;\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + 
threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] < v) r++;\n }\n ranks[i] = r;\n }\n}\n", + "\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] < v) r++;\n }\n ranks[i] = r;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n float *sorted_x = (float *)hipMalloc(N * sizeof(float));\n hipMemcpy(sorted_x, x, N * sizeof(float), hipMemcpyHostToDevice);\n hipSort(sorted_x, sorted_x + N, hipSortAscending);\n for (int i = 0; i < N; i++) {\n if (sorted_x[i] == val) {\n ranks[gid] = i;\n break;\n }\n }\n hipFree(sorted_x);\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = hipThreadIdx_x;\n\n // Compute the rank of the current element\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n size_t idx = hipThreadIdx_x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n ranks[tid] = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < x[tid]) {\n ranks[tid]++;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n float val = x[idx];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every process has a complete copy of x. 
Store the result in ranks on process 0.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), ranks.size(), MPI_SIZE_T, MPI_MAX, 0,\n MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, ranks.size(), MPI_SIZE_T, MPI_MAX, 0,\n MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; i++) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Create a vector of ranks\n ranks.resize(n);\n\n // Get the rank of each value in x\n for (size_t i = 0; i < n; i++) {\n ranks[i] = std::find(indices.begin(), indices.end(), i) - indices.begin();\n }\n\n // Gather the ranks on process 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector ranks_i;\n MPI::COMM_WORLD.Recv(ranks_i.data(), ranks_i.size(), MPI::UNSIGNED_LONG, i, 0);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI::COMM_WORLD.Send(ranks.data(), ranks.size(), MPI::UNSIGNED_LONG, 0, 0);\n }\n}\n", + "\n\n // sort a copy of x on each process\n auto x_sorted = std::vector(x);\n std::sort(x_sorted.begin(), x_sorted.end());\n\n // find the rank of each value in x on each process\n std::vector> local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n auto it = std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]);\n local_ranks[i].push_back(std:: distance(x_sorted.begin(), it));\n }\n\n // gather the local_ranks on process 0\n std::gather(local_ranks, std::back_inserter(ranks), 0);\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; ++i) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Get the rank of each value in x\n ranks.resize(n);\n for (size_t i = 0; i < n; ++i) {\n ranks[i] = std::find(indices.begin(), indices.end(), i) - indices.begin();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, ranks.size(), MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n local_ranks[i]++;\n }\n }\n }\n\n std::vector global_ranks(x.size());\n MPI_Allgather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG,\n global_ranks.data(), x.size(), MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n ranks = global_ranks;\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; ++i) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Get the rank of each value in x\n ranks.resize(n);\n for (size_t i = 0; i < n; ++i) {\n ranks[indices[i]] = i;\n }\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n ranks = local_ranks;\n }\n\n MPI_Gather(&local_ranks[0], local_ranks.size(), MPI_UNSIGNED_LONG, &ranks[0],\n ranks.size(), MPI_UNSIGNED_LONG, 0, comm);\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the rank of each element in the sorted vector\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; ++i) {\n std::vector ranks_i;\n MPI_Recv(&ranks_i[0], ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Recv(recv_ranks.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n ranks[j] += recv_ranks[j];\n }\n }\n } else {\n MPI_Send(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = i;\n }\n\n std::sort(local_ranks.begin(), local_ranks.end(),\n [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n if (rank == 0) {\n ranks.resize(x.size());\n }\n\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(),\n MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector ranks_i;\n MPI_Recv(ranks_i.data(), ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // compute the ranks on each process\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // gather the ranks from all processes on process 0\n if (my_rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(),\n MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n 
MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG,\n 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the rank of each element in the sorted vector\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; ++i) {\n std::vector ranks_i;\n MPI_Recv(&ranks_i[0], ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector local_x = x;\n std::sort(local_x.begin(), local_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(local_x.begin(),\n std::lower_bound(local_x.begin(), local_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Recv(recv_ranks.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n ranks[j] += recv_ranks[j];\n }\n }\n } else {\n MPI_Send(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n size_t n = x.size();\n std::vector local_ranks(n);\n std::sort(x.begin(), x.end());\n for (size_t i = 0; i < n; ++i) {\n local_ranks[i] = std::lower_bound(x.begin(), x.end(), x[i]) - x.begin();\n }\n MPI_Gather(local_ranks.data(), n, MPI_UNSIGNED_LONG, ranks.data(), n, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "cuda", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use CUDA to compute in parallel. 
The kernel will be launched with at least as many threads as elements in x.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\n__global__ void ranks(const float *x, size_t *ranks, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n float v = x[i];\n int r = 0;\n for (size_t j = 0; j < N; ++j) {\n if (x[j] < v) {\n ++r;\n }\n }\n ranks[i] = r;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n ranks[i] = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n ranks[tid] = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ranks[tid]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float value = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n float *d_x = (float *)x;\n for (size_t i = 0; i < N; i++) {\n if (d_x[i] < val) {\n ranks[idx]++;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) rank++;\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t i = block.thread_id.x + block.block_id.x * block.block_dim.x;\n if (i < N) {\n float val = x[i];\n std::vector y(x, x + N);\n std::sort(y.begin(), y.end());\n ranks[i] = std::lower_bound(y.begin(), y.end(), val) - y.begin();\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] <= v) {\n r++;\n }\n }\n ranks[i] = r - 1;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n float *sorted_x = new float[N];\n thrust::copy(x, x + N, sorted_x);\n thrust::sort(sorted_x, sorted_x + N);\n ranks[idx] = thrust::lower_bound(sorted_x, sorted_x + N, val) - sorted_x;\n delete[] sorted_x;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n float *d_x = (float *)x;\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (d_x[i] < val) {\n 
++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = threadIdx.x;\n\n // Compute the rank of the current element\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n ranks[idx] = 0;\n for (int i = 0; i < N; ++i) {\n if (x[i] < x[idx]) {\n ranks[idx]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = threadIdx.x;\n\n // Compute the rank of the current element\n float current_element = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < current_element) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every process has a complete copy of x. 
Store the result in ranks on process 0.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the number of processes\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Create a vector of ranks on each process\n std::vector local_ranks(x.size());\n\n // Compute the ranks on each process\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n // Sort the local copy of x\n std::vector local_x = x;\n std::sort(local_x.begin(), local_x.end());\n\n // Find the rank of x[i] in the sorted vector\n auto it = std::lower_bound(local_x.begin(), local_x.end(), x[i]);\n local_ranks[i] = std::distance(local_x.begin(), it);\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG,\n ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]) - sorted_x.begin();\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector to store the ranks\n ranks.resize(n);\n\n // Sort the vector\n std::sort(x.begin(), x.end());\n\n // Find the rank of each element in the sorted vector\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n ranks[i] = std::distance(x.begin(), std::find(x.begin(), x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (ranks.size() > 0) {\n MPI_Gather(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG,\n &ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int world_size;\n MPI_Comm_size(MPI_COMM_WORLD, &world_size);\n\n std::vector local_x(x.begin() + my_rank * x.size() / world_size,\n x.begin() + (my_rank + 1) * x.size() / world_size);\n std::vector local_ranks(local_x.size());\n\n #pragma omp parallel for\n for (size_t i = 0; i < local_x.size(); 
++i) {\n for (size_t j = 0; j < local_x.size(); ++j) {\n if (local_x[i] < local_x[j]) {\n ++local_ranks[i];\n }\n }\n }\n\n MPI_Reduce(local_ranks.data(), ranks.data(), local_ranks.size(), MPI_SIZE_T, MPI_SUM, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector x_sorted = x;\n std::sort(x_sorted.begin(), x_sorted.end());\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n ranks[i] = std::distance(x_sorted.begin(), std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]));\n }\n if (my_rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), n, MPI_SIZE_T, MPI_MAX, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, n, MPI_SIZE_T, MPI_MAX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the ranks of each element in the sorted vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_ranks;\n MPI_Recv(&recv_ranks[0], x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), recv_ranks.begin(), recv_ranks.end());\n }\n } else {\n MPI_Send(&ranks[0], x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::sort(local_x.begin(), local_x.end());\n\n std::vector local_ranks(local_x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < local_x.size(); ++i) {\n local_ranks[i] = std::distance(x.begin(), std::lower_bound(x.begin(), x.end(), local_x[i]));\n }\n\n MPI_Gather(local_ranks.data(), local_ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), local_ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a copy of x on each process\n std::vector local_x = x;\n\n // Sort the local copy of x\n std::sort(local_x.begin(), local_x.end());\n\n // Create a vector to store the ranks\n std::vector local_ranks(x.size());\n\n // Compute the ranks of the elements in the local copy of x\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(local_x.begin(), std::find(local_x.begin(), local_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Only process 0 has the complete list of ranks\n if (rank == 0) {\n ranks = std::move(ranks);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]) - sorted_x.begin();\n }\n\n // Gather the ranks 
from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n ranks.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n if (my_rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), n, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, n, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n local_ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector local_ranks(x.size());\n #pragma opm parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(x.begin(), std::min_element(x.begin(), x.end()));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Status status;\n MPI_Probe(i, 0, MPI_COMM_WORLD, &status);\n MPI_Get_count(&status, MPI_BYTE, &ranks.size());\n MPI_Send(&local_ranks, ranks.size(), MPI_BYTE, i, 0, MPI_COMM_WORLD);\n }\n } else {\n MPI_Send(&local_ranks, local_ranks.size(), MPI_BYTE, 0, 0, 
MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t n = x.size();\n std::vector local_ranks(n);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n local_ranks[i] = std::distance(x.begin(), std::min_element(x.begin(), x.end()));\n }\n\n MPI_Gather(local_ranks.data(), n, MPI_UNSIGNED_LONG, ranks.data(), n, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int world_size;\n MPI_Comm_size(MPI_COMM_WORLD, &world_size);\n int local_size = x.size() / world_size;\n int start = my_rank * local_size;\n int end = start + local_size;\n if (my_rank == world_size - 1) {\n end = x.size();\n }\n std::vector local_ranks(local_size);\n std::vector local_x(local_size);\n std::copy(x.begin() + start, x.begin() + end, local_x.begin());\n std::sort(local_x.begin(), local_x.end());\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_ranks[i] = std::distance(local_x.begin(), std::lower_bound(local_x.begin(), local_x.end(), x[start + i]));\n }\n if (my_rank == 0) {\n ranks.resize(x.size());\n }\n MPI_Gather(local_ranks.data(), local_size, MPI_UNSIGNED_LONG, ranks.data(), local_size, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), x.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "(", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n // Each process sorts its own copy of x\n std::vector x_sorted = x;\n std::sort(x_sorted.begin(), x_sorted.end());\n\n // Each process computes the ranks of its own elements\n std::vector ranks_local(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks_local[i] = std::distance(x_sorted.begin(), std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]));\n }\n\n // Process 0 gathers the ranks from all processes\n if (my_rank == 0) {\n ranks = ranks_local;\n for (int i = 1; i < MPI_COMM_WORLD->size; ++i) {\n std::vector ranks_recv(x.size());\n MPI_Recv(ranks_recv.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_recv.begin(), ranks_recv.end());\n }\n } else {\n MPI_Send(ranks_local.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use Kokkos to sort in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Create a view of the input array that ignores zero valued elements\n auto x_nz = view(\"x_nz\", x, LayoutRight(), {x.extent(0), 0});\n // Sort the non-zero elements in ascending order\n parallel_sort(x_nz.extent(0), KOKKOS_LAMBDA(const int i) {\n x_nz(i) = x(i);\n });\n // Copy the sorted non-zero elements back to the input array\n parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x_nz(i);\n });\n}\n", + "\n\n auto exec_space = x.execution_space();\n auto view_size = x.extent(0);\n auto lambda = KOKKOS_LAMBDA(const int& i) {\n return x(i) != 0;\n };\n auto mask = Kokkos::create_mask(exec_space, view_size, lambda);\n auto sorted = Kokkos::sort(exec_space, mask, x);\n Kokkos::deep_copy(x, sorted);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = std::numeric_limits::max();\n }\n });\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i == std::numeric_limits::max()) {\n x_i = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sortIgnoreZero\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n // Create a mask to identify non-zero elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"init_mask\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the non-zero elements\n Kokkos::sort(mask, x);\n\n // Shift the non-zero elements to the beginning of the array\n int non_zero_count = 0;\n Kokkos::parallel_for(\"shift_non_zero\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (mask(i)) {\n x(non_zero_count++) = x(i);\n }\n });\n\n // Fill the remaining elements with zeros\n Kokkos::parallel_for(\"fill_zeros\", Kokkos::RangePolicy(non_zero_count, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = 0;\n });\n}\n", + "\n\n // Create a mask to identify non-zero elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"mask_init\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the non-zero elements\n Kokkos::sort(mask, x);\n\n // Shift the non-zero elements to the beginning of the array\n Kokkos::parallel_for(\"shift_non_zero\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (mask(i)) {\n Kokkos::atomic_fetch_add(&x(0), x(i));\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i != 0) {\n x_i = std::numeric_limits::max();\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i == 
std::numeric_limits::max()) {\n x_i = 0;\n }\n });\n}\n", + "\n\n using namespace Kokkos;\n\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n auto lambda = KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n x(i) = x(i) < 0 ? -x(i) : x(i);\n }\n };\n parallel_for(\"sortIgnoreZero\", policy, lambda);\n parallel_sort(policy, x.data(), x.data() + x.extent(0));\n lambda = KOKKOS_LAMBDA(const int &i) {\n if (x(i) < 0) {\n x(i) = -x(i);\n }\n };\n parallel_for(\"sortIgnoreZero\", policy, lambda);\n}\n", + "\n\n Kokkos::parallel_for(\"sortIgnoreZero\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::View y = x.filter([](int x) { return x != 0; });\n Kok\u30b3\u30af::sort(y);\n Kok\u30b3\u30af::View z = x.map([](int x) { return x == 0 ? 0 : y[0]; });\n y.copy_to(z);\n}\n", + "\n\n // Create a mask to ignore zero valued elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"mask\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the array using the mask\n Kokkos::sort(mask, x);\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i, int &count) {\n if (x(i) != 0) { count++; }\n });\n\n // Create a temporary array to store the non-zero elements\n Kokkos::View y(\"y\", count);\n\n // Copy the non-zero elements to the temporary array\n int j = 0;\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) { y(j++) = x(i); }\n });\n\n // Sort the temporary array\n Kokkos::sort(y);\n\n // Copy the sorted non-zero elements back to the original array\n j = 0;\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) { x(i) = y(j++); }\n });\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n using exec_space = Kokkos::DefaultExecutionSpace;\n using view_t = Kokkos::View;\n\n auto x_nonzero = view_t(\"x_nonzero\", x.extent(0));\n auto x_nonzero_count = 0;\n\n Kokkos::parallel_for(\n \"sortIgnoreZero\", policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) {\n x_nonzero(x_nonzero_count++) = x(i);\n }\n });\n\n Kokkos::sort(x_nonzero, x_nonzero_count);\n\n x_nonzero_count = 0;\n Kokkos::parallel_for(\n \"sortIgnoreZero\", policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) {\n x(i) = x_nonzero(x_nonzero_count++);\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i == 0) {\n x_i = x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), 
KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i == 0) {\n x_i = x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use OpenMP to sort x in parallel. \n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00203875061125, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00203020237386, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n ++count;\n }\n }\n\n // Create a vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00204885788262, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00205043107271, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n 
}\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00207363050431, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00201478209347, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j];\n j++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00195322018117, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00205436851829, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020565027371, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00205479655415, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.002037069574, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00203223470598, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00201635882258, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00203343518078, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(non_zero.begin(), non_zero.end());\n }\n }\n\n // Insert the non-zero elements back into x\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00199473593384, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00213226489723, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00205888450146, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020903499797, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00205758996308, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00204381234944, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00203776918352, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00205074194819, + "num_threads": 64 + } + ] + }, + { + "generated_output": ".", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + 
"are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector temp;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n temp.push_back(x[i]);\n }\n }\n std::sort(temp.begin(), temp.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = temp[j];\n j++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00203663483262, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00213394165039, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j++];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00196677539498, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00204374212772, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00201519075781, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00204917863011, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00205483697355, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00203378759325, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00202097333968, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020169518888, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Count the number of non-zero elements\n int n = 0;\n #pragma omp parallel for reduction(+:n)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n n++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(n);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00204898435622, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00204990431666, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n 
int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00204238761216, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020415244624, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j];\n j++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00195556804538, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00205798745155, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00202329009771, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00202128775418, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00204286500812, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00200510341674, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00200256034732, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00205860789865, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00204311907291, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00200634580106, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the 
new vector\n int index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[index++] = x[i];\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[index++];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00207204539329, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00202664975077, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n std::sort(non_zero.begin(), non_zero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00195521377027, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00202542804182, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020266558975, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00208099652082, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00201733596623, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00199812650681, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00199722480029, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00203084014356, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00206369552761, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00204143002629, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the 
non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00206696428359, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00200567245483, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00203549880534, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00200085546821, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Count non-zero elements\n int n = 0;\n #pragma omp parallel for reduction(+:n)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n n++;\n }\n }\n\n // Create a vector to store non-zero elements\n std::vector y(n);\n\n // Copy non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the non-zero elements\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00206420682371, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00199538357556, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n ++count;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) 
{\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00204920079559, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00200917366892, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n // Create a vector to store the non-zero elements\n std::vector non_zero;\n\n // Iterate over the input vector and copy non-zero elements to the new vector\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Copy the sorted non-zero elements back to the input vector\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j++];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00206159371883, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00209971554577, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "serial", + "prompt": "/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place.\n Example:\n\n\t input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a == 0 && b != 0) || a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return a != 0 && (b == 0 || a < b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a != 0 && b != 0) ? 
a < b : a == 0;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::stable_partition(x.begin(), x.end(),\n [](int a) { return a != 0; });\n std::sort(x.begin(), x.end());\n\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a == 0 && b != 0) || (a != 0 && b == 0) ? a > b : a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(),\n [](int a, int b) { return a != 0 && (a < b || b == 0); });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) {\n return false;\n } else if (b == 0) {\n return true;\n } else {\n return a < b;\n }\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "\n\n std::vector y;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) { y.push_back(x[i]); }\n }\n std::sort(y.begin(), y.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector sorted;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted.push_back(x[i]);\n }\n }\n std::sort(sorted.begin(), sorted.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sorted[j];\n j++;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "hip", + "prompt": "/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use AMD HIP to compute in parallel. 
The kernel will be launched with 1 thread per element.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\n__global__ void sortIgnoreZero(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int temp = x[tid];\n int i = tid - 1;\n while (i >= 0 && x[i] > temp) {\n x[i + 1] = x[i];\n i--;\n }\n x[i + 1] = temp;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n // sort non-zero elements\n __shared__ int sdata[256];\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n int stride = blockDim.x * gridDim.x;\n int offset = bid * stride;\n int count = 0;\n int val = x[gid];\n if (val != 0) {\n sdata[tid] = val;\n count++;\n }\n __syncthreads();\n int n = count;\n int i = tid;\n while (i < n) {\n sdata[i] = sdata[tid];\n i += stride;\n }\n __syncthreads();\n i = tid;\n while (i < n) {\n int j = i + 1;\n while (j < n) {\n if (sdata[i] > sdata[j]) {\n int tmp = sdata[i];\n sdata[i] = sdata[j];\n sdata[j] = tmp;\n }\n j += stride;\n }\n i += stride;\n }\n __syncthreads();\n i = tid;\n while (i < n) {\n x[offset + i] = sdata[i];\n i += stride;\n }\n}\n", + "\n\n // Find the number of non-zero elements\n int num_non_zero = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n num_non_zero++;\n }\n }\n\n // Create a temporary array to store the non-zero elements\n int *temp = new int[num_non_zero];\n\n // Copy the non-zero elements to the temporary array\n int j = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n temp[j] = x[i];\n j++;\n }\n }\n\n // Sort the temporary array\n std::sort(temp, temp + num_non_zero);\n\n // Copy the sorted non-zero elements back to the original array\n j = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n x[i] = temp[j];\n j++;\n }\n }\n\n // Delete the temporary array\n delete[] temp;\n}\n", + "\n\n\t// Find the number of non-zero elements\n\tint count = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tcount++;\n\t\t}\n\t}\n\n\t// Create an array to store the non-zero elements\n\tint *y = new int[count];\n\n\t// Copy the non-zero elements to the new array\n\tint j = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\ty[j] = x[i];\n\t\t\tj++;\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tstd::sort(y, y + count);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tj = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = y[j];\n\t\t\tj++;\n\t\t}\n\t}\n\n\t// Delete the temporary array\n\tdelete[] y;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i - 1] > x[i]) {\n int temp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = temp;\n i--;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint 
i = tid;\n\t\tint j = tid + 1;\n\t\twhile (j < N && x[j] == 0) {\n\t\t\tj++;\n\t\t}\n\t\tif (j < N) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[j];\n\t\t\tx[j] = tmp;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N && x[i] != 0) {\n int j = i + 1;\n while (j < N && x[j] == 0) {\n j++;\n }\n if (j < N) {\n int temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i - 1] > x[i]) {\n int tmp = x[i - 1];\n x[i - 1] = x[i];\n x[i] = tmp;\n i--;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = tmp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint temp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n for (int i = tid + 1; i < N; i++) {\n if (x[i] != 0 && x[i] < x[tid]) {\n int tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n }\n}\n", + "\n\n // Sort the non-zero elements\n thrust::sort(thrust::device_ptr(x), thrust::device_ptr(x + N), thrust::greater());\n\n // Count the number of non-zero elements\n int num_non_zero = thrust::count(thrust::device_ptr(x), thrust::device_ptr(x + N), thrust::not_equal_to(0));\n\n // Move the non-zero elements to the beginning of the array\n thrust::copy(thrust::device_ptr(x), thrust::device_ptr(x + num_non_zero), thrust::device_ptr(x));\n\n // Fill the remaining elements with zeros\n thrust::fill(thrust::device_ptr(x + num_non_zero), thrust::device_ptr(x + N), 0);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tif (x[tid] != 0) {\n\t\t\tint i = tid;\n\t\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\t\tint temp = x[i];\n\t\t\t\tx[i] = x[i - 1];\n\t\t\t\tx[i - 1] = temp;\n\t\t\t\ti--;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Sort the array in ascending order ignoring elements with value 0.\n\t// Leave zero valued elements in-place.\n\tint my_rank = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (my_rank < N && x[my_rank] != 0) {\n\t\tfor (int i = my_rank + 1; i < N; i++) 
{\n\t\t\tif (x[i] != 0 && x[i] < x[my_rank]) {\n\t\t\t\tint temp = x[my_rank];\n\t\t\t\tx[my_rank] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use MPI to sort x in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n for (int i = count; i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n // Gather all non-zero elements to rank 0\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(nonZero.data(), nonZero.size(), MPI_INT, x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Get the size of the communicator\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the sorted elements\n std::vector sorted_x;\n\n // Gather all the non-zero elements from all the ranks\n std::vector non_zero_elements;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_elements.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n\n // Scatter the sorted non-zero elements to all the ranks\n MPI_Scatter(non_zero_elements.data(), non_zero_elements.size() / size, MPI_INT,\n sorted_x.data(), non_zero_elements.size() / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into the original vector\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sorted_x[index++];\n }\n }\n}\n", + "\n\n // Get the size and rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n 
sorted_non_zero.resize(non_zero.size() * size);\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n if (rank != 0) {\n non_zero.resize(sorted_non_zero.size() / size);\n }\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into the original vector\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[index++];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size());\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, x.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with zeros\n for (int i = non_zero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int n = x.size();\n std::vector y(n);\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n y[count] = x[i];\n count++;\n }\n }\n std::sort(y.begin(), y.begin() + count);\n int index = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = y[index];\n index++;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_elements[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(non_zero_elements.data(), count / size, MPI_INT, x.data(), count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements 
to 0\n for (int i = count / size; i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector x_non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x_non_zero.push_back(x[i]);\n }\n }\n std::sort(x_non_zero.begin(), x_non_zero.end());\n int count = x_non_zero.size();\n MPI_Gather(&count, 1, MPI_INT, NULL, 0, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n int total_count = 0;\n for (int i = 0; i < size; i++) {\n int count_i;\n MPI_Gather(&count_i, 1, MPI_INT, &count_i, 1, MPI_INT, i, MPI_COMM_WORLD);\n total_count += count_i;\n }\n x.resize(total_count);\n }\n MPI_Gatherv(x_non_zero.data(), x_non_zero.size(), MPI_INT, x.data(), NULL, NULL, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sortedNonZero;\n if (rank == 0) {\n sortedNonZero.resize(nonZero.size() * size);\n }\n MPI_Gather(nonZero.data(), nonZero.size(), MPI_INT, sortedNonZero.data(), nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements to all ranks\n if (rank != 0) {\n sortedNonZero.resize(nonZero.size());\n }\n MPI_Scatter(sortedNonZero.data(), nonZero.size(), MPI_INT, sortedNonZero.data(), nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into x\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sortedNonZero[index++];\n }\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Insert the non-zero elements back into the original vector\n int j = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Gather the non-zero elements from all ranks\n MPI_Gather(&x[0], x.size(), MPI_INT, &y[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements\n if (rank == 0) {\n std::sort(y.begin(), 
y.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&y[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size());\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, x.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with zeros\n for (int i = non_zero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sortedNonZero;\n MPI_Gather(&nonZero[0], nonZero.size(), MPI_INT, &sortedNonZero[0], nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements to all ranks\n MPI_Scatter(&sortedNonZero[0], nonZero.size(), MPI_INT, &x[0], nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n for (int i = nonZero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int myrank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (myrank == 0) {\n std::sort(non_zero_elements.begin(), 
non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero_x.begin(), non_zero_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(non_zero_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n MPI_Gather(&non_zero[0], non_zero.size(), MPI_INT, &sorted_non_zero[0], non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&sorted_non_zero[0], non_zero.size(), MPI_INT, &x[0], non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the zero elements back into the vector\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = sorted_non_zero[index++];\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "cuda", + "prompt": "/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use CUDA to compute in parallel. 
The kernel will be launched with 1 thread per element.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\n__global__ void sortIgnoreZero(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the number of non-zero elements\n\tint num_non_zero = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnum_non_zero++;\n\t\t}\n\t}\n\n\t// Allocate memory for the non-zero elements\n\tint *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n\t// Copy the non-zero elements to the new array\n\tint non_zero_index = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnon_zero_elements[non_zero_index] = x[i];\n\t\t\tnon_zero_index++;\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tthrust::sort(non_zero_elements, non_zero_elements + num_non_zero);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tnon_zero_index = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = non_zero_elements[non_zero_index];\n\t\t\tnon_zero_index++;\n\t\t}\n\t}\n\n\t// Free the memory allocated for the non-zero elements\n\tfree(non_zero_elements);\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid >= N) return;\n\tif (x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[tid] > x[i]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = tmp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + ".", + "\n\n\tint tid = threadIdx.x;\n\tif (x[tid] != 0) {\n\t\tx[tid] = x[tid] > 0 ? 
x[tid] : -x[tid];\n\t}\n}\n", + "\n\n\t// Each thread sorts its own element\n\tif (x[threadIdx.x] != 0) {\n\t\tfor (int i = threadIdx.x + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[threadIdx.x] > x[i]) {\n\t\t\t\tint temp = x[threadIdx.x];\n\t\t\t\tx[threadIdx.x] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Find the number of non-zero elements\n\tint num_non_zero = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnum_non_zero++;\n\t\t}\n\t}\n\n\t// Allocate memory for the non-zero elements\n\tint *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n\t// Copy the non-zero elements to the new array\n\tint idx = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnon_zero_elements[idx++] = x[i];\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tthrust::sort(non_zero_elements, non_zero_elements + num_non_zero);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tidx = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = non_zero_elements[idx++];\n\t\t}\n\t}\n\n\t// Free the memory allocated for the non-zero elements\n\tfree(non_zero_elements);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = bid * blockDim.x + tid;\n\n\tif (i < N && x[i] != 0) {\n\t\tint j = i + 1;\n\t\twhile (j < N && x[j] == 0) {\n\t\t\tj++;\n\t\t}\n\t\tif (j < N) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[j];\n\t\t\tx[j] = tmp;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i - 1] > x[i]) {\n\t\t\tint temp = x[i - 1];\n\t\t\tx[i - 1] = x[i];\n\t\t\tx[i] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "Ignore the zero value in the list.", + "\n\n\t// Get the thread index\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\n\t// If the element is not zero, sort it\n\tif (x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint start = bid * bid_size + tid;\n\tint end = start + bid_size;\n\tif (start >= N) return;\n\tif (end > N) end = N;\n\tint count = 0;\n\tfor (int i = start; i < end; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[count++] = x[i];\n\t\t}\n\t}\n\tfor (int i = count; i < end; i++) {\n\t\tx[i] = 0;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tswap(x[i], x[i - 1]);\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint temp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\t// Sort the non-zero elements\n\tint *d_sorted = (int *)malloc(N * sizeof(int));\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\td_sorted[count] = x[i];\n\t\t\tcount++;\n\t\t}\n\t}\n\tthrust::sort(d_sorted, d_sorted + count);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tcount = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = d_sorted[count];\n\t\t\tcount++;\n\t\t}\n\t}\n\tfree(d_sorted);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint val = x[tid];\n\tif (val == 0) {\n\t\treturn;\n\t}\n\tint i = tid;\n\twhile (i > 0 && x[i - 1] > val) 
{\n\t\tx[i] = x[i - 1];\n\t\ti--;\n\t}\n\tx[i] = val;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tif (x[tid] != 0) {\n\t\t\tint i = tid;\n\t\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\t\tswap(x[i], x[i - 1]);\n\t\t\t\ti--;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n // Find the number of non-zero elements\n int num_non_zero = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n num_non_zero++;\n }\n }\n\n // Create an array to store the non-zero elements\n int *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n // Copy the non-zero elements to the new array\n int index = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n non_zero_elements[index] = x[i];\n index++;\n }\n }\n\n // Sort the non-zero elements\n qsort(non_zero_elements, num_non_zero, sizeof(int), compare);\n\n // Copy the sorted non-zero elements back to the original array\n index = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n x[i] = non_zero_elements[index];\n index++;\n }\n }\n\n // Free the memory allocated for the non-zero elements array\n free(non_zero_elements);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use MPI and OpenMP to sort x in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the number of processes\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Create a vector to store the sorted values\n std::vector sorted(x.size());\n\n // Gather the sorted values from each rank\n MPI_Gather(x.data(), x.size(), MPI_INT, sorted.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // If the current rank is 0, store the sorted values in x\n if (rank == 0) {\n x = sorted;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort x on each rank\n std::sort(x.begin(), x.end());\n\n // Gather the sorted x from all ranks to rank 0\n std::vector gathered_x;\n if (rank == 0) {\n gathered_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, gathered_x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the gathered x on rank 0\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end());\n x = gathered_x;\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Scatter the sorted non-zero elements to all ranks\n MPI_Scatter(non_zero.data(), non_zero.size() / nprocs, MPI_INT,\n x.data(), non_zero.size() / nprocs, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Gather the sorted non-zero elements from all ranks to rank 0\n MPI_Gather(x.data(), non_zero.size() / nprocs, MPI_INT,\n non_zero.data(), non_zero.size() / nprocs, MPI_INT, 0, MPI_COMM_WORLD);\n\n // If rank is 0, insert the sorted non-zero elements into x\n if (rank == 0) {\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n continue;\n }\n x[i] = non_zero[index++];\n }\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector x_nonzero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x_nonzero.push_back(x[i]);\n }\n }\n std::sort(x_nonzero.begin(), x_nonzero.end());\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = x_nonzero[count];\n count++;\n }\n }\n if (my_rank == 0) {\n x.resize(n);\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n std::vector x_local(n);\n std::vector x_sorted(n);\n std::copy(x.begin(), x.end(), x_local.begin());\n std::sort(x_local.begin(), x_local.end(), [](int a, int b) { return a != 0 && (a < b); });\n MPI_Gather(x_local.data(), n, MPI_INT, x_sorted.data(), n, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n x = x_sorted;\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector non_zero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n std::sort(non_zero.begin(), non_zero.end());\n int non_zero_size = non_zero.size();\n int recv_count[MPI_COMM_SIZE];\n int recv_displs[MPI_COMM_SIZE];\n MPI_Gather(&non_zero_size, 1, MPI_INT, recv_count, 1, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n recv_displs[0] = 0;\n for (int i = 1; i < MPI_COMM_SIZE; i++) {\n recv_displs[i] = recv_displs[i - 1] + recv_count[i - 1];\n }\n }\n MPI_Gatherv(non_zero.data(), non_zero_size, MPI_INT, x.data(), recv_count, recv_displs, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n int j = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] == 0) {\n continue;\n }\n x[j] = x[i];\n j++;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Allocate memory for the non-zero elements\n std::vector 
y(count);\n\n // Copy the non-zero elements to y\n int index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[index++] = x[i];\n }\n }\n\n // Sort y\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to x on rank 0\n if (rank == 0) {\n index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[index++];\n }\n }\n }\n}\n", + "\n\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n int n = x.size();\n std::vector x_nonzero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x_nonzero.push_back(x[i]);\n }\n }\n std::sort(x_nonzero.begin(), x_nonzero.end());\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = x_nonzero[count];\n count++;\n }\n }\n if (myrank == 0) {\n x.resize(n);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the gathered non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector nonZero;\n if (rank == 0) {\n nonZero.resize(count);\n }\n MPI_Gather(&x[0], x.size(), MPI_INT, &nonZero[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(nonZero.begin(), nonZero.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&nonZero[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted.begin(), sorted.end());\n\n // Scatter the 
sorted non-zero elements to rank 0\n MPI_Scatter(sorted.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Allocate memory for the sorted array\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n MPI_Gather(&x[0], x.size(), MPI_INT, &sorted_x[0], global_count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the gathered array on rank 0\n if (my_rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end());\n }\n\n // Scatter the sorted array back to all ranks\n MPI_Scatter(&sorted_x[0], global_count / nprocs, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 fills in the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Create a vector of size x.size() to store the sorted values\n std::vector sorted_x(x.size());\n\n // Create a vector of size x.size() to store the indices of the sorted values\n 
std::vector sorted_indices(x.size());\n\n // Sort the indices of x in ascending order\n std::iota(sorted_indices.begin(), sorted_indices.end(), 0);\n std::sort(sorted_indices.begin(), sorted_indices.end(),\n [&x](int i, int j) { return x[i] < x[j]; });\n\n // Create a vector of size x.size() to store the sorted values\n std::vector sorted_x(x.size());\n\n // Copy the sorted values from x into sorted_x\n for (int i = 0; i < x.size(); i++) {\n sorted_x[i] = x[sorted_indices[i]];\n }\n\n // Broadcast the sorted values to all ranks\n MPI_Bcast(&sorted_x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Copy the sorted values into x on rank 0\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to their original positions\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = i;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted.begin(), sorted.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) 
{\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) count++;\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) std::sort(non_zero.begin(), non_zero.end());\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) x[i] = -1;\n }\n\n // Sort the elements in-place on each rank\n std::sort(x.begin(), x.end());\n\n // Replace -1 with 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == -1) x[i] = 0;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(Kokkos::View const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View sorted_x(\"sorted_x\", x.size());\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i) {\n sorted_x(i) = x(i);\n });\n Kokkos::sort(sorted_x.data(), sorted_x.data() + sorted_x.size());\n return sorted_x(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a Kokkos view of the input vector\n auto x_view = Kokkos::create_mirror_view(x);\n\n // Create a Kokkos view of the output vector\n auto y_view = Kokkos::create_mirror_view(1);\n\n // Create a Kokkos kernel to find the k-th smallest element\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x_view.extent(0)),\n KOKKOS_LAMBDA(const int i, int& y) {\n if (i < k) {\n y = std::min(y, x_view(i));\n }\n },\n y_view);\n\n // Copy the output vector back to the host\n int y;\n Kokkos::deep_copy(y, y_view);\n\n return y;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n\n // Find the k-th smallest element using a parallel reduction.\n int kth_smallest = parallel_reduce(\n Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i < k) {\n kth_smallest = std::min(kth_smallest, x(i));\n }\n },\n k);\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n auto x_copy = Kokkos::View(n);\n Kokkos::deep_copy(x_copy, x);\n\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& result) {\n if (i == k - 1) {\n result = x_copy[i];\n }\n }, tag_type());\n\n return x_copy[k - 1];\n}\n", + "\n\n Kokkos::View y(\"y\", x.size());\n Kokkos::parallel_for(\"copy\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i) {\n y(i) = x(i);\n });\n Kokkos::sort(y);\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n using l_d_type = 
Kokkos::LayoutLeft;\n using l_d_view_type = Kokkos::View;\n\n const int n = x.extent(0);\n l_d_view_type y(\"y\", n);\n\n parallel_for(policy_type(0, n), KOKKOS_LAMBDA(const int& i) { y(i) = x(i); }, tag_type());\n\n Kokkos::sort(y);\n\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n auto x_copy = Kokkos::View(n);\n Kokkos::deep_copy(x_copy, x);\n\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& result) {\n if (i == k - 1) {\n result = x_copy(i);\n }\n }, tag_type());\n\n return x_copy(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n int n = x.extent(0);\n if (k > n) {\n throw std::invalid_argument(\"k cannot be greater than the size of the vector\");\n }\n\n auto kth_smallest = parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(const int& i, int& kth) {\n if (i < k) {\n kth = std::min(kth, x(i));\n }\n },\n std::numeric_limits::max());\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Tag = Kokkos::Tag;\n\n int n = x.extent(0);\n if (k > n) {\n throw std::invalid_argument(\"k must be less than or equal to the size of the vector\");\n }\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(Policy(0, n), x, Tag());\n\n // Return the k-th smallest element\n return x(k - 1);\n}\n", + "\n\n Kok\u30b3\u30af::View y = Kok\u30b3\u30af::sort(x);\n return y[k - 1];\n}\n", + "\n\n // Create a Kokkos view for the input vector x\n Kokkos::View x_host = Kokkos::create_mirror_view(x);\n\n // Create a Kokkos view for the output vector y\n Kokkos::View y_host = Kokkos::create_mirror_view(Kokkos::View(1));\n\n // Create a Kokkos parallel for loop to find the k-th smallest element\n Kokkos::parallel_for(\"findKthSmallest\", Kokkos::RangePolicy(0, x_host.size()), KOKKOS_LAMBDA(int i) {\n // Sort the input vector x\n std::sort(x_host.begin(), x_host.end());\n\n // Get the k-th smallest element\n y_host(0) = x_host(k - 1);\n });\n\n // Copy the output vector y back to the device\n Kokkos::deep_copy(x, y_host);\n\n // Return the k-th smallest element\n return y_host(0);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n int kth_smallest = 0;\n Kokkos::parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i == k - 1) {\n kth_smallest = x(i);\n }\n },\n kth_smallest, tag_type());\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a Kokkos view of the input vector\n View y = Kokkos::create_mirror_view(x);\n\n // Sort the vector in parallel\n parallel_sort(y.extent(0), KOKKOS_LAMBDA(int i, int j) { return y(i) < y(j); });\n\n // Return the k-th smallest element\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n\n int n = x.extent(0);\n int kth_smallest = 0;\n Kokkos::parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(const int& i, int& kth_smallest) {\n if (i < k) {\n kth_smallest = std::max(kth_smallest, x(i));\n }\n },\n kth_smallest);\n return kth_smallest;\n}\n", + "\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(int i, int j) { return x(i) < x(j); });\n\n // Return the k-th smallest element\n return x(k - 1);\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::Work;\n\n const int n = 
static_cast(x.extent(0));\n Kokkos::View y(\"y\", n);\n\n parallel_for(Work(n), KOKKOS_LAMBDA(int i) {\n y[i] = x[i];\n });\n\n std::sort(y.data(), y.data() + n);\n\n return y[k - 1];\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n int n = x.extent(0);\n int kth = 0;\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(policy_t(0, n), x);\n\n // Get the k-th smallest element\n kth = x(k - 1);\n\n return kth;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n int kth_smallest = -1;\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i == k - 1) {\n kth_smallest = x(i);\n }\n }, tag_type());\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a parallel reduction to find the k-th smallest element\n auto kth_smallest = reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& min) {\n if (i < k) {\n min = std::min(min, x(i));\n }\n }, min);\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using view_type = Kokkos::View;\n\n // Create a copy of the input vector\n view_type y = Kokkos::create_mirror_view(x);\n\n // Sort the copy in parallel\n Kokkos::parallel_sort(y.extent(0), KOKKOS_LAMBDA(int i, int j) { return y(i) < y(j); });\n\n // Return the k-th smallest element\n return y(k - 1);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00736994259059, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00752687957138, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0074926007539, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00743059050292, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0075214875862, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00740450937301, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00744178090245, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00742337275296, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n int kth = 0;\n #pragma omp parallel for reduction(min:kth)\n for (int i = 0; i < n; i++) {\n if (x[i] < kth || kth == 0) {\n kth = x[i];\n }\n }\n return kth;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + 
}, + { + "generated_output": "\n\n // Sort the vector in parallel\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n for (int j = i + 1; j < x.size(); ++j) {\n if (x[i] > x[j]) {\n std::swap(x[i], x[j]);\n }\n }\n }\n }\n\n // Return the k-th smallest element\n return x[k - 1];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00725726168603, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00752719007432, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00744340997189, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00749334860593, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00757231265306, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0073889663443, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00744006186724, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00742684099823, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00735077429563, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00743955895305, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0075371209532, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00748174600303, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00746884793043, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00739965792745, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00742967650294, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00751189533621, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00756257064641, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 10.25623968485743, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector 
sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00739427022636, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00751732550561, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00743928439915, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00748345553875, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0074299691245, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00741857942194, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00742897316813, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00747105833143, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n // Create a shared variable to store the k-th smallest element\n int kth_smallest;\n\n // Create a parallel region\n #pragma omp parallel\n {\n // Each thread computes the k-th smallest element of its local subarray\n int local_kth_smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n\n // Update the shared variable with the minimum of the local k-th smallest elements\n #pragma omp critical\n {\n if (local_kth_smallest < kth_smallest) {\n kth_smallest = local_kth_smallest;\n }\n }\n }\n\n // Return the k-th smallest element\n return kth_smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00738819111139, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00746061298996, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00752673912793, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0075007442385, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00751908048987, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00741873383522, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00744835529476, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00750648472458, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00737231019884, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 
0.00749794915318, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00752119477838, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00752236656845, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00752007253468, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00738623663783, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00745077226311, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00742845870554, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.0076072588563, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 10.25709063317627, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00732548832893, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00744884442538, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0074652614072, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00747372936457, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00745455138385, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00743670165539, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00743665955961, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00746076945215, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00739456024021, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00739556588233, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0074548156932, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00748786013573, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00747262947261, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0073614500463, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00749553684145, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00747150145471, + "num_threads": 64 + } + 
] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n\n return sorted_x[k - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00739351399243, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00745908617973, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00749364718795, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00753345582634, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00749570690095, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00741746556014, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0074689604342, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00748894568533, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00730909742415, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00746791642159, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00741474647075, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00743013527244, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00746803469956, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00747131071985, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00735703445971, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00745918806642, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00739110764116, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0074730059132, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00748448353261, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00752998907119, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00755082014948, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00740415807813, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00749091170728, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00741049461067, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": 
true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00732283741236, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00742079801857, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00741619523615, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00737559329718, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00743521675467, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00742363911122, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00746523160487, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00735777076334, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": false, + "best_sequential_runtime": 0.00766056664288, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 10.22151628900319, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 2 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00730399396271, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00742875467986, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00743205919862, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00740087646991, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00741424746811, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00740746092051, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007431367971, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00738381426781, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(sorted_x.begin(), sorted_x.end());\n }\n }\n return sorted_x[k - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00740986950696, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00750723294914, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00751828737557, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00755917653441, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00749336853623, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.00753245409578, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00744721982628, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00747766476125, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "serial", + "prompt": "/* Find the k-th smallest element of the vector x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th element of the sorted vector.\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th smallest element.\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n if (k < 1 || k > n) {\n throw std::invalid_argument(\"k must be between 1 and n\");\n }\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n if (k < 1 || k > n) {\n throw std::invalid_argument(\"k must be between 1 and the size of the vector.\");\n }\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "=4", + "\n\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n", + "\n\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n", + "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th smallest element.\n return x[k - 1];\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "hip", + "prompt": "/* Find the k-th smallest element of the vector x.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\n__global__ void findKthSmallest(const int *x, size_t N, int k, int *kthSmallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int local_min = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n local_min = min(local_min, x[i]);\n }\n atomicMin(kthSmallest, local_min);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int stride = blockDim.x;\n while (stride < N) {\n if (tid < N / stride) {\n if (kthSmallest[tid] > kthSmallest[tid + stride]) {\n int tmp = kthSmallest[tid];\n kthSmallest[tid] = kthSmallest[tid + stride];\n kthSmallest[tid + stride] = tmp;\n }\n }\n __syncthreads();\n stride *= 2;\n }\n if (tid == 0) {\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n hipSort(kthSmallest, kthSmallest + N, hipSortAscending);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int local_min = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n local_min = min(local_min, x[i]);\n }\n atomicMin(kthSmallest, local_min);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid == 0) {\n *kthSmallest = x[tid];\n } else {\n if (x[tid] < *kthSmallest) {\n *kthSmallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int numBlocks = hipGridDim_x;\n int numThreads = hipBlockDim_x;\n int numElements = numBlocks * numThreads;\n int numLevels = 0;\n while (numElements > 1) {\n numLevels++;\n numElements = (numElements + numThreads - 1) / numThreads;\n }\n for (int level = 0; level < numLevels; level++) {\n int stride = 1 << level;\n int offset = tid & (stride - 1);\n int other = tid ^ offset;\n if (offset < other && other < numElements) {\n if (kthSmallest[tid] > kthSmallest[other]) {\n 
int tmp = kthSmallest[tid];\n kthSmallest[tid] = kthSmallest[other];\n kthSmallest[other] = tmp;\n }\n }\n __syncthreads();\n }\n if (tid == 0) {\n *kthSmallest = kthSmallest[0];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = blockId * blockDim + tid;\n int end = min(start + blockDim, N);\n int local_kthSmallest = x[start];\n for (int i = start + 1; i < end; i++) {\n if (x[i] < local_kthSmallest) {\n local_kthSmallest = x[i];\n }\n }\n atomicMin(kthSmallest, local_kthSmallest);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n __shared__ int s[hipBlockDim_x];\n s[tid] = x[tid];\n __syncthreads();\n if (tid == 0) {\n hipSort(s, hipBlockDim_x);\n *kthSmallest = s[k - 1];\n }\n }\n}\n", + "K", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int *d_temp = new int[N];\n for (int i = 0; i < N; i += 2) {\n if (i + 1 < N) {\n if (kthSmallest[i] > kthSmallest[i + 1]) {\n d_temp[i / 2] = kthSmallest[i + 1];\n d_temp[i / 2 + N / 2] = kthSmallest[i];\n } else {\n d_temp[i / 2] = kthSmallest[i];\n d_temp[i / 2 + N / 2] = kthSmallest[i + 1];\n }\n } else {\n d_temp[i / 2] = kthSmallest[i];\n }\n }\n kthSmallest = d_temp;\n __syncthreads();\n if (N % 2 == 1) {\n N = N / 2 + 1;\n } else {\n N = N / 2;\n }\n while (N > 1) {\n for (int i = 0; i < N; i += 2) {\n if (i + 1 < N) {\n if (kthSmallest[i] > kthSmallest[i + 1]) {\n d_temp[i / 2] = kthSmallest[i + 1];\n d_temp[i / 2 + N / 2] = kthSmallest[i];\n } else {\n d_temp[i / 2] = kthSmallest[i];\n d_temp[i / 2 + N / 2] = kthSmallest[i + 1];\n }\n } else {\n d_temp[i / 2] = kthSmallest[i];\n }\n }\n kthSmallest = d_temp;\n __syncthreads();\n if (N % 2 == 1) {\n N = N / 2 + 1;\n } else {\n N = N / 2;\n }\n }\n *kthSmallest = kthSmallest[0];\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n int local_kthSmallest = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < local_kthSmallest) {\n local_kthSmallest = x[i];\n }\n }\n __shared__ int shared_kthSmallest;\n shared_kthSmallest = local_kthSmallest;\n for (int i = hipBlockDim_x / 2; i > 0; i /= 2) {\n __syncthreads();\n if (hipThreadIdx_x < i) {\n if (shared_kthSmallest > x[tid + i]) {\n shared_kthSmallest = x[tid + i];\n }\n }\n }\n __syncthreads();\n if (hipThreadIdx_x == 0) {\n if (k == 0) {\n *kthSmallest = shared_kthSmallest;\n } else {\n k--;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n" + ] + }, + { + 
"problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_ranks);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_ranks);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int local_kth = x[k - 1];\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n MPI_Allreduce(&local_kth, &kth, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return kth;\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth_smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n\n int global_kth_smallest;\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + 
"\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements to rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among all the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the k-th smallest elements from all ranks\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int n = x.size();\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kthSmallest = x[k - 1];\n MPI_Gather(&kthSmallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (myRank == 0) {\n std::sort(kthSmallest, kthSmallest + numRanks);\n return kthSmallest[k - 1];\n }\n return -1;\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each rank\n 
std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + ".", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(nprocs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1; // Return -1 on all other ranks\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int n = x.size();\n int local_k = k / num_procs;\n int local_start = my_rank * local_k;\n int local_end = (my_rank + 1) * local_k;\n if (my_rank == num_procs - 1) {\n local_end = n;\n }\n\n std::nth_element(x.begin() + local_start, x.begin() + local_start + local_k - 1,\n x.begin() + local_end);\n int local_kth_smallest = x[local_start + local_k - 1];\n\n int global_kth_smallest;\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN,\n MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(comm_size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (comm_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the k-th smallest element on each rank\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n int 
local_kth = x[k - 1];\n\n // Gather the k-th smallest elements from all ranks\n std::vector gathered_kth(size);\n MPI_Gather(&local_kth, 1, MPI_INT, gathered_kth.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element\n if (rank == 0) {\n std::nth_element(gathered_kth.begin(), gathered_kth.begin() + k - 1, gathered_kth.end());\n return gathered_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int n = x.size();\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes its local k-th smallest element\n std::sort(x.begin(), x.end());\n int local_kth_smallest = x[k - 1];\n\n // Gather all local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "cuda", + "prompt": "/* Find the k-th smallest element of the vector x.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\n__global__ void findKthSmallest(const int *x, size_t N, int k, int *kthSmallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n int val = x[gid];\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < val) {\n count++;\n }\n }\n if (count == k - 1) {\n *kthSmallest = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[0] = min(kthSmallest[0], x[tid]);\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n for (int i = 1; i < blockDim.x; i++) {\n kthSmallest[0] = min(kthSmallest[0], kthSmallest[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = tid + blockId * blockDim;\n int end = min(start + blockDim, N);\n int localKthSmallest = x[start];\n for (int i = start + 1; i < end; i++) {\n if (x[i] < localKthSmallest) {\n localKthSmallest = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n int globalKthSmallest = localKthSmallest;\n for (int i = 1; i < gridDim; i++) {\n int otherKthSmallest;\n cudaMemcpy(&otherKthSmallest, &globalKthSmallest, sizeof(int), cudaMemcpyDeviceToDevice);\n if (otherKthSmallest < globalKthSmallest) {\n globalKthSmallest = otherKthSmallest;\n }\n }\n *kthSmallest = globalKthSmallest;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockDim = blockDim.x;\n int blockIdx = blockIdx.x;\n int i = blockIdx * blockDim + tid;\n if (i < N) {\n kthSmallest[tid] = x[i];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + blockDim);\n if (blockIdx == 0) {\n *kthSmallest = kthSmallest[k - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n __shared__ int shared[blockDim.x];\n shared[threadIdx.x] = x[tid];\n __syncthreads();\n for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {\n if (threadIdx.x < stride) {\n shared[threadIdx.x] = min(shared[threadIdx.x], shared[threadIdx.x + stride]);\n }\n __syncthreads();\n }\n if (threadIdx.x == 0) {\n if (k == blockIdx.x) {\n *kthSmallest = shared[0];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n int temp;\n for (int i = 1; i < blockDim.x; i++) {\n temp = *kthSmallest;\n *kthSmallest = min(*kthSmallest, x[i]);\n x[i] = min(temp, x[i]);\n }\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n temp = *kthSmallest;\n *kthSmallest = min(*kthSmallest, x[i]);\n x[i] = min(temp, x[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int idx = bid * bid_size + tid;\n \n // Find the k-th smallest element in the current block\n int block_kth_smallest = x[idx];\n for (int i = idx + bid_size; i < N; i += grid_size * bid_size) {\n if (x[i] < block_kth_smallest) {\n block_kth_smallest = x[i];\n }\n }\n \n // Find the k-th smallest element among all 
blocks\n if (tid == 0) {\n for (int i = 1; i < grid_size; i++) {\n int other_block_kth_smallest;\n cudaMemcpy(&other_block_kth_smallest, &x[i * bid_size], sizeof(int), cudaMemcpyDeviceToDevice);\n if (other_block_kth_smallest < block_kth_smallest) {\n block_kth_smallest = other_block_kth_smallest;\n }\n }\n *kthSmallest = block_kth_smallest;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[0] = min(kthSmallest[0], x[tid]);\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < blockDim.x; i++) {\n kthSmallest[0] = min(kthSmallest[0], kthSmallest[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockDim = blockDim.x;\n int blockIdx = blockIdx.x;\n int idx = blockIdx * blockDim + tid;\n if (idx < N) {\n if (tid == 0) {\n *kthSmallest = x[idx];\n } else {\n if (x[idx] < *kthSmallest) {\n *kthSmallest = x[idx];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[0] = x[tid];\n }\n __syncthreads();\n int numThreads = blockDim.x * gridDim.x;\n while (numThreads > 1) {\n if (tid < numThreads / 2) {\n kthSmallest[tid] = min(kthSmallest[tid], kthSmallest[tid + numThreads / 2]);\n }\n __syncthreads();\n numThreads /= 2;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int val = x[tid];\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < val) {\n count++;\n }\n }\n if (count == k - 1) {\n *kthSmallest = val;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n __shared__ int shared[blockDim.x];\n shared[threadIdx.x] = x[tid];\n __syncthreads();\n for (int i = 0; i < blockDim.x; i++) {\n if (shared[threadIdx.x] > shared[i]) {\n shared[threadIdx.x] = shared[i];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n if (k == blockIdx.x + 1) {\n *kthSmallest = shared[threadIdx.x];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int my_kth_smallest = x[tid];\n for (int i = tid + 1; i < N; i++) {\n if (x[i] < my_kth_smallest) {\n my_kth_smallest = x[i];\n }\n }\n atomicMin(kthSmallest, my_kth_smallest);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n int tid_warp = tid % 32;\n int warp_id = tid / 32;\n int warp_kthSmallest = *kthSmallest;\n for (int i = 1; i < 32; i *= 2) {\n __syncthreads();\n if (tid_warp < i) {\n if (warp_kthSmallest > x[tid + i * blockDim.x]) {\n warp_kthSmallest = x[tid + i * blockDim.x];\n }\n }\n }\n __syncthreads();\n if (warp_id == 0) {\n *kthSmallest = warp_kthSmallest;\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n int other_kthSmallest;\n cudaMemcpy(&other_kthSmallest, &x[i * blockDim.x], sizeof(int), cudaMemcpyDeviceToDevice);\n if (*kthSmallest > other_kthSmallest) {\n *kthSmallest = other_kthSmallest;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n if (tid == k - 1) {\n *kthSmallest = x[tid];\n }\n }\n}\n", + "\n\n 
int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n int temp;\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (x[i] > x[j]) {\n temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n *kthSmallest = x[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int start_idx = bid * bid_size + tid;\n int end_idx = min(start_idx + bid_size, N);\n int local_kth_smallest = x[start_idx];\n for (int i = start_idx + 1; i < end_idx; i++) {\n if (x[i] < local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n atomicMin(kthSmallest, local_kth_smallest);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Find the k-th smallest element of the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth = x[k - 1];\n int global_kth;\n\n // Find the k-th smallest element on each rank\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < local_kth) {\n local_kth = x[i];\n }\n }\n }\n\n // Reduce the local k-th smallest elements to find the global k-th smallest element\n MPI_Allreduce(&local_kth, &global_kth, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n // Return the global k-th smallest element on rank 0\n if (rank == 0) {\n return global_kth;\n } else {\n return -1;\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector to store the sorted elements\n std::vector sorted_x(n);\n\n // Sort the vector in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n sorted_x[i] = x[i];\n }\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Get the k-th smallest element\n int kth_smallest = sorted_x[k - 1];\n\n // Return the k-th smallest element on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n return all_kth_smallest[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, 
nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the global k-th smallest element on rank 0\n return global_kth_smallest;\n}\n", + "\n\n int n = x.size();\n int kth = -1;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_k = k / size;\n int local_start = rank * local_k;\n int local_end = (rank + 1) * local_k;\n if (local_end > n) {\n local_end = n;\n }\n std::nth_element(x.begin() + local_start, x.begin() + local_start + local_k - 1, x.begin() + local_end);\n int local_kth = x[local_start + local_k - 1];\n MPI_Gather(&local_kth, 1, MPI_INT, &kth, 1, MPI_INT, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n std::vector kth_vec(size);\n for (int i = 0; i < size; i++) {\n kth_vec[i] = kth[i];\n }\n std::nth_element(kth_vec.begin(), kth_vec.begin() + k - 1, kth_vec.end());\n kth = kth_vec[k - 1];\n }\n return kth;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kth_smallest;\n MPI_Gather(&x[k - 1], 1, MPI_INT, &kth_smallest, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the local kth smallest element\n std::nth_element(x.begin() + rank * x.size() / size,\n x.begin() + rank * x.size() / size + x.size() / size - 1,\n x.begin() + rank * x.size() / size + k - 1);\n int local_kth = *(x.begin() + rank * x.size() / size + k - 1);\n\n // Gather all local kth smallest elements\n std::vector all_local_kth(size);\n MPI_Gather(&local_kth, 1, MPI_INT, all_local_kth.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global kth smallest element\n if (rank == 0) {\n std::nth_element(all_local_kth.begin(), all_local_kth.begin() + k - 1, all_local_kth.end());\n return all_local_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_k = k / size;\n int local_remainder = k % size;\n int local_start = rank * local_k;\n int local_end = local_start + local_k;\n if (rank < local_remainder) {\n local_start += rank;\n local_end += rank + 1;\n } else {\n local_start += local_remainder;\n local_end += local_remainder;\n }\n\n std::vector local_x(x.begin() + local_start, x.begin() + local_end);\n std::sort(local_x.begin(), local_x.end());\n\n int local_kth_smallest = local_x[local_k - 1];\n if (rank == 0) {\n std::vector kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest.begin(), kth_smallest.end());\n return kth_smallest[k - 1];\n } else {\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kth_smallest;\n MPI_Gather(&x[k - 1], 1, MPI_INT, &kth_smallest, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (rank == 0) {\n std::vector received_elements(size);\n MPI_Gather(&x[k - 1], 1, MPI_INT, received_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(received_elements.begin(), received_elements.end());\n return received_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the global k-th smallest element on rank 0\n return global_kth_smallest;\n}\n", + "\n\n // Find the local k-th smallest element on each rank.\n int local_kth_smallest = *std::max_element(x.begin(), x.end());\n int global_kth_smallest;\n\n // Find the global k-th smallest element.\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN);\n\n // Find the k-th smallest element on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n } else {\n return global_kth_smallest;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), 
all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the result on rank 0\n return global_kth_smallest;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int local_k = k / nprocs;\n int local_remainder = k % nprocs;\n\n // Find the local k-th smallest element on each rank\n std::nth_element(x.begin(), x.begin() + local_k + (rank == nprocs - 1 ? local_remainder : 0), x.end());\n int local_kth_smallest = x[local_k + (rank == nprocs - 1 ? local_remainder : 0)];\n\n // Gather all local k-th smallest elements on rank 0\n std::vector kth_smallest_elements(nprocs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (rank == 0) {\n std::nth_element(kth_smallest_elements.begin(), kth_smallest_elements.begin() + k - 1, kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth = x[0];\n int global_kth;\n\n // Find the k-th smallest element on each rank\n #pragma omp parallel reduction(min : local_kth)\n {\n #pragma omp for\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < local_kth) {\n local_kth = x[i];\n }\n }\n }\n\n // Gather the local k-th smallest elements from all ranks\n MPI_Gather(&local_kth, 1, MPI_INT, &global_kth, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element\n if (rank == 0) {\n std::vector gathered_kth(size);\n for (int i = 0; i < size; i++) {\n gathered_kth[i] = global_kth[i];\n }\n std::sort(gathered_kth.begin(), gathered_kth.end());\n return gathered_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector gathered_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, gathered_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(gathered_elements.begin(), gathered_elements.end());\n global_kth_smallest = gathered_elements[k - 1];\n }\n\n return global_kth_smallest;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x.\n std::sort(x.begin(), x.end());\n\n // Each rank 
sends its k-th smallest element to rank 0.\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element.\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n return all_kth_smallest[k - 1];\n }\n\n return -1; // Dummy return value for non-rank-0 processes.\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int local_k = k / num_ranks;\n int local_start = my_rank * local_k;\n int local_end = (my_rank + 1) * local_k;\n\n std::sort(x.begin() + local_start, x.begin() + local_end);\n\n int local_kth_smallest = x[local_start + local_k - 1];\n\n int global_kth_smallest;\n MPI_Reduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the k-th smallest element of its local data\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n int local_kth_smallest = x[k - 1];\n\n // Gather all local k-th smallest elements to rank 0\n std::vector all_local_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_local_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among all local k-th smallest elements\n if (rank == 0) {\n std::nth_element(all_local_kth_smallest.begin(), all_local_kth_smallest.begin() + k - 1, all_local_kth_smallest.end());\n return all_local_kth_smallest[k - 1];\n } else {\n return -1; // Dummy return value for non-rank-0 processes\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use Kokkos to sort in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(Kokkos::View &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA (const int& i, const int& j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n using exec_space = KokKOS::EXECSPACE;\n using device_type = KokKOS::DEVICE_TYPE;\n\n auto policy = KokKOS::parallel_sort_ascending(results,\n [](const Result &a, const Result &b) { return a.duration < b.duration; },\n exec_space(),\n device_type());\n auto result = KokKOS::sort_view(results, policy);\n results.copy(result);\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) -> bool { return results(i)->startTime < results(j)->startTime; });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int i, const int j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int& i, const int& j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::sort(results, Kokkos::Experimental::compare_fun(\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; 
}));\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, results.extent(0)),\n [results](const int &i, const int &j) { return results(i)->startTime < results(j)->startTime; });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use OpenMP to sort in parallel.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": ".", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = 0; j < results.size() - 1; j++) {\n if (results[j].startTime > results[j + 1].startTime) {\n std::swap(results[j], results[j + 1]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + 
"are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n#pragma omp parallel for\n for (int i = 0; i < static_cast(results.size()); i++) {\n for (int j = i + 1; j < static_cast(results.size()); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size() - 1; i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n#pragma opm parallel\n#pragma opm reduction(merge:results:opm_sort(results, [](const Result& a, const Result& b) { return a.startTime < b.startTime; }))\n{\n#pragma opm sections\n{\n#pragma opm section\n{\n std::sort(results.begin(), results.end(), [](const Result& a, const Result& b) { return a.startTime < b.startTime; });\n}\n}\n}\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00443370398134, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0044670868665, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00446459650993, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00447428468615, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0044676316902, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00449279528111, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00447006672621, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00449266545475, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size() - 1; i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + 
"did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma opm parallel\n {\n #pragma opm for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "serial", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return 
a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "<*>", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.start < b.start;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "hip", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use AMD HIP to sort in parallel. 
The kernel is launched with at least as many threads as there are elements.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\n__global__ void sortByStartTime(Result *results, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) {\n return;\n }\n\n Result temp = results[idx];\n int minIdx = idx;\n for (int i = idx + 1; i < N; i++) {\n if (results[i].startTime < results[minIdx].startTime) {\n minIdx = i;\n }\n }\n\n results[idx] = results[minIdx];\n results[minIdx] = temp;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid >= N) return;\n for (int i = gid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int 
bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n __shared__ Result sharedResults[256];\n sharedResults[tid] = results[gid];\n __syncthreads();\n int stride = 256;\n while (stride > 0) {\n if (tid < stride && tid + stride < N && sharedResults[tid].startTime > sharedResults[tid + stride].startTime) {\n Result temp = sharedResults[tid];\n sharedResults[tid] = sharedResults[tid + stride];\n sharedResults[tid + stride] = temp;\n }\n __syncthreads();\n stride /= 2;\n }\n results[gid] = sharedResults[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid >= N) {\n return;\n }\n for (int i = gid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use MPI to sort in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of results. Store the output in results on rank 0.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n results = gatheredResults;\n } else {\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n 
}\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.start < b.start; });\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n std::vector gatheredResults;\n for (int i = 1; i < size; i++) {\n int numResults;\n MPI_Recv(&numResults, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n std::vector recvResults(numResults);\n MPI_Recv(recvResults.data(), numResults * sizeof(Result), MPI_BYTE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n gatheredResults.insert(gatheredResults.end(), recvResults.begin(), recvResults.end());\n }\n results = gatheredResults;\n } else {\n // Send the number of results and the results to rank 0\n int numResults = results.size();\n MPI_Send(&numResults, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);\n MPI_Send(results.data(), numResults * sizeof(Result), MPI_BYTE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n 
std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(size * results.size());\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; 
});\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the sorted results on rank 0\n if (rank == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort locally\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather all results to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_DOUBLE, gatheredResults.data(), results.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Sort gathered results\n if (rank == 0) 
{\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "cuda", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use CUDA to sort in parallel. The kernel is launched with at least as many threads as there are elements.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\n__global__ void sortByStartTime(Result *results, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[i].startTime < results[tid].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < 
N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + ".", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result tmp = results[tid];\n results[tid] = results[i];\n results[i] = tmp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use MPI and OpenMP to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of results. 
Store the outputs in results on rank 0.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks on rank 0\n std::vector allResults(results.size() * numRanks);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, allResults.data(), results.size(), MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n\n // Sort all results on rank 0\n if (myRank == 0) {\n std::sort(allResults.begin(), allResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = allResults;\n }\n}\n", + "\n\n int numRanks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather all results to rank 0\n std::vector allResults;\n if (rank == 0) {\n allResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, allResults.data(), results.size(),\n MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort all results on rank 0\n if (rank == 0) {\n std::sort(allResults.begin(), allResults.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = allResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(size * results.size());\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, gatheredResults.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the gathered results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, gatheredResults.data(), results.size(), MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, nullptr, 0, MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), 
results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, gatheredResults.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results = gatheredResults;\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the outputs in results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result 
&b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Sort results by start time using MPI_Allgather and OpenMP\n int numRanks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort results on each rank\n #pragma openmp parallel\n {\n #pragma openmp for\n for (int i = 0; i < results.size(); i++) {\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n }\n }\n\n // Gather sorted results on rank 0\n std::vector sortedResults;\n if (rank == 0) {\n int totalSize = results.size() * numRanks;\n MPI_Gather(&results[0], results.size(), MPI_DATATYPE_RESULT, &results[0], results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results.resize(totalSize);\n } else {\n MPI_Gather(&results[0], results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Get the number of processes\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Get the rank of the process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the results on each process\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather the sorted results from all processes\n std::vector sorted_results(results.size() * num_procs);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT,\n sorted_results.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n\n // Copy the sorted results back to the input vector on rank 0\n if (rank == 0) {\n results = std::move(sorted_results);\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the gathered results in results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n 
std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recvResults;\n MPI_Recv(&recvResults, results.size(), MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n results.insert(results.end(), recvResults.begin(), recvResults.end());\n }\n } else {\n MPI_Send(&results, results.size(), MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Sort the array x of complex numbers by their magnitude in ascending order.\n Use Kokkos to sort in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), [&](int i, int j) -> bool {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n 
Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use OpenMP to sort in parallel.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Sort the vector in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the number of threads\n int num_threads = omp_get_num_threads();\n\n // Get the thread ID\n int thread_id = omp_get_thread_num();\n\n // Sort the portion of the vector assigned to this thread\n std::sort(x.begin() + thread_id, x.begin() + thread_id + num_threads);\n }\n\n // Merge the sorted portions of the vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "Magnitude:1. Moderato Moderato Moderato: 0. 
Moderato: Moderato: Moderato:forte forte forte piano piano piano piano piano piano piano: piano: piano: Moderato: piano: piano: forte forte forte: forte forte forte: forte: forte: piano piano piano piano piano piano: Piano: Piano: Moderato: Moderato: Moderato: piano: piano piano: forte forte forte: forte: forte: forte: Moderato:", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::polar(std::abs(x[i]), std::arg(x[i]));\n }\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.07931178081781, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.08812772836536, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08345611803234, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08108871858567, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.08136295489967, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07952433656901, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07973856404424, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07930890098214, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for 
(size_t j = i + 1; j < x.size(); ++j) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n for (size_t j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::complex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "(x)", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the vector x by the magnitude of its elements in ascending order.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.07888707909733, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.07926540132612, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07921494822949, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07967516090721, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07915507368743, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07957113049924, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07930175121874, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07974373120815, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + 
} + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::complex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n // Sort the vector x by the magnitude of its elements in ascending order.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.07948699854314, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.07976113855839, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07952753733844, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07966954428703, + "num_threads": 4 + }, + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.07945811618119, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07957663889974, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07965508326888, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0792308749631, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null, + "num_threads": 1 + } + ] + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "serial", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + ":", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n 
});\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return abs(a) < abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "hip", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use AMD HIP to sort in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\n__global__ void sortComplexByMagnitude(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double mag = hipCrealf(x[tid]) * hipCrealf(x[tid]) + hipCrealf(x[tid]) * hipCrealf(x[tid]);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n double temp = x[tid].x;\n x[tid].x = x[i].x;\n x[i].x = temp;\n hipDoubleComplex temp_c = x[tid];\n x[tid] = x[i];\n x[i] = temp_c;\n }\n }\n }\n}\n", + "\n\n // Compute the magnitude of each complex number\n float *mag = new float[N];\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes\n std::sort(mag, mag + N);\n\n // Sort the complex numbers by their magnitudes\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (mag[i] == sqrt(x[j].x * x[j].x + x[j].y * x[j].y)) {\n x[i] = x[j];\n break;\n }\n }\n }\n\n delete[] mag;\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Get the block index\n int bid = hipBlockIdx_x;\n\n // Calculate the index of the element to be sorted\n int i = bid * hipBlockDim_x + tid;\n\n // Check if the index is within the bounds of the array\n if (i < N) {\n // Get the magnitude of the complex number\n double magnitude = hip_abs(x[i]);\n\n // Sort the complex number by its magnitude\n for (int j = i + 1; j < N; j++) {\n double magnitude2 = hip_abs(x[j]);\n if (magnitude > magnitude2) {\n // Swap the complex numbers\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n\n // Update the magnitude of the complex number\n magnitude = magnitude2;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double mag = hip_abs(x[gid]);\n x[gid] = make_hipDoubleComplex(mag, 0.0);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double magnitude = 
hip_abs(x[tid]);\n x[tid] = make_hipDoubleComplex(magnitude, 0.0);\n }\n __syncthreads();\n hipSort(x, x + N, hipDoubleComplexGreater);\n __syncthreads();\n if (tid < N) {\n double real = hip_real(x[tid]);\n double imag = hip_imag(x[tid]);\n x[tid] = make_hipDoubleComplex(real, imag);\n }\n}\n", + "\n\n // Get thread and block indices\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n\n // Compute the index of the element to be processed by this thread\n int index = bid * bid_size + tid;\n\n // If the index is out of range, exit\n if (index >= N) {\n return;\n }\n\n // Get the current element\n hipDoubleComplex current_element = x[index];\n\n // Compute the magnitude of the current element\n double magnitude = hip_abs(current_element);\n\n // Find the index of the element with the smallest magnitude so far\n int min_index = index;\n for (int i = index + 1; i < N; i++) {\n double current_magnitude = hip_abs(x[i]);\n if (current_magnitude < magnitude) {\n min_index = i;\n magnitude = current_magnitude;\n }\n }\n\n // Swap the current element with the element with the smallest magnitude so far\n hipDoubleComplex temp = x[index];\n x[index] = x[min_index];\n x[min_index] = temp;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = hipMag(x[tid]);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n hipSort(x, x + N, hipDoubleComplexCompare);\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n hipDoubleComplex c = x[tid];\n x[tid] = hipDoubleComplex{sqrt(c.x * c.x + c.y * c.y), 0.0};\n }\n}\n", + "\n\n // Sort the array using the radix sort algorithm\n for (int i = 0; i < N; i++) {\n int key = abs(x[i]);\n int j = i - 1;\n\n while (j >= 0 && abs(x[j]) > key) {\n x[j + 1] = x[j];\n j--;\n }\n x[j + 1] = hipDoubleComplex{key, 0};\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Sort the local part of the array\n std::sort(x + tid, x + tid + hipBlockDim_x);\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted parts of the array\n for (int stride = hipBlockDim_x; stride < N; stride *= 2) {\n for (int i = 0; i < N; i += 2 * stride) {\n if (hipAbs(x[i]) > hipAbs(x[i + stride])) {\n std::swap(x[i], x[i + stride]);\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Get the block index\n int bid = hipBlockIdx_x;\n\n // Calculate the index of the element to be sorted\n int index = bid * hipBlockDim_x + tid;\n\n // Check if the index is within the bounds of the array\n if (index < N) {\n // Get the magnitude of the complex number\n double magnitude = hipMag(x[index]);\n\n // Sort the complex number by its magnitude\n for (int i = index + 1; i < N; i++) {\n double magnitude_i = hipMag(x[i]);\n if (magnitude > magnitude_i) {\n // Swap the complex numbers\n hipDoubleComplex temp = x[index];\n x[index] = x[i];\n x[i] = temp;\n\n // Update the magnitude of the complex number\n magnitude = magnitude_i;\n }\n }\n }\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Sort the local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + 1; j < 
N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + hipBlockDim_x; j < N; j += hipBlockDim_x) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n // Compute the magnitude of each complex number\n float *mag = new float[N];\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes in ascending order\n std::sort(mag, mag + N);\n\n // Sort the complex numbers by their magnitude\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (mag[i] == sqrt(x[j].x * x[j].x + x[j].y * x[j].y)) {\n x[i] = x[j];\n break;\n }\n }\n }\n\n delete[] mag;\n}\n", + "\n\n // Calculate the magnitude of each complex number\n float *mag = (float *)malloc(N * sizeof(float));\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrtf(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes using the hipSort function\n hipSort(mag, N, hipFloat);\n\n // Sort the complex numbers based on the sorted magnitudes\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (mag[i] > mag[j]) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n free(mag);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double magnitude = hipMagnitude(x[tid]);\n x[tid].x = magnitude;\n }\n __syncthreads();\n if (tid < N) {\n hipSort(x, N, sizeof(hipDoubleComplex), hipDoubleComplexMagnitudeCompare);\n }\n}\n", + "\n\n // Sort the vector x by magnitude in ascending order.\n // Use AMD HIP to sort in parallel. 
The kernel is launched with at least as many threads as elements in x.\n // Example:\n //\n // input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n // output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n\n // Get the thread index.\n int tid = hipThreadIdx_x;\n\n // Get the block index.\n int bid = hipBlockIdx_x;\n\n // Get the block size.\n int bsize = hipBlockDim_x;\n\n // Get the grid size.\n int gsize = hipGridDim_x;\n\n // Calculate the starting index for this thread.\n int start = tid + bid * bsize;\n\n // Calculate the ending index for this thread.\n int end = min(start + bsize, N);\n\n // Sort the elements in this thread's range.\n sort(x + start, x + end, [](hipDoubleComplex a, hipDoubleComplex b) { return abs(a) < abs(b); });\n\n // Synchronize all threads.\n hipDeviceSynchronize();\n\n // If this is the first thread in the grid, merge the sorted ranges.\n if (tid == 0) {\n for (int i = 1; i < gsize; i++) {\n // Get the starting index for this thread.\n int start = i * bsize;\n\n // Get the ending index for this thread.\n int end = min(start + bsize, N);\n\n // Merge the sorted ranges.\n merge(x, x + start, x + start, x + end, x);\n }\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Get the block ID\n int bid = hipBlockIdx_x;\n\n // Get the block size\n int bsize = hipBlockDim_x;\n\n // Get the number of blocks\n int nblocks = hipGridDim_x;\n\n // Calculate the starting index for this thread\n int start = tid + bid * bsize;\n\n // Calculate the ending index for this thread\n int end = min(start + bsize, N);\n\n // Sort the local portion of the array\n sort(x + start, x + end, [](hipDoubleComplex a, hipDoubleComplex b) {\n return abs(a) < abs(b);\n });\n\n // Synchronize all threads\n hipDeviceSynchronize();\n\n // Merge the sorted local portions of the array\n for (int i = 1; i < nblocks; i *= 2) {\n if (bid % (2 * i) == 0) {\n int partner = bid + i;\n if (partner < nblocks) {\n int start = i * bsize;\n int end = min(start + 2 * bsize, N);\n merge(x + start, x + start + bsize, x + start + bsize, x + end,\n x + start);\n }\n }\n hipDeviceSynchronize();\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Sort the local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted local elements\n for (int s = 1; s < N; s *= 2) {\n for (int i = tid; i < N; i += 2 * s) {\n int j = i + s;\n if (j < N && hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ hipDoubleComplex block[1024];\n block[threadIdx.x] = x[threadIdx.x];\n __syncthreads();\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(block[i]) > hipAbs(block[j])) {\n hipDoubleComplex temp = block[i];\n block[i] = block[j];\n block[j] = temp;\n }\n }\n }\n __syncthreads();\n\n // Copy the sorted elements back to the global memory\n x[threadIdx.x] = block[threadIdx.x];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use MPI to sort in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered;\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, store the gathered vector in x\n if (rank == 0) {\n x = gathered;\n }\n}\n", + "\n\n // Get the size and rank of the current process\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the local vector of complex numbers\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 
MPI_COMM_WORLD);\n\n // Sort the gathered vector\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector to rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vectors on rank 0\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vectors from rank 0 to all ranks\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n 
MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::sort(x.begin(), x.end(), [&magnitudes](std::complex a, std::complex b) {\n return magnitudes[a] < magnitudes[b];\n });\n\n // Gather the sorted vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector> recv;\n MPI::COMM_WORLD.Recv(recv, i, 0);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI::COMM_WORLD.Send(x, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Collect the sorted vectors on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(&recv, x.size(), MPI_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = std::move(sorted_x);\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector to rank 0\n MPI_Scatter(gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks 
to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Replace the original vector on rank 0 with the sorted vector\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the size and rank of the communicator\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted;\n if (rank == 0) {\n sorted.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, comm);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(sorted.begin(), sorted.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(sorted.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, comm);\n}\n", + "\n\n // Get the size and rank of the communicator\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::vector> sorted_x(n);\n for (int i = 0; i < n; i++) {\n for (int j = 0; j < n; j++) {\n if (std::abs(x[j]) == magnitudes[i]) {\n sorted_x[i] = x[j];\n break;\n }\n }\n }\n\n // Gather the sorted vector on rank 0\n MPI_Gather(&sorted_x[0], n, MPI_DOUBLE_COMPLEX, &x[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n // Get the magnitude of the complex number\n double magnitude = std::abs(x[i]);\n // Store the magnitude in the vector\n mag[i] = magnitude;\n }\n\n // Sort the vector of magnitudes\n 
std::sort(mag.begin(), mag.end());\n\n // Sort the vector of complex numbers by magnitude\n std::sort(x.begin(), x.end(), [&](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::vector> sorted_x(n);\n for (int i = 0; i < n; i++) {\n for (int j = 0; j < n; j++) {\n if (std::abs(x[j]) == magnitudes[i]) {\n sorted_x[i] = x[j];\n break;\n }\n }\n }\n\n // Gather the sorted vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector> recv_x(n);\n MPI::COMM_WORLD.Recv(recv_x.data(), n, MPI::DOUBLE, i, 0);\n sorted_x.insert(sorted_x.end(), recv_x.begin(), recv_x.end());\n }\n } else {\n MPI::COMM_WORLD.Send(sorted_x.data(), n, MPI::DOUBLE, 0, 0);\n }\n\n // Update the vector x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n x = sorted_x;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "cuda", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use CUDA to sort in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\n__global__ void sortComplexByMagnitude(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(mag, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n cuDoubleComplex tmp = x[tid];\n int i = tid;\n while (i > 0 && cuCabs(x[i - 1]) > cuCabs(tmp)) {\n x[i] = x[i - 1];\n i--;\n }\n x[i] = tmp;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid].x = x[tid].y;\n x[tid].y = 0.0;\n }\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ cuDoubleComplex block_data[256];\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int block_size = blockDim.x;\n int start = bid * block_size;\n int end = min(start + block_size, N);\n for (int i = start; i < end; i++) {\n block_data[tid] = x[i];\n __syncthreads();\n for (int j = tid + 1; j < block_size; j++) {\n if (abs(block_data[tid]) > abs(block_data[j])) {\n cuDoubleComplex temp = block_data[tid];\n block_data[tid] = block_data[j];\n block_data[j] = temp;\n }\n }\n __syncthreads();\n }\n\n // Copy the sorted elements back to the global memory\n for (int i = start; i < end; i++) {\n x[i] = block_data[tid];\n __syncthreads();\n }\n}\n", + "\n\n // Get the thread ID\n int tid = threadIdx.x;\n\n // Sort the local part of the array\n thrust::sort(x + tid, x + tid + min(N - tid, blockDim.x),\n [](cuDoubleComplex a, cuDoubleComplex b) { return cuCabs(a) < 
cuCabs(b); });\n\n // Synchronize threads\n __syncthreads();\n\n // Merge the sorted parts of the array\n for (int s = blockDim.x; s <= N; s *= 2) {\n if (tid < N - s) {\n cuDoubleComplex a = x[tid];\n cuDoubleComplex b = x[tid + s];\n if (cuCabs(a) > cuCabs(b)) {\n x[tid] = b;\n x[tid + s] = a;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = cuCabs(x[tid]);\n x[tid] = cuCmul(x[tid], cuCdiv(1.0, magnitude));\n }\n}\n", + "\n\n // Get the thread index\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n\n // If the thread index is within the bounds of the array, sort the element\n if (tid < N) {\n // Calculate the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n\n // Sort the element by magnitude\n for (int i = tid + 1; i < N; i++) {\n double otherMagnitude = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n if (magnitude > otherMagnitude) {\n // Swap the elements\n cuDoubleComplex temp = x[tid];\n x[tid] = x[i];\n x[i] = temp;\n\n // Update the magnitude of the current element\n magnitude = otherMagnitude;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = mag;\n x[tid].y = 0.0;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid].x = creal(x[tid]);\n x[tid].y = cimag(x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n while (i < N) {\n cuDoubleComplex temp;\n for (int j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = magnitude;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n double temp = x[tid].x;\n x[tid].x = x[i].x;\n x[i].x = temp;\n cuDoubleComplex temp2 = x[tid];\n x[tid] = x[i];\n x[i] = temp2;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n for (int i = tid + 1; i < N; i++) {\n double mag_i = cuCabs(x[i]);\n if (mag > mag_i) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n mag = mag_i;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= N) return;\n \n // Compute the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n \n // Store the magnitude in shared memory\n __shared__ double magnitudes[N];\n magnitudes[tid] = magnitude;\n \n // Synchronize threads\n __syncthreads();\n \n // Sort the magnitudes in shared memory\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (magnitudes[i] > magnitudes[j]) {\n double temp = magnitudes[i];\n magnitudes[i] = magnitudes[j];\n magnitudes[j] = temp;\n }\n }\n }\n \n // Synchronize threads\n __syncthreads();\n \n // Find the index of the complex number with the given magnitude\n int index = -1;\n for (int i = 0; i < N; i++) {\n if (magnitudes[i] == magnitude) {\n index = i;\n break;\n }\n }\n \n // Swap the complex numbers\n cuDoubleComplex temp = 
x[tid];\n x[tid] = x[index];\n x[index] = temp;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= N) return;\n \n double magnitude = cuCabs(x[tid]);\n x[tid] = cuCmul(x[tid], cuCdiv(1.0, magnitude));\n \n __shared__ double shared_magnitude[blockDim.x];\n shared_magnitude[threadIdx.x] = magnitude;\n __syncthreads();\n \n for (int i = 0; i < blockDim.x; i++) {\n if (tid + i + 1 < N && shared_magnitude[tid] > shared_magnitude[tid + i + 1]) {\n double temp = shared_magnitude[tid];\n shared_magnitude[tid] = shared_magnitude[tid + i + 1];\n shared_magnitude[tid + i + 1] = temp;\n \n cuDoubleComplex temp_complex = x[tid];\n x[tid] = x[tid + i + 1];\n x[tid + i + 1] = temp_complex;\n }\n }\n \n}\n", + "\n\n // Each thread sorts a single element of x.\n int tid = threadIdx.x;\n if (tid < N) {\n // Compute the magnitude of the complex number.\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n // Store the magnitude in the complex number.\n x[tid].x = magnitude;\n }\n // Sort the complex numbers by their magnitude.\n thrust::sort(x, x + N, thrust::less());\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ cuDoubleComplex block[32];\n block[threadIdx.x] = x[blockIdx.x * blockDim.x + threadIdx.x];\n __syncthreads();\n for (int i = 0; i < blockDim.x; i++) {\n for (int j = i + 1; j < blockDim.x; j++) {\n if (cuCabsf(block[i]) > cuCabsf(block[j])) {\n cuDoubleComplex temp = block[i];\n block[i] = block[j];\n block[j] = temp;\n }\n }\n }\n __syncthreads();\n\n // Copy the sorted elements back to the global memory\n x[blockIdx.x * blockDim.x + threadIdx.x] = block[threadIdx.x];\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n cuDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n // Each thread sorts a single element\n int tid = threadIdx.x;\n if (tid < N) {\n // Compute the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n // Store the magnitude in shared memory\n __shared__ double magnitudes[N];\n magnitudes[tid] = magnitude;\n // Synchronize threads\n __syncthreads();\n // Sort the magnitudes in shared memory\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (magnitudes[i] > magnitudes[j]) {\n double temp = magnitudes[i];\n magnitudes[i] = magnitudes[j];\n magnitudes[j] = temp;\n }\n }\n }\n // Synchronize threads\n __syncthreads();\n // Find the index of the sorted magnitude in the original array\n int index = -1;\n for (int i = 0; i < N; i++) {\n if (magnitudes[tid] == sqrt(x[i].x * x[i].x + x[i].y * x[i].y)) {\n index = i;\n break;\n }\n }\n // Swap the complex number at the current index with the complex number at the sorted index\n if (index != -1) {\n cuDoubleComplex temp = x[tid];\n x[tid] = x[index];\n x[index] = temp;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(magnitude, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n x[tid] = make_cuDoubleComplex(creal(x[tid]), cimag(x[tid]));\n }\n}\n", + "\n\n // Get the index of the current element\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n\n // If the index is out of bounds, return\n if (idx >= N) {\n return;\n }\n\n // Get the magnitude of the current element\n double magnitude = 
abs(x[idx]);\n\n // Find the index of the element with the smallest magnitude\n int minIdx = idx;\n for (int i = idx + 1; i < N; i++) {\n double otherMagnitude = abs(x[i]);\n if (otherMagnitude < magnitude) {\n minIdx = i;\n magnitude = otherMagnitude;\n }\n }\n\n // Swap the current element with the element with the smallest magnitude\n if (minIdx != idx) {\n double temp = x[idx];\n x[idx] = x[minIdx];\n x[minIdx] = temp;\n }\n}\n", + "\n\n // Sort the complex numbers by their magnitude in ascending order.\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n cuDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(mag, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n int i = tid;\n while (i > 0 && cuCreal(x[i]) < cuCreal(x[i - 1])) {\n swap(x[i], x[i - 1]);\n i--;\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid] = make_cuDoubleComplex(cuCreal(x[tid]), cuCimag(x[tid]));\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use MPI and OpenMP to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector x on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted 
vectors from all ranks on rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector of complex numbers.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(size * x.size());\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector x on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < 
std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector x.\n if (rank != 0) {\n x.clear();\n }\n\n // Copy the sorted vector to x on rank 0.\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Sort the vector x on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vector x from all ranks on rank 0\n std::vector> sorted_x(x.size() * nranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_PRECISION, sorted_x.data(), x.size(), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector x on rank 0\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Scatter the sorted vector x from rank 0 to all ranks\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_PRECISION, x.data(), x.size(), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n 
}\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the local vector.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all other ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int nprocs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 
0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * nprocs);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vector and store it in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Sort the vector x on each rank by magnitude in ascending order.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> gathered_x(x.size() * MPI::world().size());\n MPI::world().all_gather(x.data(), x.size(), MPI::DOUBLE_REAL, gathered_x.data(), x.size(), MPI::DOUBLE_REAL);\n\n // Replace the vector x on rank 0 with the gathered sorted vectors.\n if (MPI::world().rank() == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, copy the sorted vector back to x.\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector x.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": 
"cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ii++) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); jj++) {\n if (input(ii, jj) == 1) { num_neighbors++; }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++num_ones;\n }\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ii++) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); jj++) {\n if (ii == i && jj == j) continue;\n num_ones += (input(ii, jj) == 1);\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++num_ones;\n }\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int numNeighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && input(ii, jj) == 1) {\n ++numNeighbors;\n }\n }\n }\n output(i, j) = (numNeighbors == 1) ? 
1 : 0;\n });\n}\n", + "Kokkoss_t_t", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy<>(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++count;\n }\n }\n }\n output(i, j) = (count == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const size_t i, const size_t j) {\n int sum = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input(ii, jj);\n }\n }\n }\n output(i, j) = (sum == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& j) {\n int count = 0;\n if (i > 0) count += input(i - 1, j);\n if (i < N - 1) count += input(i + 1, j);\n if (j > 0) count += input(i, j - 1);\n if (j < N - 1) count += input(i, j + 1);\n output(i, j) = (count == 1);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { sum += input(ii, jj); }\n }\n }\n output(i, j) = (sum == 1);\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (ii == i && jj == j) continue;\n num_neighbors += input(ii, jj);\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && input(ii, jj) == 1) {\n num_neighbors++;\n }\n }\n }\n output(i, j) = (num_neighbors == 1) ? 1 : 0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cells\", Kokkos::RangePolicy>(N, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++count;\n }\n }\n }\n output(i, j) = (count == 1);\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1][j];\n if (i < N - 1) num_ones += input[i + 1][j];\n if (j > 0) num_ones += input[i][j - 1];\n if (j < N - 1) num_ones += input[i][j + 1];\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "source_write_success": true, + 
"did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) {\n count += input[i - 1][j];\n }\n if (i < N - 1) {\n count += input[i + 1][j];\n }\n if (j > 0) {\n count += input[i][j - 1];\n }\n if (j < N - 1) {\n count += input[i][j + 1];\n }\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma opm simd\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j 
< N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0 && input[i - 1][j] == 1) count++;\n if (i < N - 1 && input[i + 1][j] == 1) count++;\n if (j > 0 && input[i][j - 1] == 1) count++;\n if (j < N - 1 && input[i][j + 1] == 1) count++;\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": 
true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1 + j * N];\n if (i < N - 1) num_neighbors += input[i + 1 + j * N];\n if (j > 0) num_neighbors += input[i + (j - 1) * N];\n if (j < N - 1) num_neighbors += input[i + (j + 1) * N];\n output[i + j * N] = (num_neighbors == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10495758242905, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.57062718756497, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.27772139217705, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.16505776476115, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.11738933678716, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.09009854178876, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10163509342819, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.12468948028982, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j] == 1) ++num_neighbors;\n if (j > 0 && input[i][j - 1] == 1) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1] == 1) ++num_neighbors;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1][j];\n if (i < N - 1) num_ones += input[i + 1][j];\n if (j > 0) num_ones += input[i][j - 1];\n if (j < N - 1) num_ones += input[i][j + 1];\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n size_t num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i 
= 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "serial", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0 && input[i - 1][j] == 1) ++count;\n if (i < N - 1 && input[i + 1][j] == 1) ++count;\n if (j > 0 && input[i][j - 1] == 1) ++count;\n if (j < N - 1 && input[i][j + 1] == 1) ++count;\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count 
+= input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (i + di >= 0 && i + di < N && j + dj >= 0 && j + dj < N) {\n num_ones += input[(i + di) * N + j + dj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (size_t k = i - 1; k <= i + 1; k++) {\n for (size_t l = j - 1; l <= j + 1; l++) {\n if (k >= 0 && k < N && l >= 0 && l < N && (k != i || l != j)) {\n num_neighbors += input[k * N + l];\n }\n }\n }\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int 
num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j] == 1) ++num_neighbors;\n if (j > 0 && input[i][j - 1] == 1) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1] == 1) ++num_neighbors;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i][j] = (num_neighbors == 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "hip", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\n__global__ void cellsXOR(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) { sum += input[ny * N + nx]; }\n }\n }\n output[y * N + x] = (sum == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = count == 1 ? 
1 : 0;\n }\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n if (x > 0) num_neighbors += input[y * N + x - 1];\n if (x < N - 1) num_neighbors += input[y * N + x + 1];\n if (y > 0) num_neighbors += input[(y - 1) * N + x];\n if (y < N - 1) num_neighbors += input[(y + 1) * N + x];\n output[y * N + x] = (num_neighbors == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int count = 0;\n if (i > 0) count += input[i - 1 + j * N];\n if (i < N - 1) count += input[i + 1 + j * N];\n if (j > 0) count += input[i + (j - 1) * N];\n if (j < N - 1) count += input[i + (j + 1) * N];\n output[i + j * N] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1 + j * N];\n if (i < N - 1) num_ones += input[i + 1 + j * N];\n if (j > 0) num_ones += input[i + (j - 1) * N];\n if (j < N - 1) num_ones += input[i + (j + 1) * N];\n output[i + j * N] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int num_ones = 0;\n if (x - 1 >= 0) num_ones += input[(x - 1) * N + y];\n if (x + 1 < N) num_ones += input[(x + 1) * N + y];\n if (y - 1 >= 0) num_ones += input[x * N + y - 1];\n if (y + 1 < N) num_ones += input[x 
* N + y + 1];\n output[x * N + y] = (num_ones == 1);\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int sum = 0;\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n int idy = blockIdx.y * blockDim.y + threadIdx.y;\n if (idx < N && idy < N) {\n int left = idx > 0 ? input[idx - 1 + idy * N] : 0;\n int right = idx < N - 1 ? input[idx + 1 + idy * N] : 0;\n int up = idy > 0 ? input[idx + (idy - 1) * N] : 0;\n int down = idy < N - 1 ? input[idx + (idy + 1) * N] : 0;\n sum = left + right + up + down;\n }\n output[idx + idy * N] = input[idx + idy * N] ? (sum == 1 ? 1 : 0) : 0;\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0)\n continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n if (x > 0) {\n count += input[y * N + x - 1];\n }\n if (x < N - 1) {\n count += input[y * N + x + 1];\n }\n if (y > 0) {\n count += input[(y - 1) * N + x];\n }\n if (y < N - 1) {\n count += input[(y + 1) * N + x];\n }\n output[y * N + x] = (count == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t num_rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * num_rows_per_rank;\n size_t end_row = std::min((rank + 1) * num_rows_per_rank, N);\n\n // Each rank computes its subset of the output.\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks.\n MPI_Gather(output.data() + start_row * N, (end_row - start_row) * N, MPI_INT,\n output.data(), (end_row - start_row) * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n std::vector rowXOR(N, 0);\n for (size_t j = 0; j < N; ++j) {\n if (input[rank * N + j] == 1) {\n rowXOR[j] = 1;\n }\n }\n\n // Reduce the XORs of all rows to rank 0\n std::vector allRowXORs(N, 0);\n MPI_Reduce(rowXOR.data(), allRowXORs.data(), N, MPI_INT, MPI_BXOR, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the final output\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int numNeighbors = 0;\n if (i > 0 && allRowXORs[i - 1] == 1) {\n ++numNeighbors;\n }\n if (i < N - 1 && allRowXORs[i + 1] == 1) {\n ++numNeighbors;\n }\n if (j > 0 && input[i * N + j - 1] == 1) {\n ++numNeighbors;\n }\n if (j < N - 1 && input[i * N + j + 1] == 1) {\n ++numNeighbors;\n }\n output[i * N + j] = (numNeighbors == 1);\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv(N * N);\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] ^= recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, (rank + 1) * nRowsPerRank);\n std::vector localInput(nRowsPerRank * N);\n std::vector localOutput(nRowsPerRank * N);\n MPI_Scatter(input.data(), N * N, MPI_INT, localInput.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int numNeighbors = 0;\n if (i > 0) numNeighbors += localInput[(i - 1) * N + j];\n if (i < N - 1) numNeighbors += localInput[(i + 1) * N + j];\n if (j > 0) numNeighbors += localInput[i * N + j - 1];\n if (j < N - 1) numNeighbors += localInput[i * N + j + 1];\n localOutput[i * N + j] = (numNeighbors == 1);\n }\n }\n MPI_Gather(localOutput.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Every rank computes the XOR of its own cells\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (my_rank == 0) {\n for (int i = 1; i < num_ranks; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] |= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], 
(end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n // Check the neighbors\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] ^= recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n std::vector local_output(N, 0);\n for (size_t i = 0; i < N; ++i) {\n int count = 0;\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n continue;\n }\n count += input[i * N + j];\n }\n local_output[i] = count == 1 ? 
1 : 0;\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_output.data(), N, MPI_INT, output.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of the output\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Each rank computes its subset of output.\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int rows_per_rank = (N + size - 1) / size;\n int start_row = i * rows_per_rank;\n int end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min((rank + 1) * chunk_size, N);\n\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min((i + 1) * chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n 
int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Gather all the results to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] ^= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own rows\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] ^= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, 
MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of neighbors that are 1.\n std::vector num_neighbors(N * N, 0);\n MPI_Allreduce(MPI_IN_PLACE, num_neighbors.data(), num_neighbors.size(), MPI_INT, MPI_SUM, comm);\n\n // Set the output to 1 if there is exactly one neighbor that is 1.\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n output[i * N + j] = (num_neighbors[i * N + j] == 1);\n }\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes its local part of the output.\n for (size_t i = my_rank; i < N; i += num_procs) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; i++) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] |= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Each rank computes its subset of the output.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks onto rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "cuda", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\n__global__ void cellsXOR(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0 && input[y * N + x - 1]) num_ones++;\n if (x < N - 1 && input[y * N + x + 1]) num_ones++;\n if (y > 0 && input[(y - 1) * N + x]) num_ones++;\n if (y < N - 1 && input[(y + 1) * N + x]) num_ones++;\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int sum = 0;\n if (x > 0) {\n sum += input[y * N + x - 1];\n }\n if (x < N - 1) {\n sum += input[y * N + x + 1];\n }\n if (y > 0) {\n sum += input[(y - 1) * N + x];\n }\n if (y < N - 1) {\n sum += input[(y + 1) * N + x];\n }\n output[y * N + x] = (sum == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[(y * N) + (x - 1)];\n if (x < N - 1) sum += input[(y * N) + (x + 1)];\n if (y > 0) sum += input[((y - 1) * N) + x];\n if (y < N - 1) sum += input[((y + 1) * N) + x];\n output[(y * N) + x] = (sum == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[(y - 1) * N + x - 1];\n if (x < N - 1) sum += input[(y - 1) * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n if (y > 0 && x > 0) sum += input[(y - 1) * N + x - 1];\n if (y > 0 && x < N - 1) sum += input[(y - 1) * 
N + x + 1];\n if (y < N - 1 && x > 0) sum += input[(y + 1) * N + x - 1];\n if (y < N - 1 && x < N - 1) sum += input[(y + 1) * N + x + 1];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n sum += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n int num_ones = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n num_ones += input[r * N + c];\n }\n }\n }\n output[row * N + col] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1] == 1) count++;\n if (x < N - 1 && input[y * N + x + 1] == 1) count++;\n if (y > 0 && input[(y - 1) * N + x] == 1) count++;\n if (y < N - 1 && input[(y + 1) * N + x] == 1) count++;\n output[y * N + x] = (count == 1) ? 
1 : 0;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1]) count++;\n if (x < N - 1 && input[y * N + x + 1]) count++;\n if (y > 0 && input[(y - 1) * N + x]) count++;\n if (y < N - 1 && input[(y + 1) * N + x]) count++;\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1]) count++;\n if (x < N - 1 && input[y * N + x + 1]) count++;\n if (y > 0 && input[(y - 1) * N + x]) count++;\n if (y < N - 1 && input[(y + 1) * N + x]) count++;\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int count = 0;\n if (i > 0) count += input[i - 1 + j * N];\n if (i < N - 1) count += input[i + 1 + j * N];\n if (j > 0) count += input[i + (j - 1) * N];\n if (j < N - 1) count += input[i + (j + 1) * N];\n output[i + j * N] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t start = rank * N / size;\n size_t end = (rank + 1) * N / size;\n\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n // Check the neighbors of input_{i,j}.\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (count == 1);\n }\n }\n\n // Gather the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start = i * N / size;\n size_t end = (i + 1) * N / size;\n for (size_t j = start; j < end; ++j) {\n for (size_t k = 0; k < N; ++k) {\n output[j * N + k] += output[i * N * N + j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n int startRow = rank * nRows;\n int endRow = startRow + nRows;\n if (rank == size - 1) {\n endRow = N;\n }\n int nCols = N;\n int nNeighbors = 0;\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < nCols; j++) {\n nNeighbors = 0;\n if (i - 1 >= 0) {\n nNeighbors += input[(i - 1) * N + j];\n }\n if (i + 1 < N) {\n nNeighbors += input[(i + 1) * N + j];\n }\n if (j - 1 >= 0) {\n nNeighbors += input[i * N + j - 1];\n }\n if (j + 1 < N) {\n nNeighbors += input[i * N + j + 1];\n }\n if (nNeighbors == 1) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int nRowsRecv = N / size;\n int startRowRecv = i * nRowsRecv;\n int endRowRecv = startRowRecv + nRowsRecv;\n if (i == 
size - 1) {\n endRowRecv = N;\n }\n for (int j = startRowRecv; j < endRowRecv; j++) {\n MPI_Recv(&output[j * N], N * sizeof(int), MPI_INT, i, 0, MPI_STATUS_IGNORE);\n }\n }\n } else {\n MPI_Send(&output[startRow * N], (endRow - startRow) * N * sizeof(int), MPI_INT, 0, 0, MPI_STATUS_IGNORE);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nThreads = omp_get_max_threads();\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, startRow + nRowsPerRank);\n std::vector localOutput(N * nRowsPerRank);\n #pragma omp parallel num_threads(nThreads)\n {\n #pragma omp for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int numNeighbors = 0;\n if (i > 0) numNeighbors += input[i - 1][j];\n if (i < N - 1) numNeighbors += input[i + 1][j];\n if (j > 0) numNeighbors += input[i][j - 1];\n if (j < N - 1) numNeighbors += input[i][j + 1];\n localOutput[i * N + j] = (numNeighbors == 1);\n }\n }\n }\n MPI_Gather(&localOutput[0], N * nRowsPerRank, MPI_INT, &output[0], N * nRowsPerRank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + num_procs - 1) / num_procs;\n size_t start_row = my_rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of rows in parallel\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check neighbors\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row 
- start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its own part of the output\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += nprocs) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n std::vector recv(N * N);\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Each rank computes its subset of the output in parallel\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the XOR of its own rows\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += nprocs) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n MPI_Recv(&output[i * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different part of the output\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[(i - 1) * N + j];\n if (i < N - 1) num_neighbors += input[(i + 1) * N + j];\n if (j > 0) num_neighbors += input[i * N + j - 1];\n if (j < N - 1) num_neighbors += input[i * N + j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all other ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N], N * N, 
MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = (N + num_ranks - 1) / num_ranks;\n int start_row = my_rank * rows_per_rank;\n int end_row = std::min(N, (my_rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (my_rank == 0) {\n for (int i = 1; i < num_ranks; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = N / num_ranks;\n int start_row = my_rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j]) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j]) ++num_neighbors;\n if (j > 0 && input[i][j - 1]) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1]) ++num_neighbors;\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all other ranks.\n if (my_rank == 0) {\n for (int rank = 1; rank < num_ranks; ++rank) {\n int 
rows_per_rank = N / num_ranks;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row][0], rows_per_rank * N, MPI_INT, rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n // Other ranks send their results to rank 0.\n MPI_Send(&output[start_row][0], rows_per_rank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local part of the output.\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Each rank computes its subset of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows.\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of the rows in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all the other ranks.\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * 
rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n std::vector local_input(input.begin() + start_row * N, input.begin() + end_row * N);\n std::vector local_output(rows_per_rank * N, 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < rows_per_rank; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += local_input[(i - 1) * N + j];\n if (i < rows_per_rank - 1) count += local_input[(i + 1) * N + j];\n if (j > 0) count += local_input[i * N + j - 1];\n if (j < N - 1) count += local_input[i * N + j + 1];\n local_output[i * N + j] = (count == 1);\n }\n }\n\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows in output.\n size_t rows_per_rank = (N + num_procs - 1) / num_procs;\n size_t start_row = my_rank * rows_per_rank;\n size_t end_row = std::min(N, (my_rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of rows in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks into output on rank 0.\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // TODO: Implement cellsXOR\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows of output\n int rows_per_rank = N / num_procs;\n int start_row = my_rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes its subset of the rows of output in parallel\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. 
Otherwise set it to 0.\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) continue;\n num_neighbors += input(ii, jj);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output(i, j) = 0;\n else\n output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { count += input(ii, jj); }\n }\n }\n count -= input(i, j);\n if (input(i, j) == 1) {\n output(i, j) = (count == 2 || count == 3) ? 
1 : 0;\n } else {\n output(i, j) = (count == 3) ? 1 : 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int numNeighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n numNeighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (numNeighbors < 2 || numNeighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (numNeighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n count += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (count == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n 
}\n } else {\n if (count == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && !(ii == i && jj == j)) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (num_neighbors == 3) { output(i, j) = 1; } else { output(i, j) = 0; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) { num_neighbors += input(x, y); }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) output(i, j) = 0;\n else output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n else output(i, j) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (num_neighbors == 3) { output(i, j) 
= 1; } else { output(i, j) = 0; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n num_neighbors += input(x, y);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output(i, j) = 0;\n else\n output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x < 0 || x >= N || y < 0 || y >= N) continue;\n num_neighbors += input(x, y);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) { count += input(ii, jj); }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 
3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (count == 3) { output(i, j) = 1; } else { output(i, j) = 0; }\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1072027195245, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.28310308232903, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.14158261511475, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07074382975698, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03575370851904, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01779580563307, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00899569597095, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00527787078172, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n count += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + 
"are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10708899572492, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.21712693739682, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10913784578443, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05466182790697, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02775379233062, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01413839850575, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007307517156, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00620611365885, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.11168986465782, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.23646708969027, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.11856161933392, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05915910322219, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02997952383012, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0152241056785, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00783517789096, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0057516688481, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10595333054662, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.28320440333337, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.14217413403094, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07106976155192, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": 
true, + "runtime": 0.03590364884585, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01763098184019, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00895971618593, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00526105146855, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10589716434479, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.28294534180313, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.14153802469373, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07132972870022, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0357168732211, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0177501834929, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00915968250483, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00526550151408, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10649929642677, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.28483780547976, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.14269405603409, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07107179109007, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03557098452002, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0180337773636, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091155955568, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00540135540068, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp 
parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n count += input[(i + ii) * N + j + jj];\n }\n }\n }\n count -= input[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 1 : 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10739330034703, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0980184385553, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05001166258007, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02510102596134, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01315474957228, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00667987335473, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00509624127299, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00502521749586, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.11099690254778, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.23683749046177, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.1189310349524, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05915134679526, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02979342918843, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01502488479018, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00810244865716, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00584045387805, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10681086182594, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2832896977663, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.14223159439862, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07150093093514, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0356283808127, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01800668053329, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00905845202506, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00520897749811, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1094853354618, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.21865439228714, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10953789446503, + "num_threads": 2 + }, + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.05492787025869, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02773459460586, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01394642069936, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00733879562467, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0056986708194, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (int)i + di;\n int jj = (int)j + dj;\n if (ii < 0 || ii >= (int)N || jj < 0 || jj >= (int)N) continue;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.11108593437821, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.21879548057914, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10923555959016, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05473903212696, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02771078739315, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01391487196088, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.007274104096, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00530316252261, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10535584446043, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.28186491485685, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.14155556373298, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07091794665903, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03580392468721, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01784231532365, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 
0.00901154913008, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00520994141698, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = num_neighbors == 2 || num_neighbors == 3;\n } else {\n output[i * N + j] = num_neighbors == 3;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1104226924479, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.23485687226057, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.11908348612487, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05889781881124, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03001757077873, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01533990930766, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00792352985591, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00555392224342, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10710647534579, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.28360114395618, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.14251652415842, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07071216702461, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03525597695261, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01787160430104, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00905489902943, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00525148343295, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 
0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10670749992132, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.28320735860616, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.14146026242524, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07074572481215, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03563808705658, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01762971244752, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00892861969769, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00523519217968, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n neighbors += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10725539345294, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.21812065411359, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10947925895452, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05489882864058, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0274790327996, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01400779057294, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00725579597056, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00579540897161, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.11091389022768, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.23527479860932, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.11849253773689, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05941817238927, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03032300546765, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01506181005388, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00835799966007, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00551737118512, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = (i + dx + N) % N;\n int y = (j + dy + N) % N;\n num_neighbors += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.11079808268696, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.218597888574, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10921422597021, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0547862611711, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02789449524134, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0138352336362, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0071806056425, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.005555671826, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10668031964451, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.28234083820134, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.14246347285807, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07120809871703, + 
"num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0360800081864, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01776936184615, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00910911727697, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00542774852365, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10989490896463, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.21791214868426, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.10934471171349, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05490245930851, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02776484303176, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01407564803958, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00723259169608, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00614040996879, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "serial", + "prompt": "/* Simulate one generation of Game of Life on `input`. 
Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (size_t di = -1; di <= 1; di++) {\n for (size_t dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ni = i + di;\n int nj = j + dj;\n if (ni < 0 || ni >= N || nj < 0 || nj >= N) {\n continue;\n }\n num_neighbors += input[ni * N + nj];\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int x = -1; x <= 1; ++x) {\n for (int y = -1; y <= 1; ++y) {\n if (x == 0 && y == 0) continue;\n int nx = i + x;\n int ny = j + y;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[nx * N + ny];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if 
(ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = num_neighbors == 2 || num_neighbors == 3 ? 1 : 0;\n } else {\n output[i * N + j] = num_neighbors == 3 ? 1 : 0;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t x = i - 1; x <= i + 1; ++x) {\n for (size_t y = j - 1; y <= j + 1; ++y) {\n if (x >= 0 
&& x < N && y >= 0 && y < N && (x != i || y != j)) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n neighbors += input[(i + ii) * N + j + jj];\n }\n }\n }\n neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n size_t num_neighbors = 0;\n for (size_t x = i - 1; x <= i + 1; ++x) {\n for (size_t y = j - 1; y <= j + 1; ++y) {\n if (x >= 0 && x < N && y >= 0 && y < N && (x != i || y != j)) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int x = i + di;\n int y = j + dj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "hip", + "prompt": "/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\n__global__ void gameOfLife(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n num_neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && 
neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) return;\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n count -= input[y * N + x];\n if (input[y * N + x] == 1) {\n output[y * 
N + x] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[y * N + x] = (count == 3) ? 1 : 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0)\n continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output[y * N + x] = 0;\n else\n output[y * N + x] = 1;\n } else {\n if (num_neighbors == 3)\n output[y * N + x] = 1;\n else\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipThreadIdx().x;\n int y = hipThreadIdx().y;\n int neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = (x + i + N) % N;\n int neighbor_y = (y + j + N) % N;\n neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n if (input[y * N + x] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny 
< N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n if (x + i >= 0 && x + i < N && y + j >= 0 && y + j < N) {\n count += input[(x + i) * N + y + j];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (count < 2 || count > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (count == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. 
Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n_rows = N / size;\n int n_cols = N;\n int n_elements = n_rows * n_cols;\n std::vector local_input(n_elements);\n std::vector local_output(n_elements);\n MPI_Scatter(input.data(), n_elements, MPI_INT, local_input.data(), n_elements, MPI_INT, 0, MPI_COMM_WORLD);\n for (int i = 0; i < n_rows; i++) {\n for (int j = 0; j < n_cols; j++) {\n int n_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int ii_global = i + ii;\n int jj_global = j + jj;\n if (ii_global >= 0 && ii_global < n_rows && jj_global >= 0 && jj_global < n_cols) {\n n_neighbors += local_input[ii_global * n_cols + jj_global];\n }\n }\n }\n if (local_input[i * n_cols + j] == 1) {\n if (n_neighbors < 2 || n_neighbors > 3) {\n local_output[i * n_cols + j] = 0;\n } else {\n local_output[i * n_cols + j] = 1;\n }\n } else {\n if (n_neighbors == 3) {\n local_output[i * n_cols + j] = 1;\n } else {\n local_output[i * n_cols + j] = 0;\n }\n }\n }\n }\n MPI_Gather(local_output.data(), n_elements, MPI_INT, output.data(), n_elements, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of live neighbors for each cell.\n std::vector counts(N * N, 0);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n counts[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n }\n\n // Update the output based on the counts.\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = counts[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 
1 : 0;\n }\n }\n }\n\n // Gather the results on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recv_buf(N * N);\n MPI::COMM_WORLD.Recv(recv_buf.data(), N * N, MPI::INT, i, 0);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(output.data(), N * N, MPI::INT, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks.\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Compute the next generation for the subset of rows assigned to this rank\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = static_cast(i) + di;\n int jj = static_cast(j) + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = N / size;\n\n // Compute the start and end rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = (rank + 1) * rows_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(N * rows_per_rank);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n\n // Check the neighbors of the current cell\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n\n // Check if the neighbor is within the grid\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), N * rows_per_rank, MPI_INT,\n output.data(), N * rows_per_rank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of rows\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes the next generation for its subset of rows\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N && input[ni * N + nj]) {\n num_neighbors++;\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j]) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n for (size_t i = 
start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_input(input.begin() + rank * N * N, input.begin() + (rank + 1) * N * N);\n std::vector local_output(N * N, 0);\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += local_input[ii * N + jj];\n }\n }\n }\n if (local_input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n local_output[i * N + j] = 1;\n }\n }\n }\n }\n\n MPI_Gather(local_output.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output.\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the 8 neighbors of the cell.\n for 
(int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life.\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks to rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of the rows.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = static_cast(i) + di;\n int neighbor_j = static_cast(j) + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n if (rank != 0) {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n MPI_Recv(&output[start_row * N], rows_per_rank * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * N);\n\n // Compute the local results\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n\n // Check the neighbors of the current cell\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n\n // Check if the neighbor is within the grid\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT, output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (size_t ii = std::max(0, (int)i - 1); ii <= std::min((int)N - 1, (int)i + 1); ii++) {\n for (size_t jj = std::max(0, (int)j - 1); jj <= std::min((int)N - 1, (int)j + 1); jj++) {\n if (input[ii * N + jj] == 1) {\n num_neighbors++;\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of live neighbors for each cell.\n std::vector neighbor_counts(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t di = -1; di <= 1; di++) {\n for (size_t dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n neighbor_counts[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Update the output based on the neighbor counts.\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbor_counts[i * N + j] == 2 || neighbor_counts[i * N + j] == 3);\n } else {\n output[i * N + j] = (neighbor_counts[i * N + j] == 3);\n }\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbors == 2 || neighbors == 3);\n } else {\n output[i * N + j] = (neighbors == 3);\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n std::vector next_gen(N * N);\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j]) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of the output\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, 
MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = N / size;\n\n // Compute the starting and ending rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = (rank + 1) * rows_per_rank;\n\n // Compute the number of columns each rank is responsible for\n int cols_per_rank = N / size;\n\n // Compute the starting and ending columns for this rank\n int start_col = rank * cols_per_rank;\n int end_col = (rank + 1) * cols_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * N, 0);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = start_col; j < end_col; j++) {\n int num_neighbors = 0;\n // Check the neighbors of the current cell\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n\n // Apply the rules of the game of life\n if (input[i][j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT, output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "cuda", + "prompt": "/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\n__global__ void gameOfLife(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if 
(input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n 
}\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = threadIdx.x;\n int y = threadIdx.y;\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = (x + i + N) % N;\n int neighbor_y = (y + j + N) % N;\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "_", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n num_neighbors -= input[x * N + y];\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = 
blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) {\n return;\n }\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the rank of the current process.\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the number of rows that each process will handle.\n int rows_per_process = N / MPI_Comm_size(MPI_COMM_WORLD);\n\n // Compute the starting and ending rows for the current process.\n int start_row = rank * rows_per_process;\n int end_row = start_row + rows_per_process;\n\n // Compute the number of columns in the input grid.\n int num_cols = input.size() / N;\n\n // Create a buffer to store the results for the current process.\n std::vector buffer(N * num_cols);\n\n // Iterate over the rows assigned to the current process.\n for (int i = start_row; i < end_row; ++i) {\n // Iterate over the columns in the input grid.\n for (int j = 0; j < num_cols; ++j) {\n // Compute the number of live neighbors for the current cell.\n int num_live_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n // Skip the current cell.\n if (ii == i && jj == j) {\n continue;\n }\n\n // Check if the neighbor cell is alive.\n if (input[ii * num_cols + jj] == 1) {\n ++num_live_neighbors;\n }\n }\n }\n\n // Apply the rules of the Game of Life to the current cell.\n if (input[i * num_cols + j] == 1) {\n if (num_live_neighbors < 2 || num_live_neighbors > 3) {\n buffer[i * num_cols + j] = 0;\n } else {\n buffer[i * num_cols + j] = 1;\n }\n } else {\n if (num_live_neighbors == 3) {\n buffer[i * num_cols + j] = 1;\n } else {\n buffer[i * num_cols + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all processes.\n MPI_Gather(buffer.data(), N * num_cols, MPI_INT, output.data(), N * num_cols, MPI_INT, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = (N + size - 1) / size;\n int start = rank * local_size;\n int end = std::min(start + local_size, N);\n\n std::vector local_input(input.begin() + start * N, input.begin() + end * N);\n std::vector local_output(local_input.size());\n\n #pragma omp parallel for\n for (int i = 0; i < local_input.size(); i++) {\n int x = i / N;\n int y = i % N;\n int count = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int nx = x + dx;\n int ny = y + dy;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += local_input[nx * N + ny];\n }\n }\n }\n if (local_input[i] == 1) {\n if (count < 2 || count > 3) {\n local_output[i] = 0;\n } else {\n local_output[i] = 1;\n }\n } else {\n if (count == 3) {\n local_output[i] = 1;\n } else {\n local_output[i] = 0;\n }\n }\n }\n\n MPI_Gather(local_output.data(), local_output.size(), MPI_INT, output.data(), local_output.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of the rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute 
the next generation for the subset of rows\n std::vector next_gen(N * N, 0);\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Compute the next generation for the subset of rows\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n // Count the number of live neighbors\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N && input[x * N + y]) count++;\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j]) {\n if (count < 2 || count > 3) output[i * N + j] = 0;\n else output[i * N + j] = 1;\n } else {\n if (count == 3) output[i * N + j] = 1;\n else output[i * N + j] = 0;\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a chunk of the output\n size_t chunk_size = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min((rank + 1) * chunk_size, N);\n\n // Compute the chunk of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t ii = std::max(0, (int)i - 1); 
ii <= std::min(N - 1, (int)i + 1); ++ii) {\n for (size_t jj = std::max(0, (int)j - 1); jj <= std::min(N - 1, (int)j + 1); ++jj) {\n num_neighbors += input[ii * N + jj];\n }\n }\n num_neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min((i + 1) * chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of live neighbors for each cell.\n std::vector neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Update the state of each cell.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (input[i * N + j] == 1) {\n if (neighbors[i * N + j] < 2 || neighbors[i * N + j] > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors[i * N + j] == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = N / nprocs;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Compute the next generation for the subset of rows\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int ii = i - 
1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize output to 0\n output.assign(N * N, 0);\n\n // Get rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Divide the grid among the processes\n int rows_per_proc = N / nprocs;\n int start_row = rank * rows_per_proc;\n int end_row = start_row + rows_per_proc;\n\n // Compute the next generation for each row in parallel\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n // Count the number of live neighbors\n int num_neighbors = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n if (k == 0 && l == 0) continue;\n int neighbor_row = i + k;\n int neighbor_col = j + l;\n if (neighbor_row >= 0 && neighbor_row < N && neighbor_col >= 0 && neighbor_col < N) {\n num_neighbors += input[neighbor_row * N + neighbor_col];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n }\n }\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[start_row * N], rows_per_proc * N, MPI_INT,\n &output[0], rows_per_proc * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n #pragma omp parallel for\n for (int i = 0; i < N; ++i) {\n for (int j = 0; j < N; ++j) {\n int count = 0;\n // Count the number of live neighbors\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input[x * N + y];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n for (int di = -1; di <= 1; 
di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its subgrid's neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbors == 2 || neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (neighbors == 3) ? 1 : 0;\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n\n std::vector localInput(nRows * N);\n std::vector localOutput(nRows * N);\n\n MPI_Scatter(input.data(), nRows * N, MPI_INT, localInput.data(), nRows * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < nRows; i++) {\n for (int j = 0; j < N; j++) {\n int count = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int ni = i + ii;\n int nj = j + jj;\n if (ni >= 0 && ni < nRows && nj >= 0 && nj < N) {\n count += localInput[ni * N + nj];\n }\n }\n }\n if (localInput[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n localOutput[i * N + j] = 0;\n } else {\n localOutput[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n localOutput[i * N + j] = 1;\n } else {\n localOutput[i * N + j] = 0;\n }\n }\n }\n }\n\n MPI_Gather(localOutput.data(), nRows * N, MPI_INT, output.data(), nRows * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes the number of live neighbors for each cell in its subset of rows\n std::vector neighbor_counts(N * N, 0);\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n // Count the number of live neighbors for cell (i, j)\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n count += input[ii * N + jj];\n }\n }\n }\n neighbor_counts[i * N + j] = count;\n }\n }\n\n // Each rank updates its subset of rows of the output based on the neighbor counts\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int count = neighbor_counts[i * N + j];\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks into the output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, 
MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows assigned to this rank\n std::vector next_gen(N * N);\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = static_cast(i) + di;\n int neighbor_j = static_cast(j) + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks into the output on rank 0\n MPI_Gather(next_gen.data(), N * rows_per_rank, MPI_INT,\n output.data(), N * rows_per_rank, MPI_INT,\n 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its portion of the grid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Count the number of live neighbors for cell (i, j).\n int neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n neighbors += input[ii * N + jj];\n }\n }\n }\n local_neighbors[i * N + j] = neighbors;\n }\n }\n\n // Each rank sends its portion of the neighbor counts to rank 0.\n std::vector global_neighbors(N * N);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell based on the number of live neighbors.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n", + "_t", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a chunk of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Compute the next generation for the assigned rows\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || 
num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank != 0) {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the next generation for the subset of rows\n std::vector next_gen(N * N, 0);\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n next_gen[i * N + j] = 0;\n } else {\n next_gen[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n next_gen[i * N + j] = 1;\n } else {\n next_gen[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its local grid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Reduce(local_neighbors.data(), global_neighbors.data(), N * N, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(Kokkos::View &input, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n KokConfigService::instance().set_num_teams(1);\n KokConfigService::instance().set_num_nodes(1);\n KokConfigService::instance().set_num_ranks_per_node(1);\n KokConfigService::instance().set_num_devices_per_rank(1);\n KokConfigService::instance().set_device_type(\"CPU\");\n\n Kok\u30b3\u30af::TeamPolicy policy(Kok\u30b3\u30af::TeamThreadLayout::RowMajor);\n Kok\u30b3\u30af::Team team(policy);\n\n team.execute_task([&](Kok\u30b3\u30af::TeamMember rank) {\n const int num_elements = input.extent(0);\n for (int i = 1; i < num_elements - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n });\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n using Kokkos::TeamPolicy;\n using Kokkos::WorkSpace;\n\n const int width = input.extent(0);\n const int height = input.extent(1);\n\n Kokkos::TeamPolicy policy(Kokkos::RangePolicy(0, width * height));\n Kokkos::WorkSpace ws;\n\n parallel_for(policy, KOKKOS_LAMBDA(const int i) {\n const int x = i % width;\n const int y = i / width;\n\n if (x == 0 || x == width - 1 || y == 0 || y == height - 1) {\n output(x, y) = input(x, y);\n } else {\n output(x, y) = (input(x - 1, y) + input(x + 1, y) + input(x, y - 1) + input(x, y + 1)) / 4.0;\n }\n }, ws);\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kok c\u00f3c::parallel_for(\"jacobi1D\", 0, output.extent(0), KOK c\u00f3c::RangePolicy>(KOK c\u00f3c::get_num_threads()),\n KOK c\u00f3c::KOK c\u00f3c_LAMBDA(int64_t i) {\n if (i == 0) {\n output[i] = input[i] / 3;\n } else if (i == output.extent(0) - 1) {\n output[i] = input[i] / 3;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n const auto N = input.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == N - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy<>(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 
2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n const auto N = input.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = 0;\n } else if (i == N - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", input.size(), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n const auto exec_space = Kokkos::DefaultExecutionSpace();\n const auto exec_policy = Kokkos::RangePolicy(0, input.extent(0));\n Kokkos::parallel_for(exec_policy, KOKKOS_LAMBDA(const int &i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(input.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n double left = i > 0 ? input(i - 1) : 0;\n double center = input(i);\n double right = i < input.extent(0) - 1 ? input(i + 1) : 0;\n output(i) = (left + center + right) / 3;\n });\n}\n", + "Kokkoss vector space.", + "\n\n Kokkos::parallel_for(input.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n const int num_elements = input.extent(0);\n Kokkos::parallel_for(num_elements, KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == num_elements - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n });\n}\n", + "\n\n const int num_elements = input.extent(0);\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::Range(0, num_elements), [=](int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == num_elements - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", 
Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02931564170867, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02176151294261, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0143731802702, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01303356923163, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01290411558002, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01044697929174, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00942694563419, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00890686288476, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 1; i < static_cast(input.size()) - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02921298481524, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02164304945618, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01443631667644, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01315251495689, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01300943177193, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01046826485544, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00951589904726, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00886247679591, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n output[0] = (input[0] + input[1]) / 2;\n 
output[input.size() - 1] = (input[input.size() - 2] + input[input.size() - 1]) / 2;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0292850619182, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02177582774311, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01438649296761, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01298805717379, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01269348133355, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01039824746549, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00949567332864, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00883125849068, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02869664877653, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02167202252895, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01432879902422, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01305918153375, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0130008911714, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01040270160884, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00948586240411, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00887290909886, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02919347789139, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02177228722721, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01451027002186, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01343092061579, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01314293853939, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01055831667036, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00953812059015, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00888100173324, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": 
true, + "are_all_valid": true, + "best_sequential_runtime": 0.02924491427839, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02191075775772, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02191491369158, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02193718198687, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02188880015165, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02193889655173, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02188890818506, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02196401357651, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02926506269723, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02186462488025, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01474563851953, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01313174795359, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.012940197438, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01064459681511, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00950409248471, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00888722315431, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0295376168564, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02183308731765, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01457730401307, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01298963259906, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01294051371515, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01056892685592, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00948949679732, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00882035978138, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02926467973739, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02174013853073, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.01424553412944, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01304776798934, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01292489208281, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01031351219863, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00934006813914, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00889636464417, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02908456716686, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02166448831558, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0149397784844, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01302034240216, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01270996984094, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01020010244101, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00953781791031, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00877391658723, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02924125362188, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02171569224447, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01477211713791, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01279006097466, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01275944095105, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01019119750708, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00942757893354, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00884888712317, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0294308822602, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02179254554212, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01427307780832, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01304298918694, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01268632952124, + 
"num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01025081593543, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00935529023409, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00885570198298, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02947246711701, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02183821890503, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01413008868694, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01319064032286, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01285885423422, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0103816119954, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00935477204621, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00888936780393, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02914570029825, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02151908259839, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01433362588286, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01320541817695, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01266359090805, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0101585585624, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00931551326066, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00881228484213, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02978242635727, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02177471201867, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01428676173091, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01310405321419, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01281618401408, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01038211323321, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00938510671258, + "num_threads": 32 + }, + { 
+ "did_run": true, + "is_valid": true, + "runtime": 0.00886310804635, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02954687699676, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02176174093038, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01432453859597, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01307018827647, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01271079145372, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01039387527853, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00938337929547, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0084837699309, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02949707917869, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02201065998524, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01430693324655, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01338446103036, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01300859171897, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01058660969138, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00947559494525, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00889343954623, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02916833646595, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02185778003186, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0146086614579, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01339676249772, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0130239488557, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01049521975219, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00961843580008, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00880569256842, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 1; i < static_cast(input.size()) - 1; ++i) {\n 
output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02942003402859, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02191524468362, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01449895091355, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01369106601924, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01300167497247, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01044131405652, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00956856757402, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00879294257611, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.02939277887344, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02186888456345, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01456278301775, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01327315401286, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01278252247721, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01073977667838, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00942798517644, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00884556919336, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "serial", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (int i = 0; i < input.size(); i++) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i + 1 < input.size() ? input[i + 1] : 0);\n output[i] = output[i] / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? 
input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n output[0] = (input[0] + input[1]) / 2;\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n output[input.size() - 1] = (input[input.size() - 2] + input[input.size() - 1]) / 2;\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < input.size() - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); ++i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = (i > 0) ? input[i - 1] : 0;\n double right = (i < input.size() - 1) ? 
input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (int i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n for (size_t i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "hip", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\n__global__ void jacobi1D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n output[idx] = (idx > 0 ? input[idx - 1] : 0) + input[idx] + (idx < N - 1 ? input[idx + 1] : 0);\n output[idx] /= 3;\n }\n}\n", + "\n int i = hipThreadIdx_x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int tid = hipGetGlobalThreadIdx().x;\n if (tid < N) {\n output[tid] = (input[tid - 1] + input[tid] + input[tid + 1]) / 3;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n\n if (i < N) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < N - 1 ? 
input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] +\n (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3.0;\n }\n}\n", + "\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = (tid > 0 ? input[tid - 1] : 0) + input[tid] +\n (tid < N - 1 ? input[tid + 1] : 0);\n output[tid] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\nfor (size_t i = 1; i < N - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_output(local_n);\n\n for (int i = start; i < end; i++) {\n if (i == 0) {\n local_output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == n - 1) {\n local_output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n local_output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * local_size], local_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], local_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n 
/ size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n output.resize(n);\n\n MPI_Datatype vec_type;\n MPI_Type_vector(n, 1, 1, MPI_DOUBLE, &vec_type);\n MPI_Type_commit(&vec_type);\n\n MPI_Allgather(&input[0], 1, vec_type, &output[0], 1, vec_type, MPI_COMM_WORLD);\n\n for (int i = 1; i < n - 1; i++) {\n output[i] = (output[i - 1] + output[i] + output[i + 1]) / 3;\n }\n\n MPI_Type_free(&vec_type);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int remainder = input.size() % size;\n\n std::vector local_input(local_size + (rank < remainder ? 1 : 0));\n std::vector local_output(local_size);\n\n MPI_Scatter(input.data(), local_size + (rank < remainder ? 1 : 0), MPI_DOUBLE, local_input.data(), local_size + (rank < remainder ? 
1 : 0), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n for (int i = 0; i < local_size; i++) {\n local_output[i] = (local_input[std::max(0, i - 1)] + local_input[std::min(i + 1, local_size - 1)] + local_input[i]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * chunk], chunk, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n for (int i = start; i < end; i++) {\n double left = (i > 0) ? input[i - 1] : 0;\n double right = (i < n - 1) ? input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * local_n;\n int end = start + local_n;\n MPI_Recv(&output[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], local_n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = (n + size - 1) / size;\n int start = rank * chunk_size;\n int end = std::min(start + chunk_size, n);\n\n for (int i = start; i < end; i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double right = i < n - 1 ? 
input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * chunk_size;\n int end = std::min(start + chunk_size, n);\n MPI_Recv(&output[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_size = n / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n output.resize(n);\n\n // Compute the local part of the output\n for (int i = 1; i < n - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n\n // Send the local part of the output to rank 0\n MPI_Gather(&output[1], n - 2, MPI_DOUBLE, &output[1], n - 2, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Replace the local part of the output with 0 on all ranks except rank 0\n if (rank != 0) {\n for (int i = 1; i < n - 1; i++) {\n output[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; ++i) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n 
std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n local_output[0] = (local_input[0] + local_input[1]) / 2;\n local_output[local_n - 1] = (local_input[local_n - 2] + local_input[local_n - 1]) / 2;\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "cuda", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\n__global__ void jacobi1D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n } else {\n output[i] = 0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = (tid > 0 ? input[tid - 1] : 0) + input[tid] + (tid < N - 1 ? 
input[tid + 1] : 0);\n output[tid] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n #pragma omp parallel for\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n std::vector local_output(local_n);\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (input[start + i - 1] + input[start + i] + input[start + i + 1]) / 3.0;\n }\n MPI_Gather(&local_output[0], local_n, MPI_DOUBLE, &output[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n\n std::vector local_output(chunk_size);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n local_output[i] = (input[i] + input[i + 1]) / 3;\n } else if (i == n - 1) {\n local_output[i] = (input[i - 1] + input[i]) / 3;\n } else {\n local_output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(local_output.data(), chunk_size, MPI_DOUBLE, output.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n\n #pragma parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == input.size() - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), chunk_size, MPI_DOUBLE, output.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per rank\n int elements_per_rank = input.size() / size;\n\n // Compute the start and end indices for this rank\n int start_index = rank * elements_per_rank;\n int end_index = 
(rank + 1) * elements_per_rank;\n\n // Compute the local output\n std::vector local_output(elements_per_rank);\n #pragma omp parallel for\n for (int i = 0; i < elements_per_rank; i++) {\n int global_index = start_index + i;\n if (global_index == 0 || global_index == input.size() - 1) {\n local_output[i] = 0;\n } else {\n local_output[i] = (input[global_index - 1] + input[global_index] + input[global_index + 1]) / 3;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(input.size());\n for (int i = 0; i < size; i++) {\n MPI_Recv(&output[i * elements_per_rank], elements_per_rank, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], elements_per_rank, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n std::vector local_output(local_n);\n\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (input[start + i - 1] + input[start + i] + input[start + i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n // Compute the local part of the output\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n // Compute the local part of the output\n #pragma omp parallel for\n for (int i = start; i < end; ++i) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n 
MPI_Gather(output.data() + start, chunk, MPI_DOUBLE, output.data(), chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = (n + size - 1) / size;\n int start = rank * local_n;\n int end = std::min(start + local_n, n);\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "kokkos", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images.\n Store the output of the computation in imageOut.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(Kokkos::View &imageIn, Kokkos::View &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(N, N), [=](int i, int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn(ii, jj) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::min(255, std::max(0, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_clamped = std::clamp(i + ii, 0, N - 1);\n int jj_clamped = std::clamp(j + jj, 0, N - 1);\n sum += imageIn(ii_clamped, jj_clamped) * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut(i, j) = std::clamp(sum, -128, 127);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_in = i + ii;\n int jj_in = j + jj;\n if (ii_in >= 0 && ii_in < N && jj_in >= 0 && jj_in < N) {\n sum += imageIn(ii_in, jj_in) * edgeKernel[ii + 1][jj + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n int val = 0;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n val = imageIn(ii, jj);\n }\n sum += edgeKernel[ki + 1][kj + 1] * val;\n }\n }\n imageOut(i, j) = std::max(0, std::min(sum, 255));\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::min(255, std::max(0, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_clamped = std::clamp(i + ii, 0, N - 1);\n int jj_clamped = std::clamp(j + jj, 0, N - 1);\n sum += imageIn(ii_clamped, jj_clamped) * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut(i, j) = std::clamp(sum, -128, 127);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n });\n}\n", + "\n\n Kok\uf09e::parallel_for(Kok\uf09e::Range(0, N), [&](const Kok\uf09e::Range &r) {\n for (size_t i = r.begin(); i < r.end(); ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n size_t ii = i + k - 1;\n size_t jj = j + l - 1;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn(ii, jj) * edge[k][l];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(0, N, [&](size_t i) {\n Kok\u30b3\u30af::parallel_for(0, N, [&](size_t j) {\n int sum = 0;\n for (int k = 0; k < 3; k++) {\n for (int l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edge[k][l];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "omp", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. 
Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.25476049017161, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.29935029577464, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.14870071318001, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07521626390517, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03766024634242, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01889094468206, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00977764558047, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0062666118145, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "_", + "source_write_success": 
true, + "did_build": false, + "is_source_valid": false, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.25880975313485, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.23779655843973, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.11871429067105, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.06024519056082, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03026212882251, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01543531287462, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00778030063957, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00580157898366, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.27277222629637, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.29596624858677, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.14816721323878, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07457960173488, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03770211283118, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0192238887772, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01023583579808, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00541126988828, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n if (i + k < 0 || i + k >= N || j + l < 0 || j + l >= N) {\n sum += 0;\n } else {\n sum += imageIn[(i + k) * N + (j + l)] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + 
"are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2724604761228, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.18311075791717, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0932343037799, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.04580040350556, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02373109981418, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01196532640606, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00617982633412, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00598978064954, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.26940703094006, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.29560578130186, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.15154648050666, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07373788580298, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03812203109264, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01943013072014, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01005382724106, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00638153571635, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.27321366462857, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.29765411932021, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.14801808837801, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07793563008308, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03758262973279, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01913843229413, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00976324602962, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00564656071365, + "num_threads": 64 + } + 
] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x >= 0 && x < static_cast(N) && y >= 0 && y < static_cast(N)) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.27343044299632, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.29542518854141, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.14832498822361, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07620273660868, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03880745563656, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0191825715825, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01063235960901, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0055520568043, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.28308602068573, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.29783999957144, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.14851406980306, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07452234420925, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03713465649635, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01846036203206, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0092551400885, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00528293550014, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n 
for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.25938312020153, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.23932335805148, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.12028076089919, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05975824445486, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0301641844213, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01528007276356, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0078187091276, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00558115970343, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x >= 0 && x < static_cast(N) && y >= 0 && y < static_cast(N)) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.25585272535682, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.22060375735164, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.12331464458257, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05873673781753, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03116723448038, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01634272448719, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00796249564737, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00583218801767, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.25848845951259, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.28502520751208, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": 
true, + "runtime": 0.14389527998865, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07527586705983, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0381956635043, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01891770921648, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00991114079952, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00560161676258, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += imageIn[(i + k) * N + (j + l)] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma opm parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + (j + l)] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": false, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null, + "num_threads": 1 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.27036961819977, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.30396748837084, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.15396569631994, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.07642276696861, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03934581074864, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.02040318120271, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01003423631191, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00575315933675, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j 
+ l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.25860407911241, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.23764454927295, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.12024059388787, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.05993138290942, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03033314272761, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0153548059985, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00787527114153, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00567260570824, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.28217682503164, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.29728792589158, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.14839324671775, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0744327198714, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.03704970534891, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01845544949174, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00934598296881, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00544454865158, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "serial", + "prompt": "const int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. 
Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (int i = 1; i < N - 1; i++) {\n for (int j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n sum += imageIn[(i + ii) * N + (j + jj)] * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = (int)i + k - 1;\n 
int y = (int)j + l - 1;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t x = 0; x < 3; x++) {\n for (size_t y = 0; y < 3; y++) {\n int ii = i + x - 1;\n int jj = j + y - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[x][y];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t y = 0; y < N; ++y) {\n for (size_t x = 0; x < N; ++x) {\n int sum = 0;\n for (int ky = -1; ky <= 1; ++ky) {\n for (int kx = -1; kx <= 1; ++kx) {\n int pixelX = x + kx;\n int pixelY = y + ky;\n if (pixelX >= 0 && pixelX < N && pixelY >= 0 && pixelY < N) {\n sum += imageIn[pixelY * N + pixelX] * edgeKernel[ky + 1][kx + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = (int)i + k - 1;\n int y = (int)j + l - 1;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n for (size_t y = 0; y < N; ++y) {\n for (size_t x = 0; x < N; ++x) {\n int sum = 0;\n for (int ky = -1; ky <= 1; ++ky) {\n for (int kx = -1; kx <= 1; ++kx) {\n int inX = x + kx;\n int inY = y + ky;\n if (inX >= 0 && inX < N && inY >= 0 && inY < N) {\n sum += imageIn[inY * N + inX] * edgeKernel[ky + 1][kx + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; 
++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii < 0 || ii >= static_cast(N) || jj < 0 || jj >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "hip", + "prompt": "__constant__ int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\n__global__ void convolveKernel(const int *imageIn, int *imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\nint sum = 0;\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = blockIdx.x + i;\n int y = blockIdx.y + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n sum = max(0, min(255, sum));\n imageOut[blockIdx.y * N + blockIdx.x] = sum;\n}\n", + "\nint sum = 0;\nint index = hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x;\nint x = index % N;\nint y = index / N;\nfor (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (x + i >= 0 && x + i < N && y + j >= 0 && y + j < N) {\n sum += imageIn[(y + j) * N + x + i] * edgeKernel[i + 1][j + 1];\n }\n }\n}\nimageOut[index] = __clamp(sum, 0, 255);\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x2 = x + i;\n int y2 = y + j;\n if (x2 >= 0 && x2 < N && y2 >= 0 && y2 < N) {\n sum += imageIn[y2 * N + x2] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y 
< N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = min(255, max(0, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX + neighborY * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = std::max(0, std::min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ < 0 || x_ >= N || y_ < 0 || y_ >= N)\n sum += 0;\n else\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int dotProduct = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n dotProduct += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = dotProduct > 
255 ? 255 : (dotProduct < 0 ? 0 : dotProduct);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = 0; i < 3; i++) {\n for (int j = 0; j < 3; j++) {\n int x_offset = x + i - 1;\n int y_offset = y + j - 1;\n if (x_offset >= 0 && x_offset < N && y_offset >= 0 && y_offset < N) {\n sum += imageIn[y_offset * N + x_offset] * edge[i][j];\n }\n }\n }\n imageOut[y * N + x] = __clamp(sum, 0, 255);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_index = x + i;\n int y_index = y + j;\n if (x_index >= 0 && x_index < N && y_index >= 0 && y_index < N) {\n sum += imageIn[y_index * N + x_index] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[x_idx + y_idx * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = max(0, min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX + neighborY * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = hipMin(hipMax(sum, 0), 255);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "mpi", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of imageIn. 
The result is stored in imageOut on rank 0.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image.\n size_t chunkSize = N / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = (rank + 1) * chunkSize;\n\n // Compute the convolution for the assigned rows.\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = (i + 1) * chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n size_t chunkSize = N / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = (rank + 1) * chunkSize;\n\n // Compute the convolution for the assigned rows\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = (i + 1) * chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows and columns in the image\n size_t numRows = imageIn.size() / N;\n size_t numCols = N;\n\n // Compute the number of rows and columns in the output image\n size_t outNumRows = numRows - 2;\n size_t outNumCols = numCols - 2;\n\n // Compute the number of rows and columns in the output image for each rank\n size_t outNumRowsPerRank = outNumRows / size;\n size_t outNumColsPerRank = outNumCols;\n\n // Compute the starting row and column for each rank\n size_t startRow = rank * outNumRowsPerRank;\n size_t startCol = 0;\n\n // Compute the ending row and column for each rank\n size_t endRow = startRow + outNumRowsPerRank;\n size_t endCol = startCol + outNumColsPerRank;\n\n // Allocate memory for the 
output image\n std::vector outImage(outNumRowsPerRank * outNumColsPerRank);\n\n // Compute the convolution for each pixel in the output image\n for (size_t i = startRow; i < endRow; i++) {\n for (size_t j = startCol; j < endCol; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int row = i + k - 1;\n int col = j + l - 1;\n if (row >= 0 && row < numRows && col >= 0 && col < numCols) {\n sum += imageIn[row * numCols + col] * edgeKernel[k][l];\n }\n }\n }\n outImage[i * outNumColsPerRank + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the output images from all ranks\n MPI_Gather(outImage.data(), outNumRowsPerRank * outNumColsPerRank, MPI_INT,\n imageOut.data(), outNumRowsPerRank * outNumColsPerRank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the output image.\n std::vector localImageOut(N * N / size);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the local results into the output image on rank 0.\n if (rank == 0) {\n imageOut.resize(N * N);\n MPI_Gather(localImageOut.data(), N * N / size, MPI_INT, imageOut.data(), N * N / size, MPI_INT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(localImageOut.data(), N * N / size, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localN = N / size;\n int localStart = rank * localN;\n int localEnd = localStart + localN;\n\n std::vector localImageOut(localN * N);\n\n for (int i = localStart; i < localEnd; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), localN * N, MPI_INT, imageOut.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows in imageOut\n size_t rowsPerRank = N / size;\n size_t startRow = rank * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks and store them in imageOut on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n for (size_t j = startRow; j < endRow; ++j) {\n MPI_Recv(&imageOut[j * N], N, MPI_INT, i, 0, MPI_COMM_WORLD, 
MPI_STATUS_IGNORE);\n }\n }\n } else {\n MPI_Send(&imageOut[startRow * N], N * rowsPerRank, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image\n int rowsPerRank = (N + size - 1) / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min(startRow + rowsPerRank, N);\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = std::min(startRow + rowsPerRank, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the assigned rows\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "{", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n int chunkSize = N / size;\n int startRow = rank * chunkSize;\n int endRow = startRow + chunkSize;\n\n // Pad the input image with zeros to handle edge cases\n std::vector paddedImage(N * N);\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n paddedImage[i * N + j] = imageIn[i * N + j];\n }\n }\n\n // Compute the convolution for the assigned rows\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int neighborRow = i + k;\n int neighborCol = j + l;\n if (neighborRow >= 0 && neighborRow < N && neighborCol >= 0 && neighborCol < N) {\n sum += paddedImage[neighborRow * N + neighborCol] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * chunkSize;\n int endRow = 
startRow + chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel's neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = i + k;\n int neighborY = j + l;\n // Handle pixels outside the image boundaries\n if (neighborX < 0 || neighborX >= N || neighborY < 0 || neighborY >= N) {\n sum += 0;\n } else {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of imageOut\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks into imageOut on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localN = N / size;\n int localStart = rank * localN;\n int localEnd = localStart + localN;\n\n std::vector localImageIn(localN * N);\n std::vector localImageOut(localN * N);\n\n MPI_Scatter(imageIn.data(), N * N, MPI_INT, localImageIn.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n for (int i = localStart; i < localEnd; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += localImageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n 
MPI_Gather(localImageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localSize = N / size;\n int remainder = N % size;\n int startIndex = rank * localSize;\n int endIndex = startIndex + localSize;\n if (rank == size - 1) {\n endIndex += remainder;\n }\n\n std::vector localImageIn(localSize * N);\n std::vector localImageOut(localSize * N);\n\n MPI_Scatter(imageIn.data(), N * N, MPI_INT, localImageIn.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n for (int i = startIndex; i < endIndex; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += localImageIn[(x + y * N)];\n }\n }\n }\n localImageOut[i * N + j] = std::min(255, std::max(0, sum * edgeKernel[k + 1][l + 1]));\n }\n }\n\n MPI_Gather(localImageOut.data(), localSize * N, MPI_INT, imageOut.data(), localSize * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of pixels per process\n int pixelsPerProcess = N * N / size;\n\n // Compute the starting and ending indices for this process\n int startIndex = rank * pixelsPerProcess;\n int stopIndex = (rank + 1) * pixelsPerProcess;\n\n // Allocate memory for the local image\n std::vector localImage(pixelsPerProcess);\n\n // Scatter the image data to each process\n MPI_Scatter(&imageIn[0], pixelsPerProcess, MPI_INT, &localImage[0], pixelsPerProcess, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Convolve the local image with the edge kernel\n for (int i = 1; i < N - 1; i++) {\n for (int j = 1; j < N - 1; j++) {\n int dotProduct = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n dotProduct += localImage[i + k][j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::clip(dotProduct, -128, 127);\n }\n }\n\n // Gather the results from each process\n MPI_Gather(&imageOut[startIndex], pixelsPerProcess, MPI_INT, &imageOut[0], pixelsPerProcess, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n size_t chunkSize = (N + size - 1) / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n\n // Compute the convolution for the subset of the image\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n int localN = N / size;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local portion of the image\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n size_t localN = (N + size - 1) / size;\n size_t startRow = rank * localN;\n size_t endRow = std::min(startRow + localN, N);\n\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * localN;\n size_t endRow = std::min(startRow + localN, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n // Compute the dot product of the pixel and its neighbors with the edge kernel\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Calculate the number of rows per process\n int rowsPerProcess = N / size;\n\n // Calculate the starting row for this process\n int startRow = rank * rowsPerProcess;\n\n // Calculate the ending row for this process\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Loop over the rows assigned to this process\n for (int i = startRow; i < endRow; i++) {\n 
// Loop over the columns in the row\n for (int j = 0; j < N; j++) {\n // Calculate the dot product of the pixel and its neighbors with the edge kernel\n int dotProduct = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n // Check if the pixel is within the image boundaries\n if (i + k >= 0 && i + k < N && j + l >= 0 && j + l < N) {\n // Calculate the dot product\n dotProduct += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::max(0, std::min(255, dotProduct));\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(imageOut.data() + startRow * N, rowsPerProcess * N, MPI_INT,\n imageOut.data(), rowsPerProcess * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "cuda", + "prompt": "__constant__ int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\n__global__ void convolveKernel(const int *imageIn, int *imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x;\n int idy = threadIdx.y + blockIdx.y * blockDim.y;\n if (idx < N && idy < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = idx + i;\n int y = idy + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x + y * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[idx + idy * N] = clamp(sum, 0, 255);\n }\n}\n", + "\nint sum = 0;\n for (int i = 0; i < 3; i++) {\n for (int j = 0; j < 3; j++) {\n int x = blockIdx.x * blockDim.x + i - 1;\n int y = blockIdx.y * blockDim.y + j - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i][j];\n }\n }\n }\n imageOut[blockIdx.y * N + blockIdx.x] = max(0, min(255, sum));\n}\n", + "\nint x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x 
* blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = sum > 255 ? 255 : (sum < 0 ? 0 : sum);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) {\n return;\n }\n\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_offset = x + i;\n int y_offset = y + j;\n if (x_offset < 0 || x_offset >= N || y_offset < 0 || y_offset >= N) {\n sum += 0;\n } else {\n sum += imageIn[y_offset * N + x_offset] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x2 = x + i;\n int y2 = y + j;\n if (x2 < 0 || x2 >= N || y2 < 0 || y2 >= N) {\n sum += 0;\n } else {\n sum += edgeKernel[i + 1][j + 1] * imageIn[y2 * N + x2];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y 
= blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_offset = x + i;\n int y_offset = y + j;\n if (x_offset >= 0 && x_offset < N && y_offset >= 0 && y_offset < N) {\n sum += imageIn[y_offset * N + x_offset] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_coord = x + i;\n int y_coord = y + j;\n if (x_coord >= 0 && x_coord < N && y_coord >= 0 && y_coord < N) {\n sum += imageIn[y_coord * N + x_coord] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x + (threadIdx.y + blockIdx.y * blockDim.y) * N;\n if (idx < N * N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = threadIdx.x + i;\n int y = threadIdx.y + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[idx] = max(0, min(255, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + 
"parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of imageIn. The result is stored in imageOut on rank 0.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n int localN = N / size;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local subset of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, (rank + 1) * nRowsPerRank);\n\n std::vector localImageOut(nRowsPerRank * N);\n\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = 0; k < 3; k++) {\n for (int l = 0; l < 3; l++) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), nRowsPerRank * N, MPI_INT,\n imageOut.data(), nRowsPerRank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize imageOut to 0\n imageOut.assign(N * N, 0);\n\n // Each rank computes its own part of the output\n #pragma opm parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel and its neighbors\n for (int ki = 0; ki < 3; ki++) {\n for (int kj = 0; kj < 3; kj++) {\n int ii = i + ki - 1;\n int jj = j + kj - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki][kj];\n }\n 
}\n }\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(imageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n size_t chunkSize = (N + size - 1) / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n\n // Compute the convolution for the assigned rows\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < static_cast(N) && jj >= 0 && jj < static_cast(N)) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image\n size_t rowsPerRank = (N + size - 1) / size;\n size_t startRow = rank * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n // Each rank computes its subset of the output image in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < static_cast(N) && jj >= 0 && jj < static_cast(N)) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a portion of the output image\n int localN = (N + nprocs - 1) / nprocs;\n int startRow = rank * localN;\n int endRow = std::min(startRow + localN, N);\n\n // Compute the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from 
all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n int startRow = i * localN;\n int endRow = std::min(startRow + localN, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the assigned rows\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel's neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = i + k;\n int neighborY = j + l;\n // Check if the neighbor is within the image boundaries\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a portion of the output image\n int localN = N / nprocs;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * 
N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the image\n int rowsPerRank = (N + size - 1) / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min((rank + 1) * rowsPerRank, N);\n\n // Allocate memory for the rank's portion of the output image\n std::vector rankImageOut(rowsPerRank * N);\n\n // Compute the convolution for the rank's subset of rows\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = 0; k < 3; ++k) {\n for (int l = 0; l < 3; ++l) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n rankImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(rankImageOut.data(), rowsPerRank * N, MPI_INT,\n imageOut.data(), rowsPerRank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the dot product of the kernel and the image for each pixel\n #pragma opm parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int dotProduct = 0;\n for (int ki = 0; ki < 3; ki++) {\n for (int kj = 0; kj < 3; kj++) {\n dotProduct += edgeKernel[ki][kj] * imageIn[(i + ki - 1) * N + (j + kj - 1)];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, dotProduct));\n }\n }\n\n // Set the boundaries to 0\n for (size_t i = 0; i < N; i++) {\n imageOut[i] = 0;\n imageOut[i * N] = 0;\n imageOut[i * N + N - 1] = 0;\n }\n for (size_t j = 0; j < N; j++) {\n imageOut[j * N + N - 1] = 0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows\n int rowsPerRank = N / nprocs;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Pad the image with zeros\n std::vector paddedImage(N + 2 * rowsPerRank, 0);\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n paddedImage[i + rowsPerRank + 1][j + rowsPerRank + 1] = imageIn[i * N + j];\n }\n }\n\n // Convolve the kernel with the padded image\n #pragma omp parallel for\n for (int i = startRow + 1; i < endRow + 1; i++) {\n for (int j = 1; j < N + 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += edgeKernel[k + 1][l + 1] * paddedImage[i + k][j + l];\n }\n }\n imageOut[(i - rowsPerRank - 1) * N + j - 1] = 
std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Rank 0 gathers the results from all other ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min((rank + 1) * rowsPerRank, N);\n\n // Each rank computes its subset of the output image\n std::vector localImageOut(N * N);\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(localImageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int localN = N / nprocs;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n std::vector localImageIn(localN * N);\n std::vector localImageOut(localN * N);\n\n MPI_Scatter(imageIn.data(), localN * N, MPI_INT, localImageIn.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < localN; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= localN || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += localImageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), localN * N, MPI_INT, imageOut.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x < 0 || x >= static_cast(N) || y < 0 || y >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edge[k + 
1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image.\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank.\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Each rank computes a portion of the output image\n size_t myStartRow = myRank * N / numRanks;\n size_t myEndRow = (myRank + 1) * N / numRanks;\n\n // Compute the convolution for each pixel in the assigned rows\n #pragma omp parallel for\n for (size_t i = myStartRow; i < myEndRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n\n // Apply the kernel to the pixel and its neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = static_cast(i) + k;\n int neighborY = static_cast(j) + l;\n\n // Handle pixels outside the image boundaries\n if (neighborX < 0 || neighborX >= static_cast(N) || neighborY < 0 || neighborY >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the partial results from all ranks on rank 0\n if (myRank == 0) {\n for (int i = 1; i < numRanks; ++i) {\n size_t startRow = i * N / numRanks;\n size_t endRow = (i + 1) * N / numRanks;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[myStartRow * N], (myEndRow - myStartRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each process computes a subset of the rows\n int rowsPerProcess = N / nprocs;\n int startRow = rank * rowsPerProcess;\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Allocate memory for the local portion of the output image\n std::vector localImageOut(rowsPerProcess * N);\n\n // Perform the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 
1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the local results into the global output image\n MPI_Gather(localImageOut.data(), rowsPerProcess * N, MPI_INT,\n imageOut.data(), rowsPerProcess * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i, const int &j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (i + dx >= 0 && i + dx < N && j + dy >= 0 && j + dy < N) {\n sum += input(i + dx, j + dy);\n }\n }\n }\n output(i, j) = sum / 5;\n });\n}\n", + "Kokkosspecific", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t& i) {\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), 
KOKKOS_LAMBDA(const size_t& j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n sum += input(i + ii, j + jj);\n }\n }\n }\n output(i, j) = sum / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy>(N, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n using team_member = KokKOS::TeamMember;\n using team_view = KokKOS::TeamView;\n using team_view_const = KokKOS::TeamView;\n using view_type = KokKOS::View;\n using view_type_const = KokKOS::View;\n\n auto policy = KokKOS::TeamStriped<>(team_member(0, N), team_member(0, N));\n\n team_view output_team(\"output_team\", output, policy);\n team_view_const input_team(\"input_team\", input, policy);\n\n auto lambda = KOKKOS_LAMBDA(team_member member) {\n const size_t i = member.team_member_rank(0);\n const size_t j = member.team_member_rank(1);\n if (i == 0 || j == 0 || i == N - 1 || j == N - 1) {\n output_team(i, j) = input_team(i, j);\n } else {\n output_team(i, j) = (input_team(i, j - 1) + input_team(i, j + 1) + input_team(i - 1, j) + input_team(i + 1, j) + input_team(i, j)) / 5.0;\n }\n };\n\n auto result = KokKOS::parallel_reduce(\n output_team, lambda, KokKOS::PerTeamReduce(std::plus(), 0.0));\n output_team.wait_until_done();\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), [=](const int i) {\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), [=](const int j) {\n double left = (i == 0) ? 0 : input(i, j - 1);\n double right = (i == N - 1) ? 0 : input(i, j + 1);\n double up = (j == 0) ? 0 : input(i - 1, j);\n double down = (j == N - 1) ? 0 : input(i + 1, j);\n output(i, j) = (left + right + up + down + input(i, j)) / 5.0;\n });\n });\n}\n", + "\nfor (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy>({0, N}, {0, N}), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n sum += (i == 0 || j == 0) ? 0 : input(i - 1, j - 1);\n sum += (i == 0) ? 
0 : input(i - 1, j);\n sum += (i == 0 || j == N - 1) ? 0 : input(i - 1, j + 1);\n sum += (j == 0) ? 0 : input(i, j - 1);\n sum += input(i, j);\n sum += (j == N - 1) ? 0 : input(i, j + 1);\n sum += (i == N - 1 || j == 0) ? 0 : input(i + 1, j - 1);\n sum += (i == N - 1) ? 0 : input(i + 1, j);\n sum += (i == N - 1 || j == N - 1) ? 0 : input(i + 1, j + 1);\n output(i, j) = sum / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = 0;\n if (i > 0) output(i, j) += input(i - 1, j);\n if (i < N - 1) output(i, j) += input(i + 1, j);\n if (j > 0) output(i, j) += input(i, j - 1);\n if (j < N - 1) output(i, j) += input(i, j + 1);\n output(i, j) += input(i, j);\n output(i, j) /= 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03235461059958, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0223036730662, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01460567563772, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01298860423267, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01280791200697, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01052028723061, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00969844032079, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0089730149135, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n 
}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03186860829592, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02235818020999, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01416619475931, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01296593993902, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01273889280856, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01050033345819, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0096328953281, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00899207089096, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03242036253214, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02260833662003, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01448125764728, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01314029376954, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0128304425627, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01031182520092, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00972225088626, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00897286701947, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03182287029922, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02240226883441, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0144235888496, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01296927593648, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01289407443255, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01051733549684, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00980922672898, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00891709644347, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = 
(input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03213748671114, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0226136084646, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01459447667003, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01293092202395, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01294164210558, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01033642888069, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00968929678202, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00904081091285, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03223254140466, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02255874779075, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01492277029902, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01319720856845, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01300825756043, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01053329594433, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00974524077028, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00989504251629, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03217312972993, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02251437604427, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01457678116858, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01310336235911, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01284342072904, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01052766107023, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00974859055132, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00892466176301, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n 
output.resize(input.size());\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i - 1][j] + input[i + 1][j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03156036417931, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02244309969246, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01445348728448, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01303428113461, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01259885299951, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01029856950045, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00972080826759, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00890219975263, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03136872779578, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02211004719138, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01439312752336, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01282278392464, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01262901574373, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01029927935451, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00981180556118, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00891944635659, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5.0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03149785101414, + "runs": [ + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.02240478359163, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0145159650594, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01303725000471, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01254769098014, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01050531473011, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00957623291761, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00894136205316, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03183292113245, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02220997326076, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01445689164102, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01288150288165, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01257701572031, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01042020078748, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00967659726739, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091081995517, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03191531449556, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02235119249672, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01458832900971, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01293300762773, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01251185163856, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01059719789773, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00963010992855, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00895993839949, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + 
"are_all_valid": true, + "best_sequential_runtime": 0.03160057030618, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0222230900079, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01472878903151, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01297285631299, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01288935020566, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01042268946767, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00987366661429, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00892907045782, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0314088512212, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02224933281541, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01443068217486, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01304741110653, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01298712939024, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01039858013391, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00972721837461, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00890038646758, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03177944067866, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02240899764001, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01440407074988, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01274349298328, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01272466760129, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01034242026508, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00970678813756, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00897509716451, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "source_write_success": true, + 
"did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03173996862024, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02235134895891, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01451396699995, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01279634684324, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01280906274915, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0104879103601, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00975247435272, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00896145775914, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03211826402694, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02229781616479, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01453993599862, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01292233597487, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01279672961682, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01041577830911, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00958692654967, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00891561303288, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03209224324673, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02224765755236, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0144578775391, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0128824789077, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01291048489511, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01036996822804, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00969678498805, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0088948501274, + "num_threads": 64 + } + ] + }, + { + "generated_output": "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i 
- 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.03196150511503, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02209016922861, + "num_threads": 1 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0143439212814, + "num_threads": 2 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.0129241829738, + "num_threads": 4 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01255394611508, + "num_threads": 8 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.01039497889578, + "num_threads": 16 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00964540075511, + "num_threads": 32 + }, + { + "did_run": true, + "is_valid": true, + "runtime": 0.00896487589926, + "num_threads": 64 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "serial", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j 
+ 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < N - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n output[i * N + j] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t idx = i * N + j;\n if (i > 0) sum += input[idx - N];\n if (i < N - 1) sum += input[idx + N];\n if (j > 0) sum += input[idx - 1];\n if (j < N - 1) sum += input[idx + 1];\n sum += input[idx];\n output[idx] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t idx = i * N + j;\n if 
(i > 0) sum += input[idx - N];\n if (i < N - 1) sum += input[idx + N];\n if (j > 0) sum += input[idx - 1];\n if (j < N - 1) sum += input[idx + 1];\n sum += input[idx];\n output[idx] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "hip", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\n__global__ void jacobi2D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\nint i = blockIdx.x * blockDim.x + threadIdx.x;\nint j = blockIdx.y * blockDim.y + threadIdx.y;\nif (i < N && j < N) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) /\n 5;\n}\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n output[y * N + x] =\n (input[y * N + (x - 1 < 0 ? 0 : x - 1)] +\n input[y * N + (x + 1 >= N ? 
N - 1 : x + 1)] +\n input[(y - 1 < 0 ? 0 : y - 1) * N + x] +\n input[(y + 1 >= N ? N - 1 : y + 1) * N + x] + input[y * N + x]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = i > 0 ? input[i - 1 + j * N] : 0;\n double right = i < N - 1 ? input[i + 1 + j * N] : 0;\n double up = j > 0 ? input[i + (j - 1) * N] : 0;\n double down = j < N - 1 ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * blockDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.y;\n\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (i < N && j < N) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n double left = (x > 0) ? input[y * N + x - 1] : 0;\n double right = (x < N - 1) ? input[y * N + x + 1] : 0;\n double up = (y > 0) ? input[(y - 1) * N + x] : 0;\n double down = (y < N - 1) ? 
input[(y + 1) * N + x] : 0;\n double center = input[y * N + x];\n output[y * N + x] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i >= N || j >= N) {\n return;\n }\n double sum = 0;\n if (i > 0) {\n sum += input[i - 1 + j * N];\n }\n if (i < N - 1) {\n sum += input[i + 1 + j * N];\n }\n if (j > 0) {\n sum += input[i + (j - 1) * N];\n }\n if (j < N - 1) {\n sum += input[i + (j + 1) * N];\n }\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i < N && j < N) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i >= N || j >= N) return;\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) /\n 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + ",", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x >= 0 && x < N && y >= 0 && y < N) {\n double left = (x > 0) ? input[y * N + x - 1] : 0;\n double right = (x < N - 1) ? input[y * N + x + 1] : 0;\n double up = (y > 0) ? 
input[(y - 1) * N + x] : 0;\n double down = (y < N - 1) ? input[(y + 1) * N + x] : 0;\n double center = input[y * N + x];\n output[y * N + x] = (left + right + up + down + center) / 5;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t local_N = N / size;\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n if (rank != 0) {\n MPI_Send(&output[start * N], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N * N], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n int nCols = N;\n std::vector localInput(nRows * nCols);\n std::vector localOutput(nRows * nCols);\n MPI_Scatter(input.data(), nRows * nCols, MPI_DOUBLE, localInput.data(), nRows * nCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (int i = 0; i < nRows; i++) {\n for (int j = 0; j < nCols; j++) {\n double left = (j == 0) ? 0 : localInput[i * nCols + j - 1];\n double right = (j == nCols - 1) ? 0 : localInput[i * nCols + j + 1];\n double up = (i == 0) ? 0 : localInput[(i - 1) * nCols + j];\n double down = (i == nRows - 1) ? 
0 : localInput[(i + 1) * nCols + j];\n localOutput[i * nCols + j] = (left + right + up + down + localInput[i * nCols + j]) / 5;\n }\n }\n MPI_Gather(localOutput.data(), nRows * nCols, MPI_DOUBLE, output.data(), nRows * nCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = (rank + 1) * local_N;\n\n // Compute the local output grid\n std::vector local_output(local_N * N, 0);\n\n // Compute the local jacobi stencil\n for (size_t i = 1; i < local_N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n local_output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local output grids into the global output grid\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the average in the local output\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t 
local_N = N / size;\n\n // Compute the offset of the local grid\n size_t offset = rank * local_N;\n\n // Compute the local grid\n std::vector local_input(local_N * N);\n std::vector local_output(local_N * N);\n\n // Copy the relevant part of the input grid to the local grid\n MPI_Scatter(input.data(), local_N * N, MPI_DOUBLE, local_input.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local jacobi iteration\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the output grid on rank 0\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min((rank + 1) * local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N * N], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += 
input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output grid\n std::vector local_output(local_N * N, 0);\n\n // Iterate over the local grid\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output grid\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local output grids into the global output grid\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Iterate over the local grid\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the average in the output grid\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(&output[start][0], local_N * N, MPI_DOUBLE, &output[0][0], local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * 
local_N;\n size_t end = (rank + 1) * local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N, 0);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min(start_row + local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N);\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min((rank + 1) * local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < N - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n\n // Store the average in the output\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n 
// Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows and columns per rank\n int rows_per_rank = N / size;\n int cols_per_rank = N;\n\n // Compute the starting row and column for this rank\n int start_row = rank * rows_per_rank;\n int start_col = 0;\n\n // Compute the ending row and column for this rank\n int end_row = start_row + rows_per_rank;\n int end_col = start_col + cols_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * cols_per_rank);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = start_col; j < end_col; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local results\n local_output[(i - start_row) * cols_per_rank + (j - start_col)] = avg;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), rows_per_rank * cols_per_rank, MPI_DOUBLE, output.data(), rows_per_rank * cols_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_size = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_size;\n size_t end = (rank + 1) * local_size;\n\n // Compute the local output\n std::vector local_output(local_size * N);\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_size * N, MPI_DOUBLE, output.data(), local_size * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the number of rows per process\n int rows_per_proc = N / num_procs;\n\n // Compute the starting and ending rows for this process\n int start_row = rank * rows_per_proc;\n int end_row = (rank + 1) * rows_per_proc;\n\n // Iterate over the rows assigned to this process\n for (int i = start_row; i < end_row; ++i) {\n // Iterate over the columns\n for (int j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the output\n output[i * N + j] = avg;\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), N * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible 
for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "cuda", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\n__global__ void jacobi2D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] +\n input[i * N + j + N] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i == 0) ? 0 : input[i - 1 + j * N];\n double right = (i == N - 1) ? 0 : input[i + 1 + j * N];\n double up = (j == 0) ? 0 : input[i + (j - 1) * N];\n double down = (j == N - 1) ? 0 : input[i + (j + 1) * N];\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i == 0) ? 0 : input[i - 1 + j * N];\n double right = (i == N - 1) ? 0 : input[i + 1 + j * N];\n double up = (j == 0) ? 
0 : input[i + (j - 1) * N];\n double down = (j == N - 1) ? 0 : input[i + (j + 1) * N];\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5.0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i >= N || j >= N) return;\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n}\n", + "\nint i = blockIdx.x + blockIdx.y * blockDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] +\n input[i * N + j - 1] + input[i * N + j + 1] +\n input[i * N + j]) /\n 5;\n }\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i >= N || j >= N) return;\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + (j - 1 + N) % N] + input[i * N + (j + 1) % N] +\n input[(i - 1 + N) % N * N + j] + input[(i + 1) % N * N + j] +\n input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * gridDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.x;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? 
input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n output[y * N + x] = (input[y * N + x - 1] + input[y * N + x + 1] + input[(y - 1) * N + x] + input[(y + 1) * N + x] + input[y * N + x]) / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n double sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n sum += input[y * N + x];\n output[y * N + x] = sum / 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockDim.x * gridDim.x;\n int j = blockIdx.y + blockDim.y * gridDim.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different portion of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the result in the output\n output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local grid\n std::vector local_input(local_N * N);\n std::vector local_output(local_N * N);\n for (size_t i = 0; i < local_N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_input[i * N + j] = input[(start + i) * N + j];\n }\n }\n\n // Compute the local jacobi stencil\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n_local = N / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n for (int j = 0; j < N; j++) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * n_local * N], n_local * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start * N], n_local * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local grid\n std::vector local_input(input.begin() + start * N, input.begin() + end * N);\n std::vector local_output(local_N * N);\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] +\n local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] +\n local_input[i * N + j]) /\n 5.0;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min(start_row + local_N, N);\n\n // Create a local copy of the input\n std::vector local_input(local_N * N);\n MPI_Scatter(input.data(), N * N, MPI_DOUBLE, local_input.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Create a local copy of the output\n std::vector local_output(local_N * N);\n\n // Compute the local part of the jacobi stencil\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < end_row - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n size_t start_row = rank * local_N;\n size_t end_row = start_row + local_N;\n\n // Compute the local part of the output\n std::vector local_output(local_N * N, 0);\n #pragma omp parallel for\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global 
output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local portion of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n // Check if we are on the boundary\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n // If we are on the boundary, set the value to 0\n local_output[i * N + j] = 0;\n } else {\n // Otherwise, compute the average of the neighbors\n sum += input[(i - 1) * N + j];\n sum += input[(i + 1) * N + j];\n sum += input[i * N + (j - 1)];\n sum += input[i * N + (j + 1)];\n sum += input[i * N + j];\n local_output[i * N + j] = sum / 5;\n }\n }\n }\n\n // Gather the local results into the output on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the jacobi stencil for the subset of rows assigned to this rank\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the local subset of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int 
rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local size of the grid\n size_t local_N = N / nprocs;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n // Check if we are at the boundary\n if (i == 0) {\n sum += 0;\n } else {\n sum += input[i - 1][j];\n }\n if (i == N - 1) {\n sum += 0;\n } else {\n sum += input[i + 1][j];\n }\n if (j == 0) {\n sum += 0;\n } else {\n sum += input[i][j - 1];\n }\n if (j == N - 1) {\n sum += 0;\n } else {\n sum += input[i][j + 1];\n }\n sum += input[i][j];\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[start][0], local_N * N, MPI_DOUBLE, &output[0][0], local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the local subset of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n local_output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output on rank 0\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE, output.data(), output.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local size of the grid\n size_t local_size = N / nprocs;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_size;\n size_t end = (rank + 1) * local_size;\n\n // Compute the local grid\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size * local_size);\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = 1; i < local_size - 1; i++) {\n for (size_t j = 1; j < local_size - 1; j++) {\n local_output[i * local_size + j] = (local_input[i * local_size + j - 1] + local_input[i * local_size + j + 1] +\n local_input[(i - 1) * local_size + j] + local_input[(i + 1) * local_size + j] +\n local_input[i * local_size + j]) /\n 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_size * local_size, MPI_DOUBLE, output.data(), local_size * local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local bounds\n size_t local_N = N / size;\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute local jacobi2D\n #pragma omp parallel for\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i][j] = 
sum / 5;\n }\n }\n\n // Gather results to rank 0\n if (rank != 0) {\n MPI_Send(&output[start][0], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N][0], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_size = (N + size - 1) / size;\n size_t start = rank * local_size;\n size_t end = std::min((rank + 1) * local_size, N);\n\n // Compute the local part of the output\n std::vector local_output(local_size * N);\n #pragma omp parallel for\n for (size_t i = 0; i < local_size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < local_size - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_size * N, MPI_DOUBLE, output.data(), local_size * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N;\n int nCols = N;\n int nRowsPerRank = nRows / size;\n int startRow = rank * nRowsPerRank;\n int endRow = (rank + 1) * nRowsPerRank;\n if (rank == size - 1) {\n endRow = nRows;\n }\n #pragma omp parallel for collapse(2)\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < nCols; j++) {\n double left = (j == 0) ? 0 : input[i * nCols + j - 1];\n double right = (j == nCols - 1) ? 0 : input[i * nCols + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * nCols + j];\n double down = (i == nRows - 1) ? 
0 : input[(i + 1) * nCols + j];\n output[i * nCols + j] = (left + right + up + down + input[i * nCols + j]) / 5;\n }\n }\n if (rank != 0) {\n MPI_Send(&output[startRow * nCols], (endRow - startRow) * nCols, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * nRowsPerRank;\n int endRow = (i + 1) * nRowsPerRank;\n if (i == size - 1) {\n endRow = nRows;\n }\n MPI_Recv(&output[startRow * nCols], (endRow - startRow) * nCols, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) { sum += input[i - 1][j]; }\n if (i < N - 1) { sum += input[i + 1][j]; }\n if (j > 0) { sum += input[i][j - 1]; }\n if (j < N - 1) { sum += input[i][j + 1]; }\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the local portion of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = N / nprocs;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = start_row + rows_per_rank;\n\n // Create a buffer to store the results for this rank\n std::vector local_output(N * rows_per_rank);\n\n // Compute the Jacobi stencil for this rank\n #pragma omp parallel for\n for (size_t i 
= start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n MPI_Gather(local_output.data(), N * rows_per_rank, MPI_DOUBLE, output.data(), N * rows_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + } +] \ No newline at end of file diff --git a/results/015cff6f/gemini-pro_temp0.2/serial.json b/results/015cff6f/gemini-pro_temp0.2/serial.json new file mode 100644 index 0000000..7cab870 --- /dev/null +++ b/results/015cff6f/gemini-pro_temp0.2/serial.json @@ -0,0 +1,32852 @@ +[ + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); j++) {\n for (int k = j+1; k < points.extent(0); k++) {\n double area = triangleArea(points(i), points(j), points(k));\n minArea = std::min(minArea, area);\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy>(\n 0, points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const Kokkos::IndexType i, double &minArea) {\n for (Kokkos::IndexType j = i + 1; j < points.extent(0); ++j) {\n for (Kokkos::IndexType k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n },\n minArea);\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "0.0)", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n double 
area = triangleArea(points(i), points(j), points(k));\n minArea = std::min(minArea, area);\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(\"smallestArea\", points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); j++) {\n for (int k = j+1; k < points.extent(0); k++) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using exec_space = Kokkos::DefaultExecutionSpace;\n using view_type = Kokkos::View;\n\n const int num_points = points.extent(0);\n view_type areas(\"areas\", num_points * num_points * num_points);\n\n parallel_for(policy_type(0, num_points * num_points * num_points), KOKKOS_LAMBDA(const int i) {\n const int i1 = i / (num_points * num_points);\n const int i2 = (i / num_points) % num_points;\n const int i3 = i % num_points;\n if (i1 != i2 && i1 != i3 && i2 != i3) {\n areas(i) = triangleArea(points(i1), points(i2), points(i3));\n } else {\n areas(i) = std::numeric_limits::max();\n }\n });\n\n return *min_element(areas.data(), areas.data() + areas.extent(0));\n}\n", + "\n\n double area = 0;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return area;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(\"smallestArea\", policy, KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n \"smallestArea\",\n policy_type(0, points.extent(0) * points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const int& i) {\n const int i1 = i / (points.extent(0) * points.extent(0));\n const int i2 = (i / points.extent(0)) % points.extent(0);\n const int i3 = i % points.extent(0);\n if (i1 != i2 && i2 != i3 && i1 != i3) {\n minArea = std::min(minArea, triangleArea(points(i1), points(i2), points(i3)));\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), 
points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n policy_type(0, points.extent(0) - 2),\n KOKKOS_LAMBDA(const int i) {\n for (int j = i + 1; j < points.extent(0) - 1; ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(\"smallestArea\", points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n\n double minArea = std::numeric_limits::max();\n\n auto policy = Kokkos::RangePolicy<>(0, points.extent(0));\n Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const int& i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n for (int k = j + 1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n\n return minArea;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n double minArea = std::numeric_limits::max();\n\n Kokkos::parallel_for(\n \"smallestArea\",\n policy_type(0, points.extent(0) * points.extent(0) * points.extent(0)),\n KOKKOS_LAMBDA(const int i) {\n const int i1 = i / (points.extent(0) * points.extent(0));\n const int i2 = (i / points.extent(0)) % points.extent(0);\n const int i3 = i % points.extent(0);\n if (i1 != i2 && i1 != i3 && i2 != i3) {\n minArea = std::min(minArea, triangleArea(points(i1), points(i2), points(i3)));\n }\n });\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n Kokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n for (int j = i+1; j < points.extent(0); ++j) {\n for (int k = j+1; k < points.extent(0); ++k) {\n minArea = std::min(minArea, triangleArea(points(i), points(j), points(k)));\n }\n }\n });\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = 
std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n#pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for 
(size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n double total_area = 0;\n #pragma omp parallel for reduction(+: total_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n total_area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return total_area;\n}\n", + "\n\n double area = 0;\n#pragma omp parallel for reduction(+:area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n return area;\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min:minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n return minArea;\n}\n", + "", + "\n\n double minArea = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: minArea)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 
5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0036028924, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033917694 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035852494, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033099616 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035499588, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033443702 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035577176, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.003315325 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035785806, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038216142 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for 
(size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035762802, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033795562 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035915952, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033293096 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035296504, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038223576 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.003566408, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0037952426 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035444304, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038054458 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + 
"did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0036021912, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.003751033 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035805302, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.003340963 + } + ] + }, + { + "generated_output": "<", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0036059582, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033461416 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035678048, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033669766 + } + ] + }, + { + "generated_output": "\n\n int n = points.size();\n if (n < 3) {\n return 0;\n }\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0035862072, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0036100824 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + 
"did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.003570203, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.003316698 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size(); i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0036187382, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0033366354 + } + ] + }, + { + "generated_output": "\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n return minArea;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.003581674, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038282868 + } + ] + }, + { + "generated_output": "\n\n double area = 0;\n for (size_t i = 0; i < points.size(); ++i) {\n area += ::Area(points[i], points[(i + 1) % points.size()], points[(i + 2) % points.size()]);\n }\n return area;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * fabs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Compute the area of the smallest triangle that can be formed by any 3 points.\n Return the result in area.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\n__global__ void smallestArea(const Point *points, size_t N, double *area) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid >= N) return;\n\n double minArea = DBL_MAX;\n\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double area = triangleArea(points[gid], points[i], points[j]);\n if (area < minArea) minArea = area;\n }\n }\n\n if (tid == 0) *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double a = triangleArea(points[i], points[j], points[k]);\n minArea = min(minArea, a);\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = 1e100;\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n for (int k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid >= N) {\n return;\n }\n double minArea = triangleArea(points[tid], points[(tid + 1) % N], points[(tid + 2) % N]);\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n minArea = fmin(minArea, triangleArea(points[tid], points[i], points[j]));\n }\n }\n area[tid] = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for 
(size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = triangleArea(points[0], points[1], points[2]);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n int tid = get_global_id(0);\n if (tid < N) {\n area[tid] = ::Area(points[tid], points[(tid + 1) % N], points[(tid + 2) % N]);\n }\n}\n", + "\n\n Point A = points[threadIdx.x];\n Point B = points[threadIdx.y];\n Point C = points[threadIdx.z];\n double t = triangleArea(A, B, C);\n atomicMin(area, t);\n}\n", + "\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Each thread computes the area of the triangle formed by itself and the next two points.\n // The smallest area is stored in the shared memory.\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n int tid = threadIdx.x;\n if (tid < N - 2) {\n double area = triangleArea(points[tid], points[tid + 1], points[tid + 2]);\n if (area < minArea) {\n minArea = area;\n }\n }\n\n // Reduce the shared memory to find the smallest area.\n __syncthreads();\n if (tid == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n // Return the result in area.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n __shared__ Point minA, minB, minC;\n\n double localArea = triangleArea(points[threadIdx.x], points[threadIdx.y], points[threadIdx.z]);\n if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {\n minArea = localArea;\n minA = points[threadIdx.x];\n minB = points[threadIdx.y];\n minC = points[threadIdx.z];\n }\n\n __syncthreads();\n\n if (localArea < minArea) {\n minArea = localArea;\n minA = points[threadIdx.x];\n minB = points[threadIdx.y];\n minC = points[threadIdx.z];\n }\n\n __syncthreads();\n\n if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t 
j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Store the result in the output array.\n *area = minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the areas to find the minimum\n double minArea;\n MPI_Reduce(&areas[0], &minArea, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the minimum area from process 0\n if (rank == 0) {\n return minArea;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n minArea = std::min(minArea, area);\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int myRank, numProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numProcs);\n\n // Each process computes the area of all possible triangles\n std::vector localAreas;\n for (size_t i = 0; i < points.size(); ++i) {\n for 
(size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n localAreas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all areas to the root process\n std::vector globalAreas;\n MPI_Gather(localAreas.data(), localAreas.size(), MPI_DOUBLE,\n globalAreas.data(), localAreas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double smallestArea = std::numeric_limits::max();\n if (myRank == 0) {\n for (double area : globalAreas) {\n smallestArea = std::min(smallestArea, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&smallestArea, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return smallestArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all areas to the root process\n std::vector global_areas;\n MPI_Gather(local_areas.data(), local_areas.size(), MPI_DOUBLE,\n global_areas.data(), local_areas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double smallest_area = std::numeric_limits::max();\n if (rank == 0) {\n for (double area : global_areas) {\n smallest_area = std::min(smallest_area, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&smallest_area, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return smallest_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of points per process\n int points_per_process = points.size() / size;\n\n // Compute the starting and ending indices for this process\n int start = rank * points_per_process;\n int end = (rank + 1) * points_per_process;\n\n // Compute the smallest area for this process\n double min_area = std::numeric_limits::max();\n for (int i = start; i < end; ++i) {\n for (int j = i + 1; j < end; ++j) {\n for (int k = j + 1; k < end; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < min_area) {\n min_area = area;\n }\n }\n }\n }\n\n // Reduce the minimum areas from all processes\n double global_min_area;\n MPI_Allreduce(&min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN);\n\n return global_min_area;\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n 
// Reduce the areas to find the smallest one\n double smallest;\n MPI_Allreduce(areas.data(), &smallest, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return smallest;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the global minimum\n double global_min;\n MPI_Allreduce(local_areas.data(), &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n // Find the minimum area triangle\n double minArea = std::numeric_limits::max();\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Return the minimum area\n return minArea;\n}\n\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(&local_areas[0], &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area from process 0\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1; // Other processes return -1 to avoid printing multiple times\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the areas to find the smallest one\n double smallestArea;\n MPI_Allreduce(areas.data(), &smallestArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return smallestArea;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each process computes the area of the smallest triangle formed by any 3 of its points\n double min_area = std::numeric_limits::max();\n for (size_t i = my_rank; i < points.size(); i += num_procs) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n min_area = std::min(min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the minimum area across all processes\n double global_min_area;\n MPI_Allreduce(&min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < 
points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Gather all local areas into a single vector on process 0\n std::vector global_areas;\n if (rank == 0) {\n global_areas.resize(local_areas.size() * size);\n }\n MPI_Gather(local_areas.data(), local_areas.size(), MPI_DOUBLE,\n global_areas.data(), local_areas.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Find the smallest area\n double min_area = std::numeric_limits::max();\n if (rank == 0) {\n for (double area : global_areas) {\n min_area = std::min(min_area, area);\n }\n }\n\n // Broadcast the smallest area to all processes\n MPI_Bcast(&min_area, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n return min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n for (size_t i = rank; i < points.size(); i += size) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n for (int i = rank; i < points.size(); i += size) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "&" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * fabs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Compute the area of the smallest triangle that can be formed by any 3 points.\n Return the result in area.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\n__global__ void smallestArea(const Point *points, size_t N, double *area) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Each thread computes the area of a triangle formed by itself and the next two points.\n // The minimum area is stored in the shared memory.\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N - 2) {\n double a = triangleArea(points[tid], points[tid + 1], points[tid + 2]);\n if (a < minArea) {\n minArea = a;\n }\n }\n __syncthreads();\n\n // The thread with the minimum area writes the result to the output pointer.\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid >= N) return;\n\n double minArea = DBL_MAX;\n\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double area = triangleArea(points[gid], points[i], points[j]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[gid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n double minArea = triangleArea(points[0], points[1], points[2]);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) 
{\n minArea = area;\n }\n }\n }\n }\n\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Each thread computes the area of a triangle formed by 3 points.\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[tid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n // Compute the area of the smallest triangle formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n // Store the result in the output array.\n *area = minArea;\n}\n", + "\n\n __shared__ double minArea;\n __shared__ Point minA, minB, minC;\n\n // Compute the area of all triangles\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n minA = points[i];\n minB = points[j];\n minC = points[k];\n }\n }\n }\n }\n\n // Reduce the minimum area across all threads\n __syncthreads();\n if (threadIdx.x == 0) {\n *area = minArea;\n }\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n // Return the result in area.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n double a = triangleArea(points[tid], points[i], points[j]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < minArea) {\n minArea = a;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Compute the area of the smallest triangle that can be formed by any 3 points.\n double minArea = DBL_MAX;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n *area = minArea;\n}\n", + "\n\n // Initialize the area to a large value\n *area = DBL_MAX;\n\n // Loop over all possible combinations of 3 points\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n // Compute the area of the triangle formed by the 3 points\n double triangle_area = triangleArea(points[i], points[j], points[k]);\n\n // Update the area if the triangle area is smaller than the current minimum\n if (triangle_area < *area) {\n *area = triangle_area;\n }\n }\n }\n }\n}\n", + "\n\n *area = DBL_MAX;\n for 
(size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = j + 1; k < N; ++k) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < *area) {\n *area = a;\n }\n }\n }\n }\n}\n", + "\n\n __shared__ double minArea;\n minArea = DBL_MAX;\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = j + 1; k < N; k++) {\n double a = triangleArea(points[i], points[j], points[k]);\n if (a < minArea) {\n minArea = a;\n }\n }\n }\n }\n *area = minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble smallestArea(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int num_points = points.size();\n int points_per_proc = num_points / num_procs;\n int start_index = my_rank * points_per_proc;\n int end_index = (my_rank + 1) * points_per_proc;\n if (my_rank == num_procs - 1) {\n end_index = num_points;\n }\n\n double local_area = 0.0;\n for (int i = start_index; i < end_index; i++) {\n for (int j = i + 1; j < num_points; j++) {\n for (int k = j + 1; k < num_points; k++) {\n local_area += ::Area(points[i], points[j], points[k]);\n }\n }\n }\n\n double global_area = 0.0;\n MPI_Reduce(&local_area, &global_area, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_area;\n}\n", + "\n\n int n = points.size();\n double minArea = std::numeric_limits::max();\n\n // Use OpenMP to parallelize the loop over all possible triples of points\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n return minArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], 
points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(&local_areas[0], &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1;\n }\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (int i = 0; i < points.size(); i++) {\n for (int j = i + 1; j < points.size(); j++) {\n for (int k = j + 1; k < points.size(); k++) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Allreduce(local_areas.data(), &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector localAreas;\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n localAreas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double globalMinArea;\n MPI_Allreduce(localAreas.data(), &globalMinArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return globalMinArea;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = points.size();\n double local_min = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: 
local_min)\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (size_t i = 0; i < points.size(); ++i) {\n for (size_t j = i + 1; j < points.size(); ++j) {\n for (size_t k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n local_min = std::min(local_min, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = points.size();\n double local_min = std::numeric_limits::max();\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n double area = triangleArea(points[i], points[j], points[k]);\n local_min = std::min(local_min, area);\n }\n }\n }\n\n double global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = points.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each process calculates the area of all possible triangles\n std::vector local_areas(n * (n - 1) * (n - 2) / 6);\n int count = 0;\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n for (int k = j + 1; k < n; ++k) {\n local_areas[count++] = triangleArea(points[i], points[j], points[k]);\n }\n }\n }\n\n // Reduce the local areas to find the global minimum\n double global_min;\n MPI_Allreduce(local_areas.data(), &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes the area of all possible triangles\n std::vector local_areas;\n for (size_t i = rank; i < points.size(); i += size) {\n for (size_t j = i + 1; j < points.size(); j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n local_areas.push_back(triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n // Reduce the local areas to find the smallest area\n double global_min_area;\n MPI_Reduce(local_areas.data(), &global_min_area, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the smallest area\n if (rank == 0) {\n return global_min_area;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n int n = points.size();\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n for (int k = j + 1; k < n; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n minArea = std::min(minArea, area);\n }\n }\n }\n\n double globalMinArea;\n MPI_Allreduce(&minArea, &globalMinArea, 1, MPI_DOUBLE, MPI_MIN);\n return globalMinArea;\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int num_points = points.size();\n double min_area = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: min_area)\n for (int i = 0; i < num_points; i++) {\n for (int j = i + 1; j < num_points; j++) {\n for (int k = j + 1; k < num_points; k++) {\n double area = triangleArea(points[i], points[j], points[k]);\n min_area = std::min(min_area, area);\n }\n }\n }\n\n return min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_min_area = std::numeric_limits::max();\n #pragma omp parallel for reduction(min: local_min_area)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n local_min_area = std::min(local_min_area, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n double global_min_area;\n MPI_Allreduce(&local_min_area, &global_min_area, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n return global_min_area;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double minArea = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: minArea)\n for (int i = 0; i < points.size(); ++i) {\n for (int j = i + 1; j < points.size(); ++j) {\n for (int k = j + 1; k < points.size(); ++k) {\n minArea = std::min(minArea, triangleArea(points[i], points[j], points[k]));\n }\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &minArea, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n return minArea;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "kokkos", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Create a team policy for parallel execution\n TeamPolicy policy(x.extent(0));\n\n // Define the kernel lambda\n auto kernel = KOKKOS_LAMBDA(const TeamPolicy<> &policy, int i) {\n double min_dist = std::numeric_limits::max();\n for (int j = i + 1; j < x.extent(0); ++j) {\n min_dist = std::min(min_dist, distance(x(i), x(j)));\n }\n return min_dist;\n };\n\n // Execute the kernel in parallel\n auto min_dists = parallel_reduce(policy, kernel);\n\n // Find the minimum distance among all pairs\n double min_dist = std::numeric_limits::max();\n for (int i = 0; i < min_dists.extent(0); ++i) {\n min_dist = std::min(min_dist, min_dists(i));\n }\n\n return min_dist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist[0] = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist[0]) {\n\t\t\t\tmin_dist[0] = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist[0];\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double &min_dist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist(0);\n}\n", + "\n\n\tconst int n = x.extent(0);\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(\"closestPair\", Kokkos::RangePolicy(0, n - 1), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing policy = Kok\u043a\u043e\u0441::Range\u653f\u7b56;\n\tusing view_type = Kok\u043a\u043e\u0441::View;\n\n\tview_type min_dist(\"min_dist\", std::numeric_limits::max());\n\n\tauto lambda = KOKKOS_LAMBDA(const double x1, const double x2) {\n\t\tdouble dist = ::abs(x1 - x2);\n\t\tif (dist < min_dist()) {\n\t\t\tmin_dist() = dist;\n\t\t}\n\t};\n\n\t// Iterate over all possible un-sorted pair of elements\n\t// and find the minimum distance between them.\n\t// Note that we are iterating over all the elements in the vector\n\t// multiple times. 
This is not efficient.\n\t// A more efficient approach would be to sort the vector first\n\t// and then only consider the minimum distance between each element\n\t// and its immediate neighbor.\n\t// However, sorting is not supported by Kok\u043a\u043e\u0441.\n\t// So, we are using this simple approach for now.\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < x.size(); ++j) {\n\t\t\tlambda(x[i], x[j]);\n\t\t}\n\t}\n\n\treturn min_dist();\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int i, double &min_dist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist(0);\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\tKok\u30b3\u30af::TaskGraph tg;\n\tKok\u30b3\u30af::Task task([&](Kok\u30b3\u30af::TaskContext ctx) -> double {\n\t\tdouble min = std::abs(x[0] - x[1]);\n\t\tfor (size_t i = 0; i < x.size(); i++) {\n\t\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\t\tmin = std::min(min, distance(x[i], x[j]))\n\t\t\t}\n\t\t}\n\t\treturn min;\n\t});\n\ttg.run(task);\n\treturn task.get();\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::RangePolicy;\n\tusing tag_type = Kokkos::pair;\n\n\tauto min_distance = Kokkos::parallel_reduce(\n\t\tpolicy_type(0, x.size() * x.size()),\n\t\tKOKKOS_LAMBDA(const tag_type &idx, double &min_distance) {\n\t\t\tint i = idx.first / x.size();\n\t\t\tint j = idx.first % x.size();\n\t\t\tif (i != j) {\n\t\t\t\tmin_distance = std::min(min_distance, distance(x(i), x(j)));\n\t\t\t}\n\t\t},\n\t\tmin_distance);\n\n\treturn min_distance;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tKokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size() - 1), KOKKOS_LAMBDA(const int& i, double& min_dist) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x(i), x(j)));\n\t\t}\n\t}, min_dist);\n\treturn min_dist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in x.\n\tKokkos::View distances(\"distances\", x.size() * x.size());\n\tKokkos::parallel_for(Kokkos::RangePolicy(0, x.size() * x.size()), KOKKOS_LAMBDA(const int i) {\n\t\tdistances(i) = distance(x(i / x.size()), x(i % x.size()));\n\t});\n\n\t// Find the minimum distance.\n\tdouble minDistance = Kokkos::min_reduce(distances, Kokkos::Min());\n\n\treturn minDistance;\n}\n", + "\n\n\tKokkos::View min_dist(\"min_dist\", 1);\n\tmin_dist[0] = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) 
{\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist[0]) {\n\t\t\t\tmin_dist[0] = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist[0];\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist(0) = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tdouble d = distance(x(i), x(j));\n\t\t\tif (d < min_dist(0)) {\n\t\t\t\tmin_dist(0) = d;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist(0);\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int &i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\n\treturn minDist;\n}\n", + "\n\n\tconst int n = x.extent(0);\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tauto min_distance = Kokkos::parallel_reduce(\n\t\tx.extent(0), KOKKOS_LAMBDA(const int i, double min) {\n\t\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\t\tmin = std::min(min, distance(x(i), x(j)));\n\t\t\t}\n\t\t\treturn min;\n\t\t},\n\t\tstd::numeric_limits::max());\n\n\treturn min_distance;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int i, double &minDist) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tminDist = std::min(minDist, distance(x(i), x(j)));\n\t\t}\n\t}, minDist);\n\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View min_dist = Kokkos::View(1);\n\tmin_dist(0) = std::numeric_limits::max();\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < x.extent(0); j++) {\n\t\t\tdouble dist = distance(x(i), x(j));\n\t\t\tif (dist < min_dist(0)) {\n\t\t\t\tmin_dist(0) = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist(0);\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "omp", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use OpenMP to compute in parallel.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor 
(int j = i + 1; j < n; ++j) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble min_distance = std::numeric_limits::max();\n\tint n = x.size();\n\t#pragma omp parallel for reduction(min: min_distance)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_distance = std::min(min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "&", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_distance = std::abs(x[0] - x[1]);\n#pragma openmp parallel for reduction(min:min_distance)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_distance = std::min(min_distance, distance(x[i], x[j]);\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n#pragma omp parallel for reduction(min: min_dist)\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = 
std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n - 1; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "serial", + "prompt": "double distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 1; i < x.size(); ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i - 1]));\n\t}\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + 
"did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0817970458, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0574972742 + } + ] + }, + { + "generated_output": "\n\n\tif (x.size() < 2) {\n\t\treturn -1;\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = distance(x[0], x[1]);\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tminDist = std::min(minDist, distance(x[i], x[i + 1]));\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = distance(x[0], x[1]);\n\tfor (size_t i = 1; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (x.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, 
+ { + "generated_output": "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(x.begin(), x.end());\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (x.size() < 2) {\n\t\treturn -1;\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble minDist = distance(x[0], x[1]);\n\tfor (int i = 1; i < x.size() - 1; ++i) {\n\t\tminDist = std::min(minDist, distance(x[i], x[i + 1]));\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size() - 1; i++) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tfor (size_t j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.082209908, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0575161438 + } + ] + }, + { + "generated_output": "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0820972706, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0577758742 + } + ] + }, + { + "generated_output": "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), 
x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < x.size() - 1; ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i + 1]));\n\t}\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (x.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(x.begin(), x.end());\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 1; i < x.size(); ++i) {\n\t\tmin_dist = std::min(min_dist, distance(x[i], x[i - 1]));\n\t}\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double min_dist = std::numeric_limits::max();\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n min_dist = std::min(min_dist, distance(x[i], x[j]));\n }\n }\n return min_dist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0824462572, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0576207142 + } + ] + }, + { + "generated_output": "\n\n double min_distance = std::numeric_limits::max();\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n min_distance = std::min(min_distance, distance(x[i], x[j]));\n }\n }\n return min_distance;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0822238106, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0573518966 + } + ] + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "hip", + "prompt": "__device__ double distanceBetweenPoints(double x1, double x2) {\n\treturn fabs(x1 - x2);\n}\n\n/* Compute the distance between the closest two elements in the vector x.\n Store the result in distance.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\n__global__ void closestPair(const double *x, size_t N, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tdouble minDist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tfor (int i = tid + 2; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\tdistance[tid] = minDist;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= N) {\n\t\treturn;\n\t}\n\n\tdouble minDist = distanceBetweenPoints(x[gid], x[gid + 1]);\n\tfor (int i = gid + 2; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[gid], x[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "losest", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tdouble minDistance = DBL_MAX;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDistance) {\n\t\t\t\tminDistance = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDistance;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint grid_size = gridDim.x;\n\tint i = bid * bid_size + tid;\n\tif (i >= N) {\n\t\treturn;\n\t}\n\tdouble min_dist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t}\n\tdistance[tid] = min_dist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\nfor (size_t i = 0; i < N; ++i) {\n\t\tdistance[i] = distance(x[i], x[i + 1]);\n\t}\n}\n", + "\n\tint tid = 
hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint bid_size = hipBlockDim_x;\n\n\tdouble min_dist = DBL_MAX;\n\n\tfor (int i = bid * bid_size + tid; i < N; i += bid_size) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tif (tid == 0 || dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble min_dist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = min_dist;\n}\n", + "\n\n\t// Compute the distance between each pair of elements in the vector x.\n\tfor (size_t i = 0; i < N; i++) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "losest", + "\n\n\tint tid = threadIdx.x;\n\tint blockDim = blockDim.x;\n\tint blockIdx = blockIdx.x;\n\tint i = blockIdx * blockDim + tid;\n\n\tdouble min_dist = DBL_MAX;\n\twhile (i < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[i + 1]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t\ti += blockDim * gridDim.x;\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble min_dist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = min_dist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "mpi", + "prompt": "#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use MPI to compute in parallel. 
Assume MPI is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process finds the closest pair in its local part of the vector.\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j += size) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between its elements\n\tfor (int i = my_rank; i < n; i += num_procs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = n / size;\n\tint start = rank * chunk_size;\n\tint end = start + chunk_size;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tminDist = std::min(minDist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn globalMinDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], 
x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = (n + size - 1) / size;\n\tint start = rank * chunk_size;\n\tint end = std::min(start + chunk_size, n);\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tstd::vector local_x(x.begin() + start, x.begin() + end);\n\tstd::sort(local_x.begin(), local_x.end());\n\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < local_n - 1; i++) {\n\t\tlocal_min_dist = std::min(local_min_dist, distance(local_x[i], local_x[i + 1]));\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tif (rank == size - 1) {\n\t\tend = x.size();\n\t}\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, 
MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\tint n = x.size();\n\tint local_n = n / num_procs;\n\tint start = my_rank * local_n;\n\tint end = (my_rank + 1) * local_n;\n\tif (my_rank == num_procs - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between its elements\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn globalMinDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the distance between its elements and the elements of the next process.\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = (rank + 1) * local_size;\n\tif (rank == size - 1) {\n\t\tend = x.size();\n\t}\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_distance = std::min(local_min_distance, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 
1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint size, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tint n = x.size();\n\tint chunk_size = n / size;\n\tint start = rank * chunk_size;\n\tint end = (rank + 1) * chunk_size;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tfor (int j = i + 1; j < x.size(); j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the distance between its elements\n\tstd::vector local_distances;\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tfor (size_t j = i + 1; j < x.size(); j += size) {\n\t\t\tlocal_distances.push_back(distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Gather all distances to the root process\n\tstd::vector global_distances;\n\tMPI_Gather(local_distances.data(), local_distances.size(), MPI_DOUBLE,\n\t\tglobal_distances.data(), local_distances.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the minimum distance\n\tdouble min_distance = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double d : global_distances) {\n\t\t\tmin_distance = std::min(min_distance, d);\n\t\t}\n\t}\n\n\t// Broadcast the minimum distance to all processes\n\tMPI_Bcast(&min_distance, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn min_distance;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "cuda", + "prompt": "__device__ double distanceBetweenPoints(double x1, double x2) {\n\treturn fabs(x1 - x2);\n}\n\n/* Compute the distance between the closest two elements in the vector x.\n Store the result in distance.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\n__global__ void closestPair(const double *x, size_t N, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= N) return;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = bid * blockDim.x + tid;\n\n\tif (i >= N) {\n\t\treturn;\n\t}\n\n\tdouble min_dist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < min_dist) {\n\t\t\tmin_dist = dist;\n\t\t}\n\t}\n\n\tdistance[tid] = min_dist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint blockId = blockIdx.x;\n\tint blockDim = blockDim.x;\n\tint gridDim = gridDim.x;\n\n\t// Each thread computes the distance between its element and the next element.\n\tif (tid < N - 1) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[tid + 1]);\n\t\tif (tid == 0 || dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n\n\t// Each block reduces the distances computed by its threads.\n\t__shared__ double blockDistance;\n\tif (tid == 0) {\n\t\tblockDistance = *distance;\n\t}\n\t__syncthreads();\n\tif (tid < blockDim - 1) {\n\t\tdouble dist = distanceBetweenPoints(blockDistance, x[blockId * blockDim + tid + 1]);\n\t\tif (dist < blockDistance) {\n\t\t\tblockDistance = dist;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*distance = blockDistance;\n\t}\n\n\t// The block with the smallest distance broadcasts it to all other blocks.\n\t__shared__ double gridDistance;\n\tif (blockId == 0) {\n\t\tgridDistance = *distance;\n\t}\n\t__syncthreads();\n\tif (blockId < gridDim - 1) {\n\t\tdouble dist = distanceBetweenPoints(gridDistance, x[(blockId + 1) * blockDim]);\n\t\tif (dist < gridDistance) {\n\t\t\tgridDistance = dist;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (blockId == 0) {\n\t\t*distance = gridDistance;\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\t__shared__ double minDistance;\n\tminDistance = DBL_MAX;\n\n\tfor (size_t i = threadIdx.x; i < N - 1; i += blockDim.x) {\n\t\tfor (size_t j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDistance) {\n\t\t\t\tminDistance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (threadIdx.x == 0) {\n\t\t*distance = minDistance;\n\t}\n}\n", + "\n\n\t// Each thread computes the distance between its element and all subsequent elements.\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\t// Compute the distance between the current element and all subsequent elements.\n\tfor (int i = tid + bid * stride; i < N - 1; i += stride) {\n\t\tdouble dist = distanceBetweenPoints(x[i], x[i + 1]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = 
DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\tdouble min_dist = DBL_MAX;\n\twhile (i < N) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t\ti += blockDim.x * gridDim.x;\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = min_dist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\tdistance[bid] = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x * gridDim.x;\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tdouble minDist = DBL_MAX;\n\twhile (i < N) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\ti += blockDim.x * gridDim.x;\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\t\tif (i == tid + 1 || dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= N) {\n\t\treturn;\n\t}\n\tdouble minDist = DBL_MAX;\n\tfor (int j = i + 1; j < N; j++) 
{\n\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\tdistance[tid] = minDist;\n}\n", + "\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i < N - 1) {\n\t\tdistance[i] = distance(x[i], x[i + 1]);\n\t}\n}\n", + "\n\n\t// Each thread computes the distance between its element and all elements to its right.\n\tint tid = threadIdx.x;\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tdouble dist = distanceBetweenPoints(x[tid], x[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint stride = blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\n\tfor (int i = tid; i < N; i += stride) {\n\t\tfor (int j = i + 1; j < N; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\ndouble distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min : local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tint size;\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = my_rank * local_n;\n\tint end = (my_rank + 1) * local_n;\n\tif (my_rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min : local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the minimum distance between any two elements in its local subvector.\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all processes to find the global minimum distance.\n\tMPI_Allreduce(&min_dist, 
&min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint my_rank, num_procs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n\t// Each process computes the distance between all pairs of elements in its local subvector.\n\tstd::vector local_min_distances(n / num_procs);\n\t#pragma omp parallel for\n\tfor (int i = my_rank * (n / num_procs); i < (my_rank + 1) * (n / num_procs); i++) {\n\t\tfor (int j = i + 1; j < (my_rank + 1) * (n / num_procs); j++) {\n\t\t\tlocal_min_distances[i - my_rank * (n / num_procs)] = std::min(local_min_distances[i - my_rank * (n / num_procs)], distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Each process sends its local minimum distance to the root process.\n\tstd::vector global_min_distances(num_procs);\n\tMPI_Gather(local_min_distances.data(), n / num_procs, MPI_DOUBLE, global_min_distances.data(), n / num_procs, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// The root process finds the minimum distance among all the local minimum distances.\n\tdouble global_min_distance = std::numeric_limits::max();\n\tif (my_rank == 0) {\n\t\tfor (int i = 0; i < num_procs; i++) {\n\t\t\tglobal_min_distance = std::min(global_min_distance, global_min_distances[i]);\n\t\t}\n\t}\n\n\t// Return the global minimum distance.\n\treturn global_min_distance;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tint n = x.size();\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint my_rank, nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the closest pair in its local subvector\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = my_rank; i < n; i += nprocs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_dist = std::min(min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, 
&min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min_dist)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\t// Get the number of processes and the rank of the current process\n\tint num_procs, rank;\n\tMPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\t// Divide the vector into chunks and assign each chunk to a process\n\tint chunk_size = x.size() / num_procs;\n\tstd::vector local_x(x.begin() + rank * chunk_size, x.begin() + (rank + 1) * chunk_size);\n\n\t// Find the closest pair in the local chunk\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min_dist)\n\tfor (int i = 0; i < local_x.size(); i++) {\n\t\tfor (int j = i + 1; j < local_x.size(); j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(local_x[i], local_x[j]));\n\t\t}\n\t}\n\n\t// Gather the local minimum distances from all processes\n\tstd::vector global_min_dists(num_procs);\n\tMPI_Gather(&local_min_dist, 1, MPI_DOUBLE, global_min_dists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the smallest global minimum distance\n\tdouble global_min_dist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double dist : global_min_dists) {\n\t\t\tglobal_min_dist = std::min(global_min_dist, dist);\n\t\t}\n\t}\n\n\t// Broadcast the global minimum distance to all processes\n\tMPI_Bcast(&global_min_dist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn global_min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tif (n <= 1) {\n\t\treturn -1;\n\t}\n\n\t// Sort the vector in ascending order\n\tstd::sort(x.begin(), x.end());\n\n\t// Find the closest pair in each half of the vector\n\tdouble minDist = distance(x[0], x[1]);\n\t#pragma omp parallel\n\t{\n\t\tint tid = omp_get_thread_num();\n\t\tint numThreads = omp_get_num_threads();\n\t\tint start = tid * n / numThreads;\n\t\tint end = (tid + 1) * n / numThreads;\n\t\tdouble localMinDist = minDist;\n\t\tfor (int i = start; i < end; i++) {\n\t\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\t\tif (dist < localMinDist) {\n\t\t\t\t\tlocalMinDist = dist;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t#pragma omp critical\n\t\t{\n\t\t\tif (localMinDist < minDist) {\n\t\t\t\tminDist = localMinDist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint my_rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tint n = x.size();\n\tint my_rank, 
nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\t// Each process computes the closest pair in its local subvector.\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = my_rank; i < n; i += nprocs) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum.\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tdouble min_dist = std::numeric_limits::max();\n\n\t// Each process computes the closest pair in its local subvector\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (int i = rank; i < n; i += size) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all processes\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint n = x.size();\n\tint n_local = n / omp_get_num_threads();\n\tint tid = omp_get_thread_num();\n\tdouble min_local = std::numeric_limits::max();\n\t\n\t#pragma omp parallel reduction(min:min_local)\n\tfor (int i = tid * n_local; i < (tid + 1) * n_local; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tmin_local = std::min(min_local, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble min_global;\n\tMPI_Allreduce(&min_local, &min_global, 1, MPI_DOUBLE, MPI_MIN);\n\treturn min_global;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint n = x.size();\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Use OpenMP to parallelize the computation of distances between pairs of elements\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(x[i], x[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tdouble local_min = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min)\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min = std::min(local_min, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\tdouble global_min;\n\tMPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\treturn global_min;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each process computes the closest pair in its local subvector.\n\tint local_size = x.size() / size;\n\tint start = rank * local_size;\n\tint end = start + local_size;\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (int i = start; i < end; i++) {\n\t\tfor (int j = i + 1; j < end; j++) {\n\t\t\tlocal_min_dist = std::min(local_min_dist, distance(x[i], x[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Reduce(&local_min_dist, 1, MPI_DOUBLE, &global_min_dist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance.\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t}\n\treturn -1;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n double min_dist = std::numeric_limits::max();\n\n // Compute the distance between each pair of points\n parallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int& i) {\n for (int j = i + 1; j < points.extent(0); ++j) {\n double dist = distance(points(i), points(j));\n if (dist < min_dist) {\n min_dist = dist;\n }\n }\n });\n\n return min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View distances(\"Distances\", points.size());\n\tKokkos::parallel_for(\"Compute distances\", points.size(), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tdistances[i] = std::min(distances[i], distance(points[i], points[j]));\n\t\t}\n\t});\n\treturn *Kokkos::min_element(distances);\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(\"Closest Pair\", points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tparallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tparallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const int j) {\n\t\t\tif (i != j) {\n\t\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\t\tif (dist < minDist) {\n\t\t\t\t\tminDist = dist;\n\t\t\t\t}\n\t\t\t}\n\t\t});\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA (const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn 
minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tminDist = std::min(minDist, dist);\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing TeamPolicy = Kokkos::TeamPolicy>;\n\tusing TeamMember = Kokkos::TeamMember;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(\"closestPair\", points.extent(0), KOKKOS_LAMBDA(const TeamMember& team) {\n\t\tfor (int i = team.league_rank(); i < points.extent(0); i += team.league_size()) {\n\t\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\t\tminDist = std::min(minDist, distance(points(i), points(j)));\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing namespace std;\n\n\t// Initialize the minimum distance to a large value\n\tdouble minDist = numeric_limits::max();\n\n\t// Create a team policy for parallel execution\n\tTeamPolicy policy(Kokkos::DefaultExecutionSpace(), points.extent(0));\n\n\t// Define the kernel function to compute the distance between each pair of points\n\tKokkos::parallel_for(policy, KOKKOS_LAMBDA(const int& i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = Kokkos::RangePolicy;\n\tusing view_type = Kokkos::View;\n\n\tconst int num_points = points.extent(0);\n\tview_type distances(\"Distances\", num_points * num_points);\n\n\tKokkos::parallel_for(policy_type(0, num_points * num_points), KOKKOS_LAMBDA(const int i) {\n\t\tconst int p1_idx = i / num_points;\n\t\tconst int p2_idx = i % num_points;\n\t\tif (p1_idx != p2_idx) {\n\t\t\tdistances(i) = distance(points(p1_idx), points(p2_idx));\n\t\t}\n\t\telse {\n\t\t\tdistances(i) = std::numeric_limits::max();\n\t\t}\n\t});\n\n\tdouble min_distance = std::numeric_limits::max();\n\tfor (int i = 0; i < num_points * num_points; i++) {\n\t\tmin_distance = std::min(min_distance, distances(i));\n\t}\n\n\treturn min_distance;\n}\n", + "\n\n\tKok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::make_span(points.begin(), points.end()), 0.0,\n\t\t[](double d, const Point* p) {\n\t\t\treturn d + distance(*p, {0, 0});\n\t\t},\n\t\t[](double a, double b) { return a + b; });\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Initialize the minimum distance to a large value\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\t// Calculate the distance between the two points\n\t\t\tdouble dist = distance(points(i), points(j));\n\n\t\t\t// Update the minimum distance if necessary\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\t// Return the minimum distance\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all points\n\tfor (size_t i 
= 0; i < points.size(); ++i) {\n\t\t// Iterate over all points after i\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\t// Calculate the distance between the two points\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\t// Check if the distance is less than the current minimum distance\n\t\t\tif (dist < minDist) {\n\t\t\t\t// Update the minimum distance\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Return the minimum distance\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tmin_dist = std::min(min_dist, dist);\n\t\t}\n\t});\n\treturn min_dist;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\tdouble minDist = std::numeric_limits::max();\n\n\tKokkos::parallel_for(points.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); ++j) {\n\t\t\tdouble dist = distance(points(i), points(j));\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t});\n\n\treturn minDist;\n}\n", + "\n\n\tKokkos::View distances(\"Distances\", points.extent(0));\n\tKokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int i) {\n\t\tfor (int j = i + 1; j < points.extent(0); j++) {\n\t\t\tdistances(i) = std::min(distances(i), distance(points(i), points(j)));\n\t\t}\n\t});\n\treturn Kokkos::min_reduce(Kokkos::RangePolicy(0, points.extent(0)), distances, distances(0));\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use OpenMP to compute in parallel.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j 
< points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tint n = points.size();\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) 
{\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn min_dist;\n}\n", + "\n\n\tdouble min_distance = std::numeric_limits::max();\n#pragma omp parallel for reduction(min:min_distance)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tmin_distance = std::min(min_distance, dist);\n\t\t}\n\t}\n\treturn min_distance;\n}\n", + "\n\n\t// Check if the vector is empty or has only one point\n\tif (points.empty() || points.size() == 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\t// Sort the points by x-coordinate\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Initialize the minimum distance to the maximum possible value\n\tdouble minDist = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < 
p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + 
"best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() < 2) {\n\t\treturn -1;\n\t}\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.3668170844, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.3670072204 + } + ] + }, + { + "generated_output": "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn 
minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size() - 1; i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = 
std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn min_dist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() < 2) {\n\t\treturn std::numeric_limits::max();\n\t}\n\n\tauto compareX = [](Point const& p1, Point const& p2) { return p1.x < p2.x; };\n\tauto compareY = [](Point const& p1, Point const& p2) { return p1.y < p2.y; };\n\n\tstd::sort(points.begin(), points.end(), compareX);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tauto compareX = [](Point const& p1, Point const& p2) { return p1.x < p2.x; };\n\tauto compareY = [](Point const& p1, Point const& p2) { return p1.y < p2.y; };\n\tstd::sort(points.begin(), points.end(), compareX);\n\tdouble minDist = distance(points[0], points[1]);\n\tstd::sort(points.begin(), points.end(), compareY);\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tif (points[j].y - points[i].y >= minDist) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 1) {\n\t\treturn std::numeric_limits::max();\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size() - 1; ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif 
(dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 1) {\n\t\treturn 0.0;\n\t}\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble minDist = distance(points[0], points[1]);\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\treturn minDist;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distanceBetweenPoints(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the distance between the closest two points in the vector points.\n Store the result in distance.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\n__global__ void closestPair(const Point *points, size_t numPoints, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Initialize the distance to a large value\n\t*distance = DBL_MAX;\n\n\t// Iterate over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid < numPoints) {\n\t\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < num) {\n\t\tdistance[i] = distance(points[i], points[(i + 1) % num]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < num) {\n\t\tdistance[idx] 
= distance(points[idx], points[(idx + 1) % num]);\n\t}\n}\n", + "\n\n\t// Initialize the distance to a large value\n\t*distance = DBL_MAX;\n\n\t// Loop over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) return;\n\tdouble minDist = distanceBetweenPoints(points[tid], points[(tid+1)%numPoints]);\n\tfor (int i = tid+1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\tatomicMin(distance, minDist);\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid < numPoints) {\n\t\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = gid; i < numPoints; i += hipBlockDim_x) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint idx = hipGetGlobalIndex();\n\tif (idx < num) {\n\t distance[idx] = distance(points[idx], points[(idx + 1) % num]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\t// Initialize the distance to a large value\n\t*distance = std::numeric_limits::max();\n\n\t// Iterate over all pairs of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\t// Compute the distance between the two points\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\n\t\t\t// Update the distance if it is smaller than the current minimum\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "(1.21,12.12.1222.12.12.", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint gid = bid * hipBlockDim_x + tid;\n\n\tif (gid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = distanceBetweenPoints(points[gid], points[(gid + 1) % numPoints]);\n\n\tfor (int i = gid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[gid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = 
distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Compute the distance between each pair of points\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tdouble minDist = distanceBetweenPoints(points[tid], points[(tid + 1) % numPoints]);\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\t__shared__ double minDist;\n\t__shared__ Point p1, p2;\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t\tp1 = points[tid];\n\t\t\t\tp2 = points[i];\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. 
The final result is returned on rank 0.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local copy of points.\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance.\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0.\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\t// Find the closest pair of points on this rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_distance = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); ++i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_distance) {\n\t\t\t\tmin_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_distance;\n\tMPI_Allreduce(&min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_distance;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tmin_dist = std::min(min_dist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t}\n\n\treturn -1;\n}\n", + "\n\n\tint rank, 
size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&min_dist, &min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\tif (rank == 0) {\n\t\treturn min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the closest pair on this rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble 
globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_distance) {\n\t\t\t\tlocal_min_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min_distance;\n\tMPI_Allreduce(&local_min_distance, &global_min_distance, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_distance;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its subset of points\n\tstd::vector> closestPairs;\n\tfor (size_t i = rank; i < points.size(); i += size) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tclosestPairs.push_back({points[i], points[j]});\n\t\t}\n\t}\n\n\t// Each rank sends its closest pairs to rank 0\n\tstd::vector> allClosestPairs;\n\tMPI_Gather(&closestPairs, closestPairs.size(), MPI_DOUBLE, &allClosestPairs, allClosestPairs.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 finds the closest pair of points among all the closest pairs\n\tdouble minDist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (auto const& pair : allClosestPairs) {\n\t\t\tdouble dist = distance(pair.first, pair.second);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Rank 0 broadcasts the minimum distance to all other ranks\n\tMPI_Bcast(&minDist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < local_min_dist) {\n\t\t\t\tlocal_min_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + 
"\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tdouble globalMinDist;\n\tMPI_Allreduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its subset of points.\n\tstd::vector local_points = points;\n\tstd::sort(local_points.begin(), local_points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tdouble local_min_distance = std::numeric_limits::max();\n\tfor (size_t i = 0; i < local_points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < local_points.size(); ++j) {\n\t\t\tdouble dist = distance(local_points[i], local_points[j]);\n\t\t\tif (dist < local_min_distance) {\n\t\t\t\tlocal_min_distance = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Each rank sends its local minimum distance to rank 0.\n\tMPI_Gather(&local_min_distance, 1, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 computes the global minimum distance.\n\tdouble global_min_distance = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (int i = 0; i < size; ++i) {\n\t\t\tdouble recv_min_distance;\n\t\t\tMPI_Gather(&recv_min_distance, 1, MPI_DOUBLE, &recv_min_distance, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\t\t\tif (recv_min_distance < global_min_distance) {\n\t\t\t\tglobal_min_distance = recv_min_distance;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn global_min_distance;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the closest pair on each rank\n\tdouble minDist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks to rank 0\n\tdouble globalMinDist;\n\tMPI_Reduce(&minDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distanceBetweenPoints(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 
2));\n}\n\n/* Compute the distance between the closest two points in the vector points.\n Store the result in distance.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\n__global__ void closestPair(const Point *points, size_t numPoints, double *distance) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) return;\n\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint num_blocks = gridDim.x;\n\n\tdouble min_dist = DBL_MAX;\n\tfor (int i = tid; i < numPoints; i += bid_size * num_blocks) {\n\t\tfor (int j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t*distance = min_dist;\n}\n", + "\n\n\tint idx = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (idx >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDistance = DBL_MAX;\n\tfor (int i = idx + 1; i < numPoints; i++) {\n\t\tdouble distance = distanceBetweenPoints(points[idx], points[i]);\n\t\tif (distance < minDistance) {\n\t\t\tminDistance = distance;\n\t\t}\n\t}\n\n\tif (idx == 0) {\n\t\t*distance = minDistance;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tdouble minDist = DBL_MAX;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tfor (size_t j = i + 1; j < numPoints; j++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\t\tif 
(dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\tdouble minDist = DBL_MAX;\n\tfor (int j = i + 1; j < numPoints; j++) {\n\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*distance = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tdouble minDist = DBL_MAX;\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t\tdistance[tid] = minDist;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\n\tdouble minDist = DBL_MAX;\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t}\n\t}\n\n\t*distance = minDist;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < numPoints) {\n\t\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\t\tif (dist < *distance) {\n\t\t\t\t*distance = dist;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int j = i + 1; j < numPoints; j++) {\n\t\tdouble dist = distanceBetweenPoints(points[i], points[j]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tfor (int i = tid + 1; i < numPoints; i++) {\n\t\tdouble dist = distanceBetweenPoints(points[tid], points[i]);\n\t\tif (dist < *distance) {\n\t\t\t*distance = dist;\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "13_geometry_closest_pair_2d", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, 
y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all ranks\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_distance = DBL_MAX;\n\t#pragma omp parallel for reduction(min: min_distance)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t for (size_t j = i + 1; j < points.size(); ++j) {\n\t min_distance = std::min(min_distance, distance(points[i], points[j]));\n\t }\n\t}\n\n\tif (rank == 0) {\n\t return min_distance;\n\t}\n\treturn 0.0;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Gather all the local minimum distances on rank 0\n\tstd::vector allMinDists(size);\n\tMPI_Gather(&minDist, 1, MPI_DOUBLE, allMinDists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Find the global minimum distance\n\tif (rank == 0) {\n\t\tfor (double dist : allMinDists) {\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble local_min_dist = 
std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min: local_min_dist)\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocal_min_dist = std::min(local_min_dist, dist);\n\t\t}\n\t}\n\n\t// Reduce the local minima to find the global minimum\n\tdouble global_min_dist;\n\tMPI_Allreduce(&local_min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum on rank 0\n\tif (rank == 0) {\n\t\treturn global_min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t// Each rank computes the closest pair of points in its local data\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distance across all ranks\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local subvector\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (int i = 0; i < points.size(); ++i) {\n\t\tfor (int j = i + 1; j < points.size(); ++j) {\n\t\t\tlocalMinDist = std::min(localMinDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t} else {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the closest pair on this rank\n\tdouble min_dist = std::numeric_limits::max();\n\tfor (size_t i = 0; i < points.size(); ++i) {\n\t\tfor (size_t j = i + 1; j < points.size(); ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\t// Reduce the minimum distances from all ranks\n\tdouble global_min_dist;\n\tMPI_Allreduce(&min_dist, &global_min_dist, 1, MPI_DOUBLE, MPI_MIN);\n\n\t// Return the global minimum distance\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, 
size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble min_dist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma opm parallel for reduction(min:min_dist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i+1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < min_dist) {\n\t\t\t\tmin_dist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\treturn min_dist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tminDist = std::min(minDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\telse {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (rank == 0) {\n\t\tMPI_Reduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t} else {\n\t\tMPI_Reduce(&minDist, nullptr, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\t}\n\n\treturn minDist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair of points in its local data\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor (int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocalMinDist = std::min(localMinDist, dist);\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble local_min_dist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:local_min_dist)\n\tfor (int i = 0; i < points.size(); i++) {\n\t\tfor 
(int j = i + 1; j < points.size(); j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tlocal_min_dist = std::min(local_min_dist, dist);\n\t\t}\n\t}\n\n\t// Gather all local minima on rank 0\n\tstd::vector all_min_dists(size);\n\tMPI_Gather(&local_min_dist, 1, MPI_DOUBLE, all_min_dists.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Rank 0 finds the global minimum\n\tdouble global_min_dist = std::numeric_limits::max();\n\tif (rank == 0) {\n\t\tfor (double dist : all_min_dists) {\n\t\t\tglobal_min_dist = std::min(global_min_dist, dist);\n\t\t}\n\t}\n\n\t// Return the global minimum on rank 0\n\treturn global_min_dist;\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "(1.41421, 1.41421)", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes the closest pair in its local data\n\tdouble localMinDist = std::numeric_limits::max();\n\t#pragma omp parallel for reduction(min:localMinDist)\n\tfor (size_t i = 0; i < points.size(); i++) {\n\t\tfor (size_t j = i + 1; j < points.size(); j++) {\n\t\t\tlocalMinDist = std::min(localMinDist, distance(points[i], points[j]));\n\t\t}\n\t}\n\n\t// Reduce the local minimum distances to find the global minimum distance\n\tdouble globalMinDist;\n\tMPI_Allreduce(&localMinDist, &globalMinDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\t// Return the global minimum distance on rank 0\n\tif (rank == 0) {\n\t\treturn globalMinDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min:minDist)\n\tfor (int i = 0; i < n; ++i) {\n\t\tfor (int j = i + 1; j < n; ++j) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tdouble minDist = std::numeric_limits::max();\n\tint n = points.size();\n\n\t#pragma omp parallel for reduction(min: minDist)\n\tfor (int i = 0; i < n; i++) {\n\t\tfor (int j = i + 1; j < n; j++) {\n\t\t\tdouble dist = distance(points[i], points[j]);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t}\n\t\t}\n\t}\n\n\tMPI_Allreduce(&minDist, &minDist, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn minDist;\n\t}\n\treturn -1;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points 
in the vector points. Store the result in `hull`.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(Kokkos::View &points, Kokkos::View &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Sort points by x-coordinate\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return points_sorted(i)->x < points_sorted(j)->x;\n });\n\n // Find the upper and lower hulls\n auto upper_hull = hull.slice(0, hull.extent(0) / 2);\n auto lower_hull = hull.slice(hull.extent(0) / 2, hull.extent(0));\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n while (upper_hull_size >= 2 &&\n (upper_hull(upper_hull_size - 1)->y - upper_hull(upper_hull_size - 2)->y) *\n (points_sorted(i)->x - upper_hull(upper_hull_size - 1)->x) <\n (upper_hull(upper_hull_size - 1)->x - upper_hull(upper_hull_size - 2)->x) *\n (points_sorted(i)->y - upper_hull(upper_hull_size - 1)->y)) {\n upper_hull_size--;\n }\n upper_hull(upper_hull_size++) = points_sorted(i);\n }\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = points_sorted.extent(0) - 1; i >= 0; i--) {\n while (lower_hull_size >= 2 &&\n (lower_hull(lower_hull_size - 1)->y - lower_hull(lower_hull_size - 2)->y) *\n (points_sorted(i)->x - lower_hull(lower_hull_size - 1)->x) <\n (lower_hull(lower_hull_size - 1)->x - lower_hull(lower_hull_size - 2)->x) *\n (points_sorted(i)->y - lower_hull(lower_hull_size - 1)->y)) {\n lower_hull_size--;\n }\n lower_hull(lower_hull_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_hull_size + lower_hull_size - 1;\n for (int i = 0; i < lower_hull_size - 1; i++) {\n hull(hull_size--) = lower_hull(i);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int &i) {\n points_sorted(i) = points(i);\n });\n\n // Find the upper and lower hulls\n std::vector upper_hull;\n std::vector lower_hull;\n\n for (int i = 0; i < points_sorted.extent(0); i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upper_hull.size() >= 2 &&\n (upper_hull[upper_hull.size() - 2]->y - upper_hull[upper_hull.size() - 1]->y) *\n (points_sorted(i)->x - upper_hull[upper_hull.size() - 1]->x) <=\n (upper_hull[upper_hull.size() - 1]->y - points_sorted(i)->y) *\n (upper_hull[upper_hull.size() - 2]->x - upper_hull[upper_hull.size() - 1]->x)) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lower_hull.size() >= 2 &&\n (lower_hull[lower_hull.size() - 2]->y - lower_hull[lower_hull.size() - 1]->y) *\n (points_sorted(i)->x - lower_hull[lower_hull.size() - 1]->x) >=\n (lower_hull[lower_hull.size() - 1]->y - points_sorted(i)->y) *\n (lower_hull[lower_hull.size() - 2]->x - lower_hull[lower_hull.size() - 1]->x)) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 2);\n int j = 0;\n for (int i = 0; i < upper_hull.size() - 1; i++) 
{\n hull(j++) = upper_hull[i];\n }\n for (int i = lower_hull.size() - 2; i >= 0; i--) {\n hull(j++) = lower_hull[i];\n }\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_dimensions = points.extent(1);\n\n // Find the point with the minimum y-coordinate.\n int min_y_index = 0;\n for (int i = 1; i < num_points; ++i) {\n if (points(i, 1) < points(min_y_index, 1)) {\n min_y_index = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points(0, 0);\n points(0, 0) = points(min_y_index, 0);\n points(min_y_index, 0) = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double angle_a = atan2(a.y - points(0, 1), a.x - points(0, 0));\n double angle_b = atan2(b.y - points(0, 1), b.x - points(0, 0));\n return angle_a < angle_b;\n };\n parallel_sort(Kokkos::RangePolicy(1, num_points), points, compare_polar_angle);\n\n // Find the convex hull using Graham's scan.\n int hull_size = 0;\n for (int i = 0; i < num_points; ++i) {\n while (hull_size >= 2 && (points(hull_size - 1, 0) - points(hull_size - 2, 0)) * (points(i, 1) - points(hull_size - 2, 1)) - (points(hull_size - 1, 1) - points(hull_size - 2, 1)) * (points(i, 0) - points(hull_size - 2, 0)) < 0) {\n hull_size--;\n }\n hull(hull_size, 0) = points(i, 0);\n hull_size++;\n }\n\n // Resize the hull to its final size.\n hull.resize(hull_size, num_dimensions);\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the upper and lower hulls\n Kokkos::View upperHull(\"Upper Hull\", points.extent(0));\n Kokkos::View lowerHull(\"Lower Hull\", points.extent(0));\n int upperHullSize = 0;\n int lowerHullSize = 0;\n for (int i = 0; i < points.extent(0); i++) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHullSize >= 2 &&\n (upperHull(upperHullSize - 1)->y - upperHull(upperHullSize - 2)->y) *\n (points(i)->x - upperHull(upperHullSize - 2)->x) <\n (upperHull(upperHullSize - 1)->x - upperHull(upperHullSize - 2)->x) *\n (points(i)->y - upperHull(upperHullSize - 2)->y)) {\n upperHullSize--;\n }\n upperHull(upperHullSize++) = points(i);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while (lowerHullSize >= 2 &&\n (lowerHull(lowerHullSize - 1)->y - lowerHull(lowerHullSize - 2)->y) *\n (points(i)->x - lowerHull(lowerHullSize - 2)->x) >\n (lowerHull(lowerHullSize - 1)->x - lowerHull(lowerHullSize - 2)->x) *\n (points(i)->y - lowerHull(lowerHullSize - 2)->y)) {\n lowerHullSize--;\n }\n lowerHull(lowerHullSize++) = points(i);\n }\n\n // Merge the upper and lower hulls into the convex hull\n int hullSize = upperHullSize + lowerHullSize - 2;\n for (int i = 0; i < upperHullSize; i++) {\n hull(i) = upperHull(i);\n }\n for (int i = upperHullSize; i < hullSize; i++) {\n hull(i) = lowerHull(i - upperHullSize + 1);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the upper and lower hulls\n Kokkos::View upperHull(\"upperHull\", points.extent(0));\n Kokkos::View lowerHull(\"lowerHull\", points.extent(0));\n\n int upperHullSize = 0;\n int lowerHullSize = 
0;\n\n for (int i = 0; i < points.extent(0); i++) {\n // Add the point to the upper hull if it is to the right of the current rightmost point\n while (upperHullSize >= 2 &&\n (upperHull(upperHullSize - 1)->y - upperHull(upperHullSize - 2)->y) *\n (points(i)->x - upperHull(upperHullSize - 2)->x) <=\n (upperHull(upperHullSize - 1)->x - upperHull(upperHullSize - 2)->x) *\n (points(i)->y - upperHull(upperHullSize - 2)->y)) {\n upperHullSize--;\n }\n upperHull(upperHullSize++) = points(i);\n\n // Add the point to the lower hull if it is to the left of the current leftmost point\n while (lowerHullSize >= 2 &&\n (lowerHull(lowerHullSize - 1)->y - lowerHull(lowerHullSize - 2)->y) *\n (points(i)->x - lowerHull(lowerHullSize - 2)->x) >=\n (lowerHull(lowerHullSize - 1)->x - lowerHull(lowerHullSize - 2)->x) *\n (points(i)->y - lowerHull(lowerHullSize - 2)->y)) {\n lowerHullSize--;\n }\n lowerHull(lowerHullSize++) = points(i);\n }\n\n // Merge the upper and lower hulls\n int hullSize = upperHullSize + lowerHullSize - 2;\n for (int i = 0; i < upperHullSize; i++) {\n hull(i) = upperHull(i);\n }\n for (int i = upperHullSize; i < hullSize; i++) {\n hull(i) = lowerHull(i - upperHullSize + 1);\n }\n\n // Return the hull\n hull.resize(hullSize);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points_sorted.extent(0)), points_sorted, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n while (upper_hull_size >= 2 && cross(upper_hull[upper_hull_size - 2], upper_hull[upper_hull_size - 1], points_sorted[i]) <= 0) {\n upper_hull_size--;\n }\n upper_hull[upper_hull_size++] = points_sorted[i];\n }\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = points_sorted.extent(0) - 1; i >= 0; i--) {\n while (lower_hull_size >= 2 && cross(lower_hull[lower_hull_size - 2], lower_hull[lower_hull_size - 1], points_sorted[i]) <= 0) {\n lower_hull_size--;\n }\n lower_hull[lower_hull_size++] = points_sorted[i];\n }\n\n // Merge the upper and lower hulls\n int hull_size = 0;\n for (int i = 0; i < upper_hull_size; i++) {\n hull[hull_size++] = upper_hull[i];\n }\n for (int i = lower_hull_size - 2; i >= 0; i--) {\n hull[hull_size++] = lower_hull[i];\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_size);\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_hull_points = 4;\n\n // Find the leftmost point\n int leftmost_index = 0;\n for (int i = 1; i < num_points; ++i) {\n if (points(i).x < points(leftmost_index).x) {\n leftmost_index = i;\n }\n }\n\n // Sort the points by their polar angle with respect to the leftmost point\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double dx1 = a.x - points(leftmost_index).x;\n double dy1 = a.y - points(leftmost_index).y;\n double dx2 = b.x - points(leftmost_index).x;\n double dy2 = b.y - points(leftmost_index).y;\n return (atan2(dy1, dx1) < atan2(dy2, dx2));\n };\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, num_points), points_sorted, compare_polar_angle);\n\n // Find the convex hull using Graham's scan\n int hull_index = 0;\n for (int i = 0; i < num_points; ++i) {\n while (hull_index >= 2 && (points_sorted(i).x - hull(hull_index - 1).x) * (hull(hull_index - 2).y - 
hull(hull_index - 1).y) - (points_sorted(i).y - hull(hull_index - 1).y) * (hull(hull_index - 2).x - hull(hull_index - 1).x) < 0) {\n hull_index--;\n }\n hull(hull_index++) = points_sorted(i);\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_index);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, points_sorted.extent(0)),\n KOKKOS_LAMBDA(const int &i, const auto &upper_hull) {\n auto &hull = upper_hull.access();\n if (i == 0) {\n hull[0] = points_sorted(i);\n return hull;\n }\n while (hull.extent(0) >= 2 &&\n (hull(hull.extent(0) - 2).y - hull(hull.extent(0) - 1).y) *\n (points_sorted(i).x - hull(hull.extent(0) - 1).x) <\n (hull(hull.extent(0) - 1).y - points_sorted(i).y) *\n (hull(hull.extent(0) - 2).x - hull(hull.extent(0) - 1).x)) {\n hull.resize(hull.extent(0) - 1);\n }\n hull.resize(hull.extent(0) + 1);\n hull(hull.extent(0) - 1) = points_sorted(i);\n return hull;\n },\n Kokkos::View(nullptr, 0));\n\n auto lower_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1),\n KOKKOS_LAMBDA(const int &i, const auto &lower_hull) {\n auto &hull = lower_hull.access();\n if (i == points_sorted.extent(0) - 1) {\n hull[0] = points_sorted(i);\n return hull;\n }\n while (hull.extent(0) >= 2 &&\n (hull(hull.extent(0) - 2).y - hull(hull.extent(0) - 1).y) *\n (points_sorted(i).x - hull(hull.extent(0) - 1).x) <\n (hull(hull.extent(0) - 1).y - points_sorted(i).y) *\n (hull(hull.extent(0) - 2).x - hull(hull.extent(0) - 1).x)) {\n hull.resize(hull.extent(0) - 1);\n }\n hull.resize(hull.extent(0) + 1);\n hull(hull.extent(0) - 1) = points_sorted(i);\n return hull;\n },\n Kokkos::View(nullptr, 0));\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.extent(0) + lower_hull.extent(0) - 2);\n Kokkos::parallel_for(\n Kokkos::RangePolicy(0, upper_hull.extent(0)),\n KOKKOS_LAMBDA(const int &i) { hull(i) = upper_hull(i); });\n Kokkos::parallel_for(\n Kokkos::RangePolicy(0, lower_hull.extent(0) - 1),\n KOKKOS_LAMBDA(const int &i) { hull(upper_hull.extent(0) + i) = lower_hull(i); });\n}\n", + "\n\n using Kok\u30b3\u30af::parallel_for;\n using Kok\u30b3\u30af::sort;\n\n // Sort the points by their x-coordinates.\n sort(Kok\u30b3\u30af::begin(points), Kok\u30b3\u30af::end(points), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper hull.\n int upperHullSize = 0;\n for (int i = 0; i < points.size(); ++i) {\n while (upperHullSize >= 2 && Kok\u30b3\u30af::cross(hull[upperHullSize - 2], hull[upperHullSize - 1], points[i]) <= 0) {\n --upperHullSize;\n }\n hull[upperHullSize++] = points[i];\n }\n\n // Find the lower hull.\n int lowerHullSize = 0;\n for (int i = points.size() - 1; i >= 0; --i) {\n while (lowerHullSize >= 2 && Kok\u30b3\u30af::cross(hull[lowerHullSize - 2], hull[lowerHullSize - 1], points[i]) <= 0) {\n --lowerHullSize;\n }\n hull[lowerHullSize++] = points[i];\n }\n\n // Remove the duplicate points from the lower hull.\n while (lowerHullSize > 0 && hull[lowerHullSize - 1] == hull[lowerHullSize - 2]) {\n --lowerHullSize;\n }\n\n // Combine the upper and lower hulls.\n hull.resize(upperHullSize + lowerHullSize - 1);\n std::copy(hull.begin(), hull.begin() + upperHullSize, hull.begin());\n std::copy(hull.begin() + upperHullSize, hull.begin() + upperHullSize + lowerHullSize - 1, 
hull.begin() + upperHullSize);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points_sorted.extent(0)), KOKKOS_LAMBDA(const int &i, auto &upper_hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n upper_hull.push_back(points_sorted(i));\n } else {\n while (upper_hull.size() >= 2 && cross(upper_hull[upper_hull.size() - 2], upper_hull[upper_hull.size() - 1], points_sorted(i)) <= 0) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n }\n });\n\n auto lower_hull = Kokkos::parallel_reduce(Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1), KOKKOS_LAMBDA(const int &i, auto &lower_hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n lower_hull.push_back(points_sorted(i));\n } else {\n while (lower_hull.size() >= 2 && cross(lower_hull[lower_hull.size() - 2], lower_hull[lower_hull.size() - 1], points_sorted(i)) <= 0) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n });\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 2);\n std::copy(upper_hull.begin(), upper_hull.end() - 1, hull.begin());\n std::copy(lower_hull.begin() + 1, lower_hull.end(), hull.begin() + upper_hull.size() - 1);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points_sorted(i)->x < points_sorted(j)->x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n int upper_hull_size = 0;\n int lower_hull_size = 0;\n\n // Add the first point to the upper hull\n upper_hull(upper_hull_size++) = points_sorted(0);\n\n // Add the last point to the lower hull\n lower_hull(lower_hull_size++) = points_sorted(points.extent(0) - 1);\n\n // Iterate over the remaining points\n for (int i = 1; i < points.extent(0) - 1; i++) {\n // Add the point to the upper hull\n while (upper_hull_size >= 2 &&\n (points_sorted(i)->y - upper_hull(upper_hull_size - 2)->y) *\n (upper_hull(upper_hull_size - 1)->x - upper_hull(upper_hull_size - 2)->x) <=\n (upper_hull(upper_hull_size - 1)->y - upper_hull(upper_hull_size - 2)->y) *\n (points_sorted(i)->x - upper_hull(upper_hull_size - 2)->x)) {\n upper_hull_size--;\n }\n upper_hull(upper_hull_size++) = points_sorted(i);\n\n // Add the point to the lower hull\n while (lower_hull_size >= 2 &&\n (points_sorted(i)->y - lower_hull(lower_hull_size - 2)->y) *\n (lower_hull(lower_hull_size - 1)->x - lower_hull(lower_hull_size - 2)->x) >=\n (lower_hull(lower_hull_size - 1)->y - lower_hull(lower_hull_size - 2)->y) *\n (points_sorted(i)->x - lower_hull(lower_hull_size - 2)->x)) {\n lower_hull_size--;\n }\n lower_hull(lower_hull_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_hull_size + lower_hull_size - 2;\n for (int i = 0; i < upper_hull_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_hull_size; i < hull_size; i++) {\n hull(i) = lower_hull(i - upper_hull_size + 1);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinate\n auto points_sorted = points;\n parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int &i) {\n points_sorted(i) = 
points(i);\n });\n\n // Find the upper and lower hulls\n std::vector upper_hull, lower_hull;\n for (int i = 0; i < points_sorted.extent(0); i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upper_hull.size() >= 2 && cross(upper_hull[upper_hull.size() - 2], upper_hull[upper_hull.size() - 1], points_sorted(i)) <= 0) {\n upper_hull.pop_back();\n }\n upper_hull.push_back(points_sorted(i));\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lower_hull.size() >= 2 && cross(lower_hull[lower_hull.size() - 2], lower_hull[lower_hull.size() - 1], points_sorted(i)) >= 0) {\n lower_hull.pop_back();\n }\n lower_hull.push_back(points_sorted(i));\n }\n\n // Merge the upper and lower hulls\n for (int i = lower_hull.size() - 2; i >= 0; i--) {\n upper_hull.push_back(lower_hull[i]);\n }\n\n // Copy the convex hull to the output view\n Kokkos::parallel_for(Kokkos::RangePolicy(0, upper_hull.size()), KOKKOS_LAMBDA(const int &i) {\n hull(i) = upper_hull[i];\n });\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = Kokkos::sort(points, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, points_sorted.extent(0)),\n KOKKOS_LAMBDA(const int &i, auto &hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n hull.push_back(points_sorted(i));\n } else {\n while (hull.size() >= 2 &&\n (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points_sorted(i).x - hull[hull.size() - 1].x) <\n (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points_sorted(i).y - hull[hull.size() - 1].y)) {\n hull.pop_back();\n }\n hull.push_back(points_sorted(i));\n }\n },\n hull);\n\n auto lower_hull = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(points_sorted.extent(0) - 1, -1, -1),\n KOKKOS_LAMBDA(const int &i, auto &hull) {\n if (i == 0 || i == points_sorted.extent(0) - 1) {\n hull.push_back(points_sorted(i));\n } else {\n while (hull.size() >= 2 &&\n (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points_sorted(i).x - hull[hull.size() - 1].x) <\n (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points_sorted(i).y - hull[hull.size() - 1].y)) {\n hull.pop_back();\n }\n hull.push_back(points_sorted(i));\n }\n },\n hull);\n\n // Merge the upper and lower hulls\n hull.resize(upper_hull.size() + lower_hull.size() - 1);\n std::copy(upper_hull.begin(), upper_hull.end(), hull.begin());\n std::copy(lower_hull.begin() + 1, lower_hull.end(), hull.begin() + upper_hull.size());\n}\n", + "\n\n using namespace Kokkos;\n\n const int num_points = points.extent(0);\n const int num_dimensions = points.extent(1);\n const int num_hull_points = 0;\n\n // Find the point with the minimum y-coordinate.\n int min_y_index = 0;\n for (int i = 1; i < num_points; i++) {\n if (points(i, 1) < points(min_y_index, 1)) {\n min_y_index = i;\n }\n }\n\n // Swap the point with the minimum y-coordinate to the first position.\n Point temp = points(0, 0);\n points(0, 0) = points(min_y_index, 0);\n points(min_y_index, 0) = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n auto compare_polar_angle = [&](const Point &a, const Point &b) {\n double angle_a = atan2(a.y - points(0, 1), a.x - points(0, 0));\n double angle_b = atan2(b.y - points(0, 1), b.x - points(0, 0));\n return angle_a < angle_b;\n };\n Kokkos::sort(Kokkos::RangePolicy(1, num_points), points, 
compare_polar_angle);\n\n // Find the convex hull using Graham's scan.\n int hull_size = 0;\n for (int i = 0; i < num_points; i++) {\n while (hull_size >= 2 && (points(hull[hull_size - 2], 0) - points(hull[hull_size - 1], 0)) * (points(i, 1) - points(hull[hull_size - 1], 1)) - (points(i, 0) - points(hull[hull_size - 1], 0)) * (points(hull[hull_size - 2], 1) - points(hull[hull_size - 1], 1)) < 0) {\n hull_size--;\n }\n hull[hull_size++] = i;\n }\n\n // Resize the hull to the correct size.\n hull = Kokkos::View(hull, hull_size);\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n Kokkos::sort(Kokkos::RangePolicy(0, points.extent(0)), points_sorted, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n auto upper_hull = hull;\n auto lower_hull = hull;\n int upper_size = 0;\n int lower_size = 0;\n\n // Add the first point to the upper hull\n upper_hull(upper_size++) = points_sorted(0);\n\n // Add the last point to the lower hull\n lower_hull(lower_size++) = points_sorted(points_sorted.extent(0) - 1);\n\n // Iterate over the remaining points\n for (int i = 1; i < points_sorted.extent(0) - 1; i++) {\n // Add the point to the upper hull\n while (upper_size >= 2 && (upper_hull(upper_size - 2).y - upper_hull(upper_size - 1).y) * (points_sorted(i).x - upper_hull(upper_size - 1).x) < (upper_hull(upper_size - 1).y - points_sorted(i).y) * (upper_hull(upper_size - 2).x - upper_hull(upper_size - 1).x)) {\n upper_size--;\n }\n upper_hull(upper_size++) = points_sorted(i);\n\n // Add the point to the lower hull\n while (lower_size >= 2 && (lower_hull(lower_size - 2).y - lower_hull(lower_size - 1).y) * (points_sorted(i).x - lower_hull(lower_size - 1).x) < (lower_hull(lower_size - 1).y - points_sorted(i).y) * (lower_hull(lower_size - 2).x - lower_hull(lower_size - 1).x)) {\n lower_size--;\n }\n lower_hull(lower_size++) = points_sorted(i);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_size + lower_size - 2;\n for (int i = 0; i < upper_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_size; i < hull_size; i++) {\n hull(i) = lower_hull(lower_size - (i - upper_size + 1));\n }\n\n // Resize the hull to the correct size\n hull.resize(hull_size);\n}\n", + "\n\n // Find the leftmost point\n auto leftmost = points.reduce(\n KOKKOS_LAMBDA(const Point &a, const Point &b) { return a.x < b.x ? 
a : b; },\n points.extent(0));\n\n // Sort the points by the angle they make with the leftmost point\n Kokkos::View sorted_points(\"sorted_points\", points.extent(0));\n Kokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i) {\n sorted_points[i] = &points[i];\n });\n\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n sorted_points,\n KOKKOS_LAMBDA(const Point *&a, const Point *&b) {\n return std::atan2(a->y - leftmost->y, a->x - leftmost->x) <\n std::atan2(b->y - leftmost->y, b->x - leftmost->x);\n });\n\n // Find the convex hull using Graham scan\n Kokkos::View stack(\"stack\", points.extent(0));\n int top = 0;\n for (int i = 0; i < points.extent(0); i++) {\n Point *p = sorted_points[i];\n while (top >= 2 &&\n (stack[top - 1]->y - stack[top - 2]->y) * (p->x - stack[top - 2]->x) <\n (stack[top - 1]->x - stack[top - 2]->x) * (p->y - stack[top - 2]->y))\n top--;\n stack[top++] = p;\n }\n\n // Copy the convex hull to the output view\n Kokkos::parallel_for(Kokkos::RangePolicy(0, top),\n KOKKOS_LAMBDA(const int &i) { hull[i] = stack[i]; });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int N = points.extent(0);\n const int HULL_SIZE = 2 * N;\n\n // Sort the points by their x-coordinates\n auto points_sorted = points;\n parallel_sort(Kokkos::Cuda(), points_sorted, [=](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the lower and upper hulls\n auto lower_hull = hull.slice(0, N);\n auto upper_hull = hull.slice(N, HULL_SIZE);\n\n // Find the lower hull\n int lower_hull_size = 0;\n for (int i = 0; i < N; i++) {\n while (lower_hull_size >= 2 && cross(lower_hull[lower_hull_size - 2], lower_hull[lower_hull_size - 1], points_sorted[i]) <= 0) {\n lower_hull_size--;\n }\n lower_hull[lower_hull_size++] = points_sorted[i];\n }\n\n // Find the upper hull\n int upper_hull_size = 0;\n for (int i = N - 1; i >= 0; i--) {\n while (upper_hull_size >= 2 && cross(upper_hull[upper_hull_size - 2], upper_hull[upper_hull_size - 1], points_sorted[i]) <= 0) {\n upper_hull_size--;\n }\n upper_hull[upper_hull_size++] = points_sorted[i];\n }\n\n // Merge the lower and upper hulls\n int hull_size = lower_hull_size + upper_hull_size - 1;\n for (int i = 0; i < hull_size; i++) {\n hull[i] = (i < lower_hull_size) ? 
lower_hull[i] : upper_hull[i - lower_hull_size];\n }\n}\n", + "\n\n using namespace Kokkos;\n\n // Sort points by x-coordinate\n auto points_view = Kokkos::subview(points, Kokkos::ALL(), Kokkos::ALL(), Kokkos::Rank<2>{0, 1});\n auto sorted_points = points_view;\n Kokkos::sort(Kokkos::RangePolicy(0, points_view.extent(0)),\n KOKKOS_LAMBDA(const int i, const int j) {\n return sorted_points(i, 0).x < sorted_points(j, 0).x;\n });\n\n // Find the upper and lower hulls\n auto upper_hull = Kokkos::View(Kokkos::ALL(), \"Upper Hull\");\n auto lower_hull = Kokkos::View(Kokkos::ALL(), \"Lower Hull\");\n int upper_size = 0, lower_size = 0;\n\n // Find the upper hull\n for (int i = 0; i < points_view.extent(0); i++) {\n while (upper_size >= 2 &&\n (upper_hull(upper_size - 1).y - upper_hull(upper_size - 2).y) *\n (sorted_points(i, 0).x - upper_hull(upper_size - 2).x) <\n (upper_hull(upper_size - 1).x - upper_hull(upper_size - 2).x) *\n (sorted_points(i, 0).y - upper_hull(upper_size - 2).y)) {\n upper_size--;\n }\n upper_hull(upper_size++) = sorted_points(i, 0);\n }\n\n // Find the lower hull\n for (int i = points_view.extent(0) - 1; i >= 0; i--) {\n while (lower_size >= 2 &&\n (lower_hull(lower_size - 1).y - lower_hull(lower_size - 2).y) *\n (sorted_points(i, 0).x - lower_hull(lower_size - 2).x) <\n (lower_hull(lower_size - 1).x - lower_hull(lower_size - 2).x) *\n (sorted_points(i, 0).y - lower_hull(lower_size - 2).y)) {\n lower_size--;\n }\n lower_hull(lower_size++) = sorted_points(i, 0);\n }\n\n // Merge the upper and lower hulls\n int hull_size = upper_size + lower_size - 2;\n hull = Kokkos::View(hull_size, \"Convex Hull\");\n for (int i = 0; i < upper_size; i++) {\n hull(i) = upper_hull(i);\n }\n for (int i = upper_size; i < hull_size; i++) {\n hull(i) = lower_hull(i - upper_size + lower_size - 1);\n }\n}\n", + "\n\n using namespace Kokkos;\n\n const int N = points.extent(0);\n const int M = hull.extent(0);\n\n auto points_d = create_mirror_view(points);\n auto hull_d = create_mirror_view(hull);\n\n parallel_for(N, KOKKOS_LAMBDA(const int i) {\n points_d(i) = points(i);\n });\n\n parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &min_index) {\n if (points_d(i).y < points_d(min_index).y || (points_d(i).y == points_d(min_index).y && points_d(i).x < points_d(min_index).x)) {\n min_index = i;\n }\n }, 0);\n\n int p = 0;\n hull_d(p) = points_d(min_index);\n\n int q;\n do {\n q = (p + 1) % N;\n for (int i = 0; i < N; i++) {\n if (orientation(points_d(p), points_d(i), points_d(q)) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != min_index);\n\n int count = 0;\n do {\n hull_d(count) = points_d(p);\n count++;\n q = (p + 1) % N;\n for (int i = 0; i < N; i++) {\n if (orientation(points_d(p), points_d(i), points_d(q)) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != min_index);\n\n parallel_for(count, KOKKOS_LAMBDA(const int i) {\n hull(i) = hull_d(i);\n });\n}\n", + "\n\n // Sort the points by their x-coordinates\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n KOKKOS_LAMBDA(const int &i, const int &j) { return points(i)->x < points(j)->x; });\n\n // Find the leftmost and rightmost points\n const Point *leftmost = points(0);\n const Point *rightmost = points(points.extent(0) - 1);\n\n // Create a stack to store the points on the hull\n std::stack stack;\n\n // Add the leftmost and rightmost points to the stack\n stack.push(leftmost);\n stack.push(rightmost);\n\n // Iterate over the remaining points\n for (int i = 1; i < points.extent(0) - 1; i++) {\n const Point *p = points(i);\n\n // 
Check if the point is to the left of the line defined by the last two points on the stack\n if (orientation(stack.top(), stack.top()->next, p) == LEFT) {\n // If it is, pop the last point from the stack\n stack.pop();\n // And add the new point to the stack\n stack.push(p);\n }\n }\n\n // Copy the points from the stack to the hull\n int hull_size = stack.size();\n for (int i = 0; i < hull_size; i++) {\n hull(i) = stack.top();\n stack.pop();\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Find the point with the smallest y-coordinate.\n int min_index = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].y < points[min_index].y) {\n min_index = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n std::swap(points[0], points[min_index]);\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points.begin() + 1, points.end(), [&](const Point& a, const Point& b) {\n return (atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x));\n });\n\n // Initialize the convex hull with the first three points.\n hull.push_back(points[0]);\n hull.push_back(points[1]);\n hull.push_back(points[2]);\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < points.size(); i++) {\n while (hull.size() >= 2 && (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y) - (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) >= 0) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Find the first point in the upper hull that is to the right of the current point\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Find the first point in the lower hull that is to the left of the current point\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), 
upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n Point leftmost = points[0];\n for (const Point& p : points) {\n if (p.x < leftmost.x) {\n leftmost = p;\n }\n }\n\n // Sort the points by their angle with the leftmost point\n std::sort(points.begin(), points.end(), [&leftmost](const Point& a, const Point& b) {\n return std::atan2(a.y - leftmost.y, a.x - leftmost.x) < std::atan2(b.y - leftmost.y, b.x - leftmost.x);\n });\n\n // Find the convex hull using Graham's scan\n std::stack convex_hull;\n convex_hull.push(leftmost);\n convex_hull.push(points[1]);\n for (size_t i = 2; i < points.size(); i++) {\n Point top = convex_hull.top();\n while (convex_hull.size() >= 2 && (top.x - convex_hull.at(convex_hull.size() - 2).x) * (points[i].y - top.y) - (top.y - convex_hull.at(convex_hull.size() - 2).y) * (points[i].x - top.x) < 0) {\n convex_hull.pop();\n top = convex_hull.top();\n }\n convex_hull.push(points[i]);\n }\n\n // Copy the convex hull to the output vector\n while (!convex_hull.empty()) {\n hull.push_back(convex_hull.top());\n convex_hull.pop();\n }\n}\n", + ".", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the leftmost and rightmost points\n Point leftmost = points[0];\n Point rightmost = points[points.size() - 1];\n\n // Find the points that are on the upper and lower hulls\n std::vector upperHull;\n std::vector lowerHull;\n\n #pragma omp parallel sections\n {\n #pragma omp section\n {\n upperHull.push_back(leftmost);\n for (int i = 1; i < points.size(); i++) {\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n }\n }\n\n #pragma omp section\n {\n lowerHull.push_back(rightmost);\n for (int i = points.size() - 2; i >= 0; i--) {\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) <= 0) {\n 
lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n }\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size());\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.begin(), lowerHull.end());\n\n // Remove the duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the lower hull\n for (const Point& point : points) {\n while (hull.size() >= 2 && (hull.back() - hull[hull.size() - 2]).cross(point - hull.back()) <= 0) {\n hull.pop_back();\n }\n hull.push_back(point);\n }\n\n // Find the upper hull\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() >= 2 && (hull.back() - hull[hull.size() - 2]).cross(points[i] - hull.back()) <= 0) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n int n = points.size();\n if (n < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n int l = 0, r = 0;\n hull.push_back(points[l]);\n for (int i = 1; i < n; i++) {\n while (l >= 1 && (points[i].y - hull[l - 1].y) * (hull[l].x - hull[l - 1].x) <= (points[i].x - hull[l - 1].x) * (hull[l].y - hull[l - 1].y))\n l--;\n while (r >= 1 && (points[i].y - hull[r - 1].y) * (hull[r].x - hull[r - 1].x) <= (points[i].x - hull[r - 1].x) * (hull[r].y - hull[r - 1].y))\n r--;\n hull.push_back(points[i]);\n l++;\n r++;\n }\n\n hull.push_back(points[n - 1]);\n for (int i = n - 2; i >= 0; i--) {\n while (l >= 1 && (points[i].y - hull[l - 1].y) * (hull[l].x - hull[l - 1].x) <= (points[i].x - hull[l - 1].x) * (hull[l].y - hull[l - 1].y))\n l--;\n while (r >= 1 && (points[i].y - hull[r - 1].y) * (hull[r].x - hull[r - 1].x) <= (points[i].x - hull[r - 1].x) * (hull[r].y - hull[r - 1].y))\n r--;\n hull.push_back(points[i]);\n l++;\n r++;\n }\n\n hull.pop_back();\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n hull.pop_back();\n}\n", + "\n\n // Find the leftmost 
point\n int l = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[l].x) {\n l = i;\n }\n }\n\n int p = l, q;\n do {\n hull.push_back(points[p]);\n q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == 2) {\n q = i;\n }\n }\n p = q;\n } while (p != l);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) *\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) *\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Combine the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * 
(point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n int l = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[l].x) {\n l = i;\n }\n }\n\n int p = l;\n int q;\n do {\n hull.push_back(points[p]);\n q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == -1) {\n q = i;\n }\n }\n p = q;\n } while (p != l);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Find the leftmost point\n int leftMost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftMost].x) {\n leftMost = i;\n }\n }\n\n // Start from leftmost point, keep moving counterclockwise until we reach the start point again\n int p = leftMost;\n do {\n // Add current point to result\n hull.push_back(points[p]);\n\n // Search for a point 'q' such that the vector 'p-q' has the smallest positive orientation.\n int q = (p + 1) % points.size();\n for (int i = 0; i < points.size(); i++) {\n if (orientation(points[p], points[i], points[q]) == 2) {\n q = i;\n }\n }\n\n // If 'q' is not 'p', then update 'p' and continue\n p = q;\n\n } while (p != leftMost);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (point.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - point.x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n 
while (lowerHull.size() >= 2 &&\n (point.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - point.x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n Point p = points[leftmost];\n hull.push_back(p);\n points.erase(points.begin() + leftmost);\n\n while (points.size() > 0) {\n int next = -1;\n for (int i = 0; i < points.size(); i++) {\n if (next == -1 || (orientation(p, points[i], points[next]) == 2)) {\n next = i;\n }\n }\n p = points[next];\n hull.push_back(p);\n points.erase(points.begin() + next);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <=\n (upperHull[upperHull.size() - 1].y - point.y) *\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >=\n (lowerHull[lowerHull.size() - 1].y - point.y) *\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) * (point.x - upperHull[upperHull.size() - 1].x) <\n (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) * (point.x - lowerHull[lowerHull.size() - 1].x) >\n (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower 
hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "serial", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Find the leftmost point\n int leftMost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftMost].x) {\n leftMost = i;\n }\n }\n\n // Start from the leftmost point and keep moving counter-clockwise until we reach the leftmost point again\n Point p = points[leftMost];\n do {\n hull.push_back(p);\n int next = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i] != p && orientation(p, points[i], points[next]) == 2) {\n next = i;\n }\n }\n p = points[next];\n } while (p != points[leftMost]);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n int n = points.size();\n std::vector up(n), down(n);\n int m = 0, k = 0;\n\n for (int i = 0; i < n; i++) {\n if ((i == 0) || ((i == 1) && (points[i].x == points[i - 1].x) && (points[i].y == points[i - 1].y)))\n continue;\n down[m++] = points[i];\n }\n\n for (int i = n - 1; i >= 0; i--) {\n if ((i == n - 1) || ((i == n - 2) && (points[i].x == points[i + 1].x) && (points[i].y == points[i + 1].y)))\n continue;\n up[k++] = points[i];\n }\n\n hull.clear();\n int t[m + k + 1];\n int q = 0;\n for (int i = 0; i < m; i++)\n t[q++] = i;\n for (int i = k - 1; i >= 0; i--)\n t[q++] = i + m;\n t[q] = t[0];\n t[q + 1] = t[1];\n\n for (int i = 0; i <= q; i++) {\n int a = t[i], b = t[i + 1];\n if (a == b)\n continue;\n int c = t[i + 2];\n if (a != c) {\n hull.push_back(down[a]);\n continue;\n }\n int x1 = down[a].x, y1 = down[a].y;\n int x2 = down[b].x, y2 = down[b].y;\n int x3 = down[c].x, y3 = down[c].y;\n int dx1 = x2 - x1, dy1 = y2 - y1;\n int dx2 = x3 - x2, dy2 = y3 - y2;\n int cross = dx1 * dy2 - dx2 * dy1;\n if (cross < 0)\n hull.push_back(down[a]);\n else if (cross > 0)\n hull.push_back(down[b]);\n else {\n hull.push_back(down[a]);\n hull.push_back(down[b]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n int n = points.size();\n std::vector upper(n);\n std::vector lower(n);\n\n int m = 0;\n for (int i = 0; i < 
n; i++) {\n while (m >= 2 && (upper[m - 1].y - upper[m - 2].y) * (points[i].x - upper[m - 2].x) <=\n (upper[m - 1].x - upper[m - 2].x) * (points[i].y - upper[m - 2].y)) {\n m--;\n }\n upper[m++] = points[i];\n }\n\n m = 0;\n for (int i = n - 1; i >= 0; i--) {\n while (m >= 2 && (lower[m - 1].y - lower[m - 2].y) * (points[i].x - lower[m - 2].x) <=\n (lower[m - 1].x - lower[m - 2].x) * (points[i].y - lower[m - 2].y)) {\n m--;\n }\n lower[m++] = points[i];\n }\n\n for (int i = 0; i < m; i++) {\n hull.push_back(upper[i]);\n }\n for (int i = m - 2; i >= 0; i--) {\n hull.push_back(lower[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x;\n });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.reserve(upperHull.size() + lowerHull.size() - 2);\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Find the first point in the upper hull that is to the right of the current point\n while (upperHull.size() >= 2 &&\n (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x) *\n (point.y - upperHull[upperHull.size() - 1].y) -\n (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n (point.x - upperHull[upperHull.size() - 1].x) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Find the first point in the lower hull that is to the left of the current point\n while (lowerHull.size() >= 2 &&\n (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x) *\n (point.y - lowerHull[lowerHull.size() - 1].y) -\n (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n (point.x - lowerHull[lowerHull.size() - 1].x) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n hull.pop_back();\n}\n", + "source_write_success": true, + "did_build": false, + 
"is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n hull.push_back(points[0]);\n hull.push_back(points[1]);\n\n for (size_t i = 2; i < points.size(); i++) {\n while (hull.size() >= 2 &&\n (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <\n (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n int t = hull.size();\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() > t &&\n (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <\n (points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n hull.pop_back();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n std::vector upper;\n for (auto const& point : points) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper.back(), point) <= 0) {\n upper.pop_back();\n }\n upper.push_back(point);\n }\n upper.pop_back();\n\n std::vector lower;\n for (auto const& point : points) {\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower.back(), point) >= 0) {\n lower.pop_back();\n }\n lower.push_back(point);\n }\n lower.pop_back();\n\n hull.insert(hull.end(), upper.begin(), upper.end());\n hull.insert(hull.end(), lower.rbegin(), lower.rend());\n}\n", + "source_write_success": true, + "did_build": false, + 
"is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n if (points.size() <= 3) {\n hull = points;\n return;\n }\n\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n std::vector upper;\n for (const Point& p : points) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], p) <= 0)\n upper.pop_back();\n upper.push_back(p);\n }\n upper.pop_back();\n\n std::vector lower;\n for (int i = points.size() - 1; i >= 0; i--) {\n const Point& p = points[i];\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], p) <= 0)\n lower.pop_back();\n lower.push_back(p);\n }\n lower.pop_back();\n\n hull.insert(hull.end(), upper.begin(), upper.end());\n hull.insert(hull.end(), lower.rbegin(), lower.rend());\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // If the point is to the left of the current upper hull, add it to the upper hull\n while (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) == -1) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) == 1) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + 
"is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Combine the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n int n = points.size();\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n std::vector upper;\n for (int i = 0; i < n; i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], points[i]) <= 0)\n upper.pop_back();\n upper.push_back(points[i]);\n }\n\n std::vector lower;\n for (int i = n - 1; i >= 0; i--) {\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], points[i]) <= 0)\n lower.pop_back();\n lower.push_back(points[i]);\n }\n\n for (int i = 1; i < upper.size() - 1; i++)\n hull.push_back(upper[i]);\n for (int i = lower.size() - 2; i >= 1; i--)\n hull.push_back(lower[i]);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end() - 1);\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull.back(), point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull.back(), point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend() - 1);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper hull\n for (const Point& point : points) {\n while (hull.size() >= 2 && (hull.back().y - hull[hull.size() - 2].y) * (point.x - hull[hull.size() - 2].x) <=\n (hull.back().x - hull[hull.size() - 2].x) * (point.y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(point);\n }\n\n // Find the lower hull\n for (int i = points.size() - 2; i >= 0; i--) {\n while (hull.size() >= 2 && (hull.back().y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) <=\n (hull.back().x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y)) {\n hull.pop_back();\n }\n hull.push_back(points[i]);\n }\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + 
"did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n // Sort the points by their x-coordinate\n std::sort(points.begin(), points.end(), [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper convex hull\n for (const Point &p : points) {\n while (hull.size() >= 2 && cross(hull[hull.size() - 2], hull[hull.size() - 1], p) <= 0) {\n hull.pop_back();\n }\n hull.push_back(p);\n }\n\n // Find the lower convex hull\n std::reverse(points.begin(), points.end());\n for (const Point &p : points) {\n while (hull.size() >= 2 && cross(hull[hull.size() - 2], hull[hull.size() - 1], p) <= 0) {\n hull.pop_back();\n }\n hull.push_back(p);\n }\n\n // Remove the last point from the lower convex hull\n hull.pop_back();\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) {\n return a.x < b.x;\n });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n for (int i = lowerHull.size() - 2; i >= 0; i--) {\n hull.push_back(lowerHull[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point a, Point b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (Point p : points) {\n // Build the lower hull\n while (lowerHull.size() >= 2 &&\n orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], p) <= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(p);\n\n // Build the upper hull\n while (upperHull.size() >= 2 &&\n orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], p) >= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(p);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), lowerHull.begin(), lowerHull.end());\n for (int i = upperHull.size() - 2; i >= 0; i--) {\n hull.push_back(upperHull[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + 
}, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "hip", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\n__global__ void convexHull(const Point *points, size_t numPoints, Point *hull, size_t hullSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int numThreads = hipGetNumThreads();\n int threadIdx = hipGetThreadIdx();\n int blockIdx = hipGetBlockIdx();\n int blockDim = hipGetBlockDim();\n int gridDim = hipGetGridDim();\n\n // Each thread computes the convex hull of a subset of points\n int start = threadIdx.x + blockIdx.x * blockDim.x;\n int end = fmin(start + numThreads, numPoints);\n\n // Compute the convex hull of the subset of points\n vector subset(end - start);\n for (int i = start; i < end; i++) {\n subset[i - start] = points[i];\n }\n vector hullSubset = convexHull(subset);\n\n // Merge the convex hull of the subset of points with the overall convex hull\n for (int i = 0; i < hullSubset.size(); i++) {\n hull[hullSize + i] = hullSubset[i];\n }\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize > 2 && crossProduct(hull[hullSize - 2], hull[hullSize - 1], points[i]) <= 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size() - 2;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[i] 
= upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[upperHull.size() + i - 1] = lowerHull[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by polar angle with respect to the leftmost point\n std::sort(points, points + numPoints, [leftmost](const Point &a, const Point &b) {\n return atan2(a.y - points[leftmost].y, a.x - points[leftmost].x) < atan2(b.y - points[leftmost].y, b.x - points[leftmost].x);\n });\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last two points in the hull, add it to the hull\n while (hullSize >= 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < numPoints) {\n hull[gid] = points[gid];\n }\n\n __syncthreads();\n\n if (tid == 0) {\n std::sort(hull, hull + numPoints, [](const Point &a, const Point &b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n\n hullSize = m;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < numPoints) {\n hull[gid] = points[gid];\n }\n __syncthreads();\n\n if (tid == 0) {\n sort(hull, hull + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n for (int i = numPoints - 2, t = m + 1; i >= 0; i--) {\n while (t >= m + 1 && (hull[t - 1].y - hull[t - 2].y) * (hull[i].x - hull[t - 1].x) <\n (hull[t - 1].x - hull[t - 2].x) * (hull[i].y - hull[t - 1].y)) {\n t--;\n }\n hull[t++] = hull[i];\n }\n hullSize = t - m;\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n thrust::convex_hull(thrust::device, points, points + numPoints,\n std::back_inserter(upperHull), std::back_inserter(lowerHull));\n\n // Merge the upper and lower hulls into the final hull\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n\n // Return the size of the hull\n *hullSize = upperHull.size() + lowerHull.size();\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // If the point is 
to the left of the current upper hull, add it to the upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) > 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // If the point is to the right of the current lower hull, add it to the lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) < 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Combine the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size() - 2;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[i] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[i + upperHull.size() - 1] = lowerHull[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the minimum point with the first point.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the remaining points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [&](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize > 2 && crossProduct(hull[hullSize - 2], hull[hullSize - 1], points[i]) <= 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by their polar angle with respect to the leftmost point\n auto cmp = [&](const Point &a, const Point &b) {\n return (a.y - points[leftmost].y) * (b.x - points[leftmost].x) - (a.x - points[leftmost].x) * (b.y - points[leftmost].y);\n };\n std::sort(points + leftmost + 1, points + numPoints, cmp);\n\n // Build the convex hull using Graham's scan\n int hullIndex = 0;\n for (int i = leftmost + 1; i < numPoints; i++) {\n while (hullIndex >= 2 && cmp(hull[hullIndex - 1], hull[hullIndex - 2], points[i])) {\n hullIndex--;\n }\n hull[hullIndex++] = points[i];\n }\n\n // Copy the convex hull to the output buffer\n std::copy(hull, hull + hullIndex, hull);\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((hull[hullSize - 1].x - points[i].x) * (hull[hullSize - 1].y - points[leftmost].y) -\n (hull[hullSize - 1].y - points[i].y) * (hull[hullSize - 1].x - points[leftmost].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n\n // If the current point is to the right of the last point in the hull, remove the last point from the hull\n else if ((hull[hullSize - 1].x - 
points[i].x) * (hull[hullSize - 1].y - points[leftmost].y) -\n (hull[hullSize - 1].y - points[i].y) * (hull[hullSize - 1].x - points[leftmost].x) >\n 0) {\n hullSize--;\n }\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the point is to the left of the line defined by the last two points in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hull[hullSize++] = points[i];\n }\n }\n\n // If the last point is not the same as the first point, add it to the hull\n if (hull[hullSize - 1] != hull[0]) {\n hull[hullSize++] = hull[0];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n\n // Find the convex hull using Graham's scan\n int hullSize = 1;\n for (int i = 1; i < numPoints; i++) {\n // Find the next point that is not on the same line as the previous two points\n while (hullSize > 1 &&\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 2].x - hull[hullSize - 1].x) <=\n (points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 2].y - hull[hullSize - 1].y)) {\n hullSize--;\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n }\n\n // If the last point in the hull is not the leftmost point, add it to the hull\n if (hull[hullSize - 1] != points[leftmost]) {\n hull[hullSize] = points[leftmost];\n hullSize++;\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Find the convex hull using Graham's scan\n int p = leftmost;\n do {\n // Find the next point that is not on the same line as the previous two points\n int q = (p + 1) % numPoints;\n for (int i = 0; i < numPoints; i++) {\n if (i != p && i != q &&\n (points[i].y - points[p].y) * (points[q].x - points[p].x) >\n (points[i].x - points[p].x) * (points[q].y - points[p].y)) {\n q = i;\n }\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[q];\n p = q;\n\n } while (p != leftmost);\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n 
}\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the remaining points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [&](const Point &a, const Point &b) {\n double angleA = atan2(a.y - points[0].y, a.x - points[0].x);\n double angleB = atan2(b.y - points[0].y, b.x - points[0].x);\n return angleA < angleB;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // TODO: Implement the convex hull algorithm\n // The kernel is launched with at least as many threads as points.\n // Store the result in `hull`.\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points, points + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (size_t i = 0; i < numPoints; i++) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) <= 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(points[i]);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) >= 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(points[i]);\n }\n\n // Merge the upper and lower hulls into the convex hull\n hullSize = 0;\n for (size_t i = 0; i < upperHull.size(); i++) {\n hull[hullSize++] = upperHull[i];\n }\n for (size_t i = lowerHull.size() - 2; i >= 1; i--) {\n hull[hullSize++] = lowerHull[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the minimum point with the first point.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize > 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n 
// Return the convex hull.\n return;\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of points. 
The final hull is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // 
Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Send the leftmost point to all other ranks\n MPI_Bcast(&leftmost, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the convex hull on each rank\n std::vector localHull;\n grahamScan(points, localHull, leftmost);\n\n // Gather all the local hulls on rank 0\n if (rank == 0) {\n hull = localHull;\n for (int i = 1; i < size; i++) {\n std::vector recvHull;\n MPI_Recv(recvHull.data(), recvHull.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n hull.insert(hull.end(), recvHull.begin(), recvHull.end());\n }\n } else {\n MPI_Send(localHull.data(), localHull.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n\n // Remove duplicate points from the hull\n std::sort(hull.begin(), hull.end());\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull.push_back(points[leftmost]);\n\n // Find the convex hull using Graham's scan\n int p = leftmost;\n do {\n // Find the next point that is not on the same line as the previous two points\n int q = (p + 1) % points.size();\n while (orientation(points[p], points[q], points[(q + 1) % points.size()]) == 0) {\n q = (q + 1) % points.size();\n }\n\n // Add the next point to the hull\n hull.push_back(points[q]);\n\n // Update the current point\n p = q;\n } while (p != leftmost);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the points by x-coordinate\n std::sort(points.begin(), points.end(), [](const Point& a, const Point& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (const Point& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], 
point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n\n // Return the hull on rank 0\n if (rank == 0) {\n return hull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n if (rank == 0) {\n hull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, hull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(hull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) 
{\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Start from the leftmost point, keep moving counter-clockwise\n // until we reach the start point again\n int p = leftmost;\n do {\n hull.push_back(points[p]);\n int q = (p + 1) % points.size();\n\n // Find the most counter-clockwise point from p\n for (int i = 0; i < points.size(); i++) {\n // Skip if i is the same as p or q\n if (i == p || i == q) {\n continue;\n }\n\n // Calculate the cross product of (q - p) x (i - p)\n double cross = (points[q].x - points[p].x) * (points[i].y - points[p].y) -\n (points[q].y - points[p].y) * (points[i].x - points[p].x);\n\n // If the cross product is positive, then i is more counter-clockwise than q\n if (cross > 0) {\n q = i;\n }\n }\n\n // Update p to be q\n p = q;\n } while (p != leftmost);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // If rank 0, find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the leftmost point\n Point leftmost = points[0];\n for (size_t i = 1; i < points.size(); i++) {\n if (points[i].x < leftmost.x) {\n leftmost = points[i];\n }\n }\n\n // Broadcast the leftmost point to all ranks\n MPI_Bcast(&leftmost, sizeof(Point), MPI_BYTE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull on each rank\n std::vector localHull;\n convexHull(points, leftmost, localHull);\n\n // Gather the local convex hull from all ranks\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "cuda", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\n__global__ void convexHull(const Point *points, size_t numPoints, Point *hull, size_t hullSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n if (tid == 0) {\n qsort(hull, numPoints, sizeof(Point), [](const void *a, const void *b) {\n Point *pa = (Point *)a;\n Point *pb = (Point *)b;\n if (pa->x != pb->x) {\n return pa->x < pb->x ? -1 : 1;\n }\n return pa->y < pb->y ? -1 : 1;\n });\n int m = 0;\n for (int i = 0; i < numPoints; ++i) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <\n (hull[m].y - hull[m - 1].y) * (hull[m - 1].x - hull[m - 2].x)) {\n --m;\n }\n hull[m++] = hull[i];\n }\n hullSize = m;\n }\n __syncthreads();\n if (tid < hullSize) {\n hull[tid] = hull[tid];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(b.y - points[0].y, b.x - points[0].x) < atan2(a.y - points[0].y, a.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHull(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.begin(), lowerHull.end(), hull + upperHull.size());\n}\n", + "\n\n // Find the point with the smallest y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the smallest y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, 
b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n double angleA = atan2(a.y - points[0].y, a.x - points[0].x);\n double angleB = atan2(b.y - points[0].y, b.x - points[0].x);\n return angleA < angleB;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n\n if (numPoints <= 1) {\n return;\n }\n\n // Sort the points by their x-coordinates\n thrust::sort(hull, hull + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the leftmost and rightmost points\n Point leftmost = hull[0];\n Point rightmost = hull[numPoints - 1];\n\n // Find the convex hull of the points to the left of the leftmost point\n int leftHullSize = 0;\n convexHull(hull, numPoints - 1, hull, leftHullSize);\n\n // Find the convex hull of the points to the right of the rightmost point\n int rightHullSize = 0;\n convexHull(hull + 1, numPoints - 1, hull + leftHullSize, rightHullSize);\n\n // Merge the two convex hulls\n int hullIndex = 0;\n int leftIndex = 0;\n int rightIndex = 0;\n while (leftIndex < leftHullSize && rightIndex < rightHullSize) {\n if (hull[leftIndex].y < hull[rightIndex].y) {\n hull[hullIndex++] = hull[leftIndex++];\n } else {\n hull[hullIndex++] = hull[rightIndex++];\n }\n }\n while (leftIndex < leftHullSize) {\n hull[hullIndex++] = hull[leftIndex++];\n }\n while (rightIndex < rightHullSize) {\n hull[hullIndex++] = hull[rightIndex++];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device_ptr(points), thrust::device_ptr(points + numPoints),\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector 
lowerHull;\n convexHullHelper(thrust::device_ptr(points), numPoints, &upperHull, &lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), thrust::device_ptr(hull));\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), thrust::device_ptr(hull + upperHull.size()));\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n return atan2(a.y - points[0].y, a.x - points[0].x) < atan2(b.y - points[0].y, b.x - points[0].x);\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points and add them to the convex hull if they are not inside the current convex hull.\n for (int i = 3; i < numPoints; i++) {\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) - (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Find the point with the minimum y-coordinate.\n int minIndex = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].y < points[minIndex].y) {\n minIndex = i;\n }\n }\n\n // Swap the first point with the point with the minimum y-coordinate.\n Point temp = points[0];\n points[0] = points[minIndex];\n points[minIndex] = temp;\n\n // Sort the points by their polar angle with respect to the first point.\n std::sort(points + 1, points + numPoints, [points](const Point &a, const Point &b) {\n double angle1 = atan2(a.y - points[0].y, a.x - points[0].x);\n double angle2 = atan2(b.y - points[0].y, b.x - points[0].x);\n return angle1 < angle2;\n });\n\n // Initialize the convex hull with the first three points.\n hull[0] = points[0];\n hull[1] = points[1];\n hull[2] = points[2];\n int hullSize = 3;\n\n // Iterate over the remaining points.\n for (int i = 3; i < numPoints; i++) {\n // While the convex hull is not convex, remove the last point from the hull.\n while (hullSize >= 3 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) < 0) {\n hullSize--;\n }\n\n // Add the current point to the convex hull.\n hull[hullSize++] = points[i];\n }\n\n // Return the convex hull.\n return;\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Start from the leftmost point and keep moving counter-clockwise\n int hullSize = 0;\n int current = leftmost;\n do {\n // Add the current point to the hull\n hull[hullSize] = points[current];\n hullSize++;\n\n // Find the next point that is most counter-clockwise from the current point\n int next = (current + 1) % numPoints;\n for (int i = 0; i < numPoints; i++) {\n if (orientation(points[current], points[next], points[i]) == 2) {\n next = i;\n }\n }\n\n // Set the current point to the next point\n current = next;\n } while (current != 
leftmost);\n\n // Return the size of the hull\n return hullSize;\n}\n", + "\n\n // Sort the points by their x-coordinate\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull;\n thrust::device_vector lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls\n thrust::copy(upperHull.begin(), upperHull.end(), hull);\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= numPoints) {\n return;\n }\n\n // Find the point with the smallest y-coordinate\n int minIdx = idx;\n for (int i = idx + 1; i < numPoints; i++) {\n if (points[i].y < points[minIdx].y) {\n minIdx = i;\n }\n }\n\n // Swap the point with the smallest y-coordinate to the front of the array\n Point temp = points[idx];\n points[idx] = points[minIdx];\n points[minIdx] = temp;\n\n // Sort the points by their angle with respect to the point with the smallest y-coordinate\n std::sort(points + idx + 1, points + numPoints, [points, idx](const Point &a, const Point &b) {\n return (a.x - points[idx].x) * (b.y - points[idx].y) > (b.x - points[idx].x) * (a.y - points[idx].y);\n });\n\n // Initialize the hull with the first three points\n hull[0] = points[idx];\n hull[1] = points[idx + 1];\n hull[2] = points[idx + 2];\n int hullSize = 3;\n\n // Iterate over the remaining points\n for (int i = idx + 3; i < numPoints; i++) {\n // Check if the current point is inside the hull\n while (hullSize > 2 && (points[i].x - hull[hullSize - 2].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) <=\n (points[i].y - hull[hullSize - 2].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x)) {\n hullSize--;\n }\n\n // Add the current point to the hull\n hull[hullSize] = points[i];\n hullSize++;\n }\n\n // Return the hull\n hullSize = hullSize;\n}\n\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device_ptr(points), thrust::device_ptr(points + numPoints),\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls\n 
thrust::copy(upperHull.begin(), upperHull.end(), thrust::device_ptr(hull));\n thrust::copy(lowerHull.rbegin(), lowerHull.rend(), thrust::device_ptr(hull + upperHull.size()));\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n\n // Find the convex hull using Graham's scan\n int hullSize = 1;\n for (int i = 1; i < numPoints; i++) {\n // Find the next point that is not on the same line as the previous two points\n while (hullSize >= 2 &&\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 2].x - hull[hullSize - 1].x) <=\n (points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 2].y - hull[hullSize - 1].y)) {\n hullSize--;\n }\n\n // Add the next point to the hull\n hull[hullSize++] = points[i];\n }\n}\n", + "\n\n // Sort the points by their x-coordinates.\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the leftmost point.\n Point leftmostPoint = points[0];\n\n // Find the rightmost point.\n Point rightmostPoint = points[numPoints - 1];\n\n // Find the points that are above the line defined by the leftmost and rightmost points.\n thrust::device_vector pointsAboveLine;\n thrust::copy_if(thrust::device, points, points + numPoints,\n thrust::make_device_vector(pointsAboveLine),\n [leftmostPoint, rightmostPoint](const Point &point) {\n return point.y > leftmostPoint.y + (rightmostPoint.y - leftmostPoint.y) *\n (point.x - leftmostPoint.x) /\n (rightmostPoint.x - leftmostPoint.x);\n });\n\n // Find the points that are below the line defined by the leftmost and rightmost points.\n thrust::device_vector pointsBelowLine;\n thrust::copy_if(thrust::device, points, points + numPoints,\n thrust::make_device_vector(pointsBelowLine),\n [leftmostPoint, rightmostPoint](const Point &point) {\n return point.y < leftmostPoint.y + (rightmostPoint.y - leftmostPoint.y) *\n (point.x - leftmostPoint.x) /\n (rightmostPoint.x - leftmostPoint.x);\n });\n\n // Find the convex hull of the points above the line.\n thrust::device_vector hullAboveLine;\n convexHull(thrust::device_ptr(pointsAboveLine.data()), pointsAboveLine.size(),\n thrust::device_ptr(hullAboveLine.data()), hullAboveLine.size());\n\n // Find the convex hull of the points below the line.\n thrust::device_vector hullBelowLine;\n convexHull(thrust::device_ptr(pointsBelowLine.data()), pointsBelowLine.size(),\n thrust::device_ptr(hullBelowLine.data()), hullBelowLine.size());\n\n // Merge the two convex hulls.\n thrust::copy(thrust::device, hullAboveLine.begin(), hullAboveLine.end(), hull);\n thrust::copy(thrust::device, hullBelowLine.begin(), hullBelowLine.end(), hull + hullAboveLine.size());\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < numPoints) {\n hull[tid] = points[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(hull, hull + numPoints, [](const Point &a, const Point &b) { return a.x < b.x; });\n int m = 0;\n for (int i = 0; i < numPoints; i++) {\n while (m >= 2 && (hull[m - 1].y - hull[m - 2].y) * (hull[i].x - hull[m - 1].x) <= (hull[m - 1].x - hull[m - 2].x) * (hull[i].y - hull[m - 1].y)) {\n m--;\n }\n hull[m++] = hull[i];\n }\n hullSize = m;\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n thrust::sort(thrust::device, points, points + numPoints,\n [](const Point &a, const Point &b) { return a.x < b.x; });\n\n // Find the upper and lower 
hulls\n thrust::device_vector upperHull, lowerHull;\n convexHullHelper(points, numPoints, upperHull, lowerHull);\n\n // Merge the upper and lower hulls into the final hull\n hullSize = upperHull.size() + lowerHull.size();\n thrust::copy(thrust::device, upperHull.begin(), upperHull.end(), hull);\n thrust::copy(thrust::device, lowerHull.rbegin(), lowerHull.rend(), hull + upperHull.size());\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < numPoints; i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Initialize the hull with the leftmost point\n hull[0] = points[leftmost];\n int hullSize = 1;\n\n // Iterate over the remaining points\n for (int i = 1; i < numPoints; i++) {\n // If the current point is to the left of the last point in the hull, add it to the hull\n if ((points[i].x - hull[hullSize - 1].x) * (hull[hullSize - 1].y - hull[hullSize - 2].y) -\n (points[i].y - hull[hullSize - 1].y) * (hull[hullSize - 1].x - hull[hullSize - 2].x) <\n 0) {\n hull[hullSize] = points[i];\n hullSize++;\n }\n }\n\n // If the hull has more than 2 points, remove the last point\n if (hullSize > 2) {\n hullSize--;\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. Store the result in `hull`.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of points. The final hull is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n 
}\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Sort the points by their x-coordinates\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n\n // Find the upper and lower hulls\n std::vector upperHull, lowerHull;\n for (auto const& point : points) {\n // Add the point to the upper hull if it is above the current upper hull\n while (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n upperHull.pop_back();\n }\n upperHull.push_back(point);\n\n // Add the point to the lower hull if it is below the current lower hull\n while (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(point);\n }\n\n // Merge the upper and lower hulls\n hull.insert(hull.end(), upperHull.begin(), upperHull.end());\n hull.insert(hull.end(), lowerHull.rbegin(), lowerHull.rend());\n\n // Remove duplicate points\n hull.erase(std::unique(hull.begin(), hull.end()), hull.end());\n\n // Return the hull on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return hull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = points.size();\n std::vector localHull;\n // Find the convex hull of the local points\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(points.begin(), points.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n std::vector upper, lower;\n for (int i = 0; i < n; i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], points[i]) <= 0)\n upper.pop_back();\n upper.push_back(points[i]);\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], points[i]) >= 0)\n lower.pop_back();\n lower.push_back(points[i]);\n }\n localHull.insert(localHull.end(), upper.begin(), upper.end());\n for (int i = lower.size() - 2; i >= 0; i--)\n localHull.push_back(lower[i]);\n }\n }\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n // Find the convex hull of the global points\n if (rank == 0) {\n std::sort(globalHull.begin(), globalHull.end(), [](Point const& a, Point const& b) { return a.x < b.x; });\n std::vector upper, lower;\n for (int i = 0; i < globalHull.size(); i++) {\n while (upper.size() >= 2 && cross(upper[upper.size() - 2], upper[upper.size() - 1], globalHull[i]) <= 0)\n upper.pop_back();\n upper.push_back(globalHull[i]);\n while (lower.size() >= 2 && cross(lower[lower.size() - 2], lower[lower.size() - 1], 
globalHull[i]) >= 0)\n lower.pop_back();\n lower.push_back(globalHull[i]);\n }\n hull.insert(hull.end(), upper.begin(), upper.end());\n for (int i = lower.size() - 2; i >= 0; i--)\n hull.push_back(lower[i]);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n // Find the leftmost point\n int leftmost = 0;\n for (int i = 1; i < points.size(); i++) {\n if (points[i].x < points[leftmost].x) {\n leftmost = i;\n }\n }\n\n // Sort the points by polar angle with respect to the leftmost point\n std::sort(points.begin(), points.end(), [leftmost](Point a, Point b) {\n double dx1 = a.x - points[leftmost].x;\n double dy1 = a.y - points[leftmost].y;\n double dx2 = b.x - points[leftmost].x;\n double dy2 = b.y - points[leftmost].y;\n return (atan2(dy1, dx1) < atan2(dy2, dx2));\n });\n\n // Use Graham's scan to find the convex hull\n hull.push_back(points[leftmost]);\n int m = 1;\n for (int i = 1; i < points.size(); i++) {\n while (m >= 2 && (points[i].x - hull[m - 1].x) * (hull[m].y - hull[m - 1].y) - (points[i].y - hull[m - 1].y) * (hull[m].x - hull[m - 1].x) <= 0) {\n hull.pop_back();\n m--;\n }\n hull.push_back(points[i]);\n m++;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // 
Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(size * localHull.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHull(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Reduce the local hulls to a single global hull\n std::vector globalHull;\n MPI_Reduce(&localHull[0], &globalHull[0], localHull.size() * sizeof(Point), MPI_BYTE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // If this is rank 0, copy the global hull to the output hull\n if (rank == 0) {\n hull = globalHull;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull for each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls\n std::vector globalHull;\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], globalHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.reserve(points.size());\n }\n MPI_Gather(&localHull[0], localHull.size(), MPI_DOUBLE, &globalHull[0], localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHullHelper(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n std::vector globalHull;\n if (rank == 0) {\n globalHull.resize(localHull.size() * size);\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, globalHull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (rank == 0) {\n convexHullHelper(globalHull, hull);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Find the convex hull of the points on each rank\n std::vector 
local_hull;\n convexHull(points, local_hull);\n\n // Gather all the local hulls on rank 0\n std::vector global_hull;\n if (my_rank == 0) {\n global_hull.resize(local_hull.size() * num_procs);\n }\n MPI_Gather(local_hull.data(), local_hull.size(), MPI_DOUBLE, global_hull.data(), local_hull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the global hull\n if (my_rank == 0) {\n convexHull(global_hull, hull);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the convex hull of the points on each rank\n std::vector localHull;\n convexHull(points, localHull);\n\n // Gather all the local hulls into a single vector on rank 0\n if (rank == 0) {\n hull.resize(size * localHull.size());\n }\n MPI_Gather(localHull.data(), localHull.size(), MPI_DOUBLE, hull.data(), localHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Find the convex hull of the gathered points on rank 0\n if (rank == 0) {\n convexHull(hull, hull);\n }\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(Kokkos::View &points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(*convexHull[i], *convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n\t\tKOKKOS_LAMBDA(const int& i, const int& j) {\n\t\t\treturn points(i)->x < points(j)->x;\n\t\t});\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\thull.push_back(points(0));\n\thull.push_back(points(1));\n\tfor (int i = 2; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(hull[hull.size() - 2]->x - hull[hull.size() - 1]->x) * (points(i)->y - hull[hull.size() - 1]->y) -\n\t\t\t(hull[hull.size() - 2]->y - hull[hull.size() - 1]->y) * (points(i)->x - hull[hull.size() - 1]->x) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\tusing policy_type = 
Kokkos::TeamPolicy>;\n\tusing team_member_type = typename policy_type::member_type;\n\n\tconst size_t num_points = points.extent(0);\n\tconst size_t num_teams = (num_points + policy_type::team_size - 1) / policy_type::team_size;\n\n\t// Find the leftmost point\n\tteam_member_type team_member;\n\tPoint leftmost_point = points(0);\n\tfor (size_t i = 1; i < num_points; ++i) {\n\t\tif (points(i).x < leftmost_point.x) {\n\t\t\tleftmost_point = points(i);\n\t\t}\n\t}\n\n\t// Sort the points by polar angle with respect to the leftmost point\n\tstd::vector sorted_points(num_points);\n\tfor (size_t i = 0; i < num_points; ++i) {\n\t\tsorted_points[i] = points(i);\n\t}\n\tstd::sort(sorted_points.begin(), sorted_points.end(), [leftmost_point](Point const& p1, Point const& p2) {\n\t\tdouble angle1 = std::atan2(p1.y - leftmost_point.y, p1.x - leftmost_point.x);\n\t\tdouble angle2 = std::atan2(p2.y - leftmost_point.y, p2.x - leftmost_point.x);\n\t\treturn angle1 < angle2;\n\t});\n\n\t// Compute the perimeter of the convex hull using Graham's scan\n\tstd::vector convex_hull;\n\tconvex_hull.push_back(sorted_points[0]);\n\tconvex_hull.push_back(sorted_points[1]);\n\tfor (size_t i = 2; i < num_points; ++i) {\n\t\twhile (convex_hull.size() >= 2 &&\n\t\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t (sorted_points[i].y - convex_hull[convex_hull.size() - 1].y) -\n\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) *\n\t\t\t\t (sorted_points[i].x - convex_hull[convex_hull.size() - 1].x) < 0) {\n\t\t\tconvex_hull.pop_back();\n\t\t}\n\t\tconvex_hull.push_back(sorted_points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convex_hull.size(); ++i) {\n\t\tperimeter += distance(convex_hull[i], convex_hull[(i + 1) % convex_hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.data(), points.data() + points.extent(0), [](const Point* p1, const Point* p2) { return p1->x < p2->x; });\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector hull;\n\tfor (int i = 0; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tconst Point* p1 = hull[i];\n\t\t\tconst Point* p2 = hull[i + 1];\n\t\t\tconst Point* p3 = points[i];\n\t\t\tdouble crossProduct = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\tsum += crossProduct;\n\t\t}, Kokkos::Sum(0.0)) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tfor (int i = points.extent(0) - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tconst Point* p1 = hull[i];\n\t\t\tconst Point* p2 = hull[i + 1];\n\t\t\tconst Point* p3 = points[i];\n\t\t\tdouble crossProduct = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\tsum += crossProduct;\n\t\t}, Kokkos::Sum(0.0)) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points\n\tstd::vector 
hull;\n\tconvex_hull(points.data(), points.size(), std::back_inserter(hull));\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)),\n\t\tKOKKOS_LAMBDA(const int& i, const int& j) {\n\t\t\treturn points(i)->x < points(j)->x;\n\t\t});\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points(0));\n\tconvexHull.push_back(points(points.extent(0) - 1));\n\tfor (int i = 1; i < points.extent(0) - 1; i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2]->y - convexHull[convexHull.size() - 1]->y) *\n\t\t\t(convexHull[convexHull.size() - 1]->x - points(i)->x) <\n\t\t\t(convexHull[convexHull.size() - 1]->y - points(i)->y) *\n\t\t\t(convexHull[convexHull.size() - 2]->x - convexHull[convexHull.size() - 1]->x)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(*convexHull[i], *convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tgrahamScan(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Find the convex hull of the points\n\tstd::vector hull_points;\n\tconvex_hull(points, hull_points);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull_points.size(); ++i) {\n\t\tsize_t j = (i + 1) % hull_points.size();\n\t\tperimeter += distance(hull_points[i], hull_points[j]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points\n\tstd::vector hull;\n\tconvex_hull(points.data(), points.extent(0), std::back_inserter(hull));\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.extent(0));\n\tif (points.extent(0) < 3) {\n\t\tfor (int i = 0; i < points.extent(0); ++i) 
{\n\t\t\tconvexHull.push_back(points[i]);\n\t\t}\n\t} else {\n\t\t// Find the leftmost point.\n\t\tint leftmost = 0;\n\t\tfor (int i = 1; i < points.extent(0); ++i) {\n\t\t\tif (points[i].x < points[leftmost].x) {\n\t\t\t\tleftmost = i;\n\t\t\t}\n\t\t}\n\n\t\t// Start from the leftmost point and keep moving clockwise until we reach the leftmost point again.\n\t\tint current = leftmost;\n\t\tdo {\n\t\t\tconvexHull.push_back(points[current]);\n\t\t\tint next = (current + 1) % points.extent(0);\n\t\t\tfor (int i = 0; i < points.extent(0); ++i) {\n\t\t\t\tif (orientation(points[current], points[next], points[i]) == 2) {\n\t\t\t\t\tnext = i;\n\t\t\t\t}\n\t\t\t}\n\t\t\tcurrent = next;\n\t\t} while (current != leftmost);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tusing namespace Kokkos;\n\n\t// Compute the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using the Graham scan algorithm.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tKokkos::parallel_sort(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const int& i, const int& j) {\n\t\treturn points(i)->x < points(j)->x;\n\t});\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\tfor (int i = 0; i < points.extent(0); i++) {\n\t\twhile (hull.size() >= 2 && Kokkos::parallel_reduce(Kokkos::RangePolicy(0, hull.size() - 1), 
KOKKOS_LAMBDA(const int& i, double& sum) {\n\t\t\tPoint* p1 = hull[i];\n\t\t\tPoint* p2 = hull[i + 1];\n\t\t\tPoint* p3 = points(i);\n\t\t\tdouble cross = (p2->x - p1->x) * (p3->y - p1->y) - (p2->y - p1->y) * (p3->x - p1->x);\n\t\t\treturn sum + cross;\n\t\t}, 0.0) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points(i));\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(*hull[i], *hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[points.size() - 1]);\n\n\tfor (int i = 1; i < points.size() - 1; i++) {\n\t\tPoint const& p = points[i];\n\t\tif (convexHull.size() < 2 || (convexHull.back().x - convexHull[convexHull.size() - 2].x) * (p.y - convexHull.back().y) - (convexHull.back().y - convexHull[convexHull.size() - 2].y) * (p.x - convexHull.back().x) > 0) {\n\t\t\tconvexHull.push_back(p);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector 
convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && crossProduct(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && (upperHull.back().y - upperHull[upperHull.size() - 2].y) * (point.x - upperHull.back().x) < (upperHull.back().x - upperHull[upperHull.size() - 2].x) * (point.y - upperHull.back().y)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\t\twhile (lowerHull.size() >= 2 && (lowerHull.back().y - lowerHull[lowerHull.size() - 2].y) * (point.x - lowerHull.back().x) > (lowerHull.back().x - lowerHull[lowerHull.size() - 2].x) * (point.y - lowerHull.back().y)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 2; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\tfor (int i = 2; i < n; i++) {\n\t\twhile (hull.size() >= 2 && crossProduct(hull[hull.size() - 2], hull[hull.size() - 1], points[i]) <= 0) 
{\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t(points[i].y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t(points[i].x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) > 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tstd::vector hull;\n\tstd::vector visited(points.size(), false);\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\tint current = leftmost;\n\tdo {\n\t\thull.push_back(points[current]);\n\t\tvisited[current] = true;\n\n\t\t// Find the next point on the hull\n\t\tint next = -1;\n\t\tfor (int i = 0; i < points.size(); i++) {\n\t\t\tif (!visited[i] && (next == -1 || points[i].x * hull.back().y - points[i].y * hull.back().x > points[next].x * hull.back().y - points[next].y * hull.back().x)) {\n\t\t\t\tnext = i;\n\t\t\t}\n\t\t}\n\t\tcurrent = next;\n\t} while (current != leftmost);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[i + 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& 
p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (point.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) >\n\t\t\t\t (upperHull[upperHull.size() - 1].y - point.y) * (upperHull[upperHull.size() - 2].x - point.x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (point.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) <\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - point.y) * (lowerHull[lowerHull.size() - 2].x - point.x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n\t\t\t\t (point.x - upperHull[upperHull.size() - 1].x) <=\n\t\t\t\t (upperHull[upperHull.size() - 1].y - point.y) *\n\t\t\t\t\t (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n\t\t\t\t (point.x - lowerHull[lowerHull.size() - 1].x) >=\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - point.y) *\n\t\t\t\t\t (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; --i) 
{\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n < 3) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::vector visited(n, false);\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\tint p = leftmost;\n\tdo {\n\t\thull.push_back(points[p]);\n\t\tvisited[p] = true;\n\t\tint q = (p + 1) % n;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\tif (!visited[i] && (orientation(points[p], points[i], points[q]) == 2)) {\n\t\t\t\tq = i;\n\t\t\t}\n\t\t}\n\t\tp = q;\n\t} while (p != leftmost);\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (Point const& p : points) {\n\t\twhile (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], p) <= 0)\n\t\t\tupperHull.pop_back();\n\t\tupperHull.push_back(p);\n\n\t\twhile (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], p) >= 0)\n\t\t\tlowerHull.pop_back();\n\t\tlowerHull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); ++i) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::vector visited(n, false);\n\tint p = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x < points[p].x) {\n\t\t\tp = i;\n\t\t}\n\t}\n\tint q = 0;\n\tfor (int i = 1; i < n; i++) {\n\t\tif (points[i].x > points[q].x) {\n\t\t\tq = i;\n\t\t}\n\t}\n\thull.push_back(points[p]);\n\thull.push_back(points[q]);\n\tvisited[p] = true;\n\tvisited[q] = true;\n\twhile (p != q) {\n\t\tint next = -1;\n\t\tdouble max_dist = -1;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\tif (!visited[i]) {\n\t\t\t\tdouble dist = distance(points[i], hull.back());\n\t\t\t\tif (dist > max_dist) {\n\t\t\t\t\tmax_dist = dist;\n\t\t\t\t\tnext = i;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\thull.push_back(points[next]);\n\t\tvisited[next] = true;\n\t\tp = q;\n\t\tq = next;\n\t}\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates.\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull of the points.\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && crossProduct(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (int i = 0; i < 
convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinate\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (auto const& point : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tif (n <= 2) {\n\t\treturn 0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tupperHull.push_back(points[0]);\n\tupperHull.push_back(points[1]);\n\tlowerHull.push_back(points[0]);\n\tlowerHull.push_back(points[n - 1]);\n\n\tfor (int i = 2; i < n; i++) {\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t (upperHull[upperHull.size() - 2].y - upperHull[upperHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - upperHull[upperHull.size() - 1].x) <=\n\t\t\t\t (upperHull[upperHull.size() - 1].y - points[i].y) *\n\t\t\t\t\t (upperHull[upperHull.size() - 2].x - upperHull[upperHull.size() - 1].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t (lowerHull[lowerHull.size() - 2].y - lowerHull[lowerHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - lowerHull[lowerHull.size() - 1].x) >=\n\t\t\t\t (lowerHull[lowerHull.size() - 1].y - points[i].y) *\n\t\t\t\t\t (lowerHull[lowerHull.size() - 2].x - lowerHull[lowerHull.size() - 1].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (int i = 0; i < lowerHull.size() - 1; i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tperimeter += distance(upperHull[upperHull.size() - 1], lowerHull[lowerHull.size() - 1]);\n\n\treturn perimeter;\n}\n", + "\n\n\tint n = points.size();\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(points[hull[i]], points[hull[(i + 1) % hull.size()]]);\n\t}\n\treturn perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the lower and upper 
hulls\n\tstd::vector lowerHull, upperHull;\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && orientation(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], point) == -1) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\tfor (auto const& point : points) {\n\t\twhile (upperHull.size() >= 2 && orientation(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], point) == 1) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(point);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < lowerHull.size() - 1; ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tfor (size_t i = upperHull.size() - 1; i > 0; --i) {\n\t\tperimeter += distance(upperHull[i], upperHull[i - 1]);\n\t}\n\treturn perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (auto& point : points) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t (point.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t\t (hull.back().y - hull[hull.size() - 2].y) * (point.x - hull.back().x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(point);\n\t}\n\n\tint t = hull.size() - 1;\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() > t &&\n\t\t\t (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t\t (hull.back().y - hull[hull.size() - 2].y) * (points[i].x - hull.back().x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <= (points[i].x - hull.back().x) * (hull[hull.size() - 2].y - hull.back().y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % 
hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (p.x - hull[hull.size() - 2].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the convex hull using Graham's scan algorithm\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && orientation(convexHull[convexHull.size() - 2], convexHull[convexHull.size() - 1], points[i]) <= 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.empty()) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) <=\n\t\t\t(points[i].y - hull.back().y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + 
"source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() < 3) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n\t\treturn a.x < b.x;\n\t});\n\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& a, Point const& b) {\n\t\treturn a.y < b.y;\n\t});\n\n\tfor (auto const& point : points) {\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\twhile (lowerHull.size() >= 2 && area(lowerHull.end()[-2], lowerHull.end()[-1], point) < 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(point);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < lowerHull.size(); ++i) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (p.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (p.y - hull[hull.size() - 2].y) * (hull.back().x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tfor (int i = points.size() - 1; i >= 0; 
i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (p.y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (p.y - hull[hull.size() - 2].y) * (hull.back().x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\thull.pop_back();\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tstd::vector hull;\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull.back().y) * (hull[hull.size() - 2].x - hull.back().x) < (points[i].x - hull.back().x) * (hull[hull.size() - 2].y - hull.back().y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the lower convex hull\n\tfor (Point const& point : points) {\n\t\twhile (hull.size() >= 2 && (point.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) > (point.y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - point.x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(point);\n\t}\n\n\t// Find the upper convex hull\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) > (points[i].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - 
hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\n\tfor (Point const& p : points) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 &&\n\t\t\t(p.y - upperHull[upperHull.size() - 2].y) * (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) >\n\t\t\t(upperHull[upperHull.size() - 1].y - upperHull[upperHull.size() - 2].y) * (p.x - upperHull[upperHull.size() - 2].x)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(p);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 &&\n\t\t\t(p.y - lowerHull[lowerHull.size() - 2].y) * (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) <\n\t\t\t(lowerHull[lowerHull.size() - 1].y - lowerHull[lowerHull.size() - 2].y) * (p.x - lowerHull[lowerHull.size() - 2].x)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(p);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = lowerHull.size() - 1; i > 0; i--) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i - 1]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y) - (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) < 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + 
"source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the leftmost and rightmost points\n\tPoint leftmost = points[0];\n\tPoint rightmost = points[points.size() - 1];\n\n\t// Add the leftmost and rightmost points to the hull\n\thull.push_back(leftmost);\n\thull.push_back(rightmost);\n\n\t// Find the upper and lower tangents of the convex hull\n\tstd::vector upperTangent;\n\tstd::vector lowerTangent;\n\n\t// Find the upper tangent\n\tfor (int i = 1; i < points.size(); i++) {\n\t\twhile (upperTangent.size() >= 2 && crossProduct(upperTangent[upperTangent.size() - 2], upperTangent[upperTangent.size() - 1], points[i]) <= 0) {\n\t\t\tupperTangent.pop_back();\n\t\t}\n\t\tupperTangent.push_back(points[i]);\n\t}\n\n\t// Find the lower tangent\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (lowerTangent.size() >= 2 && crossProduct(lowerTangent[lowerTangent.size() - 2], lowerTangent[lowerTangent.size() - 1], points[i]) <= 0) {\n\t\t\tlowerTangent.pop_back();\n\t\t}\n\t\tlowerTangent.push_back(points[i]);\n\t}\n\n\t// Merge the upper and lower tangents into the hull\n\tfor (int i = 0; i < lowerTangent.size(); i++) {\n\t\thull.push_back(lowerTangent[i]);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (points[i].y - hull[hull.size() - 2].y) - (hull[hull.size() - 1].y - hull[hull.size() - 2].y) * (points[i].x - hull[hull.size() - 2].x) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 2) {\n\t\treturn 0.0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (hull.size() >= 2 
&&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y);\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(points[i].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - points[i].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\t// Find the lower hull\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(p.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(hull[hull.size() - 1].y - p.y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Find the upper hull\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(p.y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) <=\n\t\t\t(hull[hull.size() - 1].y - p.y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 
1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tfor (int i = points.size() - 1; i >= 0; i--) {\n\t\tPoint const& p = points[i];\n\t\twhile (hull.size() >= 2 && (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (p.x - hull[hull.size() - 1].x) < (hull[hull.size() - 1].x - hull[hull.size() - 2].x) * (p.y - hull[hull.size() - 1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\thull.pop_back();\n\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n\tif (points.size() <= 2) {\n\t\treturn 0;\n\t}\n\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) {\n\t\treturn p1.x < p2.x;\n\t});\n\n\thull.push_back(points[0]);\n\thull.push_back(points[1]);\n\n\tfor (size_t i = 2; i < points.size(); i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t(points[i].y - hull[hull.size() - 2].y) * (hull[hull.size() - 1].x - hull[hull.size() - 2].x) -\n\t\t\t(points[i].x - hull[hull.size() - 2].x) * (hull[hull.size() - 1].y - hull[hull.size() - 2].y) <= 0) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\treturn perimeter;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "hip", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distance(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Store the result in perimeter.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\n__global__ void convexHullPerimeter(const Point *points, size_t numPoints, double *perimeter) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points.\n\tsize_t numVertices;\n\tPoint *vertices;\n\tconvexHull(points, numPoints, vertices, numVertices);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0.0;\n\tfor (size_t i = 0; i < numVertices; i++) {\n\t\t*perimeter += distance(vertices[i], vertices[i + 1]);\n\t}\n}\n", + "\n\n\tPoint p0 = points[0];\n\tdouble minX = p0.x;\n\tfor (size_t i = 1; i < numPoints; i++) {\n\t\tif (points[i].x < minX) {\n\t\t\tminX = points[i].x;\n\t\t\tp0 = points[i];\n\t\t}\n\t}\n\n\tstd::vector hull;\n\thull.push_back(p0);\n\n\tPoint next = p0;\n\tdo {\n\t\tPoint bestPoint = points[0];\n\t\tdouble maxAngle = -1;\n\t\tfor (size_t i = 1; i < numPoints; i++) {\n\t\t\tif (points[i] == next) continue;\n\t\t\tdouble angle = atan2(points[i].y - next.y, points[i].x - next.x);\n\t\t\tif (angle > maxAngle) {\n\t\t\t\tmaxAngle = angle;\n\t\t\t\tbestPoint = points[i];\n\t\t\t}\n\t\t}\n\t\thull.push_back(bestPoint);\n\t\tnext = bestPoint;\n\t} while (next != p0);\n\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tstd::vector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tPoint p0 = points[0];\n\tdouble minX = p0.x, maxX = p0.x;\n\tdouble minY = p0.y, maxY = p0.y;\n\tfor (size_t i = 1; i < numPoints; i++) {\n\t\tPoint p = points[i];\n\t\tminX = min(minX, p.x);\n\t\tmaxX = max(maxX, p.x);\n\t\tminY = min(minY, p.y);\n\t\tmaxY = max(maxY, p.y);\n\t}\n\n\tvector upper, lower;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\tPoint p = points[i];\n\t\tif (p.y == minY) {\n\t\t\tlower.push_back(p);\n\t\t}\n\t\tif (p.y == maxY) {\n\t\t\tupper.push_back(p);\n\t\t}\n\t}\n\n\tsort(lower.begin(), lower.end(), [](Point const& p1, Point const& p2) { return p1.x < 
p2.x; });\n\tsort(upper.begin(), upper.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\tvector ch;\n\tch.push_back(lower[0]);\n\tfor (size_t i = 1; i < lower.size(); i++) {\n\t\twhile (ch.size() >= 2 && (ch.back().y - ch[ch.size() - 2].y) * (lower[i].x - ch.back().x) < (ch.back().x - ch[ch.size() - 2].x) * (lower[i].y - ch.back().y)) {\n\t\t\tch.pop_back();\n\t\t}\n\t\tch.push_back(lower[i]);\n\t}\n\n\tfor (size_t i = upper.size() - 1; i >= 0; i--) {\n\t\twhile (ch.size() >= 2 && (ch.back().y - ch[ch.size() - 2].y) * (upper[i].x - ch.back().x) < (ch.back().x - ch[ch.size() - 2].x) * (upper[i].y - ch.back().y)) {\n\t\t\tch.pop_back();\n\t\t}\n\t\tch.push_back(upper[i]);\n\t}\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < ch.size(); i++) {\n\t\tperimeter += distance(ch[i], ch[(i + 1) % ch.size()]);\n\t}\n\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint numBlocks = hipGridDim_x;\n\tint numThreads = hipBlockDim_x;\n\n\tPoint p1 = points[tid];\n\tdouble minDist = DBL_MAX;\n\tPoint p2;\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tif (i == tid) continue;\n\t\tPoint p3 = points[i];\n\t\tdouble dist = distance(p1, p3);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tp2 = p3;\n\t\t}\n\t}\n\n\t__shared__ double sharedMinDist[numBlocks];\n\t__shared__ Point sharedP2[numBlocks];\n\n\tsharedMinDist[bid] = minDist;\n\tsharedP2[bid] = p2;\n\n\t__syncthreads();\n\n\tdouble globalMinDist = DBL_MAX;\n\tPoint globalP2;\n\tfor (int i = 0; i < numBlocks; i++) {\n\t\tif (sharedMinDist[i] < globalMinDist) {\n\t\t\tglobalMinDist = sharedMinDist[i];\n\t\t\tglobalP2 = sharedP2[i];\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*perimeter = 2 * globalMinDist + distance(p1, globalP2);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < numPoints; ++i) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\t// Initialize the convex hull with the leftmost point\n\tstd::vector hull;\n\thull.push_back(points[leftmost]);\n\n\t// Find the convex hull using Graham's scan\n\tint p = leftmost;\n\tdo {\n\t\tint q = (p + 1) % numPoints;\n\t\tfor (int i = 0; i < numPoints; ++i) {\n\t\t\tif (i == p || i == q) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tif (orientation(points[p], points[q], points[i]) == -1) {\n\t\t\t\tq = i;\n\t\t\t}\n\t\t}\n\t\thull.push_back(points[q]);\n\t\tp = q;\n\t} while (p != leftmost);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (int i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the leftmost point\n\tint leftmost = 0;\n\tfor (int i = 1; i < numPoints; i++) {\n\t\tif (points[i].x < points[leftmost].x) {\n\t\t\tleftmost = i;\n\t\t}\n\t}\n\n\t// Initialize the 
convex hull with the leftmost point\n\tvector hull = { points[leftmost] };\n\n\t// Find the next point to add to the convex hull\n\twhile (true) {\n\t\t// Find the point that is farthest from the line connecting the last two points in the convex hull\n\t\tint farthest = -1;\n\t\tdouble maxDistance = 0;\n\t\tfor (int i = 0; i < numPoints; i++) {\n\t\t\t// Skip points that are already in the convex hull\n\t\t\tif (find(hull.begin(), hull.end(), points[i]) != hull.end()) {\n\t\t\t\tcontinue;\n\t\t\t}\n\n\t\t\t// Compute the distance from the point to the line connecting the last two points in the convex hull\n\t\t\tPoint p1 = hull[hull.size() - 2];\n\t\t\tPoint p2 = hull[hull.size() - 1];\n\t\t\tdouble distance = fabs((p2.y - p1.y) * points[i].x - (p2.x - p1.x) * points[i].y + p2.x * p1.y - p2.y * p1.x) / sqrt(pow(p2.y - p1.y, 2) + pow(p2.x - p1.x, 2));\n\n\t\t\t// If the distance is greater than the maximum distance so far, update the farthest point and the maximum distance\n\t\t\tif (distance > maxDistance) {\n\t\t\t\tfarthest = i;\n\t\t\t\tmaxDistance = distance;\n\t\t\t}\n\t\t}\n\n\t\t// If no point is farther from the line connecting the last two points in the convex hull, the convex hull is complete\n\t\tif (farthest == -1) {\n\t\t\tbreak;\n\t\t}\n\n\t\t// Add the farthest point to the convex hull\n\t\thull.push_back(points[farthest]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[i + 1]);\n\t}\n\n\t// Store the perimeter in the output variable\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tstd::sort(points, points + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && (upperHull[upperHull.size() - 1].y - upperHull[upperHull.size() - 2].y) * (points[i].x - upperHull[upperHull.size() - 2].x) < (upperHull[upperHull.size() - 1].x - upperHull[upperHull.size() - 2].x) * (points[i].y - upperHull[upperHull.size() - 2].y)) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && (lowerHull[lowerHull.size() - 1].y - lowerHull[lowerHull.size() - 2].y) * (points[i].x - lowerHull[lowerHull.size() - 2].x) > (lowerHull[lowerHull.size() - 1].x - lowerHull[lowerHull.size() - 2].x) * (points[i].y - lowerHull[lowerHull.size() - 2].y)) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\t*perimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\t*perimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the 
points using the Graham scan algorithm.\n\tvector hull;\n\tfor (size_t i = 0; i < numPoints; ++i) {\n\t Point p = points[i];\n\t while (hull.size() >= 2 && crossProduct(hull[hull.size() - 2], hull.back(), p) <= 0) {\n\t hull.pop_back();\n\t }\n\t hull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Sort the points by their x-coordinate\n\tstd::sort(points, points + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull, lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\t// Add the point to the upper hull if it is above the current upper hull\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], points[i]) < 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(points[i]);\n\n\t\t// Add the point to the lower hull if it is below the current lower hull\n\t\twhile (lowerHull.size() >= 2 && crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], points[i]) > 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\t*perimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\t*perimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points\n\tPoint *hull = new Point[numPoints];\n\tint hullSize = convexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < hullSize; i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hullSize]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 
2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The final result is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[points.size() - 1]);\n\tfor (int i = 1; i < points.size() - 1; i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::vector hullIndices;\n\thullIndices.reserve(points.size());\n\tgrahamScan(points, convexHull, hullIndices);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < 
convexHull.size(); i++) {\n\t\tint nextIndex = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[nextIndex]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2) {\n\t\t\tPoint p1 = convexHull[convexHull.size() - 2];\n\t\t\tPoint p2 = convexHull[convexHull.size() - 1];\n\t\t\tPoint p3 = points[i];\n\t\t\tdouble cross = (p2.x - p1.x) * (p3.y - p2.y) - (p2.y - p1.y) * (p3.x - p2.x);\n\t\t\tif (cross > 0) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint left = 0, right = points.size() - 1;\n\twhile (left < right) {\n\t\tconvexHull.push_back(points[left++]);\n\t\twhile (left < right && (convexHull.back().y - points[left].y) * (convexHull.back().x - points[right].x) <= (convexHull.back().y - points[right].y) * (convexHull.back().x - points[left].x)) {\n\t\t\tright--;\n\t\t}\n\t\tconvexHull.push_back(points[right--]);\n\t\twhile (left < right && (convexHull.back().y - points[left].y) * (convexHull.back().x - points[right].x) <= (convexHull.back().y - points[right].y) * (convexHull.back().x - points[left].x)) {\n\t\t\tleft++;\n\t\t}\n\t}\n\tconvexHull.push_back(points[left]);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y 
!= convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Send the convex hull to rank 0\n\tstd::vector globalConvexHull;\n\tMPI_Gather(&convexHull[0], convexHull.size(), MPI_DOUBLE, &globalConvexHull[0], globalConvexHull.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n\t// Compute the perimeter of the convex hull on rank 0\n\tif (rank == 0) {\n\t\tdouble perimeter = 0;\n\t\tfor (size_t i = 0; i < globalConvexHull.size(); i++) {\n\t\t\tperimeter += distance(globalConvexHull[i], globalConvexHull[(i + 1) % globalConvexHull.size()]);\n\t\t}\n\t\treturn perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint const rank = MPI::COMM_WORLD.Get_rank();\n\tint const size = MPI::COMM_WORLD.Get_size();\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tconvexHull.push_back(points[0]);\n\tfor (size_t i = 1; i < points.size(); ++i) {\n\t\t// Find the first point in the convex hull that is to the left of the line through the last two points\n\t\tsize_t j = convexHull.size() - 1;\n\t\twhile (j > 0 && (convexHull[j].x - convexHull[j-1].x) * (points[i].y - convexHull[j].y) -\n\t\t\t(convexHull[j].y - convexHull[j-1].y) * (points[i].x - convexHull[j].x) < 0) {\n\t\t\t--j;\n\t\t}\n\t\t// Add the point to the convex hull\n\t\tconvexHull.insert(convexHull.begin() + j + 1, points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI::COMM_WORLD.Reduce(&perimeter, &globalPerimeter, 1, MPI::DOUBLE, MPI::SUM, 0);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeters from all ranks to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the global perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the total perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, 
size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (auto const& point : points) {\n\t\tif (convexHull.empty()) {\n\t\t\tconvexHull.push_back(point);\n\t\t} else {\n\t\t\twhile (convexHull.size() >= 2 &&\n\t\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) *\n\t\t\t\t\t(point.y - convexHull[convexHull.size() - 1].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t(point.x - convexHull[convexHull.size() - 1].x) < 0) {\n\t\t\t\tconvexHull.pop_back();\n\t\t\t}\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Calculate the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2 && (points[next] - convexHull.back()).cross(points[i] - convexHull.back()) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || convexHull.back() != point) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t\twhile (convexHull.size() >= 3 &&\n\t\t\t (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 3].x) *\n\t\t\t\t\t (point.y - convexHull[convexHull.size() - 2].y) -\n\t\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 3].y) *\n\t\t\t\t\t (point.x - convexHull[convexHull.size() - 2].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) 
{\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Find the total perimeter of the convex hulls on all ranks\n\tdouble totalPerimeter = 0;\n\tfor (int i = 0; i < size; i++) {\n\t\tif (i == rank) {\n\t\t\tfor (int j = 0; j < convexHull.size(); j++) {\n\t\t\t\ttotalPerimeter += distance(convexHull[j], convexHull[(j + 1) % convexHull.size()]);\n\t\t\t}\n\t\t}\n\t\tMPI_Reduce(&totalPerimeter, &totalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\t}\n\n\t// Return the total perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn totalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Find the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (auto const& point : points) {\n\t\tif (convexHull.empty() || point.x != convexHull.back().x || point.y != convexHull.back().y) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t}\n\n\t// Reduce the convex hulls from all ranks to rank 0\n\tstd::vector globalConvexHull;\n\tMPI_Reduce(&convexHull[0], &globalConvexHull[0], convexHull.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Compute the perimeter of the global convex hull on rank 0\n\tif (rank == 0) {\n\t\tdouble perimeter = 0;\n\t\tfor (size_t i = 0; i < globalConvexHull.size(); i++) {\n\t\t\tperimeter += distance(globalConvexHull[i], globalConvexHull[(i + 1) % globalConvexHull.size()]);\n\t\t}\n\t\treturn perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (auto it = points.begin(); it != points.end(); ++it) {\n\t\twhile (convexHull.size() >= 2 && (it->y - convexHull[convexHull.size() - 2].y) * (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) < (it->x - convexHull[convexHull.size() - 2].x) * (convexHull[convexHull.size() - 1].y - convexHull[convexHull.size() - 2].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(*it);\n\t}\n\tconvexHull.pop_back();\n\tfor (auto it = points.rbegin(); it != points.rend(); ++it) {\n\t\twhile (convexHull.size() >= 2 && (it->y - convexHull[convexHull.size() - 2].y) * (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) < (it->x - convexHull[convexHull.size() - 2].x) * (convexHull[convexHull.size() - 1].y - convexHull[convexHull.size() - 2].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(*it);\n\t}\n\tconvexHull.pop_back();\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, 
&globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tfor (int i = 1; i < points.size(); i++) {\n\t\tint next = (i + 1) % points.size();\n\t\tif (std::abs(points[i].x - points[next].x) < 1e-9 && std::abs(points[i].y - points[next].y) < 1e-9) {\n\t\t\tcontinue;\n\t\t}\n\t\twhile (convexHull.size() >= 2 && (points[next] - convexHull.back()).cross(points[i] - convexHull.back()) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = (i + 1) % convexHull.size();\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& p : points) {\n\t}\n\n\t// Compute the total perimeter\n\tdouble totalPerimeter = 0.0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t}\n\n\t// Reduce the total perimeter to rank 0\n\tdouble finalPerimeter;\n\tMPI_Reduce(&totalPerimeter, &finalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the final perimeter on rank 0\n\tif (rank == 0) {\n\t}\n}\n", + "Hull: 1. 
Synag", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tfor (Point const& point : points) {\n\t\tif (convexHull.empty() || convexHull.back() != point) {\n\t\t\tconvexHull.push_back(point);\n\t\t}\n\t\twhile (convexHull.size() >= 3 &&\n\t\t\t(convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 3].x) *\n\t\t\t\t(point.y - convexHull[convexHull.size() - 2].y) -\n\t\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 3].y) *\n\t\t\t\t(point.x - convexHull[convexHull.size() - 2].x) < 0) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "cuda", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\n__device__ double distance(Point const& p1, Point const& p2) {\n\treturn sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n}\n\n/* Compute the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Store the result in perimeter.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\n__global__ void convexHullPerimeter(const Point *points, size_t numPoints, double *perimeter) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint numBlocks = gridDim.x;\n\tint numThreads = blockDim.x;\n\tint start = tid + bid * numThreads;\n\tdouble localPerimeter = 0;\n\tfor (int i = start; i < numPoints - 1; i++) {\n\t\tlocalPerimeter += distance(points[i], points[i + 1]);\n\t}\n\tlocalPerimeter += distance(points[numPoints - 1], points[0]);\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tdouble globalPerimeter = 0;\n\t\tfor (int i = 0; i < numBlocks; i++) {\n\t\t\tdouble blockPerimeter;\n\t\t\tcudaMemcpy(&blockPerimeter, &localPerimeter, sizeof(double), cudaMemcpyDeviceToHost);\n\t\t\tglobalPerimeter += blockPerimeter;\n\t\t}\n\t\t*perimeter = globalPerimeter;\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham's scan.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\tPoint *convexHull;\n\tsize_t convexHullSize;\n\tconvexHull2(points, numPoints, &convexHull, &convexHullSize);\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < convexHullSize; i++) {\n\t\t*perimeter += distance(convexHull[i], convexHull[(i + 1) % convexHullSize]);\n\t}\n\tfree(convexHull);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid >= numPoints) {\n\t\treturn;\n\t}\n\tPoint p = points[tid];\n\tdouble minDist = DBL_MAX;\n\tPoint closestPoint;\n\tfor (int i = 0; i < numPoints; i++) {\n\t\tif (i == tid) {\n\t\t\tcontinue;\n\t\t}\n\t\tPoint q = points[i];\n\t\tdouble dist = distance(p, q);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tclosestPoint = q;\n\t\t}\n\t}\n\t*perimeter += minDist;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % 
hull.size()]);\n\t}\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tPoint *sortedPoints = new Point[numPoints];\n\tthrust::copy(points, points + numPoints, sortedPoints);\n\tthrust::sort(sortedPoints, sortedPoints + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\twhile (upperHull.size() >= 2 && cross(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], sortedPoints[i]) <= 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(sortedPoints[i]);\n\t\twhile (lowerHull.size() >= 2 && cross(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], sortedPoints[i]) >= 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(sortedPoints[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size(); i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[(i + 1) % upperHull.size()]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size(); i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[(i + 1) % lowerHull.size()]);\n\t}\n\n\t// Store the result in perimeter\n\t*perimeter = perimeter;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\tif (i >= numPoints) return;\n\n\tPoint p = points[i];\n\tdouble minDist = DBL_MAX;\n\tPoint minPoint;\n\tfor (int j = 0; j < numPoints; j++) {\n\t\tif (i == j) continue;\n\t\tPoint q = points[j];\n\t\tdouble dist = distance(p, q);\n\t\tif (dist < minDist) {\n\t\t\tminDist = dist;\n\t\t\tminPoint = q;\n\t\t}\n\t}\n\t*perimeter += minDist;\n}\n", + "Hull.2)", + "\n\n\t// Compute the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in perimeter.\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Sort the points by their x-coordinates\n\tPoint *sortedPoints = new Point[numPoints];\n\tmemcpy(sortedPoints, points, numPoints * sizeof(Point));\n\tstd::sort(sortedPoints, sortedPoints + numPoints, [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\n\t// Find the upper and lower hulls\n\tstd::vector upperHull;\n\tstd::vector lowerHull;\n\tfor (size_t i = 0; i < numPoints; i++) {\n\t\twhile (upperHull.size() >= 2 && crossProduct(upperHull[upperHull.size() - 2], upperHull[upperHull.size() - 1], sortedPoints[i]) <= 0) {\n\t\t\tupperHull.pop_back();\n\t\t}\n\t\tupperHull.push_back(sortedPoints[i]);\n\t\twhile (lowerHull.size() >= 2 && 
crossProduct(lowerHull[lowerHull.size() - 2], lowerHull[lowerHull.size() - 1], sortedPoints[i]) >= 0) {\n\t\t\tlowerHull.pop_back();\n\t\t}\n\t\tlowerHull.push_back(sortedPoints[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < upperHull.size() - 1; i++) {\n\t\tperimeter += distance(upperHull[i], upperHull[i + 1]);\n\t}\n\tfor (size_t i = 0; i < lowerHull.size() - 1; i++) {\n\t\tperimeter += distance(lowerHull[i], lowerHull[i + 1]);\n\t}\n\tperimeter += distance(upperHull[upperHull.size() - 1], lowerHull[lowerHull.size() - 1]);\n\n\t*perimeter = perimeter;\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm\n\tvector hull;\n\tgrahamScan(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull\n\t*perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\t*perimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tvector hull;\n\tconvexHull(points, numPoints, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (size_t i = 0; i < hull.size(); i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Store the result in the output variable.\n\t*perimeter = perimeter;\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < numPoints) {\n\t\tPoint p = points[tid];\n\t\tdouble minDist = DBL_MAX;\n\t\tPoint closestPoint;\n\t\tfor (int i = 0; i < numPoints; i++) {\n\t\t\tif (i == tid) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tPoint q = points[i];\n\t\t\tdouble dist = distance(p, q);\n\t\t\tif (dist < minDist) {\n\t\t\t\tminDist = dist;\n\t\t\t\tclosestPoint = q;\n\t\t\t}\n\t\t}\n\t\tatomicAdd(perimeter, minDist);\n\t}\n}\n" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "11_geometry_convex_hull_perimeter", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. 
The final result is returned on rank 0.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (int i = 0; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.push_back(points[0]);\n\tconvexHull.push_back(points[1]);\n\tfor (int i = 2; i < points.size(); i++) {\n\t\tint next = i + 1;\n\t\tif (next == points.size()) {\n\t\t\tnext = 0;\n\t\t}\n\t\tint prev = i - 1;\n\t\tif (prev == -1) {\n\t\t\tprev = points.size() - 1;\n\t\t}\n\t\tdouble area = (points[next].x - points[prev].x) * (points[i].y - points[prev].y) - (points[next].x - points[prev].x) * (points[i].y - points[next].y);\n\t\tif (area >= 0) {\n\t\t\tconvexHull.push_back(points[i]);\n\t\t}\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tint next = i + 1;\n\t\tif (next == convexHull.size()) {\n\t\t\tnext = 0;\n\t\t}\n\t\tperimeter += distance(convexHull[i], convexHull[next]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < 
p2.x; });\n\tfor (int i = 0; i < points.size(); i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = points.size() - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint n = points.size();\n\tif (n <= 2) {\n\t\tconvexHull = points;\n\t} else {\n\t\tint m = 0;\n\t\tfor (int i = 0; i < n; i++) {\n\t\t\twhile (m >= 2 && (points[i].x - convexHull[m-1].x) * (convexHull[m-2].y - convexHull[m-1].y) - (points[i].y - convexHull[m-1].y) * (convexHull[m-2].x - convexHull[m-1].x) < 0) {\n\t\t\t\tm--;\n\t\t\t}\n\t\t\tconvexHull[m++] = points[i];\n\t\t}\n\t\tfor (int i = n-2, t = m+1; i >= 0; i--) {\n\t\t\twhile (m >= t && (points[i].x - convexHull[m-1].x) * (convexHull[m-2].y - convexHull[m-1].y) - (points[i].y - convexHull[m-1].y) * (convexHull[m-2].x - convexHull[m-1].x) < 0) {\n\t\t\t\tm--;\n\t\t\t}\n\t\t\tconvexHull[m++] = points[i];\n\t\t}\n\t\tconvexHull.resize(m);\n\t}\n\n\t// Compute the perimeter of the convex hull on each rank\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham scan algorithm\n\tstd::vector convexHull = grahamScan(points);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the final result on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else 
{\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t\t (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) *\n\t\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t(convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t(convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Compute the convex hull of the points using Graham scan\n\tstd::vector hull;\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x || (p1.x == p2.x && p1.y < p2.y); });\n\tfor (Point const& p : points) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size()-2].y - hull[hull.size()-1].y) * (p.x - hull[hull.size()-1].x) < (hull[hull.size()-1].x - hull[hull.size()-2].x) * (p.y - hull[hull.size()-1].y)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\tint n = hull.size();\n\tfor (int i = n-2; i >= 0; i--) {\n\t\twhile (hull.size() >= 2 && (hull[hull.size()-2].y - hull[hull.size()-1].y) * (p.x - hull[hull.size()-1].x) < (hull[hull.size()-1].x - hull[hull.size()-2].x) * (p.y - hull[hull.size()-1].y)) 
{\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(p);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < n; i++) {\n\t\tperimeter += distance(hull[i], hull[(i+1)%n]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn -1;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < hull.size(); ++i) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % hull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points\n\tstd::vector hull;\n\tconvexHull(points, hull);\n\n\t// Compute the perimeter of the hull in parallel\n\tdouble perimeter = 0.0;\n\t#pragma omp parallel for reduction(+:perimeter)\n\tfor (int i = 0; i < hull.size(); ++i) {\n\t\tint next = (i + 1) % hull.size();\n\t\tperimeter += distance(hull[i], hull[next]);\n\t}\n\n\t// Return the perimeter\n\treturn perimeter;\n}\n\n", + "\n\n\tint const rank = 0;\n\tint const size = MPI::COMM_WORLD.Get_size();\n\tint const points_per_rank = points.size() / size;\n\tint const remainder = points.size() % size;\n\n\tstd::vector local_points = points;\n\tlocal_points.resize(points_per_rank + (rank < remainder));\n\n\tstd::vector convex_hull;\n\tconvexHull(local_points, convex_hull);\n\n\tdouble local_perimeter = 0;\n\tfor (size_t i = 0; i < convex_hull.size(); ++i) {\n\t\tlocal_perimeter += distance(convex_hull[i], convex_hull[(i+1)%convex_hull.size()]);\n\t}\n\n\tdouble global_perimeter;\n\tMPI::COMM_WORLD.Reduce(&local_perimeter, &global_perimeter, 1, MPI_DOUBLE, MPI_SUM, rank);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint n = points.size();\n\tfor (int i = 0; i < n; i++) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <= (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tfor (int i = n - 2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[i].x - convexHull[convexHull.size() - 1].x) <= (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) * (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of 
the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint left = 0, right = 0;\n\twhile (left < points.size()) {\n\t\twhile (right < points.size() && points[right].x == points[left].x) {\n\t\t\tright++;\n\t\t}\n\t\tint n = right - left;\n\t\tif (n > 1) {\n\t\t\tstd::sort(points.begin() + left, points.begin() + right, [](Point const& p1, Point const& p2) { return p1.y < p2.y; });\n\t\t\tfor (int i = 0; i < n; i++) {\n\t\t\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) * (points[left + i].x - convexHull[convexHull.size() - 1].x) < (convexHull[convexHull.size() - 2].x - convexHull[convexHull.size() - 1].x) * (points[left + i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\t\t\tconvexHull.pop_back();\n\t\t\t\t}\n\t\t\t\tconvexHull.push_back(points[left + i]);\n\t\t\t}\n\t\t}\n\t\tleft = right;\n\t}\n\tconvexHull.push_back(convexHull[0]);\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size() - 1; i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[i + 1]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = points.size();\n\tstd::vector hull;\n\n\t// Find the convex hull of the points using Graham's scan.\n\tfor (int i = 0; i < n; i++) {\n\t\twhile (hull.size() >= 2 &&\n\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points[i].x - hull[hull.size() - 1].x) <\n\t\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\tint m = hull.size();\n\tfor (int i = n - 2; i >= 0; i--) {\n\t\twhile (hull.size() >= m &&\n\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (points[i].x - hull[hull.size() - 1].x) <\n\t\t\t\t (hull[hull.size() - 2].y - hull[hull.size() - 1].y) * (hull[hull.size() - 2].x - hull[hull.size() - 1].x)) {\n\t\t\thull.pop_back();\n\t\t}\n\t\thull.push_back(points[i]);\n\t}\n\thull.pop_back();\n\n\t// Compute the perimeter of the convex hull in parallel.\n\tdouble perimeter = 0;\n\t#pragma omp parallel for reduction(+ : perimeter)\n\tfor (int i = 0; i < m; i++) {\n\t\tperimeter += distance(hull[i], hull[(i + 1) % m]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble global_perimeter;\n\tMPI_Reduce(&perimeter, &global_perimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 
-1;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tfor (Point const& p : points) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size()-2].y - convexHull[convexHull.size()-1].y) * (p.x - convexHull[convexHull.size()-1].x) < (convexHull[convexHull.size()-1].x - convexHull[convexHull.size()-2].x) * (p.y - convexHull[convexHull.size()-1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(p);\n\t}\n\tconvexHull.pop_back();\n\tfor (int i = points.size()-2; i >= 0; i--) {\n\t\twhile (convexHull.size() >= 2 && (convexHull[convexHull.size()-2].y - convexHull[convexHull.size()-1].y) * (points[i].x - convexHull[convexHull.size()-1].x) < (convexHull[convexHull.size()-1].x - convexHull[convexHull.size()-2].x) * (points[i].y - convexHull[convexHull.size()-1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\tconvexHull.pop_back();\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); i++) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i+1)%convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convexHull;\n\tconvexHull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tint start = rank * points.size() / size;\n\tint end = (rank + 1) * points.size() / size;\n\tfor (int i = start; i < end; ++i) {\n\t\twhile (convexHull.size() >= 2 &&\n\t\t\t (convexHull[convexHull.size() - 2].y - convexHull[convexHull.size() - 1].y) *\n\t\t\t\t\t (points[i].x - convexHull[convexHull.size() - 1].x) <\n\t\t\t\t (convexHull[convexHull.size() - 1].x - convexHull[convexHull.size() - 2].x) *\n\t\t\t\t\t (points[i].y - convexHull[convexHull.size() - 1].y)) {\n\t\t\tconvexHull.pop_back();\n\t\t}\n\t\tconvexHull.push_back(points[i]);\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\tif (rank == 
0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan algorithm.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\t// Find the convex hull of the points using Graham's scan.\n\tstd::vector convexHull;\n\tgrahamScan(points, convexHull);\n\n\t// Compute the perimeter of the convex hull.\n\tdouble perimeter = 0.0;\n\tfor (size_t i = 0; i < convexHull.size(); ++i) {\n\t\tperimeter += distance(convexHull[i], convexHull[(i + 1) % convexHull.size()]);\n\t}\n\n\t// Reduce the perimeter to rank 0.\n\tdouble globalPerimeter;\n\tMPI_Reduce(&perimeter, &globalPerimeter, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n\t// Return the perimeter on rank 0.\n\tif (MPI::COMM_WORLD.Get_rank() == 0) {\n\t\treturn globalPerimeter;\n\t} else {\n\t\treturn 0.0;\n\t}\n}\n", + "\n\n\tint const rank = MPI::COMM_WORLD.Get_rank();\n\tint const size = MPI::COMM_WORLD.Get_size();\n\n\t// Compute the convex hull of the points on each rank\n\tstd::vector convex_hull;\n\tconvex_hull.reserve(points.size());\n\tstd::sort(points.begin(), points.end(), [](Point const& p1, Point const& p2) { return p1.x < p2.x; });\n\tconvex_hull.push_back(points[0]);\n\tconvex_hull.push_back(points[1]);\n\tfor (size_t i = 2; i < points.size(); ++i) {\n\t\twhile (convex_hull.size() >= 2 &&\n\t\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t\t (convex_hull[convex_hull.size() - 1].y - points[i].y) -\n\t\t\t\t\t (convex_hull[convex_hull.size() - 1].x - points[i].x) *\n\t\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) <\n\t\t\t\t 0) {\n\t\t\tconvex_hull.pop_back();\n\t\t}\n\t\tconvex_hull.push_back(points[i]);\n\t}\n\tint const convex_hull_size = convex_hull.size();\n\twhile (convex_hull.size() >= 2 &&\n\t\t (convex_hull[convex_hull.size() - 2].x - convex_hull[convex_hull.size() - 1].x) *\n\t\t\t\t (convex_hull[convex_hull.size() - 1].y - convex_hull[0].y) -\n\t\t\t\t (convex_hull[convex_hull.size() - 1].x - convex_hull[0].x) *\n\t\t\t\t (convex_hull[convex_hull.size() - 2].y - convex_hull[convex_hull.size() - 1].y) <\n\t\t\t 0) {\n\t\tconvex_hull.pop_back();\n\t}\n\n\t// Compute the perimeter of the convex hull\n\tdouble perimeter = 0;\n\tfor (int i = 0; i < convex_hull_size; ++i) {\n\t\tperimeter += distance(convex_hull[i], convex_hull[(i + 1) % convex_hull_size]);\n\t}\n\n\t// Reduce the perimeter to rank 0\n\tdouble global_perimeter;\n\tMPI::COMM_WORLD.Allreduce(&perimeter, &global_perimeter, 1, MPI::DOUBLE, MPI::SUM);\n\n\tif (rank == 0) {\n\t\treturn global_perimeter;\n\t} else {\n\t\treturn 0;\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. 
A is an undirected graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &size) {\n if (visited(i) == 0) {\n size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n });\n }\n }, size);\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "\n\n // Initialize the visited array to false for all vertices.\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices.\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0.\n int largestComponentSize = 0;\n\n // Iterate over all vertices.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a depth-first search to find the size of its component.\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n\n // Update the largest component size if necessary.\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n // Return the largest component size.\n return largestComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int componentSize = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &componentSize) {\n if (visited[i] == 0) {\n componentSize++;\n visited[i] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[i][j] == 1 && visited[j] == 0) {\n visited[j] = 1;\n }\n }\n }\n });\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n return maxComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n depthFirstSearch(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n // Initialize the component size array to 0 for all vertices\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize.fill(0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n // If the vertex has not been visited, perform a 
depth-first search to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n\n // Update the largest component size if necessary\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited(i) == 0) {\n component_size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n });\n }\n }, component_size);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, false);\n int largest = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int size = 0;\n std::stack s;\n s.push(i);\n while (!s.empty()) {\n size_t v = s.top();\n s.pop();\n if (!visited[v]) {\n visited[v] = true;\n size++;\n for (size_t j = 0; j < N; j++) {\n if (A(i, j) && !visited[j]) {\n s.push(j);\n }\n }\n }\n }\n largest = std::max(largest, size);\n }\n }\n return largest;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited[i] == 0) {\n component_size++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n visited[j] = 1;\n }\n });\n }\n }, component_size);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n using namespace Kokkos;\n\n // Initialize the visited array to false for all vertices\n View visited(\"visited\", N);\n visited() = false;\n\n // Initialize the component size array to 0 for all vertices\n View componentSize(\"componentSize\", N);\n componentSize() = 0;\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component containing it\n if (!visited(i)) {\n int componentSize = 0;\n DFS(A, i, visited, componentSize);\n\n // Update the largest component size if necessary\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n return largestComponentSize;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n int max_component = 0;\n Kokkos::parallel_reduce(\n \"largestComponent\",\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_component) {\n // Perform a depth-first search to count the number of vertices in the\n // connected component containing vertex i.\n std::vector visited(N, false);\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n int v = stack.top();\n stack.pop();\n if (!visited[v]) {\n visited[v] = true;\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A(i, j) && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n },\n max_component);\n return 
max_component;\n}\n", + "\n\n // Initialize the visited array to false\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n\n // Initialize the component size array to 0\n Kokkos::View componentSize(\"componentSize\", N);\n componentSize() = 0;\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited(i)) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n\n // Update the largest component size\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n });\n\n return largestComponentSize;\n}\n", + "Kokok", + "\n\n // Create a vector to store the size of each component.\n Kokkos::View component_sizes(\"component_sizes\", N);\n\n // Initialize the component sizes to 1.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n component_sizes[i] = 1;\n });\n\n // Iterate over the adjacency matrix.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) {\n // If there is an edge between i and j, then they are in the same component.\n if (A(i, j) == 1) {\n // Find the smaller component size.\n int smaller_component_size = std::min(component_sizes[i], component_sizes[j]);\n\n // Update the component sizes.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int k) {\n if (component_sizes[k] == smaller_component_size) {\n component_sizes[k] = std::max(component_sizes[i], component_sizes[j]);\n }\n });\n }\n }\n });\n\n // Find the largest component size.\n int largest_component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &largest_component_size) {\n largest_component_size = std::max(largest_component_size, component_sizes[i]);\n });\n\n return largest_component_size;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component_size = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int &j, int &local_component_size) {\n if (A(i, j) == 1) {\n local_component_size += 1;\n }\n }, component_size);\n max_component_size = std::max(max_component_size, component_size);\n visited(i) = 1;\n }\n });\n return max_component_size;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t j, int &component_size) {\n if (visited(j) == 0 && A(i, j) == 1) {\n component_size++;\n visited(j) = 1;\n }\n });\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n int max_component = 0;\n Kokkos::parallel_reduce(\n \"Largest Component\",\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_component) {\n int component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n int current = q.front();\n q.pop();\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A(current, j) && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n },\n max_component);\n\n return max_component;\n}\n", + 
"\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n int component_size = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &component_size) {\n if (visited(i) == 0) {\n component_size += 1;\n visited(i) = 1;\n for (size_t j = 0; j < N; ++j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n }\n }\n }\n });\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::TeamPolicy;\n using Kokkos::Work;\n\n // Initialize the visited array\n Kokkos::View visited(\"visited\", N, N);\n parallel_for(TeamPolicy(Kokkos::DefaultExecutionSpace(), N), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) { visited(i, j) = false; }\n });\n\n // Perform DFS on each unvisited node\n int max_component = 0;\n parallel_for(TeamPolicy(Kokkos::DefaultExecutionSpace(), N), KOKKOS_LAMBDA(const int i) {\n if (!visited(i, i)) {\n int component_size = 0;\n dfs(i, i, visited, A, component_size);\n max_component = std::max(max_component, component_size);\n }\n });\n\n return max_component;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::View componentSize(\"componentSize\", N);\n Kokkos::View componentId(\"componentId\", N);\n Kokkos::View queue(\"queue\", N);\n\n int maxComponentSize = 0;\n int componentCount = 0;\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (visited(i) == 0) {\n int start = 0;\n int end = 0;\n queue(end++) = i;\n visited(i) = 1;\n componentId(i) = componentCount;\n while (start != end) {\n int current = queue(start++);\n componentSize(componentId(current))++;\n for (int j = 0; j < N; j++) {\n if (A(current, j) == 1 && visited(j) == 0) {\n queue(end++) = j;\n visited(j) = 1;\n componentId(j) = componentCount;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize(componentCount));\n componentCount++;\n }\n });\n\n return maxComponentSize;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(false);\n\n int max_component_size = 0;\n\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(Kokkos::View &A, size_t i, Kokkos::View &visited, int &component_size) {\n visited[i] = true;\n component_size++;\n\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(i, j) && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n\n // If the vertex has not been visited, perform a depth-first search to find the size of its component\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit in the stack\n while (!stack.empty()) {\n // Pop the next vertex from the stack\n size_t vertex = stack.top();\n stack.pop();\n\n // If the vertex has not been visited, mark it as visited and add it to the component\n if (!visited[vertex]) {\n visited[vertex] = true;\n componentSize++;\n\n // Push all of the vertex's neighbors onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the largest component size if the current component is larger\n if (componentSize > largestComponentSize) {\n largestComponentSize = componentSize;\n }\n\n // Update the component size array with the size of the current component\n componentSize[i] = componentSize;\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n int max_component_size = 0;\n int component_size = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n #pragma omp parallel for reduction(max:max_component_size)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n delete[] visited;\n return max_component_size;\n}\n", + "\n\n // Create a vector to store the visited nodes.\n std::vector visited(N, false);\n\n // Create a vector to store the component sizes.\n std::vector componentSizes(N, 0);\n\n // Create a parallel region.\n #pragma omp parallel\n {\n // Loop over the nodes.\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, perform a depth-first search to find all the nodes in its component.\n if (! visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n while (! stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (! 
visited[node]) {\n componentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[node * N + j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n componentSizes[i] = componentSize;\n }\n }\n }\n\n // Find the largest component size.\n int largestComponentSize = *std::max_element(componentSizes.begin(), componentSizes.end());\n\n // Return the largest component size.\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, 0);\n int largest_component_size = 0;\n #pragma omp parallel\n {\n int local_largest_component_size = 0;\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n ++component_size;\n visited[vertex] = 1;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n local_largest_component_size = std::max(local_largest_component_size, component_size);\n }\n }\n #pragma omp critical\n largest_component_size = std::max(largest_component_size, local_largest_component_size);\n }\n return largest_component_size;\n}\n", + "\n\n int* visited = new int[N];\n int* componentSize = new int[N];\n int maxComponentSize = 0;\n int componentCount = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = 0;\n dfs(A, i, visited, componentSize, size);\n componentCount++;\n maxComponentSize = std::max(maxComponentSize, size);\n }\n }\n delete[] visited;\n delete[] componentSize;\n return maxComponentSize;\n}\n", + "\n\n int max_size = 0;\n int max_component = -1;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = dfs(A, i, visited);\n if (size > max_size) {\n max_size = size;\n max_component = i;\n }\n }\n }\n delete[] visited;\n return max_size;\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n", + "\n\n int max_component = 0;\n int visited[N];\n memset(visited, 0, sizeof(visited));\n\n #pragma omp parallel for reduction(max:max_component)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n 
component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n\n return max_component;\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n", + "\n\n int N = A.size();\n std::vector component(N, -1);\n int maxComponent = 0;\n int currentComponent = 0;\n #pragma omp parallel for reduction(max:maxComponent) reduction(+:currentComponent)\n for (int i = 0; i < N; i++) {\n if (component[i] == -1) {\n currentComponent = 0;\n dfs(A, i, component, currentComponent);\n maxComponent = std::max(maxComponent, currentComponent);\n }\n }\n return maxComponent;\n}\n\n", + "\n\n int max_component_size = 0;\n int component_size = 0;\n int visited[N];\n memset(visited, 0, sizeof(visited));\n\n #pragma omp parallel for reduction(max: max_component_size)\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0) {\n component_size = 0;\n dfs(i, A, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(int i, std::vector const& A, int visited[], int& component_size) {\n visited[i] = 1;\n component_size++;\n for (int j = 0; j < A.size(); j++) {\n if (A[i][j] == 1 && visited[j] == 0) {\n dfs(j, A, visited, component_size);\n }\n }\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n int max_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n visited[i] = true;\n component_size++;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a 
depth-first search to find the size of the component containing the vertex\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // While the stack is not empty, pop a vertex from the stack and visit all its unvisited neighbors\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n componentSize++;\n\n // Iterate over all neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is unvisited and there is an edge between the vertex and the neighbor, push the neighbor onto the stack and mark it as visited\n if (!visited[j] && A[vertex][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Update the largest component size if the current component size is larger\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n // Create a vector to store the size of each component.\n std::vector componentSize(N, 0);\n\n // Create a vector to store the component ID for each vertex.\n std::vector componentID(N, -1);\n\n // Initialize the component ID for each vertex.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n componentID[i] = i;\n }\n\n // Iterate over the adjacency matrix.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n // If there is an edge between vertex i and vertex j, then they are in the same component.\n if (A[i][j] == 1) {\n // Find the component ID for vertex i.\n int iComponentID = componentID[i];\n\n // Find the component ID for vertex j.\n int jComponentID = componentID[j];\n\n // If the component IDs are different, then merge the two components.\n if (iComponentID != jComponentID) {\n // Find the smaller component ID.\n int smallerComponentID = std::min(iComponentID, jComponentID);\n\n // Find the larger component ID.\n int largerComponentID = std::max(iComponentID, jComponentID);\n\n // Update the component ID for each vertex in the smaller component.\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n if (componentID[k] == smallerComponentID) {\n componentID[k] = largerComponentID;\n }\n }\n\n // Update the size of the larger component.\n componentSize[largerComponentID] += componentSize[smallerComponentID];\n }\n }\n }\n }\n\n // Find the largest component.\n int largestComponentSize = 0;\n for (size_t i = 0; i < N; i++) {\n largestComponentSize = std::max(largestComponentSize, componentSize[i]);\n }\n\n // Return the size of the largest component.\n return largestComponentSize;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component size array to 0 for all vertices\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size to 0\n int largestComponentSize = 0;\n\n // Iterate over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a depth-first search to find the size of the component containing the vertex\n if (!visited[i]) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // While there are vertices in the stack, pop a vertex and visit all its unvisited neighbors\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n componentSize++;\n\n // Iterate over all neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is unvisited and 
there is an edge between the vertex and the neighbor, push the neighbor onto the stack and mark it as visited\n if (!visited[j] && A[vertex][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Update the largest component size if the current component size is larger\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n", + "\n\n int nComponents = 0;\n std::vector visited(N, 0);\n std::vector componentSize(N, 0);\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int size = dfs(A, i, visited, componentSize);\n nComponents++;\n if (size > componentSize[0]) {\n componentSize[0] = size;\n }\n }\n }\n return componentSize[0];\n}\n", + "\n\n int max_component = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n delete[] visited;\n return max_component;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "serial", + "prompt": "/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n std::vector parent(N);\n std::vector size(N);\n for (size_t i = 0; i < N; ++i) {\n parent[i] = i;\n size[i] = 1;\n }\n\n auto find = [&](int x) {\n if (parent[x] != x) {\n parent[x] = find(parent[x]);\n }\n return parent[x];\n };\n\n auto union_ = [&](int x, int y) {\n int rootX = find(x);\n int rootY = find(y);\n if (rootX != rootY) {\n if (size[rootX] < size[rootY]) {\n parent[rootX] = rootY;\n size[rootY] += size[rootX];\n } else {\n parent[rootY] = rootX;\n size[rootX] += size[rootY];\n }\n }\n };\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n if (A[i][j] == 1) {\n union_(i, j);\n }\n }\n }\n\n int maxSize = 0;\n for (size_t i = 0; i < N; ++i) {\n maxSize = std::max(maxSize, size[find(i)]);\n }\n\n return maxSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n return 
max_component;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector parent(N);\n std::vector size(N);\n for (size_t i = 0; i < N; i++) {\n parent[i] = i;\n size[i] = 1;\n }\n\n auto find = [&](int x) {\n if (parent[x] != x) {\n parent[x] = find(parent[x]);\n }\n return parent[x];\n };\n\n auto union_ = [&](int a, int b) {\n int rootA = find(a);\n int rootB = find(b);\n if (rootA != rootB) {\n if (size[rootA] > size[rootB]) {\n parent[rootB] = rootA;\n size[rootA] += size[rootB];\n } else {\n parent[rootA] = rootB;\n size[rootB] += size[rootA];\n }\n }\n };\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i][j] == 1) {\n union_(i, j);\n }\n }\n }\n\n int maxSize = 0;\n for (size_t i = 0; i < N; i++) {\n maxSize = std::max(maxSize, size[find(i)]);\n }\n\n return maxSize;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int max_component_size = 0;\n\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n\n visited[i] = true;\n component_size++;\n\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; i++) {\n int size = 0;\n std::queue q;\n if (visited[i] == 0) {\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 0) {\n visited[v] = 1;\n size++;\n for (size_t j = 0; j < N; j++) {\n if (A[v][j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": 
false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int max_component_size = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component containing it\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n return max_component_size;\n}\n\n// Perform a DFS to find the size of the component containing vertex i\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n // Mark the vertex as visited\n visited[i] = true;\n component_size++;\n\n // Iterate over all adjacent vertices\n for (size_t j = 0; j < A.size(); j++) {\n // If there is an edge between vertex i and vertex j and vertex j has not been visited, perform a DFS on vertex j\n if (A[i][j] == 1 && !visited[j]) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = dfs(i, A, visited);\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n\nint dfs(size_t i, std::vector const& A, std::vector& visited) {\n if (visited[i] != 0) {\n return 0;\n }\n visited[i] = 1;\n int component_size = 1;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n component_size += dfs(j, A, visited);\n }\n }\n return component_size;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n int largest = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int size = dfs(A, i, visited);\n largest = std::max(largest, size);\n }\n }\n\n return largest;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a vector to store the size of each component.\n std::vector componentSize(N, 0);\n\n // Create a vector to store the component ID for each vertex.\n std::vector componentID(N, -1);\n\n // Initialize the component ID for each vertex.\n for (size_t i = 0; i < N; ++i) {\n componentID[i] = i;\n }\n\n // Iterate over each vertex.\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, perform a DFS to find all vertices in the same component.\n if (componentID[i] == i) {\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit in the stack, perform a DFS.\n while 
(!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n\n // If the vertex has not been visited, mark it as visited and add it to the stack.\n if (componentID[vertex] == i) {\n componentSize++;\n componentID[vertex] = i;\n\n // Iterate over the neighbors of the vertex.\n for (size_t j = 0; j < N; ++j) {\n // If the neighbor is connected to the vertex and has not been visited, add it to the stack.\n if (A[i][j] == 1 && componentID[j] == i) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the size of the component.\n componentSize[i] = componentSize;\n }\n }\n\n // Return the size of the largest component.\n return *std::max_element(componentSize.begin(), componentSize.end());\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, 0);\n int largest = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int size = 0;\n dfs(A, visited, i, size);\n largest = std::max(largest, size);\n }\n }\n return largest;\n}\n\nvoid dfs(std::vector const& A, std::vector& visited, size_t i, int& size) {\n if (visited[i] == 1) {\n return;\n }\n visited[i] = 1;\n size++;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] == 1) {\n dfs(A, visited, j, size);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n component_size++;\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n component_size++;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n dfs(A, i, visited, size);\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited,\n int& size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i * A.size() + j] == 1) {\n dfs(A, j, visited, size);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Perform DFS on each vertex to find all connected vertices\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, 
component_size);\n }\n }\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++component_size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) {\n continue;\n }\n visited[u] = 1;\n size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (int i = 0; i < N; i++) {\n int size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 1) continue;\n visited[v] = 1;\n size++;\n for (int j = 0; j < N; j++) {\n if (A[v][j] == 1 && visited[j] == 0) q.push(j);\n }\n }\n max_size = std::max(max_size, size);\n }\n return max_size;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, 0);\n int max_component = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n if (visited[current] == 0) {\n visited[current] = 1;\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[current][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n max_component = std::max(max_component, component_size);\n }\n }\n return max_component;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n ++size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, 0);\n int max_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue 
q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_size = std::max(max_size, size);\n }\n }\n return max_size;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n dfs(A, i, visited, component_size);\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n return max_component_size;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited, int& component_size) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n ++component_size;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, j, visited, component_size);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "hip", + "prompt": "/* Compute the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n Store the result in largestComponentSize.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void largestComponent(const int *A, size_t N, int *largestComponentSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n int count = 0;\n if (tid == 0) {\n count = 1;\n for (int i = 1; i < N; i++) {\n if (A[bid * N + i] == 1) {\n count++;\n }\n }\n }\n __syncthreads();\n if (tid == 0) {\n *largestComponentSize = count;\n }\n}\n", + "\n\n\tint tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint local_max = 0;\n\tint global_max = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[tid_y] == 0) {\n\t\tint count = 0;\n\t\tvisited[tid_y] = 1;\n\t\tcount++;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] != 0 && visited[i] == 0) {\n\t\t\t\tcount += dfs(A, N, i, visited);\n\t\t\t}\n\t\t}\n\t\tlocal_max = count;\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (local_max > global_max) {\n\t\t\t\tglobal_max = local_max;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = global_max;\n\t}\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tidy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tif (tidx >= N || tidy >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[tidx] == 0) {\n\t\tcount++;\n\t\tvisited[tidx] = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tidx * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcount += bfs(A, N, i, visited);\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tidx == 0 && tidy == 0) {\n\t\t*largestComponentSize = count;\n\t}\n}\n", + "\n\n\tint tidx = blockIdx.x + blockIdx.y * blockDim.x;\n\tint tidx_x = tidx % N;\n\tint tidx_y = tidx / N;\n\tint componentSize = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tidx * N + i] == 1) {\n\t\t\tcomponentSize++;\n\t\t}\n\t}\n\tatomicAdd(largestComponentSize, componentSize);\n}\n", + "\n\n int localComponentSize = 0;\n int globalComponentSize = 0;\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int start = tid;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (int i = start; i < N; i += stride) {\n if (A[i * N + i] == 0) {\n continue;\n }\n localComponentSize++;\n for (int j = i + 1; j < N; j++) {\n if (A[i * N + j] == 1) {\n localComponentSize++;\n }\n }\n }\n hipAtomicMax(&globalComponentSize, &localComponentSize);\n *largestComponentSize = globalComponentSize;\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tidy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tint size = 0;\n\tif (tidx < N && tidy < N) {\n\t\tif (A[tidx * N + tidy] == 1) {\n\t\t\tsize = 1;\n\t\t\tint stack[N];\n\t\t\tint top = 0;\n\t\t\tstack[top++] = tidx;\n\t\t\twhile (top > 0) {\n\t\t\t\tint v = stack[--top];\n\t\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\t\tif (A[v * N + i] == 1 && i != v) {\n\t\t\t\t\t\tif (i == tidy) {\n\t\t\t\t\t\t\tsize++;\n\t\t\t\t\t\t} else {\n\t\t\t\t\t\t\tstack[top++] = i;\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tint max = 0;\n\thipReduceBlockMax(&max, &size, 1);\n\tif (tidx == 0 && tidy == 0) 
{\n\t\t*largestComponentSize = max;\n\t}\n}\n", +    "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint blockId = hipBlockIdx_x;\n\tint numBlocks = hipGridDim_x;\n\tint numThreads = hipBlockDim_x * hipGridDim_x;\n\tint visited[N];\n\tint componentSize[N];\n\tint maxComponentSize = 0;\n\tint maxComponentId = -1;\n\tint componentId = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t\tcomponentSize[i] = 0;\n\t}\n\twhile (componentId < N) {\n\t\tint startVertex = -1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (!visited[i]) {\n\t\t\t\tstartVertex = i;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tif (startVertex == -1) {\n\t\t\tbreak;\n\t\t}\n\t\tint queue[N];\n\t\tint head = 0;\n\t\tint tail = 0;\n\t\tqueue[tail++] = startVertex;\n\t\tvisited[startVertex] = 1;\n\t\tcomponentSize[componentId] = 1;\n\t\twhile (head != tail) {\n\t\t\tint vertex = queue[head++];\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tif (A[vertex * N + i] && !visited[i]) {\n\t\t\t\t\tqueue[tail++] = i;\n\t\t\t\t\tvisited[i] = 1;\n\t\t\t\t\tcomponentSize[componentId]++;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tif (componentSize[componentId] > maxComponentSize) {\n\t\t\tmaxComponentSize = componentSize[componentId];\n\t\t\tmaxComponentId = componentId;\n\t\t}\n\t\tcomponentId++;\n\t}\n\t*largestComponentSize = maxComponentSize;\n}\n", +    "\n\n int local_max = 0;\n int global_max = 0;\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (A[i * N + j] == 1) {\n local_max = 1;\n for (int k = 0; k < N; k++) {\n if (A[i * N + k] == 1) {\n local_max++;\n }\n if (A[k * N + j] == 1) {\n local_max++;\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx.x == 0 && threadIdx.y == 0) {\n for (int k = 0; k < N * N; k++) {\n if (local_max > global_max) {\n global_max = local_max;\n }\n local_max = 0;\n }\n *largestComponentSize = global_max;\n }\n}\n", +    "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint numThreads = hipGridDim_x * hipBlockDim_x;\n\tint visited[N];\n\tint componentSize = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[tid] == 0) {\n\t\tvisited[tid] = 1;\n\t\tcomponentSize = 1;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tif (A[tid * N + j] == 1 && visited[j] == 0) {\n\t\t\t\tcomponentSize += largestComponent(A, N, &visited[j]);\n\t\t\t}\n\t\t}\n\t}\n\tint maxComponentSize = 0;\n\thipMemcpy(&maxComponentSize, largestComponentSize, sizeof(int), hipMemcpyDeviceToHost);\n\tif (componentSize > maxComponentSize) {\n\t\t*largestComponentSize = componentSize;\n\t}\n\treturn;\n}\n", +    "\n\n int *visited = (int *)malloc(N * N * sizeof(int));\n int *componentSize = (int *)malloc(N * N * sizeof(int));\n int maxComponentSize = 0;\n int componentId = 0;\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n visited[i * N + j] = 0;\n componentSize[i * N + j] = 0;\n }\n }\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (visited[i * N + j] == 0 && A[i * N + j] == 1) {\n componentId++;\n int currentComponentSize = 0;\n __syncthreads();\n dfs(A, i, j, N, visited, componentSize, &currentComponentSize, componentId);\n __syncthreads();\n maxComponentSize = max(maxComponentSize, currentComponentSize);\n }\n }\n }\n\n *largestComponentSize = maxComponentSize;\n free(visited);\n free(componentSize);\n}\n", +    "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) 
{\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = max(*largestComponentSize, count);\n\t}\n}\n", + "\n\n int myRow = blockIdx.x * blockDim.x + threadIdx.x;\n int myCol = blockIdx.y * blockDim.y + threadIdx.y;\n int myComponentSize = 0;\n if (myRow < N && myCol < N) {\n if (A[myRow * N + myCol] == 1) {\n myComponentSize = 1;\n for (int i = 0; i < N; i++) {\n if (A[myRow * N + i] == 1) {\n myComponentSize++;\n }\n if (A[i * N + myCol] == 1) {\n myComponentSize++;\n }\n }\n }\n }\n atomicMax(largestComponentSize, myComponentSize);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == 0) {\n *largestComponentSize = 0;\n }\n __shared__ int shared_component_size;\n shared_component_size = 0;\n bool visited = false;\n int component_size = 0;\n while (!visited) {\n visited = true;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (A[bid * N + i] == 1) {\n visited = false;\n component_size++;\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n A[i * N + j] = 0;\n A[j * N + i] = 0;\n }\n }\n }\n }\n __syncthreads();\n }\n atomicAdd(largestComponentSize, component_size);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i, j, k, count = 0;\n\tif (tid < N) {\n\t\tbool visited[N] = {false};\n\t\tvisited[tid] = true;\n\t\tfor (i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1 && !visited[i]) {\n\t\t\t\tcount++;\n\t\t\t\tfor (j = 0; j < N; j++) {\n\t\t\t\t\tif (A[i * N + j] == 1) {\n\t\t\t\t\t\tfor (k = 0; k < N; k++) {\n\t\t\t\t\t\t\tif (A[j * N + k] == 1 && !visited[k]) {\n\t\t\t\t\t\t\t\tcount++;\n\t\t\t\t\t\t\t\tvisited[k] = true;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == 0) {\n *largestComponentSize = 0;\n }\n __syncthreads();\n int localComponentSize = 0;\n for (int i = 0; i < N; i++) {\n if (A[tid + i * N] == 1) {\n localComponentSize++;\n }\n }\n __syncthreads();\n if (tid == 0) {\n if (localComponentSize > *largestComponentSize) {\n *largestComponentSize = localComponentSize;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int size = 0;\n if (tid < N && tid_y < N) {\n if (A[tid * N + tid_y] == 1) {\n size = 1;\n int *visited = new int[N];\n for (int i = 0; i < N; i++) {\n visited[i] = 0;\n }\n visited[tid] = 1;\n int *stack = new int[N];\n stack[0] = tid;\n while (stack[0] != -1) {\n int v = stack[0];\n stack[0] = -1;\n for (int i = 0; i < N; i++) {\n if (A[v * N + i] == 1 && visited[i] == 0) {\n visited[i] = 1;\n size++;\n stack[0] = i;\n }\n }\n }\n delete[] visited;\n delete[] stack;\n }\n }\n atomicAdd(largestComponentSize, size);\n}\n", + "\n\n\tint myComponentSize = 0;\n\tint myId = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (A[N * myId + myId] == 1) {\n\t\tmyComponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) 
{\n\t\t\tif (A[N * myId + i] == 1 && i != myId) {\n\t\t\t\tmyComponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(&*largestComponentSize, myComponentSize);\n}\n", + "(", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_x;\n\tint componentSize = 0;\n\tif (tid == 0) {\n\t\tcomponentSize = 1;\n\t\tlargestComponentSize[bid] = 0;\n\t}\n\t__syncthreads();\n\tfor (int i = tid + 1; i < N; i++) {\n\t\tif (A[bid * N + i] == 1) {\n\t\t\tatomicAdd(&componentSize, 1);\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tif (componentSize > largestComponentSize[bid]) {\n\t\t\tlargestComponentSize[bid] = componentSize;\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Perform a breadth-first search (BFS) from each vertex to find the connected components.\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the max_component_size across all ranks to find the global maximum.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "_t.", + "\n\n // Perform a breadth-first search from each vertex to find the size of each component.\n std::vector visited(N, false);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++componentSize;\n for (size_t j = 0; j < N; ++j) {\n if (A[u][j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Allreduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n std::queue q;\n int componentSize = 0;\n for (size_t i = 0; i < N; i++) {\n 
if (visited[i] == 0) {\n q.push(i);\n visited[i] = 1;\n componentSize = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize++;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest one\n int globalComponentSize;\n MPI_Reduce(&componentSize, &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, false);\n int max_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(my_rank);\n visited[my_rank] = true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n local_component_size++;\n for (size_t v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return global_component_size;\n }\n return -1;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector component_size(num_ranks, 0);\n\n // Perform a DFS from each rank\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int local_component_size = 0;\n std::stack s;\n s.push(i);\n visited[i] = true;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n local_component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n visited[j] = true;\n }\n }\n }\n component_size[my_rank] = std::max(component_size[my_rank], local_component_size);\n }\n }\n\n // Reduce the component size array to find the largest component\n int global_component_size = 0;\n MPI_Reduce(&component_size[0], &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_max = 0;\n int global_max = 0;\n\n // Perform a DFS on the graph to find the 
size of the largest component\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n local_max = std::max(local_max, component_size);\n }\n }\n\n // Reduce the local maximum values to find the global maximum\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = true;\n my_component_size = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n ++my_component_size;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&my_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int max_component = 0;\n int my_component = 0;\n int* visited = new int[N];\n for (size_t i = 0; i < N; i++) {\n visited[i] = 0;\n }\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n my_component = 0;\n dfs(i, A, N, visited, my_component);\n max_component = std::max(max_component, my_component);\n }\n }\n delete[] visited;\n int global_max;\n MPI_Allreduce(&max_component, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n // Perform a breadth-first search from each vertex to find the connected components.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int v = q.front();\n q.pop();\n if (visited[v] == 0) {\n visited[v] = 1;\n componentSize++;\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Allreduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the global maximum component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n std::queue q;\n q.push(rank);\n visited[rank] = true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n local_component_size++;\n for (int v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n visited[v] = true;\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 
MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of vertices in its component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n local_component_size = std::max(local_component_size, dfs(A, i, visited));\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n\n// Depth-first search to compute the number of vertices in a component\nint dfs(std::vector const& A, size_t i, std::vector& visited) {\n if (visited[i]) {\n return 0;\n }\n\n visited[i] = true;\n int component_size = 1;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n component_size += dfs(A, j, visited);\n }\n }\n\n return component_size;\n}\n", + "_t.", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n int local_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_component_size = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n ++local_component_size;\n }\n }\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n std::vector visited(N, 0);\n int local_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_component_size = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n local_component_size++;\n }\n }\n }\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its component\n std::vector visited(N, 0);\n std::queue q;\n int componentSize = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n q.push(i);\n visited[i] = 1;\n componentSize = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize++;\n }\n }\n }\n }\n }\n\n // Reduce the component sizes to find the largest one\n int globalComponentSize;\n MPI_Reduce(&componentSize, 
&globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n // Perform a depth-first search to find all the vertices in the largest component.\n std::vector visited(N, false);\n int largest_component_size = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t v = stack.top();\n stack.pop();\n if (!visited[v]) {\n ++component_size;\n visited[v] = true;\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n largest_component_size = std::max(largest_component_size, component_size);\n }\n }\n\n // Reduce the largest component size across all ranks.\n int global_largest_component_size;\n MPI_Allreduce(&largest_component_size, &global_largest_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the largest component size\n int largestComponentSize = 0;\n\n // Loop over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Return the largest component size\n return largestComponentSize;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "cuda", + "prompt": "/* Compute the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n Store the result in largestComponentSize.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void largestComponent(const int *A, size_t N, int *largestComponentSize) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myComponentSize = 0;\n int myId = blockIdx.x * blockDim.x + threadIdx.x;\n if (A[myId * N + myId] == 0) {\n *largestComponentSize = 0;\n return;\n }\n int *visited = new int[N];\n for (int i = 0; i < N; i++) { visited[i] = 0; }\n visited[myId] = 1;\n myComponentSize++;\n for (int i = 0; i < N; i++) {\n if (A[myId * N + i] == 1 && visited[i] == 0) {\n myComponentSize += dfs(A, N, visited, i);\n }\n }\n *largestComponentSize = max(*largestComponentSize, myComponentSize);\n}\n", + "\n\n int localComponentSize = 0;\n int globalComponentSize = 0;\n int threadIdx = threadIdx.x + blockIdx.x * blockDim.x;\n int threadIdy = threadIdx.y + blockIdx.y * blockDim.y;\n if (threadIdx < N && threadIdy < N) {\n if (A[threadIdx * N + threadIdy] == 1) {\n localComponentSize++;\n for (int i = 0; i < N; i++) {\n if (A[threadIdx * N + i] == 1) {\n localComponentSize++;\n }\n if (A[i * N + threadIdy] == 1) {\n localComponentSize++;\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx == 0 && threadIdy == 0) {\n globalComponentSize = localComponentSize;\n }\n for (int i = 1; i < N; i++) {\n if (localComponentSize > globalComponentSize) {\n globalComponentSize = localComponentSize;\n }\n localComponentSize = 0;\n if (threadIdx < N && threadIdy + i < N) {\n if (A[threadIdx * N + threadIdy + i] == 1) {\n localComponentSize++;\n for (int j = 0; j < N; j++) {\n if (A[threadIdx * N + j] == 1) {\n localComponentSize++;\n }\n if (A[(threadIdy + i) * N + j] == 1) {\n localComponentSize++;\n }\n }\n }\n }\n }\n __syncthreads();\n if (threadIdx == 0 && threadIdy == 0) {\n if (localComponentSize > globalComponentSize) {\n globalComponentSize = localComponentSize;\n }\n *largestComponentSize = globalComponentSize;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int tid_row = tid / N;\n int tid_col = tid % N;\n int local_max = 0;\n int global_max = 0;\n\n // Count the number of adjacent vertices\n for (int i = 0; i < N; i++) {\n if (A[tid_row * N + i] == 1) {\n local_max++;\n }\n }\n\n // Find the maximum number of adjacent vertices across all threads\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (local_max > global_max) {\n global_max = local_max;\n }\n }\n\n *largestComponentSize = global_max;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tint count = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tif (count > *largestComponentSize) {\n\t\t\t*largestComponentSize = count;\n\t\t}\n\t}\n}\n", + "\n\n // Each thread computes the size of its component\n int myComponentSize = 0;\n for (int i = 0; i < N; i++) {\n if (A[i * N + blockIdx.x] == 1) {\n myComponentSize++;\n }\n }\n\n // Reduce the component sizes across all threads\n __shared__ int sharedComponentSize[32];\n sharedComponentSize[threadIdx.x] = myComponentSize;\n __syncthreads();\n for (int i = 1; i < 32; i *= 2) {\n if (threadIdx.x % (2 * i) == 0) {\n sharedComponentSize[threadIdx.x] += sharedComponentSize[threadIdx.x + i];\n }\n __syncthreads();\n }\n\n // Store the largest component size in the output variable\n if (threadIdx.x == 0) {\n *largestComponentSize = sharedComponentSize[0];\n 
}\n}\n", + "\n\n\t// Initialize the largest component size to 0\n\t*largestComponentSize = 0;\n\n\t// Iterate over each vertex\n\tfor (int i = 0; i < N; i++) {\n\t\t// If the vertex is not visited, perform a depth-first search to find the size of the component\n\t\tif (A[i * N + i] == 0) {\n\t\t\tint componentSize = 0;\n\t\t\tdepthFirstSearch(A, N, i, &componentSize);\n\n\t\t\t// Update the largest component size if necessary\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\n\tint count = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcount = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (tid == 0) {\n\t\t*largestComponentSize = max(*largestComponentSize, count);\n\t}\n}\n", + "\n\n // Each thread checks if it is in the largest component.\n int myComponentSize = 0;\n for (int i = blockIdx.x; i < N; i += blockDim.x) {\n for (int j = blockIdx.y; j < N; j += blockDim.y) {\n if (A[i * N + j] == 1) {\n myComponentSize++;\n }\n }\n }\n\n // Each thread writes its component size to shared memory.\n __shared__ int sharedComponentSize[blockDim.x * blockDim.y];\n sharedComponentSize[threadIdx.x * blockDim.y + threadIdx.y] = myComponentSize;\n\n // Each thread block finds the maximum component size in shared memory.\n __syncthreads();\n int maxComponentSize = 0;\n for (int i = 0; i < blockDim.x * blockDim.y; i++) {\n maxComponentSize = max(maxComponentSize, sharedComponentSize[i]);\n }\n\n // The thread with the maximum component size writes it to global memory.\n if (threadIdx.x == 0 && threadIdx.y == 0) {\n *largestComponentSize = maxComponentSize;\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 0) {\n\t\treturn;\n\t}\n\tint componentSize = 0;\n\tint visited[N];\n\tfor (int k = 0; k < N; k++) {\n\t\tvisited[k] = 0;\n\t}\n\t__syncthreads();\n\tif (visited[i] == 0) {\n\t\tcomponentSize = dfs(A, N, i, visited);\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tif (componentSize > *largestComponentSize) {\n\t\t\t*largestComponentSize = componentSize;\n\t\t}\n\t}\n}\n", + "\n\n int myComponentSize = 0;\n int myComponentId = threadIdx.x + blockIdx.x * blockDim.x;\n if (myComponentId < N) {\n bool visited[N];\n for (int i = 0; i < N; i++) { visited[i] = false; }\n visited[myComponentId] = true;\n for (int i = 0; i < N; i++) {\n if (A[myComponentId * N + i] && !visited[i]) {\n myComponentSize += 1;\n dfs(A, N, visited, i);\n }\n }\n }\n __syncthreads();\n int maxComponentSize = 0;\n for (int i = 0; i < N; i++) {\n maxComponentSize = max(maxComponentSize, myComponentSize);\n }\n *largestComponentSize = maxComponentSize;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\n\tint row = tid / N;\n\tint col = tid % N;\n\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\n\tint componentSize = 1;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\n\tvisited[row] = 1;\n\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] == 1 && visited[i] == 0) {\n\t\t\tcomponentSize += dfs(A, N, i, visited);\n\t\t}\n\t}\n\n\tif (componentSize > *largestComponentSize) {\n\t\t*largestComponentSize 
= componentSize;\n\t}\n}\n", + "\n\n\tint myRow = blockIdx.x * blockDim.x + threadIdx.x;\n\tint myCol = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (myRow < N && myCol < N) {\n\t\tif (A[myRow * N + myCol] == 1) {\n\t\t\tint componentSize = 0;\n\t\t\tint visited[N];\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tvisited[i] = 0;\n\t\t\t}\n\t\t\tdfs(myRow, myCol, A, N, visited, &componentSize);\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint componentSize = 0;\n\tint maxComponentSize = 0;\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tfor (int i = 0; i < N; i++) {\n\t\tif (visited[i] == 0) {\n\t\t\tcomponentSize = 0;\n\t\t\tdfs(A, i, visited, &componentSize);\n\t\t\tmaxComponentSize = max(maxComponentSize, componentSize);\n\t\t}\n\t}\n\t*largestComponentSize = maxComponentSize;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid_row == tid_col) {\n\t\tint componentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_row * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = max(*largestComponentSize, componentSize);\n\t}\n}\n", + "\n\n\tint myComponentSize = 0;\n\tint myId = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (A[N * myId + myId] == 1) {\n\t\tmyComponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[N * myId + i] == 1 && i != myId) {\n\t\t\t\tmyComponentSize += 1;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, myComponentSize);\n}\n", + "\n\n\tint threadIdx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint threadIdy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (threadIdx < N && threadIdy < N) {\n\t\tif (A[threadIdx * N + threadIdy] == 1) {\n\t\t\t// Found a 1 in the adjacency matrix.\n\t\t\t// Perform a breadth-first search to find the size of the component.\n\t\t\tint componentSize = 0;\n\t\t\tstd::queue q;\n\t\t\tq.push(threadIdx);\n\t\t\twhile (!q.empty()) {\n\t\t\t\tint v = q.front();\n\t\t\t\tq.pop();\n\t\t\t\tcomponentSize++;\n\t\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\t\tif (A[v * N + i] == 1 && i != v) {\n\t\t\t\t\t\tq.push(i);\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t\t// Update the largest component size if necessary.\n\t\t\tif (componentSize > *largestComponentSize) {\n\t\t\t\t*largestComponentSize = componentSize;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, componentSize);\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\tatomicMax(largestComponentSize, componentSize);\n}\n", + "\n\n\tint localLargestComponentSize = 0;\n\tint globalLargestComponentSize = 0;\n\tint myRow = blockIdx.y * blockDim.y + threadIdx.y;\n\tint myCol = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (myRow < N && 
myCol < N) {\n\t\tif (A[myRow * N + myCol] == 1) {\n\t\t\tlocalLargestComponentSize++;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (threadIdx.x == 0 && threadIdx.y == 0) {\n\t\tfor (int i = 1; i < gridDim.x * gridDim.y; i++) {\n\t\t\tif (localLargestComponentSize < globalLargestComponentSize) {\n\t\t\t\tlocalLargestComponentSize = globalLargestComponentSize;\n\t\t\t}\n\t\t}\n\t\t*largestComponentSize = localLargestComponentSize;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint componentSize = 0;\n\tif (A[tid_y * N + tid_x] == 1) {\n\t\tcomponentSize = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid_y * N + i] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t\tif (A[i * N + tid_x] == 1) {\n\t\t\t\tcomponentSize++;\n\t\t\t}\n\t\t}\n\t}\n\t*largestComponentSize = max(*largestComponentSize, componentSize);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce to find the largest component size\n int global_component_size;\n MPI_Allreduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform DFS on each rank\n std::vector visited(N, false);\n int max_component_size = 0;\n #pragma omp parallel\n {\n int local_max_component_size = 0;\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n std::stack s;\n s.push(i);\n int component_size = 0;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n if (!visited[u]) {\n visited[u] = true;\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n }\n }\n }\n }\n local_max_component_size = std::max(local_max_component_size, component_size);\n }\n }\n #pragma omp critical\n {\n max_component_size = std::max(max_component_size, local_max_component_size);\n }\n }\n\n // Gather the maximum component size from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int recv_max_component_size;\n 
MPI_Recv(&recv_max_component_size, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n max_component_size = std::max(max_component_size, recv_max_component_size);\n }\n } else {\n MPI_Send(&max_component_size, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n\n return max_component_size;\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int largest_component_size;\n MPI_Reduce(&my_component_size, &largest_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find the connected components.\n std::vector visited(N, 0);\n std::vector component_size(N, 0);\n int max_component_size = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n int component_size = 0;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n component_size[i] = component_size;\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n MPI_Allreduce(&max_component_size, &max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n 
std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int largest_component_size;\n MPI_Reduce(&my_component_size, &largest_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return largest_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the size of its own component\n int local_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_size = std::max(local_size, size);\n }\n }\n\n // Reduce the local sizes to find the global maximum\n int global_size;\n MPI_Reduce(&local_size, &global_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n // Perform a breadth-first search from each vertex to find all the vertices in the same component.\n int max_component_size = 0;\n #pragma omp parallel for reduction(max : max_component_size)\n for (size_t i = 0; i < N; ++i) {\n if (A[i][i] == 0) {\n continue;\n }\n\n // Initialize the queue with the current vertex.\n std::queue queue;\n queue.push(i);\n\n // Mark the current vertex as visited.\n A[i][i] = 0;\n\n // Perform a breadth-first search to find all the vertices in the same component.\n int component_size = 0;\n while (!queue.empty()) {\n size_t current_vertex = queue.front();\n queue.pop();\n component_size++;\n\n // Add all the unvisited neighbors of the current vertex to the queue.\n for (size_t j = 0; j < N; ++j) {\n if (A[current_vertex][j] == 1 && A[j][j] != 0) {\n queue.push(j);\n A[j][j] = 0;\n }\n }\n }\n\n max_component_size = std::max(max_component_size, component_size);\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n componentSize++;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across 
all ranks.\n int globalMaxComponentSize;\n MPI_Reduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (rank == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(size, 0);\n\n // Perform a DFS on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int localComponentSize = 0;\n std::stack s;\n s.push(i);\n visited[i] = true;\n while (!s.empty()) {\n size_t u = s.top();\n s.pop();\n localComponentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n s.push(j);\n visited[j] = true;\n }\n }\n }\n componentSize[rank] = std::max(componentSize[rank], localComponentSize);\n }\n }\n\n // Reduce the component size array to find the largest component\n int globalComponentSize;\n MPI_Reduce(&componentSize[0], &globalComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (rank == 0) {\n return globalComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform a breadth-first search (BFS) from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int maxComponentSize = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int componentSize = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n componentSize++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int globalMaxComponentSize;\n MPI_Reduce(&maxComponentSize, &globalMaxComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (rank == 0) {\n return globalMaxComponentSize;\n } else {\n return -1;\n }\n}\n", + "\n\n // Perform a breadth-first search to find the largest component.\n std::vector visited(N, 0);\n int max_component_size = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the max_component_size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the result on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the number of vertices in its largest component\n int local_max = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n int count = 
0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n count++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_max = std::max(local_max, count);\n }\n }\n\n // Reduce the local maxima to find the global maximum\n int global_max;\n MPI_Reduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the global maximum on rank 0\n if (my_rank == 0) {\n return global_max;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(N, 0);\n\n // Initialize the largest component size\n int largestComponentSize = 0;\n\n // Loop over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a DFS to find the size of the component\n if (!visited[i]) {\n int componentSize = 0;\n dfs(A, i, visited, componentSize);\n componentSize[i] = componentSize;\n\n // Update the largest component size\n largestComponentSize = std::max(largestComponentSize, componentSize);\n }\n }\n\n // Reduce the largest component size to rank 0\n int globalLargestComponentSize;\n MPI_Reduce(&largestComponentSize, &globalLargestComponentSize, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size\n return globalLargestComponentSize;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector componentSize(N, 0);\n\n // Initialize the maximum component size\n int maxComponentSize = 0;\n\n // Loop over all vertices\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited\n if (!visited[i]) {\n // Perform a depth-first search to find the size of the component containing vertex i\n int componentSize = 0;\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n componentSize++;\n // Push all adjacent vertices onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n\n // Update the maximum component size\n maxComponentSize = std::max(maxComponentSize, componentSize);\n }\n }\n\n // Return the maximum component size\n return maxComponentSize;\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Perform a breadth-first search from each vertex to find all vertices in the same component.\n std::vector visited(N, 0);\n int max_component_size = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n component_size++;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n max_component_size = std::max(max_component_size, component_size);\n }\n }\n\n // Reduce the maximum component size across all ranks.\n int global_max_component_size;\n MPI_Allreduce(&max_component_size, &global_max_component_size, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n // Return the maximum component size on rank 0.\n if (my_rank == 0) {\n return global_max_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int 
my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes the size of its own component\n int my_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n my_component_size = std::max(my_component_size, component_size);\n }\n }\n\n // Reduce the component sizes to find the largest one\n int global_component_size;\n MPI_Reduce(&my_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n std::vector componentSize(N, 0);\n int largest = 0;\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n componentSize[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n componentSize[i]++;\n }\n }\n }\n largest = std::max(largest, componentSize[i]);\n }\n }\n\n int globalLargest = 0;\n MPI_Reduce(&largest, &globalLargest, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalLargest;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the size of its own component\n int local_component_size = 0;\n std::vector visited(N, false);\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int component_size = 0;\n std::queue q;\n q.push(i);\n visited[i] = true;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n ++component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = true;\n }\n }\n }\n local_component_size = std::max(local_component_size, component_size);\n }\n }\n\n // Reduce the local component sizes to find the largest component\n int global_component_size;\n MPI_Reduce(&local_component_size, &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component size array\n std::vector component_size(num_ranks, 0);\n\n // Perform a parallel depth-first search to find the largest component\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n int local_component_size = 0;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n ++local_component_size;\n for (size_t j = 0; j < N; ++j) {\n if (A[vertex * N + j] && !visited[j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n component_size[my_rank] = std::max(component_size[my_rank], local_component_size);\n }\n }\n\n // Reduce the component size array to find the largest 
component\n int global_component_size = 0;\n MPI_Reduce(component_size.data(), &global_component_size, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n // Return the largest component size on rank 0\n if (my_rank == 0) {\n return global_component_size;\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n\t Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n if (visited[i] == 0) {\n visited[i] = 1;\n Kokkos::View stack(\"stack\", N);\n size_t stack_size = 0;\n stack[stack_size++] = i;\n while (stack_size > 0) {\n size_t j = stack[--stack_size];\n for (size_t k = 0; k < N; k++) {\n if (A(j, k) == 1 && visited[k] == 0) {\n visited[k] = 1;\n stack[stack_size++] = k;\n }\n }\n }\n }\n });\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n count += (visited[i] == 1);\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n componentCount(A, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n dfs(A, visited, j, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::atomic_fetch_add(&visited(j), 1);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t k) {\n if (A(j, k) == 1 && visited(k) == 0) {\n Kokkos::atomic_fetch_add(&visited(k), 1);\n }\n });\n }\n });\n }\n }\n return count;\n}\n", + "\n\n KokConfigService::getSingleton().setNumThreads(N);\n KokConfigService::getSingleton().setParallelMode(KokConfigService::ParallelMode::OMP);\n KokConfigService::getSingleton().setOmpNumThreads(N);\n\n Kok::View counts(N);\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) { counts[i] = i; });\n\n Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) {\n for (size_t j = 0; j < N; j++) {\n if (A(i, j) == 1 && i != j) { counts[j] = counts[i]; }\n }\n });\n\n Kok::View unique_counts(N);\n 
Kok::parallel_for(Kok::Range(0, N), KOK_LAMBDA(int i) { unique_counts[i] = counts[i]; });\n Kok::sort(unique_counts);\n int num_components = 0;\n for (size_t i = 0; i < N; i++) {\n if (i == 0 || unique_counts[i] != unique_counts[i - 1]) { num_components++; }\n }\n return num_components;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n count++;\n dfs(A, i, visited);\n }\n }\n\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n visit(A, i, visited);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited[k] == 0) { visited[k] = 1; }\n });\n visited[j] = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited(k) == 0) { visited(k) = 1; }\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n ++count;\n dfs(A, visited, i, count);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View labels(\"labels\", N);\n int num_components = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n if (labels(i) == 0) {\n int new_label = ++num_components;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &j) {\n if (A(i, j) == 1 && labels(j) == 0) {\n labels(j) = new_label;\n }\n });\n }\n });\n return num_components;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n dfs(A, visited, j, N);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n dfs(A, visited, i, N);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited[j] == 0) {\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1 && visited[k] == 0) { visited[k] = 1; }\n });\n visited[j] = 1;\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View 
visited(\"visited\", N);\n visited() = 0;\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited() = 0;\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::atomic_fetch_add(&visited(j), 1);\n componentCount(A, j);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n visited(j) = 1;\n componentCount(A, N, j);\n }\n });\n }\n }\n return count;\n}\n", + "\n\n Kokkos::View visited(\"visited\", N);\n visited.fill(0);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (visited(i) == 0) {\n count++;\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &j) {\n if (A(i, j) == 1 && visited(j) == 0) {\n Kokkos::parallel_for(\n N, KOKKOS_LAMBDA(const size_t &k) {\n if (A(j, k) == 1) visited(k) = 1;\n });\n visited(j) = 1;\n }\n });\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int nComponents = 0;\n std::vector visited(N, 0);\n\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n nComponents++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t curr = q.front();\n q.pop();\n if (visited[curr] == 0) {\n visited[curr] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[curr][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return nComponents;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) continue;\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n count++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n bool visited = false;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n if (!visited) {\n count++;\n visited = true;\n }\n break;\n }\n }\n }\n return 
count;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t curr = q.front();\n q.pop();\n if (visited[curr] == 0) {\n visited[curr] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[curr][j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false for all nodes\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all nodes\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all nodes in the component\n dfs(A, i, visited);\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int* visited = new int[N];\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n delete[] visited;\n return count;\n}\n", + "\n\n int nComponents = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n\n #pragma omp parallel for reduction(+:nComponents)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n nComponents++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n delete[] visited;\n return nComponents;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n count++;\n }\n }\n return count;\n}\n", + "\n\n int num_components = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * N);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t current = q.front();\n q.pop();\n visited[current] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[current * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n delete[] visited;\n return num_components;\n}\n", + "\n\n int num_components = 0;\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n bool is_new_component = true;\n for (size_t j = 0; j < i; j++) {\n if (A[i * N + j] == 1) {\n is_new_component = false;\n break;\n }\n }\n if (is_new_component) { num_components++; }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while 
(!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) { q.push(j); }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over the vertices\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the vertex has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t v = stack.top();\n stack.pop();\n if (!visited[v]) {\n visited[v] = true;\n // Push all the unvisited neighbors of v onto the stack\n for (size_t j = 0; j < N; ++j) {\n if (A[v][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n num_components++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n\n#pragma omp parallel for reduction(+ : num_components)\n for (size_t i = 0; i < N; ++i) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n num_components++;\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n for (size_t j = 0; j < N; ++j) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n // Initialize the visited array to false for all nodes\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all nodes\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over all neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the number of connected components\n return componentCount;\n}\n", + "_", + "\n\n int num_components = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u] == 0) {\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) {\n q.push(j);\n }\n }\n }\n }\n }\n }\n return num_components;\n}\n", + "\n\n int num_components = 0;\n int* visited = new int[N];\n memset(visited, 0, sizeof(int) * 
N);\n\n#pragma omp parallel for reduction(+:num_components)\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n num_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t cur = q.front();\n q.pop();\n if (visited[cur] == 0) {\n visited[cur] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[cur][j] == 1) { q.push(j); }\n }\n }\n }\n }\n }\n delete[] visited;\n return num_components;\n}\n", + "\n\n int count = 0;\n std::vector visited(N, false);\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n dfs(A, visited, i);\n count++;\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "serial", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(i, A, visited);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(size_t i, std::vector const& A, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(j, A, visited);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, visited, i, N);\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return 
count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, visited, i);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, std::vector& visited, size_t i) {\n if (visited[i]) {\n return;\n }\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] == 1) {\n dfs(A, visited, j);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": ":", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n return count;\n}\n\nvoid dfs(std::vector const& A, size_t i, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[i][j] && !visited[j]) {\n dfs(A, j, visited);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + 
"generated_output": "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(i, A, visited);\n ++count;\n }\n }\n return count;\n}\n\nvoid dfs(size_t i, std::vector const& A, std::vector& visited) {\n visited[i] = true;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[i][j] && !visited[j]) {\n dfs(j, A, visited);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, visited, i);\n ++count;\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n if (!visited[i]) {\n dfs(A, i, visited);\n ++count;\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n dfs(A, i, visited);\n count++;\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + 
"did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "hip", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void componentCount(const int *A, size_t N, int *numComponents) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tif (count == 0) {\n\t\tcount = 1;\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n\tint tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\tif (tid >= N || tid_y >= N) {\n\t\treturn;\n\t}\n\tint component_id = tid;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tcomponent_id = min(component_id, i);\n\t\t}\n\t}\n\tatomicAdd(&numComponents[component_id], 1);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tint root = tid;\n\twhile (A[root * N + bid] != root) {\n\t\troot = A[root * N + bid];\n\t}\n\tif (tid == 0) {\n\t\tA[bid * N + bid] = root;\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint numThreads = hipGridDim_x * hipBlockDim_x;\n\tint label = tid;\n\tint numLabels = numThreads;\n\tbool changed = true;\n\twhile (changed) {\n\t\tchanged = false;\n\t\tfor (int i = tid; i < N; i += numThreads) {\n\t\t\tfor (int j = 0; j < N; j++) {\n\t\t\t\tif (A[i * N + j] && label > j) {\n\t\t\t\t\tlabel = j;\n\t\t\t\t\tchanged = true;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t\tif (tid == 0) {\n\t\t\tif (label < numLabels) {\n\t\t\t\tnumLabels = label;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n\t*numComponents = numLabels;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] != 0) {\n count++;\n for (int j = i + 1; j < N; j++) {\n if (A[tid * N + j] != 0) {\n count++;\n }\n }\n break;\n }\n }\n atomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint component = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcomponent = 1;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tatomicAdd(numComponents, component);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * 
hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\tif (A[i * N + j] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N || tid_y >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tbool visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = false;\n\t}\n\tvisited[tid] = true;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1 && !visited[i]) {\n\t\t\tcount++;\n\t\t\t__syncblocks();\n\t\t\tDFS(A, i, visited, N);\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = tid / N;\n\tint tid_x = tid % N;\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx.x + hipBlockIdx.x * hipBlockDim.x;\n\tint bid = hipBlockIdx.y + hipBlockIdx.z * hipGridDim.z;\n\tint i = tid + bid * hipBlockDim.x * hipGridDim.x;\n\tint count = 0;\n\twhile (i < N * N) {\n\t\tif (A[i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t\ti += N * N;\n\t}\n\tatomicAdd(numComponents, count);\n}\n", + "\n\n\tint id = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (id >= N) return;\n\tint count = 0;\n\tbool visited[N];\n\tmemset(visited, 0, sizeof(visited));\n\tfor (int i = 0; i < N; i++) {\n\t if (visited[i]) continue;\n\t visited[i] = true;\n\t count++;\n\t for (int j = 0; j < N; j++) {\n\t if (A[i * N + j] == 0) continue;\n\t visited[j] = true;\n\t }\n\t}\n\tnumComponents[id] = count;\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tint count = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tint n = hipBlockDim_x * hipGridDim_x;\n\tint count = 0;\n\tif (tid == 0) {\n\t\tcount = 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == 0) {\n\t\t*numComponents = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of connected components in the undirected graph defined by the 
adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The component count is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n int global_count = 0;\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n local_count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM);\n if (rank == 0) {\n return global_count;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its own subset of rows.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N];\n std::fill(visited, visited + N, false);\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && !visited[j]) {\n ++localCount;\n std::queue q;\n q.push(j);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n for (size_t v = 0; v < N; ++v) {\n if (A[u * N + v] && !visited[v]) {\n q.push(v);\n }\n }\n }\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int component_count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n component_count++;\n }\n }\n\n int global_component_count;\n MPI_Reduce(&component_count, &global_component_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_component_count;\n }\n return 0;\n}\n", + "\n\n // Initialize the visited array to false for all vertices\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over all vertices\n for (size_t i = 0; i < N; i++) {\n // If the vertex has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n // While there are still vertices to visit\n while (!stack.empty()) {\n // Pop the top vertex from the stack\n size_t vertex = stack.top();\n stack.pop();\n\n // Mark the vertex as visited\n visited[vertex] = true;\n\n // Iterate over all adjacent vertices\n for (size_t j = 0; j < N; j++) {\n // If the adjacent vertex is connected and has not been visited, push it onto the stack\n if (A[vertex][j] == 1 && !visited[j]) {\n 
stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n count++;\n std::queue q;\n q.push(i);\n\n // Perform a breadth-first search to visit all nodes in the component\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n\n // Iterate over the neighbors of u\n for (size_t j = 0; j < N; j++) {\n if (A[u][j] == 1 && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all the unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of connected components in its portion of the graph.\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n localCount++;\n break;\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all unvisited neighbors 
of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n count++;\n while (!q.empty()) {\n int cur = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[cur * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) return global_count;\n return -1;\n}\n", + "\n\n int n_components = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n n_components++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n }\n }\n }\n }\n }\n\n int global_n_components;\n MPI_Allreduce(&n_components, &global_n_components, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_n_components;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector visited(N, 0);\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n count++;\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n\n int globalCount;\n MPI_Reduce(&count, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then push the neighbor onto the stack\n if (!visited[j] && A[i][j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n 
int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Count the number of connected components\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n if (!visited[i]) {\n count++;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n if (visited[u]) continue;\n visited[u] = true;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] && !visited[j]) {\n q.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its local rows\n int localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited = false;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] && !visited) {\n localCount++;\n visited = true;\n }\n }\n }\n\n // Reduce the local counts to get the global count\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n for (size_t j = 0; j < N; j++) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t u = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[u] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then push the neighbor onto the stack\n if (!visited[j] && A[i][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n 
// Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n // Iterate over the neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n if (A[vertex][j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of components in its local part of the graph\n int local_count = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N] = {false};\n for (size_t j = 0; j < N; j++) {\n if (!visited[j] && A[i][j] == 1) {\n // Found a new component\n local_count++;\n std::queue q;\n q.push(j);\n while (!q.empty()) {\n size_t u = q.front();\n q.pop();\n visited[u] = true;\n for (size_t v = 0; v < N; v++) {\n if (!visited[v] && A[u][v] == 1) {\n q.push(v);\n }\n }\n }\n }\n }\n }\n\n // Reduce the local counts to get the global count\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor is connected and has not been visited, then add it to the stack\n if (A[node][j] == 1 && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of connected components in its own subset of rows.\n int 
localCount = 0;\n for (size_t i = rank; i < N; i += size) {\n bool visited[N] = {false};\n for (size_t j = 0; j < N; j++) {\n if (!visited[j] && A[i][j] == 1) {\n localCount++;\n dfs(A, visited, j, N);\n }\n }\n }\n\n // Reduce the local counts to get the global count.\n int globalCount;\n MPI_Reduce(&localCount, &globalCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0.\n if (rank == 0) {\n return globalCount;\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "cuda", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\n__global__ void componentCount(const int *A, size_t N, int *numComponents) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tint row = tid / N;\n\tint col = tid % N;\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\tatomicAdd(numComponents, 1);\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] != 0) {\n\t\t\tA[row * N + i] = 0;\n\t\t\tA[i * N + row] = 0;\n\t\t\tcomponentCount<<<1, N>>>(A, N, numComponents);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = -1;\n\t\t} else {\n\t\t\tnumComponents[tid] = tid;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && numComponents[tid] != -1) {\n\t\t\tnumComponents[k] = numComponents[tid];\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid < N && tid_y < N) {\n\t if (A[tid * N + tid_y] == 1) {\n\t atomicAdd(numComponents, 1);\n\t }\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tint row = tid / N;\n\tint col = tid % N;\n\tif (A[tid] == 0) {\n\t\tnumComponents[tid] = 0;\n\t\treturn;\n\t}\n\tint component = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[row * N + i] == 1) {\n\t\t\tcomponent = i;\n\t\t\tbreak;\n\t\t}\n\t}\n\tnumComponents[tid] = component;\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tbool visited = false;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tvisited = true;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tif (visited) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\t// Each thread checks if the current element is non-zero.\n\t// If so, it increments the component count.\n\tint row = blockIdx.x;\n\tint col = blockIdx.y;\n\tif (A[row * N + col] != 0) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * 
blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] == 1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint bid = blockIdx.y * blockDim.y + blockIdx.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[bid] == 0) {\n\t\tvisited[bid] = 1;\n\t\t(*numComponents)++;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcomponentCount(A, N, numComponents);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i < j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = 0;\n\t\t} else {\n\t\t\tnumComponents[tid] = 1;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tint numComponents_local = numComponents[tid];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && numComponents_local != -1) {\n\t\t\tnumComponents[k] = numComponents_local;\n\t\t}\n\t}\n\t__syncthreads();\n\tif (i == 0) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (numComponents[k] != -1) {\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t*numComponents = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint bid = blockIdx.y * blockDim.y + blockIdx.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[bid] == 0) {\n\t\tvisited[bid] = 1;\n\t\t*numComponents += 1;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tif (A[bid * N + i] == 1 && visited[i] == 0) {\n\t\t\t\tcomponentCount(A, N, numComponents);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tidx = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tidy = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tidx >= N || tidy >= N) {\n\t\treturn;\n\t}\n\tif (A[tidx * N + tidy] == 0) {\n\t\treturn;\n\t}\n\tatomicAdd(numComponents, 1);\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == 0) {\n\t\t*numComponents = 0;\n\t}\n\t__syncthreads();\n\tint visited[N];\n\tfor (int i = 0; i < N; i++) {\n\t\tvisited[i] = 0;\n\t}\n\tif (visited[tid] == 0) {\n\t\tvisited[tid] = 1;\n\t\t__syncthreads();\n\t\tint count = 1;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tif (A[tid * N + j] == 1 && visited[j] == 0) {\n\t\t\t\tvisited[j] = 1;\n\t\t\t\tcount++;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t\tatomicAdd(numComponents, count);\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n\tif (tid >= N * N) {\n\t\treturn;\n\t}\n\tif (A[tid] == 0) {\n\t\treturn;\n\t}\n\tint component = tid;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tcomponent = min(component, i);\n\t\t}\n\t}\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_y * N + i] == 1) {\n\t\t\tA[tid_y * N + i] = component;\n\t\t}\n\t}\n\tif (tid 
== component) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j) {\n\t\tif (A[i * N + j] == 0) {\n\t\t\tnumComponents[tid] = 0;\n\t\t} else {\n\t\t\tnumComponents[tid] = 1;\n\t\t}\n\t} else {\n\t\tnumComponents[tid] = -1;\n\t}\n\t__syncthreads();\n\tint numComponents_local = numComponents[tid];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1) {\n\t\t\tnumComponents_local = min(numComponents_local, numComponents[k * N + j]);\n\t\t}\n\t}\n\tnumComponents[tid] = numComponents_local;\n\t__syncthreads();\n\tif (i == j) {\n\t\tint count = 0;\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tcount += (numComponents[k * N + j] == numComponents_local);\n\t\t}\n\t\tnumComponents[tid] = count;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid_row == tid_col) {\n\t\tif (A[tid] == 1) {\n\t\t\tatomicAdd(numComponents, 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == j && A[i * N + j] == 1) {\n\t\tatomicAdd(numComponents, 1);\n\t}\n}\n", + "\n\n\tint tidx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tidy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (tidx >= N || tidx >= N) {\n\t\treturn;\n\t}\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tidx * N + i] == 1) {\n\t\t\tcount++;\n\t\t}\n\t}\n\tatomicAdd(numComponents, count);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n\t Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The component count is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint componentCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n std::vector visited(N, 0);\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int j = 0; j < N; j++) {\n if (A[u * N + j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n local_count++;\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+: count)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top vertex from the stack\n size_t v = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the vertex\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and is connected to the vertex, add it to the stack\n if (!visited[j] && A[i][j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n count++;\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a visited array to keep track of which nodes have been visited\n std::vector visited(N, false);\n\n // Initialize the component count to 0\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // If the node has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over all the neighbors of the node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and there is an edge between the node and the neighbor, then add the neighbor to the stack\n if (!visited[j] && A[node * N + j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) 
{\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n std::vector visited(N, 0);\n for (size_t i = 0; i < N; i++) {\n if (visited[i] == 0) {\n int count = 0;\n std::queue q;\n q.push(i);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (visited[u] == 0) {\n count++;\n visited[u] = 1;\n for (size_t j = 0; j < N; j++) {\n if (A[u * N + j] == 1) { q.push(j); }\n }\n }\n }\n local_count += count > 0;\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) { return global_count; }\n return -1;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a depth-first search to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[node] = true;\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the neighbor has not been visited, push the neighbor onto the stack\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If the neighbor has not been visited and is connected to the node, then visit the neighbor\n if (!visited[j] && A[i][j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n count++;\n }\n }\n\n // Reduce the component count to rank 0\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n 
visited[node] = true;\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n if (A[node][j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Visit the row\n visited[i] = true;\n\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n // Pop the top node from the stack\n size_t node = stack.top();\n stack.pop();\n\n // Visit all the unvisited neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && !visited[j]) {\n visited[j] = true;\n stack.push(j);\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n MPI_Reduce(&componentCount, &componentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over each node\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the node has not been visited, perform a DFS to count the number of connected components\n if (!visited[i]) {\n componentCount++;\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n // Perform a DFS to visit all connected nodes\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the node\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between the node and the neighbor and the 
neighbor has not been visited, push the neighbor onto the stack and mark it as visited\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n visited[i] = true;\n\n while (!stack.empty()) {\n // Get the current node\n size_t node = stack.top();\n stack.pop();\n\n // Iterate over the neighbors of the current node\n for (size_t j = 0; j < N; j++) {\n // If the neighbor has not been visited and is connected to the current node, then add it to the stack\n if (!visited[j] && A[i][j] == 1) {\n stack.push(j);\n visited[j] = true;\n }\n }\n }\n\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n MPI_Reduce(&componentCount, nullptr, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return componentCount;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_count = 0;\n int global_count = 0;\n int* visited = new int[N];\n memset(visited, 0, N * sizeof(int));\n int my_start = rank * (N / size);\n int my_end = (rank + 1) * (N / size);\n if (rank == size - 1) {\n my_end = N;\n }\n #pragma omp parallel for reduction(+ : local_count)\n for (int i = my_start; i < my_end; i++) {\n if (visited[i] == 0) {\n std::queue q;\n q.push(i);\n visited[i] = 1;\n local_count++;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int j = 0; j < N; j++) {\n if (A[u][j] == 1 && visited[j] == 0) {\n q.push(j);\n visited[j] = 1;\n }\n }\n }\n }\n }\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n delete[] visited;\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n\n // Perform a depth-first search to visit all the nodes in the component\n std::stack stack;\n stack.push(i);\n\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n\n // Mark the node as visited\n visited[j] = true;\n\n // Push all the unvisited neighbors of the node onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[i * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not 
been visited, then start a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all the neighbors of the node onto the stack\n for (size_t j = 0; j < N; ++j) {\n if (A[node * N + j] == 1) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Return the component count\n return componentCount;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; ++i) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; ++k) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Loop over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then it is the start of a new component\n if (!visited[i]) {\n // Perform a depth-first search to find all the vertices in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t vertex = stack.top();\n stack.pop();\n if (!visited[vertex]) {\n visited[vertex] = true;\n // Push all the unvisited neighbors of the vertex onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[vertex * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n // Increment the component count\n componentCount++;\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int componentCount = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:componentCount)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n componentCount++;\n // Perform a 
depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t node = stack.top();\n stack.pop();\n if (!visited[node]) {\n visited[node] = true;\n // Push all unvisited neighbors of the node onto the stack\n for (size_t j = 0; j < N; j++) {\n if (A[node * N + j] && !visited[j]) {\n stack.push(j);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count to rank 0\n int globalComponentCount;\n MPI_Reduce(&componentCount, &globalComponentCount, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return globalComponentCount;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Initialize the component count\n int count = 0;\n\n // Iterate over the rows of the adjacency matrix\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n // If the row has not been visited, then start a new component\n if (!visited[i]) {\n count++;\n // Perform a depth-first search to visit all nodes in the component\n std::stack stack;\n stack.push(i);\n while (!stack.empty()) {\n size_t j = stack.top();\n stack.pop();\n if (!visited[j]) {\n visited[j] = true;\n // Push all unvisited neighbors of j onto the stack\n for (size_t k = 0; k < N; k++) {\n if (A[j * N + k] && !visited[k]) {\n stack.push(k);\n }\n }\n }\n }\n }\n }\n\n // Reduce the component count across all ranks\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the component count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the connected components for a subset of the rows\n int start_row = rank * (N / size);\n int end_row = (rank + 1) * (N / size);\n if (rank == size - 1) {\n end_row = N;\n }\n\n // Initialize the visited array\n std::vector visited(N, false);\n\n // Count the number of connected components\n int count = 0;\n for (int i = start_row; i < end_row; i++) {\n if (!visited[i]) {\n count++;\n dfs(A, i, visited);\n }\n }\n\n // Reduce the count across all ranks\n int global_count;\n MPI_Reduce(&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Return the global count on rank 0\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "&" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A represents a directed graph.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) { count += A(i, j); });\n });\n return count;\n}\n", + "\n\n int edge_count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { edge_count += A(i, j); }\n });\n return edge_count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(\"edgeCount\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; j++) {\n count += A(i, j);\n }\n });\n return count;\n}\n", + "\n\n Kokkos::View edgeCount(\"edgeCount\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n edgeCount(i) = 0;\n for (size_t j = 0; j < N; ++j) { edgeCount(i) += A(i, j); }\n });\n int totalEdgeCount = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &total) { total += edgeCount(i); }, totalEdgeCount);\n return totalEdgeCount;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, int &edgeCount) {\n for (size_t j = 0; j < N; ++j) {\n edgeCount += A(i, j);\n }\n },\n edgeCount);\n return edgeCount;\n}\n", + "\n\n int edge_count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n if (A(i, j) > 0) {\n ++edge_count;\n }\n }\n });\n return edge_count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n for (size_t j = 0; j < N; ++j) { count += A(i, j); }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n 
Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) { count += A(i, j); });\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n count += A(i, j);\n }\n });\n return count;\n}\n", + "\n\n int count = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA (const int i) {\n for (int j = 0; j < N; j++) {\n if (A(i, j) != 0) {\n count++;\n }\n }\n });\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) { count += A[i * N + j] > 0; }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int edge_count = 0;\n#pragma omp parallel for reduction(+ : edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n edge_count += A[i * N + j];\n }\n }\n return edge_count;\n}\n", + "\n\n int count = 0;\n#pragma opm parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n 
for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i][j] > 0) {\n count++;\n }\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+ : count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "\n\n int count = 0;\n#pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "serial", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004242406, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0041054082 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0042803194, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042952176 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0042182044, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042357532 + } + ] + }, + { + "generated_output": "\n\n int count = 
0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041725006, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042067848 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0042395068, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0041957078 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041917368, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042433898 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0042904728, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00431578 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0042004832, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.004157877 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004232816, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042253862 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0042546246, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.004107829 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 
0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0043456886, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0041990504 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0042559498, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.004270146 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0042668984, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0041447542 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0042811532, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042645878 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0042131092, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.004243971 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++count;\n }\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004234876, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.004232303 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0042038074, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042087148 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n return 
count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0042323672, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0040737568 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0042466942, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0041210998 + } + ] + }, + { + "generated_output": "\n\n int count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n count++;\n }\n }\n }\n return count;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041744504, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042541062 + } + ] + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "hip", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n Store the result in numEdges. A represents a directed graph.\n A is an NxN adjacency matrix stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void edgeCount(const int *A, size_t N, int *numEdges) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int count = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n count += A[i * N + j];\n }\n }\n hipAtomicAdd(numEdges, count);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[i * N + j];\n }\n }\n __syncthreads();\n atomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint numEdgesLocal = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tnumEdgesLocal += A[tid * N + i];\n\t}\n\thipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t i = tid; i < N; i += hipBlockDim_x) {\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[i * N + j] > 0;\n }\n }\n atomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n int tid = threadIdx.x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid 
= hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += (A[tid * N + i] != 0);\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n for (int j = 0; j < N; j++) {\n if (A[tid * N + j] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesPerThread = 0;\n for (size_t j = 0; j < N; j++) {\n numEdgesPerThread += A[tid * N + j];\n }\n hipAtomicAdd(numEdges, numEdgesPerThread);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n hipAtomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int numEdgesLocal = 0;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = 0; j < N; j++) {\n numEdgesLocal += A[i * N + j];\n }\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int row = tid;\n int col = tid;\n int count = 0;\n while (row < N) {\n count += A[row * N + col];\n row += hipBlockDim_x;\n col += hipBlockDim_x;\n }\n hipAtomicAdd(numEdges, count);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < 
N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + ", \u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2 Saxe\u51a2 Saxe\u51a2\u51a2\u51a2 Saxe\u51a2\u51a2\u51a2\u51a2", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] != 0) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] == 1) {\n ++local_count;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "cuda", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n Store the result in numEdges. A represents a directed graph.\n A is an NxN adjacency matrix stored in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void edgeCount(const int *A, size_t N, int *numEdges) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x;\n int count = 0;\n for (int col = 0; col < N; col++) {\n count += A[row * N + col];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int rowStart = blockIdx.x * blockDim.x + threadIdx.x;\n if (rowStart >= N) {\n return;\n }\n int rowEnd = rowStart + blockDim.x;\n if (rowEnd > N) {\n rowEnd = N;\n }\n int count = 0;\n for (int i = rowStart; i < rowEnd; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] != 0) {\n count++;\n }\n }\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int j = 0; j < N; j++) {\n if (A[tid * N + j] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\nint rowStart = blockIdx.x * blockDim.x + threadIdx.x;\nif (rowStart >= N) {\n return;\n}\nint rowEnd = rowStart + blockDim.x;\nif (rowEnd > N) {\n rowEnd = N;\n}\nint count = 0;\nfor (int i = rowStart; i < rowEnd; ++i) {\n for (int j = 0; j < N; ++j) {\n count += A[i * N + j];\n }\n}\natomicAdd(numEdges, count);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; ++col) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row < N && col < N) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] == 1) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] > 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row >= N) {\n return;\n }\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; ++col) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n 
}\n}\n", + "\n\n int tid = threadIdx.x;\n int count = 0;\n for (int i = 0; i < N; i++) {\n count += A[tid * N + i];\n }\n atomicAdd(numEdges, count);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int row = blockIdx.x * blockDim.x + tid;\n if (row < N) {\n for (int col = 0; col < N; col++) {\n if (A[row * N + col] != 0) {\n atomicAdd(numEdges, 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int numEdgesLocal = 0;\n for (size_t i = 0; i < N; i++) {\n numEdgesLocal += A[tid * N + i];\n }\n atomicAdd(numEdges, numEdgesLocal);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma parallel for reduction (+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Allgather(&local_count, 1, MPI_INT, &global_count, 1, MPI_INT, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n int nthreads;\n omp_set_num_threads(nthreads);\n int local_count = 0;\n int global_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n local_count++;\n }\n }\n }\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (myrank == 0) {\n return global_count;\n } else {\n return 0;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 
0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_count;\n } else {\n return -1;\n }\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n int global_count;\n\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_count;\n}\n", + "\n\n int local_edge_count = 0;\n int global_edge_count;\n #pragma omp parallel for reduction(+: local_edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_edge_count += A[i * N + j];\n }\n }\n MPI_Reduce(&local_edge_count, &global_edge_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_edge_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_edge_count = 0;\n #pragma omp parallel for reduction(+: local_edge_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_edge_count += A[i * N + j];\n }\n }\n int global_edge_count;\n MPI_Reduce(&local_edge_count, &global_edge_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_edge_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n 
}\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_count;\n }\n return -1;\n}\n", + "\n\n int local_count = 0;\n #pragma omp parallel for reduction(+:local_count)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_count += A[i * N + j];\n }\n }\n\n int global_count;\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n", + "\n\n int local_count = 0;\n int global_count = 0;\n #pragma omp parallel for reduction(+: local_count)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n }\n MPI_Reduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n return global_count;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max = 0;\n // Iterate over the rows of the adjacency matrix\n for (size_t i = 0; i < N; ++i) {\n // Iterate over the columns of the adjacency matrix\n for (size_t j = 0; j < N; ++j) {\n // If there is an edge between node i and node j, increment the degree of node i\n if (A(i, j) == 1) {\n ++max;\n }\n }\n }\n // Return the maximum degree\n return max;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n Kokkos::View maxDegree(\"maxDegree\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n maxDegree(i) = degree;\n });\n return *Kokkos::max_element(maxDegree);\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int i, int &max_degree) {\n int degree = 0;\n for (int j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A(i, j);\n }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &maxDegree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n maxDegree = Kokkos::max(maxDegree, degree);\n });\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, 
KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::View;\n\n int max_degree = 0;\n Kokkos::parallel_reduce(\n RangePolicy<>(0, N),\n KOKKOS_LAMBDA(const int i, int& max_degree) {\n int degree = 0;\n for (int j = 0; j < N; j++) {\n degree += A(i, j);\n }\n max_degree = std::max(max_degree, degree);\n },\n max_degree);\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n if (degree > max_degree) { max_degree = degree; }\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const size_t &i, int &max_degree) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A(i, j); }\n max_degree = Kokkos::max(max_degree, degree);\n });\n return max_degree;\n}\n", + "\n\n Kokkos::View maxDegree(\"maxDegree\", N);\n Kokkos::parallel_for(\"maxDegree\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A(i, j);\n }\n maxDegree(i) = degree;\n });\n int max = 0;\n for (size_t i = 0; i < N; i++) {\n if (maxDegree(i) > max) {\n max = maxDegree(i);\n }\n }\n return max;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "_t>", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n 
max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n#pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) { degree += A[i * N + j]; }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "serial", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041294832, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0041418488 + } + ] + }, + { + "generated_output": "\n\n int max_deg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n max_deg = std::max(max_deg, deg);\n }\n return max_deg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041070252, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.004095245 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041737328, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0041751358 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041458604, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.004145514 + } + ] + }, + { + "generated_output": "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) { deg += A[i * N + j]; }\n maxDeg = std::max(maxDeg, deg);\n }\n return maxDeg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041906846, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0041588908 + } + ] + }, + { + "generated_output": "\n\n int max_deg = 0;\n for (int i = 0; i < N; i++) {\n int deg = 0;\n for (int j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n max_deg = std::max(max_deg, deg);\n }\n return max_deg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0042280348, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0042416426 + } + ] + 
}, + { + "generated_output": "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0040607542, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0040693084 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0040898508, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0040993808 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041631832, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.004165838 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041350142, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0041460288 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041593862, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0041480164 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041262798, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.004118553 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i][j];\n }\n max_degree = 
std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041629004, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0041647618 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041431134, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0041487858 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041558974, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0041647 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0040877168, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0041009722 + } + ] + }, + { + "generated_output": "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) { deg += A[i * N + j]; }\n maxDeg = std::max(maxDeg, deg);\n }\n return maxDeg;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041007616, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0041011884 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041009938, + "runs": [ + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.0040859016 + } + ] + }, + { + "generated_output": "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0041176636, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0041120148 + } + ] + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "hip", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Store the result in maxDegree.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void maxDegree(const int *A, size_t N, int *maxDegree) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n localMax = max(localMax, *maxDegree);\n if (threadIdx.x == 0) {\n *maxDegree = localMax;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n atomicMax(&(*maxDegree), max);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n hipAtomicMax(&(*maxDegree), degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (size_t i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n hipAtomicMax(maxDegree, degree);\n}\n", + "\n\n int myDegree = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; ++i) {\n degree += A[tid * N + i];\n }\n 
atomicAdd(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tint bid = hipBlockIdx_x;\n\tint row = bid * hipBlockDim_x + tid;\n\tint degree = 0;\n\tif (row < N) {\n\t\tfor (int col = 0; col < N; col++) {\n\t\t\tdegree += A[row * N + col];\n\t\t}\n\t}\n\thipAtomicMax(maxDegree, degree);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n *maxDegree = hipAtomicMax(*maxDegree, degree);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_degree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n }\n int max_degree;\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "\n\n int my_max = 0;\n for (size_t i = 0; i < N; i++) {\n int row_sum = 0;\n for (size_t j = 0; j < N; j++) {\n row_sum += A[i * N + j];\n }\n my_max = std::max(my_max, row_sum);\n }\n\n int global_max;\n MPI_Allreduce(&my_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int rowSum = 0;\n for (size_t j = 0; j < N; j++) { rowSum += A[i * N + j]; }\n myDegree = std::max(myDegree, rowSum);\n }\n\n int globalDegree;\n MPI_Allreduce(&myDegree, &globalDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] != 0) {\n myDegree++;\n }\n }\n }\n int maxDegree = 0;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < 
N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int maxDeg = 0;\n for (size_t i = 0; i < N; i++) {\n int deg = 0;\n for (size_t j = 0; j < N; j++) {\n deg += A[i * N + j];\n }\n maxDeg = std::max(maxDeg, deg);\n }\n int globalMaxDeg;\n MPI_Allreduce(&maxDeg, &globalMaxDeg, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDeg;\n}\n", + "\n\n int max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n myDegree++;\n }\n }\n }\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int local_max = 0;\n for (size_t i = 0; i < N; i++) {\n int local_degree = 0;\n for (size_t j = 0; j < N; j++) {\n local_degree += A[i * N + j];\n }\n local_max = std::max(local_max, local_degree);\n }\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n if (degree > maxDegree) {\n maxDegree = degree;\n }\n }\n return maxDegree;\n}\n", + "\n\n int local_max = 0;\n for (size_t i = 0; i < N; i++) {\n int row_sum = 0;\n for (size_t j = 0; j < N; j++) {\n row_sum += A[i * N + j];\n }\n local_max = std::max(local_max, row_sum);\n }\n\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int my_max_degree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n my_max_degree = std::max(my_max_degree, degree);\n }\n\n int max_degree;\n MPI_Allreduce(&my_max_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "\n\n int my_degree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n\n int max_degree;\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_degree;\n}\n", + "Degree)", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n myDegree += A[i * N + j];\n }\n }\n\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n\n int globalMaxDegree;\n MPI_Allreduce(&myDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n myDegree += A[i * N + j];\n }\n }\n\n int maxDegree;\n MPI_Allreduce(&myDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 
0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n int globalMaxDegree;\n MPI_Allreduce(&maxDegree, &globalMaxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalMaxDegree;\n}\n", + "\n\n int maxDegree = 0;\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n degree++;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, °ree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n }\n return maxDegree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "cuda", + "prompt": "/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph. \n Store the result in maxDegree.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\n__global__ void maxDegree(const int *A, size_t N, int *maxDegree) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n if (localMax > *maxDegree) {\n *maxDegree = localMax;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n if (tid == 0) {\n *maxDegree = max;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = myMaxDegree;\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, myDegree);\n}\n", + "\n\n int tid = threadIdx.x;\n int max = 0;\n for (int i = 0; i < N; i++) {\n max += A[tid * N + i];\n }\n maxDegree[tid] = max;\n}\n", + "\n\n int myMaxDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n myMaxDegree = max(myMaxDegree, degree);\n }\n *maxDegree = 
myMaxDegree;\n}\n", + "\n\n int localMax = 0;\n for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (int j = 0; j < N; j++) {\n localMax += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = max(*maxDegree, localMax);\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint max = 0;\n\tif (tid < N) {\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tmax += A[tid * N + i];\n\t\t}\n\t}\n\tmaxDegree[tid] = max;\n\t__syncthreads();\n\tif (tid == 0) {\n\t\tint max_degree = 0;\n\t\tfor (int i = 0; i < N; i++) {\n\t\t\tmax_degree = max(max_degree, maxDegree[i]);\n\t\t}\n\t\t*maxDegree = max_degree;\n\t}\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n *maxDegree = max(*maxDegree, myDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = myDegree;\n for (size_t i = 1; i < gridDim.x; i++) {\n int otherDegree;\n cudaMemcpy(&otherDegree, &A[i * blockDim.x * N], sizeof(int), cudaMemcpyDeviceToHost);\n *maxDegree = max(*maxDegree, otherDegree);\n }\n }\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *maxDegree = max(*maxDegree, myDegree);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int degree = 0;\n for (int i = 0; i < N; i++) {\n degree += A[tid * N + i];\n }\n maxDegree[tid] = degree;\n}\n", + "Degree,", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n }\n atomicMax(&*maxDegree, myDegree);\n}\n", + "\n\n\tint myMaxDegree = 0;\n\tint rowStart = blockIdx.x * blockDim.x + threadIdx.x;\n\tfor (int i = rowStart; i < N; i += blockDim.x * gridDim.x) {\n\t\tint degree = 0;\n\t\tfor (int j = 0; j < N; j++) {\n\t\t\tdegree += A[i * N + j];\n\t\t}\n\t\tmyMaxDegree = max(myMaxDegree, degree);\n\t}\n\tatomicAdd(maxDegree, myMaxDegree);\n}\n", + "\n\n int myDegree = 0;\n for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n for (size_t j = 0; j < N; j++) {\n if (A[i * N + j] == 1) {\n myDegree++;\n }\n }\n }\n atomicMax(maxDegree, myDegree);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the highest node degree in the undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
The result is returned on rank 0.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_degree = 0;\n int my_degree = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk_size = N / nprocs;\n int start_row = rank * chunk_size;\n int end_row = start_row + chunk_size;\n if (rank == nprocs - 1) {\n end_row = N;\n }\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n MPI_Allreduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_degree;\n }\n return -1;\n}\n", + "\n\n int max_degree = 0;\n int my_max_degree = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel for reduction(max : my_max_degree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_max_degree += A[i * N + j];\n }\n }\n\n MPI_Allreduce(&my_max_degree, &max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return max_degree;\n } else {\n return -1;\n }\n}\n", + "\n\n int maxDegree = 0;\n int localMaxDegree = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel for reduction(max : localMaxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n localMaxDegree += A[i * N + j];\n }\n }\n\n MPI_Allreduce(&localMaxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int local_max_degree = 0;\n int global_max_degree = 0;\n\n // Compute the local maximum degree\n #pragma omp parallel for reduction(max : local_max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n local_max_degree = std::max(local_max_degree, degree);\n }\n\n // Reduce the local maximum degrees to find the global maximum degree\n MPI_Allreduce(&local_max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max_degree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n\n #pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, my_degree);\n my_degree = 0;\n }\n\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int localMaxDegree = 0;\n #pragma omp parallel for reduction(max:localMaxDegree)\n for (size_t i = 0; i < N; i++) {\n 
int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n localMaxDegree = std::max(localMaxDegree, degree);\n }\n MPI_Reduce(&localMaxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int myDegree = 0;\n #pragma omp parallel for reduction(max:myDegree)\n for (size_t i = 0; i < N; ++i) {\n int degree = 0;\n for (size_t j = 0; j < N; ++j) {\n degree += A[i * N + j];\n }\n myDegree = std::max(myDegree, degree);\n }\n\n int globalDegree;\n MPI_Reduce(&myDegree, &globalDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return globalDegree;\n}\n", + "\n\n int maxDegree = 0;\n#pragma omp parallel for reduction(max : maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n #pragma omp parallel for reduction(max : max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int local_max = 0;\n for (int i = 0; i < N; i++) {\n int local_count = 0;\n for (int j = 0; j < N; j++) {\n local_count += A[i * N + j];\n }\n local_max = std::max(local_max, local_count);\n }\n\n int global_max;\n MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int maxDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n #pragma omp parallel for reduction(max: maxDegree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n #pragma omp parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n my_degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, my_degree);\n my_degree = 0;\n }\n int global_max_degree;\n MPI_Allreduce(&max_degree, &global_max_degree, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max_degree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n return maxDegree;\n}\n", + "\n\n int maxDegree = 0;\n int myDegree = 0;\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n #pragma omp parallel for reduction(max:maxDegree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n myDegree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, myDegree);\n myDegree = 0;\n }\n MPI_Reduce(&maxDegree, &maxDegree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return maxDegree;\n}\n", + "\n int max_degree = 
0;\n #pragma opm parallel for reduction(max:max_degree)\n for (size_t i = 0; i < N; i++) {\n int degree = 0;\n for (size_t j = 0; j < N; j++) {\n degree += A[i * N + j];\n }\n max_degree = std::max(max_degree, degree);\n }\n return max_degree;\n}\n", + "\n\n int max_degree = 0;\n int my_degree = 0;\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int num_threads;\n omp_set_num_threads(4);\n #pragma omp parallel\n {\n #pragma omp for reduction(+:my_degree)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n my_degree += A[i * N + j];\n }\n }\n }\n MPI_Reduce(&my_degree, &max_degree, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return max_degree;\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix. A is an undirected graph.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(Kokkos::View &A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n visited[i] = 0;\n });\n\n dist[source] = 0;\n visited[source] = 1;\n\n int minDist = INT_MAX;\n\n while (visited[dest] == 0) {\n int minDistNode = -1;\n for (int i = 0; i < N; i++) {\n if (visited[i] == 0 && (minDistNode == -1 || dist[i] < dist[minDistNode])) {\n minDistNode = i;\n }\n }\n\n if (minDistNode == -1) {\n break;\n }\n\n visited[minDistNode] = 1;\n\n for (int i = 0; i < N; i++) {\n if (A(minDistNode, i) == 1 && visited[i] == 0) {\n if (dist[i] == -1 || dist[i] > dist[minDistNode] + 1) {\n dist[i] = dist[minDistNode] + 1;\n }\n }\n }\n }\n\n if (dist[dest] == -1) {\n return -1;\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n Kokkos::View visited(\"visited\", N);\n Kokkos::parallel_for(\"bfs\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n visited[i] = false;\n });\n visited[source] = true;\n Kokkos::View queue(\"queue\", N);\n int head = 0;\n int tail = 0;\n queue[tail++] = source;\n while (head < tail) {\n int u = queue[head++];\n for (int v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n queue[tail++] = v;\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { distances[i] = std::numeric_limits::max(); });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int current = q.front();\n q.pop();\n\n // Iterate over all neighbors of the current node\n for (size_t i = 0; i < N; i++) {\n if (A(current, i) == 1 && distances[i] == std::numeric_limits::max()) {\n distances[i] = distances[current] + 1;\n q.push(i);\n }\n }\n }\n\n // Return the distance to the destination\n return distances[dest];\n}\n", + "\n\n 
Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n dist() = -1;\n dist(source) = 0;\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n visited(source) = true;\n Kokkos::View queue(\"queue\", N);\n queue() = -1;\n int head = 0;\n int tail = 0;\n queue(tail++) = source;\n while (head != tail) {\n int u = queue(head++);\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited(v)) {\n dist(v) = dist(u) + 1;\n visited(v) = true;\n queue(tail++) = v;\n }\n }\n }\n return dist(dest);\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) { distances[i] = INT_MAX; });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Queue of nodes to visit\n std::queue q;\n q.push(source);\n\n // While there are nodes to visit\n while (!q.empty()) {\n // Get the current node\n int current = q.front();\n q.pop();\n\n // If the current node is the destination, return the distance\n if (current == dest) {\n return distances[current];\n }\n\n // Visit all neighbors of the current node\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A(current, i) == 1) {\n // If the distance to the neighbor is greater than the distance to the current node plus 1\n if (distances[i] > distances[current] + 1) {\n // Update the distance to the neighbor\n distances[i] = distances[current] + 1;\n // Add the neighbor to the queue\n q.push(i);\n }\n }\n }\n }\n\n // If there is no path from the source to the destination, return -1\n return -1;\n}\n", + "\n\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = Kokkos::TeamMember;\n\n Kokkos::View visited(\"visited\", N);\n Kokkos::View distance(\"distance\", N);\n Kokkos::View queue(\"queue\", N);\n\n visited[source] = 1;\n distance[source] = 0;\n queue[0] = source;\n\n int head = 0;\n int tail = 1;\n\n while (head < tail) {\n int u = queue[head++];\n for (int v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n visited[v] = 1;\n distance[v] = distance[u] + 1;\n queue[tail++] = v;\n }\n }\n }\n\n return distance[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) { distances[i] = std::numeric_limits::max(); });\n\n // Set the distance of the source to 0\n distances[source] = 0;\n\n // Initialize the queue with the source\n std::queue q;\n q.push(source);\n\n // While the queue is not empty\n while (!q.empty()) {\n // Get the current vertex\n int current = q.front();\n q.pop();\n\n // If the current vertex is the destination, return the distance\n if (current == dest) {\n return distances[current];\n }\n\n // For each neighbor of the current vertex\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current vertex and the neighbor\n if (A(current, i) == 1) {\n // If the distance to the neighbor is greater than the distance to the current vertex plus 1\n if (distances[i] > distances[current] + 1) {\n // Update the distance to the neighbor\n distances[i] = distances[current] + 1;\n\n // Add the neighbor to the queue\n 
q.push(i);\n }\n }\n }\n }\n\n // If the destination is not reachable, return -1\n return -1;\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n size_t head = 0;\n size_t tail = 0;\n queue[tail++] = source;\n\n while (head != tail) {\n int u = queue[head++];\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n queue[tail++] = v;\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist(i) = -1;\n visited(i) = false;\n queue(i) = false;\n });\n\n dist(source) = 0;\n queue(source) = true;\n\n while (true) {\n int min_dist = -1;\n int min_idx = -1;\n for (int i = 0; i < N; i++) {\n if (queue(i) && (min_dist == -1 || dist(i) < min_dist)) {\n min_dist = dist(i);\n min_idx = i;\n }\n }\n\n if (min_idx == -1) {\n break;\n }\n\n queue(min_idx) = false;\n visited(min_idx) = true;\n\n for (int i = 0; i < N; i++) {\n if (A(min_idx, i) > 0 && !visited(i)) {\n if (dist(i) == -1 || dist(i) > dist(min_idx) + 1) {\n dist(i) = dist(min_idx) + 1;\n queue(i) = true;\n }\n }\n }\n }\n\n return dist(dest);\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t i = 0; i < N; i++) {\n if (A(u, i) && dist[i] == -1) {\n dist[i] = dist[u] + 1;\n q.push(i);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View distance = Kokkos::View(\"distance\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) { distance[i] = -1; });\n distance[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && distance[v] == -1) {\n distance[v] = distance[u] + 1;\n q.push(v);\n }\n }\n }\n return distance[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = INT_MAX;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n dist[v] = dist[u] + 1;\n visited[v] = true;\n q.push(v);\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) == 1 && dist[v] == -1) {\n 
dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::View visited(\"visited\", N);\n Kokkos::View queue(\"queue\", N);\n Kokkos::View queue_head(\"queue_head\", 1);\n Kokkos::View queue_tail(\"queue_tail\", 1);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int &i) {\n dist[i] = -1;\n visited[i] = false;\n });\n\n dist[source] = 0;\n visited[source] = true;\n queue_head[0] = 0;\n queue_tail[0] = 1;\n queue[0] = source;\n\n while (queue_head[0] < queue_tail[0]) {\n int u = queue[queue_head[0]];\n queue_head[0]++;\n for (int v = 0; v < N; v++) {\n if (A(u, v) && !visited[v]) {\n visited[v] = true;\n dist[v] = dist[u] + 1;\n queue[queue_tail[0]] = v;\n queue_tail[0]++;\n }\n }\n }\n\n return dist[dest];\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n dist() = -1;\n dist(source) = 0;\n\n Kokkos::View visited(\"visited\", N);\n visited() = false;\n visited(source) = true;\n\n Kokkos::View queue(\"queue\", N);\n queue() = -1;\n int head = 0;\n int tail = 0;\n queue(tail++) = source;\n\n while (head != tail) {\n int u = queue(head++);\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && !visited(v)) {\n dist(v) = dist(u) + 1;\n visited(v) = true;\n queue(tail++) = v;\n }\n }\n }\n\n return dist(dest);\n}\n", + "\n\n Kokkos::View dist(\"dist\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n dist[i] = -1;\n });\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A(u, v) && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n distances() = std::numeric_limits::max();\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n distances(source) = 0;\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t i = 0; i < N; i++) {\n if (A(u, i) == 1 && distances(i) == std::numeric_limits::max()) {\n q.push(i);\n distances(i) = distances(u) + 1;\n }\n }\n }\n\n // Return distance to destination\n return distances(dest);\n}\n", + "\n\n // Initialize distances to infinity\n Kokkos::View distances(\"distances\", N);\n distances() = std::numeric_limits::max();\n\n // Initialize queue with source node\n std::queue queue;\n queue.push(source);\n distances(source) = 0;\n\n // Perform BFS\n while (!queue.empty()) {\n int current = queue.front();\n queue.pop();\n\n // Iterate over neighbors of current node\n for (size_t i = 0; i < N; i++) {\n if (A(current, i) == 1 && distances(i) == std::numeric_limits::max()) {\n distances(i) = distances(current) + 1;\n queue.push(i);\n }\n }\n }\n\n // Return distance to destination\n return distances(dest);\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Use OpenMP to compute in parallel.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n std::vector dist(N, -1);\n dist[source] = 0;\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[u] + 1 < dist[v]) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n 
// Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] > dist[u] + 1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize the distance to all nodes to infinity\n std::vector distance(N, INT_MAX);\n\n // Set the distance to the source node to 0\n distance[source] = 0;\n\n // Create a queue to store the nodes to be visited\n std::queue queue;\n\n // Enqueue the source node\n queue.push(source);\n\n // While there are still nodes to be visited\n while (!queue.empty()) {\n // Dequeue the next node\n int current = queue.front();\n queue.pop();\n\n // Visit all the neighbors of the current node\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A[current][i] > 0) {\n // If the distance to the neighbor is greater than the distance to the current node plus the weight of the edge\n if (distance[i] > distance[current] + A[current][i]) {\n // Update the distance to the neighbor\n distance[i] = distance[current] + A[current][i];\n\n // Enqueue the neighbor\n queue.push(i);\n }\n }\n }\n }\n\n // Return the distance to the destination node\n return distance[dest];\n}\n", + "\n\n int dist[N];\n std::fill(dist, 
dist + N, -1);\n dist[source] = 0;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n if (u == dest) {\n return dist[u];\n }\n\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int min_dist = INT_MAX;\n std::vector dist(N, INT_MAX);\n std::vector visited(N, false);\n\n dist[source] = 0;\n visited[source] = true;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n if (A[source][i] == 1) {\n dist[i] = 1;\n visited[i] = true;\n }\n }\n\n #pragma omp for\n for (size_t k = 0; k < N; k++) {\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (A[i][k] == 1 && A[k][j] == 1 && dist[i] != INT_MAX && dist[j] > dist[i] + 1) {\n dist[j] = dist[i] + 1;\n visited[j] = true;\n }\n }\n }\n }\n }\n\n min_dist = dist[dest];\n return min_dist == INT_MAX ? -1 : min_dist;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Queue for BFS\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return distance to destination\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the distance to the destination\n return dist[dest];\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "serial", + "prompt": "/* 
Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, 0);\n std::queue> q;\n q.push({source, 0});\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n if (visited[node] == 1) {\n continue;\n }\n visited[node] = 1;\n for (size_t i = 0; i < N; i++) {\n if (A[node][i] == 1) {\n q.push({i, dist + 1});\n }\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n 
visited[source] = true;\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n for (int i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n visited[i] = true;\n }\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n visited[source] = true;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u][v] && !visited[v]) {\n q.push(v);\n dist[v] = dist[u] + 1;\n visited[v] = true;\n }\n }\n }\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) return dist;\n if (visited[node]) continue;\n visited[node] = true;\n for (size_t i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n }\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n 
dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) {\n return dist[u];\n }\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] == 1 && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n if (v == dest) {\n return dist[v];\n }\n }\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "=", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == -1) {\n q.push(v);\n dist[v] = dist[u] + 1;\n }\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n if (u == dest) return dist[u];\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == -1) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector dist(N, -1);\n std::queue q;\n q.push(source);\n dist[source] = 0;\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (size_t v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == -1) {\n q.push(v);\n dist[v] = dist[u] + 1;\n }\n }\n }\n return dist[dest];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": 
false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push(std::make_pair(source, 0));\n while (!q.empty()) {\n std::pair p = q.front();\n q.pop();\n if (p.first == dest) {\n return p.second;\n }\n if (visited[p.first]) {\n continue;\n }\n visited[p.first] = true;\n for (size_t i = 0; i < N; ++i) {\n if (A[p.first][i] == 1) {\n q.push(std::make_pair(i, p.second + 1));\n }\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector visited(N, false);\n std::queue> q;\n q.push({source, 0});\n visited[source] = true;\n\n while (!q.empty()) {\n auto [node, dist] = q.front();\n q.pop();\n if (node == dest) {\n return dist;\n }\n for (int i = 0; i < N; ++i) {\n if (A[node][i] && !visited[i]) {\n q.push({i, dist + 1});\n visited[i] = true;\n }\n }\n }\n\n return -1;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "hip", + "prompt": "/* Compute the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Store the result in pathLength.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\n__global__ void shortestPathLength(const int *A, size_t N, int source, int dest, int *pathLength) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\n if (tid == source) {\n pathLength[bid] = 0;\n } else {\n pathLength[bid] = -1;\n }\n\n __syncthreads();\n\n for (int i = 0; i < N; i++) {\n if (pathLength[bid] >= 0 && A[tid * N + i] == 1) {\n if (i == dest) {\n pathLength[bid] = pathLength[bid] + 1;\n break;\n } else if (pathLength[i * N + dest] == -1 ||\n pathLength[i * N + dest] > pathLength[bid] + 1) {\n pathLength[i * N + dest] = pathLength[bid] + 1;\n }\n }\n }\n}\n", + "\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint distance = INT_MAX;\n\tif (tid == source) {\n\t\tdistance = 0;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[tid * N + k] == 1) {\n\t\t\tif (distance + 1 < pathLength[k]) {\n\t\t\t\tpathLength[k] = distance + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n\tif (tid == dest) {\n\t\t*pathLength = distance;\n\t}\n}\n", + "\n\n int n = hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x;\n int m = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x;\n if (n == source) {\n pathLength[n] = 0;\n __syncthreads();\n for (int i = 0; i < N; ++i) {\n if (A[n * N + i] == 1) {\n pathLength[i] = 1;\n }\n }\n } else {\n __syncthreads();\n for (int i = 0; i < N; ++i) {\n if (A[n * N + i] == 1) {\n if (pathLength[i] != -1) {\n pathLength[n] = pathLength[i] + 1;\n }\n }\n }\n }\n 
__syncthreads();\n if (n == dest) {\n *pathLength = pathLength[n];\n }\n}\n", + "\n\n int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int tid_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int k = 0; k < N; k++) {\n if (A[tid * N + k] == 1 && tid != k) {\n if (pathLength[tid] == -1 || pathLength[k] + 1 < pathLength[tid]) {\n pathLength[tid] = pathLength[k] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n if (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n pathLength[i] = pathLength[tid] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n if (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n pathLength[i] = pathLength[tid] + 1;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n if (tid == source) {\n pathLength[bid] = 0;\n } else {\n pathLength[bid] = -1;\n }\n __syncthreads();\n int minPath = pathLength[bid];\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] && pathLength[i] != -1) {\n minPath = min(minPath, pathLength[i] + 1);\n }\n }\n pathLength[bid] = minPath;\n __syncthreads();\n if (tid == dest) {\n *pathLength = minPath;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tif (tid == source && bid == 0) {\n\t\t*pathLength = 0;\n\t} else if (tid == dest && bid == N - 1) {\n\t\t*pathLength = -1;\n\t} else {\n\t\t*pathLength = INT_MAX;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tif (bid + 1 < N) {\n\t\t\t\tint next = i + (bid + 1) * N;\n\t\t\t\tatomicMin(&pathLength[next], *pathLength + 1);\n\t\t\t}\n\t\t\tif (bid - 1 >= 0) {\n\t\t\t\tint next = i + (bid - 1) * N;\n\t\t\t\tatomicMin(&pathLength[next], *pathLength + 1);\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int tid_y = blockIdx.y * blockDim.y + threadIdx.y;\n if (tid == source) {\n pathLength[tid] = 0;\n } else {\n pathLength[tid] = -1;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] == 1 && tid != i) {\n if (pathLength[tid] == -1) {\n pathLength[tid] = pathLength[i] 
+ 1;\n } else {\n pathLength[tid] = min(pathLength[tid], pathLength[i] + 1);\n }\n }\n }\n __syncthreads();\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1) {\n\t\t\tif (A[tid * N + i] == 1) {\n\t\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int i = tid / N;\n int j = tid % N;\n if (i == source) {\n pathLength[j] = 1;\n } else {\n pathLength[j] = -1;\n }\n __syncthreads();\n for (int k = 0; k < N; ++k) {\n if (pathLength[k] == -1) {\n continue;\n }\n if (A[i * N + k] == 1) {\n if (pathLength[j] == -1 || pathLength[j] > pathLength[k] + 1) {\n pathLength[j] = pathLength[k] + 1;\n }\n }\n }\n}\n", + "\n\n\tint tidx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint tridy = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n\tint i = tidx;\n\tint j = tridy;\n\tif (i == source) {\n\t\tpathLength[i] = 0;\n\t} else {\n\t\tpathLength[i] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (tid == source && bid == dest) {\n *pathLength = 0;\n } else if (tid == dest && bid == source) {\n *pathLength = 0;\n } else if (A[bid * N + tid] == 1) {\n *pathLength = 1;\n } else {\n *pathLength = -1;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int distance = INT_MAX;\n if (tid == source) {\n distance = 0;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] && distance + 1 < pathLength[i]) {\n pathLength[i] = distance + 1;\n }\n }\n __syncthreads();\n if (tid == dest) {\n *pathLength = distance;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int distance = INT_MAX;\n if (tid == source) {\n distance = 0;\n }\n __syncthreads();\n for (int i = 0; i < N; i++) {\n if (A[tid * N + i] == 1 && tid != i) {\n if (distance + 1 < distance) {\n distance = distance + 1;\n }\n }\n __syncthreads();\n }\n if (tid == dest) {\n *pathLength = distance;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "= Synagoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesoguesogues Synag Synag Synag Synagoguesoguesoguesoguesogues", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tint bid = hipBlockIdx_y * hipBlockDim_y + hipBlockIdx_x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + 
"language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The shortest path length is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform relaxation\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector gathered_dist(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, gathered_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, gathered_dist[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return shortest path length on rank 0\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n 
q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all neighbors of u\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length from source to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distance to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // If we have not reached the destination, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // If u is the destination, return the distance\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // If no path exists, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int dist[N];\n for (int i = 0; i < N; i++) {\n dist[i] = INT_MAX;\n }\n dist[source] = 0;\n\n int next[N];\n for (int i = 0; i < N; i++) {\n next[i] = -1;\n }\n\n int visited[N];\n for (int i = 0; i < N; i++) {\n visited[i] = false;\n }\n\n int u;\n while (true) {\n u = -1;\n for (int i = 0; i < N; i++) {\n if (!visited[i] && (u == -1 || dist[i] < dist[u])) {\n u = i;\n }\n }\n if (u == -1 || dist[u] == INT_MAX) {\n break;\n }\n visited[u] = true;\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] != 0 && dist[u] + 1 < dist[v]) {\n dist[v] = dist[u] + 1;\n next[v] = u;\n }\n }\n }\n\n if (rank == 0) {\n return dist[dest];\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] 
+ 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n if (rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == std::numeric_limits::max()) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * num_ranks);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (my_rank == 0) {\n for (int i = 0; i < N * num_ranks; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector dist(N, -1);\n dist[source] = 0;\n\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int curr = q.front();\n q.pop();\n\n if (curr == dest) {\n return dist[curr];\n }\n\n for (int i = 0; i < N; i++) {\n if (A[curr][i] == 1 && dist[i] == -1) {\n dist[i] = dist[curr] + 1;\n q.push(i);\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (u == dest) {\n return dist[u];\n }\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // No path found\n 
return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return shortest path length if destination is reachable\n if (dist[dest] != INT_MAX) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Iterate over all vertices\n for (int i = 0; i < N; ++i) {\n // Each rank computes the shortest path from source to all other vertices\n for (int j = 0; j < N; ++j) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n\n // Gather the distances from all ranks\n std::vector all_dist(N * num_ranks);\n MPI_Allgather(dist.data(), N, MPI_INT, all_dist.data(), N, MPI_INT, MPI_COMM_WORLD);\n\n // Update the distances on each rank\n for (int j = 0; j < N; ++j) {\n for (int k = 0; k < num_ranks; ++k) {\n if (all_dist[k * N + j] < dist[j]) {\n dist[j] = all_dist[k * N + j];\n }\n }\n }\n }\n\n // Return the shortest path length on rank 0\n if (my_rank == 0) {\n return dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS\n std::queue q;\n q.push(source);\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n for (int v = 0; v < N; v++) {\n if (A[u * N + v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == std::numeric_limits::max()) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "cuda", + "prompt": "/* Compute the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Store the result in pathLength.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\n__global__ void shortestPathLength(const int *A, size_t N, int source, int dest, int *pathLength) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\n\tint count = 0;\n\twhile (pathLength[dest] == -1) {\n\t\t__syncthreads();\n\t\tif (pathLength[tid] == count) {\n\t\t\tfor (int i = 0; i < N; i++) {\n\t\t\t\tif (A[tid * N + i] && pathLength[i] == -1) {\n\t\t\t\t\tpathLength[i] = count + 1;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tcount++;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tif (pathLength[j] == -1) {\n\t\tfor (int k = 0; k < N; k++) {\n\t\t\tif (A[i * N + k] && pathLength[k] != -1) {\n\t\t\t\tpathLength[j] = pathLength[k] + 1;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] == 1 && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "_t = 0, 1, 2, 3", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\n\tif (i == source && j == dest) {\n\t\t*pathLength = 0;\n\t} else if (A[i * N + j] == 1) {\n\t\t*pathLength = 1;\n\t} else {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = (A[i * N + j] == 1) ? 
1 : INT_MAX;\n\t}\n\telse {\n\t\tpathLength[j] = INT_MAX;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] == 1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source && j == dest) {\n\t\t*pathLength = 1;\n\t}\n\telse if (A[i * N + j] == 1) {\n\t\t*pathLength = 2;\n\t}\n\telse {\n\t\t*pathLength = -1;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1 && A[tid * N + i] != 0) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t\t__syncthreads();\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid_row == source) {\n\t\tpathLength[tid_col] = 1;\n\t} else {\n\t\tpathLength[tid_col] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid_row] > 0 && A[tid_row * N + i] > 0) {\n\t\t\tif (pathLength[i] == -1 || pathLength[i] > pathLength[tid_row] + 1) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_row * N + i] == 1 && tid_row != tid_col) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid_row] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[i * N + k] && pathLength[i] != -1) {\n\t\t\tif (pathLength[k] == -1 || pathLength[k] > pathLength[i] + 1) {\n\t\t\t\tpathLength[k] = pathLength[i] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (pathLength[tid] != -1) {\n\t\t\tif (A[tid * N + i] == 1 && pathLength[i] == -1) {\n\t\t\t\tpathLength[i] = pathLength[tid] + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n}\n", + "\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = (A[i * N + j] == 1) ? 
1 : INT_MAX;\n\t} else {\n\t\tpathLength[j] = INT_MAX;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] != INT_MAX && A[k * N + j] == 1) {\n\t\t\tpathLength[j] = min(pathLength[j], pathLength[k] + 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\n\tif (tid == source) {\n\t\tpathLength[tid] = 0;\n\t} else {\n\t\tpathLength[tid] = -1;\n\t}\n\n\t__syncthreads();\n\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid_row * N + i] == 1 && pathLength[tid_row] != -1) {\n\t\t\tif (pathLength[i] == -1 || pathLength[tid_row] + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = pathLength[tid_row] + 1;\n\t\t\t}\n\t\t}\n\t}\n\n\t__syncthreads();\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint distance = 0;\n\tif (tid == source) {\n\t\tdistance = 0;\n\t}\n\t__syncthreads();\n\tfor (int i = 0; i < N; i++) {\n\t\tif (A[tid * N + i] == 1) {\n\t\t\tif (distance + 1 < pathLength[i]) {\n\t\t\t\tpathLength[i] = distance + 1;\n\t\t\t}\n\t\t}\n\t}\n\t__syncthreads();\n\tif (tid == dest) {\n\t\t*pathLength = distance;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = 1;\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] == -1) {\n\t\t\tcontinue;\n\t\t}\n\t\tif (A[k * N + j] == 1) {\n\t\t\tif (pathLength[j] == -1 || pathLength[j] > pathLength[k] + 1) {\n\t\t\t\tpathLength[j] = pathLength[k] + 1;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tint tid_row = tid / N;\n\tint tid_col = tid % N;\n\tif (tid_row == source) {\n\t\tpathLength[tid_col] = 1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (A[tid_row * N + k] && pathLength[tid_row] > 0) {\n\t\t\tpathLength[tid_col] = min(pathLength[tid_col], pathLength[tid_row] + 1);\n\t\t}\n\t}\n}\n", + "\n\n\tint tidx = threadIdx.x + blockIdx.x * blockDim.x;\n\tint tidy = threadIdx.y + blockIdx.y * blockDim.y;\n\tif (tidx == source && tidx == tidy) {\n\t\tpathLength[tidx] = 0;\n\t} else if (tidx == tidy) {\n\t\tpathLength[tidx] = -1;\n\t} else if (A[tidx * N + tidy] == 1) {\n\t\tpathLength[tidx] = 1;\n\t} else {\n\t\tpathLength[tidx] = -1;\n\t}\n\t__syncthreads();\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[tidx] == -1 || pathLength[tidy] == -1) {\n\t\t\tcontinue;\n\t\t}\n\t\tif (pathLength[tidx] + pathLength[tidy] + 1 < pathLength[k]) {\n\t\t\tpathLength[k] = pathLength[tidx] + pathLength[tidy] + 1;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tint i = tid / N;\n\tint j = tid % N;\n\tif (i == source) {\n\t\tpathLength[j] = A[i * N + j];\n\t} else {\n\t\tpathLength[j] = -1;\n\t}\n\t__syncthreads();\n\tint minPath = pathLength[j];\n\tfor (int k = 0; k < N; k++) {\n\t\tif (pathLength[k] >= 0 && A[j * N + k] > 0) {\n\t\t\tint newPath = pathLength[k] + A[j * N + k];\n\t\t\tif (newPath < minPath) {\n\t\t\t\tminPath = newPath;\n\t\t\t}\n\t\t}\n\t}\n\tpathLength[j] = minPath;\n\t__syncthreads();\n\tif (j == dest) {\n\t\t*pathLength = pathLength[j];\n\t}\n}\n" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in 
row-major. A is an undirected graph.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. The shortest path length is returned on rank 0.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n // Initialize the distance array to all infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize the queue with the source node\n std::queue q;\n q.push(source);\n\n // Perform a breadth-first search to find the shortest path\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all neighbors of u\n for (int v = 0; v < N; v++) {\n if (A[u][v] == 1 && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to the destination\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Use OpenMP for parallel BFS\n #pragma omp parallel\n {\n while (true) {\n bool updated = false;\n\n // Relax all edges in parallel\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n updated = true;\n }\n }\n }\n\n // Check if any rank has updated distances\n int global_updated;\n MPI_Allreduce(&updated, &global_updated, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n // If no rank has updated distances, break\n if (!global_updated) {\n break;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n 
#pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length on rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector recv_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &recv_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return recv_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Return the shortest path length\n return shortest_path_length;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length from rank 0\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n for (int i = tid; i < N; i += num_threads) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Use OpenMP for parallelization within each rank\n #pragma omp parallel\n {\n // Perform BFS using a queue\n std::queue q;\n q.push(source);\n\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // For each neighbor of u\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n }\n\n // Gather the distances from all ranks to rank 0\n std::vector all_dist(N * MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Allgather(&dist[0], N, MPI::INT, &all_dist[0], N, MPI::INT);\n\n // Find the minimum distance to dest\n int min_dist = INT_MAX;\n for (int i = 0; i < N * MPI::COMM_WORLD.Get_size(); ++i) {\n if (all_dist[i] != INT_MAX && i / N == dest) {\n min_dist = std::min(min_dist, all_dist[i]);\n }\n }\n\n // Return the minimum distance\n return min_dist;\n}\n", + "\n\n // Initialize the distance to the source to 0 and all other distances to infinity\n std::vector distance(N, std::numeric_limits::max());\n distance[source] = 0;\n\n // Initialize the queue with the source\n std::queue queue;\n queue.push(source);\n\n // While the queue is not empty\n while (!queue.empty()) {\n // Dequeue the next node\n int current = queue.front();\n queue.pop();\n\n // If the current node is the destination, return the distance\n if (current == dest) {\n return distance[current];\n }\n\n // For each neighbor of the current node\n for (int i = 0; i < N; i++) {\n // If there is an edge between the current node and the neighbor\n if (A[current * N + i] == 1) {\n // If the distance to the neighbor is greater than the distance to the current node plus 1\n if (distance[i] > distance[current] + 1) {\n // Set the distance to the neighbor to the distance to the current node plus 1\n distance[i] = distance[current] + 1;\n // Enqueue the neighbor\n queue.push(i);\n }\n }\n }\n }\n\n // If the destination is not reachable, return -1\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, std::numeric_limits::max());\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] == 1 && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances to rank 0\n 
std::vector global_dist(N * size);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length\n int shortest_path_length = std::numeric_limits::max();\n if (rank == 0) {\n for (int i = 0; i < N * size; i++) {\n if (i % N == dest) {\n shortest_path_length = std::min(shortest_path_length, global_dist[i]);\n }\n }\n }\n\n // Broadcast the shortest path length\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n for (int i = tid; i < N; i += num_threads) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n }\n\n // Gather distances from all ranks to rank 0\n std::vector global_dist(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, global_dist.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length on rank 0\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, global_dist[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Visit all adjacent vertices\n #pragma omp parallel for\n for (int v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length\n return dist[dest];\n}\n", + "\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int u = q.front();\n q.pop();\n\n // Iterate over all adjacent vertices\n for (size_t v = 0; v < N; ++v) {\n if (A[u][v] && dist[v] == INT_MAX) {\n dist[v] = dist[u] + 1;\n q.push(v);\n }\n }\n }\n\n // Return the shortest path length to dest\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Initialize queue with source\n std::queue q;\n q.push(source);\n\n // Perform BFS\n while (!q.empty()) {\n int curr = q.front();\n q.pop();\n\n // Check if we have reached the destination\n if (curr == dest) {\n return dist[curr];\n }\n\n // Relax neighbors\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n if (A[curr * N + i] && dist[i] > dist[curr] + 1) {\n dist[i] = dist[curr] + 1;\n q.push(i);\n }\n }\n }\n\n // No path found\n return -1;\n}\n", + "\n\n // Initialize the distance array.\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation.\n for (int i = 0; i < N - 1; i++) {\n#pragma omp parallel for\n for (int j = 0; j < N; j++) {\n for (int k = 0; k < N; k++) {\n if (A[j * N + k] != 0 && dist[j] + A[j * N + 
k] < dist[k]) {\n dist[k] = dist[j] + A[j * N + k];\n }\n }\n }\n }\n\n // Check for negative-weight cycles.\n for (int i = 0; i < N - 1; i++) {\n#pragma omp parallel for\n for (int j = 0; j < N; j++) {\n for (int k = 0; k < N; k++) {\n if (A[j * N + k] != 0 && dist[j] + A[j * N + k] < dist[k]) {\n return -1;\n }\n }\n }\n }\n\n // Return the shortest path length.\n return dist[dest];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform BFS using MPI and OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n if (A[source][i] == 1) {\n dist[i] = 1;\n }\n }\n\n #pragma omp barrier\n\n for (int j = 1; j < N; j++) {\n #pragma omp for\n for (int i = 0; i < N; i++) {\n if (dist[i] == j) {\n #pragma omp for\n for (int k = 0; k < N; k++) {\n if (A[i][k] == 1 && dist[k] > j + 1) {\n dist[k] = j + 1;\n }\n }\n }\n }\n\n #pragma omp barrier\n }\n }\n\n // Gather distances on rank 0\n std::vector global_dist(N);\n MPI_Gather(&dist[0], N, MPI_INT, &global_dist[0], N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Return the shortest path length\n if (rank == 0) {\n return global_dist[dest];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize distances to infinity\n std::vector dist(N, INT_MAX);\n dist[source] = 0;\n\n // Perform relaxation in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n if (A[i * N + j] && dist[i] + 1 < dist[j]) {\n dist[j] = dist[i] + 1;\n }\n }\n }\n\n // Gather distances to rank 0\n std::vector dist_gathered(N * nprocs);\n MPI_Gather(dist.data(), N, MPI_INT, dist_gathered.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the shortest path length on rank 0\n int shortest_path_length = INT_MAX;\n if (rank == 0) {\n for (int i = 0; i < N * nprocs; i += N) {\n shortest_path_length = std::min(shortest_path_length, dist_gathered[i + dest]);\n }\n }\n\n // Broadcast the shortest path length to all ranks\n MPI_Bcast(&shortest_path_length, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n return shortest_path_length;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n int min_odd = INT_MAX;\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n\n return min_odd;\n}\n", + "&", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n },\n min_odd);\n return min_odd;\n}\n", + "\n\n int result = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& result) {\n if (x(i) % 2 == 1 && x(i) < result) {\n result = x(i);\n }\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n Kokkos::View y(\"y\", x.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n y(i) = (x(i) % 2 == 1) ? 
x(i) : std::numeric_limits::max();\n });\n int min = std::numeric_limits::max();\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, int& min) {\n min = std::min(min, y(i));\n });\n return min;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int n = x.extent(0);\n View> y(\"y\", n);\n\n parallel_for(\"smallestOdd\", n, KOKKOS_LAMBDA(const int i) {\n if (i % 2 == 1) {\n y(i) = x(i);\n } else {\n y(i) = std::numeric_limits::max();\n }\n });\n\n int min_odd = std::numeric_limits::max();\n for (int i = 0; i < n; ++i) {\n min_odd = std::min(min_odd, y(i));\n }\n\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n\n int min_odd = INT_MAX;\n\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n\n return min_odd;\n}\n", + "\n\n int min_odd = Kokkos::parallel_reduce(Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1) {\n min_odd = std::min(min_odd, x(i));\n }\n }, min_odd);\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto min_odd = INT_MAX;\n Kokkos::parallel_reduce(policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n int smallest = Kokkos::Max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& smallest) {\n if (x(i) % 2 == 1 && x(i) < smallest) {\n smallest = x(i);\n }\n });\n return smallest;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Policy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) { min_odd = x(i); }\n }, min_odd);\n return min_odd;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Policy = Kokkos::RangePolicy;\n\n int smallest = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(Policy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& smallest) {\n if (x(i) % 2 == 1 && x(i) < smallest) {\n smallest = x(i);\n }\n });\n\n return smallest;\n}\n", + "\n\n int min_odd = std::numeric_limits::max();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n });\n return min_odd;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n int min_odd = std::numeric_limits::max();\n\n Kokkos::parallel_reduce(\n policy_t(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& min_odd) {\n if (x(i) % 2 == 1 && x(i) < min_odd) {\n min_odd = x(i);\n }\n },\n min_odd);\n\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + 
"do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min:min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n min_odd = std::min(min_odd, x[i]);\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n#pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int minOdd = INT_MAX;\n #pragma omp parallel for reduction(min:minOdd)\n for (int i = 0; i < static_cast(x.size()); ++i) {\n if (x[i] % 2 != 0 && x[i] < minOdd) {\n minOdd = x[i];\n }\n }\n return minOdd;\n}\n", + ": 1", + "\n\n int min_odd = INT_MAX;\n#pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n min_odd = std::min(min_odd, x[i]);\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n#pragma omp parallel for reduction(min:smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel for reduction(min:smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = 
INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min: min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int min_odd = INT_MAX;\n #pragma omp parallel for reduction(min:min_odd)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "serial", + "prompt": "/* Return the value of the smallest odd number in the vector x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0805771958, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0862201936 + } + ] + }, + { + "generated_output": "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0801105336, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0865101576 + } + ] + }, + { + "generated_output": "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.079809609, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0860471782 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 != 0) {\n return x[i];\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.08050712, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0863949022 + } + ] + }, + { + 
"generated_output": "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.080480179, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0864053804 + } + ] + }, + { + "generated_output": "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.080359999, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0861126774 + } + ] + }, + { + "generated_output": "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.080264363, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0859405794 + } + ] + }, + { + "generated_output": "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0804303514, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0862944458 + } + ] + }, + { + "generated_output": "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0808032874, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0864044446 + } + ] + }, + { + "generated_output": "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.079599708, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0854767906 + } + ] + }, + { + "generated_output": "\n\n int smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.079417307, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0852425096 + } + ] + }, + { + "generated_output": "\n\n int smallest = 
INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0804740976, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.086090099 + } + ] + }, + { + "generated_output": "\n\n int smallest_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest_odd) {\n smallest_odd = x[i];\n }\n }\n return smallest_odd;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.079414658, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0851227062 + } + ] + }, + { + "generated_output": "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.079462935, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0851185204 + } + ] + }, + { + "generated_output": "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.079443372, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0851286654 + } + ] + }, + { + "generated_output": "smallest&", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int smallest = -1;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && (smallest == -1 || x[i] < smallest)) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0803027974, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.083250158 + } + ] + }, + { + "generated_output": "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.080503535, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0863709396 + } + ] + }, + { + "generated_output": "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + 
"did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0807565434, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0861777056 + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "hip", + "prompt": "/* Find the value of the smallest odd number in the vector x. Store it in smallest.\n Use AMD HIP to compute in parallel. The kernel is launched with the same number of threads as elements in x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\n__global__ void smallestOdd(const int *x, size_t N, int *smallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = 
hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n\n __syncthreads();\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n __syncthreads();\n\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "(", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n __syncthreads();\n if (smallest_local % 2 == 0) {\n return;\n }\n if (smallest_local < *smallest) {\n atomicMin(smallest, smallest_local);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n\n // Find the smallest odd number in the local workgroup\n __shared__ int smallest_shared;\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n\n // Reduce the local smallest values to a single global smallest value\n __syncthreads();\n if (tid == 0) {\n smallest_shared = smallest_local;\n for (int i = 1; i < hipBlockDim_x; i++) {\n if (smallest_shared > smallest_local) {\n smallest_shared = smallest_local;\n }\n }\n *smallest = smallest_shared;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use MPI to compute in parallel. 
Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = std::numeric_limits::max();\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < my_smallest) {\n my_smallest = x[i];\n }\n }\n\n int smallest;\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n 
MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n for (auto v : x) {\n if (v % 2 == 1) {\n my_smallest = std::min(my_smallest, v);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int min_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n", + "\n\n int smallest = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int n = x.size();\n int smallest = INT_MAX;\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest) {\n smallest = x[i];\n }\n }\n return smallest;\n}\n", + "\n\n int min_odd = INT_MAX;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < min_odd) {\n min_odd = x[i];\n }\n }\n return min_odd;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "cuda", + "prompt": "/* Find the value of the smallest odd number in the vector x. Store it in smallest.\n Use CUDA to compute in parallel. 
The kernel is launched with the same number of threads as elements in x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\n__global__ void smallestOdd(const int *x, size_t N, int *smallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (threadIdx.x == 0) {\n *smallest = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "(", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (threadIdx.x == 0) {\n *smallest = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n atomicMin(smallest, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = start + block_size;\n int min = INT_MAX;\n for (int i = start; i < end && i < N; i++) {\n if (x[i] % 2 == 1 && x[i] < min) {\n min = x[i];\n }\n }\n smallest[tid] = min;\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < block_size; i++) {\n if (smallest[i] < smallest[0]) {\n smallest[0] = smallest[i];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] < smallest_local && x[i] % 2) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] < smallest_local && x[i] % 2 == 1) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 1) {\n if (tid == 0 || x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int smallest_local = x[tid];\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n if (x[i] % 2 == 1 && x[i] < smallest_local) {\n smallest_local = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n *smallest = smallest_local;\n }\n}\n", + "\n\n int tid = block.thread.x;\n if (tid >= N) {\n return;\n }\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 
2 == 1) {\n atomicMin(smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1 && (x[tid] < *smallest || *smallest == -1)) {\n *smallest = x[tid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n *smallest = min(*smallest, x[tid]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 1) {\n if (tid == 0) {\n *smallest = x[tid];\n } else {\n if (x[tid] < *smallest) {\n *smallest = x[tid];\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the value of the smallest odd number in the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n #pragma omp parallel for reduction(min: my_smallest)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n\n int smallest;\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return smallest;\n}\n", + "\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on this rank\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int result = INT_MAX;\n #pragma omp parallel for reduction(min:result)\n 
for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < result) {\n result = x[i];\n }\n }\n return result;\n}\n", + "\n\n int smallest = INT_MAX;\n #pragma omp parallel\n {\n int local_smallest = INT_MAX;\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_smallest) {\n local_smallest = x[i];\n }\n }\n #pragma omp critical\n if (local_smallest < smallest) {\n smallest = local_smallest;\n }\n }\n return smallest;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on this rank\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Reduce to find the global minimum\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n // Find the smallest odd number on each rank\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int smallest_odd = INT_MAX;\n #pragma omp parallel for reduction(min: smallest_odd)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < smallest_odd) {\n smallest_odd = x[i];\n }\n }\n return smallest_odd;\n}\n", + "\n\n int n = x.size();\n int local_min = INT_MAX;\n int global_min = INT_MAX;\n\n // Find the smallest odd number on each rank\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n #pragma omp parallel for reduction(min:my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min: local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int local_min = std::numeric_limits::max();\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 != 0) {\n local_min = std::min(local_min, x[i]);\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n int global_min = INT_MAX;\n\n // 
Find the smallest odd number on this rank\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n\n // Find the smallest odd number across all ranks\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_min = INT_MAX;\n #pragma omp parallel for reduction(min : local_min)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1 && x[i] < local_min) {\n local_min = x[i];\n }\n }\n\n int global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n return global_min;\n}\n", + "\n\n int n = x.size();\n int my_min = INT_MAX;\n #pragma omp parallel for reduction(min: my_min)\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 1 && x[i] < my_min) {\n my_min = x[i];\n }\n }\n int global_min;\n MPI_Allreduce(&my_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_min;\n}\n", + "\n\n int my_smallest = INT_MAX;\n #pragma omp parallel for reduction(min: my_smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n int global_smallest;\n MPI_Allreduce(&my_smallest, &global_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_smallest;\n}\n", + "\n\n int smallest = INT_MAX;\n int my_smallest = INT_MAX;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n #pragma omp parallel for reduction(min:my_smallest)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n my_smallest = std::min(my_smallest, x[i]);\n }\n }\n MPI_Allreduce(&my_smallest, &smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return smallest;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use Kokkos to compute product in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n double product = 1.0;\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (i % 2 == 0) {\n product *= x(i);\n } else {\n product /= x(i);\n }\n });\n return product;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n\n double product = 1.0;\n\n parallel_reduce(\n RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n\n return product;\n}\n", + "\n\n // Create a Kokkos view for the output\n Kokkos::View product = Kokkos::View(1);\n\n // Initialize the product to 1\n Kokkos::parallel_for(\"initialize_product\", Kokkos::RangePolicy(0, 1), KOKKOS_LAMBDA(const int& i) {\n product[i] = 1.0;\n });\n\n // Compute the product in parallel\n Kokkos::parallel_for(\"compute_product\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n if (i % 2 == 1) {\n product[0] *= 1.0 / x[i];\n } else {\n product[0] *= x[i];\n }\n });\n\n // Return the product\n return product[0];\n}\n", + "\n\n using namespace Kokkos;\n\n double product = 1.0;\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n double product = 1.0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& result) {\n if (i % 2 == 1) {\n result *= 1.0 / x[i];\n } else {\n result *= x[i];\n }\n });\n\n return result;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, 
double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x(i);\n } else {\n update *= x(i);\n }\n },\n product);\n return product;\n}\n", + "\n\n double result = 1.0;\n for (int i = 0; i < x.extent(0); i += 2) {\n result *= x(i);\n }\n for (int i = 1; i < x.extent(0); i += 2) {\n result /= x(i);\n }\n return result;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x(i);\n } else {\n product *= x(i);\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n return product;\n}\n", + "_", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i % 2 == 1) {\n update *= 1.0 / x[i];\n } else {\n update *= x[i];\n }\n },\n product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n },\n product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }, product);\n return product;\n}\n", + "\n\n double product = 1.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& product) {\n if (i % 2 == 1) {\n product *= 1.0 / x(i);\n } else {\n product *= x(i);\n }\n }, product);\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. 
x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use OpenMP to compute product in parallel.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n int n = x.size();\n #pragma omp parallel for reduction(*: product)\n for (int i = 0; i < n; i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*: product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n int n = x.size();\n #pragma omp parallel for reduction(*:product)\n for (int i = 0; i < n; i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1.0 / x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for 
reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); i += 2) {\n product *= x[i] / x[i + 1];\n }\n return product;\n}\n", + "\n\n double product = 1.0;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1.0 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "\n\n double product = 1;\n #pragma omp parallel for reduction(*:product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "serial", + "prompt": "/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0307848484, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0205700462 + } + ] + }, + { + "generated_output": "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.030777707, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081123474 + } + ] + }, + { + "generated_output": "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0307394366, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0115828266 + } + ] + }, + { + "generated_output": "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": 
true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.031005087, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0080851478 + } + ] + }, + { + "generated_output": "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0317074024, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.020606158 + } + ] + }, + { + "generated_output": "\n\n double product = 1.0;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.032326622, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0206122916 + } + ] + }, + { + "generated_output": "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0308713248, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081318176 + } + ] + }, + { + "generated_output": "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0305766184, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0119823848 + } + ] + }, + { + "generated_output": "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0292166286, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0080939526 + } + ] + }, + { + "generated_output": "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0317548536, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0206813936 + } + ] + }, + { + "generated_output": "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.030834277, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0207015876 + } + ] + }, + { + "generated_output": "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i += 2) {\n product = product * x[i] / (x[i + 1] + 0.000000001);\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0308026456, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0206293936 + } + ] + }, + { + "generated_output": "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0307169804, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0081697752 + } + ] + }, + { + "generated_output": "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n for (size_t i = 1; i < x.size(); i += 2) {\n product /= x[i];\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product *= 1 / x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0306069134, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0080995792 + } + ] + }, + { + "generated_output": "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0307055226, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.020617245 + } + ] + }, + { + "generated_output": "&", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": 
null + }, + { + "generated_output": "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n product *= 1 / x[i];\n } else {\n product *= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0307573822, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0080832964 + } + ] + }, + { + "generated_output": "\n\n double product = 1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 0) {\n product *= x[i];\n } else {\n product /= x[i];\n }\n }\n return product;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0303495948, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.020666668 + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "hip", + "prompt": "/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Store the result in product.\n Use AMD HIP to compute product in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\n__global__ void productWithInverses(const double *x, size_t N, double *product) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 1) {\n product[0] *= 1.0 / x[tid];\n } else {\n product[0] *= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (idx % 2 == 0) {\n product[0] *= x[idx];\n } else {\n product[0] /= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n atomicAdd(product, x[gid]);\n } else {\n atomicAdd(product, 1.0 / x[gid]);\n }\n }\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n product[tid] = x[tid];\n } else {\n product[tid] = 1 / x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 0) {\n product[0] *= x[tid];\n } else {\n product[0] /= x[tid];\n }\n }\n}\n", + 
"\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double prod = 1.0;\n if (gid % 2 == 0) {\n prod *= x[gid];\n } else {\n prod /= x[gid];\n }\n\n hipAtomicAdd(product, prod);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n atomicAdd(product, x[idx]);\n } else {\n atomicAdd(product, 1.0 / x[idx]);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 1) {\n product[0] *= 1.0 / x[tid];\n } else {\n product[0] *= x[tid];\n }\n }\n}\n", + ".", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid % 2 == 0) {\n atomicAdd(product, x[tid]);\n } else {\n atomicAdd(product, 1.0 / x[tid]);\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use MPI to compute product in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the product on all ranks.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a new vector to store the product\n std::vector product(n);\n\n // Initialize the product to 1\n product[0] = 1;\n\n // Compute the product in parallel\n for (int i = 1; i < n; i++) {\n if (i % 2 == 1) {\n product[i] = product[i - 1] * (1 / x[i]);\n } else {\n product[i] = product[i - 1] * x[i];\n }\n }\n\n // Return the product\n return product[n - 1];\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD,\n MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n 
MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Reduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 0, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "cuda", + "prompt": "/* Compute the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Store the result in product.\n Use CUDA to compute product in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\n__global__ void productWithInverses(const double *x, size_t N, double *product) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n *product *= x[gid];\n } else {\n *product /= x[gid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n product[0] *= x[gid];\n } else {\n product[0] /= x[gid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx 
< N) {\n if (idx % 2 == 0) {\n *product *= x[idx];\n } else {\n *product /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n product[0] *= x[gid];\n } else {\n product[0] /= x[gid];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n product[idx] = 1.0;\n for (int i = 0; i < N; i += 2) {\n product[idx] *= x[i] / x[i + 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 0) {\n product[0] *= x[idx];\n } else {\n product[0] /= x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 1) {\n product[0] *= 1.0 / x[gid];\n } else {\n product[0] *= x[gid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n if (i % 2 == 1) {\n product[0] *= 1.0 / x[i];\n } else {\n product[0] *= x[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (tid % 2 == 0) {\n *product *= x[tid];\n } else {\n *product /= x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n if (gid % 2 == 0) {\n *product *= x[gid];\n } else {\n *product /= x[gid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (idx % 2 == 1) {\n product[0] *= 1.0 / x[idx];\n } else {\n product[0] *= x[idx];\n }\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the product of the vector x with every odd indexed element inverted.\n i.e. x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Use MPI and OpenMP to compute product in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the product on all ranks.\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*: local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n#pragma omp parallel for reduction(* : local_product)\n for (int i = start_index; i < end_index; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Reduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, 0, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*: local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i % 2 == 1) {\n local_product *= 1.0 / 
x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size();\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < local_size; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n #pragma omp parallel for reduction(* : local_product)\n for (int i = start_index; i < end_index; i++) {\n if ((i + 1) % 2 == 0) {\n local_product *= x[i];\n } else {\n local_product /= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product /= x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < x.size(); i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product 
*= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n double product = 1.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the product of the even indexed elements\n #pragma omp parallel for reduction(*: product)\n for (int i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n\n // Compute the product of the odd indexed elements\n double oddProduct = 1.0;\n #pragma omp parallel for reduction(*: oddProduct)\n for (int i = 1; i < x.size(); i += 2) {\n oddProduct *= 1.0 / x[i];\n }\n\n // Reduce the odd product across all ranks\n double globalOddProduct;\n MPI_Allreduce(&oddProduct, &globalOddProduct, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n // Multiply the even and odd products to get the final result\n product *= globalOddProduct;\n\n return product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for reduction(*:local_product)\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n #pragma omp parallel for reduction(*:local_product)\n for (int i = rank; i < x.size(); i += size) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size();\n #pragma omp parallel for reduction(*:local_product)\n for (int i = 0; i < local_size; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n return global_product;\n}\n", + "\n\n double product = 1.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the product of the even indexed elements\n #pragma omp parallel for reduction(*:product)\n for (int i = 0; i < x.size(); i += 2) {\n product *= x[i];\n }\n\n // Compute the product of the odd indexed elements\n double oddProduct = 1.0;\n #pragma omp parallel for reduction(*:oddProduct)\n for (int i = 1; i < x.size(); i += 2) {\n oddProduct *= 1.0 / x[i];\n }\n\n // Reduce the odd product across all ranks\n double globalOddProduct;\n MPI_Allreduce(&oddProduct, &globalOddProduct, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n\n // Multiply the even and odd products\n product *= globalOddProduct;\n\n return product;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_product = 1.0;\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp 
parallel for reduction(*:local_product)\n for (int i = start; i < end; i++) {\n if (i % 2 == 1) {\n local_product *= 1.0 / x[i];\n } else {\n local_product *= x[i];\n }\n }\n\n double global_product;\n MPI_Allreduce(&local_product, &global_product, 1, MPI_DOUBLE, MPI_PROD, MPI_COMM_WORLD);\n return global_product;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use Kokkos to sum in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(Kokkos::View const& x, Kokkos::View const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Create a Kokkos view to store the minimum values at each index\n Kokkos::View min_values(\"min_values\", x.extent(0));\n\n // Compute the minimum values at each index in parallel\n Kokkos::parallel_for(\"min_values_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n min_values[i] = std::min(x[i], y[i]);\n });\n\n // Reduce the minimum values in parallel to get the sum\n double sum = Kokkos::parallel_reduce(\"sum_min_values_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += min_values[i];\n }, 0.0);\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Min;\n\n auto min_values = Kokkos::parallel_reduce(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, Reducer& red) {\n red.init(std::min(x(i), y(i)));\n });\n\n return min_values.get_value();\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMemberFunction = Kokkos::TeamMemberFunction;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::RangePolicy>(0, x.extent(0))),\n TeamMemberFunction([&](const int& i) -> double { return std::min(x(i), y(i)); }),\n 0.0);\n return sum;\n}\n", + "\n\n using execution_space = Kokkos::DefaultExecutionSpace;\n using l_double = Kokkos::LayoutRight;\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n\n Kokkos::View min_values(\"min_values\", x.extent(0));\n\n Kokkos::parallel_scan(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update = std::min(x(i), y(i));\n }, min_values);\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += min_values(i);\n }, sum);\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_left;\n using Reducer = Kokkos::Min;\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, Reducer& sum) {\n sum += std::min(x(i), y(i));\n });\n\n return sum.template get<0>();\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_left;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = typename TeamPolicy::member_type;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::DefaultExecutionSpace(), x.extent(0)), KOKKOS_LAMBDA(const TeamMember& team, 
double& sum) {\n const auto i = team.league_rank();\n sum += std::min(x(i), y(i));\n });\n\n return sum;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = Kokkos::TeamMember;\n\n auto sum = Kokkos::parallel_reduce(TeamPolicy(Kokkos::RangePolicy>(0, x.extent(0))),\n KOKKOS_LAMBDA(TeamMember, sum, x, y) {\n sum += std::min(x(TeamMember::team_rank()), y(TeamMember::team_rank()));\n });\n return sum;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n const auto min_xy = parallel_reduce(\n RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& sum) {\n sum += std::min(x(i), y(i));\n },\n 0.0);\n\n return min_xy;\n}\n", + "\n\n Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::make_span(x.data(), x.size()), 0.0,\n [](double a, double b) { return std::min(a, b); },\n [](double a, double b) { return a + b; });\n}\n", + "\n\n using namespace Kokkos;\n double sum = 0;\n auto min_lambda = KOKKOS_LAMBDA(const double& a, const double& b) { return std::min(a, b); };\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += min_lambda(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n // Create a Kokkos view of the sum\n Kokkos::View sum(\"sum\", x.extent(0));\n\n // Parallel loop to compute the sum of the minimum values\n Kokkos::parallel_for(\"sum_of_minimum_elements\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n sum(i) = std::min(x(i), y(i));\n });\n\n // Reduce the sum in parallel\n double sum_value = Kokkos::parallel_reduce(\n \"sum_of_minimum_elements_reduce\", Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& sum) { sum += sum(i); }, 0.0);\n\n return sum_value;\n}\n", + "Kokkoss", + "\n\n const int numElements = x.extent(0);\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::Range(0, numElements), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n });\n return sum;\n}\n", + "\n\n auto min_xy = Kokkos::min(x, y);\n auto sum = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, min_xy.extent(0)),\n KOKKOS_LAMBDA(const int& i) -> double { return min_xy(i); },\n (double)0);\n return sum;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_lambda = KOKKOS_LAMBDA(const double& x, const double& y) { return std::min(x, y); };\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += min_lambda(x(i), y(i));\n }, sum);\n\n return sum;\n}\n", + "\n\n const int N = x.extent(0);\n double sum = 0;\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += std::min(x(i), y(i));\n }, sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n auto min_values = min(x, y);\n double sum = 0;\n Kokkos::parallel_reduce(policy_type(0, min_values.extent(0)), 
KOKKOS_LAMBDA(int i, double& sum) {\n sum += min_values(i);\n }, sum);\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use OpenMP to sum in parallel.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n double sum = 0;\n\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + ".", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma openmp parallel for reduction (+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return 
sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "serial", + "prompt": "/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0429590016, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.019579027 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0427040078, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0194656238 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0429739916, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0195831588 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0422974344, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0194937904 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0421738156, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.019254901 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0422091724, + 
"runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0192263392 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.042546435, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0194053944 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0423828272, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0193127622 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0425287118, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0194205424 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0424652062, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0192481604 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0422522216, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0193134936 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0422241124, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0192349776 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0422823944, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0191966738 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": 
true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.042390249, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0191467254 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0428650986, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.019546636 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.042740741, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0194482532 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0426380616, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0193818458 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0425141882, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0192926844 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0422698004, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0192197728 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0428623034, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0192444272 + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "hip", + "prompt": "/* Compute the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Store the result in sum.\n Use AMD HIP to sum in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\n__global__ void sumOfMinimumElements(const double *x, const double *y, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipGetGlobalIndex();\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n sum[tid] = min(x[gid], y[gid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n sum[gid] = min(x[gid], y[gid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n sum[tid] = min(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n sum[tid] = min(x[tid], y[tid]);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = min(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use MPI to sum in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Return the sum on all ranks.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; 
i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double sum = 0;\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n 
MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "cuda", + "prompt": "/* Compute the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Store the result in sum.\n Use CUDA to sum in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\n__global__ void sumOfMinimumElements(const double *x, const double *y, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ", Synagogues_", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[0] += fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n sum[idx] = fmin(x[idx], y[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[0] += fmin(x[tid], y[tid]);\n }\n}\n", + "{", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n sum[tid] = fmin(x[tid], y[tid]);\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n sum[tid] = fmin(x[i], y[i]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = fmin(x[tid], y[tid]);\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Use MPI and OpenMP to sum in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Return the sum on all ranks.\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+ : local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n double sum = 0.0;\n int n = x.size();\n #pragma opm parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n double sum = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int chunk = n / nprocs;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == nprocs - 1) {\n end = n;\n }\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+ : local_sum)\n for (int i = start; i < end; i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, 
MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += std::min(x[i], y[i]);\n }\n return sum;\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n int n = x.size();\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < n; i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n double sum = 0.0;\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int num_elements = x.size();\n int elements_per_rank = num_elements / num_ranks;\n int start_index = my_rank * elements_per_rank;\n int end_index = start_index + elements_per_rank;\n\n for (int i = start_index; i < end_index; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n double sum = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += std::min(x[i], y[i]);\n }\n\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double sum;\n MPI_Allreduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel 
for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += std::min(x[i], y[i]);\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::sum;\n\n const int N = x.extent(0);\n\n double sum_x = 0.0;\n parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) { sum += x(i); }, sum_x);\n\n return sum_x / N;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n double sum = 0;\n parallel_reduce(RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); }, sum);\n\n double count = x.extent(0);\n return sum / count;\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n using namespace Kokkos;\n\n double sum = 0;\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n auto sum = Kokkos::parallel_reduce(Policy(0, x.extent(0)), Reducer(0.0),\n KOKKOS_LAMBDA(const int& i, Reducer& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n using namespace Kokkos;\n double sum = 0;\n auto policy = Kokkos::RangePolicy<>(0, x.size());\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.size();\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n using l_double = Kokkos::LDDouble;\n\n auto sum = Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, l_double& sum) {\n sum += x(i);\n },\n l_double(0.0));\n\n return static_cast(sum) / x.extent(0);\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n policy_type policy(x.extent(0));\n\n double sum = 0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n const auto sum = 
Kokkos::parallel_reduce(Policy(0, x.size()), Reducer(0.0),\n KOKKOS_LAMBDA(const int& i, const Reducer& sum) {\n return sum + x(i);\n });\n\n return sum / x.size();\n}\n", + "\n\n typedef Kokkos::RangePolicy policy_type;\n policy_type policy(x.extent(0));\n\n double sum = 0.0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n return Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::Kok\u30b3\u30af_EXEC_SPACE, x, 0.0, Kok\u30b3\u30af::plus(), Kok\u30b3\u30af::divide(x.size()));\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::RangePolicy;\n\n auto sum = parallel_reduce(\n RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); }, 0.0);\n\n return sum / x.size();\n}\n", + "\n\n using execution_space = typename Kokkos::View::execution_space;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto sum = Kokkos::parallel_reduce(\n policy_type(x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n },\n (double)0.0, tag_type());\n\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n", + "\n\n double sum = 0.0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.size();\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n double sum = 0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(int i, double& sum) { sum += x(i); }, sum);\n return sum / x.size();\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using reduction_type = Kokkos::Sum;\n\n auto sum = Kokkos::parallel_reduce(policy_type(0, x.size()), reduction_type(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += x(i); });\n\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n return sum / x.extent(0);\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use OpenMP to compute in parallel.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma 
omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n return sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "serial", + "prompt": "/* Return the average of the vector x.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n double sum = 0;\n for (double n : x) {\n sum += n;\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0084902768, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0160623146 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0084966324, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0160414774 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0084436244, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0161265798 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0083538672, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0159346522 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0084277992, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0161083476 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.008351403, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.016027816 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0084638348, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0159981258 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0083753318, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.016064008 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double num : x) {\n sum += num;\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.008414185, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0160454328 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + 
"did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0083298106, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.016076924 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0084728478, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0160440644 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0084758476, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0161161082 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0084311774, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.016071354 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.008433221, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0159644186 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0083474816, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0160899226 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0084605524, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.016020883 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0083860522, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0160993342 + } + ] + }, + { + "generated_output": "\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": 
true, + "best_sequential_runtime": 0.0083827922, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.01610137 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0083573482, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0159633944 + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double v : x) {\n sum += v;\n }\n return sum / x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0084837442, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0160609804 + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "hip", + "prompt": "/* Compute the average of the vector x. Store the result in average.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Examples:\n \n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\n__global__ void average(const double *x, size_t N, double *average) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (size_t i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double shared_sum;\n shared_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = shared_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (int i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double block_sum;\n block_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = block_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n atomicAdd(average, x[idx]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n double sum = 0;\n size_t idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n sum += x[idx];\n }\n 
__syncthreads();\n if (threadIdx.x == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum = 0;\n for (int i = gid; i < N; i += hipBlockDim_x * hipGridDim_x) {\n sum += x[i];\n }\n\n __shared__ double sharedSum;\n sharedSum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = sharedSum / N;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n average[0] += x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = block.x;\n if (tid == 0) {\n *x = 0;\n for (int i = 0; i < N; i++) {\n *x += x[i];\n }\n *x = *x / N;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n double sum = 0;\n for (int i = tid; i < N; i += blockDim.x) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the average of the vector x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the average on all ranks.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg = sum / x.size();\n\n double global_avg;\n MPI_Allreduce(&avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n\n return global_avg;\n}\n", + "\n\n double sum = 0.0;\n for (double xi : x) {\n sum += xi;\n }\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0.0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / size;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double 
xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (auto xi : x) {\n sum += xi;\n }\n double avg = sum / x.size();\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (auto v : x) {\n sum += v;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg = sum / x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n double avg;\n MPI_Allreduce(&sum, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= x.size();\n return avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / x.size();\n double global_avg;\n MPI_Allreduce(&avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n return global_avg;\n}\n", + "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double avg = sum / size;\n return avg;\n}\n", + "\n\n double local_sum = 0;\n for (double xi : x) {\n local_sum += xi;\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "cuda", + "prompt": "/* Compute the average of the vector x. Store the result in average.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\n__global__ void average(const double *x, size_t N, double *average) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n average[0] += x[idx];\n }\n}\n", + "\n\n double sum = 0;\n for (size_t i = block_idx.x * block_dim.x + thread_idx.x; i < N; i += block_dim.x * grid_dim.x) {\n sum += x[i];\n }\n * average = sum / N;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n __shared__ double sum;\n sum = 0;\n if (i < N) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n\n double sum = 0;\n for (int i = tid; i < N; i += bid_size * grid_size) {\n sum += x[i];\n }\n\n __shared__ double shared_sum;\n shared_sum = sum;\n __syncthreads();\n\n if (tid == 0) {\n *average = shared_sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n double sum = 0;\n for (int i = tid + bid * bid_size; i < N; i += grid_size * bid_size) {\n sum += x[i];\n }\n double avg = sum / N;\n *average = avg;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(average, x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n __shared__ double sum;\n if (i < N) {\n sum += x[i];\n }\n __syncthreads();\n if (tid == 0) {\n *average = sum / N;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n double sum = 0;\n if (tid < N) {\n sum = x[tid];\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n double sum_all = 0;\n for (int i = 0; i < gridDim.x; i++) {\n sum_all += sum;\n }\n *average = sum_all / N;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + 
"parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the average of the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the average on all ranks.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / size;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_avg = 0.0;\n #pragma omp parallel for reduction(+:local_avg)\n for (int i = 0; i < x.size(); ++i) {\n local_avg += x[i];\n }\n local_avg /= x.size();\n\n double global_avg;\n MPI_Allreduce(&local_avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= size;\n\n return global_avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n }\n\n double avg = sum / x.size();\n MPI_Allreduce(&avg, &avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n avg /= size;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a sum variable to store the sum of the elements in the vector\n double sum = 0.0;\n\n // Use OpenMP to parallelize the loop over the elements in the vector\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n // Add the current element to 
the sum\n sum += x[i];\n }\n\n // Get the average by dividing the sum by the number of elements\n double avg = sum / n;\n\n // Return the average\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n double local_avg = 0;\n int local_size = x.size();\n double sum = 0;\n\n#pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < local_size; i++) {\n sum += x[i];\n }\n\n local_avg = sum / local_size;\n\n double global_avg = 0;\n MPI_Allreduce(&local_avg, &global_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n global_avg /= MPI_Comm_size(MPI_COMM_WORLD);\n\n return global_avg;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / x.size();\n}\n", + "\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n return global_sum / x.size();\n}\n", + "&", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n double global_sum = 0;\n\n #pragma omp parallel for reduction(+: local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n }\n\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n", + "\n\n double sum = 0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n return sum / n;\n}\n", + "\n\n double sum = 0.0;\n int n = x.size();\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += x[i];\n }\n double avg = sum / n;\n return avg;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n return global_sum / size;\n}\n" + ] + }, + 
{ + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use Kokkos to reduce in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n auto xor_reduce = reduce(x, (bool)false, [](bool a, bool b) { return a ^ b; });\n return xor_reduce;\n}\n", + "\n\n // Create a Kokkos::TeamPolicy for the reduction.\n Kokkos::TeamPolicy policy(x.extent(0));\n\n // Define the reduction functor.\n struct XORFunctor {\n bool result = false;\n\n KOKKOS_INLINE_FUNCTION\n void operator()(const bool& x) const { result ^= x; }\n };\n\n // Create a Kokkos::ParallelReduce object to perform the reduction.\n Kokkos::ParallelReduce reduce(result);\n\n // Invoke the reduction.\n Kokkos::parallel_for(policy, reduce);\n\n // Return the result.\n return reduce.result;\n}\n", + "\n\n bool result = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& result) { result ^= x(i); }, result);\n return result;\n}\n", + "\n\n using namespace Kokkos;\n\n bool result = false;\n auto policy = Kokkos::RangePolicy(0, x.extent(0));\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n }, result);\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy>;\n using TeamMemberType = typename Policy::member_type;\n\n bool result = false;\n\n Kokkos::parallel_reduce(Policy(1), KOKKOS_LAMBDA(const TeamMemberType& team) {\n bool local_result = false;\n const bool* x_data = x.data();\n for (int i = team.league_rank(); i < x.size(); i += team.league_size()) {\n local_result ^= x_data[i];\n }\n team.team_reduce(KOKKOS_LAMBDA(const bool& a, const bool& b) { return a ^ b; }, local_result);\n }, result);\n\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x[i];\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x[i];\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy;\n using TeamMemberType = typename Policy::member_type;\n\n auto reduce = KOKKOS_LAMBDA(TeamMemberType team) {\n bool local_result = false;\n for (int i = team.league_rank(); i < x.extent(0); i += team.league_size()) {\n local_result ^= x(i);\n }\n return local_result;\n };\n\n bool global_result = Kokkos::parallel_reduce(Policy(x.extent(0)), reduce);\n return global_result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n return 
Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(x), Kokkos::Scan);\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i) {\n result ^= x(i);\n });\n return result;\n}\n", + "\n\n bool result = false;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(false), x);\n}\n", + "\n\n using namespace Kokkos;\n bool result = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& result) {\n if (x(i)) {\n result = !result;\n }\n });\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy;\n using Reducer = Kokkos::LogicalXor;\n\n return parallel_reduce(Policy(x.extent(0)), Reducer(false), x);\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::TeamPolicy>;\n using Reducer = Kokkos::LogicalXor;\n\n return Kokkos::parallel_reduce(Policy(x.extent(0)), Reducer(false),\n KOKKOS_LAMBDA(const int& i, bool& result) {\n result = result ^ x(i);\n });\n}\n", + "\n\n using namespace Kokkos;\n\n auto reduce_functor = KOKKOS_LAMBDA(const bool& a, const bool& b) {\n return a ^ b;\n };\n\n auto reduce_result = reduce(x, reduce_functor, false);\n\n return reduce_result;\n}\n", + "\n\n bool result = x(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(1, x.extent(0)), KOKKOS_LAMBDA(const int& i, bool& result) {\n result ^= x(i);\n });\n return result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use OpenMP to reduce in parallel.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n {\n for (auto const& xi : x) {\n result ^= xi;\n }\n }\n return result;\n}\n", + ")", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(+:result)\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= 
x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel for reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = false;\n#pragma omp parallel reduction(^:result)\n for (size_t i = 0; i < x.size(); ++i) {\n result ^= x[i];\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "serial", + "prompt": "/* Return the logical XOR reduction of the vector of bools x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0131038188, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0131469204 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0131367592, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0131140322 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.013144724, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0130585496 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": 
true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0131275318, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0131104234 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0130670118, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0131296478 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0130801468, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.013109167 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0130852058, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0130916282 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0131004188, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0130826932 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01298924, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0131041006 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0131178782, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0131425302 + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0131416832, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0130265422 + } + ] + }, + { + 
"generated_output": "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01311788, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0130388054 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0131260574, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0130483154 + } + ] + }, + { + "generated_output": ")", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0130209138, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0130619828 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0130213102, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0130091934 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0130379716, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0130612878 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n for (bool xi : x) {\n result ^= xi;\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0128996634, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0130150146 + } + ] + }, + { + "generated_output": "\n\n bool result = false;\n for (bool b : x) {\n result ^= b;\n }\n return result;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0129986418, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0129039738 + } + ] + } + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "hip", + "prompt": "/* Compute the logical XOR reduction of the vector of bools x. Store the result in output.\n Use AMD HIP to reduce in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\n__global__ void reduceLogicalXOR(const bool *x, size_t N, bool *output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n bool local_result = x[gid];\n __syncthreads();\n while (blockDim.x <= N) {\n if (tid == 0) {\n local_result ^= x[gid + blockDim.x];\n }\n __syncthreads();\n blockDim.x *= 2;\n __syncthreads();\n }\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int size = blockDim.x;\n bool local_result = x[bid * size + tid];\n __syncthreads();\n for (int i = size / 2; i > 0; i /= 2) {\n if (tid < i) {\n local_result ^= x[bid * size + tid + i];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int warp_size = warpSize;\n int num_warps = bid_size / warp_size;\n int warp_id = tid / warp_size;\n int lane_id = tid % warp_size;\n bool warp_result = x[bid * bid_size + tid];\n for (int i = 1; i < warp_size; i++) {\n warp_result ^= x[bid * bid_size + tid + i];\n }\n __shared__ bool shared_result[num_warps];\n shared_result[warp_id] = warp_result;\n __syncthreads();\n if (warp_id == 0) {\n bool result = shared_result[0];\n for (int i = 1; i < num_warps; i++) {\n result ^= shared_result[i];\n }\n *output = result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n bool local_result = x[tid];\n __syncthreads();\n while (blockDim.x <= N) {\n if (tid < N) { local_result ^= x[tid + blockDim.x]; }\n __syncthreads();\n blockDim.x *= 2;\n __syncthreads();\n }\n if (tid == 0) { *output = local_result; }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n bool local_result = x[gid];\n\n __syncthreads();\n\n while (bid > 0) {\n if (tid == 0) {\n local_result ^= x[gid + hipBlockDim_x];\n }\n bid >>= 1;\n gid >>= 1;\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n bool local_result = x[gid];\n\n while (gid < N) {\n local_result ^= x[gid];\n gid += hipBlockDim_x;\n }\n\n output[tid] = local_result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n bool result = false;\n while (gid < N) {\n result ^= x[gid];\n gid += hipBlockDim_x * hipGridDim_x;\n }\n 
output[tid] = result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n output[0] ^= output[i];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use MPI to reduce in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Return the result on all ranks.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = false;\n for (bool b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n 
int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allreduce(&local_result, &local_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool result = x[rank];\n MPI_Allreduce(&result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[rank];\n bool global_result;\n\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n MPI_Allxor(&result, &result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n bool result = x[0];\n for (size_t i = 1; i < x.size(); i++) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool local_result = x[rank];\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "cuda", + "prompt": "/* Compute the logical XOR reduction of the vector of bools x. Store the result in output.\n Use CUDA to reduce in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\n__global__ void reduceLogicalXOR(const bool *x, size_t N, bool *output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Each thread reduces a single element of the output.\n int tid = threadIdx.x;\n output[tid] = x[tid];\n __syncthreads();\n\n // Reduce the output in place.\n while (tid < N) {\n output[tid] ^= output[tid + blockDim.x];\n tid += blockDim.x * 2;\n __syncthreads();\n }\n}\n", + "\n\n // Each thread reduces a single element of the output\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the output in place\n for (int stride = 1; stride < N; stride *= 2) {\n for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < N; tid += 2 * stride) {\n output[tid] ^= output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n // Each thread reduces a subset of the input.\n int tid = threadIdx.x;\n int stride = blockDim.x;\n bool local_result = false;\n for (int i = tid; i < N; i += stride) {\n local_result ^= x[i];\n }\n\n // Reduce the local results using a shared memory reduction.\n __shared__ bool shared_result[blockDim.x];\n shared_result[tid] = local_result;\n __syncthreads();\n\n // The first thread writes the final result to output.\n if (tid == 0) {\n bool final_result = false;\n for (int i = 0; i < blockDim.x; ++i) {\n final_result ^= shared_result[i];\n }\n *output = final_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n // Each thread reduces a single element of the output\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the output in place\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid] ^= output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncbarriers();\n while (N > 1) {\n if (tid < N / 2) {\n output[tid] ^= output[tid + N / 2];\n }\n __syncbarriers();\n N /= 2;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n bool result = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n result ^= x[i];\n }\n output[bid] = result;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n bool result = x[bid * stride + tid];\n __syncthreads();\n while (stride > 1) {\n if (tid < stride) {\n result ^= x[bid * stride + tid + stride];\n }\n stride /= 2;\n __syncthreads();\n }\n if (tid == 0) {\n *output = result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int num_elements = N;\n int num_blocks = (num_elements + bid_size - 1) / bid_size;\n\n bool local_result = x[tid];\n __syncthreads();\n while (bid_size > 1) {\n if (tid < bid_size / 2) {\n local_result ^= x[tid + bid_size / 2];\n }\n bid_size /= 2;\n __syncthreads();\n }\n if (bid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n 
__syncthreads();\n}\n", + "\n\n // Each thread reduces a single element of the vector\n int tid = threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n\n // Reduce the vector in parallel\n while (N > 1) {\n if (tid < N / 2) {\n output[tid] ^= output[tid + N / 2];\n }\n N /= 2;\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = blockDim.x; i < N; i += blockDim.x) {\n output[0] ^= output[i];\n }\n }\n}\n", + "\n\n // Each thread reduces a single element of the output.\n int idx = threadIdx.x;\n output[idx] = x[idx];\n\n // Each thread reduces a pair of elements of the output.\n while (idx < N) {\n output[idx] ^= x[idx + 1];\n idx += 2;\n }\n\n // Reduce the remaining elements of the output.\n for (int i = 1; i < N; i *= 2) {\n for (int j = 0; j < N; j += 2 * i) {\n output[j] ^= output[j + i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n bool local_result = x[tid];\n\n for (int i = tid + stride; i < N; i += stride) {\n local_result ^= x[i];\n }\n\n output[bid] = local_result;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[0] ^= x[tid];\n }\n __syncthreads();\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int num_blocks = gridDim.x;\n\n bool local_result = x[tid];\n\n __syncthreads();\n\n while (bid_size > 1) {\n if (tid < bid_size / 2) {\n local_result = local_result ^ x[tid + bid_size / 2];\n }\n bid_size /= 2;\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n bool result = x[bid * stride + tid];\n\n __syncthreads();\n\n for (int offset = stride; offset < N; offset *= 2) {\n if (tid < offset) {\n result ^= x[bid * stride + tid + offset];\n }\n __syncthreads();\n }\n\n if (tid == 0) {\n *output = result;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x * gridDim.x;\n\n bool local_result = x[tid + bid * stride];\n\n for (int i = 2 * stride; i <= N; i *= 2) {\n if (tid < N / i) {\n local_result ^= x[tid + bid * i];\n }\n }\n\n if (tid == 0) {\n *output = local_result;\n }\n}\n" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the logical XOR reduction of the vector of bools x.\n Use MPI and OpenMP to reduce in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on all ranks.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(^: local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int nranks;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n\n bool local_result = false;\n for (auto b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Reduce on each rank\n bool local_result = false;\n #pragma omp parallel for reduction(^:local_result)\n for (size_t i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n // Reduce across all ranks\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return global_result;\n}\n", + "\n\n int nranks;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n\n bool local_result = false;\n for (bool b : x) {\n local_result ^= b;\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR,\n MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i 
< x.size(); ++i) {\n local_result ^= x[i];\n }\n }\n\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int nranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool local_result = x[0];\n #pragma omp parallel for reduction(+:local_result)\n for (int i = 1; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int n = x.size();\n bool result = x[0];\n #pragma omp parallel for reduction(^:result)\n for (int i = 1; i < n; ++i) {\n result ^= x[i];\n }\n return result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool local_result = x[my_rank];\n MPI_Allxor(&local_result, &local_result, 1, MPI_BOOL, MPI_COMM_WORLD);\n return local_result;\n}\n", + ".", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool local_result = x[my_rank];\n #pragma omp parallel reduction(^:local_result)\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n }\n\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = x[rank];\n #pragma omp parallel reduction(^:local_result)\n for (int i = 0; i < x.size(); i++) {\n local_result ^= x[i];\n }\n\n bool result;\n MPI_Reduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, 0, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n int n = x.size();\n bool local_result = x[0];\n #pragma omp parallel for reduction(^:local_result)\n for (int i = 1; i < n; ++i) {\n local_result ^= x[i];\n }\n bool global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n bool result = x[0];\n #pragma omp parallel reduction(^:result)\n {\n #pragma omp for\n for (int i = 1; i < x.size(); ++i) {\n result ^= x[i];\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool local_result = false;\n #pragma omp parallel for reduction(^:local_result)\n for (size_t i = 0; i < x.size(); ++i) {\n local_result ^= x[i];\n }\n\n bool result;\n MPI_Allreduce(&local_result, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n // Reduce the vector of bools using MPI_Reduce.\n bool result;\n MPI_Allreduce(MPI_IN_PLACE, &x[0], x.size(), MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n // Return the result.\n return result;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the array x into output.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(Kokkos::View const& x, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA (const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int i) {\n output(i) = 0;\n for (int j = i; j >= 0; j--) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n output(i) = 0;\n for (int j = i; j < n; ++j) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::View temp = output.slice(0, output.size() - 1);\n Kok\u30b3\u30af::View temp2 = output.slice(1, output.size());\n Kok\u30b3\u30af::View x_rev = x.reversed();\n Kok\u30b3\u30af::View temp3 = x_rev.slice(1, x_rev.size());\n Kok\u30b3\u30af::View temp4 = x_rev.slice(0, x_rev.size() - 1);\n Kok\u30b3\u30af::View temp5 = temp4 + temp3;\n temp = temp5;\n temp2 = temp5 + temp;\n\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n output(i) = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, i),\n KOKKOS_LAMBDA(const int& j, const int& val) { return val + x(i - j - 1); },\n 0);\n });\n}\n", + "(", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(\"reversePrefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i, int& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = x(i) + output(i - 1);\n }\n },\n output(n - 1));\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i) { output(i) = output(n - 1) - output(i); });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i, int& sum) {\n sum = (i == 0) ? 
x(i) : sum + x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n // Compute the reverse prefix sum\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n Kokkos::ScanSum(Kokkos::Reverse));\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n output(N - 1));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA (const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n output(i) = x(x.extent(0) - 1 - i);\n if (i > 0) output(i) += output(i - 1);\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA (const int i) {\n output(i) = 0;\n for (int j = n - 1; j >= i; j--) {\n output(i) += x(j);\n }\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"reversePrefixSum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n output(i) = (i == 0) ? x(i) : x(i) + output(i - 1);\n });\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Tag = Kokkos::TagType;\n\n Kokkos::parallel_for(Policy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n output(i) = x(x.extent(0) - 1 - i);\n if (i > 0) output(i) += output(i - 1);\n }, Tag());\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n },\n 0);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i, int& sum) {\n sum += x(i);\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(const int& i, int& sum) {\n sum += x(i);\n output(i) = sum;\n }, sum = 0);\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use OpenMP to compute in parallel.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n 
for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n output[i] = x[i];\n } else {\n output[i] = output[i + 1] + x[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n output[i] = 0;\n for (int j = n - 1; j >= i; --j) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n output[i] = 0;\n for (int j = i; j < n; j++) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n #pragma omp parallel for\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "serial", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) 
{\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n output.resize(x.size());\n output[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n output.resize(x.size());\n output.back() = x.back();\n for (int i = x.size() - 2; i >= 0; --i) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + 
"generated_output": "\n\n for (int i = x.size() - 1; i >= 0; --i) {\n if (i + 1 < x.size()) {\n output[i] = x[i] + output[i + 1];\n } else {\n output[i] = x[i];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n output.resize(x.size());\n output.back() = x.back();\n for (int i = x.size() - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = output[i + 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\n__global__ void reversePrefixSum(const int *x, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n if (tid > 0) {\n output[tid] += output[tid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n output[gid] = x[N - 1 - gid];\n if (gid > 0) { output[gid] += output[gid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[N - 1 - gid] = x[gid];\n }\n __syncthreads();\n for (int i = N - 2; i >= 0; i--) {\n if (gid < N) {\n output[i] = output[i + 1] + x[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n int i = tid + bid * blockSize;\n if (i < N) {\n output[i] = 0;\n for (int j = N - 1; j >= i; j--) { output[i] += x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = blockDim.x / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int offset = bid * hipBlockDim_x + tid;\n if (offset < N) {\n output[offset] = x[N - 1 - offset];\n if (offset > 0) {\n output[offset] += output[offset - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - tid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n 
__syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N / 2; i > 0; i /= 2) {\n if (tid < i) {\n output[tid] += output[tid + i];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum in reverse order\n std::vector prefix_sum(x.size());\n prefix_sum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n prefix_sum[i] = prefix_sum[i + 1] + x[i];\n }\n\n // Gather the prefix sums from all ranks\n std::vector gathered_prefix_sums(x.size() * size);\n MPI_Gather(prefix_sum.data(), x.size(), MPI_INT, gathered_prefix_sums.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gathered_prefix_sums[i * size + size - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_sum = 0;\n for (int i = n - 1; i >= 0; i--) {\n local_sum += x[i];\n if (rank == 0) {\n output[i] = local_sum;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n, 0);\n std::vector global_sums(n, 0);\n\n // Compute the partial sums on each rank\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n if (i > 0) {\n partial_sums[i] += partial_sums[i - 1];\n }\n }\n\n // Gather the partial sums from all ranks\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n global_sums[i] += global_sums[i + 1];\n }\n output[i] = global_sums[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n output.reserve(n);\n std::iota(std::execution::par, output.begin(), output.end(), 0);\n std::transform(std::execution::par, x.rbegin(), x.rend(), output.rbegin(),\n [](int a, int b) { return a + b; });\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n 
}\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n std::vector prefix_sum(n);\n MPI_Allreduce(x.data(), prefix_sum.data(), n, MPI_INT, MPI_SUM, comm);\n\n if (rank == 0) {\n output.resize(n);\n for (int i = 0; i < n; i++) {\n output[i] = prefix_sum[n - 1 - i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n local_output[i] = x[i];\n } else {\n local_output[i] = x[i] + local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n for (int i = 0; i < n; i++) {\n if (i == 0) {\n output[i] = global_output[i];\n } else {\n output[i] = global_output[i] - global_output[i - 1];\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n for (int i = n - 1; i >= 0; i--) {\n if (rank == 0) {\n output[i] = local_x[i];\n }\n MPI_Allreduce(MPI_IN_PLACE, &local_x[i], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n local_output[i] = output[i];\n }\n }\n if (rank == 0) {\n output = local_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_sum = 0;\n for (int i = x.size() - 1; i >= 0; i--) {\n local_sum += x[i];\n if (rank == 0) {\n output[i] = local_sum;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector global_sums(n);\n\n // Compute the partial sums on each rank\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n if (i > 0) {\n partial_sums[i] += partial_sums[i - 1];\n }\n }\n\n // Gather the partial sums from all ranks\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n global_sums[i] += global_sums[i + 1];\n }\n output[i] = global_sums[i];\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector partial_sums(n);\n std::partial_sum(x.rbegin(), x.rend(), partial_sums.rbegin());\n\n std::vector global_sums(n);\n MPI_Allgather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i == n - 1) {\n local_output[i] = x[i];\n } else {\n local_output[i] = x[i] + local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n 
MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Reverse the global prefix sum\n for (int i = 0; i < n / 2; i++) {\n std::swap(global_output[i], global_output[n - i - 1]);\n }\n\n // Store the result in output on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the reverse prefix sum\n std::vector reverse_prefix_sum(x.size());\n reverse_prefix_sum[x.size() - 1] = prefix_sum[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] - x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n output = reverse_prefix_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefix_sum(n);\n std::vector reverse_prefix_sum(n);\n\n // Compute the prefix sum\n prefix_sum[0] = x[0];\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the reverse prefix sum\n reverse_prefix_sum[n - 1] = prefix_sum[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] + prefix_sum[i];\n }\n\n // Gather the reverse prefix sum on rank 0\n MPI_Gather(&reverse_prefix_sum[0], n, MPI_INT, &output[0], n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n for (int i = n - 1; i >= 0; i--) {\n local_output[i] = x[i];\n if (i + 1 < n) {\n local_output[i] += local_output[i + 1];\n }\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n for (int i = n - 1; i >= 0; i--) {\n if (i + 1 < n) {\n global_output[i] -= global_output[i + 1];\n }\n }\n\n // Store the result on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector global_sums(n);\n\n // Compute the partial sums on each rank\n partial_sums[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n partial_sums[i] = partial_sums[i + 1] + x[i];\n }\n\n // Gather the partial sums into the global_sums vector on rank 0\n MPI_Gather(partial_sums.data(), n, MPI_INT, global_sums.data(), n, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sums on rank 0\n if (rank == 0) {\n output[0] = global_sums[0];\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + global_sums[n - i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x in reverse order\n std::vector reversePrefixSum(x.size());\n reversePrefixSum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reversePrefixSum[i] = reversePrefixSum[i + 1] + x[i];\n }\n\n // Gather the reverse prefix sums from all ranks\n std::vector 
gatheredReversePrefixSum(x.size() * size);\n MPI_Gather(reversePrefixSum.data(), x.size(), MPI_INT, gatheredReversePrefixSum.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the final reverse prefix sum\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gatheredReversePrefixSum[i * size];\n }\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Compute the prefix sum of the vector x in reverse order.\n std::vector reverse_prefix_sum(x.size());\n reverse_prefix_sum[x.size() - 1] = x[x.size() - 1];\n for (int i = x.size() - 2; i >= 0; i--) {\n reverse_prefix_sum[i] = reverse_prefix_sum[i + 1] + x[i];\n }\n\n // Gather the reverse prefix sums from all ranks.\n std::vector gathered_reverse_prefix_sums(x.size() * num_ranks);\n MPI_Gather(reverse_prefix_sum.data(), x.size(), MPI_INT, gathered_reverse_prefix_sums.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the output vector.\n if (my_rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = gathered_reverse_prefix_sums[i * num_ranks + my_rank];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n for (int i = 0; i < n; i++) {\n local_output[i] = x[n - i - 1];\n }\n\n std::vector global_output(n);\n MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector recvbuf(n);\n std::vector sendbuf(n);\n for (int i = 0; i < n; i++) {\n sendbuf[i] = x[n - i - 1];\n }\n MPI_Allgather(sendbuf.data(), n, MPI_INT, recvbuf.data(), n, MPI_INT, MPI_COMM_WORLD);\n int sum = 0;\n for (int i = 0; i < n; i++) {\n sum += recvbuf[i];\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\n__global__ void reversePrefixSum(const int *x, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (int j = block_size / 2; j > 0; j /= 2) {\n if (tid < j) {\n output[tid] += output[tid + j];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n for (int i = tid + 1; i < N; i++) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n int offset = bid * stride;\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i + offset];\n output[i + offset] = sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = N - 1; i >= 0; i--) {\n if (tid == i) {\n output[i] += output[i + 1];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid == 0) {\n output[0] = x[0];\n } else {\n output[tid] = output[tid - 1] + x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[N - 1 - tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int i = blockIdx.x * blockDim.x + tid;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (int j = i + blockDim.x; j < N; j += blockDim.x) {\n output[j] = output[j - blockDim.x] + x[j];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int sum = 0;\n for (int i = tid; i < N; i += stride) {\n sum += x[i];\n output[N - 1 - i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n output[i] = 0;\n for (int j = i + 1; j < N; j++) {\n output[i] += x[j];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n int offset = bid * stride;\n int sum = 0;\n for (int i = N - 1 - tid; i >= 0; i -= stride) {\n sum += x[i + offset];\n output[i + offset] = sum;\n }\n}\n", + "\n\n 
int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_id * block_size + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n if (i > 0) { output[i] += output[i - 1]; }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n int offset = block * stride + tid;\n\n if (offset < N) {\n output[offset] = x[offset];\n }\n\n __syncthreads();\n\n for (int i = offset + stride; i < N; i += stride) {\n output[i] = output[i - stride] + x[i];\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n if (tid == 0) {\n output[0] = x[0];\n } else {\n output[tid] = output[tid - 1] + x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int i = block_size * block_id + tid;\n if (i < N) {\n output[i] = x[N - 1 - i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int stride = blockDim * gridDim;\n int start = tid + blockId * blockDim;\n int end = min(start + stride, N);\n int sum = 0;\n for (int i = end - 1; i >= start; i--) {\n sum += x[i];\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the reverse prefix sum of the vector x into output.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in output on rank 0.\n Examples:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sums(n);\n std::vector reverse_partial_sums(n);\n\n // Compute the partial sums in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n partial_sums[i] = x[i];\n for (int j = i + 1; j < n; ++j) {\n partial_sums[j] += partial_sums[j - 1];\n }\n }\n\n // Compute the reverse partial sums in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; --i) {\n reverse_partial_sums[i] = partial_sums[i];\n if (i > 0) {\n reverse_partial_sums[i] += reverse_partial_sums[i - 1];\n }\n }\n\n // Gather the reverse partial sums on rank 0\n MPI_Gather(&reverse_partial_sums[0], n, MPI_INT, &output[0], n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local prefix sum\n std::vector local_prefix_sum(n);\n local_prefix_sum[0] = x[0];\n for (int i = 1; i < n; i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n\n // Compute the global prefix sum\n std::vector global_prefix_sum(n);\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n if (rank == 0) {\n output.resize(n);\n output[n - 1] = global_prefix_sum[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = global_prefix_sum[i] - global_prefix_sum[i + 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n 
std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_output[i] = local_x[n - 1 - i];\n }\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n std::vector local_prefix_sum(x.size());\n local_prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n\n // Compute the global prefix sum\n std::vector global_prefix_sum(x.size());\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), x.size(), MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum\n if (rank == 0) {\n output.resize(x.size());\n for (int i = 0; i < x.size(); i++) {\n output[i] = global_prefix_sum[x.size() - 1] - global_prefix_sum[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int n = x.size();\n std::vector partial_sums(n);\n\n // Compute the partial sums in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sums[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n partial_sums[j] += partial_sums[j - 1];\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sums(n);\n MPI_Reduce(partial_sums.data(), global_sums.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Compute the reverse prefix sum on rank 0\n if (my_rank == 0) {\n output.resize(n);\n output[n - 1] = global_sums[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n output[i] = global_sums[i] - global_sums[i + 1];\n }\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local reverse prefix sum\n std::vector local_output(n);\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums\n std::vector global_output(n);\n MPI_Allreduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (my_rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n std::vector global_output(n);\n MPI_Reduce(local_output.data(), global_output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the prefix sum in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_output[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n local_output[i] += x[j];\n }\n }\n\n // Reduce the local prefix sums to the global prefix sum on rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Reverse the prefix sum on rank 0\n if (rank == 0) {\n for (int i = 0; i < n / 2; i++) {\n std::swap(output[i], output[n - i - 1]);\n }\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n prefix_sum[i] += prefix_sum[i + 1];\n }\n }\n\n // Gather the results on rank 0\n if (my_rank == 0) {\n output = prefix_sum;\n }\n}\n", + "\n\n int n = x.size();\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &n_procs);\n int local_n = n / n_procs;\n int remainder = n % n_procs;\n if (my_rank == 0) {\n output.resize(n);\n }\n std::vector local_x(local_n + (my_rank < remainder ? 1 : 0));\n MPI_Scatter(x.data(), local_n, MPI_INT, local_x.data(), local_n, MPI_INT, 0,\n MPI_COMM_WORLD);\n std::vector local_output(local_n + (my_rank < remainder ? 1 : 0));\n local_output[local_n - 1] = local_x[local_n - 1];\n #pragma omp parallel for\n for (int i = local_n - 2; i >= 0; --i) {\n local_output[i] = local_x[i] + local_output[i + 1];\n }\n MPI_Gather(local_output.data(), local_n, MPI_INT, output.data(), local_n, MPI_INT,\n 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Reverse the prefix sum in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n output[i] = prefix_sum[n - 1 - i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nranks; i++) {\n std::vector recv_buf(n);\n MPI_Recv(recv_buf.data(), n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n output[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; 
i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n if (i < n - 1) {\n prefix_sum[i] -= prefix_sum[i + 1];\n }\n output[i] = prefix_sum[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce the local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum in parallel\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum in parallel\n #pragma omp parallel for\n for (int i = x.size() - 1; i >= 0; i--) {\n if (i < x.size() - 1) {\n prefix_sum[i] -= prefix_sum[i + 1];\n }\n output[i] = prefix_sum[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local reverse prefix sum\n local_output[n - 1] = x[n - 1];\n for (int i = n - 2; i >= 0; i--) {\n local_output[i] = local_output[i + 1] + x[i];\n }\n\n // Reduce local reverse prefix sums to rank 0\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n std::vector local_x(n);\n std::copy(x.begin(), x.end(), local_x.begin());\n\n #pragma omp parallel for\n for (int i = n - 1; i >= 0; i--) {\n local_output[i] = local_x[i];\n if (i + 1 < n) {\n local_output[i] += local_output[i + 1];\n }\n }\n\n MPI_Reduce(local_output.data(), output.data(), n, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of the vector x\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the reverse prefix sum of the vector x\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n output[i] = prefix_sum[x.size() - 1] - prefix_sum[i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(&output[0], x.size(), MPI_INT, &output[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + 
"parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n using namespace Kokkos;\n\n const size_t n = x.extent(0);\n View y(\"y\", n);\n\n parallel_for(n, KOKKOS_LAMBDA(const size_t i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return sum(y);\n}\n", + "\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n using l_d_type = Kokkos::LayoutLeft;\n using l_d_view_type = Kokkos::View;\n\n const size_t n = x.extent(0);\n l_d_view_type x_copy(\"x_copy\", n);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::parallel_scan(policy_type(0, n), KOKKOS_LAMBDA(const size_t& i, l_d_view_type& x_copy) {\n if (i > 0) { x_copy(i) += x_copy(i - 1); }\n }, tag_type());\n double sum = 0;\n for (size_t i = 0; i < n; ++i) { sum += x_copy(i); }\n return sum;\n}\n", + "\n using namespace Kokkos;\n\n const int N = x.extent(0);\n View y(\"y\", N);\n\n parallel_for(N, KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return reduce(N, KOKKOS_LAMBDA(int i, double sum) { return sum + y(i); }, 0.0);\n}\n", + "\n\n using ExecutionSpace = typename Kokkos::View::execution_space;\n using Layout = typename Kokkos::View::layout_type;\n using TeamPolicy = Kokkos::TeamPolicy>;\n using TeamMember = typename TeamPolicy::member_type;\n\n double sum = 0.0;\n Kokkos::parallel_reduce(TeamPolicy(Kokkos::Experimental::partition_by_coordinates(x)),\n KOKKOS_LAMBDA(TeamMember& team) {\n double local_sum = 0.0;\n for (auto i = team.league_rank(); i < x.extent(0);\n i += team.league_size()) {\n local_sum += x(i);\n }\n team.team_reduce(KOKKOS_LAMBDA(double& tsum,\n double& lsum) { tsum += lsum; });\n if (team.team_rank() == 0) { sum = team.team_reduce(); }\n });\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using execution_space = Kokkos::DefaultExecutionSpace;\n using policy_type = Kokkos::RangePolicy;\n\n // Compute the prefix sum array\n View prefix_sum(\"prefix_sum\", x.size());\n parallel_for(policy_type(0, x.size()), KOKKOS_LAMBDA(const int i) {\n prefix_sum(i) = (i == 0) ? 
x(i) : x(i) + prefix_sum(i - 1);\n });\n\n // Compute the sum of the prefix sum array\n double sum = 0;\n parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int i, double& sum) {\n sum += prefix_sum(i);\n }, sum);\n\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n using execution_space = Kokkos::DefaultExecutionSpace;\n\n const int n = x.extent(0);\n auto y = Kokkos::View(\"y\", n);\n\n Kokkos::parallel_for(\n \"prefix_sum\", policy_t(0, n), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n y(i) = x(i);\n } else {\n y(i) = y(i - 1) + x(i);\n }\n });\n\n return Kokkos::parallel_reduce(\n \"sum_prefix_sum\", policy_t(0, n), KOKKOS_LAMBDA(const double& a,\n const double& b) {\n return a + b;\n },\n y);\n}\n", + "\n using namespace Kokkos;\n\n using policy_type = Kokkos::TeamPolicy;\n policy_type policy(x.extent(0));\n\n double sum = 0.0;\n Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(const int& i, double& sum) {\n double prefix_sum = 0.0;\n for (int j = 0; j <= i; ++j) { prefix_sum += x(j); }\n sum += prefix_sum;\n });\n\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Sum;\n\n auto prefix_sum = Kokkos::scan(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n },\n Reducer(0));\n\n return Kokkos::parallel_reduce(Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += prefix_sum(i);\n },\n Reducer(0));\n}\n", + "\n using Kokkos::parallel_reduce;\n using Kokkos::parallel_scan;\n using Kokkos::TeamPolicy;\n\n double sum = 0;\n Kokkos::parallel_reduce(\n TeamPolicy<>(x.extent(0), Kokkos::AUTO),\n KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n sum += update;\n },\n sum);\n return sum;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a prefix sum view\n View prefix_sum = create_mirror_view(x);\n\n // Compute the prefix sum\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = prefix_sum(i - 1) + x(i);\n }\n prefix_sum(i) = sum;\n });\n\n // Compute the sum of the prefix sum array\n double sum_of_prefix_sum = 0.0;\n parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += prefix_sum(i);\n }, sum_of_prefix_sum);\n\n return sum_of_prefix_sum;\n}\n", + "\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using ViewType = Kokkos::View;\n\n const size_t n = x.extent(0);\n ViewType prefix_sum(\"prefix_sum\", n);\n\n parallel_for(\"prefix_sum\", Policy(0, n), KOKKOS_LAMBDA(const size_t& i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n\n return prefix_sum(n - 1);\n}\n", + "\n Kokkos::View prefixSum = Kokkos::View(x.extent(0));\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = prefixSum(i - 1) + x(i);\n }\n }, prefixSum);\n double sum = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, double& update) {\n update += prefixSum(i);\n }, sum);\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.extent(0));\n Kokkos::parallel_for(\"prefix_sum_kernel\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n prefix_sum(i) = x(i);\n } else {\n prefix_sum(i) = prefix_sum(i - 1) + x(i);\n }\n });\n return Kokkos::parallel_reduce(\"sum_prefix_sum\", 
Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i, double& sum) {\n sum += prefix_sum(i);\n }, 0.0);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto prefix_sum = Kokkos::scan(policy_type(0, x.size()), tag_type(), x,\n [=](const double& a, const double& b) { return a + b; });\n\n return Kokkos::parallel_reduce(policy_type(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) { sum += prefix_sum(i); },\n 0.0);\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n prefix_sum(i) = (i == 0) ? x(i) : x(i) + prefix_sum(i - 1);\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n double sum = 0;\n Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n sum += update;\n },\n tag_type());\n return sum;\n}\n", + "\n\n Kokkos::View prefix_sum(\"prefix_sum\", x.size());\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = prefix_sum(i - 1) + x(i);\n }\n prefix_sum(i) = sum;\n });\n return prefix_sum(x.size() - 1);\n}\n", + "\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reduction = Kokkos::Sum;\n\n // Compute the prefix sum array\n View y = Kokkos::scan(Policy(0, x.size()), Reduction(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += x(i);\n });\n\n // Compute the sum of the prefix sum array\n double sum = Kokkos::reduce(Policy(0, y.size()), Reduction(0.0),\n KOKKOS_LAMBDA(const int& i, double& sum) {\n sum += y(i);\n });\n\n return sum;\n}\n", + "\n using Kokkos::parallel_reduce;\n\n double sum = 0;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, double& update) {\n update += x(i);\n }, sum);\n\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double sum = 0;\n std::vector prefixSum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefixSum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for 
reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = (i > 0 ? prefix_sum[i - 1] : 0) + x[i];\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n, 0);\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < n; ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n sum += x[i];\n }\n return sum;\n}\n", + "\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefix_sum[i] = x[i];\n if (i > 0) prefix_sum[i] += prefix_sum[i - 1];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n }\n for (double v : x) {\n sum += v;\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] += x[i - 1];\n }\n for (double v : x) {\n sum += v;\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n #pragma omp parallel for reduction(+: sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0.0;\n std::vector prefixSum(x.size(), 0.0);\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) { prefixSum[i] += prefixSum[i - 1]; }\n sum += prefixSum[i];\n }\n return sum;\n}\n", + "\n\n double sum = 0;\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 
1] + x[i];\n }\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n\n // Compute the prefix sum array\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sum array\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n return sum;\n}\n", + "\n\n // Create a vector to store the prefix sum\n std::vector prefix_sum(x.size());\n\n // Compute the prefix sum in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sum\n double sum = 0;\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n return sum;\n}\n", + "\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n prefix_sum[i] = (i > 0 ? prefix_sum[i - 1] : 0) + x[i];\n }\n double sum = 0;\n #pragma omp parallel for reduction(+ : sum)\n for (size_t i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n return sum;\n}\n", + "\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n double sum = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:sum)\n for (int i = 0; i < n; i++) {\n sum += prefix_sum[i];\n }\n }\n return sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "serial", + "prompt": "/* Compute the prefix sum array of the vector x and return its sum.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (auto xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": 
false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, 
+ "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n double sum = 0;\n for (double xi : x) {\n sum += xi;\n }\n return sum;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "hip", + "prompt": "/* Compute the prefix sum array of the vector x and compute its sum. Store the result in sum.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\n__global__ void sumOfPrefixSum(const double *x, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n sum[tid + stride] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n\n double sum_local = 0;\n for (int i = gid; i < N; i += bid_size) {\n sum_local += x[i];\n }\n\n double sum_global;\n hipReduce(&sum_global, &sum_local, 1, hipReduceSum, 0);\n *sum = sum_global;\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid < N) {\n sum[gid] = x[gid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < hipBlockDim_x; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n double prefix_sum = 0;\n for (int i = 0; i <= gid; i++) {\n prefix_sum += x[i];\n }\n sum[gid] = prefix_sum;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n sum[tid] += sum[tid + i];\n }\n __syncthreads();\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum_local = x[tid];\n __syncthreads();\n for (int i = 1; i < blockDim.x; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n sum_local += x[tid + i];\n }\n __syncthreads();\n }\n sum[tid] = sum_local;\n }\n __syncthreads();\n if (tid == 0) {\n double sum_global = 0;\n for (int i = 0; i < N; i++) {\n sum_global += sum[i];\n }\n *sum = sum_global;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; 
i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n sum[bid] += x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n size_t stride = blockDim.x * gridDim.x;\n double localSum = 0;\n for (size_t i = tid; i < N; i += stride) {\n localSum += x[i];\n }\n __syncthreads();\n double globalSum = localSum;\n for (size_t i = 1; i < blockDim.x; i++) {\n globalSum += localSum;\n }\n __syncthreads();\n *sum = globalSum;\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n sum[tid + stride] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n if (tid > 0) { sum[tid] += sum[tid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n __syncthreads();\n if (tid > 0) { sum[tid] += sum[tid - 1]; }\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 0; i < N; i++) { sum[0] += sum[i]; }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "1\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3\ufe0f\u20e3\ufe0f\ufe0f\u20e3", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sum\n double sum = 0;\n for (size_t i = 0; i < prefix_sum.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), 
prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = std::accumulate(prefix_sum.begin(), prefix_sum.end(), 0.0);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n }\n\n double prefix_sum = 0;\n MPI_Allreduce(&sum, &prefix_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return prefix_sum;\n } else {\n return 0;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefixSum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum array\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sum array\n double sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n double sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n sum += x[i];\n MPI_Allreduce(&sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n prefix_sum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n std::partial_sum(x.begin(), x.end(), prefix_sum.begin());\n\n double sum = 0;\n MPI_Allreduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n }\n return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n MPI_Allreduce(x.data(), prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n double sum = 0;\n for (double v : prefix_sum) {\n sum += v;\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "cuda", + "prompt": "/* Compute the prefix sum array of the vector x and compute its sum. Store the result in sum.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\n__global__ void sumOfPrefixSum(const double *x, size_t N, double *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n double sum_local = 0;\n for (int i = tid; i < N; i += stride) {\n sum_local += x[i];\n }\n __shared__ double sum_shared[blockDim.x];\n sum_shared[tid] = sum_local;\n __syncthreads();\n if (tid == 0) {\n double sum_block = 0;\n for (int i = 0; i < blockDim.x; i++) {\n sum_block += sum_shared[i];\n }\n atomicAdd(sum, sum_block);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n const int tid = threadIdx.x;\n const int bid = blockIdx.x;\n const int bid_size = blockDim.x;\n const int gid = bid * bid_size + tid;\n\n if (gid < N) {\n double partial_sum = x[gid];\n if (gid > 0) {\n partial_sum += x[gid - 1];\n }\n sum[gid] = partial_sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = tid + bid * bid_size;\n if (i < N) {\n sum[i] = x[i];\n }\n __syncthreads();\n for (int s = bid_size / 2; s > 0; s >>= 1) {\n if (i < N && tid < s) {\n sum[i] += sum[i + s];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum_prefix = 0;\n for (int i = 0; i <= tid; i++) {\n sum_prefix += x[i];\n }\n double sum_suffix = 0;\n for (int i = tid + 1; i < N; i++) {\n sum_suffix += x[i];\n }\n sum[tid] = sum_prefix + sum_suffix;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * 
blockDim.x + threadIdx.x;\n if (tid < N) {\n double partial_sum = 0;\n for (int i = 0; i <= tid; i++) { partial_sum += x[i]; }\n sum[tid] = partial_sum;\n }\n __syncthreads();\n if (tid == 0) {\n double total_sum = 0;\n for (int i = 0; i < N; i++) { total_sum += sum[i]; }\n *sum = total_sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n if (blockIdx.x == 0 && threadIdx.x == 0) {\n for (int i = 1; i < N; i++) {\n sum[0] += sum[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double prefixSum = 0;\n for (int i = 0; i <= tid; i++) {\n prefixSum += x[i];\n }\n sum[tid] = prefixSum;\n }\n __syncthreads();\n if (tid == 0) {\n double totalSum = 0;\n for (int i = 0; i < N; i++) {\n totalSum += sum[i];\n }\n *sum = totalSum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = bid * bid_size + tid;\n\n double prefix_sum = 0;\n while (i < N) {\n prefix_sum += x[i];\n x[i] = prefix_sum;\n i += grid_size * bid_size;\n }\n\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) { *sum += x[i]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = 0;\n for (int i = 0; i < N; i++) {\n *sum += sum[i];\n }\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n double s = 0;\n for (int i = 0; i <= gid; i++) {\n s += x[i];\n }\n sum[gid] = s;\n if (gid > 0) {\n sum[gid] += sum[gid - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n atomicAdd(sum, x[tid]);\n }\n __syncthreads();\n if (tid == 0) {\n double sum_temp = *sum;\n atomicAdd(sum, sum_temp);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n sum[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n sum[tid + i] += sum[tid];\n }\n __syncthreads();\n }\n if (tid == 0) {\n *sum = sum[N - 1];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the prefix sum array of the vector x and return its sum.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(prefixSum.data(), &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) return sum;\n else return 0.0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[0], &sum, x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefix_sum[i] = x[i];\n } else {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefix_sum[x.size() - 1], 1, MPI_DOUBLE, &sum, 1, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0.0;\n double global_sum = 0.0;\n\n // Compute the prefix sum of the local vector\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 1; i < x.size(); i++) {\n x[i] += x[i - 1];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n 
MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum of the global prefix sum on rank 0\n if (rank == 0) {\n return global_sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n double sum = 0.0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); i++) {\n double local_sum = 0.0;\n for (int j = 0; j <= i; j++) {\n local_sum += x[j];\n }\n prefix_sum[i] = local_sum;\n }\n\n MPI_Reduce(&prefix_sum[0], &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefixSum[i] = x[i];\n if (i > 0) {\n prefixSum[i] += prefixSum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < prefixSum.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) prefix_sum[i] += prefix_sum[i - 1];\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) return sum;\n else return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum on each rank\n std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums\n double sum = 0.0;\n#pragma omp parallel for reduction(+ : sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefixSum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < n; ++i) {\n sum += prefixSum[i];\n }\n\n if (rank == 0) return sum;\n return 0;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sumOfPrefixSum = 0.0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sumOfPrefixSum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sumOfPrefixSum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(prefix_sum.data(), &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n std::vector prefixSum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (i == 0) {\n prefixSum[i] = x[i];\n } else {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n MPI_Allreduce(&prefixSum[x.size() - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefix_sum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < n; ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of each rank's portion of x\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n // Reduce the prefix sums across all ranks\n std::vector global_prefix_sum(x.size());\n MPI_Allreduce(prefix_sum.data(), global_prefix_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Compute the sum of the prefix sum array\n double sum = 0.0;\n for (int i = 0; i < x.size(); i++) {\n sum += global_prefix_sum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector prefixSum(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n prefixSum[i] = x[i];\n if (i > 0) prefixSum[i] += prefixSum[i - 1];\n }\n\n double sum = 0;\n MPI_Reduce(&prefixSum[n - 1], &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n return sum;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector prefix_sum(x.size());\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n prefix_sum[i] = x[i];\n if (i > 0) {\n prefix_sum[i] += prefix_sum[i - 1];\n }\n }\n\n double sum = 0;\n #pragma omp parallel for reduction(+: sum)\n for (int i = 0; i < x.size(); ++i) {\n sum += prefix_sum[i];\n }\n\n if (rank == 0) {\n return sum;\n } else {\n return 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the prefix sum of x on each rank\n 
std::vector prefixSum(x.size());\n prefixSum[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n prefixSum[i] = prefixSum[i - 1] + x[i];\n }\n\n // Compute the sum of the prefix sums across all ranks\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (int i = 0; i < prefixSum.size(); i++) {\n sum += prefixSum[i];\n }\n\n // Return the result on rank 0\n if (rank == 0) {\n return sum;\n } else {\n return 0.0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector prefix_sum(n);\n std::vector global_prefix_sum(n);\n\n // Compute the prefix sum on each rank\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n prefix_sum[i] = prefix_sum[i - 1] + x[i];\n }\n\n // Reduce the prefix sums from all ranks\n MPI_Allreduce(prefix_sum.data(), global_prefix_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Return the sum of the prefix sum on rank 0\n if (rank == 0) {\n return global_prefix_sum[n - 1];\n } else {\n return 0.0;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace the i-th element of the array x with the minimum value from indices 0 through i.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 0; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Layout = Kokkos::LayoutRight;\n\n Kokkos::parallel_for(\n Kokkos::RangePolicy>(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n x(i) = x(i);\n } else {\n x(i) = Kokkos::min(x(i), x(i - 1));\n }\n },\n Layout());\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n 
}\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"Partial Minimums\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; j++) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = Kokkos::min(x(i), Kokkos::min(x, i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"partialMinimums\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n float min = x(0);\n for (int j = 1; j <= i; ++j) {\n min = std::min(min, x(j));\n }\n x(i) = min;\n });\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n float min = x[0];\n for (int j = 1; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = 
min;\n }\n}\n", + "", + "\n\n int n = x.size();\n float min = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for shared(min)\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int n = x.size();\n std::vector mins(n);\n mins[0] = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n mins[i] = std::min(mins[i - 1], x[i]);\n }\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = mins[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "\n\n int n = x.size();\n std::vector min(n);\n min[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min[i] = std::min(min[i - 1], x[i]);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = min[i];\n }\n}\n", + "\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n min = std::min(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "serial", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002627052, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026199304 + } + ] + }, + { + "generated_output": "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026344298, + "runs": [ + { + "did_run": true, + "is_valid": true, + 
"runtime": 0.002620712 + } + ] + }, + { + "generated_output": "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], *std::min_element(x.begin(), x.begin() + i));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002627667, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026183472 + } + ] + }, + { + "generated_output": "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026449916, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026326926 + } + ] + }, + { + "generated_output": "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002630641, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026149292 + } + ] + }, + { + "generated_output": "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026276008, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.002626567 + } + ] + }, + { + "generated_output": "<, <, \u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:>\u51a2\u51a2\u51a28\u51a2\u51a2:>\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:>:>", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026076414, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026102782 + 
} + ] + }, + { + "generated_output": "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0025576372, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026020408 + } + ] + }, + { + "generated_output": "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0025564648, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0025784722 + } + ] + }, + { + "generated_output": "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0025821972, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.002600147 + } + ] + }, + { + "generated_output": "s\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2cernecernecerne,\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2:\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026009408, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026227294 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 1; i < x.size(); ++i) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0025983076, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026264766 + } + ] + }, + { + "generated_output": "\n\n for (int i = 1; i < x.size(); i++) 
{\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026072724, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026117768 + } + ] + }, + { + "generated_output": "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026248976, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026358242 + } + ] + }, + { + "generated_output": "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026234412, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0026194374 + } + ] + }, + { + "generated_output": "\n\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026178062, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.002592471 + } + ] + }, + { + "generated_output": "\n\n float min = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0025963322, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.002631995 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "hip", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\n__global__ void partialMinimums(float *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n float min = x[0];\n for (int j = 1; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n if (x[i] < min) { min = x[i]; }\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int 
bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n x[i] = min;\n }\n if (rank == 0) {\n MPI_Reduce(&x[0], &x[0], x.size(), MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(&x[0], nullptr, x.size(), MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n } else {\n MPI_Send(&x[rank], 1, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = x[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < size; i++) {\n MPI_Send(&x[i - 1], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n float min;\n MPI_Recv(&min, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x[i - 1] = std::min(x[i - 1], min);\n }\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Bcast(&x[i - 1], 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j < i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks onto rank 0\n std::vector min_values(x.size());\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &min_values[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Update the values on rank 0\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], min_values[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = 
x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n MPI_Allreduce(&x[i], &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = x[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < size; i++) {\n MPI_Send(&x[i - 1], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n float min;\n MPI_Recv(&min, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x[i - 1] = std::min(x[i - 1], min);\n }\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Bcast(&x[i - 1], 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 1; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n if (x[j] < min) {\n min = x[j];\n }\n }\n MPI_Allreduce(&min, &x[i], 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "cuda", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\n__global__ void partialMinimums(float *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n float min = x[0];\n for (size_t j = 1; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[tid];\n for (int i = 0; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n __shared__ float min;\n\n if (tid == 0) {\n min = x[bid * bid_size];\n }\n\n __syncthreads();\n\n for (int i = bid * bid_size + tid; i < N; i += bid_size) {\n min = fminf(min, x[i]);\n x[i] = min;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n min = fminf(min, x[i]);\n }\n x[tid] = min;\n }\n}\n", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < size) {\n float min = x[i];\n for (int j = i + 1; j < size; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n\n if (gid < N) {\n float min = x[0];\n for (int i = 1; i <= gid; i++) {\n min = fminf(min, 
x[i]);\n }\n x[gid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n if (tid == 0) {\n x[bid * bid_size] = x[bid * bid_size];\n } else {\n x[bid * bid_size + tid] = fminf(x[bid * bid_size + tid], x[bid * bid_size + tid - 1]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float min = x[0];\n for (int i = 1; i <= tid; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n }\n x[tid] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n\n if (i < N) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = fminf(min, x[j]);\n }\n x[i] = min;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank], 1, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks to rank 0\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &x[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct partial minimums\n if (rank != 0) {\n x.clear();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks to rank 0\n MPI_Gather(&x[0], n, MPI_FLOAT, &x[0], n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete list of partial minimums\n if (rank == 0) {\n for (int i = 1; i < n; i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float *x_arr = &x[0];\n float *min_arr = new float[n];\n\n // Compute the minimums for each rank\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n float min = 
x_arr[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x_arr[j]);\n }\n min_arr[i] = min;\n }\n\n // Gather the minimums from all ranks\n MPI_Gather(min_arr, n, MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n x_arr[i] = min_arr[i];\n }\n }\n\n delete[] min_arr;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector min_values(n);\n#pragma omp parallel for\n for (int i = 0; i < n; i++) {\n min_values[i] = x[i];\n for (int j = i + 1; j < n; j++) {\n min_values[i] = std::min(min_values[i], x[j]);\n }\n }\n MPI_Allgather(&min_values[0], n, MPI_FLOAT, &x[0], n, MPI_FLOAT, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min = x[0];\n\n #pragma omp parallel for schedule(static)\n for (int i = 1; i < n; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_FLOAT, x.data(), n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(x.data(), n, MPI_FLOAT, NULL, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min_val = x[0];\n\n #pragma omp parallel for reduction(min : min_val)\n for (int i = 1; i < n; i++) {\n min_val = std::min(min_val, x[i]);\n }\n\n MPI_Bcast(&min_val, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::min(x[i], min_val);\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_FLOAT, x.data(), n, MPI_FLOAT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(x.data(), n, MPI_FLOAT, NULL, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n std::vector globalMinimums(x.size());\n MPI_Allgather(x.data(), x.size(), MPI_FLOAT, globalMinimums.data(), x.size(), MPI_FLOAT, MPI_COMM_WORLD);\n\n // Update the minimums on rank 0\n if (rank == 0) {\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], globalMinimums[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n std::vector global_min(n);\n\n // Compute local minimums\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n // Reduce local minimums to global minimums\n MPI_Allreduce(local_min.data(), global_min.data(), n, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n // Store global minimums on rank 0\n if (rank == 0) {\n x = global_min;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n MPI_Reduce(local_min.data(), x.data(), n, MPI_FLOAT, MPI_MIN, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector localMin(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n localMin[i] = x[i];\n for (int j = 0; j < i; j++) {\n localMin[i] = std::min(localMin[i], x[j]);\n }\n }\n\n MPI_Allgather(localMin.data(), n, MPI_FLOAT, x.data(), n, MPI_FLOAT, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_min(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n local_min[i] = x[i];\n for (int j = 0; j <= i; j++) {\n local_min[i] = std::min(local_min[i], x[j]);\n }\n }\n\n MPI_Reduce(local_min.data(), x.data(), n, MPI_FLOAT, MPI_MIN, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n MPI_Gather(&x[0], x.size(), MPI_FLOAT, &x[0], x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank != 0) {\n x.clear();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float localMin = x[0];\n float globalMin;\n\n #pragma omp parallel for reduction(min : localMin)\n for (int i = 1; i < n; i++) {\n localMin = std::min(localMin, x[i]);\n }\n\n MPI_Allreduce(&localMin, &globalMin, 1, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n for (int i = 1; i < n; i++) {\n x[i] = globalMin;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n float min = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n if (x[i] < min) {\n min = x[i];\n }\n x[i] = min;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of ranks\n int rank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the partial minimums in parallel\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n float min = x[i];\n for (int j = 0; j <= i; j++) {\n min = std::min(min, x[j]);\n }\n x[i] = min;\n }\n\n // Gather the partial minimums from all ranks\n std::vector partialMins(x.size() * numRanks);\n MPI_Gather(x.data(), x.size(), MPI_FLOAT, partialMins.data(), x.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // If rank 0, store the result in x\n if (rank == 0) {\n x.clear();\n for (int i = 0; i < partialMins.size(); i += numRanks) {\n x.push_back(partialMins[i]);\n }\n }\n}\n", + "<, Synag", + "\n\n int n = x.size();\n int nthreads = omp_get_num_threads();\n int tid = omp_get_thread_num();\n int chunk = n / nthreads;\n int start = tid * chunk;\n int end = start + chunk;\n if (tid == nthreads - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n x[i] = std::min(x[i], *std::min_element(x.begin(), x.begin() + i));\n }\n omp_barrier();\n if (tid == 0) {\n for (int i = chunk; i < n; i += chunk) {\n for (int j = 0; j < chunk; j++) {\n x[i + j] = 
std::min(x[i + j], x[i + j - chunk]);\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector localMin(n);\n std::vector globalMin(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n localMin[i] = x[i];\n for (int j = 0; j <= i; j++) {\n localMin[i] = std::min(localMin[i], x[j]);\n }\n }\n\n MPI_Allreduce(localMin.data(), globalMin.data(), n, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n x = globalMin;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial minimums on each rank\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n x[i] = std::min(x[i], x[i - 1]);\n }\n\n // Gather the partial minimums from all ranks\n std::vector min_values(x.size(), 0.0);\n MPI_Allgather(&x[0], x.size(), MPI_FLOAT, &min_values[0], x.size(), MPI_FLOAT, MPI_COMM_WORLD);\n\n // Update the values on rank 0\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::min(x[i], min_values[i]);\n }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n const int n = x.extent(0);\n View max_so_far = \"max_so_far\";\n View max_ending_here = \"max_ending_here\";\n Kokkos::parallel_for(\"max_subarray\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int& i) {\n if (i == 0) {\n max_ending_here[i] = x[i];\n max_so_far[i] = x[i];\n } else {\n max_ending_here[i] = std::max(x[i], max_ending_here[i - 1] + x[i]);\n max_so_far[i] = std::max(max_so_far[i - 1], max_ending_here[i]);\n }\n });\n return max_so_far[n - 1];\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::View dp = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n if (i == 0) {\n dp(i) = x(i);\n } else {\n dp(i) = std::max(dp(i - 1) + x(i), x(i));\n }\n });\n return *Kokkos::max_element(dp);\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n for (int i = 1; i < x.extent(0); ++i) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int max_sum = 0;\n int current_sum = 0;\n Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, int& max_sum) {\n current_sum = std::max(x[i], current_sum + x[i]);\n max_sum = std::max(max_sum, current_sum);\n }, max_sum, tag_type());\n return max_sum;\n}\n", + "\n\n using namespace Kokkos;\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n Kokkos::parallel_scan(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far, int& max_ending_here) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n });\n\n return 
max_so_far;\n}\n", + "\n\n // Initialize the maximum sum to the first element of the array.\n int max_sum = x(0);\n\n // Iterate over the remaining elements of the array.\n for (int i = 1; i < x.extent(0); i++) {\n // If the current element is greater than the sum of the previous element and\n // the maximum sum, then set the maximum sum to the current element.\n if (x(i) > max_sum + x(i - 1)) {\n max_sum = x(i);\n } else {\n // Otherwise, add the current element to the maximum sum.\n max_sum += x(i);\n }\n }\n\n // Return the maximum sum.\n return max_sum;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_for(\n \"maximumSubarray\", Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const int& i) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_so_far);\n\n return max_so_far;\n}\n", + "\n\n Kok\u30b3\u30af::parallel_reduce(Kok\u30b3\u30af::Kok\u30b3\u30af_VectorRange(x.extent(0)),\n Kok\u30b3\u30af::plus(),\n Kok\u30b3\u30af::make_iota_view(x.extent(0)),\n [&](int i) {\n return Kok\u30b3\u30af::parallel_reduce(\n Kok\u30b3\u30af::Kok\u30b3\u30af_VectorRange(i, x.extent(0)),\n Kok\u30b3\u30af::plus(),\n x[i],\n [&](int j) { return x[j]; });\n });\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Max;\n using reduction_functor_type = Kokkos::Max;\n using reduction_policy_type = Kokkos::Scan;\n\n int max_so_far = 0;\n int max_ending_here = 0;\n\n Kokkos::parallel_reduce(\n reduction_policy_type(x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_so_far) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_so_far);\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_ending_here, int& max_so_far) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n },\n max_ending_here, 
max_so_far);\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int max_ending_here = x(0);\n for (int i = 1; i < x.extent(0); i++) {\n max_ending_here = std::max(x(i), max_ending_here + x(i));\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& max_so_far) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n });\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = x(0);\n int curr_max = x(0);\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(int i, int& max_so_far, int& curr_max) {\n curr_max = std::max(x(i), curr_max + x(i));\n max_so_far = std::max(max_so_far, curr_max);\n },\n max_so_far, curr_max);\n\n return max_so_far;\n}\n", + "\n\n // Initialize the maximum sum to the first element of the array\n int max_sum = x(0);\n\n // Initialize the current sum to the first element of the array\n int current_sum = x(0);\n\n // Loop through the remaining elements of the array\n for (int i = 1; i < x.extent(0); i++) {\n // Update the current sum by adding the current element\n current_sum = std::max(x(i), current_sum + x(i));\n\n // Update the maximum sum if the current sum is greater\n max_sum = std::max(max_sum, current_sum);\n }\n\n // Return the maximum sum\n return max_sum;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. 
if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_sum = INT_MIN;\n int sum = 0;\n #pragma omp parallel for reduction(max:max_sum)\n for (int i = 0; i < x.size(); i++) {\n sum += x[i];\n max_sum = std::max(max_sum, sum);\n if (sum < 0) {\n sum = 0;\n }\n }\n return max_sum;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n max_ending_here = std::max(x[i], max_ending_here + x[i]);\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n int n = x.size();\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n max_ending_here = std::max(x[i], max_ending_here + x[i]);\n max_so_far = std::max(max_so_far, max_ending_here);\n }\n\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < 
max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (size_t i = 0; i < x.size(); ++i) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0, max_ending_here = 0;\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (size_t i = 0; i < x.size(); ++i) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_sum = INT_MIN;\n int current_sum = 0;\n #pragma omp parallel for reduction(max:max_sum)\n for (int i = 0; i < x.size(); ++i) {\n current_sum = std::max(x[i], current_sum + x[i]);\n max_sum = std::max(max_sum, current_sum);\n }\n return max_sum;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n int n = x.size();\n #pragma parallel for reduction(+:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n" + ] 
+ }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "serial", + "prompt": "/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n 
int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int max_so_far = INT_MIN;\n int max_ending_here = 0;\n\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "ontiguous, Synag", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < x.size(); i++) {\n max_ending_here = max_ending_here + x[i];\n if 
(max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int max_so_far = x[0];\n int curr_max = x[0];\n\n for (int i = 1; i < x.size(); i++) {\n curr_max = std::max(x[i], curr_max + x[i]);\n max_so_far = std::max(max_so_far, curr_max);\n }\n return max_so_far;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Store the result in sum.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\n__global__ void maximumSubarray(const int *x, size_t N, int *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = hipMax(x[i], max_ending_here + x[i]);\n max_so_far = hipMax(max_so_far, max_ending_here);\n }\n hipAtomicMax(sum, &max_so_far);\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid >= N) return;\n\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = block.thread.x;\n int stride = block.thread.x + block.thread.y * block.dim.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int stride = hipBlockDim_x * hipGridDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = 
blockIdx.x * blockDim.x + threadIdx.x;\n int s = 0;\n int max = 0;\n for (size_t i = tid; i < N; i += blockDim.x * gridDim.x) {\n s = max(x[i], s + x[i]);\n max = max(max, s);\n }\n atomicAdd(sum, max);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int stride = bid_size * hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = start; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int tid_offset = tid + bid * bid_size;\n int sum_local = 0;\n int max_sum = 0;\n if (tid_offset < N) {\n sum_local = x[tid_offset];\n }\n __syncthreads();\n for (int i = tid_offset + bid_size; i < N; i += bid_size) {\n sum_local = max(sum_local + x[i], x[i]);\n max_sum = max(max_sum, sum_local);\n }\n if (tid == 0) {\n *sum = max_sum;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n __syncthreads();\n if (tid == 0) *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (i < N) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n i += hipBlockDim_x * hipGridDim_x;\n }\n *sum = max_so_far;\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int stride = blockDim.x * gridDim.x;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = tid; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) 
{\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bsize = hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int stride = bsize * hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N && i < start + stride; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n if (tid == 0) *sum = max_so_far;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipBlockDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + stride; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n hipAtomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int stride = hipGridDim_x;\n int start = tid + bid * stride;\n int max_so_far = x[start];\n int max_ending_here = x[start];\n for (int i = start + 1; i < N; i += stride) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n hipAtomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n int stride = bid_size * hipGridDim_x;\n\n int local_max = 0;\n int global_max = 0;\n\n for (int i = tid; i < N; i += stride) {\n local_max = max(x[i], local_max + x[i]);\n global_max = max(global_max, local_max);\n }\n\n if (tid == 0) {\n *sum = global_max;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n int max_so_far = 0, max_ending_here = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_max;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_sum = x[0];\n int sum = x[0];\n for (int i = 1; i < n; i++) {\n sum = std::max(x[i], sum + x[i]);\n max_sum = std::max(max_sum, sum);\n }\n return max_sum;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n int global_max;\n MPI_Allreduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "(4, -1, 2, 1) = 6.", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) 
{\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n int maxSum = INT_MIN;\n int sum = 0;\n for (int i = myRank; i < x.size(); i += numRanks) {\n sum += x[i];\n maxSum = std::max(maxSum, sum);\n if (sum < 0) {\n sum = 0;\n }\n }\n\n MPI_Allreduce(&maxSum, &maxSum, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return maxSum;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) max_so_far = max_ending_here;\n if (max_ending_here < 0) max_ending_here = 0;\n }\n\n return max_so_far;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Store the result in sum.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\n__global__ void maximumSubarray(const int *x, size_t N, int *sum) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Each thread computes the sum of its subarray\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int partialSum = 0;\n for (int i = tid; i < N; i += blockDim.x * gridDim.x) {\n partialSum += x[i];\n }\n\n // Each thread writes its partial sum to global memory\n __shared__ int partialSums[blockDim.x];\n partialSums[threadIdx.x] = partialSum;\n __syncthreads();\n\n // Each thread computes the maximum of its partial sum and the partial sums of the previous threads\n for (int i = blockDim.x / 2; i > 0; i >>= 1) {\n if (threadIdx.x < i) {\n partialSums[threadIdx.x] = max(partialSums[threadIdx.x], partialSums[threadIdx.x + i]);\n }\n __syncthreads();\n }\n\n // The first thread writes the maximum sum to global memory\n if (threadIdx.x == 0) {\n *sum = partialSums[0];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n\n int maxSoFar = x[tid];\n int maxEndingHere = x[tid];\n\n for (int i = tid + blockDim; i < N; i += blockDim * gridDim) {\n maxEndingHere = maxEndingHere + x[i];\n if (maxSoFar < maxEndingHere) {\n maxSoFar = maxEndingHere;\n }\n if (maxEndingHere < 0) {\n maxEndingHere = 0;\n }\n }\n\n int maxOverall = maxSoFar;\n\n if (blockId == 0) {\n for (int i = 1; i < gridDim; i++) {\n int temp = maxOverall;\n maxOverall = max(maxOverall, maxSoFar);\n maxSoFar = max(temp, maxSoFar);\n }\n }\n\n if (blockId == 0) {\n *sum = maxOverall;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int start = block_id * block_size + tid;\n int end = min(start + block_size, N);\n int max_sum = INT_MIN;\n int current_sum = 0;\n for (int i = start; i < end; i++) {\n current_sum = max(x[i], current_sum + x[i]);\n max_sum = max(max_sum, current_sum);\n }\n sum[tid] = max_sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int block_size = blockDim.x;\n int block_id = blockIdx.x;\n int block_start = block_id * block_size;\n int block_end = min(block_start + block_size, N);\n int max_so_far = x[block_start];\n int max_ending_here = x[block_start];\n for (int i = block_start + 1; i < block_end; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n __shared__ int shared_max[block_size];\n shared_max[tid] = max_so_far;\n __syncthreads();\n if (tid == 0) {\n int max_overall = shared_max[0];\n for (int i = 1; i < block_size; i++) {\n max_overall = max(max_overall, shared_max[i]);\n }\n *sum = max_overall;\n }\n}\n", + "\n\n int max_ending_here = x[0];\n int max_so_far = x[0];\n for (size_t i = 1; i < N; ++i) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n int start = tid + block * stride;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (start < N) {\n max_ending_here = max_ending_here + x[start];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n start += stride;\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int sum = 0;\n int 
maxSum = -2147483648;\n for (int i = tid; i < N; i += blockIdx.x * blockDim.x) {\n sum += x[i];\n maxSum = max(maxSum, sum);\n if (sum < 0) sum = 0;\n }\n sum = maxSum;\n *sum = maxSum;\n}\n", + "\n\n int my_sum = 0;\n int max_sum = 0;\n int max_sum_so_far = 0;\n int start = 0;\n int end = 0;\n int s = threadIdx.x;\n int e = threadIdx.x + blockDim.x - 1;\n if (e >= N) e = N - 1;\n for (int i = s; i <= e; i++) {\n my_sum += x[i];\n if (my_sum > max_sum_so_far) {\n max_sum_so_far = my_sum;\n start = s;\n end = i;\n }\n if (my_sum < 0) {\n my_sum = 0;\n }\n }\n if (max_sum_so_far > max_sum) {\n max_sum = max_sum_so_far;\n *sum = max_sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int max_so_far = 0;\n int max_ending_here = 0;\n for (int i = tid; i < N; i += blockDim.x) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n atomicAdd(sum, max_so_far);\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; ++i) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int stride = blockDim * gridDim;\n int start = tid + blockId * blockDim;\n int max_so_far = 0;\n int max_ending_here = 0;\n while (start < N) {\n max_ending_here = max_ending_here + x[start];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n start += stride;\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x;\n int max_so_far = x[tid];\n int max_ending_here = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) { max_so_far = max_ending_here; }\n if (max_ending_here < 0) { max_ending_here = 0; }\n }\n *sum = max_so_far;\n}\n", + "\n\n int max_so_far = x[0];\n int max_ending_here = x[0];\n for (size_t i = 1; i < N; ++i) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n *sum = max_so_far;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int sum_so_far = 0;\n int max_sum = 0;\n if (tid < N) {\n sum_so_far = x[tid];\n max_sum = x[tid];\n }\n __syncthreads();\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n 
sum_so_far = fmax(x[i], sum_so_far + x[i]);\n max_sum = fmax(max_sum, sum_so_far);\n __syncthreads();\n }\n if (tid == 0) { *sum = max_sum; }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n int max_so_far = 0;\n int max_ending_here = 0;\n\n for (int i = tid; i < N; i += stride) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n sum[bid] = max_so_far;\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(x[i], max_ending_here + x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int max_ending_here = x[tid];\n int max_so_far = x[tid];\n for (int i = tid + 1; i < N; i++) {\n max_ending_here = max(max_ending_here + x[i], x[i]);\n max_so_far = max(max_so_far, max_ending_here);\n }\n sum[tid] = max_so_far;\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n int chunk_size = n / num_ranks;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == num_ranks - 1) {\n end = n;\n }\n#pragma omp parallel for reduction(max : max_ending_here)\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n MPI_Allreduce(&max_so_far, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_so_far;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int chunk = n / nprocs;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == nprocs - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int max_global;\n MPI_Reduce(&max_so_far, &max_global, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_global;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n 
int chunk_size = n / nprocs;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == nprocs - 1) {\n end = n;\n }\n#pragma omp parallel\n {\n int max_so_far_local = 0, max_ending_here_local = 0;\n#pragma omp for\n for (int i = start; i < end; i++) {\n max_ending_here_local = max_ending_here_local + x[i];\n if (max_so_far_local < max_ending_here_local) {\n max_so_far_local = max_ending_here_local;\n }\n if (max_ending_here_local < 0) {\n max_ending_here_local = 0;\n }\n }\n#pragma omp critical\n {\n if (max_so_far < max_so_far_local) {\n max_so_far = max_so_far_local;\n }\n }\n }\n MPI_Reduce(&max_so_far, &max_ending_here, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n return max_ending_here;\n }\n return 0;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallel for\n#pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // MPI reduction\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel\n {\n int local_max_so_far = INT_MIN, local_max_ending_here = 0;\n #pragma omp for\n for (int i = rank; i < n; i += nprocs) {\n local_max_ending_here = local_max_ending_here + x[i];\n if (local_max_so_far < local_max_ending_here)\n local_max_so_far = local_max_ending_here;\n if (local_max_ending_here < 0)\n local_max_ending_here = 0;\n }\n #pragma omp critical\n {\n if (max_so_far < local_max_so_far)\n max_so_far = local_max_so_far;\n }\n }\n return max_so_far;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int max_local = 0;\n int max_global = 0;\n\n #pragma omp parallel for reduction(max : max_local)\n for (int i = 0; i < n; i++) {\n max_local = std::max(x[i], max_local + x[i]);\n }\n\n MPI_Allreduce(&max_local, &max_global, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return max_global;\n } else {\n return 0;\n }\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel num_threads(4)\n {\n #pragma omp for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n\n int global_max = max_so_far;\n MPI_Allreduce(&global_max, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallelization\n #pragma omp parallel\n {\n // Each thread 
computes the maximum subarray sum for its portion of the array\n #pragma omp for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n }\n\n // MPI reduction to find the maximum subarray sum across all ranks\n int global_max;\n MPI_Allreduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n int chunk_size = n / num_threads;\n int start = tid * chunk_size;\n int end = start + chunk_size;\n if (tid == num_threads - 1) {\n end = n;\n }\n\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n int chunk_size = n / num_ranks;\n int start = my_rank * chunk_size;\n int end = start + chunk_size;\n if (my_rank == num_ranks - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // Reduce the maximum subarray sum across all ranks\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // OpenMP parallel for loop\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n // MPI reduce to get the maximum subarray sum\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_max;\n } else {\n return 0;\n }\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for 
reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = std::min(start + chunk_size, n);\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n return global_max;\n}\n", + "\n\n int n = x.size();\n int max_so_far = 0;\n int max_ending_here = 0;\n\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &max_so_far, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return max_so_far;\n}\n", + "\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max : max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n\n // Iterate through the array\n #pragma omp parallel for reduction(max:max_so_far)\n for (int i = 0; i < n; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here)\n max_so_far = max_ending_here;\n\n if (max_ending_here < 0)\n max_ending_here = 0;\n }\n return max_so_far;\n}\n", + "\n\n int n = x.size();\n int max_so_far = INT_MIN, max_ending_here = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int nthreads = omp_get_num_threads();\n int chunk = n / nthreads;\n int start = tid * chunk;\n int end = (tid + 1) * chunk;\n if (tid == nthreads - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n max_ending_here = max_ending_here + x[i];\n if (max_so_far < max_ending_here) {\n max_so_far = max_ending_here;\n }\n if (max_ending_here < 0) {\n max_ending_here = 0;\n }\n }\n }\n\n int global_max;\n MPI_Reduce(&max_so_far, &global_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_max;\n }\n return 0;\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(Kokkos::View const& x, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n const int N = x.extent(0);\n parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = sum + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output);\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = output(i - 1) + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i, double& update) {\n update += x(i);\n output(i) = update;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int i, double& sum) {\n if (i == 0) {\n sum = x(i);\n } else {\n sum = sum + x(i);\n }\n output(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output(0));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "\n\n Kokkos::parallel_for(\"prefix_sum\", Kokkos::RangePolicy(0, output.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = x(i);\n } else {\n output(i) = output(i - 1) + x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const int i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n },\n output(0));\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n const size_t n = x.extent(0);\n output(0) = x(0);\n parallel_for(policy_type(1, n), KOKKOS_LAMBDA(const size_t i) {\n output(i) = output(i - 1) + x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"prefixSum\", output.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i, double& update) {\n if (i == 0) {\n update = x(i);\n } else {\n update = output(i - 1) + x(i);\n }\n }, output(0));\n}\n", + "Kok", + "\n\n const int N = x.extent(0);\n 
Kokkos::parallel_for(\"prefix_sum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n if (i == 0) {\n output[i] = x[i];\n } else {\n output[i] = output[i - 1] + x[i];\n }\n });\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n const size_t size = x.extent(0);\n output(0) = x(0);\n parallel_for(policy_type(1, size), KOKKOS_LAMBDA(const size_t i) {\n output(i) = output(i - 1) + x(i);\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"prefixSum\", Kokkos::Range(0, N), KOKKOS_LAMBDA (const int i) {\n if (i == 0) {\n output(i) = x(i);\n } else {\n output(i) = output(i - 1) + x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use OpenMP to compute in parallel.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n output[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "<", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n output.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n double sum = 0;\n for (int j = 0; j <= i; j++) {\n sum += x[j];\n }\n output[i] = sum;\n }\n}\n", + "\n\n #pragma omp 
parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "serial", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n output.resize(x.size());\n output[0] = x[0];\n for (std::size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0096776762, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.009417126 + } + ] + }, + { + "generated_output": "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0092216922, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0089753424 + } + ] + }, + { + "generated_output": "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0096526048, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0095806276 + } + ] + }, + { + "generated_output": "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0093630622, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091592646 + } + ] + }, + { + "generated_output": "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0093876824, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.009315777 + } + ] + }, + { + "generated_output": "<", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "<", + "source_write_success": true, + "did_build": false, + "is_source_valid": 
true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0094160338, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00924505 + } + ] + }, + { + "generated_output": "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.009776474, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0095173384 + } + ] + }, + { + "generated_output": "\n\n output[0] = x[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0094610944, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.009362559 + } + ] + }, + { + "generated_output": "\n\n output.resize(x.size());\n output[0] = x[0];\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0092747758, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0091114688 + } + ] + }, + { + "generated_output": "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0097181422, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0094699154 + } + ] + }, + { + "generated_output": "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0094870136, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0092509274 + } + ] + }, + { + "generated_output": "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0096938042, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.009475239 + } + ] + }, + { + "generated_output": "\n\n output.resize(x.size());\n output[0] = x[0];\n 
for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.009913023, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0098085504 + } + ] + }, + { + "generated_output": "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0097337074, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0094683342 + } + ] + }, + { + "generated_output": "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0094376002, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0093091406 + } + ] + }, + { + "generated_output": "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.009589947, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0093876402 + } + ] + }, + { + "generated_output": "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0094795694, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0093271666 + } + ] + }, + { + "generated_output": "\n\n output.resize(x.size());\n output[0] = x[0];\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0093891128, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0092556338 + } + ] + } + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "hip", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\n__global__ void prefixSum(const double* x, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n if (tid > 0) { output[tid] += output[tid - 1]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n \n if (gid < N) {\n output[gid] = x[gid];\n }\n \n __syncthreads();\n \n for (int stride = 1; stride < N; stride *= 2) {\n if (gid < N && gid + stride < N) {\n output[gid + stride] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bsize = blockDim.x;\n int stride = bsize * bid;\n int i = stride + tid;\n if (i < N) {\n output[i] = x[i];\n if (i > 0) { output[i] += output[i - 1]; }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (gid % (2 * i) == 0 && gid + i < N) {\n output[gid + i] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n output[idx] = x[idx];\n if (idx > 0) {\n 
output[idx] += output[idx - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the prefix sum of the vector x into output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Store the result in output on rank 0.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n double global_sum = 0;\n\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n output.push_back(global_sum);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n MPI_Allreduce(&local_sum, &output[i], 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n 
MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < x.size(); i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partialSum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n partialSum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partialSums(size);\n MPI_Gather(&partialSum, 1, MPI_DOUBLE, partialSums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partialSums[0];\n for (size_t i = 1; i < size; ++i) {\n output[i] = output[i - 1] + partialSums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0;\n for (int i = 0; i < x.size(); i++) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; i++) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // 
Compute the prefix sum on each rank\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Reduce the local sums to rank 0\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n for (int i = 1; i < size; ++i) {\n MPI_Reduce(output.data(), output.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allgather(&local_sum, 1, MPI_DOUBLE, &global_sum, 1, MPI_DOUBLE, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (int i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] + x[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n std::vector partial_sum(n);\n MPI_Allreduce(x.data(), partial_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n output = partial_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double local_sum = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n double global_sum;\n MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output.resize(x.size());\n output[0] = global_sum;\n for (size_t i = 1; i < x.size(); ++i) {\n output[i] = output[i - 1] - x[i - 1];\n }\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "cuda", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\n__global__ void prefixSum(const double *x, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (gid % (2 * i) == 0 && gid + i < N) {\n output[gid] += output[gid + i];\n }\n __syncthreads();\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n output[idx] = 0;\n for (int i = 0; i <= idx; i++) {\n output[idx] += x[i];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 2; stride <= N; stride *= 2) {\n if (tid % stride == 0 && tid + stride / 2 < N) {\n output[tid] += output[tid + stride / 2];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n // Compute the prefix sum of the first bid_size elements in x\n double sum = 0;\n for (int i = tid; i < bid_size; i++) {\n sum += x[i];\n }\n\n // Store the prefix sum in the output array\n output[tid] = sum;\n\n // Synchronize the threads in the block\n __syncthreads();\n\n // Compute the prefix sum of the remaining elements in x\n for (int i = bid_size; i < N; i += bid_size) {\n // Load the prefix sum of the previous block\n sum = output[i - bid_size];\n\n // Compute the prefix sum of the current block\n for (int j = tid; j < bid_size; j++) {\n sum += x[i + j];\n }\n\n // Store the prefix sum in the output array\n output[i + tid] = sum;\n\n // Synchronize the threads in the block\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n // Compute the prefix sum\n if (gid == 0) {\n output[0] = x[0];\n } else {\n output[gid] = output[gid - 1] + x[gid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n\n if (gid < N) {\n output[gid] = x[gid];\n }\n\n __syncthreads();\n\n for (int offset = 1; offset < N; offset *= 2) {\n if (gid + offset < N) {\n output[gid + offset] += output[gid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n 
__syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid % (2 * i) == 0 && tid + i < N) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n // Each thread computes the prefix sum of a subset of the elements in x.\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n\n // Compute the prefix sum of the current block\n double block_sum = 0;\n for (int i = tid; i < N; i += bid_size) {\n block_sum += x[i];\n output[i] = block_sum;\n }\n\n // Synchronize the threads in the block\n __syncthreads();\n\n // Compute the prefix sum of the previous blocks\n if (tid == 0) {\n for (int i = bid + 1; i < N; i += bid_size) {\n output[i] += output[i - 1];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int i = 1; i < N; i *= 2) {\n if (tid < N - i) {\n output[tid + i] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "(", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int size = blockDim.x;\n int stride = size * gridDim.x;\n int start = tid + bid * stride;\n double sum = 0;\n for (int i = start; i < N; i += stride) {\n sum += x[i];\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid % (2 * stride) == 0 && tid + stride < N) {\n output[tid] += output[tid + stride];\n }\n __syncthreads();\n }\n}\n", + "_t", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n output[tid] = x[tid];\n }\n __syncthreads();\n for (int stride = 1; stride < N; stride *= 2) {\n if (tid < N - stride) {\n output[tid + stride] += output[tid];\n }\n __syncthreads();\n }\n}\n" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the prefix sum of the vector x into output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. 
Store the result in output on rank 0.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+: local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Gather the local prefix sums\n std::vector global_sum(x.size(), 0);\n MPI_Allreduce(output.data(), global_sum.data(), x.size(), MPI_DOUBLE, MPI_SUM);\n\n // Store the global prefix sum on rank 0\n if (rank == 0) {\n output = global_sum;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n // Compute the local prefix sum\n std::vector local_prefix_sum(n);\n local_prefix_sum[0] = x[0];\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_prefix_sum[i] = local_prefix_sum[i - 1] + x[i];\n }\n // Compute the global prefix sum\n std::vector global_prefix_sum(n);\n MPI_Allreduce(local_prefix_sum.data(), global_prefix_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n // Store the result on rank 0\n if (rank == 0) {\n output = global_prefix_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n }\n\n for (int i = 1; i < size; i++) {\n MPI_Reduce(partial_sum.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Bcast(output.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n #pragma omp parallel for\n for (int j = 1; j < n; j++) {\n output[j] += output[j - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes the sum of its local data\n double local_sum = 0;\n #pragma omp parallel for reduction(+: local_sum)\n for (size_t i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n }\n\n // Gather the local sums into the output vector on rank 0\n MPI_Gather(&local_sum, 1, MPI_DOUBLE, output.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n \n // Rank 0 computes the prefix sum\n if (rank == 0) {\n for (size_t i = 1; i < size; ++i) {\n output[i] += output[i - 1];\n }\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the partial sum on each rank\n double partialSum = 0;\n #pragma omp parallel for reduction(+:partialSum)\n for (int i = 0; i < x.size(); i++) {\n partialSum += x[i];\n }\n\n // Gather the partial sums from all ranks\n std::vector partialSums(numRanks);\n MPI_Gather(&partialSum, 1, MPI_DOUBLE, partialSums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (myRank == 0) {\n output[0] = 0;\n for (int i = 1; i < numRanks; i++) {\n output[i] = output[i - 1] + partialSums[i - 1];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce local prefix sums to get global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct output\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce local prefix sums to global prefix sum\n MPI_Reduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n, 0);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n if (i > 0) {\n partial_sum[i] += partial_sum[i - 1];\n }\n }\n\n std::vector global_sum(n, 0);\n MPI_Allreduce(partial_sum.data(), 
global_sum.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n output = global_sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector local_output(n);\n\n // Compute the local prefix sum\n #pragma omp parallel for\n for (int i = 1; i < n; i++) {\n local_output[i] = local_output[i - 1] + x[i];\n }\n\n // Reduce the local prefix sums to get the global prefix sum\n MPI_Allreduce(local_output.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (rank == 0) {\n output[0] = x[0];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial sum on each rank\n double partial_sum = 0.0;\n #pragma omp parallel for reduction(+:partial_sum)\n for (int i = 0; i < x.size(); ++i) {\n partial_sum += x[i];\n }\n\n // Gather the partial sums on rank 0\n std::vector partial_sums(size);\n MPI_Gather(&partial_sum, 1, MPI_DOUBLE, partial_sums.data(), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the prefix sum on rank 0\n if (rank == 0) {\n output.resize(x.size());\n output[0] = partial_sums[0];\n for (int i = 1; i < size; ++i) {\n output[i] = output[i - 1] + partial_sums[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector partial_sum(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n partial_sum[i] = x[i];\n if (i > 0) {\n partial_sum[i] += partial_sum[i - 1];\n }\n }\n\n MPI_Reduce(partial_sum.data(), output.data(), n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0.0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); ++i) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), output.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local prefix sum\n double local_sum = 0;\n #pragma omp parallel for reduction(+:local_sum)\n for (int i = 0; i < x.size(); i++) {\n local_sum += x[i];\n output[i] = local_sum;\n }\n\n // Compute the global prefix sum\n MPI_Allreduce(MPI_IN_PLACE, output.data(), x.size(), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": 
"transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) *= -1;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n int &xi = x(i);\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n int &xi = x(i);\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n int val = x(i);\n if (val % 2 == 0) {\n x(i) = val / 2;\n } else {\n x(i) = -val;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) = x(i) / 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"negateOddsAndHalveEvens\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + 
"\n\n Kok koste::parallel_for(0, x.size(), KOKKOS_LAMBDA(const int i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) % 2 == 0) {\n x(i) /= 2;\n } else {\n x(i) = -x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "omp", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + ":", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma\u00a0omp\u00a0parallel\u00a0for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] *= -1;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "serial", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013851124, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013836018 + } + ] + }, + { + "generated_output": "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013563024, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013653496 + } + ] + }, + { + "generated_output": "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001371056, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013708936 + } + ] + }, + { + "generated_output": "negate, Synag", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013742802, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013743182 + } + ] + }, + { + "generated_output": "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013308344, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013907974 + } + ] + }, + { + "generated_output": "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013641048, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001350273 + } + ] + }, + { + "generated_output": "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n 
} else {\n xi = -xi;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013492632, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013582862 + } + ] + }, + { + "generated_output": "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013781052, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013676438 + } + ] + }, + { + "generated_output": "\n\n for (int &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013739758, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013845154 + } + ] + }, + { + "generated_output": "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013518622, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013791794 + } + ] + }, + { + "generated_output": "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013569394, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001388623 + } + ] + }, + { + "generated_output": "\n\n for (int &xi : x) {\n if (xi % 2 == 0) {\n xi /= 2;\n } else {\n xi = -xi;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013652772, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013866914 + } + ] + }, + { + "generated_output": "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013623578, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013578154 + } + ] + }, + { + "generated_output": "\n\n for (auto &v : x) {\n if (v % 2 == 0) {\n v /= 2;\n } else {\n v = -v;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001347694, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013835072 + } + ] + }, + { + "generated_output": "negate, Synag", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (int &i : x) {\n if (i % 2 == 0) {\n i /= 2;\n } else {\n i = -i;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013728498, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013783938 + } + ] + }, + { + "generated_output": "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013449612, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013697958 + } + ] + }, + { + "generated_output": "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0013500626, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013691002 + } + ] + }, + { + "generated_output": "\n\n for (int &val : x) {\n if (val % 2 == 0) {\n val /= 2;\n } else {\n val = -val;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001383279, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0013754984 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "hip", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\n__global__ void negateOddsAndHalveEvens(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipGetGlobalThreadIdx();\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0)\n x[tid] /= 2;\n else\n x[tid] = -x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n" + ] + }, + { + "problem_type": 
"transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "E", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0,\n MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0,\n MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[0], n / size, MPI_INT, &x[0], n / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Negate odd values\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n }\n }\n\n // Halve even values\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n }\n }\n\n // Gather results on rank 0\n if (rank == 0) {\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_INT, result.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n x = result;\n } else {\n MPI_Gather(nullptr, 0, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, 
MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), 
MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its portion of x.\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(x.data(), x.size(), MPI_INT, x.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data(), x.size(), MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its portion of the vector.\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "cuda", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\n__global__ void negateOddsAndHalveEvens(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0) {\n x[tid] /= 2;\n } else {\n x[tid] = -x[tid];\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] % 2 == 0) {\n x[idx] /= 2;\n } else {\n x[idx] = -x[idx];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] % 2 == 0)\n x[tid] /= 2;\n else\n x[tid] = -x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "56_transform_negate_odds", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* In the vector x negate the odd values and divide the even values by 2.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int n_local = n / nprocs;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], n_local, MPI_INT, &x[0], n_local, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its part of the vector\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], end - start, MPI_INT, &x[0], end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int local_size = x.size() / size;\n int remain = x.size() % size;\n int local_start = rank * local_size;\n int local_end = local_start + local_size;\n if (rank < remain) {\n local_end++;\n }\n\n#pragma omp parallel for\n for (int i = local_start; i < local_end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(MPI_IN_PLACE, local_end - local_start, MPI_INT, x.data(), local_end - local_start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int myRank, nProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &nProcs);\n\n int n = x.size();\n int chunkSize = (n + nProcs - 1) / nProcs;\n int begin = myRank * chunkSize;\n int end = std::min(begin + chunkSize, n);\n\n #pragma omp parallel for\n for (int i = begin; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[begin], end - begin, MPI_INT, &x[0], end - begin, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] = x[i] / 2;\n }\n }\n MPI_Gather(&x[start], chunk, MPI_INT, &x[0], chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the new values for its part of the vector\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 1) {\n x[i] = -x[i];\n } else {\n x[i] /= 2;\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[0], x.size() / size, MPI_INT, &x[0], x.size() / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int local_n = n / nprocs;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if 
(x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, n_local, MPI_INT, x.data(), n_local, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] % 2 == 0) {\n x[i] /= 2;\n } else {\n x[i] = -x[i];\n }\n }\n\n MPI_Gather(&x[start], chunk, MPI_INT, &x[0], chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "kokkos", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(Kokkos::View const& x, Kokkos::View &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKok\u96f9::ParallelFor(Kok\u96f9::Range(x.size()), [&](int i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t});\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const int& i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t});\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask[i] = isPowerOfTwo(x[i]);\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", mask.extent(0), KOKKOS_LAMBDA(const int i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n\n}\n", + "\n\n\tKokkos::parallel_for(\"mapPowersOfTwo\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n", + "\n\n\tKokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n\t\tmask(i) = isPowerOfTwo(x(i));\n\t});\n}\n", + ",", + "\n\n Kokkos::parallel_for(\"mapPowersOfTwo\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n mask(i) = isPowerOfTwo(x(i));\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "omp", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t#pragma omp parallel 
for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n\t#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\n#pragma omp parallel for\n\tfor (size_t i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "serial", + "prompt": "bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t mask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004556026, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0044932238 + } + ] + }, + { + "generated_output": "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = 
isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045381802, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045427108 + } + ] + }, + { + "generated_output": "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045741478, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045383504 + } + ] + }, + { + "generated_output": "\n\n\tmask.clear();\n\tmask.reserve(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045431338, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0056581278 + } + ] + }, + { + "generated_output": "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045641974, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.004516902 + } + ] + }, + { + "generated_output": "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045362084, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.005665279 + } + ] + }, + { + "generated_output": "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045473958, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0056925826 + } + ] + }, + { + "generated_output": "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045164934, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045399454 + } + ] + }, + { + "generated_output": "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045621752, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045060158 + } + ] + }, + { + "generated_output": "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = 
isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004562943, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045177978 + } + ] + }, + { + "generated_output": "\n\n\tmask.resize(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045041482, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.004485579 + } + ] + }, + { + "generated_output": "\n\n\tfor (int i = 0; i < x.size(); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045239832, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045282616 + } + ] + }, + { + "generated_output": "\n\n\tmask.clear();\n\tmask.reserve(x.size());\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045058956, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.005632191 + } + ] + }, + { + "generated_output": "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045155256, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.005654391 + } + ] + }, + { + "generated_output": "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045653196, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0056479828 + } + ] + }, + { + "generated_output": "\n\n\tmask.clear();\n\tfor (int i = 0; i < x.size(); i++) {\n\t mask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004530572, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.005621038 + } + ] + }, + { + "generated_output": "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045618846, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045311392 + } + ] + }, + { + "generated_output": "\n\n\tmask.clear();\n\tfor (int i 
= 0; i < x.size(); i++) {\n\t\tmask.push_back(isPowerOfTwo(x[i]));\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045271332, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0056112976 + } + ] + }, + { + "generated_output": "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045343188, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045435968 + } + ] + }, + { + "generated_output": "\n\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0044906648, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045228034 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "hip", + "prompt": "__device__ bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\n__global__ void mapPowersOfTwo(const int *x, bool *mask, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = get_global_id(0);\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * 
blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + ", 0, 9, 7, 15, 64, 3]", + ", 0, 9, 7, 15, 64, 3]", + "\n\n\tint idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "mpi", + "prompt": "#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored in mask on rank 0.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint chunk_size = (n + size - 1) / size;\n\tint start = rank * chunk_size;\n\tint end = std::min(start + chunk_size, n);\n\n\tstd::vector local_mask(end - start);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), end - start, MPI_BOOL, mask.data(), end - start, MPI_BOOL, 0);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tmask.resize(n);\n\n\tfor (int i = rank; i < n; i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), n / size, MPI_BOOL, mask.data(), n / size, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = start + local_n;\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_C_BOOL, mask.data(), local_n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\tint n = x.size();\n\tstd::vector 
localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tmask.resize(x.size());\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), mask.size(), MPI_C_BOOL, nullptr, 0, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\tif (rank == size - 1) {\n\t\tend = n;\n\t}\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_C_BOOL, mask.data(), local_n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tstd::vector localMask(x.size());\n\tfor (size_t i = 0; i < x.size(); i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tif (rank == 0) {\n\t\tmask = localMask;\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tstd::vector recvMask(x.size());\n\t\t\tMPI_Recv(&recvMask[0], x.size(), MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t\tfor (size_t j = 0; j < x.size(); j++) {\n\t\t\t\tmask[j] = mask[j] || recvMask[j];\n\t\t\t}\n\t\t}\n\t} else {\n\t\tMPI_Send(&localMask[0], x.size(), MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the result\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), mask.size(), MPI_BOOL, mask.data(), mask.size(), MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the mask\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t}\n\telse {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the mask\n\tfor (size_t i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tint local_n = n / size;\n\tint start = rank * local_n;\n\tint end = (rank + 1) * local_n;\n\n\tstd::vector local_mask(local_n);\n\tfor (int i = start; i < end; i++) {\n\t\tlocal_mask[i - start] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), local_n, MPI_BOOL, mask.data(), local_n, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "cuda", + "prompt": "__device__ bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\n__global__ void mapPowersOfTwo(const int *x, bool *mask, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = tid + bid * blockDim.x;\n\n\twhile (i < N) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t\ti += blockDim.x * gridDim.x;\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tmask[tid] = isPowerOfTwo(x[tid]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n", + "\n\n\tint idx = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (idx < N) {\n\t\tmask[idx] = isPowerOfTwo(x[idx]);\n\t}\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "59_transform_map_function", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nbool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored in mask on rank 0.\n Example:\n\n input: [8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own mask\n\tstd::vector local_mask(x.size());\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the local masks into the final mask on rank 0\n\tif (rank == 0) {\n\t\tmask.resize(x.size());\n\t}\n\tMPI_Gather(local_mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < static_cast(x.size()); ++i) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) 
{\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * x.size() / size], x.size() / size, MPI_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * x.size() / size], x.size() / size, MPI_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own mask\n\tstd::vector localMask(x.size());\n\t#pragma omp parallel for\n\tfor (int i = 0; i < x.size(); i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Rank 0 gathers all the local masks\n\tif (rank == 0) {\n\t\tmask.resize(x.size());\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tstd::vector recvMask(x.size());\n\t\t\tMPI_Recv(recvMask.data(), x.size(), MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t\tfor (int j = 0; j < x.size(); j++) {\n\t\t\t\tmask[j] |= recvMask[j];\n\t\t\t}\n\t\t}\n\t} else {\n\t\tMPI_Send(localMask.data(), x.size(), MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint n = x.size();\n\tmask.resize(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(mask.data(), n, MPI_BOOL, mask.data(), n, MPI_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its own part of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its local part of the mask\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector local_mask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocal_mask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(local_mask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < 
n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the result\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tMPI_Gather(mask.data(), x.size(), MPI_C_BOOL, mask.data(), x.size(), MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n\tint rank, size;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &size);\n\n\t// Each rank computes its portion of the result\n\t#pragma omp parallel for\n\tfor (int i = rank; i < x.size(); i += size) {\n\t\tmask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\t// Gather the results on rank 0\n\tif (rank == 0) {\n\t\tfor (int i = 1; i < size; i++) {\n\t\t\tMPI_Recv(&mask[i * mask.size() / size], mask.size() / size, MPI_C_BOOL, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n\t\t}\n\t} else {\n\t\tMPI_Send(&mask[rank * mask.size() / size], mask.size() / size, MPI_C_BOOL, 0, 0, MPI_COMM_WORLD);\n\t}\n}\n", + "\n\n\tint rank, nprocs;\n\tMPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\tMPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n\tint n = x.size();\n\tstd::vector localMask(n);\n\n\t#pragma omp parallel for\n\tfor (int i = 0; i < n; i++) {\n\t\tlocalMask[i] = isPowerOfTwo(x[i]);\n\t}\n\n\tMPI_Gather(localMask.data(), n, MPI_C_BOOL, mask.data(), n, MPI_C_BOOL, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"oneMinusInverse\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n x(i) = 1 - 1 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = 1.0 - 1.0 / x(i);\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "serial", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001636175, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00163136 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016485482, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016488288 + } + ] + }, + { + "generated_output": "\n\n for (double &element : x) {\n element = 1 - 1 / element;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001662723, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016443724 + } + ] + }, + { + "generated_output": "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + 
"did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016296186, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016434026 + } + ] + }, + { + "generated_output": "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001643509, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016427336 + } + ] + }, + { + "generated_output": "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016693194, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016477848 + } + ] + }, + { + "generated_output": "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016524774, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00162287 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016269578, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001651071 + } + ] + }, + { + "generated_output": "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016102504, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016452422 + } + ] + }, + { + "generated_output": "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016385958, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016281442 + } + ] + }, + { + "generated_output": "\n\n for (double &i : x) {\n i = 1 - 1 / i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016466566, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016529084 + } + ] + }, + { + "generated_output": "\n\n for (auto &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001629883, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016377278 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016319048, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001647256 + } + ] + }, + { + "generated_output": "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016455848, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016441502 + } + ] + }, + { + "generated_output": "\n\n for (double &xi : x) {\n xi = 1 - 1 / xi;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016510752, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001636169 + } + ] + }, + { + "generated_output": "\n\n for (double &val : x) {\n val = 1 - 1 / val;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016300592, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00162608 + } + ] + }, + { + "generated_output": "\n\n for (double &i : x) {\n i = 1 - 1 / i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001648398, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016365602 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016424128, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016479334 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016252666, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016267774 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0016121922, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016327742 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "hip", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\n__global__ void oneMinusInverse(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n x[gid] = 1 - 1 / x[gid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i] = 1 - 1 / x[i];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE,\n 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, gathered_x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], x.size() / size, MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i 
< n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, result.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = 1 - 1 / x[i];\n }\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(),\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE, NULL, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 has the final result\n if (rank == 0) {\n x.resize(x.size() * size);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], 
x.size() / size,\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the inverse of its elements\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[0], x.size() / size, MPI_DOUBLE, &x[0], x.size() / size,\n MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its part of the result\n for (size_t i = rank; i < x.size(); i += size) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "cuda", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as elements in x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\n__global__ void oneMinusInverse(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1.0 - 1.0 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = 1 - 1 / x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = 1.0 - 1.0 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) { x[tid] = 1.0 - 1.0 / x[tid]; }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = 1 - 1 / x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "57_transform_inverse_offset", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace every element of the vector x with 1-1/x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int local_size = x.size() / num_procs;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n if (rank == num_procs - 1) {\n end = x.size();\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n MPI_Reduce(x.data(), x.data(), x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank == size - 1) ? 
n : start + chunk_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * chunk_size;\n int end = (i == size - 1) ? n : start + chunk_size;\n MPI_Recv(&x[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_result(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_result.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = global_result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n std::vector global_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, global_x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = global_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n const int n = x.size();\n const int chunk = (n + size - 1) / size;\n const int start = rank * chunk;\n const int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; ++i) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] = 1 - 1 / x[i];\n }\n\n MPI_Gather(&x[start], chunk, MPI_DOUBLE, &x[0], chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"relu\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (x(i) < 0) {\n x(i) = 0;\n }\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. 
Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "serial", + "prompt": "/* Compute the ReLU function on every element of x. 
Elements less than zero become zero,\n while elements greater than zero stay the same.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027294886, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0227181886 + } + ] + }, + { + "generated_output": "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027752246, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.022773615 + } + ] + }, + { + "generated_output": "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027401766, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0226906126 + } + ] + }, + { + "generated_output": "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002727625, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0227546592 + } + ] + }, + { + "generated_output": "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027750182, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0227367656 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027453686, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0216994516 + } + ] + }, + { + "generated_output": "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027885694, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.022308744 + } + ] + }, + { + "generated_output": "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027956812, + "runs": [ + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.0225820004 + } + ] + }, + { + "generated_output": "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027877662, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.022772647 + } + ] + }, + { + "generated_output": "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027972842, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.022686601 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027760944, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0218693408 + } + ] + }, + { + "generated_output": "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027769038, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0226486418 + } + ] + }, + { + "generated_output": "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0028155566, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0227036292 + } + ] + }, + { + "generated_output": "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027999474, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.022618028 + } + ] + }, + { + "generated_output": "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00279839, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0226248068 + } + ] + }, + { + "generated_output": "\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0028011392, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0227768272 + } + ] + }, + { + "generated_output": "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + 
"did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027517506, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0226173708 + } + ] + }, + { + "generated_output": "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027281582, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0227041684 + } + ] + }, + { + "generated_output": "\n\n for (double &xi : x) {\n if (xi < 0) {\n xi = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027924192, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0226757508 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027864216, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0218904906 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "hip", + "prompt": "/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\n__global__ void relu(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = fmax(0.0, x[idx]);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? 
x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + ": [-1.8, 24.0, 1.2, 0.0, 5.1, 0.2, 4.5]", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n std::vector recv_buf(x.size());\n for (int i = 1; i < MPI_COMM_WORLD.size(); i++) {\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector temp(x.size());\n MPI_Recv(temp.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += temp[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < 
x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks to rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, 
MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n for 
(int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n MPI_Gather(&x[start], end - start, MPI_DOUBLE, &x[0], end - start, MPI_DOUBLE, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "cuda", + "prompt": "/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\n__global__ void relu(double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = fmax(x[tid], 0.0);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = fmax(0, x[tid]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = fmax(0, x[idx]);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = fmax(0.0, x[idx]);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n if (x[tid] < 0) {\n x[tid] = 0;\n }\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] < 0 ? 
0 : x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] > 0 ? x[idx] : 0;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = fmax(0, x[idx]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int index = blockIdx.x * blockDim.x + threadIdx.x;\n if (index < N) {\n x[index] = fmax(0, x[index]);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n if (x[idx] < 0) {\n x[idx] = 0;\n }\n }\n}\n", + "\n\n int tid = threadId.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid] = x[tid] > 0 ? x[tid] : 0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = fmax(0.0, x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "55_transform_relu", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the ReLU function on every element of x. Elements less than zero become zero,\n while elements greater than zero stay the same.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma opm parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Reduce(&x[0], &x[0], x.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int chunk_size = n / nprocs;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == nprocs - 1) {\n end = n;\n }\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, local_size, MPI_DOUBLE, x.data(), 
local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start_index = rank * local_size;\n int end_index = start_index + local_size;\n\n #pragma omp parallel for\n for (int i = start_index; i < end_index; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start_index], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n if (rank == size - 1) {\n end = x.size();\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(x.size());\n MPI_Recv(recv_buf.data(), x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n MPI_Gather(&x[start], chunk_size, 
MPI_DOUBLE, &x[0], chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_n, MPI_DOUBLE, &x[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the ReLU function on its portion of x\n #pragma omp parallel for\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * x.size() / size], x.size() / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * x.size() / size], x.size() / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the ReLU function on its own copy of x\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks onto rank 0\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 has the final result\n if (rank == 0) {\n x.resize(x.size() * size);\n MPI_Gatherv(nullptr, 0, MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the ReLU function on every element of x.\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf;\n MPI_Recv(&recv_buf, x.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv_buf.begin(), recv_buf.end());\n }\n } else {\n MPI_Send(&x, x.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(x.data() + start, 
chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the ReLU function on every element of x\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < MPI_COMM_WORLD; i++) {\n std::vector recv_buf(n);\n MPI_Recv(recv_buf.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] < 0) {\n x[i] = 0;\n }\n }\n\n MPI_Gather(&x[start], local_size, MPI_DOUBLE, &x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int& i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.size(), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * 
x(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) * x(i);\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, x.extent(0)), KOK_LAMBDA(int i) {\n x[i] = x[i] * x[i];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"squareEach\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) *= x(i);\n });\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n #pragma simd\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = x[i] * x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= x[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "serial", + "prompt": "/* Replace every element of x with the square of its value.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 
0.0010628602, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001002254 + } + ] + }, + { + "generated_output": "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010686068, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010238248 + } + ] + }, + { + "generated_output": ":", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010766762, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010263592 + } + ] + }, + { + "generated_output": "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010818218, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010191702 + } + ] + }, + { + "generated_output": "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0011028974, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001044349 + } + ] + }, + { + "generated_output": "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010894542, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010290908 + } + ] + }, + { + "generated_output": "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010782694, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010220174 + } + ] + }, + { + "generated_output": "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0011139404, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010334346 + } + ] + }, + { + "generated_output": "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001083485, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010497594 + } + ] + }, + { + "generated_output": "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "source_write_success": 
true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010892778, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010306052 + } + ] + }, + { + "generated_output": "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010758808, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010091152 + } + ] + }, + { + "generated_output": "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010898528, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010256522 + } + ] + }, + { + "generated_output": "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010892436, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010274456 + } + ] + }, + { + "generated_output": "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010751634, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009853302 + } + ] + }, + { + "generated_output": "\n for (int &i : x) {\n i *= i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010621948, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0009940428 + } + ] + }, + { + "generated_output": "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010839154, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010070894 + } + ] + }, + { + "generated_output": "\n\n for (int &xi : x) {\n xi = xi * xi;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001069613, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010170598 + } + ] + }, + { + "generated_output": "\n for (int &i : x) {\n i *= i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010691862, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001002344 + } + ] + }, + { + "generated_output": "\n\n for (int &i : x) {\n i *= i;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + 
"did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0010877854, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0010302326 + } + ] + } + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "hip", + "prompt": "/* Replace every element of x with the square of its value.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\n__global__ void squareEach(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Replace every element of x with the square of its value.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data(), n / size, MPI_INT, x.data(), n / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = rank; i < x.size(); i += size) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_Comm_world, &rank);\n\n // Get the number of processes\n int size;\n MPI_Comm_size(MPI_Comm_world, &size);\n\n // Each process squares its own elements\n for (int i = rank; i < x.size(); i += size) {\n x[i] = x[i] * x[i];\n }\n\n // Gather all the squared elements on rank 0\n MPI_Allgather(MPI_IN_PLACE, 0, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_Comm_world);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank squares its elements\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the squared elements\n std::vector squared;\n if (rank == 0) {\n squared.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, squared.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Rank 0 stores the final result\n if (rank == 0) {\n x = squared;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Status status;\n MPI_Request request;\n MPI_Irecv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (int i = rank; i < x.size(); i += size) {\n x[i] *= x[i];\n }\n MPI_Gather(x.data(), x.size() / size, MPI_INT, x.data(), x.size() / size,\n MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Square each element of x\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Gather all the squared elements on rank 0\n std::vector squared_x;\n if (rank == 0) {\n squared_x.resize(x.size() * 
MPI::COMM_WORLD.Get_size());\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, squared_x.data(), x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Copy the squared elements back to x on rank 0\n if (rank == 0) {\n x = squared_x;\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector temp(x.size());\n MPI_Recv(&temp[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += temp[j];\n }\n }\n }\n}\n", + "(", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n / size], n / size, MPI_INT, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * n / size], n / size, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n MPI_Gather(x.data(), n / size, MPI_INT, x.data(), n / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector tmp(n);\n MPI_Recv(tmp.data(), n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(x.data(), n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n for (int i = rank; i < n; i += size) {\n x[i] *= x[i];\n 
}\n\n MPI_Gather(&x[0], n / size, MPI_INT, &x[0], n / size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(x.size());\n MPI_Recv(&recv[0], x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "cuda", + "prompt": "/* Replace every element of x with the square of its value.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\n__global__ void squareEach(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= x[idx];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = thread_idx_1;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] = x[tid] * x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] = x[idx] * x[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n x[idx] *= 
x[idx];\n }\n}\n", + "\n int tid = threadIdx.x;\n if (tid < N) {\n x[tid] *= x[tid];\n }\n}\n" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Replace every element of x with the square of its value.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. The final result is stored on rank 0.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank squares its own elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv(n);\n MPI_Recv(&recv[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = start + chunk;\n if (rank == size - 1) {\n end = n;\n }\n\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, chunk, MPI_INT, x.data(), chunk, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = 
x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int chunk = (n + size - 1) / size;\n int start = rank * chunk;\n int end = std::min(start + chunk, n);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, end - start, MPI_INT, x.data(), end - start, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n int recv_buf[n];\n MPI_Recv(&recv_buf[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv_buf[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank squares its own elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector temp(n);\n MPI_Recv(&temp[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += temp[j];\n }\n }\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Each rank squares its elements\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < MPI_COMM_WORLD; i++) {\n std::vector recv;\n MPI_Recv(&recv, x.size(), MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank != 0) {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; 
i++) {\n std::vector recv(n);\n MPI_Recv(&recv[0], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n x[j] += recv[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Square each element of x in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] *= x[i];\n }\n\n // Gather the squared elements from all ranks to rank 0\n std::vector squared_x(x.size());\n MPI_Gather(x.data(), x.size(), MPI_INT, squared_x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = squared_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] *= x[i];\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * n], n, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], n, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_n, MPI_INT, &x[0], local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(&x[start], local_size, MPI_INT, &x[0], local_size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_size = x.size() / size;\n int start = my_rank * local_size;\n int end = start + local_size;\n if (my_rank == size - 1) {\n end = x.size();\n }\n#pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n MPI_Gather(&x[start], local_size, MPI_INT, &x[0], local_size, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i] *= x[i];\n }\n\n MPI_Gather(x.data() + start, local_n, MPI_INT, x.data(), local_n, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use Kokkos to parallelize. 
Assume Kokkos has already been initialized.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, Kokkos::View const& A, Kokkos::View const& x, double beta, Kokkos::View &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(i); j < A.extent(i+1); ++j) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(i); j > 0; --j) {\n y(i) += alpha * A(i, j - 1).value * x(A(i, j - 1).column);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i, double &y_i) {\n y_i = beta * y_i;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n y_i += alpha * A(j).value * x(A(j).column);\n }\n }\n }, y);\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n auto spvmv_lambda = KOKKOS_LAMBDA (const COOElement *a, double *y) {\n y[a->row] += a->value * alpha * x[a->column];\n };\n\n KOKKOS_PARALLEL_EXEC(KOKKOS_RANGE_POLICY(0, A.size), spvmv_lambda, A.data, y);\n\n KOKKOS_PARALLEL_FOR(KOKKOS_RANGE_POLICY(0, M), KOKKOS_LAMBDA(size_t i) {\n y[i] = y[i] * alpha + 
x[i] * alpha;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", M, KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.extent(0); j--; ) {\n if (A(j).row == i) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * x(A(j).column);\n }\n }\n y(i) = alpha * sum + beta * y(i);\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i, double &y_i) {\n y_i = beta * y_i;\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n y_i += alpha * A(j).value * x(A(j).column);\n }\n }\n }, y);\n}\n", + "\n\n Kokkos::parallel_for(\"spmv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n y(i) = beta * y(i);\n for (size_t j = A.row_map(i); j < A.row_map(i + 1); ++j) {\n y(i) += alpha * A(j).value * x(A(j).column);\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, M), [&](size_t i) {\n y[i] = alpha * Kok\u30b3\u30af::reduce(Kok\u30b3\u30af::Range(0, A.size()), 0.0, [&](size_t j, double sum) {\n if (A[j].row == i) {\n return sum + A[j].value * x[A[j].column];\n }\n return sum;\n }) + uyi;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use OpenMP to parallelize.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * 
x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for schedule(static)\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n y[i] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n 
}\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto const& e : A) {\n if (e.row < M && e.column < N) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] = y[i] * alpha + y[i] *beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto const& element : A) {\n y[element.row] += alpha * 
element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto const& e : A) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto const& e : A) {\n if (e.row < M && e.column < N) {\n y[e.row] += alpha * e.value * x[e.column];\n }\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + 
"did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format with sizeA elements.\n A has dimensions MxN, x has N values, and y has M values.\n Use AMD HIP to parallelize. The kernel will be launched with at least sizeA threads.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\n__global__ void spmv(double alpha, const COOElement *A, size_t sizeA, const double *x, double beta, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n size_t tid = hipThreadIdx_x;\n if (tid < M) {\n atomicMul(&y[tid], beta);\n }\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < M) {\n double dot = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == tid) {\n dot += A[i].value * x[A[i].column];\n }\n }\n y[tid] = alpha * dot + \u3079\u305f * y[tid];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[blockIdx.x], beta);\n}\n", + "\n\n size_t i = hipThreadIdx_x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n atomicAdd(&y[row], alpha * val * x[col]);\n }\n atomicMul(&y[hipThreadIdx_x], beta);\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = beta * y[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += alpha * A[i].value * x[A[i].column];\n }\n }\n y[row] = sum;\n }\n}\n", + "\nfor (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {\n double dot = 0.0;\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == i) {\n dot += 
A[j].value * x[A[j].column];\n }\n }\n y[i] = alpha * dot + \u3079\u305f * y[i];\n }\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[i], beta);\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double dot = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n dot += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * dot + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t block_size = hipBlockDim_x;\n size_t block_id = hipBlockIdx_x;\n size_t start = block_id * block_size + tid;\n size_t end = min(start + block_size, sizeA);\n\n for (size_t i = start; i < end; i++) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n\n y[tid] *= beta;\n}\n", + "\n\n size_t row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Use MPI to parallelize. Assume MPI has already been initialized.\n Every rank has a complete copy of A, x, and y. 
Store the result in y on rank 0.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the result\n for (size_t i = rank; i < M; i += size) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[i * M], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[rank * M], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per rank\n size_t elements_per_rank = A.size() / size;\n\n // Compute the starting index for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = beta * y[i];\n }\n\n // Compute the local portion of A*x\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local contribution to y\n for (size_t i = 0; i < A.size(); ++i) {\n if (A[i].row == rank) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(MPI_IN_PLACE, y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n for (size_t i = rank; i < M; i += size) {\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to the result on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; i++) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is 
responsible for\n size_t rows_per_rank = (M + size - 1) / size;\n\n // Compute the start and end rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, M);\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (auto const& element : A) {\n if (element.row >= start_row && element.row < end_row) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + ":", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the size of the local part of y\n size_t local_M = M / MPI::COMM_WORLD.Get_size();\n size_t start_row = local_M * MPI::COMM_WORLD.Get_rank();\n size_t end_row = start_row + local_M;\n\n // Compute the local part of y\n for (size_t i = start_row; i < end_row; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Gather the local parts of y to rank 0\n MPI::COMM_WORLD.Gather(&y[start_row], local_M, MPI::DOUBLE, &y[0], local_M, MPI::DOUBLE, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own partial sum\n std::vector partial_y(M, 0.0);\n for (auto const& element : A) {\n if (element.row < M / size * rank || element.row >= M / size * (rank + 1)) {\n continue;\n }\n partial_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce partial sums to rank 0\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 scales the result by beta\n if (rank == 0) {\n for (auto& value : y) {\n value *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n for (COOElement const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n}\n", + "\n\n // Compute the local portion of the result\n for (auto const& element : A) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the results from all ranks to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Scale the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (auto& value : y) {\n value *= beta;\n }\n }\n}\n", + "\n\n // COO format:\n // A[i] = {row, column, value}\n // where A[i] is the i-th non-zero element in the matrix\n\n // y = alpha*A*x + y\n std::iota(y.begin(), y.end(), 0);\n std::transform(y.begin(), y.end(), x.begin(), y.begin(),\n [alpha, &A](double y, double x) {\n double sum = 0;\n for (auto const& a : A) {\n sum += a.value * (alpha * x);\n }\n return y + sum;\n });\n\n // y = y + (beta*y)\n std::transform(y.begin(), y.end(), y.begin(),\n [beta](double y) {\n return y + (beta * y);\n });\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local contribution to y\n std::vector local_y(M, 0.0);\n for (auto const& element : A) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Add beta*y to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] += ampi * y[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements each rank is responsible for\n size_t elements_per_rank = A.size() / size;\n\n // Compute the starting index of the elements for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index of the elements for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local contributions to y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n for (size_t i = rank; i < M; i += size) {\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to the result on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Compute the number of elements per rank\n size_t num_elements_per_rank = A.size() / MPI::COMM_WORLD.Get_size();\n\n // Compute the starting index for this rank\n size_t start_index = num_elements_per_rank * MPI::COMM_WORLD.Get_rank();\n\n // Compute the ending index for this rank\n size_t end_index = start_index + num_elements_per_rank;\n\n // Initialize the local portion of y to beta*y\n std::vector local_y(M, beta * y[MPI::COMM_WORLD.Get_rank()]);\n\n // Perform the sparse matrix-vector multiplication for this rank\n for (size_t i = start_index; i < end_index; ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI::COMM_WORLD.Reduce(local_y.data(), y.data(), M, MPI::DOUBLE, MPI::SUM, 0);\n}\n", + "\n\n // Broadcast alpha, beta, M, and N to all ranks\n MPI_Bcast(&alpha, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&beta, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&M, 1, MPI_INT, 0, MPI_COMM_WORLD);\n MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Compute the number of elements per rank\n int num_elements_per_rank = A.size() / MPI_COMM_SIZE;\n\n // Scatter A to all ranks\n std::vector local_A(num_elements_per_rank);\n MPI_Scatter(A.data(), num_elements_per_rank, MPI_BYTE, local_A.data(), num_elements_per_rank, MPI_BYTE, 0, MPI_COMM_WORLD);\n\n // Compute the local contribution to y\n std::vector local_y(M, 0.0);\n for (const auto& element : local_A) {\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather the local 
contributions to y\n std::vector global_y(M);\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, global_y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Apply beta to y\n if (beta != 0.0) {\n for (auto& element : global_y) {\n element += beta * element;\n }\n }\n\n // Replace y with the global result\n y = std::move(global_y);\n}\n", + "\n\n // Initialize y on all ranks to beta*y\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute the local contribution of each rank to the result\n for (auto const& element : A) {\n if (element.row < M) {\n y[element.row] += alpha * element.value * x[element.column];\n }\n }\n\n // Reduce the local contributions to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format with sizeA elements.\n A has dimensions MxN, x has N values, and y has M values.\n Use CUDA to parallelize. The kernel will be launched with at least sizeA threads.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\n__global__ void spmv(double alpha, const COOElement *A, size_t sizeA, const double *x, double beta, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n y[blockIdx.x] *= beta;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n y[tid] *= beta;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (gid < sizeA) {\n size_t row = A[gid].row;\n size_t column = A[gid].column;\n double value = A[gid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n\n if (gid < M) {\n atomicAdd(&y[gid], beta * y[gid]);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n atomicMul(&y[threadIdx.x], beta);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n const COOElement &e = A[tid];\n y[e.row] += alpha * e.value * x[e.column];\n }\n y[threadIdx.x] *= beta;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + 
threadIdx.x;\n if (i < sizeA) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n atomicMul(&y[i], beta);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[threadIdx.x], beta);\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n atomicMul(&y[blockIdx.x], beta);\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (gid < sizeA) {\n const COOElement &a = A[gid];\n y[a.row] += alpha * a.value * x[a.column];\n }\n\n if (gid < M) {\n y[gid] *= beta;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&y[row], alpha * value * x[column]);\n }\n if (beta != 1.0) {\n y[threadIdx.x] *= beta;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n y[row] = alpha * sum + beta * y[row];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n y[row] += alpha * value * x[column];\n }\n y[threadIdx.x] *= beta;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n y[row] += alpha * value * x[column];\n }\n y[blockIdx.x] *= beta;\n}\n", + "{" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x 
has N values, and y has M values.\n Use MPI and OpenMP to parallelize. Assume MPI has already been initialized.\n Every rank has a complete copy of A, x, and y. Store the result in y on rank 0.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its own partial sum\n std::vector partial_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n partial_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the partial sums to rank 0\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 scales the result by beta\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the result\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (auto const& element : A) {\n if (element.row == i) {\n sum += element.value * x[element.column];\n }\n }\n y[i] = alpha * sum + beta * y[i];\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&y[0], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[0], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the sparse matrix that each rank is responsible for\n size_t num_elements_per_rank = A.size() / size;\n\n // Compute the starting index of the elements in the sparse matrix that each rank is responsible for\n size_t start_index = rank * num_elements_per_rank;\n\n // Compute the ending index of the elements in the sparse matrix that each rank is responsible for\n size_t end_index = (rank + 1) * num_elements_per_rank;\n\n // Compute the local portion of y that each rank is responsible for\n std::vector local_y(M);\n\n // Compute the local portion of y that each rank is responsible for\n #pragma omp parallel for\n for (size_t i = start_index; i < end_index; i++) {\n const COOElement& element = A[i];\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the local portions of y into the global y vector on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n y[e.row] += 
alpha * e.value * x[e.column];\n }\n}\n", + "\n\n // Initialize y on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::fill(y.begin(), y.end(), 0.0);\n }\n\n // Parallelize over rows of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n COOElement const& e = A[i];\n y[e.row] += alpha * e.value * x[e.column];\n }\n\n // Parallelize over elements of y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n\n // Gather y from all ranks to rank 0\n MPI::COMM_WORLD.Gather(y.data(), M, MPI::DOUBLE, y.data(), M, MPI::DOUBLE, 0);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of elements each rank is responsible for\n size_t elements_per_rank = A.size() / nprocs;\n\n // Compute the starting index for this rank\n size_t start_index = rank * elements_per_rank;\n\n // Compute the ending index for this rank\n size_t end_index = (rank + 1) * elements_per_rank;\n\n // Compute the local portion of y\n std::vector local_y(M, 0.0);\n\n // Perform the sparse matrix-vector multiplication in parallel\n #pragma omp parallel for\n for (size_t i = start_index; i < end_index; ++i) {\n COOElement element = A[i];\n local_y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n }\n}\n", + "\n\n // Compute the local portion of the result\n std::vector local_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n local_y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Apply beta to y on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < M; ++i) {\n y[i] = beta * y[i];\n }\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n y[A[i].row] += alpha * A[i].value * x[A[i].column];\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Calculate the number of rows per rank\n size_t num_rows_per_rank = (M + size - 1) / size;\n\n // Calculate the starting and ending rows for the current rank\n size_t start_row = rank * num_rows_per_rank;\n size_t end_row = std::min((rank + 1) * num_rows_per_rank, M);\n\n // Initialize the local portion of y to beta*y\n std::vector local_y(num_rows_per_rank, beta * y[rank]);\n\n // OpenMP parallelization\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n local_y[i - start_row] += alpha * A[j].value * x[A[j].column];\n }\n }\n }\n\n // Gather the local portions of y into the global y on rank 0\n MPI_Gather(local_y.data(), 
num_rows_per_rank, MPI_DOUBLE, y.data(), num_rows_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of rows per rank\n size_t num_rows_per_rank = (M + MPI_Comm_size(MPI_COMM_WORLD) - 1) / MPI_Comm_size(MPI_COMM_WORLD);\n\n // Compute the starting row for this rank\n size_t start_row = num_rows_per_rank * MPI_Comm_rank(MPI_COMM_WORLD);\n\n // Compute the ending row for this rank\n size_t end_row = std::min(num_rows_per_rank * (MPI_Comm_rank(MPI_COMM_WORLD) + 1), M);\n\n // Initialize the local portion of y to beta*y\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n y[i] = beta * y[i];\n }\n\n // Perform the sparse matrix-vector multiplication\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n\n if (row >= start_row && row < end_row) {\n y[row] += alpha * value * x[column];\n }\n }\n\n // Reduce the results from all ranks to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(MPI_IN_PLACE, y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / nprocs; i < (rank + 1) * M / nprocs; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local contribution to y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double local_y = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n local_y += alpha * element.value * x[element.column];\n }\n }\n y[i] = local_y;\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&y[0], M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[0], M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const 
COOElement& element = A[i];\n y[element.row] += alpha * element.value * x[element.column];\n }\n\n // Gather y from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local portion of y\n #pragma omp parallel for\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n // Reduce y to rank 0\n MPI_Reduce(y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_M = M / size;\n size_t start_row = rank * local_M;\n size_t end_row = start_row + local_M;\n\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n y[i] = beta * y[i];\n for (auto const& element : A) {\n if (element.row == i) {\n y[i] += alpha * element.value * x[element.column];\n }\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[i * local_M], local_M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&y[start_row], local_M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize y to beta*y\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n y[i] *= beta;\n }\n\n // Compute y += alpha*A*x\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n y[row] += alpha * value * x[column];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(Kokkos::View &A, Kokkos::View &X, Kokkos::View &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; k++) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::Range(0, M), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.extent(0); ++l) {\n if (A(l).row == i && A(l).column == k) {\n for (size_t m = 0; m < X.extent(0); ++m) {\n if (X(m).row == k && X(m).column == j) {\n sum += A(l).value * X(m).value;\n }\n }\n }\n }\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n // Create a parallel reduction to sum up the sparse matrix multiplication.\n auto policy = KOKKOS_RANGE_POLICY(0, M);\n auto sum_lambda = KOKKOS_LAMBDA(const size_t i, double** sum) {\n for (size_t j = 0; j < N; ++j) {\n sum[i][j] = 0.0;\n }\n };\n auto reduce_lambda = KOKKOS_LAMBDA(const size_t i, const size_t j, const COOElement a, const COOElement b, double** sum) {\n sum[a.row][j] += a.value * b.value;\n };\n auto view = KokKOS::parallel_reduce(\n \"Sparse matrix multiplication\", policy, sum_lambda, reduce_lambda, A, X);\n view.wait();\n\n // Copy the result to the output matrix.\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = view(i, j);\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k).value * X(k, j).value;\n }\n Y(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int &i) {\n for (size_t j = 0; j < N; j++) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; k++) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + ".", + "\n\n Kokkos::parallel_for(\"spmm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y(i, j) = 0;\n for (size_t k = 0; k < K; ++k) {\n Y(i, j) += A(i, k).value * X(k, j).value;\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::Range(0, M), [&](size_t i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i][j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i][j] += A[i][k].value * X[k][j].value;\n }\n }\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n 
#pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; k++) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + ",", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n sum += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n sum += A[l].value * X[m].value;\n break;\n }\n }\n break;\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n for (auto const& a : A) {\n if (a.row == i && a.column == k) {\n for (auto const& x : X) {\n if (x.row == k && x.column == j) {\n sum += a.value * x.value;\n }\n }\n }\n }\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[k].value * X[k].value;\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Y[i * N + j] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n Y[i * N + j] = 0;\n for (size_t k = 0; k < K; k++) {\n for (size_t l = 0; l < A.size(); l++) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); m++) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * X[m].value;\n }\n }\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n Y.resize(M * N, 0);\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n for (auto const& e : X) {\n if (A_map.count(e.row) > 0) {\n for (auto const& [column, value] : A_map[e.row]) {\n Y[e.column * M + column] += e.value * value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::unordered_map> row_map;\n for (auto const& element : A) {\n row_map[element.row].push_back(element.value);\n }\n for (auto const& element : X) {\n if (row_map.count(element.column)) {\n for (size_t i = 0; i < row_map[element.column].size(); ++i) {\n Y[element.row * N + i] += row_map[element.column][i] * element.value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1083108606, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0192031936 + } + ] + }, + { + "generated_output": "\n\n Y.resize(M * N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1086381846, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.019037975 + } + ] + }, + { + "generated_output": "\n\n Y.resize(M * N, 0);\n for (auto& e : A) {\n for (auto& f : X) {\n if (e.column == f.row) {\n Y[e.row * N + f.column] += e.value * f.value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1075040644, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0192689414 + } + ] + }, + { + "generated_output": "\n\n Y.resize(M*N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + 
"are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.108069242, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0190348272 + } + ] + }, + { + "generated_output": "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n Y.resize(M * N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1075112764, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0188581126 + } + ] + }, + { + "generated_output": "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot = 0;\n for (size_t k = 0; k < K; ++k) {\n dot += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = dot;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n Y.resize(M*N);\n std::fill(Y.begin(), Y.end(), 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1075415412, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0190659156 + } + ] + }, + { + "generated_output": "\n\n Y.assign(M * N, 0);\n for 
(COOElement e : A) {\n for (COOElement f : X) {\n if (e.column == f.row) {\n Y[e.row * N + f.column] += e.value * f.value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0776226934, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0190779444 + } + ] + }, + { + "generated_output": "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1082792552, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.019178872 + } + ] + }, + { + "generated_output": "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1081398228, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0190503742 + } + ] + }, + { + "generated_output": "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::unordered_map> A_row_major;\n for (auto const& e : A) {\n A_row_major[e.row][e.column] = e.value;\n }\n\n std::unordered_map> X_col_major;\n for (auto const& e : X) {\n X_col_major[e.column][e.row] = e.value;\n }\n\n Y.resize(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot_product = 0;\n for (size_t k = 0; k < K; ++k) {\n dot_product += A_row_major[i][k] * X_col_major[j][k];\n }\n Y[i * N + j] = dot_product;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": 
true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1079833002, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0135376482 + } + ] + }, + { + "generated_output": "\n\n Y.resize(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1082767846, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0192688354 + } + ] + }, + { + "generated_output": "\n\n std::unordered_map> A_map;\n for (auto const& e : A) {\n A_map[e.row][e.column] = e.value;\n }\n std::unordered_map> X_map;\n for (auto const& e : X) {\n X_map[e.row][e.column] = e.value;\n }\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A_map[i][k] * X_map[k][j];\n }\n Y[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format with sizeA elements.\n X is a sparse KxN matrix in COO format with sizeX elements. Y is a dense MxN matrix in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\n__global__ void spmm(const COOElement *A, size_t sizeA, const COOElement *X, size_t sizeX, double *Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t k = 0; k < K; k++) {\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == k) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n int i = tid + bid * bdim;\n if (i < sizeA) {\n size_t rowA = A[i].row;\n size_t colA = A[i].column;\n double valA = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == colA) {\n Y[rowA * N + X[j].row] += valA * X[j].value;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n int i = tid + bid * bdim;\n if (i < sizeA) {\n size_t rowA = A[i].row;\n size_t colA = A[i].column;\n double valA = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == colA) {\n Y[rowA * N + X[j].row] += valA * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipGridDim_x;\n int tid_per_block = hipBlockDim_x;\n\n for (int i = tid; i < sizeA; i += tid_per_block * bid_size) {\n int row = A[i].row;\n int col = A[i].column;\n double val = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column < K) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x * bid_size;\n\n for (size_t i = tid; i < sizeA; i += num_threads) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = 
blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n Y[row * N + col] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n for (size_t a = 0; a < sizeA; ++a) {\n if (A[a].row == row && A[a].column == k) {\n for (size_t x = 0; x < sizeX; ++x) {\n if (X[x].row == k && X[x].column == col) {\n Y[row * N + col] += A[a].value * X[x].value;\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].column) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bdim = blockDim.x;\n int gdim = gridDim.x;\n for (int i = tid + bid * bdim; i < sizeA; i += gdim * bdim) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double val = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n if (X[j].column == col) {\n Y[row * N + X[j].row] += val * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n int rowA = A[i].row;\n int colA = A[i].column;\n double valA = A[i].value;\n for (int j = 0; j < sizeX; j++) {\n int rowX = X[j].row;\n int colX = X[j].column;\n double valX = X[j].value;\n if (colA == rowX) {\n atomicAdd(&Y[rowA * N + colX], valA * valX);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x;\n\n for (size_t i = tid; i < sizeA; i += num_threads * bid_size) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == column) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; 
j++) {\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. Y is a dense MxN matrix in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and X. Store the result in Y on rank 0.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recv;\n MPI::COMM_WORLD.Recv(recv, M * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < M * N; ++j) {\n Y[j] += recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(Y.data(), M * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0.\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its local contribution to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the local contributions to Y on rank 0.\n if (rank == 0) {\n Y.resize(M * N);\n MPI_Gather(localY.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(localY.data(), M * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions 
to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ",", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the local portion of Y\n std::vector localY(M*N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row*N + x.column] += a.value * x.value;\n }\n }\n }\n \n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M*N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double dot_product = 0;\n for (size_t k = 0; k < K; k++) {\n dot_product += A[i * K + k].value * X[k * N + j].value;\n }\n Y[i * N + j] = dot_product;\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buffer(M * N);\n MPI_Recv(recv_buffer.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n Y[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(Y.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in each row of A and X\n std::vector A_row_counts(M, 0);\n std::vector X_row_counts(K, 0);\n for 
(auto const& element : A) {\n A_row_counts[element.row]++;\n }\n for (auto const& element : X) {\n X_row_counts[element.row]++;\n }\n\n // Compute the offsets for each row of A and X\n std::vector A_offsets(M, 0);\n std::vector X_offsets(K, 0);\n for (size_t i = 1; i < M; i++) {\n A_offsets[i] = A_offsets[i - 1] + A_row_counts[i - 1];\n }\n for (size_t i = 1; i < K; i++) {\n X_offsets[i] = X_offsets[i - 1] + X_row_counts[i - 1];\n }\n\n // Create a buffer to store the local results\n std::vector local_Y(M * N, 0.0);\n\n // Compute the local results\n for (auto const& element : A) {\n for (size_t j = X_offsets[element.column]; j < X_offsets[element.column] + X_row_counts[element.column]; j++) {\n local_Y[element.row * N + X[j].column] += element.value * X[j].value;\n }\n }\n\n // Reduce the local results to the global result\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of elements in the resulting matrix\n size_t num_elements = M * N;\n\n // Initialize the result matrix to zero\n Y.resize(num_elements, 0.0);\n\n // Scatter the result matrix to all ranks\n MPI_Scatter(Y.data(), num_elements, MPI_DOUBLE, Y.data(), num_elements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local part of the result matrix\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local results to the result matrix on rank 0\n MPI_Reduce(Y.data(), num_elements, MPI_DOUBLE, Y.data(), num_elements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double dot_product = 0;\n for (size_t k = 0; k < K; k++) {\n dot_product += A[i * K + k].value * X[k * N + j].value;\n }\n Y[i * N + j] = dot_product;\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buffer(M * N);\n MPI_Recv(recv_buffer.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n Y[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(Y.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.assign(M * N, 0);\n\n // Compute the partial sum on each rank\n std::vector partial_sum(M * N, 0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n partial_sum[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sum(M * N, 0);\n MPI_Allreduce(partial_sum.data(), global_sum.data(), M * N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n Y = global_sum;\n }\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n 
// Compute the matrix multiplication in parallel\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement& a = A[i];\n for (size_t j = 0; j < X.size(); j++) {\n const COOElement& x = X[j];\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y.\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0.\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n for (auto const& a : A) {\n for (auto const& x : X) {\n if (a.column == x.row) {\n localY[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format with sizeA elements.\n X is a sparse KxN matrix in COO format with sizeX elements. Y is a dense MxN matrix in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\n__global__ void spmm(const COOElement *A, size_t sizeA, const COOElement *X, size_t sizeX, double *Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n atomicAdd(&Y[row * N + X[j].column], A[i].value * X[j].value);\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= M) return;\n for (int i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (int j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[k].column == X[j].row) {\n Y[row * N + X[j].column] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < K; k++) {\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == row && A[j].column == k) {\n for (size_t l = 0; l < sizeX; l++) {\n if (X[l].row == k && X[l].column == i) {\n Y[row * N + i] += A[j].value * X[l].value;\n }\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int tx = threadIdx.x;\n int bx = blockIdx.x;\n int tid = tx + bx * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t i = 0; i < sizeX; i++) {\n if (X[i].column == column) {\n Y[row * N + X[i].row] += value * X[i].value;\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < N; i++) {\n Y[row * N + i] = 0;\n }\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (X[j].column == A[k].column) {\n Y[row * N + X[j].row] += A[k].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) 
{\n if (A[i].column == X[j].row && X[j].column == col) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n for (size_t k = 0; k < K; ++k) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (X[j].column == k) {\n Y[row * N + X[j].row] += value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t k = 0; k < K; ++k) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row && A[i].column == k) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (X[j].row == k && X[j].column == j) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + k] = sum;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n Y[row * N + col] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n Y[row * N + col] += A[i].value * X[j].value;\n }\n }\n }\n }\n}\n", + "\n\n size_t rowA = A[blockIdx.x].row;\n size_t columnA = A[blockIdx.x].column;\n double valueA = A[blockIdx.x].value;\n\n for (size_t i = 0; i < N; i++) {\n if (columnA == X[threadIdx.x].row) {\n Y[rowA * N + i] += valueA * X[threadIdx.x].value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; j++) {\n if (A[i].column == X[j].row) {\n atomicAdd(&Y[row * N + X[j].column], A[i].value * X[j].value);\n }\n }\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n double sum = 0;\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (A[i].column == X[j].row) {\n sum += A[i].value * X[j].value;\n }\n }\n }\n }\n Y[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n for (size_t j = 0; j < sizeX; ++j) {\n if (A[i].column == X[j].row) {\n Y[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and X. Store the result in Y on rank 0.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n for (size_t j = 0; j < X.size(); ++j) {\n if (X[j].row == column) {\n localY[row * N + X[j].column] += value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n if (rank == 0) {\n Y.resize(M * N);\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(localY.data(), nullptr, M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n }\n}\n", + ",", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localY[i * N + j] += A[i * K + k].value * X[k * N + j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "_t,", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 
0.0);\n\n // Compute the matrix multiplication in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n Y[i * N + j] += A[k].value * X[k].value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local row range for this rank\n size_t local_m = M / nprocs;\n size_t start_row = rank * local_m;\n size_t end_row = (rank == nprocs - 1) ? M : start_row + local_m;\n\n // Initialize the local part of Y to zero\n std::vector local_Y(local_m * N, 0.0);\n\n // Perform the matrix multiplication in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_Y[i * N + j] += A[i * K + k].value * X[k * N + j].value;\n }\n }\n }\n\n // Gather the local results into Y on rank 0\n MPI_Gather(local_Y.data(), local_m * N, MPI_DOUBLE, Y.data(), local_m * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector local_Y(M * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n local_Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of rows of A and X owned by each rank\n size_t nrows_A = A.size() / nprocs;\n size_t nrows_X = X.size() / nprocs;\n\n // Compute the starting row index of A and X owned by each rank\n size_t start_row_A = rank * nrows_A;\n size_t start_row_X = rank * nrows_X;\n\n // Compute the local portion of Y owned by each rank\n std::vector local_Y(M * N, 0.0);\n\n // Compute the local portion of Y owned by each rank\n #pragma omp parallel for\n for (size_t i = 0; i < nrows_A; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n if (A[start_row_A + 
i].column == X[start_row_X + k].row) {\n local_Y[A[start_row_A + i].row * N + j] += A[start_row_A + i].value * X[start_row_X + k].value;\n }\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(local_Y.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix-matrix product in parallel\n #pragma mpi_for_schedule(static)\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& a = A[i];\n for (size_t j = 0; j < X.size(); ++j) {\n const COOElement& x = X[j];\n if (a.column == x.row) {\n #pragma openmp atomic\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(MPI_IN_PLACE, Y.size(), MPI_DOUBLE, Y.data(), Y.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the partial sum for each row of Y on each rank\n std::vector partial_Y(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n partial_Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_Y(M * N, 0.0);\n MPI_Reduce(partial_Y.data(), global_Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Store the result in Y on rank 0\n if (rank == 0) {\n Y = global_Y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local contribution of this rank to Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local contributions to Y on rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n for (size_t j = 0; j < X.size(); ++j) {\n if (A[i].column == X[j].row) {\n Y[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(Y.data(), M * N, MPI_DOUBLE, Y.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize Y to zero\n Y.resize(M * N, 0.0);\n\n // Compute the matrix multiplication in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t k = 0; k < K; ++k) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t l = 0; l < A.size(); ++l) {\n if (A[l].row == i && A[l].column == k) {\n for (size_t m = 0; m < X.size(); ++m) {\n if (X[m].row == k && X[m].column == j) {\n Y[i * N + j] += A[l].value * 
X[m].value;\n }\n }\n }\n }\n }\n }\n }\n\n // Reduce the result to rank 0\n MPI_Reduce(MPI_IN_PLACE, Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of Y\n std::vector localY(M * N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n for (size_t j = 0; j < X.size(); j++) {\n if (A[i].column == X[j].row) {\n localY[A[i].row * N + X[j].column] += A[i].value * X[j].value;\n }\n }\n }\n\n // Reduce the local portions of Y to rank 0\n MPI_Reduce(localY.data(), Y.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(Kokkos::View &A, Kokkos::View &b, Kokkos::View &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View r(\"r\", N);\n Kokkos::View z(\"z\", N);\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n r[i] = b[i];\n z[i] = 0.0;\n });\n\n for (size_t iter = 0; iter < N; ++iter) {\n for (size_t i = 0; i < A.extent(0); ++i) {\n const auto &e = A[i];\n r[e.row] -= e.value * z[e.column];\n }\n for (size_t i = 0; i < A.extent(0); ++i) {\n const auto &e = A[i];\n z[e.column] += e.value * r[e.row];\n }\n }\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) { x[i] = r[i]; });\n}\n", + "\n\n Kokkos::parallel_for(\"solve_linear_system\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::View r(\"r\", N);\n Kokkos::View z(\"z\", N);\n Kokkos::parallel_for(\"init\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n r[i] = b[i];\n z[i] = 0.0;\n });\n for (size_t k = 0; k < A.extent(0); ++k) {\n const COOElement &e = A[k];\n r[e.column] -= e.value * z[e.row];\n }\n for (size_t k = 0; k < A.extent(0); ++k) {\n const COOElement &e = A[k];\n z[e.row] += e.value * r[e.column];\n }\n Kokkos::parallel_for(\"copy\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = z[i];\n });\n}\n", + "\n\n Kok\uf09e::View A_h(A.data(), A.extent(0));\n Kok\uf09e::View b_h(b.data(), b.extent(0));\n Kok\uf09e::View x_h(x.data(), x.extent(0));\n\n Kok\uf09e::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n x_h[i] = b_h[i];\n for (size_t j = A_h(i, 0); j < A_h(i, 1); ++j) {\n x_h[i] -= A_h(i, 2 + j) * x_h[A_h(i, 2 + j)];\n }\n x_h[i] /= A_h(i, 1);\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_coo\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, 
column_indices);\n\n // Solve the linear system\n Kokkos::View x_host(\"x_host\", N);\n Kokkos::parallel_for(\"solve\", N, KOKKOS_LAMBDA(size_t i) {\n x_host(i) = 0.0;\n });\n Kokkos::deep_copy(x, x_host);\n Kokkos::sparse_gmres(A_sparse, x, b, 1e-6, 100);\n\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View A_sparse(\"A_sparse\", N, N);\n Kokkos::parallel_for(\"init_A\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n A_sparse(i, j) = 0.0;\n }\n });\n Kokkos::parallel_for(\"fill_A\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n A_sparse(A(i).row, A(i).column) = A(i).value;\n });\n\n // Solve the linear system\n Kokkos::View x_temp(\"x_temp\", N);\n Kokkos::sparse::cg(A_sparse, b, x_temp, 1e-6, 100);\n\n // Copy the solution to x\n Kokkos::parallel_for(\"copy_x\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x(i) = x_temp(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n sum += A(j).value * b(A(j).column);\n }\n }\n x(i) = sum;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_COO\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::View A_sparse(\"A_sparse\", N, N);\n Kokkos::parallel_for(\"initialize_A\", N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; j++) {\n A_sparse(i, j) = 0.0;\n }\n });\n Kokkos::parallel_for(\"populate_A\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n A_sparse(row_indices(i), column_indices(i)) = values(i);\n });\n\n // Solve the linear system\n Kokkos::View x_temp(\"x_temp\", N);\n Kokkos::sparse::CrsMatrix A_crs(N, N, A_sparse);\n Kokkos::sparse::solve(\"CrsMatrix\", A_crs, b, x_temp);\n\n // Copy the result to x\n Kokkos::parallel_for(\"copy_result\", N, KOKKOS_LAMBDA(size_t i) {\n x(i) = x_temp(i);\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values = Kokkos::View(A.data(), A.extent(0));\n Kokkos::View row_indices = Kokkos::View(A.data() + 1, A.extent(0));\n Kokkos::View column_indices = Kokkos::View(A.data() + 2, A.extent(0));\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::SparseSolveCG(A_sparse, x, b, 1e-6, 1000);\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x(i) = b(i);\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x(A(j).column) -= A(j).value * x(i);\n }\n }\n x(i) /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View col_indices(\"col_indices\", A.extent(0));\n Kokkos::parallel_for(A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values[i] = A(i).value;\n 
row_indices[i] = A(i).row;\n col_indices[i] = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, col_indices);\n\n // Solve the linear system\n Kokkos::View x_copy(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::SparseSolveCG>(A_sparse, x_copy, b, 1e-6, 1000);\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(j).row == i) {\n x[A(j).column] -= A(j).value * x[i];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x[i] = b[i];\n for (size_t j = A.extent(0); j--;) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n using team_member = KokKOS::TeamMember;\n using team_view = KokKOS::TeamView;\n using team_scheduler = KokKOS::TeamScheduler;\n\n team_scheduler team(N);\n\n team.parallel_for(\n \"solve_linear_system\",\n KOKKOS_LAMBDA(const team_member member) {\n const size_t row = member.league_rank();\n double sum = 0;\n for (size_t i = 0; i < A.size(); ++i) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n });\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Kokkos::View values(\"values\", A.extent(0));\n Kokkos::View row_indices(\"row_indices\", A.extent(0));\n Kokkos::View column_indices(\"column_indices\", A.extent(0));\n Kokkos::parallel_for(\"copy_coo\", A.extent(0), KOKKOS_LAMBDA(size_t i) {\n values(i) = A(i).value;\n row_indices(i) = A(i).row;\n column_indices(i) = A(i).column;\n });\n Kokkos::SparseMatrix A_sparse(N, N, values, row_indices, column_indices);\n\n // Solve the linear system\n Kokkos::View x_copy(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n Kokkos::sparse_solve(A_sparse, x_copy, b);\n\n // Copy the solution back to x\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.extent(0); j++) {\n if (A(j).row == i) {\n x[i] -= A(j).value * x[A(j).column];\n }\n }\n x[i] /= A(i).value;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. 
x and b are dense vectors with N elements.\n Use OpenMP to compute in parallel.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + ".", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); j++) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n COOElement const& e = A[i];\n x[e.column] -= e.value * x[e.row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t k = 0; k < A.size(); ++k) {\n const COOElement& e = A[k];\n x[e.column] -= e.value * x[e.row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + 
"\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n x[i] -= A[j].value * x[A[j].column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] -= element.value * x[element.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] -= element.value * x[element.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n x[i] -= e.value * x[e.column];\n }\n }\n x[i] /= A[i].value;\n }\n}\n", + "\n\n x.resize(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n x[A[i].column] -= A[i].value * x[A[i].row];\n }\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n // Initialize x to 0\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] = 0;\n }\n\n // Iterate over the elements of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n // Add A[i].value * b[A[i].column] to x[A[i].row]\n x[A[i].row] += A[i].value * b[A[i].column];\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. 
x and b are dense vectors with N elements.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector y(N, 0);\n for (auto const& e : A) {\n y[e.column] += e.value * b[e.row] / row_sums[e.row];\n }\n x = y;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector r(N, 0);\n std::vector row_start(N + 1, 0);\n for (auto const& e : A) {\n row_start[e.row + 1]++;\n }\n for (size_t i = 1; i <= N; i++) {\n row_start[i] += row_start[i - 1];\n }\n for (auto const& e : A) {\n r[row_start[e.row]++] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = row_start[i]; j < row_start[i + 1]; j++) {\n sum += r[j] * x[j];\n }\n x[i] = (b[i] - sum) / r[row_start[i]];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector r(N, 0);\n for (auto const& e : A) {\n r[e.row] += e.value * x[e.column];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = (b[i] - r[i]) / A[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / rowSum[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector r(N, 0);\n for (auto const& e : A) {\n r[e.row] += e.value * b[e.column];\n }\n x.resize(N);\n for (size_t i = 0; i < N; ++i) {\n x[i] = r[i] / A[i].value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector row_sum(N, 0);\n for (const COOElement& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector y(N, 0);\n for (const COOElement& e : A) {\n y[e.row] += e.value * b[e.column] / row_sum[e.row];\n }\n x = y;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + 
"are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector diag(N, 0);\n std::vector offdiag(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] += e.value;\n } else {\n offdiag[e.row] += e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / (diag[i] - offdiag[i]);\n }\n for (auto const& e : A) {\n if (e.row != e.column) {\n x[e.column] -= e.value * x[e.row] / (diag[e.column] - offdiag[e.column]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector> A_dense(N, std::vector(N, 0));\n for (auto const& e : A) {\n A_dense[e.row][e.column] = e.value;\n }\n std::vector L(N, 0), U(N, 0);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n L[i] += A_dense[i][j] * U[j];\n }\n U[i] = b[i] - L[i];\n }\n x[N - 1] = U[N - 1] / A_dense[N - 1][N - 1];\n for (size_t i = N - 2; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_dense[i][j] * x[j];\n }\n x[i] = (U[i] - sum) / A_dense[i][i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector diag(N, 0);\n std::vector lower(N, 0);\n std::vector upper(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n } else if (e.row < e.column) {\n lower[e.row] += e.value;\n } else {\n upper[e.row] += e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n double d = diag[i];\n if (d == 0) {\n throw std::runtime_error(\"Matrix is singular\");\n }\n x[i] = (b[i] - lower[i] - upper[i]) / d;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sum[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector diag(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] / diag[i];\n for (auto const& e : A) {\n if (e.row == i && e.column != i) {\n x[e.column] -= e.value * x[i] / row_sums[e.column];\n }\n }\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector row_sum(N, 0);\n for (auto const& element : A) {\n row_sum[element.row] += element.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sum[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector diag(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n diag[e.row] = e.value;\n }\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i];\n for (auto const& e : A) {\n if (e.row == i) {\n for (size_t j = e.column + 1; j < N; j++) {\n x[j] -= e.value * x[e.column];\n }\n }\n }\n x[i] /= diag[i];\n }\n for (size_t i = N - 1; i > 0; i--) {\n for (auto const& e : A) {\n if (e.column == i) {\n for (size_t j = 0; j < e.row; j++) {\n x[j] -= e.value * x[e.row];\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1711445878, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1342470694 + } + ] + }, + { + "generated_output": "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n x[i] = b[i] / row_sums[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector rowSum(N, 0.0);\n for (COOElement const& e : A) {\n rowSum[e.row] += e.value;\n }\n std::vector y(N, 0.0);\n for (COOElement const& e : A) {\n y[e.row] += e.value * b[e.column] / rowSum[e.row];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] - y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector> A_dense(N, std::vector(N, 0));\n for (auto const& e : A) {\n A_dense[e.row][e.column] = e.value;\n }\n std::vector x_dense(N, 0);\n solveLinearSystemDense(A_dense, b, x_dense);\n x = x_dense;\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n\n std::vector y(N, 0);\n for (auto const& e : A) {\n y[e.column] += e.value * b[e.row] / row_sums[e.row];\n }\n\n x = y;\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector row_sums(N, 0);\n for (auto const& e : A) {\n row_sums[e.row] += e.value;\n }\n std::vector diag_inv(N, 0);\n for (size_t i = 0; i < N; ++i) {\n diag_inv[i] = 1.0 / row_sums[i];\n }\n for (size_t i = 0; i < N; ++i) {\n x[i] = b[i] * diag_inv[i];\n for (auto const& e : A) {\n if (e.row == i) continue;\n x[e.row] -= e.value * x[i] * diag_inv[e.row];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector row_sum(N, 0);\n for (auto const& e : A) {\n row_sum[e.row] += e.value;\n }\n std::vector x_tmp(N, 0);\n for (auto const& e : A) {\n x_tmp[e.column] += e.value * b[e.row] / row_sum[e.row];\n }\n x = x_tmp;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector diag(N, 0);\n std::vector offdiag(N, 0);\n std::vector rowptr(N + 1, 0);\n for (auto const& e : A) {\n rowptr[e.row + 1]++;\n if (e.row == e.column)\n diag[e.row] += e.value;\n else\n offdiag[e.row] += e.value;\n }\n for (size_t i = 0; i < N; i++) {\n rowptr[i + 1] += rowptr[i];\n }\n std::vector L(rowptr[N]);\n std::vector U(rowptr[N]);\n for (auto const& e : A) {\n if (e.row == e.column)\n continue;\n if (e.row < e.column)\n L[rowptr[e.row]++] = {e.row, e.column, e.value};\n else\n U[rowptr[e.column]++] = {e.row, e.column, e.value};\n }\n std::vector y(N, 0);\n for (size_t i = 0; i < N; i++) {\n y[i] = b[i];\n for (size_t j = rowptr[i]; j < rowptr[i + 1]; j++) {\n y[i] -= L[j].value * x[L[j].column];\n }\n y[i] /= diag[i];\n }\n for (size_t i = N - 1; i < N; i--) {\n x[i] = y[i];\n for (size_t j = rowptr[i + 1]; j < rowptr[i + 2]; j++) {\n x[i] -= U[j].value * x[U[j].column];\n }\n x[i] /= diag[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format with sizeA elements. x and b are dense vectors with N elements.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\n__global__ void solveLinearSystem(const COOElement *A, size_t sizeA, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t col = A[tid].column;\n double val = A[tid].value;\n atomicAdd(&x[col], val * b[row]);\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n x[i] = b[i];\n }\n __syncthreads();\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].column == i) {\n x[A[j].row] -= A[j].value * x[i];\n }\n }\n __syncthreads();\n if (i < N) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = b[i];\n }\n __syncthreads();\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == i) {\n x[A[k].column] -= A[k].value * x[i];\n }\n }\n __syncthreads();\n if (i < N) {\n x[i] /= A[i * N + i].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n atomicAdd(&x[column], value * b[row]);\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n 
for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + ".", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = 0;\n for (size_t j = 0; j < sizeA; j++) {\n if (A[j].row == i) {\n x[i] += A[j].value * b[A[j].column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n \n // Create a CSR matrix from the COO format\n std::vector row_offsets(N + 1, 0);\n std::vector column_indices;\n std::vector values;\n for (auto const& element : A) {\n row_offsets[element.row + 1]++;\n column_indices.push_back(element.column);\n values.push_back(element.value);\n }\n for (size_t i = 1; i < N + 1; i++) {\n row_offsets[i] += row_offsets[i - 1];\n }\n \n // Create a dense vector for the solution\n x.resize(N);\n \n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(values[row_offsets[j] + i]) > std::abs(values[row_offsets[pivot_row] + i])) {\n pivot_row = j;\n }\n }\n \n // Swap the pivot row with the current row\n if (pivot_row != i) {\n std::swap(row_offsets[pivot_row], row_offsets[i]);\n std::swap(column_indices[row_offsets[pivot_row] - 1], column_indices[row_offsets[i] - 1]);\n std::swap(values[row_offsets[pivot_row] - 1], values[row_offsets[i] - 1]);\n }\n \n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = values[row_offsets[j] + i] / values[row_offsets[i] + i];\n for (size_t k = i; k < N; k++) {\n values[row_offsets[j] + k] -= factor * values[row_offsets[i] + k];\n }\n b[j] -= factor * b[i];\n }\n }\n \n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= values[row_offsets[i] + j] * x[j];\n }\n x[i] /= values[row_offsets[i] + i];\n }\n \n // Gather the solution from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, comm);\n }\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a vector object from the dense vector\n Eigen::VectorXd vector_b(N);\n for (size_t i = 0; i < N; ++i) {\n vector_b(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd vector_x = matrix.ldlt().solve(vector_b);\n\n // Copy the result to the output vector\n for (size_t i = 0; i < N; ++i) {\n x[i] = vector_x(i);\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a dense vector from b\n Eigen::VectorXd b_vector(N);\n for (size_t i = 0; i < N; ++i) {\n b_vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_vector = matrix.ldlt().solve(b_vector);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_vector(i);\n }\n }\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n\n // Create a vector object 
from the dense vector\n Eigen::VectorXd vector_b(N);\n for (size_t i = 0; i < N; ++i) {\n vector_b(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd vector_x = matrix.ldlt().solve(vector_b);\n\n // Copy the result to the output vector\n for (size_t i = 0; i < N; ++i) {\n x[i] = vector_x(i);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a sparse matrix object\n MPI_Sparse_matrix A_mpi;\n MPI_Sparse_matrix_create_coo(MPI_COMM_WORLD, N, N, A.size(), &A[0], &A_mpi);\n\n // Create a dense vector object for b\n MPI_Vector b_mpi;\n MPI_Vector_create(MPI_COMM_WORLD, N, &b_mpi);\n MPI_Vector_set_values(b_mpi, N, &b[0]);\n\n // Create a dense vector object for x\n MPI_Vector x_mpi;\n MPI_Vector_create(MPI_COMM_WORLD, N, &x_mpi);\n\n // Solve the linear system\n MPI_Sparse_solve(A_mpi, b_mpi, x_mpi);\n\n // Get the result from rank 0\n if (rank == 0) {\n MPI_Vector_get_values(x_mpi, N, &x[0]);\n }\n\n // Free the MPI objects\n MPI_Sparse_matrix_free(&A_mpi);\n MPI_Vector_free(&b_mpi);\n MPI_Vector_free(&x_mpi);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Scatter b to all ranks\n std::vector local_b(b.begin() + rank * b.size() / size, b.begin() + (rank + 1) * b.size() / size);\n\n // Solve the local system Ax=b for x\n std::vector local_x(N, 0.0);\n for (auto const& e : A) {\n local_x[e.row] += e.value * local_b[e.column];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_x(N, 0.0);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n \n int *rows = new int[A.size()];\n int *cols = new int[A.size()];\n double *vals = new double[A.size()];\n for (size_t i = 0; i < A.size(); i++) {\n rows[i] = A[i].row;\n cols[i] = A[i].column;\n vals[i] = A[i].value;\n }\n \n // Create a sparse matrix object\n int nrows = N;\n int ncols = N;\n int nnz = A.size();\n int *ia = new int[nrows + 1];\n int *ja = new int[nnz];\n double *a = new double[nnz];\n \n // Convert the COO format to CSR format\n ia[0] = 0;\n for (int i = 0; i < nnz; i++) {\n ja[i] = cols[i];\n a[i] = vals[i];\n }\n for (int i = 1; i < nrows + 1; i++) {\n ia[i] = ia[i - 1];\n for (int j = ia[i - 1]; j < nnz; j++) {\n if (ja[j] >= i) {\n ia[i] = j;\n break;\n }\n }\n }\n \n // Create a parallel sparse matrix object\n Mat A_csr(nrows, ncols, nnz, ia, ja, a);\n \n // Create a parallel dense vector for b\n Vec b_vec(nrows);\n for (int i = 0; i < nrows; i++) {\n b_vec[i] = b[i];\n }\n \n // Create a parallel dense vector for x\n Vec x_vec(nrows);\n \n // Solve the linear system\n A_csr.solve(b_vec, x_vec);\n \n // Gather the solution from all ranks to rank 0\n if (rank == 0) {\n for (int i = 0; i < nrows; i++) {\n x[i] = x_vec[i];\n }\n }\n \n // Clean up\n delete[] rows;\n delete[] cols;\n delete[] vals;\n delete[] ia;\n delete[] ja;\n 
delete[] a;\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the number of elements in each row of A\n std::vector rowCounts(N, 0);\n for (auto const& e : A) {\n rowCounts[e.row]++;\n }\n\n // Compute the offsets of each row in the CSR format\n std::vector rowOffsets(N, 0);\n for (size_t i = 1; i < N; i++) {\n rowOffsets[i] = rowOffsets[i - 1] + rowCounts[i - 1];\n }\n\n // Convert A to CSR format\n std::vector colIndices(A.size());\n std::vector values(A.size());\n for (auto const& e : A) {\n colIndices[rowOffsets[e.row] + rowCounts[e.row] - 1] = e.column;\n values[rowOffsets[e.row] + rowCounts[e.row] - 1] = e.value;\n rowCounts[e.row]--;\n }\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivotRow = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(values[rowOffsets[j] + rowCounts[j] - 1]) > std::abs(values[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1])) {\n pivotRow = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivotRow != i) {\n std::swap(rowOffsets[i], rowOffsets[pivotRow]);\n std::swap(rowCounts[i], rowCounts[pivotRow]);\n std::swap(values[rowOffsets[i] + rowCounts[i] - 1], values[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1]);\n std::swap(colIndices[rowOffsets[i] + rowCounts[i] - 1], colIndices[rowOffsets[pivotRow] + rowCounts[pivotRow] - 1]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = values[rowOffsets[j] + rowCounts[j] - 1] / values[rowOffsets[i] + rowCounts[i] - 1];\n for (size_t k = rowOffsets[j] + rowCounts[j] - 1; k >= rowOffsets[j]; k--) {\n values[k] -= factor * values[rowOffsets[i] + rowCounts[i] - 1 + k - rowOffsets[j]];\n colIndices[k] = colIndices[rowOffsets[i] + rowCounts[i] - 1 + k - rowOffsets[j]];\n }\n rowCounts[j] = rowCounts[i] - 1;\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = rowOffsets[i] + rowCounts[i] - 1; j < rowOffsets[i] + rowCounts[i]; j++) {\n x[i] -= values[j] * x[colIndices[j]];\n }\n x[i] /= values[rowOffsets[i] + rowCounts[i] - 1];\n }\n\n // Gather the solution on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N / size], N / size, MPI_DOUBLE, i, 0, comm, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N / size], N / size, MPI_DOUBLE, 0, 0, comm);\n }\n}\n", + "\n\n // Create a new MPI datatype for COOElement\n MPI_Datatype coo_element_type;\n MPI_Type_contiguous(3, MPI_DOUBLE, &coo_element_type);\n MPI_Type_commit(&coo_element_type);\n\n // Scatter the COO matrix and b vector to all ranks\n std::vector local_A(A.size() / MPI::COMM_WORLD.Get_size());\n std::vector local_b(b.size() / MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Scatter(A.data(), A.size(), coo_element_type, local_A.data(), local_A.size(), coo_element_type, 0);\n MPI::COMM_WORLD.Scatter(b.data(), b.size(), MPI_DOUBLE, local_b.data(), local_b.size(), MPI_DOUBLE, 0);\n\n // Solve the local linear system\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (auto& element : local_A) {\n if (element.row == i) {\n sum += element.value * local_b[element.column];\n }\n }\n local_x[i] = sum;\n }\n\n // Gather the local solutions to rank 0\n MPI::COMM_WORLD.Gather(local_x.data(), local_x.size(), MPI_DOUBLE, x.data(), x.size(), MPI_DOUBLE, 0);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, 
size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the number of elements per rank\n size_t num_elements_per_rank = A.size() / size;\n // Compute the starting index of the elements for this rank\n size_t start_index = rank * num_elements_per_rank;\n // Compute the ending index of the elements for this rank\n size_t end_index = (rank + 1) * num_elements_per_rank;\n\n // Create a local copy of the elements for this rank\n std::vector local_A(A.begin() + start_index, A.begin() + end_index);\n\n // Create a local copy of the vector b for this rank\n std::vector local_b(b.begin() + start_index, b.begin() + end_index);\n\n // Create a local copy of the vector x for this rank\n std::vector local_x(N);\n\n // Solve the local linear system\n for (size_t i = 0; i < local_A.size(); i++) {\n const COOElement& element = local_A[i];\n local_x[element.row] += element.value * local_b[element.column];\n }\n\n // Reduce the local solutions to the global solution on rank 0\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local row indices and values for this rank\n std::vector local_rows;\n std::vector local_values;\n for (auto const& element : A) {\n if (element.row / N == rank) {\n local_rows.push_back(element.row % N);\n local_values.push_back(element.value);\n }\n }\n\n // Create a sparse matrix in CSR format\n std::vector csr_row_offsets(size + 1);\n std::vector csr_column_indices(A.size());\n std::vector csr_values(A.size());\n MPI_Allgather(&local_rows[0], local_rows.size(), MPI_UNSIGNED_LONG, &csr_column_indices[0], local_rows.size(), MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n MPI_Allgather(&local_values[0], local_values.size(), MPI_DOUBLE, &csr_values[0], local_values.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n for (size_t i = 0; i < size; ++i) {\n csr_row_offsets[i + 1] = csr_row_offsets[i] + local_rows.size();\n }\n\n // Solve the linear system using the CSR matrix\n std::vector local_x(N);\n if (rank == 0) {\n x.resize(N);\n }\n sparse_solve(csr_row_offsets, csr_column_indices, csr_values, b, local_x);\n\n // Gather the solution from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the local number of rows and the local row indices\n size_t local_N = N / size;\n size_t start_row = rank * local_N;\n size_t end_row = (rank + 1) * local_N;\n\n // Compute the local contribution to the solution\n std::vector local_x(local_N);\n for (size_t i = start_row; i < end_row; ++i) {\n local_x[i - start_row] = b[i];\n for (COOElement const& e : A) {\n if (e.row == i) {\n local_x[i - start_row] -= e.value * x[e.column];\n }\n }\n local_x[i - start_row] /= A[i].value;\n }\n\n // Gather the local solutions into the global solution\n MPI_Gather(local_x.data(), local_N, MPI_DOUBLE, x.data(), local_N, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n // Create a matrix object from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a vector object from the dense vector\n Eigen::VectorXd vector(N);\n for (size_t i = 0; i < N; ++i) {\n vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd result = 
matrix.ldlt().solve(vector);\n\n // Copy the result to the output vector\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = result(i);\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n int nrows = N / size + (rank < N % size);\n std::vector localA(A.begin() + rank * (N / size), A.begin() + rank * (N / size) + nrows);\n std::vector localb(b.begin() + rank * (N / size), b.begin() + rank * (N / size) + nrows);\n std::vector localx(nrows);\n \n // Solve the local system\n for (size_t i = 0; i < nrows; i++) {\n double sum = 0;\n for (size_t j = 0; j < localA.size(); j++) {\n if (localA[j].row == i) {\n sum += localA[j].value * localx[localA[j].column];\n }\n }\n localx[i] = (localb[i] - sum) / localA[i].value;\n }\n \n // Gather the local solutions\n MPI_Gather(localx.data(), nrows, MPI_DOUBLE, x.data(), nrows, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the local part of the matrix\n size_t local_size = A.size() / size;\n\n // Compute the local part of the matrix\n std::vector local_A(A.begin() + rank * local_size, A.begin() + (rank + 1) * local_size);\n\n // Compute the local part of the vector b\n std::vector local_b(b.begin() + rank * local_size, b.begin() + (rank + 1) * local_size);\n\n // Compute the local part of the solution vector\n std::vector local_x(N);\n\n // Solve the local linear system\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (auto& element : local_A) {\n if (element.column == i) {\n sum += element.value * local_x[element.row];\n }\n }\n local_x[i] = (local_b[i] - sum) / local_A[i].value;\n }\n\n // Gather the local solutions into the global solution vector\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the number of elements per rank\n size_t elementsPerRank = A.size() / size;\n \n // Compute the starting index of the elements for this rank\n size_t startIndex = rank * elementsPerRank;\n \n // Compute the ending index of the elements for this rank\n size_t endIndex = (rank + 1) * elementsPerRank;\n \n // Create a local copy of the elements for this rank\n std::vector localA(A.begin() + startIndex, A.begin() + endIndex);\n \n // Create a local copy of the vector b for this rank\n std::vector localb(b.begin() + startIndex, b.begin() + endIndex);\n \n // Create a local copy of the vector x for this rank\n std::vector localx(N);\n \n // Solve the local linear system\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < localA.size(); j++) {\n if (localA[j].column == i) {\n sum += localA[j].value * localb[j];\n }\n }\n localx[i] = sum;\n }\n \n // Gather the local solutions into the global solution\n MPI_Gather(localx.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_x(N, 0.0);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_x.data(), x.data(), N, 
MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements in the local part of the matrix\n size_t local_size = A.size() / size;\n size_t start_index = rank * local_size;\n size_t end_index = (rank + 1) * local_size;\n\n // Compute the local part of the matrix-vector product\n std::vector local_Ax(N, 0.0);\n for (size_t i = start_index; i < end_index; ++i) {\n local_Ax[A[i].column] += A[i].value * b[A[i].row];\n }\n\n // Reduce the local results to the global result\n MPI_Allreduce(local_Ax.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Rank 0 stores the final result\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] /= A[i].value;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Each rank computes its part of the result\n std::vector local_x(N);\n for (auto const& element : A) {\n if (element.row == rank) {\n local_x[element.column] += element.value * b[element.row];\n }\n }\n \n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format with sizeA elements. x and b are dense vectors with N elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\n__global__ void solveLinearSystem(const COOElement *A, size_t sizeA, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int row = tid / N;\n int col = tid % N;\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column == col) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[tid].value;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n // Each thread computes one element of x.\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = A[row].column[col]; k < A[row].column[col + 1]; ++k) {\n sum += A[row].value[k] * x[A[row].column[k]];\n }\n b[row] = (b[row] - sum) / A[row].value[A[row].column[col + 1] - 1];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; ++i) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row * N + row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n 
x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * b[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n double sum = 0;\n for (int k = 0; k < size; ++k) {\n if (A[k].row == row && A[k].column == col) {\n sum += A[k].value * x[A[k].column];\n }\n }\n x[row] = (b[row] - sum) / A[size - 1].value;\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int tid_offset = tid + bid * blockDim.x;\n if (tid_offset < sizeA) {\n size_t row = A[tid_offset].row;\n size_t column = A[tid_offset].column;\n double value = A[tid_offset].value;\n atomicAdd(&x[column], value * b[row]);\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = b[row];\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] -= A[i].value * x[A[i].column];\n }\n }\n x[row] /= A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = tid + bid * blockDim.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n sum += A[i].value * x[A[i].column];\n }\n }\n x[row] = (b[row] - sum) / A[row].value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n x[row] = 0.0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n x[row] += A[i].value * b[A[i].column];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. x and b are dense vectors with N elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "&", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the products of A and b for each row\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value * b[A[i].column];\n }\n\n // Gather the row sums to rank 0\n std::vector globalRowSums(N);\n MPI_Allgather(rowSums.data(), N, MPI_DOUBLE, globalRowSums.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = globalRowSums[i];\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n matrix.makeCompressed();\n\n // Create a dense vector for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; ++i) {\n b_eigen(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_eigen = matrix.ldlt().solve(b_eigen);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_eigen(i);\n }\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix A_sparse(N, N);\n for (auto const& e : A) {\n A_sparse.insert(e.row, e.column) = e.value;\n }\n A_sparse.makeCompressed();\n\n // Create dense vectors for x and b\n Eigen::VectorXd x_dense(N), b_dense(N);\n for (size_t i = 0; i < N; ++i) {\n x_dense[i] = x[i];\n b_dense[i] = b[i];\n }\n\n // Solve the linear system using Eigen\n x_dense = A_sparse.colPivHouseholderQr().solve(b_dense);\n\n // Copy the result back to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_dense[i];\n }\n }\n}\n", + "\n\n // Get the number of processes and the rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per process\n int elementsPerProcess = N / size;\n\n // Create a buffer to store the local part of A and b\n std::vector localA(elementsPerProcess);\n std::vector localb(elementsPerProcess);\n\n // Scatter A and b to all processes\n MPI_Scatter(A.data(), elementsPerProcess, MPI_DOUBLE, localA.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(b.data(), elementsPerProcess, MPI_DOUBLE, localb.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Create a local copy of x\n std::vector localx(N, 0.0);\n\n // Perform the Gaussian elimination in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n // Find the pivot element\n int pivotRow = i;\n double maxPivot = std::abs(localA[i].value);\n for (int j = i + 1; j < elementsPerProcess; j++) {\n if (std::abs(localA[j].value) > maxPivot) {\n pivotRow = j;\n maxPivot = std::abs(localA[j].value);\n }\n }\n\n // Swap the pivot row with the first row\n std::swap(localA[i], localA[pivotRow]);\n std::swap(localb[i], localb[pivotRow]);\n\n // Eliminate the non-zero elements below the pivot\n for (int j = i + 1; j < elementsPerProcess; j++) {\n double factor = localA[j].value / localA[i].value;\n localA[j].value -= 
factor * localA[i].value;\n localb[j] -= factor * localb[i];\n }\n }\n\n // Gather the local solutions into a global solution\n MPI_Gather(localx.data(), elementsPerProcess, MPI_DOUBLE, x.data(), elementsPerProcess, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform the back substitution to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (int j = i + 1; j < N; j++) {\n sum += localA[j].value * x[j];\n }\n x[i] = (localb[i] - sum) / localA[i].value;\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the local contribution of each rank to the solution\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column];\n }\n\n // Reduce the local contributions to the global solution on rank 0\n MPI_Reduce(x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0);\n\n // Compute the sum of the products of A and x for each row\n std::vector rowSums(N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value * x[A[i].column];\n }\n\n // Gather the row sums to rank 0\n std::vector globalRowSums(N);\n MPI_Allgather(rowSums.data(), N, MPI_DOUBLE, globalRowSums.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Solve the system of equations on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = (b[i] - globalRowSums[i]) / A[i].value;\n }\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Iterate over the elements of A\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement& element = A[i];\n x[element.row] += element.value * b[element.column];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(&rowSums[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n\n // Gather x from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "<}", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Step 1: Distribute the rows of A and b to each rank\n std::vector localA;\n std::vector localb;\n distributeRows(A, b, localA, localb, rank, size, N);\n\n // Step 2: Solve the local system on each rank\n std::vector localx(N, 0.0);\n for (auto& element : localA) {\n localx[element.row] += element.value * localb[element.column];\n }\n\n // Step 3: Gather the local solutions to rank 0\n MPI_Gather(localx.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(rowSums.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if 
(MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n}\n", + "\n\n // Initialize x to 0\n x.resize(N);\n std::fill(x.begin(), x.end(), 0.0);\n\n // Perform Gauss-Seidel iteration\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i) {\n sum += A[j].value * x[A[j].column];\n }\n }\n x[i] = (b[i] - sum) / A[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.resize(N);\n std::fill(x.begin(), x.end(), 0.0);\n\n // Perform Gauss-Jordan elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i].value) > std::abs(A[pivot_row * N + i].value)) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the ith row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the ith column\n double pivot = A[i * N + i].value;\n for (size_t j = 0; j < N; ++j) {\n if (j != i) {\n double factor = A[j * N + i].value / pivot;\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k].value -= factor * A[i * N + k].value;\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j].value * x[j];\n }\n x[i] /= A[i * N + i].value;\n }\n\n // Gather the solution on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n MPI::COMM_WORLD.Gather(&x[0], N, MPI::DOUBLE, &x[0], N, MPI::DOUBLE, 0);\n } else {\n MPI::COMM_WORLD.Gather(&x[0], N, MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix matrix(N, N);\n for (auto const& element : A) {\n matrix.insert(element.row, element.column) = element.value;\n }\n\n // Create a dense vector for b\n Eigen::VectorXd b_vector(N);\n for (size_t i = 0; i < N; ++i) {\n b_vector(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_vector = matrix.ldlt().solve(b_vector);\n\n // Copy the result to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_vector(i);\n }\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0);\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (auto const& element : A) {\n if (element.row == i) {\n x[i] += element.value * b[element.column];\n }\n }\n }\n\n // Gather the results to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the local contribution of each rank to the solution\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n const COOElement& e = A[i];\n x[e.row] += e.value * b[e.column];\n }\n\n // Reduce the local contributions to the global solution on rank 0\n MPI_Reduce(x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a sparse matrix from the COO format\n Eigen::SparseMatrix A_sparse(N, N);\n for (auto const& element : A) {\n A_sparse.insert(element.row, 
element.column) = element.value;\n }\n\n // Create dense vectors for x and b\n Eigen::VectorXd x_dense(N), b_dense(N);\n for (size_t i = 0; i < N; ++i) {\n x_dense[i] = x[i];\n b_dense[i] = b[i];\n }\n\n // Solve the linear system\n Eigen::SimplicialLDLT> solver;\n solver.compute(A_sparse);\n x_dense = solver.solve(b_dense);\n\n // Copy the result back to x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < N; ++i) {\n x[i] = x_dense[i];\n }\n }\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Compute the sum of the values in each row of A\n std::vector rowSums(N, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n rowSums[A[i].row] += A[i].value;\n }\n\n // Broadcast rowSums to all ranks\n MPI_Bcast(rowSums.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (size_t i = 0; i < A.size(); ++i) {\n x[A[i].row] += A[i].value * b[A[i].column] / rowSums[A[i].row];\n }\n }\n\n // Gather x from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Perform Gauss-Jordan elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i].value) > std::abs(A[pivot_row * N + i].value)) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j].value, A[pivot_row * N + j].value);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n double pivot = A[i * N + i].value;\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j].value /= pivot;\n }\n b[i] /= pivot;\n for (size_t j = 0; j < N; ++j) {\n if (j != i) {\n double factor = A[j * N + i].value;\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k].value -= factor * A[i * N + k].value;\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(Kokkos::View &A, Kokkos::View &L, Kokkos::View &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Iterate over the rows of A\n Kokkos::parallel_for(A.extent(0), KOKKOS_LAMBDA(size_t i) {\n // Get the row of A\n const COOElement* row = &A(i);\n\n // Iterate over the columns of the row\n for (size_t j = 0; j < row->column; ++j) {\n // Get the value of the element\n double value = row->value;\n\n // Update L and U\n for (size_t k = 0; k < i; ++k) {\n L(i, k) -= value * U(k, j);\n }\n U(i, j) = value;\n }\n });\n}\n", + "\n\n // Create a parallel for loop to iterate over the rows of A\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n // Initialize the ith row of L and U to the identity matrix\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n\n // Iterate over the non-zero elements in the ith row of A\n for (size_t j = 0; j < A.extent(0); ++j) {\n if (A(j).row == i) {\n // Update the ith row of L and U\n L(i, A(j).column) = A(j).value;\n U(i, A(j).column) = A(j).value;\n\n // Iterate over the rows below the ith row\n for (size_t k = i + 1; k < N; ++k) {\n // Update the kth row of L and U\n L(k, A(j).column) -= A(j).value * L(k, i);\n U(k, A(j).column) -= A(j).value * U(k, i);\n }\n }\n }\n });\n}\n", + "\n\n // Initialize L and U\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = A(i, i).value;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n L(i, k) = A(i, k).value / U(k, k);\n for (size_t j = k + 1; j < N; ++j) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Iterate over the rows of A\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n // Iterate over the columns of A\n for (size_t j = 0; j < N; j++) {\n // If A(i, j) is non-zero, then update L and U\n if (A(i, j).value != 0.0) {\n // Update L\n L(i, j) = A(i, j).value / U(j, j);\n\n // Update U\n for (size_t k = j + 1; k < N; k++) {\n U(j, k) -= L(i, j) * A(i, k).value;\n }\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 
0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy>(0, N), KOKKOS_LAMBDA(const int &i, const int &j) {\n if (i == j) {\n L(i, j) = 1.0;\n U(i, j) = A(i, j);\n } else if (i > j) {\n L(i, j) = A(i, j) / U(j, j);\n U(i, j) = 0.0;\n } else {\n U(i, j) = A(i, j) / L(i, i);\n L(i, j) = 0.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n L(j, i) -= L(j, k) * U(k, i);\n }\n L(j, i) /= U(i, i);\n }\n });\n}\n", + "\n\n // Create a copy of A\n auto Acopy = KokKOS::make_view(A);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n Acopy(j, i) -= Acopy(j, i) * Acopy(i, i);\n }\n }\n\n // Store L and U\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n L(i, j) = 1.0;\n } else if (i < j) {\n L(i, j) = Acopy(i, j);\n } else {\n U(i, j) = Acopy(i, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA (const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0.0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n L(i, j) -= L(i, k) * U(k, j);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n for (size_t k = 0; k < i; ++k) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n U(i, j) /= L(i, i);\n }\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n U(i, j) += A(i, j).value;\n } else {\n L(i, j) += A(i, j).value;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0;\n U(i, j) = 0;\n }\n L(i, i) = 1;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::TeamPolicy(N, Kokkos::AUTO), 
KOKKOS_LAMBDA(const Kokkos::TeamPolicy::member_type &team) {\n const size_t i = team.league_rank();\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < i; k++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n if (i == j) {\n L(i, i) = 1 / U(i, i);\n } else {\n L(i, j) = -U(i, j) * L(j, j);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L(i, k) = A(i, k) / U(k, k);\n for (size_t j = k + 1; j < N; j++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA(const size_t k) {\n const auto &e = A(k);\n const double a_kk = U(e.row, e.row);\n const double l_ik = L(e.column, e.row);\n U(e.row, e.column) = e.value - l_ik * a_kk;\n for (size_t i = e.row + 1; i < N; i++) {\n const double l_ik = L(e.column, e.row);\n const double a_ki = U(e.row, i);\n L(i, e.column) = (e.value - a_ki * a_kk) / l_ik;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t row) {\n double sumL = 0.0;\n double sumU = 0.0;\n for (size_t i = 0; i < row; i++) {\n sumL += L(row, i) * U(i, row);\n sumU += L(i, row) * U(row, i);\n }\n L(row, row) = 1.0;\n U(row, row) = A(row, row) - sumU;\n for (size_t col = row + 1; col < N; col++) {\n L(row, col) = (A(row, col) - sumL) / U(row, row);\n U(row, col) = A(row, col) - sumU;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int& i) {\n for (size_t j = 0; j < N; j++) {\n L(i, j) = 0.0;\n U(i, j) = 0.0;\n }\n L(i, i) = 1.0;\n });\n\n Kokkos::parallel_for(\"luFactorize\", Kokkos::RangePolicy(0, A.extent(0)), KOKKOS_LAMBDA(const int& k) {\n const COOElement& element = A(k);\n const size_t i = element.row;\n const size_t j = element.column;\n const double value = element.value;\n U(i, j) = value;\n for (size_t p = 0; p < i; p++) {\n L(i, p) += value * L(j, p);\n U(j, p) -= value * U(i, p);\n }\n L(i, i) -= value * L(j, i);\n U(j, j) += value * U(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < i; ++j) {\n double sum = 0;\n for (size_t k = 0; k < j; ++k) {\n sum += L(i, k) * U(k, j);\n }\n L(i, j) = (A(i, j) - sum) / U(j, j);\n }\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L(i, k) * U(k, i);\n }\n U(i, i) = A(i, i) - sum;\n for (size_t j = i + 1; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L(j, k) * U(k, i);\n }\n U(i, j) = (A(j, i) - sum) / L(i, i);\n }\n });\n}\n", + "\n\n // Initialize L and U to identity matrices\n 
Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n L(i, i) = 1.0;\n U(i, i) = 1.0;\n });\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U using the k-th row of A\n for (size_t i = k + 1; i < N; i++) {\n L(i, k) = A(i, k) / U(k, k);\n for (size_t j = k + 1; j < N; j++) {\n U(i, j) -= L(i, k) * U(k, j);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sumL = 0.0, sumU = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization in parallel\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k].value / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * U[i * N + k];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n U[i * N + j] = A[i * N + j] - L[i * N + k] * U[k 
* N + j];\n }\n }\n }\n}\n", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n #pragma opm parallel for\n for (size_t k = 0; k < A.size(); ++k) {\n size_t row = A[k].row;\n size_t column = A[k].column;\n double value = A[k].value;\n #pragma opm atomic\n L[row * N + column] += value;\n #pragma opm atomic\n U[column * N + row] += value;\n }\n #pragma opm parallel for\n for (size_t i = 0; i < N; ++i) {\n #pragma opm atomic\n L[i * N + i] = 1;\n }\n}\n", + "<=====,", + "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n if (row == column)\n L[row * N + column] = 1;\n if (row < column)\n U[row * N + column] = value;\n if (row > column)\n L[row * N + column] = value;\n }\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = 0; i < k; ++i) {\n U[k * N + j] -= L[k * N + i] * U[i * N + j];\n }\n }\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = 0; j < k; ++j) {\n L[i * N + k] -= U[i * N + j] * L[j * N + k];\n }\n L[i * N + k] /= U[k * N + k];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < A.size(); i++) {\n const COOElement &e = A[i];\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1.0;\n U[e.row * N + e.column] = e.value;\n } else if (e.row < e.column) {\n L[e.row * N + e.column] = e.value / U[e.row * N + e.row];\n U[e.column * N + e.row] = 0.0;\n } else {\n U[e.row * N + e.column] = e.value;\n L[e.column * N + e.row] = 0.0;\n }\n }\n\n #pragma omp parallel for\n for (size_t k = 1; k < N; k++) {\n for (size_t i = k; i < N; i++) {\n for (size_t j = 0; j < k; j++) {\n U[i * N + k] -= L[i * N + j] * U[j * N + k];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n L[k * N + j] -= L[k * N + k] * U[k * N + j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sumL = 0.0, sumU = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + i] * U[k * N + j];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n U[i * N + j] = A[i * N + j] - L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += L[i * N + k] * U[k * N + j];\n }\n if (i == j) {\n L[i * N + i] = 1;\n U[i * N + i] = A[i].value - sum;\n } else {\n L[i * N + j] = (A[i].value - sum) / U[j * N + j];\n U[i * N + j] = 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n #pragma omp parallel 
for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < i; k++) {\n sumL += L[i * N + k] * U[k * N + j];\n sumU += L[k * N + j] * U[i * N + k];\n }\n L[i * N + j] = (A[i * N + j].value - sumL) / U[i * N + i];\n U[i * N + j] = (A[i * N + j].value - sumU) / L[i * N + i];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform the factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n }\n\n // Update U\n for (size_t j = k + 1; j < N; j++) {\n for (size_t i = k + 1; i < N; i++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n A[i * N + j] -= U[k * N + j] * A[i * N + k];\n L[i * N + j] = A[i * N + j] / L[k * N + k];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n double sum = 0;\n for (size_t k = 0; k < j; ++k) {\n sum += L[i][k] * U[k][j];\n }\n L[i][j] = (A[i][j] - sum) / U[j][j];\n }\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[i][k] * U[k][i];\n }\n U[i][i] = A[i][i] - sum;\n for (size_t j = i + 1; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[j][k] * U[k][j];\n }\n U[i][j] = (A[i][j] - sum) / L[i][i];\n }\n }\n\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[j * N + k] / L[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + j] -= U[k * N + j] * L[i * N + k];\n A[i * N + j] -= U[k * N + j] * A[i * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + 
"do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.emplace_back(e.row, e.column, 1);\n U.emplace_back(e.row, e.column, e.value);\n } else {\n L.emplace_back(e.row, e.column, e.value / rowSum[e.row]);\n U.emplace_back(e.row, e.column, 0);\n }\n }\n for (size_t i = 1; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n for (size_t k = 0; k < N; k++) {\n U[i * N + k] -= L[i * N + j] * U[j * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector row(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1;\n U[e.row * N + e.column] = e.value;\n } else {\n row[e.column] = e.value / U[e.row * N + e.row];\n for (size_t i = e.column; i < N; ++i) {\n L[e.row * N + i] = row[i];\n U[e.row * N + i] -= row[i] * U[e.column * N + i];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& e : A) {\n L[e.row * N + e.column] = (e.row == e.column) ? 1 : 0;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = 0; i < k; ++i) {\n U[k * N + j] -= L[k * N + i] * U[i * N + j];\n }\n L[j * N + k] = U[j * N + k] / U[k * N + k];\n for (size_t i = k + 1; i < N; ++i) {\n U[j * N + i] -= L[j * N + k] * U[k * N + i];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& element : A) {\n L[element.row * N + element.column] = element.value;\n U[element.row * N + element.column] = element.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (L[i * N + j] != 0) {\n double factor = L[i * N + j] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= factor * L[i * N + k];\n U[i * N + k] -= factor * U[j * N + k];\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n 
if (i == j) {\n L[i * N + j] = 1;\n } else if (i > j) {\n L[i * N + j] = L_temp[i][j] / L_temp[j][j];\n for (size_t k = 0; k < N; k++) {\n U_temp[i][k] -= L[i * N + j] * U_temp[j][k];\n }\n } else {\n U[i * N + j] = U_temp[i][j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.4241596746, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0236263276 + } + ] + }, + { + "generated_output": "\n\n L.clear();\n U.clear();\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n rowSum[e.row] += e.value;\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.push_back(1);\n U.push_back(e.value / rowSum[e.row]);\n } else if (e.row < e.column) {\n L.push_back(0);\n U.push_back(e.value / rowSum[e.row]);\n } else {\n L.push_back(e.value / rowSum[e.column]);\n U.push_back(0);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Initialize L and U to be identity matrices\n L.resize(N * N);\n U.resize(N * N);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n U[i * N + i] = 1;\n }\n\n // Iterate over the elements of A\n for (auto const& element : A) {\n // Update L and U\n for (size_t i = 0; i < element.row; i++) {\n L[element.row * N + i] -= L[element.column * N + i] * element.value;\n }\n for (size_t j = element.column + 1; j < N; j++) {\n U[element.row * N + j] -= U[element.column * N + j] * element.value;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector> row_ptr(N + 1);\n std::vector col_idx(A.size());\n std::vector values(A.size());\n for (size_t i = 0; i < A.size(); i++) {\n row_ptr[A[i].row + 1]++;\n col_idx[i] = A[i].column;\n values[i] = A[i].value;\n }\n for (size_t i = 1; i <= N; i++) {\n row_ptr[i] += row_ptr[i - 1];\n }\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < A.size(); i++) {\n size_t row = A[i].row;\n size_t col = A[i].column;\n double value = A[i].value;\n U[row_ptr[row] + col - 1] = value;\n for (size_t j = row_ptr[row]; j < row_ptr[row + 1]; j++) {\n if (col_idx[j] < col) {\n L[row_ptr[col] + col_idx[j] - 1] += value * U[row_ptr[row] + col_idx[j] - 1];\n } else if (col_idx[j] > col) {\n U[row_ptr[row] + col_idx[j] - 1] -= value * L[row_ptr[col] + col_idx[j] - 1];\n }\n }\n }\n for (size_t i = 0; i < N; i++) {\n L[row_ptr[i] + i - 1] = 1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector L_row, L_col, U_row, U_col;\n std::vector L_val, U_val;\n L_row.reserve(A.size());\n L_col.reserve(A.size());\n L_val.reserve(A.size());\n U_row.reserve(A.size());\n U_col.reserve(A.size());\n U_val.reserve(A.size());\n for (size_t i 
= 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sumL = 0, sumU = 0;\n for (size_t k = 0; k < A.size(); k++) {\n if (A[k].row == i && A[k].column == j) {\n sumU = A[k].value;\n U_row.push_back(i);\n U_col.push_back(j);\n U_val.push_back(sumU);\n }\n if (A[k].row == i && A[k].column < j) {\n sumL += A[k].value;\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(sumL);\n }\n if (A[k].row < i && A[k].column == j) {\n sumL += A[k].value;\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(sumL);\n }\n }\n if (sumU == 0) {\n U_row.push_back(i);\n U_col.push_back(j);\n U_val.push_back(0);\n }\n if (sumL == 0) {\n L_row.push_back(i);\n L_col.push_back(j);\n L_val.push_back(0);\n }\n }\n }\n L.resize(L_row.size());\n U.resize(U_row.size());\n for (size_t i = 0; i < L_row.size(); i++) {\n L[i].row = L_row[i];\n L[i].column = L_col[i];\n L[i].value = L_val[i];\n }\n for (size_t i = 0; i < U_row.size(); i++) {\n U[i].row = U_row[i];\n U[i].column = U_col[i];\n U[i].value = U_val[i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector> L_matrix(N, std::vector(N, 0.0));\n std::vector> U_matrix(N, std::vector(N, 0.0));\n\n for (auto const& element : A) {\n L_matrix[element.row][element.column] = element.value;\n U_matrix[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n L_matrix[i][j] = 1.0;\n } else if (i > j) {\n L_matrix[i][j] = 0.0;\n } else {\n U_matrix[i][j] = 0.0;\n }\n }\n }\n\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n double factor = L_matrix[i][k] / L_matrix[k][k];\n for (size_t j = k; j < N; ++j) {\n L_matrix[i][j] -= factor * U_matrix[k][j];\n U_matrix[i][j] -= factor * L_matrix[k][j];\n }\n }\n }\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n L[i * N + j] = L_matrix[i][j];\n U[i * N + j] = U_matrix[i][j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n\n for (size_t i = 0; i < A.size(); i++) {\n L[A[i].row * N + A[i].column] = A[i].value;\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = L[i * N + k] / L[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] = U[i * N + j] - L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector row(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.emplace_back(COOElement{e.row, e.column, 1});\n row[e.column] = e.value;\n } else {\n U.emplace_back(COOElement{e.row, e.column, 
e.value});\n }\n }\n for (auto const& e : A) {\n if (e.row != e.column) {\n L.emplace_back(COOElement{e.row, e.column, e.value / row[e.column]});\n for (auto& u : U) {\n if (u.row == e.row && u.column == e.column) {\n u.value -= e.value * L.back().value;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n size_t row = A[i].row;\n size_t column = A[i].column;\n double value = A[i].value;\n U[row * N + column] = value;\n for (size_t j = 0; j < row; ++j) {\n L[row * N + j] -= L[i * N + j] * value / U[j * N + j];\n }\n for (size_t j = row + 1; j < N; ++j) {\n U[row * N + j] -= L[i * N + j] * value / U[j * N + j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n for (size_t i = 0; i < N; ++i) {\n L_temp[i][i] = 1;\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n L_temp[i][k] = L_temp[i][k] / L_temp[k][k];\n for (size_t j = k + 1; j < N; ++j) {\n U_temp[k][j] = U_temp[k][j] - L_temp[i][k] * U_temp[i][j];\n }\n }\n }\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n L.push_back(L_temp[i][j]);\n U.push_back(U_temp[i][j]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n L.clear();\n U.clear();\n L.reserve(A.size());\n U.reserve(A.size());\n std::vector rowSum(N, 0);\n for (auto const& e : A) {\n if (e.row == e.column) {\n rowSum[e.row] += e.value;\n }\n }\n for (auto const& e : A) {\n if (e.row == e.column) {\n L.push_back({e.row, e.column, 1});\n U.push_back({e.row, e.column, e.value / rowSum[e.row]});\n } else if (e.row < e.column) {\n L.push_back({e.row, e.column, e.value / rowSum[e.row]});\n U.push_back({e.row, e.column, 0});\n } else {\n L.push_back({e.row, e.column, 0});\n U.push_back({e.row, e.column, e.value});\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector> L_temp(N, std::vector(N, 0));\n std::vector> U_temp(N, std::vector(N, 0));\n\n for (auto const& element : A) {\n L_temp[element.row][element.column] = element.value;\n U_temp[element.row][element.column] = element.value;\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n L_temp[i][j] = 1;\n } 
else if (i > j) {\n L_temp[i][j] = 0;\n }\n }\n }\n\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n double factor = U_temp[i][k] / U_temp[k][k];\n L_temp[i][k] = factor;\n for (size_t j = k; j < N; j++) {\n U_temp[i][j] -= factor * U_temp[k][j];\n }\n }\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n L.push_back(L_temp[i][j]);\n U.push_back(U_temp[i][j]);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto &e : A) {\n L[e.row * N + e.column] = e.value;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (L[j * N + i] != 0) {\n double factor = L[j * N + i] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= factor * L[i * N + k];\n U[i * N + k] -= factor * U[j * N + k];\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector rowSum(N, 0);\n for (auto const& element : A) {\n rowSum[element.row] += element.value;\n }\n\n for (auto const& element : A) {\n if (element.row == element.column) {\n L.push_back(1);\n U.push_back(element.value / rowSum[element.row]);\n } else if (element.row < element.column) {\n L.push_back(0);\n U.push_back(element.value / rowSum[element.row]);\n } else {\n L.push_back(element.value / rowSum[element.column]);\n U.push_back(0);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (auto const& element : A) {\n L[element.row * N + element.column] = element.value;\n U[element.row * N + element.column] = element.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = L[j * N + i] / L[i * N + i];\n for (size_t k = i; k < N; k++) {\n L[j * N + k] -= multiplier * L[i * N + k];\n U[i * N + k] -= multiplier * U[j * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (auto const& e : A) {\n L[e.row * N + e.column] = e.value;\n U[e.row * N + e.column] = e.value;\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n L[j * N + i] = U[j * N + i] / U[i * N + i];\n for (size_t k = i; k < N; k++) {\n U[j * N + k] -= L[j * N + i] * U[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "hip", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format with sizeA elements.\n Use AMD HIP to compute in parallel. The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\n__global__ void luFactorize(const COOElement *A, size_t sizeA, double *L, double *U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[row * N + col];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n COOElement element = A[tid];\n int row = element.row;\n int col = element.column;\n double val = element.value;\n\n if (row == col) {\n L[row * N + row] = 1.0;\n U[row * N + col] = val;\n } else if (row < col) {\n L[row * N + col] = val / U[col * N + col];\n U[row * N + col] = 0.0;\n } else {\n U[row * N + col] -= L[row * N + row] * val;\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = gridDim.x;\n int num_threads = blockDim.x;\n int start = tid + bid * num_threads;\n for (int i = start; i < sizeA; i += bid_size * num_threads) {\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = A[i].value / U[A[i].column];\n U[i] = 0.0;\n }\n }\n __syncthreads();\n for (int i = start; i < sizeA; i += bid_size * num_threads) {\n if (A[i].row > A[i].column) {\n U[i] -= L[i] * U[A[i].column];\n } else {\n L[i] -= L[i] * U[A[i].column];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < sizeA) {\n COOElement e = A[i];\n if (e.row == e.column) {\n L[e.row * N + e.column] = 1.0;\n U[e.row * N + e.column] = e.value;\n for (int k = e.row + 1; k < N; k++) {\n L[k * N + e.column] = A[i].value / e.value;\n U[e.row * N + k] -= L[k * N + e.column] * e.value;\n }\n }\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1;\n } else {\n U[row * N + column] = value;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int stride = blockDim.x;\n for (size_t i = tid; i < sizeA; i += stride) {\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = 0.0;\n U[i] = 0.0;\n }\n }\n for (size_t k = 0; k < N; k++) {\n for (size_t i = k + 1; i < N; i++) {\n for (size_t j = tid; j < sizeA; j += stride) {\n if (A[j].row == i && A[j].column == k) {\n L[j] = A[j].value / U[k];\n U[j] = 0.0;\n }\n }\n for (size_t j = tid; j < sizeA; j += stride) {\n if (A[j].row == i && A[j].column > k) {\n U[j] -= L[j] * U[A[j].column];\n }\n }\n }\n 
}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else {\n L[tid] = 0.0;\n U[tid] = 0.0;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t i = A[tid].row;\n size_t j = A[tid].column;\n double v = A[tid].value;\n\n if (i == j) {\n L[i * N + i] = 1.0;\n U[i * N + j] = v;\n } else if (i < j) {\n L[i * N + j] = v;\n } else {\n U[i * N + j] = v;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[col * N + row];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n\n if (row == column) {\n L[row * N + row] = 1.0;\n U[row * N + column] = value;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n U[row * N + column] = 0.0;\n } else {\n U[row * N + column] -= L[row * N + row] * value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n if (A[tid].row == A[tid].column) {\n L[tid] = 1;\n U[tid] = A[tid].value;\n } else if (A[tid].row < A[tid].column) {\n L[tid] = 0;\n U[tid] = A[tid].value;\n } else {\n L[tid] = A[tid].value / U[A[tid].column];\n U[tid] = 0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n const COOElement &a = A[tid];\n if (a.row == a.column) {\n L[a.row * N + a.column] = 1.0;\n U[a.row * N + a.column] = a.value;\n } else if (a.row < a.column) {\n L[a.row * N + a.column] = 0.0;\n U[a.row * N + a.column] = a.value;\n } else {\n L[a.row * N + a.column] = a.value / U[a.column * N + a.column];\n U[a.row * N + a.column] = 0.0;\n for (int k = a.column + 1; k < a.row; k++) {\n U[a.row * N + k] -= L[a.row * N + a.column] * U[a.column * N + k];\n }\n }\n }\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n size_t col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row].value;\n } else if (row < col) {\n L[row * N + col] = 0;\n U[row * N + col] = A[row].value;\n } else {\n L[row * N + col] = A[row].value / U[col * N + col];\n U[row * N + col] = 0;\n }\n }\n}\n", + "\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= sizeA) return;\n if (A[i].row == A[i].column) {\n L[i] = 1.0;\n U[i] = A[i].value;\n } else {\n L[i] = 0.0;\n U[i] = 0.0;\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n U[row * N + col] = 0.0;\n }\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row && A[i].column == col) {\n U[row * N + col] = A[i].value;\n } else if (A[i].row == row) {\n L[row * N + col] -= A[i].value * U[A[i].column * N + col];\n } else if (A[i].column == col) {\n U[row * N + col] -= A[i].value * L[row * N + A[i].row];\n }\n }\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n }\n U[row * N + col] = A[row * N + 
col].value;\n }\n __syncthreads();\n for (int k = 0; k < row; k++) {\n if (U[k * N + col] != 0.0) {\n L[row * N + k] = A[row * N + k].value / U[k * N + col];\n for (int j = col; j < N; j++) {\n U[row * N + j] -= L[row * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n if (A[tid].row == A[tid].column) {\n L[tid * N + A[tid].column] = 1;\n U[tid * N + A[tid].column] = A[tid].value;\n } else {\n L[tid * N + A[tid].column] = 0;\n U[tid * N + A[tid].column] = A[tid].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int i = A[tid].row;\n int j = A[tid].column;\n double v = A[tid].value;\n if (i == j) {\n L[i * N + i] = 1;\n U[i * N + i] = v;\n } else if (i < j) {\n L[i * N + j] = v / U[j * N + j];\n U[i * N + j] = 0;\n } else {\n U[i * N + j] -= L[i * N + i] * v;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) {\n return;\n }\n const COOElement &a = A[tid];\n if (a.row == a.column) {\n L[a.row * N + a.column] = 1.0;\n U[a.row * N + a.column] = a.value;\n } else if (a.row > a.column) {\n L[a.row * N + a.column] = a.value / U[a.column * N + a.column];\n U[a.row * N + a.column] = 0.0;\n } else {\n U[a.row * N + a.column] = a.value;\n L[a.row * N + a.column] = 0.0;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) return;\n\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else if (row < column) {\n L[tid] = 0.0;\n U[tid] = value;\n } else {\n L[tid] = value / U[column * N + column];\n U[tid] = 0.0;\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
Store the result in L and U on rank 0.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Find the pivot element in column k\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k * N + k].value);\n for (size_t i = k + 1; i < N; ++i) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows k and pivot_row in L and U\n for (size_t j = 0; j < N; ++j) {\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n std::swap(U[k * N + j], U[pivot_row * N + j]);\n }\n\n // Update the remaining rows in column k\n for (size_t i = k + 1; i < N; ++i) {\n double multiplier = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j].value -= multiplier * A[k * N + j].value;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector L_recv(N * N, 0.0);\n std::vector U_recv(N * N, 0.0);\n MPI::COMM_WORLD.Recv(L_recv.data(), N * N, MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), N * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; ++j) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), N * N, MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector L_recv, U_recv;\n MPI::COMM_WORLD.Recv(L_recv.data(), L_recv.size(), MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), U_recv.size(), MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; j++) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), L.size(), MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), U.size(), MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Compute L and U in parallel\n for (size_t i = 0; i < N; ++i) {\n // Compute L(i, :)\n for (size_t j = 0; j < i; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < j; ++k) {\n sum += L[i * N + k] * U[k * N + j];\n }\n L[i * N + j] = (A[i].value - sum) / U[j * N + j];\n }\n\n // Compute U(i, :)\n for (size_t j = i; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < i; ++k) {\n sum += L[i * N + k] * U[k * N + j];\n }\n U[i * N + j] = A[i].value - sum;\n }\n }\n\n // Gather L and U on rank 0\n if (MPI::COMM_WORLD.Get_rank() 
== 0) {\n MPI::COMM_WORLD.Gather(&L[0], N * N, MPI::DOUBLE, &L[0], N * N, MPI::DOUBLE, 0);\n MPI::COMM_WORLD.Gather(&U[0], N * N, MPI::DOUBLE, &U[0], N * N, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j].value, A[pivot_row * N + j].value);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Eliminate the elements below the pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = factor;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= factor * A[k * N + j].value;\n }\n }\n }\n\n // Copy the upper triangular matrix to U\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i; j < N; j++) {\n U[i * N + j] = A[i * N + j].value;\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A and store the result in L\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k; j < N; ++j) {\n L[i * N + k] -= U[i * N + j] * U[k * N + j];\n }\n }\n\n // For each column j > k, subtract the k-th column of L from the j-th column of A and store the result in U\n for (size_t j = k + 1; j < N; ++j) {\n for (size_t i = k; i < N; ++i) {\n U[k * N + j] -= L[i * N + k] * U[i * N + j];\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A\n for (size_t i = k + 1; i < N; i++) {\n double factor = U[i * N + k] / U[k * N + k];\n for (size_t j = k; j < N; j++) {\n U[i * N + j] -= factor * U[k * N + j];\n }\n L[i * N + k] = factor;\n }\n }\n\n // Gather the L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n std::vector L(A);\n std::vector U(A);\n\n // Perform Gaussian elimination\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the pivot row\n MPI_Bcast(&A[k], sizeof(COOElement) * A[k].size(), MPI_BYTE, k, 
MPI_COMM_WORLD);\n\n // Eliminate the pivot column in the remaining rows\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n double factor = A[i][k].value / A[k][k].value;\n for (size_t j = k; j < N; ++j) {\n A[i][j].value -= factor * A[k][j].value;\n }\n }\n\n // Update L and U\n L[k].value = 1.0;\n for (size_t j = k + 1; j < N; ++j) {\n L[j].value = A[j][k].value;\n U[k].value = A[k][j].value;\n }\n }\n\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row i > k, subtract the k-th row of U from the i-th row of A\n for (size_t i = k + 1; i < N; ++i) {\n double factor = U[i * N + k] / U[k * N + k];\n for (size_t j = k; j < N; ++j) {\n U[i * N + j] -= factor * U[k * N + j];\n }\n L[i * N + k] = factor;\n }\n }\n\n // Gather the L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot row\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (A[i * N + k] > A[pivot_row * N + k]) {\n pivot_row = i;\n }\n }\n\n // Swap the pivot row with the k-th row in L and U\n for (size_t j = 0; j < N; j++) {\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n std::swap(U[k * N + j], U[pivot_row * N + j]);\n }\n\n // Eliminate the k-th column in the remaining rows\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n L[i * N + k] = factor;\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(A.size());\n U.resize(A.size());\n for (size_t i = 0; i < A.size(); ++i) {\n L[i].row = A[i].row;\n L[i].column = A[i].column;\n L[i].value = 0;\n U[i].row = A[i].row;\n U[i].column = A[i].column;\n U[i].value = 
0;\n }\n\n // Perform LU factorization\n for (size_t i = 0; i < N; ++i) {\n // Update L\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row == i && A[j].column < i) {\n L[j].value = A[j].value / U[j].value;\n }\n }\n\n // Update U\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row < i && A[j].column == i) {\n U[j].value = A[j].value / L[j].value;\n }\n }\n\n // Update A\n for (size_t j = 0; j < A.size(); ++j) {\n if (A[j].row >= i && A[j].column >= i) {\n A[j].value -= L[j].value * U[j].value;\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recvL(A.size());\n std::vector recvU(A.size());\n MPI::COMM_WORLD.Recv(recvL.data(), A.size(), MPI_DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(recvU.data(), A.size(), MPI_DOUBLE, i, 0);\n for (size_t j = 0; j < A.size(); ++j) {\n L[j].value += recvL[j].value;\n U[j].value += recvU[j].value;\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), A.size(), MPI_DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), A.size(), MPI_DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < A.size(); i++) {\n if (std::abs(A[i].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i].value);\n }\n }\n\n // Swap the pivot row with the k-th row in A\n std::swap(A[k], A[pivot_row]);\n\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n double multiplier = A[i].value / A[k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; j++) {\n U[k * N + j] -= multiplier * A[i].value;\n }\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n\n // Initialize the diagonal of L to 1\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n\n // Iterate over the rows of A\n for (size_t k = 0; k < A.size(); k++) {\n // Get the row, column, and value of the current element\n size_t row = A[k].row;\n size_t column = A[k].column;\n double value = A[k].value;\n\n // Update the elements of L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = value / U[k * N + k];\n U[i * N + column] -= L[i * N + k] * U[k * N + column];\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i > k\n for (size_t i = k + 1; i < N; i++) {\n // Calculate the multiplier\n double multiplier = A[i].value / U[k * N + k];\n\n // Update the i-th row of L\n for (size_t j = 0; j < N; j++) {\n L[i * N + j] += multiplier * U[k * N + j];\n }\n\n // Update the i-th row of U\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= multiplier * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform 
LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = A[k].value;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > std::abs(pivot_value)) {\n pivot_row = i;\n pivot_value = A[i * N + k].value;\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j].value, A[pivot_row * N + j].value);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Eliminate the elements below the pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k].value / pivot_value;\n L[i * N + k] = factor;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= factor * A[k * N + j].value;\n U[i * N + j] = A[i * N + j].value;\n }\n }\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector L_recv(N * N, 0.0);\n std::vector U_recv(N * N, 0.0);\n MPI::COMM_WORLD.Recv(L_recv.data(), N * N, MPI::DOUBLE, i, 0);\n MPI::COMM_WORLD.Recv(U_recv.data(), N * N, MPI::DOUBLE, i, 0);\n for (size_t j = 0; j < N * N; j++) {\n L[j] += L_recv[j];\n U[j] += U_recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(L.data(), N * N, MPI::DOUBLE, 0, 0);\n MPI::COMM_WORLD.Send(U.data(), N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each rank, update the k-th row of L and the remaining rows of U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n\n // Gather the updated L and U matrices from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row of U to all ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each rank, update the k-th row of L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "<,", + "_t* 0." + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "cuda", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format with sizeA elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least sizeA threads.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\n__global__ void luFactorize(const COOElement *A, size_t sizeA, double *L, double *U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int row = blockIdx.x;\n int col = threadIdx.x;\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n U[row * N + col] = A[row * N + col].value;\n for (int k = 0; k < row; k++) {\n U[row * N + col] -= L[row * N + k] * U[k * N + col];\n }\n L[row * N + col] = U[row * N + col] / U[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N) {\n double sum = 0;\n for (size_t i = 0; i < sizeA; i++) {\n if (A[i].row == row) {\n for (size_t j = 0; j < i; j++) {\n sum += L[row * N + j] * U[j * N + A[i].column];\n }\n U[row * N + A[i].column] = A[i].value - sum;\n }\n if (A[i].column == row) {\n L[row * N + A[i].row] = A[i].value;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n U[tid] = value;\n } else if (row < column) {\n L[tid] = 0.0;\n U[tid] = value;\n } else {\n L[tid] = value / U[column * N + column];\n U[tid] = 0.0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + column] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= sizeA) {\n return;\n }\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n }\n }\n}\n", + "\n\n int row = block.y * block.x + thread.y * N + thread.x;\n if (row < sizeA) {\n if (A[row].row == A[row].column) {\n L[A[row].row * N + A[row].column] = 1.0;\n U[A[row].row * N + A[row].column] = A[row].value;\n } else {\n L[A[row].row * N + A[row].column] = A[row].value / U[A[row].column * N + A[row].column];\n U[A[row].row * N + A[row].column] = 0.0;\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n double sum = 0.0;\n for (int k = col; k < N; k++) {\n sum += L[row * N + k] * U[k * N + col];\n }\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < 
sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n U[row * N + col] = A[row * N + col].value;\n } else if (row < col) {\n L[row * N + col] = 0.0;\n U[row * N + col] = A[row * N + col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0.0;\n for (size_t k = col + 1; k < N; k++) {\n U[row * N + k] -= L[row * N + col] * U[col * N + k];\n }\n }\n }\n}\n", + "\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else if (row > col) {\n L[row * N + col] = 0.0;\n } else {\n double sum = 0.0;\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row && A[k].column == col) {\n sum += A[k].value * U[col * N + A[k].column];\n }\n }\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row >= N || col >= N) return;\n\n double sum = 0;\n for (size_t k = 0; k < sizeA; k++) {\n if (A[k].row == row && A[k].column < col) {\n sum += L[row * N + A[k].column] * U[A[k].column * N + col];\n }\n }\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row * N + col] - sum;\n } else {\n L[row * N + col] = (A[row * N + col] - sum) / U[col * N + col];\n U[row * N + col] = 0;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1.0;\n } else {\n L[row * N + col] = 0.0;\n }\n U[row * N + col] = A[row * N + col].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[tid] = 1.0;\n } else if (row < column) {\n L[tid] = 0.0;\n } else {\n U[tid] = value / L[column * N + column];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < N && col < N) {\n if (row == col) {\n L[row * N + col] = 1;\n U[row * N + col] = A[row * N + col].value;\n } else if (row > col) {\n L[row * N + col] = 0;\n U[row * N + col] = A[row * N + 
col].value;\n } else {\n L[row * N + col] = A[row * N + col].value / U[col * N + col];\n U[row * N + col] = 0;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n U[row * N + column] = value;\n } else if (row < column) {\n L[row * N + column] = value;\n } else {\n U[row * N + column] = value;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < sizeA; ++i) {\n const auto row = A[i].row;\n const auto column = A[i].column;\n const auto value = A[i].value;\n if (row == column) {\n L[row * N + column] = 1;\n } else if (row < column) {\n U[row * N + column] = value;\n } else {\n L[row * N + column] = value;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < sizeA) {\n size_t row = A[tid].row;\n size_t column = A[tid].column;\n double value = A[tid].value;\n if (row == column) {\n L[row * N + row] = 1.0;\n } else if (row < column) {\n L[row * N + column] = value / U[column * N + column];\n } else {\n U[row * N + column] = value;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. Store the result in L and U on rank 0.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0);\n U.resize(N * N, 0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n 
// Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Initialize the diagonal elements of L to 1\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update the k-th row of L and U\n #pragma omp parallel for\n for (size_t j = k + 1; j < N; j++) {\n L[k * N + j] = A[k].value / U[k * N + k];\n for (size_t i = k + 1; i < N; i++) {\n U[i * N + j] -= L[k * N + j] * A[i].value;\n }\n }\n\n // Update the remaining rows of L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n for (size_t j = k + 1; j < N; j++) {\n L[i * N + j] -= L[i * N + k] * L[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Factorize A\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); i++) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n 
L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n A[i].value -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Update U\n #pragma omp parallel for\n for (size_t j = k + 1; j < N; ++j) {\n U[k * N + j] = A[k].value / L[k * N + k];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(&L[0], N * N, MPI_DOUBLE, &L[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&U[0], N * N, MPI_DOUBLE, &U[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; ++k) {\n // Update L and U\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; ++j) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U from all ranks to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform Gauss elimination\n for (size_t k = 0; k < N; ++k) {\n // Rank 0 broadcasts the k-th row of U to all other ranks\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 updates the k-th row of L\n for (size_t j = k + 1; j < N; ++j) {\n L[k * N + j] = A[j].value / U[k * N + k];\n }\n\n // All ranks update 
the remaining rows of U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[k * N + j] * U[k * N + i];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k * N + k].value);\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i * N + k].value);\n }\n }\n\n // Swap rows if necessary\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n std::swap(L[k * N + j], L[pivot_row * N + j]);\n }\n }\n\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n double multiplier = A[i * N + k].value / A[k * N + k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; j++) {\n A[i * N + j].value -= multiplier * A[k * N + j].value;\n U[k * N + j].value += multiplier * A[i * N + j].value;\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; i++) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n #pragma omp parallel for\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= L[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = k + 1; j < N; j++) {\n U[k * N + j] = A[k * N + j];\n }\n }\n\n // Gather L and U to rank 0\n MPI_Gather(L.data(), N * N, MPI_DOUBLE, L.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(U.data(), N * N, MPI_DOUBLE, U.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Find the pivot element\n size_t pivot_row = k;\n double pivot_value = std::abs(A[k].value);\n for (size_t i = k + 1; i < N; ++i) {\n if (std::abs(A[i].value) > pivot_value) {\n pivot_row = i;\n pivot_value = std::abs(A[i].value);\n }\n }\n\n // Swap the pivot row with the k-th row in A, L, and U\n if (pivot_row != k) {\n std::swap(A[k], A[pivot_row]);\n for (size_t i = 0; i < N; ++i) {\n std::swap(L[k * N + i], L[pivot_row * N + i]);\n std::swap(U[k * N + i], U[pivot_row * N + i]);\n }\n }\n\n // Eliminate the elements below the pivot element\n for (size_t i = k + 1; i < N; ++i) {\n double multiplier = A[i].value / A[k].value;\n L[i * N + k] = multiplier;\n for (size_t j = k; j < N; ++j) {\n A[i].value -= multiplier * A[k].value;\n U[i * N + j] -= multiplier * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 
0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n for (size_t i = 0; i < N; ++i) {\n L[i * N + i] = 1.0;\n }\n for (size_t i = 0; i < A.size(); ++i) {\n U[A[i].row * N + A[i].column] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; ++k) {\n // Update L and U on rank 0\n for (size_t i = k + 1; i < N; ++i) {\n L[i * N + k] = U[i * N + k] / U[k * N + k];\n for (size_t j = k + 1; j < N; ++j) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n\n // Broadcast L and U to other ranks\n MPI_Bcast(&L[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&U[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize L and U\n L.resize(N * N, 0.0);\n U.resize(N * N, 0.0);\n\n // Copy the diagonal elements of A to U\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n U[i * N + i] = A[i].value;\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Update L and U\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n L[i * N + k] = A[i].value / U[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, Kokkos::View &x, Kokkos::View &y, Kokkos::View &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x[j].index == i) sum += alpha * x[j].value;\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y[j].index == i) sum += y[j].value;\n }\n z[i] = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA (const int i) {\n z[i] = alpha * x[i].value + y[i].value;\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(\"sparse_py\", 0, z.extent(0), KOKKOK_LAMBDA(size_t i) {\n z[i] = alpha * x[i].value + y[i].value;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n sum += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n sum += y[j].value;\n }\n }\n z[i] = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) sum += alpha * x(j).value;\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) sum += y(j).value;\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n // Create a Kokkos kernel to perform the sparse axpy operation\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum 
= 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x(j).index == i) sum += alpha * x(j).value;\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y(j).index == i) sum += y(j).value;\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); j++) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); j++) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy<>(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0.0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i, double &sum) {\n sum += alpha * x(i).value + y(i).value;\n }, z(0));\n}\n", + "\n\n const size_t num_elements_x = x.extent(0);\n const size_t num_elements_y = y.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, num_elements_x + num_elements_y),\n KOKKOS_LAMBDA(const size_t i) {\n if (i < num_elements_x) {\n const size_t index = x(i).index;\n z(index) += alpha * x(i).value;\n } else {\n const size_t index = y(i - num_elements_x).index;\n z(index) += y(i - num_elements_x).value;\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA(const int i, double &sum) {\n sum += alpha * x(i).value + y(i).value;\n }, z(0));\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", Kokkos::RangePolicy(0, z.extent(0)), KOKKOS_LAMBDA (const int i) {\n double sum = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.extent(0); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sparseAxpy\", z.size(), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j).index == i) {\n sum += alpha * x(j).value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y(j).index == i) {\n sum += y(j).value;\n }\n }\n z(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. 
Store the result in z.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t n = x.size();\n size_t m = y.size();\n z.resize(std::max(n, m));\n #pragma omp parallel for\n for (size_t i = 0; i < std::max(n, m); i++) {\n z[i] = 0;\n }\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n z[x[i].index] += alpha * x[i].value;\n }\n #pragma omp parallel for\n for (size_t i = 0; i < m; i++) {\n z[y[i].index] += y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n 
}\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (auto const& e : x) {\n if (e.index == i) {\n z[i] += alpha * e.value;\n }\n }\n for (auto const& e : y) {\n if (e.index == i) {\n z[i] += e.value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n z[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "serial", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. 
Store the result in z.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (size_t i = 0, j = 0; i < x.size() || j < y.size();) {\n if (j == y.size() || (i < x.size() && x[i].index < y[j].index)) {\n z[x[i].index] += alpha * x[i].value;\n i++;\n } else if (i == x.size() || (j < y.size() && y[j].index < x[i].index)) {\n z[y[j].index] += y[j].value;\n j++;\n } else {\n z[x[i].index] += alpha * x[i].value + y[j].value;\n i++;\n j++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n z.clear();\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z.push_back(alpha * x[i].value);\n i++;\n } else {\n z.push_back(y[j].value);\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(alpha * x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n z.clear();\n z.resize(x.size() + y.size());\n auto itx = x.begin();\n auto ity = y.begin();\n auto itz = z.begin();\n while (itx != x.end() && ity != y.end()) {\n if (itx->index < ity->index) {\n *itz = itx->value;\n ++itx;\n } else if (itx->index > ity->index) {\n *itz = ity->value;\n ++ity;\n } else {\n *itz = alpha * itx->value + ity->value;\n ++itx;\n ++ity;\n }\n ++itz;\n }\n while (itx != x.end()) {\n *itz = itx->value;\n ++itx;\n ++itz;\n }\n while (ity != y.end()) {\n *itz = ity->value;\n ++ity;\n ++itz;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": 
".", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Initialize z with zeros\n z.assign(z.size(), 0);\n\n // Iterate over x and y\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n // If the indices match, add the values and increment both i and j\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n }\n // If the index of x is smaller, add the value to z and increment i\n else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n // If the index of y is smaller, add the value to z and increment j\n else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n\n // Add the remaining elements of x\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n\n // Add the remaining elements of y\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n z.clear();\n z.resize(x.size() + y.size());\n size_t i = 0, j = 0, k = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z[k].index = x[i].index;\n z[k].value = alpha * x[i].value;\n ++i;\n } else if (x[i].index > y[j].index) {\n z[k].index = y[j].index;\n z[k].value = y[j].value;\n ++j;\n } else {\n z[k].index = x[i].index;\n z[k].value = alpha * x[i].value + y[j].value;\n ++i;\n ++j;\n }\n ++k;\n }\n while (i < x.size()) {\n z[k].index = x[i].index;\n z[k].value = alpha * x[i].value;\n ++i;\n ++k;\n }\n while (j < y.size()) {\n z[k].index = y[j].index;\n z[k].value = y[j].value;\n ++j;\n ++k;\n }\n z.resize(k);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t m = z.size();\n for (size_t i = 0, j = 0, k = 0; i < m; ++i) {\n if (j < x.size() && x[j].index == i) {\n z[i] += alpha * x[j].value;\n ++j;\n }\n if (k < y.size() && y[k].index == i) {\n z[i] += y[k].value;\n ++k;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + 
"are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0, j = 0; i < x.size() || j < y.size();) {\n if (j == y.size() || (i < x.size() && x[i].index < y[j].index)) {\n z[x[i].index] += alpha * x[i].value;\n ++i;\n } else if (i == x.size() || (j < y.size() && y[j].index < x[i].index)) {\n z[y[j].index] += y[j].value;\n ++j;\n } else {\n z[x[i].index] += alpha * x[i].value + y[j].value;\n ++i;\n ++j;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n } else {\n z[y[j].index] = y[j].value;\n j++;\n }\n }\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n i++;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n j++;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n z.clear();\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z.push_back({x[i].index, alpha * x[i].value + y[j].value});\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z.push_back({x[i].index, alpha * x[i].value});\n i++;\n } else {\n z.push_back({y[j].index, y[j].value});\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back({x[i].index, alpha * x[i].value});\n i++;\n }\n while (j < y.size()) {\n z.push_back({y[j].index, y[j].value});\n j++;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index == y[j].index) {\n z[x[i].index] = alpha * x[i].value + y[j].value;\n i++;\n j++;\n } else if (x[i].index < y[j].index) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n } else {\n z[y[j].index] = y[j].value;\n j++;\n 
}\n }\n while (i < x.size()) {\n z[x[i].index] = alpha * x[i].value;\n i++;\n }\n while (j < y.size()) {\n z[y[j].index] = y[j].value;\n j++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n i++;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n j++;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n i++;\n j++;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n i++;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n j++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n size_t x_size = x.size();\n size_t y_size = y.size();\n size_t z_size = 0;\n size_t x_index = 0;\n size_t y_index = 0;\n while (x_index < x_size && y_index < y_size) {\n if (x[x_index].index == y[y_index].index) {\n z.push_back(alpha * x[x_index].value + y[y_index].value);\n z_size++;\n x_index++;\n y_index++;\n } else if (x[x_index].index < y[y_index].index) {\n z.push_back(alpha * x[x_index].value);\n z_size++;\n x_index++;\n } else {\n z.push_back(y[y_index].value);\n z_size++;\n y_index++;\n }\n }\n while (x_index < x_size) {\n z.push_back(alpha * x[x_index].value);\n z_size++;\n x_index++;\n }\n while (y_index < y_size) {\n z.push_back(y[y_index].value);\n z_size++;\n y_index++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n z.clear();\n z.reserve(x.size() + y.size());\n size_t i = 0, j = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n z.push_back(x[i].value);\n ++i;\n } else if (x[i].index > y[j].index) {\n z.push_back(y[j].value);\n ++j;\n } else {\n z.push_back(alpha * x[i].value + y[j].value);\n ++i;\n ++j;\n }\n }\n while (i < x.size()) {\n z.push_back(x[i].value);\n ++i;\n }\n while (j < y.size()) {\n z.push_back(y[j].value);\n ++j;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { 
+ "generated_output": "\n\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n z.clear();\n z.reserve(x.size() + y.size());\n auto itx = x.begin();\n auto ity = y.begin();\n while (itx != x.end() && ity != y.end()) {\n if (itx->index == ity->index) {\n z.push_back(alpha * itx->value + ity->value);\n ++itx;\n ++ity;\n } else if (itx->index < ity->index) {\n z.push_back(alpha * itx->value);\n ++itx;\n } else {\n z.push_back(ity->value);\n ++ity;\n }\n }\n while (itx != x.end()) {\n z.push_back(alpha * itx->value);\n ++itx;\n }\n while (ity != y.end()) {\n z.push_back(ity->value);\n ++ity;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "hip", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors of size Nx and Ny. Store the result in z.\n Use AMD HIP to compute in parallel. The kernel is launched with at least as many threads as values in x or y.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\n__global__ void sparseAxpy(double alpha, const Element *x, const Element *y, double *z, size_t Nx, size_t Ny, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < Nx) {\n size_t index = x[tid].index;\n z[index] = alpha * x[tid].value + y[index];\n } else if (tid < Nx + Ny) {\n size_t index = y[tid - Nx].index;\n z[index] = alpha * 0 + y[tid - Nx].value;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = 0;\n\t\tfor (size_t j = 0; j < Nx; j++) {\n\t\t\tif (x[j].index == i) {\n\t\t\t\tz[i] += alpha * x[j].value;\n\t\t\t}\n\t\t}\n\t\tfor (size_t j = 0; j < Ny; j++) {\n\t\t\tif (y[j].index == i) {\n\t\t\t\tz[i] += y[j].value;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n int i;\n for (i = tid; i < N; i += blockDim.x * gridDim.x) {\n if (i < Nx) {\n z[x[i].index] += alpha * x[i].value;\n }\n if (i < Ny) {\n z[y[i].index] += y[i].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n 
int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double val = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) val += alpha * x[i].value;\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) val += y[i].value;\n }\n z[tid] = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double val = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n val += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n val += y[i].value;\n }\n }\n z[tid] = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = alpha * x[i].value + y[i].value;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < Nx) {\n\t\tz[x[tid].index] += alpha * x[tid].value;\n\t}\n\telse if (tid < Nx + Ny) {\n\t\tz[y[tid - Nx].index] += y[tid - Nx].value;\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) z[i] += alpha * x[j].value;\n }\n for (size_t j = 0; j < N; j++) {\n if (y[j].index == i) z[i] += y[j].value;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n\tsize_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tdouble sum = 0;\n\t\tfor (size_t i = 0; i < Nx; i++) {\n\t\t\tif (x[i].index == tid) {\n\t\t\t\tsum += alpha * x[i].value;\n\t\t\t}\n\t\t}\n\t\tfor (size_t i = 0; i < Ny; i++) {\n\t\t\tif (y[i].index == tid) {\n\t\t\t\tsum += y[i].value;\n\t\t\t}\n\t\t}\n\t\tz[tid] = sum;\n\t}\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n" + ] + }, + { + 
"problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Store the result in z on rank 0.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n \n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& e : x) {\n z_size = std::max(z_size, e.index + 1);\n }\n for (auto const& e : y) {\n z_size = std::max(z_size, e.index + 1);\n }\n \n // Allocate memory for the output vector\n z.resize(z_size);\n \n // Compute the output vector on rank 0\n if (rank == 0) {\n for (auto const& e : x) {\n z[e.index] += alpha * e.value;\n }\n for (auto const& e : y) {\n z[e.index] += e.value;\n }\n }\n \n // Broadcast the output vector to all other ranks\n MPI_Bcast(&z[0], z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the partial sums\n std::vector partial_sums(z_size);\n for (auto const& element : x) {\n partial_sums[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_sums[element.index] += element.value;\n }\n\n // Reduce the partial sums\n std::vector global_sums(z_size);\n MPI_Allreduce(partial_sums.data(), global_sums.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result on rank 0\n if (my_rank == 0) {\n z = global_sums;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Compute the size of the result vector\n size_t zSize = 0;\n for (auto const& element : x) {\n zSize = std::max(zSize, element.index + 1);\n }\n for (auto const& element : y) {\n zSize = std::max(zSize, element.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(zSize);\n\n // Compute the result vector\n for (auto const& element : x) {\n z[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n z[element.index] += element.value;\n }\n\n // Gather the result vector on rank 0\n if (myRank == 0) {\n for (int i = 1; i < numRanks; ++i) {\n std::vector recvBuffer;\n MPI_Recv(&recvBuffer[0], recvBuffer.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recvBuffer.size(); ++j) {\n z[j] += recvBuffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n std::vector local_x, 
local_y;\n \n // Scatter x and y to all ranks\n MPI_Scatter(&x[0], x.size(), MPI_DOUBLE, &local_x[0], local_x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(&y[0], y.size(), MPI_DOUBLE, &local_y[0], local_y.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n \n // Compute local z\n std::vector local_z(z.size(), 0);\n for (size_t i = 0; i < local_x.size(); ++i) {\n local_z[local_x[i].index] += alpha * local_x[i].value;\n }\n for (size_t i = 0; i < local_y.size(); ++i) {\n local_z[local_y[i].index] += local_y[i].value;\n }\n \n // Gather local z to rank 0\n MPI_Gather(&local_z[0], local_z.size(), MPI_DOUBLE, &z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the number of elements in the sparse vector\n size_t num_elements = x.size();\n\n // Create a vector of zeros to store the result\n z.resize(num_elements);\n\n // Loop over the elements in the sparse vector\n for (size_t i = 0; i < num_elements; i++) {\n // Get the index and value of the element\n size_t index = x[i].index;\n double value = x[i].value;\n\n // Add the value of the element to the result\n z[index] += alpha * value;\n }\n\n // Loop over the elements in the sparse vector\n for (size_t i = 0; i < num_elements; i++) {\n // Get the index and value of the element\n size_t index = y[i].index;\n double value = y[i].value;\n\n // Add the value of the element to the result\n z[index] += value;\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements = x.size();\n\n // Create a buffer to store the result on each rank\n std::vector local_z(num_elements, 0.0);\n\n // Compute the local result\n for (size_t i = 0; i < num_elements; ++i) {\n local_z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_z.data(), z.data(), num_elements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n std::vector result;\n\n // Iterate over the elements in x and y\n for (size_t i = 0; i < num_elements_x; i++) {\n // Get the index and value of the current element in x\n size_t index_x = x[i].index;\n double value_x = x[i].value;\n\n // Iterate over the elements in y\n for (size_t j = 0; j < num_elements_y; j++) {\n // Get the index and value of the current element in y\n size_t index_y = y[j].index;\n double value_y = y[j].value;\n\n // If the indices of the current elements in x and y are equal, then add the values of the elements to the result\n if (index_x == index_y) {\n result.push_back({index_x, alpha * value_x + value_y});\n }\n }\n }\n\n // Sort the result vector by index\n std::sort(result.begin(), result.end(), [](Element const& a, Element const& b) { return a.index < b.index; });\n\n // Store the result in z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(result.size());\n for (size_t i = 0; i < result.size(); i++) {\n z[i] = result[i].value;\n }\n }\n}\n", + "\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the output vector\n z.resize(z_size, 0.0);\n\n // Compute the partial sums\n std::vector partial_sums(z_size, 0.0);\n for (auto const& element : x) {\n partial_sums[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n 
partial_sums[element.index] += element.value;\n }\n\n // Reduce the partial sums\n std::vector global_sums(z_size, 0.0);\n MPI_Allreduce(partial_sums.data(), global_sums.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Store the result in z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t resultSize = 0;\n for (auto const& element : x) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n for (auto const& element : y) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n\n // Create a vector to store the result on rank 0\n std::vector result(resultSize, 0);\n\n // Compute the result on each rank\n for (auto const& elementX : x) {\n result[elementX.index] += alpha * elementX.value;\n }\n for (auto const& elementY : y) {\n result[elementY.index] += elementY.value;\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recvBuffer(resultSize);\n MPI_Recv(recvBuffer.data(), resultSize, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < resultSize; j++) {\n result[j] += recvBuffer[j];\n }\n }\n z = result;\n } else {\n MPI_Send(result.data(), resultSize, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t result_size = 0;\n for (auto const& element : x) {\n result_size = std::max(result_size, element.index + 1);\n }\n for (auto const& element : y) {\n result_size = std::max(result_size, element.index + 1);\n }\n\n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(result_size, 0);\n }\n\n // Compute the partial result on each rank\n std::vector partial_result(result_size, 0);\n for (auto const& element : x) {\n partial_result[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n partial_result[element.index] += element.value;\n }\n\n // Reduce the partial results to rank 0\n MPI_Reduce(partial_result.data(), z.data(), result_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the partial sum on each rank\n for (auto const& elem : x) {\n z[elem.index] += alpha * elem.value;\n }\n for (auto const& elem : y) {\n z[elem.index] += elem.value;\n }\n\n // Reduce the partial sums to rank 0\n MPI_Reduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the number of elements in the result vector\n size_t num_elements = x.size() + y.size();\n \n // Allocate memory for the result vector\n z.resize(num_elements);\n \n // Compute the local indices of the elements in the result vector\n std::vector local_indices(num_elements);\n for (size_t i = 0; i < num_elements; ++i) {\n local_indices[i] = i;\n }\n \n // Compute the local values of the elements in the 
result vector\n std::vector local_values(num_elements);\n for (size_t i = 0; i < num_elements; ++i) {\n local_values[i] = 0.0;\n }\n \n // Compute the global indices of the elements in the result vector\n std::vector global_indices(num_elements);\n MPI_Allgatherv(&local_indices[0], local_indices.size(), MPI_UNSIGNED_LONG, &global_indices[0], nullptr, nullptr, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n \n // Compute the global values of the elements in the result vector\n std::vector global_values(num_elements);\n MPI_Allgatherv(&local_values[0], local_values.size(), MPI_DOUBLE, &global_values[0], nullptr, nullptr, MPI_DOUBLE, MPI_COMM_WORLD);\n \n // Compute the final values of the elements in the result vector\n for (size_t i = 0; i < num_elements; ++i) {\n z[global_indices[i]] = alpha * x[global_indices[i]].value + y[global_indices[i]].value;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n \n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(z_size, 0);\n }\n \n // Compute the partial sums on each rank\n for (auto const& element : x) {\n if (rank == 0) {\n z[element.index] += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (rank == 0) {\n z[element.index] += element.value;\n }\n }\n \n // Reduce the partial sums to rank 0\n MPI_Reduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t result_size = 0;\n for (auto const& element : x) {\n result_size = std::max(result_size, element.index + 1);\n }\n for (auto const& element : y) {\n result_size = std::max(result_size, element.index + 1);\n }\n\n // Create a vector to store the result on rank 0\n std::vector result(result_size, 0.0);\n\n // Compute the result on each rank\n for (auto const& element : x) {\n result[element.index] += alpha * element.value;\n }\n for (auto const& element : y) {\n result[element.index] += element.value;\n }\n\n // Gather the results from all ranks on rank 0\n MPI_Gather(result.data(), result_size, MPI_DOUBLE, z.data(), result_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n int num_elements = x.size();\n\n // Create a buffer to store the results\n std::vector z(num_elements);\n\n // Compute the results in parallel\n #pragma omp parallel for\n for (int i = 0; i < num_elements; i++) {\n z[i] = alpha * x[i].value * y[i].value;\n }\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = x.size() + y.size();\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the local size of the output vector\n size_t local_z_size = z_size / size;\n\n // Compute the starting index of the local output vector\n size_t local_z_start = rank * local_z_size;\n\n // Compute the local output vector\n for (size_t i = local_z_start; i < local_z_start + local_z_size; ++i) {\n z[i] = 0;\n }\n\n // Compute the local contribution to the output vector\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * 
x[i].value;\n }\n\n for (size_t i = 0; i < y.size(); ++i) {\n z[y[i].index] += y[i].value;\n }\n\n // Reduce the local output vectors to the global output vector\n MPI_Allreduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n\n // Only rank 0 stores the result\n if (rank == 0) {\n // Sort the output vector by index\n std::sort(z.begin(), z.end(), [](const double& a, const double& b) { return a.index < b.index; });\n\n // Remove duplicate elements from the output vector\n z.erase(std::unique(z.begin(), z.end()), z.end());\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Compute the size of the result vector\n size_t resultSize = 0;\n for (auto const& element : x) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n for (auto const& element : y) {\n resultSize = std::max(resultSize, element.index + 1);\n }\n\n // Create a vector of zeros on rank 0\n if (rank == 0) {\n z.resize(resultSize, 0);\n }\n\n // Compute the partial sums on each rank\n for (auto const& element : x) {\n if (rank == 0) {\n z[element.index] += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (rank == 0) {\n z[element.index] += element.value;\n }\n }\n\n // Reduce the partial sums to rank 0\n MPI_Allreduce(MPI_IN_PLACE, z.data(), resultSize, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a new vector z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(&z[0], z.size(), MPI::DOUBLE, &z[0], z.size(), MPI::DOUBLE, 0);\n}\n", + "\n\n // Create a new vector z with the same size as x\n z.resize(x.size());\n\n // Loop over the elements of x and y and perform the operation z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[i].index = x[i].index;\n z[i].value = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks into rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a new vector z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z = alpha*x+y\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(&z[0], z.size(), MPI::DOUBLE, &z[0], z.size(), MPI::DOUBLE, 0);\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "cuda", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors of size Nx and Ny. Store the result in z.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x or y.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\n__global__ void sparseAxpy(double alpha, const Element *x, const Element *y, double *z, size_t Nx, size_t Ny, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) sum += alpha * x[j].value;\n }\n for (size_t j = 0; j < Ny; j++) {\n if (y[j].index == i) sum += y[j].value;\n }\n z[i] = sum;\n }\n}\n", + "sparseSparseArray(1,", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) { sum += alpha * x[i].value; }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) { sum += y[i].value; }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) {\n sum += y[i].value;\n }\n }\n z[tid] = sum;\n }\n}\n", + "\n\n\tsize_t i = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (i < N) {\n\t\tz[i] = alpha * x[i].value + y[i].value;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n\tsize_t i = threadIdx.x;\n\tif (i < Nx) {\n\t\tz[x[i].index] += alpha * x[i].value;\n\t}\n\tif (i < Ny) {\n\t\tz[y[i].index] += y[i].value;\n\t}\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N) {\n\t\tz[tid].value = alpha * x[tid].value + y[tid].value;\n\t}\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n size_t i = tid;\n double sum = 0;\n for (size_t j = 0; j < Nx; j++) {\n if (x[j].index == i) {\n sum += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < Ny; j++) {\n if (y[j].index == i) {\n sum += y[j].value;\n }\n }\n z[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid].value + y[tid].value;\n }\n}\n", + 
"\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == tid) sum += alpha * x[i].value;\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == tid) sum += y[i].value;\n }\n z[tid] = sum;\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n double sum = 0;\n for (size_t i = 0; i < Nx; i++) {\n if (x[i].index == idx) {\n sum += alpha * x[i].value;\n }\n }\n for (size_t i = 0; i < Ny; i++) {\n if (y[i].index == idx) {\n sum += y[i].value;\n }\n }\n z[idx] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < Nx) {\n atomicAdd(&z[x[tid].index], alpha * x[tid].value);\n } else if (tid < Nx + Ny) {\n atomicAdd(&z[y[tid - Nx].index], y[tid - Nx].value);\n }\n}\n" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. Store the result in z.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Store the result in z on rank 0.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[{3, 1}, {5, 10}, {7, 1}, {12, -1}]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the output vector\n z.resize(z_size);\n\n // Compute the output vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the corresponding element in x\n for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n break;\n }\n }\n\n // Find the corresponding element in y\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n break;\n }\n }\n\n // Compute the output value\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the output vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buffer;\n MPI_Recv(&recv_buffer[0], recv_buffer.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_buffer.size(); ++j) {\n z[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n 
for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n }\n }\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n }\n }\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the result vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buffer;\n MPI_Recv(&recv_buffer[0], z_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < z_size; ++j) {\n z[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&z[0], z_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n z.resize(num_elements_x + num_elements_y);\n\n // Loop over the elements in x and y and add them to the result\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x + num_elements_y; i++) {\n if (i < num_elements_x) {\n z[i] += alpha * x[i].value;\n }\n if (i < num_elements_y) {\n z[i] += y[i].value;\n }\n }\n\n // Reduce the result to rank 0\n MPI_Reduce(&z[0], z.size(), MPI_DOUBLE, &z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize z on rank 0\n if (rank == 0) {\n z.resize(x.size());\n }\n\n // Compute the result in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (rank == 0) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n }\n\n // Gather the results to rank 0\n MPI_Gather(z.data(), x.size(), MPI_DOUBLE, z.data(), x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n z.resize(num_elements_x + num_elements_y);\n\n // Perform the sparse axpy operation in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x + num_elements_y; ++i) {\n // Initialize the result to 0\n z[i] = 0.0;\n\n // Add the corresponding elements from x and y\n for (size_t j = 0; j < num_elements_x; ++j) {\n if (x[j].index == i) {\n z[i] += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < num_elements_y; ++j) {\n if (y[j].index == i) {\n z[i] += y[j].value;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Initialize the output vector\n z.resize(z_size, 0.0);\n\n // Compute the partial sums\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value;\n }\n\n #pragma omp parallel for\n for (size_t i = 0; i < y.size(); ++i) {\n z[y[i].index] += y[i].value;\n }\n\n // Reduce the partial sums\n MPI_Allreduce(MPI_IN_PLACE, z.data(), z_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Create a temporary vector to store the partial results on each rank\n std::vector partial_z(z.size(), 0.0);\n\n // Compute the partial results in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < z.size(); ++i) {\n for (auto& element : x) {\n if (element.index == i) {\n partial_z[i] += 
alpha * element.value;\n }\n }\n for (auto& element : y) {\n if (element.index == i) {\n partial_z[i] += element.value;\n }\n }\n }\n\n // Reduce the partial results to rank 0 using MPI\n MPI_Reduce(partial_z.data(), z.data(), z.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the output vector\n size_t z_size = x.size() + y.size();\n\n // Allocate the output vector on rank 0\n if (rank == 0) {\n z.resize(z_size);\n }\n\n // Compute the local size of the output vector\n size_t local_z_size = (z_size + size - 1) / size;\n\n // Allocate the local output vector\n std::vector local_z(local_z_size);\n\n // Compute the local indices of the output vector\n std::vector local_z_indices(local_z_size);\n for (size_t i = 0; i < local_z_size; i++) {\n local_z_indices[i] = i + rank * local_z_size;\n }\n\n // Compute the local contributions to the output vector\n #pragma omp parallel for\n for (size_t i = 0; i < local_z_size; i++) {\n size_t index = local_z_indices[i];\n double value = 0.0;\n for (size_t j = 0; j < x.size(); j++) {\n if (x[j].index == index) {\n value += alpha * x[j].value;\n }\n }\n for (size_t j = 0; j < y.size(); j++) {\n if (y[j].index == index) {\n value += y[j].value;\n }\n }\n local_z[i] = value;\n }\n\n // Gather the local contributions to the output vector on rank 0\n MPI_Gather(local_z.data(), local_z_size, MPI_DOUBLE, z.data(), local_z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t num_elements = x.size() + y.size();\n\n // Create a vector to store the result\n std::vector result(num_elements);\n\n // Merge the two sparse vectors into the result vector\n size_t i = 0, j = 0, k = 0;\n while (i < x.size() && j < y.size()) {\n if (x[i].index < y[j].index) {\n result[k++] = x[i++];\n } else if (x[i].index > y[j].index) {\n result[k++] = y[j++];\n } else {\n result[k++] = {x[i].index, x[i++].value + y[j++].value};\n }\n }\n while (i < x.size()) {\n result[k++] = x[i++];\n }\n while (j < y.size()) {\n result[k++] = y[j++];\n }\n\n // Reduce the result vector to rank 0\n MPI_Reduce(result.data(), z.data(), num_elements, MPI_DOUBLE_INT, MPI_SUM, 0);\n}\n", + "\n\n // Initialize z to 0\n z.assign(x.size(), 0);\n\n // Compute z = alpha*x+y in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the result vector\n size_t z_size = x.size() + y.size();\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma openmp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n // Find the index of the element in x that matches the index of the current element in z\n auto it_x = std::find_if(x.begin(), x.end(), [&](Element const& element) { return element.index == i; });\n\n // Find the index of the element in y that matches the index of the current element in z\n auto it_y = std::find_if(y.begin(), y.end(), [&](Element const& element) { return element.index == i; });\n\n // Compute the value of the current element in z\n z[i] = (it_x != x.end() ? 
alpha * it_x->value : 0.0) + (it_y != y.end() ? it_y->value : 0.0);\n }\n\n // Gather the result vector on rank 0\n MPI_Gather(&z[0], z_size, MPI_DOUBLE, &z[0], z_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vectors\n size_t n = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(n);\n\n // Compute the results in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results on rank 0\n MPI_Gather(temp.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize z on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(x.size());\n }\n\n // Compute z on all ranks\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather z from all ranks to rank 0\n MPI::COMM_WORLD.Gather(z.data(), z.size(), MPI_DOUBLE, z.data(), z.size(), MPI_DOUBLE, 0);\n}\n", + "\n\n // Check if the rank is 0\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n if (rank == 0) {\n // Initialize z with zeros\n z.resize(x.size());\n for (size_t i = 0; i < z.size(); i++) {\n z[i] = 0;\n }\n }\n\n // Perform sparse axpy operation in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (rank == 0) {\n z[x[i].index] += alpha * x[i].value + y[i].value;\n }\n }\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t numElements = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(numElements);\n\n // Compute the result in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < numElements; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(temp.data(), numElements, MPI_DOUBLE, z.data(), numElements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of elements in the sparse vectors\n size_t numElements = x.size();\n\n // Create a temporary vector to store the results\n std::vector temp(numElements);\n\n // Compute the result in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < numElements; i++) {\n temp[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results on rank 0\n MPI_Gather(&temp[0], numElements, MPI_DOUBLE, &z[0], numElements, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast alpha to all ranks\n MPI_Bcast(&alpha, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the size of the output vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the output vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z.resize(z_size);\n }\n\n // Compute the output vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double value = 0.0;\n for (auto const& element : x) {\n if (element.index == i) {\n value += alpha * element.value;\n }\n }\n for (auto const& element : y) {\n if (element.index == i) {\n value += element.value;\n }\n }\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n z[i] = value;\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& elem : x) {\n z_size = std::max(z_size, elem.index + 1);\n }\n 
for (auto const& elem : y) {\n z_size = std::max(z_size, elem.index + 1);\n }\n\n // Allocate memory for the result vector\n z.resize(z_size);\n\n // Compute the result vector in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the value of x at index i\n for (auto const& elem : x) {\n if (elem.index == i) {\n x_value = elem.value;\n break;\n }\n }\n\n // Find the value of y at index i\n for (auto const& elem : y) {\n if (elem.index == i) {\n y_value = elem.value;\n break;\n }\n }\n\n // Compute the value of z at index i\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the result vector on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::vector recv_buf;\n MPI_Recv(&recv_buf[0], recv_buf.size(), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_buf.size(); ++j) {\n z[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&z[0], z.size(), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the result vector\n size_t z_size = 0;\n for (auto const& element : x) {\n z_size = std::max(z_size, element.index + 1);\n }\n for (auto const& element : y) {\n z_size = std::max(z_size, element.index + 1);\n }\n\n // Initialize the result vector\n z.resize(z_size, 0.0);\n\n // Compute the partial result on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < z_size; ++i) {\n double x_value = 0.0;\n double y_value = 0.0;\n\n // Find the value of x at index i\n for (auto const& element : x) {\n if (element.index == i) {\n x_value = element.value;\n break;\n }\n }\n\n // Find the value of y at index i\n for (auto const& element : y) {\n if (element.index == i) {\n y_value = element.value;\n break;\n }\n }\n\n // Compute the partial result\n z[i] = alpha * x_value + y_value;\n }\n\n // Gather the partial results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::vector partial_result(z_size);\n MPI_Recv(partial_result.data(), z_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < z_size; ++j) {\n z[j] += partial_result[j];\n }\n }\n } else {\n MPI_Send(z.data(), z_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of elements in x and y\n size_t num_elements_x = x.size();\n size_t num_elements_y = y.size();\n\n // Create a vector to store the result\n std::vector result(num_elements_x);\n\n // Compute z = alpha*x+y in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < num_elements_x; i++) {\n result[i] = alpha * x[i].value + y[i].value;\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(result.data(), num_elements_x, MPI_DOUBLE, z.data(), num_elements_x, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(Kokkos::View const& s, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < s.extent(0); ++j) {\n if (s(j)[0] == 'a' + i) {\n ++bins[i];\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < s.extent(0); ++j) {\n if (s(j)[0] == 'a' + i) {\n ++bins[i];\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(0, s.extent(0), [&](size_t i) {\n char first = s(i)[0];\n size_t index = first - 'a';\n ++output(index);\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[s[i][0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins[s[i][0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, s.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[s(i)[0] - 'a']++;\n });\n}\n", + "Counts, 0 0, 0,0,0, 0,0, ~:~:", + "\n\n Kokkos::parallel_for(s.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n int idx = s(i)[0] - 'a';\n if (idx >= 0 && idx < 26) {\n ++bins[idx];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(s(i)[0] - 'a')++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"firstLetterCounts\", s.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins[s[i][0] - 'a']++;\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": 
"23_histogram_first_letter_counts", + "parallelism_model": "omp", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < 26; ++i) {\n char c = 'a' + i;\n for (auto const& str : s) {\n if (str[0] == c) {\n ++counts[i];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n size_t index = static_cast(s[i][0] - 'a');\n #pragma omp atomic\n bins[index]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n size_t index = s[i][0] - 'a';\n ++ didik[index];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "serial", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. 
Store the output in `bins` array.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0116996854, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0105056804 + } + ] + }, + { + "generated_output": "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0113708826, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0101489528 + } + ] + }, + { + "generated_output": "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0116152604, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0103228962 + } + ] + }, + { + "generated_output": "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.011440698, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0105789342 + } + ] + }, + { + "generated_output": "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0116946096, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0107630386 + } + ] + }, + { + "generated_output": "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0113572728, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0105094836 + } + ] + }, + { + "generated_output": "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n ++bins[str[0] - 'a'];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0114826266, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0105718166 + } + ] + }, + { + "generated_output": "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n 
}\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0116121286, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0105544844 + } + ] + }, + { + "generated_output": "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0116760226, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0103840996 + } + ] + }, + { + "generated_output": "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0115970216, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0104726362 + } + ] + }, + { + "generated_output": "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n bins[str[0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.011995586, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.010592339 + } + ] + }, + { + "generated_output": "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0119027656, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0102989554 + } + ] + }, + { + "generated_output": "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n ++bins[str[0] - 'a'];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.011661866, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0103582866 + } + ] + }, + { + "generated_output": "\n\n for (std::string const& str : s) {\n if (str.empty()) {\n continue;\n }\n char first = str[0];\n if (first >= 'a' && first <= 'z') {\n bins[first - 'a']++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.011588991, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0098978726 + } + ] + }, + { + "generated_output": "\n\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n bins[str[0] - 'a']++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0113047302, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0105880176 + } + ] + }, + { + "generated_output": "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n 
++bins[str[0] - 'a'];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0112150832, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0102704514 + } + ] + }, + { + "generated_output": "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01127321, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0102118344 + } + ] + }, + { + "generated_output": "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0112377532, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0103958566 + } + ] + }, + { + "generated_output": "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n bins[str[0] - 'a']++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.011460801, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.010413183 + } + ] + }, + { + "generated_output": "\n\n for (auto const& str : s) {\n if (!str.empty()) {\n ++bins[str[0] - 'a'];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0116327102, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0104958488 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "hip", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\n__global__ void firstLetterCounts(const char **s, size_t N, size_t bins[26]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n char first = s[tid][0];\n atomicAdd(&bins[first - 'a'], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid < N) {\n char c = s[gid][0];\n atomicAdd(&bins[c - 'a'], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int index = s[gid][0] - 'a';\n atomicAdd(&bins[index], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n char first = s[idx][0];\n atomicAdd(&bins[first - 'a'], 1);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int bin = s[gid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n if (idx >= 0 && idx < 26) {\n atomicAdd(&bins[idx], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (size_t i = tid; i < N; i += stride) {\n bins[s[i][0] - 'a']++;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n int bin = s[tid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n bins[s[gid][0] - 'a']++;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int index = s[gid][0] - 'a';\n atomicAdd(&bins[index], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) return;\n char first = s[idx][0];\n int offset = first - 'a';\n atomicAdd(&firstCounts[offset], 1);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int bin = s[gid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = 
bid * hipBlockDim_x + tid;\n if (gid < N) {\n int idx = s[gid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of s. The result is stored in bins on rank 0.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + ": \u51a2, Synag\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2 Saxe\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2\u51a2zernezerne.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (const auto& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, 
MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet.\n std::array local_bins;\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n local_bins.fill(0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t n = s.size();\n std::array local_bins;\n local_bins.fill(0);\n for (size_t i = 0; i < n; i++) {\n if (rank == i % size) {\n local_bins[s[i][0] - 'a']++;\n }\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 
0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n std::fill(local_bins.begin(), local_bins.end(), 0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings starting with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) continue;\n ++local_bins[str[0] - 'a'];\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk_size = s.size() / size;\n int start = rank * chunk_size;\n int end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = s.size();\n }\n std::array local_bins = {};\n for (int i = start; i < end; i++) {\n local_bins[s[i][0] - 
'a']++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n ++local_bins[str[0] - 'a'];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "cuda", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\n__global__ void firstLetterCounts(const char **s, size_t N, size_t bins[26]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int firstLetter = s[idx][0] - 'a';\n atomicAdd(&bins[firstLetter], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n bins[s[tid][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int start = tid * 26 / N;\n int end = (tid + 1) * 26 / N;\n for (int i = start; i < end; i++) {\n bins[i] = 0;\n }\n for (int i = 0; i < N; i++) {\n int idx = s[i][0] - 'a';\n if (idx >= 0 && idx < 26) {\n atomicAdd(&bins[idx], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int firstLetter = s[idx][0] - 'a';\n atomicAdd(&bins[firstLetter], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n char firstLetter = s[idx][0];\n atomicAdd(&bins[firstLetter - 'a'], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int bin = s[tid][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = s[idx][0] - 'a';\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n bins[s[idx][0] - 'a']++;\n 
}\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx >= N) {\n return;\n }\n bins[s[idx][0] - 'a']++;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int idx = s[tid][0] - 'a';\n atomicAdd(&bins[idx], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of s. The result is stored in bins on rank 0.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins;\n local_bins.fill(0);\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n bins[s[i][0] - 'a']++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the first letter of its own strings\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins{};\n #pragma omp parallel for\n for 
(size_t i = 0; i < s.size(); i++) {\n local_bins[s[i][0] - 'a']++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int chunk = s.size() / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = s.size();\n }\n std::array local_bins;\n local_bins.fill(0);\n for (int i = start; i < end; i++) {\n local_bins[s[i][0] - 'a']++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins;\n local_bins.fill(0);\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n local_bins[s[i][0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank counts the first letter of its own strings\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = my_rank; i < s.size(); i += num_ranks) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the counts for a subset of the alphabet\n int start = rank * 26 / size;\n int end = (rank + 1) * 26 / size;\n\n std::array local_bins;\n local_bins.fill(0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); i++) {\n int index = s[i][0] - 'a';\n if (index >= start && index < end) {\n local_bins[index]++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the first letters in its portion of the vector\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = rank; i < s.size(); i += size) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce 
the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n char first_letter = s[i][0];\n local_bins[first_letter - 'a']++;\n }\n\n // Reduce the local bins into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(std::begin( bins ), std::end( bins ), 0);\n\n // Count the number of words starting with each letter in parallel\n #pragma openmp parallel for\n for (size_t i = 0; i < s.size(); ++i) {\n ++ bins[s[i][0] - 'a'];\n }\n\n // Gather the results on rank 0\n MPI_Gather(MPI_IN_PLACE, 26, MPI_UNSIGNED_LONG, bins.data(), 26, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the first letter of the strings in its portion of the vector\n std::array local_bins = {};\n #pragma omp parallel for\n for (size_t i = rank * s.size() / nprocs; i < (rank + 1) * s.size() / nprocs; ++i) {\n local_bins[s[i][0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings in s that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (str.empty()) {\n continue;\n }\n local_bins[str[0] - 'a']++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter\n std::array local_bins{};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of strings that start with each letter in the alphabet\n std::array local_bins = {};\n for (auto const& str : s) {\n if (!str.empty()) {\n local_bins[str[0] - 'a']++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 26, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": 
"21_histogram_bin_0-100", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(Kokkos::View const& x, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(i / 10) += (x(i) < 100) ? 1 : 0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins[x(i) / 10]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA(const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[x(i) / 10]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i / 10] += (x(i) >= 10 * i / 10 && x(i) < 10 * (i / 10 + 1));\n });\n}\n", + "\n\n Kokkos::parallel_for(bins.size(), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j] >= 10 * i && x[j] < 10 * (i + 1)) { bins[i] += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n const double val = x(i);\n update += (val >= 0 && val < 10) + (val >= 10 && val < 20) + (val >= 20 && val < 30) +\n (val >= 30 && val < 40) + (val >= 40 && val < 50) + (val >= 50 && val < 60) +\n (val >= 60 && val < 70) + (val >= 70 && val < 80) + (val >= 80 && val < 90) +\n (val >= 90 && val < 100);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n const double val = x(i);\n bins(val / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", bins.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[x(i) / 10] += 1;\n });\n}\n", + "\n\n 
Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n bins(i / 10)++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t i) {\n const double val = x(i);\n bins(val / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"binsBy10Count\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n bins(x(i) / 10) += 1;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[i / 10] += 1;\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n size_t idx = static_cast(x[i] / 10);\n ++counts[idx];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n size_t bin = static_cast(x[i] / 10);\n bins[bin]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)] += 1;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n 
bins[static_cast(x[i] / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "serial", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0138528604, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0126016998 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0134656134, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0130965632 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0135997462, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0143909354 + } + ] + }, + { + "generated_output": "\n\n std::fill(\u5803.begin(), \u5803.end(), 0);\n for (double const val : x) {\n \u5803[static_cast(val / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0136556456, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0150438622 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0135921058, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0139239266 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0132654852, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0133793834 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0135179964, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0137199866 + } + ] + }, + { + "generated_output": "\n\n for (double val : x) {\n bins[static_cast(val / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0142663388, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0138673578 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.013297287, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0141462228 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0138952402, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0138263148 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n int i = std::min((int)(v - bins.front()), (int)bins.size() - 1);\n bins[i]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0135468968, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0127476586 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0133075704, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0136491672 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.014044856, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0138141036 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0132442852, + "runs": [ + { + "did_run": 
true, + "is_valid": true, + "runtime": 0.013378979 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0140483864, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.013223502 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0132529038, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0134229512 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0145642956, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0140951006 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0140651482, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0139823524 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "hip", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use AMD HIP to compute in parallel. 
The kernel is initialized with at least as many threads as values in x.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\n__global__ void binsBy10Count(const double *x, size_t N, size_t bins[10]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipGridDim_x;\n for (size_t i = idx; i < N; i += stride) {\n bins[x[i] / 10] += 1;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipGridDim_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n int bin = (int)(x[i] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n int bin = (int)(x[i] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n int bin = x[i] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n int bin = (int)x[tid] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n int start = idx * stride;\n int end = min(start + stride, (int)N);\n for (int i = start; i < end; i++) {\n bins[(int)(x[i] / 10)]++;\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use MPI to compute in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. The result is stored in bins on rank 0.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n bins[bin]++;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv;\n MPI_Recv(recv.data(), recv.size(), MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < bins.size(); j++) {\n bins[j] += recv[j];\n }\n }\n } else {\n MPI_Send(bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of values in each bin\n for (double v : x) {\n bins[static_cast(v / 10)]++;\n }\n\n // Reduce the bins across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local range.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n bins[bin]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of values in each bin on each rank\n std::array local_count = {};\n for (auto v : x) {\n local_count[static_cast(v / 10)]++;\n }\n\n // Gather the local counts into bins on rank 0\n MPI_Gather(local_count.data(), 10, MPI_UNSIGNED_LONG, bins.data(), 10, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its subset of x\n std::array local_bins = {};\n for (double xi : x) {\n local_bins[static_cast(xi / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ":", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double xi : x) {\n int bin = static_cast(xi / 10);\n bins[bin]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), 
MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n for (double val : x) {\n int bin = static_cast(val / 10);\n if (bin < 10) {\n bins[bin]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array localBins{};\n for (double v : x) {\n localBins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(localBins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (auto v : x) {\n bins[v / 10]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] >= 0 && x[i] < 10) {\n bins[0]++;\n } else if (x[i] >= 10 && x[i] < 20) {\n bins[1]++;\n } else if (x[i] >= 20 && x[i] < 30) {\n bins[2]++;\n } else if (x[i] >= 30 && x[i] < 40) {\n bins[3]++;\n } else if (x[i] >= 40 && x[i] < 50) {\n bins[4]++;\n } else if (x[i] >= 50 && x[i] < 60) {\n bins[5]++;\n } else if (x[i] >= 60 && x[i] < 70) {\n bins[6]++;\n } else if (x[i] >= 70 && x[i] < 80) {\n bins[7]++;\n } else if (x[i] >= 80 && x[i] < 90) {\n bins[8]++;\n } else if (x[i] >= 90 && x[i] < 100) {\n bins[9]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local part of x.\n std::array localBins = {};\n for (double v : x) {\n localBins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts.\n MPI_Reduce(localBins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its subset of x\n std::array local_bins = {};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), recv_bins.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < bins.size(); j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), local_bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local range.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int 
rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double val : x) {\n int bin = static_cast(val / 10);\n if (bin < 10) {\n bins[bin]++;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array tmp;\n MPI_Recv(tmp.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; j++) {\n bins[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its own bins.\n std::array local_bins = {};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), recv_bins.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < recv_bins.size(); j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), local_bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "cuda", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use CUDA to compute in parallel. 
The kernel is initialized with at least as many threads as values in x.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\n__global__ void binsBy10Count(const double *x, size_t N, size_t bins[10]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = static_cast(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x;\nif (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int bin = (int)floor(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = x[idx] / 10;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n int bin = (int)(x[idx] / 10);\n atomicAdd(&bins[bin], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... and store the counts in `bins`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The result is stored in bins on rank 0.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of values in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of values in its local portion of x\n std::array local_bins;\n std::fill(local_bins.begin(), local_bins.end(), 0);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce local bins to global bins\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins = {};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the bin size\n const double bin_size = (x.back() - x.front()) / bins.size();\n\n // Compute the local histogram\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n const double value = x[i];\n const int bin_index = std::min(static_cast(bins.size() - 1),\n static_cast(std::floor((value - x.front()) / bin_size)));\n ++local_bins[bin_index];\n }\n\n // Reduce the local histograms to the global histogram\n MPI_Reduce(MPI_IN_PLACE, local_bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ", 10]", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (auto v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double value : x) {\n local_bins[static_cast(value / 10)]++;\n }\n\n // Reduce the local counts into the global bins array on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n int bin = static_cast(x[i] / 10);\n if (bin < 10) {\n #pragma omp atomic\n ++bins[bin];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::array local_bins = {};\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n int bin = static_cast(x[i] / 10);\n local_bins[bin]++;\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the values in its local portion of x.\n std::array local_bins{};\n for (double v : x) {\n local_bins[static_cast(v / 10)]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the counts for a subset of the bins.\n size_t start_bin = rank * bins.size() / nprocs;\n size_t end_bin = (rank + 1) * bins.size() / nprocs;\n\n std::fill(bins.begin() + start_bin, bins.begin() + end_bin, 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n size_t bin = static_cast(x[i] / 10);\n if (bin >= start_bin && bin < end_bin) {\n ++bins[bin];\n }\n }\n\n // Gather the counts from all ranks into bins on rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n int bin = static_cast(x[i]) / 10;\n #pragma omp atomic\n ++bins[bin];\n }\n\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n std::array recv;\n MPI_Recv(recv.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; ++j) {\n bins[j] += recv[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of values in its portion of x\n std::array local_bins = {0};\n #pragma omp parallel for\n for (size_t i = rank * x.size() / size; i < (rank + 1) * x.size() / size; ++i) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the local counts into the global bins on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; ++i) {\n std::array recv_bins;\n 
MPI_Recv(recv_bins.data(), 10, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 10; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 10, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of values in its local portion of x\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins = {};\n for (size_t i = rank; i < x.size(); i += nprocs) {\n local_bins[static_cast(x[i] / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x.\n std::array local_bins{};\n for (double val : x) {\n local_bins[static_cast(val / 10)]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the number of values in its local portion of x\n std::array local_bins = {0};\n #pragma omp parallel for\n for (size_t i = rank; i < x.size(); i += nprocs) {\n int bin = static_cast(x[i] / 10);\n local_bins[bin]++;\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 10, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use Kokkos to count in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(Kokkos::View const& points, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_for;\n using Kokkos::Functor;\n\n struct CountQuadrants {\n Kokkos::View bins;\n Kokkos::View points;\n\n KOKKOS_FUNCTION\n void operator()(const int i) const {\n const auto& point = points[i];\n if (point.x > 0) {\n if (point.y > 0) {\n ++bins[0];\n } else {\n ++bins[2];\n }\n } else {\n if (point.y > 0) {\n ++bins[1];\n } else {\n ++bins[3];\n }\n }\n }\n };\n\n parallel_for(Kokkos::RangePolicy(0, points.size()), CountQuadrants{bins, points});\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA (const size_t& i, size_t& update) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++update[0];\n } else {\n ++update[3];\n }\n } else {\n if (points(i)->y > 0) {\n ++update[1];\n } else {\n ++update[2];\n }\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.size(), KOKKOS_LAMBDA (const size_t i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x >= 0 && y >= 0) {\n ++bins(0);\n } else if (x < 0 && y >= 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "{x=1.5,y=1.1}", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x >= 0) {\n if (y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n const Point& p = points(i);\n if (p.x > 0) {\n if (p.y > 0) {\n ++update[0];\n } else {\n ++update[3];\n }\n } else {\n if (p.y > 0) {\n ++update[1];\n } else {\n ++update[2];\n }\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, points.size()), KOKKOS_LAMBDA(const size_t& i, size_t& update) {\n double x = points[i]->x;\n double y = points[i]->y;\n if (x >= 0 && y >= 0) {\n ++update;\n } 
else if (x < 0 && y >= 0) {\n ++update;\n } else if (x < 0 && y < 0) {\n ++update;\n } else {\n ++update;\n }\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = points(i);\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y >= 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", Kokkos::RangePolicy<>(0, points.extent(0)), KOKKOS_LAMBDA (const size_t i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x > 0 && y > 0) {\n ++bins(0);\n } else if (x < 0 && y > 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else if (x > 0 && y < 0) {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy<>(0, points.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n if (x > 0 && y > 0) {\n ++bins(0);\n } else if (x < 0 && y > 0) {\n ++bins(1);\n } else if (x < 0 && y < 0) {\n ++bins(2);\n } else if (x > 0 && y < 0) {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n if (points(i)->x > 0) {\n if (points(i)->y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (points(i)->y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double x = points(i)->x;\n double y = points(i)->y;\n size_t idx = 0;\n if (x >= 0) {\n idx += 1;\n }\n if (y >= 0) {\n idx += 2;\n }\n ++bins(idx);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuadrants\", points.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double x = points(i)->x;\n double y = points(i)->y;\n size_t bin = 0;\n if (x > 0) {\n bin += 1;\n }\n if (y > 0) {\n bin += 2;\n }\n Kokkos::atomic_fetch_add(&bins(bin), 1);\n });\n}\n", + "\n\n using exec_space = KokKos::DefaultExecutionSpace;\n using league_space = KokKos::TeamVectorRange;\n\n league_space range(exec_space{}, points.size());\n\n auto policy = KokKos::TeamVectorRange\u653f\u7b56(range);\n\n auto lambda = KOKKOS_LAMBDA (const Point point, size_t* count) {\n if (point.x > 0) {\n if (point.y > 0) {\n ++count[0];\n } else {\n ++count[3];\n }\n } else {\n if (point.y > 0) {\n ++count[1];\n } else {\n ++count[2];\n }\n }\n };\n\n exec_space exec;\n exec.parallel_reduce(\n \"count_quadrants\", policy, lambda, Rcpp::as(count));\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy>(0, points.extent(0)), KOKKOS_LAMBDA(const size_t& i) {\n const Point& p = *points(i);\n if (p.x > 0) {\n if (p.y > 0) {\n ++bins(0);\n } else {\n ++bins(3);\n }\n } else {\n if (p.y > 0) {\n ++bins(1);\n } else {\n ++bins(2);\n }\n }\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. 
The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use OpenMP to count in parallel.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const Point& p = points[i];\n if (p.x >= 0 && p.y >= 0) {\n ++bins[0];\n } else if (p.x < 0 && p.y >= 0) {\n ++bins[1];\n } else if (p.x < 0 && p.y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); i++) {\n const auto& p = points[i];\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n 
if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++oient[0];\n } else {\n ++oient[3];\n }\n } else {\n if (points[i].y > 0) {\n ++oient[1];\n } else {\n ++oient[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n double x = points[i].x;\n double y = points[i].y;\n if (x > 0) {\n if (y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (points[i].y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "serial", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. 
The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0439759834, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0410353092 + } + ] + }, + { + "generated_output": "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0440207476, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0413137934 + } + ] + }, + { + "generated_output": "\n\n for (const Point& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (p.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0436179634, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0405633048 + } + ] + }, + { + "generated_output": "\n\n for (const auto& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0442836124, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0409384768 + } + ] + }, + { + "generated_output": "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.044139525, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.040837493 + } + ] + }, + { + "generated_output": "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.043511313, + "runs": [ + { + "did_run": true, 
+ "is_valid": true, + "runtime": 0.0406376708 + } + ] + }, + { + "generated_output": "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (point.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0437841548, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0407458302 + } + ] + }, + { + "generated_output": "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0441556072, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0413348006 + } + ] + }, + { + "generated_output": "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0442608134, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.041254788 + } + ] + }, + { + "generated_output": "\n\n for (Point const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0443617894, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0411463886 + } + ] + }, + { + "generated_output": "\n\n for (Point const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.044252674, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0413150936 + } + ] + }, + { + "generated_output": "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else if (p.x < 0) {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.044686741, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0438148918 + } + ] + }, + { + "generated_output": "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n 
}\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.044199033, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0411032972 + } + ] + }, + { + "generated_output": "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0441774124, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0412301518 + } + ] + }, + { + "generated_output": "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.044186696, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0408668442 + } + ] + }, + { + "generated_output": "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else if (p.y < 0) {\n bins[3]++;\n }\n } else if (p.x < 0) {\n if (p.y > 0) {\n bins[1]++;\n } else if (p.y < 0) {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0448917266, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.044159032 + } + ] + }, + { + "generated_output": "\n\n for (auto const& pt : points) {\n if (pt.x > 0) {\n if (pt.y > 0) {\n ++bins[0];\n } else {\n ++bins[3];\n }\n } else {\n if (pt.y > 0) {\n ++bins[1];\n } else {\n ++bins[2];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.044361499, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0411441078 + } + ] + }, + { + "generated_output": "\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0441814942, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0412135188 + } + ] + }, + { + "generated_output": "(", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (auto const& p : points) {\n if (p.x > 0) {\n if (p.y > 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (p.y > 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.044254241, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0410435226 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "hip", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use AMD HIP to count in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\n__global__ void countQuadrants(const Point *points, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n while (idx < N) {\n if (points[idx].x >= 0) {\n if (points[idx].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[idx].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n idx += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n int quadrant = 0;\n if (points[gid].x > 0) {\n quadrant += 1;\n }\n if (points[gid].y > 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int quadrant = 0;\n if (points[tid].x > 0) {\n quadrant += 1;\n }\n if (points[tid].y > 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid >= N) return;\n const Point &p = points[tid];\n int quadrant = 0;\n if (p.x > 0) quadrant |= 1;\n if (p.y > 0) quadrant |= 2;\n atomicAdd(&bins[quadrant], 1);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n const Point &p = points[gid];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n Point p = points[tid];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 
0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "(4)", + "(4)", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = bid * hipBlockDim_x + tid;\n\n if (idx >= N) {\n return;\n }\n\n const Point &p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int blockId = hipBlockIdx_x;\n int blockDim = hipBlockDim_x;\n\n for (int i = tid; i < N; i += blockDim) {\n if (points[i].x > 0 && points[i].y > 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y > 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[i].x > 0 && points[i].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n const Point p = points[gid];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[i].x >= 0 && points[i].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n Point p = points[tid];\n if (p.x > 0) {\n if (p.y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n 
atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int idx = tid + bid * hipBlockDim_x;\n if (idx >= N) {\n return;\n }\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use MPI to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of points. The result is stored in bins on rank 0.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its own quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (const auto& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for 
(size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0 && point.y >= 0) {\n local_bins[0]++;\n } else if (point.x < 0 && point.y >= 0) {\n local_bins[1]++;\n } else if (point.x >= 0 && point.y < 0) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (auto const& p : points) {\n int quadrant = 0;\n if (p.x > 0) quadrant += 1;\n if (p.y > 0) quadrant += 2;\n bins[quadrant]++;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array tmp;\n MPI_Recv(tmp.data(), tmp.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(bins.data(), bins.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_bins[4] = {0};\n for (Point const& p : points) {\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y > 
0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y > 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, 
MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant.\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x >= 0) {\n if (point.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array localBins = {0, 0, 0, 0};\n for (auto const& point : points) {\n if (point.x > 0) {\n if (point.y > 0) {\n localBins[0]++;\n } else {\n localBins[3]++;\n }\n } else {\n if (point.y > 0) {\n localBins[1]++;\n } else {\n localBins[2]++;\n }\n }\n }\n\n MPI_Reduce(localBins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "cuda", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use CUDA to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\n__global__ void countQuadrants(const Point *points, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int bin = 0;\n if (points[tid].x > 0) bin += 1;\n if (points[tid].y > 0) bin += 2;\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int bin = 0;\n if (points[tid].x >= 0) {\n bin += 1;\n }\n if (points[tid].y >= 0) {\n bin += 2;\n }\n atomicAdd(&bins[bin], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n while (i < N) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[i].x < 0 && points[i].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[i].x < 0 && points[i].y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += gridDim.x * blockDim.x;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n while (idx < N) {\n if (points[idx].x >= 0 && points[idx].y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (points[idx].x < 0 && points[idx].y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (points[idx].x < 0 && points[idx].y < 0) {\n atomicAdd(&bins[2], 1);\n } else if (points[idx].x >= 0 && points[idx].y < 0) {\n atomicAdd(&bins[3], 1);\n }\n idx += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n int quadrant = 0;\n if (p.x > 0) {\n quadrant |= 1;\n }\n if (p.y > 0) {\n quadrant |= 2;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n Point p = points[idx];\n int quadrant = 0;\n if (p.x >= 0) {\n quadrant += 2;\n }\n if (p.y >= 0) {\n quadrant += 1;\n }\n atomicAdd(&bins[quadrant], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else 
{\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx >= N) {\n return;\n }\n\n const Point &p = points[idx];\n int quadrant = 0;\n if (p.x >= 0) {\n quadrant += 1;\n }\n if (p.y >= 0) {\n quadrant += 2;\n }\n atomicAdd(&bins[quadrant - 1], 1);\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[1], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n const Point p = points[tid];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n if (idx >= N) {\n return;\n }\n const Point &p = points[idx];\n if (p.x >= 0) {\n if (p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n while (i < N) {\n if (points[i].x > 0) {\n if (points[i].y > 0) {\n atomicAdd(&bins[0], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n } else {\n if (points[i].y > 0) {\n atomicAdd(&bins[1], 1);\n } else {\n atomicAdd(&bins[2], 1);\n }\n }\n i += gridDim.x * blockDim.x;\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = tid + bid * blockDim.x;\n\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n int count = 0;\n for (int i = idx; i < N; i += blockDim.x) {\n if (points[i].x > 0 && points[i].y > 0) {\n count++;\n } else if (points[i].x < 0 && points[i].y > 0) {\n count++;\n } else if (points[i].x < 0 && points[i].y < 0) {\n count++;\n } else if (points[i].x > 0 && points[i].y < 0) {\n count++;\n }\n }\n bins[idx] = count;\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n Point p = points[idx];\n if (p.x >= 0 && p.y >= 0) {\n atomicAdd(&bins[0], 1);\n } else if (p.x < 0 && p.y >= 0) {\n atomicAdd(&bins[1], 1);\n } else if (p.x < 0 && p.y < 0) {\n atomicAdd(&bins[2], 1);\n } else {\n 
atomicAdd(&bins[3], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Use MPI and OpenMP to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. The result is stored in bins on rank 0.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); i++) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to zero\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n ++bins[0];\n } else if (points[i].x < 0 && points[i].y >= 0) {\n ++bins[1];\n } else if (points[i].x < 0 && points[i].y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the vector\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n 
local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& point = points[i];\n if (point.x >= 0) {\n if (point.y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (point.y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const auto& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of points in each quadrant for this rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+:local_bins)\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n ++local_bins[0];\n } else {\n ++local_bins[3];\n }\n } else {\n if (points[i].y >= 0) {\n ++local_bins[1];\n } else {\n ++local_bins[2];\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0 && points[i].y >= 0) {\n ++local_bins[0];\n } else if (points[i].x < 0 && points[i].y >= 0) {\n ++local_bins[1];\n } else if (points[i].x < 0 && points[i].y < 0) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in each quadrant\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n Point p = points[i];\n if (p.x >= 0 && p.y >= 0) {\n local_bins[0]++;\n } else if (p.x < 0 && p.y >= 0) {\n local_bins[1]++;\n } else if (p.x < 0 && p.y < 0) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to zero\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for\n for (size_t i = 0; i < points.size(); ++i) {\n const Point& point = points[i];\n if (point.x >= 0 && point.y >= 0) {\n ++bins[0];\n } else if (point.x < 0 && point.y >= 0) {\n ++bins[1];\n } else if (point.x < 0 && point.y < 0) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of points in its local subset of points\n std::array local_bins = {0, 0, 0, 0};\n #pragma opm parallel for\n for (size_t i = rank; i < points.size(); i += size) {\n if (points[i].x > 0 && points[i].y > 0) {\n ++local_bins[0];\n } else if (points[i].x < 0 && points[i].y > 0) {\n ++local_bins[1];\n } else if (points[i].x < 0 && points[i].y 
< 0) {\n ++local_bins[2];\n } else if (points[i].x > 0 && points[i].y < 0) {\n ++local_bins[3];\n }\n }\n\n // Gather the local counts into bins on rank 0\n MPI_Gather(local_bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of points in each quadrant\n #pragma omp parallel for reduction(+: bins)\n for (size_t i = 0; i < points.size(); ++i) {\n if (points[i].x >= 0) {\n if (points[i].y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (points[i].y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the points in its portion of the image\n #pragma omp parallel for reduction(+: bins)\n for (size_t i = rank; i < points.size(); i += nprocs) {\n const Point& point = points[i];\n if (point.x >= 0) {\n if (point.y >= 0) {\n bins[0]++;\n } else {\n bins[3]++;\n }\n } else {\n if (point.y >= 0) {\n bins[1]++;\n } else {\n bins[2]++;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the image\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the points in its local portion of the array\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < points.size(); i += size) {\n const Point& p = points[i];\n if (p.x >= 0) {\n if (p.y >= 0) {\n local_bins[0]++;\n } else {\n local_bins[3]++;\n }\n } else {\n if (p.y >= 0) {\n local_bins[1]++;\n } else {\n local_bins[2]++;\n }\n }\n }\n\n // Gather the local counts into the global bins on rank 0\n MPI_Reduce(local_bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Use Kokkos to compute in parallel. Assume that Kokkos has already been initialized.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(Kokkos::View const& x, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n const size_t num_elements = x.extent(0);\n Kokkos::parallel_for(\"countQuartiles\", Kokkos::RangePolicy(0, num_elements), KOKKOS_LAMBDA(const size_t i) {\n const double frac = x(i) - floor(x(i));\n if (frac < 0.25) {\n ++quart[0];\n } else if (frac < 0.5) {\n ++quart[1];\n } else if (frac < 0.75) {\n ++quart[2];\n } else {\n ++quart[3];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n\n parallel_for(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i) {\n double frac = x(i) - floor(x(i));\n if (frac < 0.25) {\n ++quart[0];\n } else if (frac < 0.5) {\n ++quart[1];\n } else if (frac < 0.75) {\n ++quart[2];\n } else {\n ++quart[3];\n }\n });\n}\n", + "\n\n const size_t n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n auto exec_space = Kokkos::DefaultExecutionSpace::instance();\n auto policy = Kokkos::RangePolicy<>(exec_space, 0, x.extent(0));\n\n 
Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const size_t& i) {\n double frac = std::modf(x(i), &x(i));\n if (frac < 0.25)\n ++bins(0);\n else if (frac < 0.5)\n ++bins(1);\n else if (frac < 0.75)\n ++bins(2);\n else\n ++bins(3);\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double fract = std::fmod(x[i], 1.0);\n if (fract < 0.25) {\n ++bins[0];\n } else if (fract < 0.5) {\n ++bins[1];\n } else if (fract < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"countQuartiles\", x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = x(i) - std::floor(x(i));\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n double frac = std::fmod(x(i), 1.0);\n if (frac < 0.25) {\n ++bins(0);\n } else if (frac < 0.5) {\n ++bins(1);\n } else if (frac < 0.75) {\n ++bins(2);\n } else {\n ++bins(3);\n }\n });\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "<,\u51a2~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma simd\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = std::fmod(x[i], 1.0);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++(std::get<0>(std::ref(x)));\n } else if (frac < 0.5) {\n ++(std::get<1>(std::ref(x)));\n } else if (frac < 0.75) {\n ++(std::get<2>(std::ref(x)));\n } else {\n ++(std::get<3>(std::ref(x)));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n #pragma omp parallel for\n for (size_t 
i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = std::fmod(x[i], 1.0);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "serial", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). 
Store the counts in `bins`.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (double const& xi : x) {\n if (xi < 0.25) {\n ++\u0648\u064a\u0627\u062a[0];\n } else if (xi < 0.5) {\n ++\u0648\u064a\u0627\u062a[1];\n } else if (xi < 0.75) {\n ++\u0648\u064a\u0627\u062a[2];\n } else {\n ++\u0648\u064a\u0627\u062a[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0545092424, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0691394644 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0550201584, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.069597308 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0553492194, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0697421084 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0542194732, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0693915926 + } + ] + }, + { + "generated_output": "\n\n for (auto v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0542791598, + "runs": [ + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.0694024932 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0543764144, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0694307022 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.055352812, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0697670774 + } + ] + }, + { + "generated_output": "\n\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.055248249, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0694214526 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0547833104, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0696321312 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0549674008, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0693356272 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.054001567, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0692486958 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } 
else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0538839374, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.069179726 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n bins[0]++;\n else if (frac < 0.5)\n bins[1]++;\n else if (frac < 0.75)\n bins[2]++;\n else\n bins[3]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0549385064, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0692894444 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n bins[0]++;\n else if (frac < 0.5)\n bins[1]++;\n else if (frac < 0.75)\n bins[2]++;\n else\n bins[3]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0541431358, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0695712486 + } + ] + }, + { + "generated_output": "\n\n bins.fill(0);\n for (const auto &xi : x) {\n const auto frac = std::modf(xi, nullptr);\n if (frac >= 0.25 && frac < 0.5) {\n ++bins[0];\n } else if (frac >= 0.5 && frac < 0.75) {\n ++bins[1];\n } else if (frac >= 0.75 && frac < 1.0) {\n ++bins[2];\n } else if (frac == 0.0) {\n ++bins[3];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0533513998, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0689011496 + } + ] + }, + { + "generated_output": "\n\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0543147448, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0695329068 + } + ] + }, + { + "generated_output": "\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + 
"did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0549211678, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0694415888 + } + ] + }, + { + "generated_output": "\n\n for (auto const& v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0555912084, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.069747186 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "hip", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\n__global__ void countQuartiles(const double *x, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = hipGetGlobalIndex();\n if (idx >= N) return;\n\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&quartiles[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&quartiles[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&quartiles[2], 1);\n } else {\n atomicAdd(&quartiles[3], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n double frac = fmod(x[i], 1);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n 
}\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double frac = fmod(x[gid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int stride = hipBlockDim_x;\n for (size_t i = idx; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n 
double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n double f = fmod(x[idx], 1.0);\n if (f < 0.25)\n atomicAdd(&bins[0], 1);\n else if (f < 0.5)\n atomicAdd(&bins[1], 1);\n else if (f < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid >= N) {\n return;\n }\n\n double frac = x[gid] - floor(x[gid]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use MPI to compute in parallel. Assume that MPI has already been initialized.\n Every rank has a complete copy of x. The result is stored in bins on rank 0.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double d : x) {\n double frac = fmod(d, 1.0);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Count the number of doubles in each quartile.\n for (double v : x) {\n if (v < 0.25) {\n ++bins[0];\n } else if (v < 0.5) {\n ++bins[1];\n } else if (v < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Gather the counts from all ranks on rank 0.\n MPI_Gather(bins.data(), 4, MPI_SIZE_T, bins.data(), 4, MPI_SIZE_T, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), 
MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each quartile.\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n for (auto v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25)\n ++bins[0];\n else if (frac < 0.5)\n ++bins[1];\n else if (frac < 0.75)\n ++bins[2];\n else\n ++bins[3];\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of the vector\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(&recv_bins, 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(&local_bins, 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each quartile\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize the local bins\n std::array local_bins = {0, 0, 0, 0};\n\n // Count the number of doubles in each quartile\n for (double d : x) {\n double frac = fmod(d, 1.0);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local counts\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t local_bins[4] = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n MPI_Reduce(local_bins, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n size_t n = x.size();\n std::array local_bins = {0, 0, 0, 0};\n\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n\n // Each rank counts the number of elements in each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local bins to the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bins.fill(0);\n\n for (double val : x) {\n double frac = val - std::floor(val);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its portion of x that fall into each bin.\n std::array local_bins = {0, 0, 0, 0};\n for (auto xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to get the global counts.\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double xi : x) {\n double frac = xi - std::floor(xi);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n for (double v : x) {\n double frac = v - std::floor(v);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n 
} else {\n local_bins[3]++;\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(local_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 4; j++) {\n bins[j] += local_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "cuda", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\n__global__ void countQuartiles(const double *x, size_t N, size_t bins[4]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int idx = bid * blockDim.x + tid;\n\n if (idx < N) {\n double frac = x[idx] - floor(x[idx]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n 
atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double frac = x[tid] - floor(x[tid]);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double frac = fmod(x[tid], 1.0);\n if (frac < 0.25)\n atomicAdd(&bins[0], 1);\n else if (frac < 0.5)\n atomicAdd(&bins[1], 1);\n else if (frac < 0.75)\n atomicAdd(&bins[2], 1);\n else\n atomicAdd(&bins[3], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + ", 4]", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int stride = blockDim.x;\n\n for (size_t i = tid + bid * stride; i < N; i += stride) {\n double frac = fmod(x[i], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n double f = fmod(x[idx], 1.0);\n if (f < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (f < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (f < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n double frac = fmod(x[idx], 1.0);\n if (frac < 0.25) {\n atomicAdd(&bins[0], 1);\n } else if (frac < 0.5) {\n atomicAdd(&bins[1], 1);\n } else if (frac < 0.75) {\n atomicAdd(&bins[2], 1);\n } else {\n atomicAdd(&bins[3], 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": 
"mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Use MPI and OpenMP to compute in parallel. Assume that MPI has already been initialized.\n Every rank has a complete copy of x. The result is stored in bins on rank 0.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each quartile\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n size_t const rank = ::rank();\n size_t const size = x.size();\n std::fill(counts.begin(), counts.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = rank; i < size; i += ::size()) {\n double const value = x[i];\n if (value < 0.25) {\n ++counts[0];\n } else if (value < 0.5) {\n ++counts[1];\n } else if (value < 0.75) {\n ++counts[2];\n } else {\n ++counts[3];\n }\n }\n\n #pragma omp parallel sections\n {\n #pragma omp section\n {\n if (rank == 0) {\n std::fill(counts.begin(), \n counts.end(), \n 0);\n }\n }\n #pragma omp section\n {\n if (rank == 0) {\n std::transform(counts.begin(), \n counts.end(), \n counts.begin(), \n [](size_t const a, size_t const b) {\n return a + b;\n });\n }\n }\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of doubles in its portion of x that have a fractional part in each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = rank; i < x.size(); i += size) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Rank 0 gathers the local counts from all ranks and stores the result in bins\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n size_t n = x.size();\n std::array local_bins = {0, 0, 0, 0};\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each bin\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each quartile\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the counts to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize the bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of elements in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) 
{\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its local vector that fall into each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in its local copy of x that fall into each quartile.\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+:local_bins)\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Rank 0 gathers the local counts from all ranks and stores the result in bins.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; ++i) {\n std::array recv_bins;\n MPI_Recv(&recv_bins, 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(&local_bins, 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the number of elements in each quartile\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 4, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of doubles in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n bins[0]++;\n } else if (frac < 0.5) {\n bins[1]++;\n } else if (frac < 0.75) {\n bins[2]++;\n } else {\n bins[3]++;\n }\n }\n\n // Reduce the bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&nprocs);\n\n size_t n = x.size();\n std::fill(bins.begin(), bins.end(), 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Each rank counts the number of elements in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++bins[0];\n } else if (frac < 0.5) {\n ++bins[1];\n } else if (frac < 0.75) {\n ++bins[2];\n } else {\n ++bins[3];\n }\n }\n\n // Reduce the counts from all ranks to rank 0\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local counts\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n local_bins[0]++;\n } else if (frac < 0.5) {\n local_bins[1]++;\n } else if (frac < 0.75) {\n local_bins[2]++;\n } else {\n local_bins[3]++;\n }\n }\n\n // Gather the local counts into bins on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of ranks\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Count the number of doubles in each quartile on each rank\n std::array local_bins = {0, 0, 0, 0};\n #pragma omp parallel for reduction(+: local_bins)\n for (size_t i = 0; i < x.size(); ++i) {\n double frac = x[i] - std::floor(x[i]);\n if (frac < 0.25) {\n ++local_bins[0];\n } else if (frac < 0.5) {\n ++local_bins[1];\n } else if (frac < 0.75) {\n ++local_bins[2];\n } else {\n ++local_bins[3];\n }\n }\n\n // Reduce the local counts to the global counts on rank 0\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < nranks; ++i) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 4, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < 4; ++j) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 4, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use Kokkos to count in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(Kokkos::View const& image, Kokkos::View &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.size(), KOKKOS_LAMBDA (const size_t i) {\n ++bins(image(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const size_t& i, size_t& count) {\n count += Kokkos::count(image, i);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n bins[image[i]]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256),\n KOKKOS_LAMBDA(const int& i, size_t& update) {\n update += Kokkos::count(Kokkos::View(image.data(), image.size()), i);\n },\n bins);\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA (const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA (const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA(const int& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy(0, image.extent(0)), KOKKOS_LAMBDA(const int& i) {\n ++bins[image(i)];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy<>(0, 256), KOKKOS_LAMBDA (const size_t& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixelCounts\", Kokkos::RangePolicy<>(0, image.extent(0)), KOKKOS_LAMBDA (const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA (const int& i) {\n bins[i] = Kokkos::atomic_fetch_add(&bins[image[i]], 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixel_count\", Kokkos::RangePolicy<>(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(\"pixel_count\", Kokkos::RangePolicy<>(0, image.size()), KOKKOS_LAMBDA(const int& 
i) {\n ++bins[image[i]];\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i, size_t& sum) {\n sum += Kokkos::count(image, i);\n }, bins);\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA (const int& i) {\n bins[image(i)]++;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"pixelCounts\", image.extent(0), KOKKOS_LAMBDA(const int& i) {\n ++bins(image(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const int& i) {\n bins[i] = 0;\n });\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, image.size()), KOKKOS_LAMBDA(const int& i) {\n bins[image[i]]++;\n });\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, 256), KOKKOS_LAMBDA(const size_t& i, size_t& count) {\n count += Kokkos::count(Kokkos::ViewConstRange(image, image.extent(0)), i);\n }, bins);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use OpenMP to count in parallel.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < image.size(); 
i++) {\n bins[image[i]]++;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "serial", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (int pixel : image) {\n ++Counts[pixel];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0025213652, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028121404 + } + ] + }, + { + "generated_output": "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0025114848, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.002959185 + } + ] + }, + { + "generated_output": "\n\n for (auto const& pixel : image) {\n bins[pixel]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026276436, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028489796 + } + ] + }, + { + "generated_output": "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0023242022, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028446974 + } + ] + }, + { + "generated_output": "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002532871, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029555722 + } + ] + }, + { + "generated_output": "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0023946804, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027984706 + } + ] + }, + { + "generated_output": "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0024800496, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027772986 + } + ] + }, + { + "generated_output": "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026231286, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028667714 + } + ] + }, + { + "generated_output": "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0024785946, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.002743535 + } + ] + }, + { + "generated_output": "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026479834, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0030210674 + } + ] + }, + { + "generated_output": "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0024878502, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029513624 + } + ] + }, + { + "generated_output": "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0025974264, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029364042 + } + ] + }, + { + "generated_output": "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0027509774, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029454872 + } + ] + }, + { + "generated_output": "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0026262182, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0027272946 + } + ] + }, + { + "generated_output": "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002422401, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.002923019 + } + ] + }, + { + "generated_output": "\n\n for (int pixel : 
image) {\n bins[pixel]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0024561124, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028298656 + } + ] + }, + { + "generated_output": "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0025619476, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0028142804 + } + ] + }, + { + "generated_output": "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0028769262, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.002996281 + } + ] + }, + { + "generated_output": "\n\n for (int pixel : image) {\n bins[pixel]++;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002380542, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0029833282 + } + ] + } + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "hip", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use AMD HIP to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n__global__ void pixelCounts(const int *image, size_t N, size_t bins[256]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx.x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = image[idx];\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx >= N) return;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n ++bins[image[idx]];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int bin = image[idx];\n atomicAdd(&bins[bin], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) { bins[image[idx]]++; }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int stride = hipBlockDim_x * hipGridDim_x;\n for (size_t i = idx; i < N; i += stride) {\n atomicAdd(&bins[image[i]], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use MPI to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. 
The result is stored in bins on rank 0.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array localBins{};\n for (int pixel : image) {\n localBins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = localBins;\n for (int i = 1; i < size; i++) {\n std::array recvBins;\n MPI_Recv(recvBins.data(), 256, MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recvBins[j];\n }\n }\n } else {\n MPI_Send(localBins.data(), 256, MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n if (rank == 0) {\n bins = local_bins;\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::fill(bins.begin(), bins.end(), 0);\n for (int pixel : image) {\n bins[pixel]++;\n }\n\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array localBins;\n for (size_t i = 0; i < image.size(); ++i) {\n localBins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(localBins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = rank; i < image.size(); i += size) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t chunk_size = image.size() / size;\n size_t start = rank * chunk_size;\n size_t end = (rank + 1) * chunk_size;\n if (rank == size - 1) {\n end = image.size();\n }\n std::array local_bins;\n local_bins.fill(0);\n for (size_t i = start; i < end; i++) {\n local_bins[image[i]]++;\n }\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, 
&size);\n size_t n = image.size();\n size_t local_bins[256] = {0};\n for (size_t i = rank; i < n; i += size) {\n local_bins[image[i]]++;\n }\n MPI_Reduce(local_bins, bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n for (size_t i = rank; i < image.size(); i += size) {\n bins[image[i]]++;\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::array recv_bins;\n MPI_Recv(recv_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < 256; j++) {\n bins[j] += recv_bins[j];\n }\n }\n } else {\n MPI_Send(local_bins.data(), 256, MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int pixel : image) {\n local_bins[pixel]++;\n }\n\n // Reduce the local counts to the global counts on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "cuda", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use CUDA to count in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n__global__ void pixelCounts(const int *image, size_t N, size_t bins[256]) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n atomicAdd(&bins[image[idx]], 1);\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n ++bins[image[idx]];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n atomicAdd(&bins[image[idx]], 1);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n atomicAdd(&bins[image[tid]], 1);\n }\n}\n" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Use MPI and OpenMP to count in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of image. 
The result is stored in bins on rank 0.\n Example:\n \n input: image=[2, 116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to 0\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce bins across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the results from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (size_t i = rank; i < image.size(); i += nprocs) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t const n = image.size();\n size_t const chunk = (n + size - 1) / size;\n size_t const start = rank * chunk;\n size_t const end = std::min(start + chunk, n);\n std::array localBins = {};\n for (size_t i = start; i < end; ++i) {\n ++localBins[image[i]];\n }\n MPI_Reduce(localBins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels with each intensity in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int i = rank; i < image.size(); i += size) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n size_t local_bins[256] = {0};\n #pragma omp parallel for\n for (size_t i = rank * image.size() 
/ nprocs; i < (rank + 1) * image.size() / nprocs; ++i) {\n ++local_bins[image[i]];\n }\n\n // Gather the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins, bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); i++) {\n bins[image[i]]++;\n }\n\n // Reduce the bins from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize bins to all zeros\n std::fill(bins.begin(), bins.end(), 0);\n\n // Count the number of pixels in each bin\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the bins across all ranks\n MPI_Allreduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in each bin on each rank.\n std::array local_bins{};\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++local_bins[image[i]];\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + ".", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n // Use OpenMP to count in parallel.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the pixels in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the counts across all ranks\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image.\n std::array local_bins{};\n for (int i = rank; i < image.size(); i += nprocs) {\n local_bins[image[i]]++;\n }\n\n // Reduce the local bins into the global bins on rank 0.\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of pixels in image with each grayscale intensity.\n #pragma omp parallel for\n for (size_t i = 0; i < image.size(); ++i) {\n ++bins[image[i]];\n }\n\n // Reduce the results from all ranks to rank 0.\n MPI_Reduce(MPI_IN_PLACE, bins.data(), bins.size(), MPI_UNSIGNED_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank counts the pixels in its portion of the image\n std::array local_bins;\n 
local_bins.fill(0);\n #pragma omp parallel for\n for (size_t i = rank * image.size() / nprocs; i < (rank + 1) * image.size() / nprocs; ++i) {\n ++local_bins[image[i]];\n }\n\n // Reduce the local bins to the global bins on rank 0\n MPI_Reduce(local_bins.data(), bins.data(), 256, MPI_SIZE_T, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use Kokkos to parallelize the search. Assume Kokkos has already been initialized.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n const size_t n = x.extent(0);\n size_t result = n;\n Kokkos::parallel_reduce(\n RangePolicy<>(0, n),\n KOKKOS_LAMBDA(const size_t i) {\n if (x(i) % 2 == 0) {\n result = std::min(result, i);\n }\n },\n result);\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag, int>;\n auto result = Kokkos::parallel_reduce(policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, int& result) {\n if (x(i) % 2 == 0) {\n result = i;\n }\n }, result, tag_type());\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even(i) = x(i) % 2 == 0;\n });\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even(i)) {\n firstEven = i;\n }\n }, firstEven);\n return firstEven;\n}\n", + "_", + "\n\n size_t result = x.size();\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(size_t i, Kokkos::Experimental::Automic& atom) {\n if (x(i) % 2 == 0) {\n atom.fetch_min(result, i);\n }\n });\n return result;\n}\n", + "\n\n size_t result = -1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n Kokkos::fence();\n }\n },\n result);\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even(i) = (x(i) % 2 == 0);\n });\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even(i)) {\n firstEven = i;\n }\n }, firstEven);\n return firstEven;\n}\n", + "\n\n size_t result = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n Kokkos::parallel_scan(Kokkos::RangePolicy(i + 1, x.size()), KOKKOS_LAMBDA(const size_t& j, size_t& result) {\n result = 0;\n });\n }\n });\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t result = n;\n Kokkos::parallel_for(\"findFirstEven\", Kokkos::RangePolicy(0, n),\n KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) { result = i; }\n });\n return result;\n}\n", + "\n\n Kokkos::parallel_for(\"findFirstEven\", x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n Kokkos::atomic_fetch_add(&findFirstEven_result, 1);\n Kokkos::fence();\n if (findFirstEven_result == 1) {\n findFirstEven_index = i;\n Kokkos::fence();\n 
Kokkos::parallel_scan(\"findFirstEven_break\", x.extent(0),\n KOKKOS_LAMBDA(const size_t& i, const size_t& my_result) {\n if (findFirstEven_index == i) {\n return my_result + 1;\n }\n return my_result;\n },\n 0);\n Kokkos::fence();\n Kokkos::parallel_for(\"findFirstEven_break\", x.extent(0),\n KOKKOS_LAMBDA(const size_t& i) {\n if (findFirstEven_index == i) {\n Kokkos::fence();\n Kokkos::break_();\n }\n });\n }\n }\n });\n return findFirstEven_index;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t result = n;\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n\n return result;\n}\n", + "&", + "\n\n size_t N = x.extent(0);\n size_t result = N;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n size_t result = Kokkos::parallel_reduce(\n policy_type(0, x.size()), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n },\n x.size());\n return result;\n}\n", + "\n\n size_t N = x.size();\n size_t result = N;\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n });\n\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View even = Kokkos::View(n);\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(size_t i) {\n even[i] = (x[i] % 2 == 0);\n });\n\n size_t firstEven = n;\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(size_t i, size_t& firstEven) {\n if (even[i] && i < firstEven) {\n firstEven = i;\n }\n }, firstEven);\n\n return firstEven;\n}\n", + "\n\n size_t result = Kokkos::parallel_reduce(\n x.extent(0), KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n return;\n }\n },\n result);\n return result;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, size_t& result) {\n if (x(i) % 2 == 0) {\n result = i;\n result = Kokkos::atomic_fetch_add(&result, 1);\n }\n },\n result);\n return result;\n}\n", + "\n\n size_t n = x.extent(0);\n size_t firstEven = n;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n firstEven = i;\n return;\n }\n });\n return firstEven;\n}\n", + "\n\n size_t N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n if (x(i) % 2 == 0) {\n Kokkos::atomic_fetch_max(&findFirstEven_result, i);\n }\n });\n return findFirstEven_result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use OpenMP to parallelize the search.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 
0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n#pragma omp cancel for\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t result = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0 && i < result) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t first_even = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n return first_even;\n}\n", + "\n\n size_t firstEven = -1;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (firstEven == -1) {\n firstEven = i;\n }\n }\n }\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t firstEven = n;\n#pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0 && i < firstEven) {\n#pragma omp critical\n firstEven = i;\n }\n }\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t firstEven = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < firstEven) {\n firstEven = i;\n }\n }\n }\n return firstEven;\n}\n", + "\n\n size_t n = x.size();\n size_t first_even = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n return first_even;\n}\n", + "\n\n size_t index = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n#pragma omp critical\n if (i < index) {\n index = i;\n }\n }\n }\n return index;\n}\n", + "\n\n size_t result = x.size();\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t n = x.size();\n size_t result = n;\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < result) {\n result = i;\n }\n 
}\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "serial", + "prompt": "/* Return the index of the first even number in the vector x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002490928, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000236629 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002618288, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000232069 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002482392, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002297422 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002502408, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000232826 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00025723, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00023805 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002653472, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002311068 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + 
"best_sequential_runtime": 0.0002488564, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002367456 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000282498, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002408032 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002566148, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002316178 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00024853, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002296404 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002482052, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002297548 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002710822, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002494736 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002490326, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002386528 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002483476, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002313692 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, 
+ "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002523448, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002322394 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002546414, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.00025732 + } + ] + }, + { + "generated_output": "\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000259931, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002389998 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002591436, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002383146 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002486256, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002305898 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0002657822, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002407088 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "hip", + "prompt": "/* Find the index of the first even number in the vector x. Store it in firstEvenIndex.\n Use AMD HIP to parallelize the search. 
The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\n__global__ void findFirstEven(const int *x, size_t N, size_t *firstEvenIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int myIndex = hipGetGlobalIndex();\n if (myIndex < N && x[myIndex] % 2 == 0) {\n *firstEvenIndex = myIndex;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N && x[i] % 2 == 0) {\n *firstEvenIndex = i;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int blockId = hipBlockIdx_x;\n int blockDim = hipBlockDim_x;\n int gridDim = hipGridDim_x;\n\n *firstEvenIndex = N;\n\n if (x[tid] % 2 == 0) {\n atomicMin(firstEvenIndex, tid);\n }\n\n __syncthreads();\n\n if (tid == 0) {\n if (*firstEvenIndex == N) {\n *firstEvenIndex = -1;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N && x[gid] % 2 == 0) {\n *firstEvenIndex = gid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = block.thread.x;\n if (tid == 0) {\n *firstEvenIndex = N;\n }\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = min(*firstEvenIndex, tid);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the index of the first even number in the vector x.\n Use MPI to parallelize the search. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t n = x.size();\n size_t result = n;\n for (size_t i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, &result, 1, MPI::INT, MPI::MINLOC);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = -1;\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result = -1;\n MPI_Allgather(&result, 1, MPI_INT, &global_result, 1, MPI_INT, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n for (size_t i = 0; i < x.size(); ++i) {\n if (global_result != -1 && global_result <= i) {\n return global_result;\n }\n }\n }\n\n return -1;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return n;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t n = x.size();\n size_t firstEven = n;\n for (size_t i = rank; i < n; i += size) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &firstEven, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t result = -1;\n for (size_t i = rank; i < x.size(); i += size) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n size_t firstEven = x.size();\n for (size_t i = myRank; i < x.size(); i += numRanks) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = -1;\n for (size_t i = my_rank; i < x.size(); i += my_size) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t firstEven = -1;\n for (size_t i = rank; i < 
x.size(); i += size) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return firstEven;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_result;\n } else {\n return x.size();\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_first_even = -1;\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n local_first_even = i;\n break;\n }\n }\n int global_first_even = -1;\n MPI_Allreduce(&local_first_even, &global_first_even, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_first_even;\n}\n", + "\n\n size_t result = x.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n size_t n = x.size();\n size_t firstEven = n;\n for (size_t i = myRank; i < n; i += MPI::procs()) {\n if (x[i] % 2 == 0) {\n firstEven = i;\n break;\n }\n }\n MPI_Allreduce(&firstEven, &firstEven, 1, MPI_MINLOC, MPI_INT);\n return firstEven;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank searches for the first even number in its portion of the vector.\n size_t my_first_even = x.size();\n for (size_t i = my_rank; i < x.size(); i += num_ranks) {\n if (x[i] % 2 == 0) {\n my_first_even = i;\n break;\n }\n }\n\n // Gather the results from all ranks.\n std::vector first_evens(num_ranks);\n MPI_Gather(&my_first_even, 1, MPI_UNSIGNED_LONG, first_evens.data(), 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Find the smallest index among the gathered results.\n size_t first_even = x.size();\n if (my_rank == 0) {\n for (size_t i = 0; i < num_ranks; ++i) {\n if (first_evens[i] < first_even) {\n first_even = first_evens[i];\n }\n }\n }\n\n return first_even;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t n = x.size();\n size_t local_first_even = n;\n for (size_t i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] % 2 == 0) {\n local_first_even = i;\n break;\n }\n }\n size_t first_even;\n MPI_Allreduce(&local_first_even, &first_even, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return first_even;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n size_t result = x.size();\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n\n int global_result;\n MPI_Allreduce(&result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_result;\n } else {\n return x.size();\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = x.size();\n for (size_t i = my_rank; i < x.size(); i += my_size) {\n if (x[i] % 
2 == 0) {\n result = i;\n break;\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_SIZE_T, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n size_t local_min = x.size();\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n local_min = i;\n break;\n }\n }\n size_t global_min;\n MPI_Allreduce(&local_min, &global_min, 1, MPI_UNSIGNED_LONG, MPI_MIN,\n MPI_COMM_WORLD);\n return global_min;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "cuda", + "prompt": "/* Find the index of the first even number in the vector x. Store it in firstEvenIndex.\n Use CUDA to parallelize the search. The kernel is launched with at least as many threads as values in x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\n__global__ void findFirstEven(const int *x, size_t N, size_t *firstEvenIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n }\n}\n", + "(", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (x[tid] % 2 == 0) {\n *firstEvenIndex = tid;\n return;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "mpi+omp", + "prompt": "#include 
\n#include \n\n/* Return the index of the first even number in the vector x.\n Use MPI and OpenMP to parallelize the search. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n #pragma openmp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma openmp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n int result = -1;\n #pragma omp parallel for reduction(min:result)\n for (int i = 0; i < n; ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (i < result) {\n result = i;\n }\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_INT, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (result == -1) {\n result = i;\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_result = -1;\n int global_result = -1;\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (my_result == -1) {\n my_result = i;\n }\n }\n }\n }\n\n MPI_Allreduce(&my_result, &global_result, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n return global_result;\n}\n", + "\n\n int my_rank, n_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);\n\n size_t first_even = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (i < first_even) {\n first_even = i;\n }\n }\n }\n }\n\n if (my_rank == 0) {\n MPI_Reduce(&first_even, &first_even, 1, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(&first_even, nullptr, 1, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n\n return first_even;\n}\n", + "\n size_t result = -1;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int my_result = -1;\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n my_result = i;\n break;\n }\n }\n if (my_result != -1) {\n #pragma omp critical\n {\n if 
(result == -1 || my_result < result) {\n result = my_result;\n }\n }\n }\n }\n MPI_Bcast(&result, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t result = -1;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n {\n if (result == -1) {\n result = i;\n }\n }\n break;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "_", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n #pragma omp cancel for\n }\n }\n\n MPI_Allreduce(&result, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n size_t first_even = x.size();\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n first_even = i;\n break;\n }\n }\n }\n\n MPI_Allreduce(&first_even, &first_even, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return first_even;\n } else {\n return x.size();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t firstEven = -1;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (firstEven == -1) {\n firstEven = i;\n }\n }\n }\n\n if (rank == 0) {\n MPI_Bcast(&firstEven, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n\n return firstEven;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n break;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n #pragma omp critical\n if (i < result) {\n result = i;\n }\n }\n }\n return result;\n}\n", + "\n\n size_t result = x.size();\n #pragma omp parallel\n {\n size_t local_result = x.size();\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n local_result = i;\n break;\n }\n }\n #pragma omp critical\n result = std::min(result, local_result);\n }\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n size_t result = x.size();\n #pragma omp parallel for reduction(min : result)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] % 2 == 0) {\n result = i;\n }\n }\n\n if (my_rank == 0) {\n return result;\n } else {\n return x.size();\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": 
"cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use Kokkos to search in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(Kokkos::View const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto contains_lambda = KOKKOS_LAMBDA(const int& v) { return v == target; };\n return any(policy_t(x.extent(0)), contains_lambda, x);\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n auto contains = Kokkos::parallel_reduce(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) {\n contains |= (x(i) == target);\n },\n false);\n return contains;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto contains_target = Kokkos::scan(\n policy_type(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, const bool& contains_target) {\n return contains_target || (x(i) == target);\n },\n false, tag_type());\n\n return contains_target;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, false);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n Kokkos::Scan(policy_type(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& result) {\n if (x(i) == target) { result = 1; }\n }, tag_type());\n\n return Kokkos::Max(Kokkos::View(1)) != 0;\n}\n", + "\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n\n return found;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n auto contains_lambda = KOKKOS_LAMBDA(const int& value) { return value == target; };\n return 
Kokkos::any(policy_type(0, x.extent(0)), contains_lambda, x);\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto result = Kokkos::parallel_reduce(\n policy_type(0, x.size()), KOKKOS_LAMBDA(const int& i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n },\n false, tag_type());\n return result;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n auto contains = Kokkos::scan(policy_type(0, x.extent(0)), KOKKOS_LAMBDA(const int& i, const bool& contains_so_far) {\n return contains_so_far || (x(i) == target);\n }, false, tag_type());\n\n return contains;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(\n x.extent(0), KOKKOS_LAMBDA(const int& i, bool& found) {\n if (x(i) == target) { found = true; }\n },\n found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n", + "\n\n bool found = false;\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, bool& found) {\n if (x(i) == target) {\n found = true;\n }\n }, found);\n return found;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use OpenMP to search in parallel.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n#pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n 
bool found = false;\n #pragma omp parallel for shared(found)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n #pragma omp critical\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n return found;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n return found;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "serial", + "prompt": "/* Return true if the vector x contains the value `target`. 
Return false otherwise.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000491503, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000253014 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0004850912, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002515274 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0004879424, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002520384 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0004868686, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002678582 + } + ] + }, + { + "generated_output": "<", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000480436, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002613876 + } + ] + }, + { + "generated_output": "&", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0004828548, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002594904 + } + ] + }, + 
{ + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0004767512, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002605002 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0004940618, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002531906 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0004741384, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000255651 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0004847222, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002537796 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0004929034, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.000260378 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0004906756, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002515594 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0004889278, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002529118 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0004918514, + "runs": [ + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.0002627608 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000484187, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002569814 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.000494082, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002515454 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0004728856, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002542348 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n return true;\n }\n }\n return false;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0004920862, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0002605242 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "hip", + "prompt": "/* Set `found` to true if the vector x contains the value `target`. Set it to false otherwise.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\n__global__ void contains(const int *x, size_t N, int target, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + ", 8, 2, 6, 4, 6", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int offset = bid * hipBlockDim_x + tid;\n if (offset < N && x[offset] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n bool found = false;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (found) {\n return true;\n }\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n found = true;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (found) {\n return true;\n }\n }\n }\n return false;\n}\n", + "\n\n int result = 0;\n for (int i = 0; i < x.size(); i++) {\n result |= (x[i] == target);\n }\n int global_result;\n MPI_Allreduce(&result, 1, MPI_INT, &global_result, 1, MPI_INT, MPI_BOR, MPI_COMM_WORLD);\n if (global_result) {\n return true;\n }\n return false;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int has_target = (std::find(x.begin(), x.end(), target) != x.end());\n int global_has_target;\n MPI_Allreduce(&has_target, &global_has_target, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n return global_has_target;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return global_found;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_contains = 0;\n for (int i = rank; i < n; i += size) {\n if (x[i] == target) {\n local_contains = 
1;\n break;\n }\n }\n\n int global_contains;\n MPI_Allreduce(&local_contains, &global_contains, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n\n return global_contains;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n bool found = false;\n for (int i = rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n MPI::COMM_WORLD.Allreduce(MPI::IN_PLACE, &found, 1, MPI::BOOL, MPI::LOR);\n return found;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n int local_size = x.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n bool local_contains = false;\n for (int i = start; i < end; i++) {\n if (x[i] == target) {\n local_contains = true;\n break;\n }\n }\n\n bool global_contains;\n MPI_Allreduce(&local_contains, &global_contains, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_contains;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n bool found = false;\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n for (int i = rank; i < x.size(); i += size) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int n = x.size();\n int found = 0;\n MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n bool found = std::find(x.begin(), x.end(), target) != x.end();\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "cuda", + "prompt": "/* Set `found` to true if the vector x contains the value `target`. Set it to false otherwise.\n Use CUDA to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\n__global__ void contains(const int *x, size_t N, int target, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n found[0] = true;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && x[tid] == target) {\n *found = true;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return true if the vector x contains the value `target`. Return false otherwise.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n int local_result = 0;\n #pragma omp parallel for reduction(+:local_result)\n for (int i = rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n local_result = 1;\n }\n }\n int global_result;\n MPI_Allreduce(&local_result, &global_result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (rank == 0) {\n return global_result > 0;\n }\n return false;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "(", + "\n\n int rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n bool result = false;\n #pragma omp parallel shared(result)\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n result = true;\n break;\n }\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n MPI_Reduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, 
MPI_COMM_WORLD);\n if (rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n } else {\n return false;\n }\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int found = 0;\n #pragma omp parallel for reduction(+:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = 1;\n break;\n }\n }\n int result;\n MPI_Allreduce(&found, &result, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int found = 0;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == target) {\n found = 1;\n }\n }\n int result;\n MPI_Allreduce(&found, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return result;\n }\n return false;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n bool found = false;\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n }\n }\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_found;\n }\n return false;\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int n = x.size();\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n MPI_Allreduce(&found, &found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_result = 0;\n #pragma omp parallel for reduction(||:local_result)\n for (int i = my_rank; i < n; i += MPI::COMM_WORLD.Get_size()) {\n if (x[i] == target) {\n local_result = 1;\n }\n }\n int result;\n MPI_Reduce(&local_result, &result, 1, MPI_INT, MPI_LOR, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for reduction(||:found)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found, &result, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n bool found = false;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == target) {\n found = true;\n break;\n }\n }\n\n MPI_Allreduce(&found, &found, 1, MPI_BOOL, MPI_LOR, MPI_COMM_WORLD);\n return found;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n int local_size = n / 
MPI_COMM_SIZE;\n int start = my_rank * local_size;\n int end = start + local_size;\n bool found = false;\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (x[i] == target) {\n found = true;\n }\n }\n bool global_found;\n MPI_Allreduce(&found, &global_found, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return global_found;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use Kokkos to search in parallel. Assume Kokkos has already been initialized.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(Kokkos::View const& x, Kokkos::View const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View found_x(\"found_x\", x.extent(0));\n Kokkos::View found_y(\"found_y\", y.extent(0));\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n found_x(i) = (x(i) == val);\n });\n\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(const int& i) {\n found_y(i) = (y(i) == val);\n });\n\n Kokkos::View found_both(\"found_both\", 1);\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(const int& i, bool& found_both) {\n found_both |= (found_x(i) && found_y(i));\n });\n\n return !found_both(0);\n}\n", + "\n\n return Kok \u86cb\u7cd5::parallel_reduce(Kok\u86cb\u7cd5::make_range(x.size()),\n Kok\u86cb\u7cd5::plus(),\n [x, y, val](int i) { return (x[i] == val) ^ (y[i] == val); },\n 0);\n}\n", + "\n\n // Create a Kokkos::View of the union of the two input vectors\n Kokkos::View union_xy(\"union_xy\", x.extent(0) + y.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(int i) {\n union_xy(i) = x(i);\n });\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(int i) {\n union_xy(x.extent(0) + i) = y(i);\n });\n\n // Create a Kokkos::View of the intersection of the two input vectors\n Kokkos::View intersection_xy(\"intersection_xy\", x.extent(0) + y.extent(0));\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(int i) {\n for (int j = 0; j < y.extent(0); j++) {\n if (x(i) == y(j)) {\n intersection_xy(i) = x(i);\n break;\n }\n }\n });\n\n // Check if the value is in the union but not in the intersection\n bool found_in_union = false;\n bool found_in_intersection = false;\n Kokkos::parallel_for(union_xy.extent(0), KOKKOS_LAMBDA(int i) {\n if (union_xy(i) == val) {\n found_in_union = true;\n }\n if (intersection_xy(i) == val) {\n found_in_intersection = true;\n }\n });\n\n return found_in_union && !found_in_intersection;\n}\n", + "\n\n using namespace Kokkos;\n const int N = x.extent(0);\n const int M = y.extent(0);\n\n auto lambda = KOKKOS_LAMBDA(const int i) {\n bool found_x = false;\n bool found_y = false;\n for (int j = 0; j < N; ++j) {\n if (x(j) == val) { found_x = true; }\n }\n for (int j = 0; j < M; ++j) {\n if (y(j) == val) { found_y = true; }\n }\n return found_x != found_y;\n };\n\n bool result = parallel_reduce(N, lambda, (bool)false);\n return result;\n}\n", + "\n\n using namespace Kokkos;\n\n auto lambda = KOKKOS_LAMBDA(int i) {\n bool found_x = false;\n bool found_y = false;\n for (int j = 0; j < x.extent(0); ++j) {\n if (x(j) == val) {\n found_x = true;\n }\n }\n for (int j = 0; j < y.extent(0); ++j) {\n if (y(j) == val) 
{\n found_y = true;\n }\n }\n return found_x != found_y;\n };\n\n return parallel_reduce(x.extent(0), lambda, (bool)false);\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = find(x, val) != x.end();\n auto y_contains = find(y, val) != y.end();\n return x_contains ^ y_contains;\n}\n", + "\n\n // Create a Kokkos::View of the result\n Kokkos::View result(\"result\", x.extent(0));\n\n // Use Kokkos to search for `val` in `x` and `y` in parallel\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n result(i) = (x(i) == val) ^ (y(i) == val);\n });\n\n // Reduce the result to a single boolean value\n bool xorContains = Kokkos::reduce(result, Kokkos::And(), true);\n\n return xorContains;\n}\n", + "\n\n Kokkos::View result(\"result\", x.size());\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int i) {\n result[i] = (x[i] == val) ^ (y[i] == val);\n });\n return Kokkos::any(result);\n}\n", + "\n bool found_x = false;\n bool found_y = false;\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (x(i) == val) {\n found_x = true;\n }\n });\n\n Kokkos::parallel_for(y.extent(0), KOKKOS_LAMBDA(const int& i) {\n if (y(i) == val) {\n found_y = true;\n }\n });\n\n return found_x != found_y;\n}\n", + "\n\n Kokkos::View result(\"result\", x.size());\n Kokkos::parallel_for(x.size(), KOKKOS_LAMBDA(int i) {\n result[i] = (x[i] == val) ^ (y[i] == val);\n });\n return Kokkos::any(result);\n}\n", + "\n\n Kokkos::View found_x(\"found_x\", x.extent(0));\n Kokkos::View found_y(\"found_y\", y.extent(0));\n\n Kokkos::parallel_for(\"xorContains_x\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n found_x(i) = (x(i) == val);\n });\n\n Kokkos::parallel_for(\"xorContains_y\", y.extent(0), KOKKOS_LAMBDA(const int i) {\n found_y(i) = (y(i) == val);\n });\n\n bool found_in_x = false;\n bool found_in_y = false;\n\n Kokkos::parallel_reduce(\"xorContains_reduce\", x.extent(0), KOKKOS_LAMBDA(const int i, bool& update) {\n update = update || found_x(i);\n }, found_in_x);\n\n Kokkos::parallel_reduce(\"xorContains_reduce\", y.extent(0), KOKKOS_LAMBDA(const int i, bool& update) {\n update = update || found_y(i);\n }, found_in_y);\n\n return (found_in_x != found_in_y);\n}\n", + "\n\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = Kokkos::parallel_reduce(\n policy_t(0, x.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) { contains |= (x(i) == val); },\n false);\n auto y_contains = Kokkos::parallel_reduce(\n policy_t(0, y.extent(0)),\n KOKKOS_LAMBDA(const int& i, bool& contains) { contains |= (y(i) == val); },\n false);\n return x_contains ^ y_contains;\n}\n", + "\n\n using namespace Kokkos;\n\n auto lambda = KOKKOS_LAMBDA(const int& x_i, const int& y_i) {\n return (x_i == val) ^ (y_i == val);\n };\n\n auto result = reduce(x.extent(0), lambda, (bool)false);\n\n return result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n auto x_contains = find(x, val, policy_type(0, x.extent(0)));\n auto y_contains = find(y, val, policy_type(0, y.extent(0)));\n return (x_contains != x.extent(0)) ^ (y_contains != y.extent(0));\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i, int& contains) {\n contains |= (x(i) == val);\n }, (int)0);\n\n auto y_contains = Kokkos::parallel_reduce(Kokkos::RangePolicy(0, y.extent(0)), KOKKOS_LAMBDA(int i, int& contains) {\n contains |= (y(i) == val);\n }, (int)0);\n\n return 
(x_contains && !y_contains) || (!x_contains && y_contains);\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int sum) {\n return sum + (x(i) == val);\n }, Kokkos::Sum(0));\n\n auto y_contains = Kokkos::parallel_reduce(y.extent(0), KOKKOS_LAMBDA(int i, int sum) {\n return sum + (y(i) == val);\n }, Kokkos::Sum(0));\n\n return x_contains != y_contains;\n}\n", + "\n\n // Create a Kokkos::View to store the results of the search\n Kokkos::View result(\"result\", x.extent(0));\n\n // Perform the search in parallel\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i) {\n result(i) = (x(i) == val) ^ (y(i) == val);\n });\n\n // Reduce the results to a single boolean value\n bool found = false;\n Kokkos::parallel_reduce(\"xorContains_reduce\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA (const int i, bool& found) {\n found |= result(i);\n }, found);\n\n return found;\n}\n", + "\n\n // Create a Kokkos view for the result\n Kokkos::View result(\"result\", x.extent(0));\n\n // Parallel for loop to check if val is in only one of x or y\n Kokkos::parallel_for(\"xorContains\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n bool found_in_x = false;\n bool found_in_y = false;\n for (int j = 0; j < x.extent(0); ++j) {\n if (x(j) == val) {\n found_in_x = true;\n }\n if (y(j) == val) {\n found_in_y = true;\n }\n }\n result(i) = (found_in_x != found_in_y);\n });\n\n // Reduce the result to a single boolean value\n bool final_result = Kokkos::reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(bool a, bool b) { return a || b; }, result);\n\n return final_result;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n auto x_contains = find(x, val, policy_t(0, x.extent(0)));\n auto y_contains = find(y, val, policy_t(0, y.extent(0)));\n return (x_contains != x.extent(0)) ^ (y_contains != y.extent(0));\n}\n", + "\n\n auto x_contains = Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& contains) {\n if (x(i) == val) { contains = 1; }\n }, Kokkos::Scan::plus());\n\n auto y_contains = Kokkos::parallel_reduce(y.extent(0), KOKKOS_LAMBDA(int i, int& contains) {\n if (y(i) == val) { contains = 1; }\n }, Kokkos::Scan::plus());\n\n return (x_contains == 1) ^ (y_contains == 1);\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use OpenMP to search in parallel.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n #pragma omp critical\n {\n found_x = true;\n }\n }\n }\n #pragma omp for\n for (int i = 0; i < 
y.size(); ++i) {\n if (y[i] == val) {\n #pragma omp critical\n {\n found_y = true;\n }\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n #pragma omp critical\n found_x = true;\n }\n }\n\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n #pragma omp critical\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n #pragma omp critical\n {\n found_x = true;\n }\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n #pragma omp critical\n {\n found_y = true;\n }\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n return x_count != y_count;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return (x_count + y_count) == 1;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == 
val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool in_x = false;\n bool in_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n in_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n in_y = true;\n }\n }\n }\n\n return in_x ^ in_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return x_count != y_count;\n}\n", + "\n\n bool found = false;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n } else {\n found = false;\n #pragma omp cancel for\n }\n }\n }\n }\n #pragma omp parallel for\n for (size_t i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n #pragma omp critical\n {\n if (!found) {\n found = true;\n } else {\n found = false;\n #pragma omp cancel for\n }\n }\n }\n }\n return found;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n #pragma omp for reduction(+:y_count)\n for (size_t i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n return x_count != y_count;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n return x_count != y_count;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x != found_y;\n}\n", + "\n\n bool found_x = false;\n bool found_y = false;\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n #pragma omp for\n for (size_t i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n return found_x ^ found_y;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "serial", + "prompt": "/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n return 
std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.51926e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.01332e-05 + } + ] + }, + { + "generated_output": "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) !=\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.54792e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 9.963e-06 + } + ] + }, + { + "generated_output": "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.41008e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 9.7888e-06 + } + ] + }, + { + "generated_output": "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.45032e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.01446e-05 + } + ] + }, + { + "generated_output": "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.5407e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.0099e-05 + } + ] + }, + { + "generated_output": "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) !=\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.39942e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 9.9526e-06 + } + ] + }, + { + "generated_output": "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.4702e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 9.8986e-06 + } + ] + }, + { + "generated_output": "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.50626e-05, + "runs": [ + { + "did_run": true, + "is_valid": 
true, + "runtime": 1.03132e-05 + } + ] + }, + { + "generated_output": "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.55992e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.09246e-05 + } + ] + }, + { + "generated_output": "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.51326e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.04878e-05 + } + ] + }, + { + "generated_output": "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.4772e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.0029e-05 + } + ] + }, + { + "generated_output": "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.51546e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.01952e-05 + } + ] + }, + { + "generated_output": "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.44494e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.04496e-05 + } + ] + }, + { + "generated_output": "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.4375e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.07744e-05 + } + ] + }, + { + "generated_output": "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.56916e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.06622e-05 + } + ] + }, + { + "generated_output": "\n\n bool in_x = false;\n bool in_y = false;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n in_x = true;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n in_y = true;\n }\n }\n return in_x != in_y;\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.66816e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0017356564 + } + ] + }, + { + "generated_output": "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^\n std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.4343e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.04756e-05 + } + ] + }, + { + "generated_output": "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^\n (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.66152e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.01248e-05 + } + ] + }, + { + "generated_output": "\n\n return (std::find(x.begin(), x.end(), val) != x.end()) ^ (std::find(y.begin(), y.end(), val) != y.end());\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.36498e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.0129e-05 + } + ] + }, + { + "generated_output": "\n\n return std::find(x.begin(), x.end(), val) != x.end() ^ std::find(y.begin(), y.end(), val) != y.end();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 1.5343e-05, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 1.0576e-05 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "hip", + "prompt": "/* Set `found` to true if `val` is only in one of vectors x or y.\n Set it to false if it is in both or neither.\n Use AMD HIP to search in parallel. 
The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\n__global__ void xorContains(const int *x, const int *y, size_t N, int val, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int block = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = tid + block * blockDim;\n int end = min(start + blockDim, N);\n bool found_x = false;\n bool found_y = false;\n for (int i = start; i < end; i++) {\n found_x |= (x[i] == val);\n found_y |= (y[i] == val);\n }\n found[tid] = found_x ^ found_y;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n *found = found_x ^ found_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipGridSize_x) {\n if (x[i] == val) found_x = true;\n if (y[i] == val) found_y = true;\n }\n found[tid] = found_x ^ found_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool x_contains = x[tid] == val;\n bool y_contains = y[tid] == val;\n found[tid] = x_contains ^ y_contains;\n }\n}\n", + ".", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool x_has = (x[tid] == val);\n bool y_has = (y[tid] == val);\n found[tid] = (x_has ^ y_has);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int block_size = hipBlockDim_x;\n int block_id = hipBlockIdx_x;\n int i = block_id * block_size + tid;\n\n if (i < N) {\n bool x_contains = x[i] == val;\n bool y_contains = y[i] == val;\n found[i] = x_contains ^ y_contains;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n *found = inX ^ inY;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n bool found_x = (x[i] == val);\n bool found_y = (y[i] == val);\n found[i] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool found_x = (x[tid] == val);\n bool found_y = (y[tid] == val);\n found[tid] = (found_x ^ found_y);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int idx = tid + N;\n if (idx < 2 * N) {\n *found = (x[tid] == val) || (y[idx - N] == val);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = tid; i < N; i += hipBlockDim_x) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n found[tid] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n *found = (x[gid] == val) ^ (y[gid] == val);\n }\n}\n", 
+ "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = (x_has_val != y_has_val);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool found_x = (x[tid] == val);\n bool found_y = (y[tid] == val);\n found[tid] = (found_x != found_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = (in_x != in_y);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n *found = (x[gid] == val) ^ (y[gid] == val);\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. Return the result on rank 0.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int myRank, nRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &nRanks);\n\n bool xContains = std::find(x.begin(), x.end(), val) != x.end();\n bool yContains = std::find(y.begin(), y.end(), val) != y.end();\n\n int xContainsInt = xContains ? 1 : 0;\n int yContainsInt = yContains ? 
1 : 0;\n\n int xorContainsInt = xContainsInt ^ yContainsInt;\n\n int globalXorContainsInt;\n MPI_Reduce(&xorContainsInt, &globalXorContainsInt, 1, MPI_INT, MPI_LOR, 0, MPI_COMM_WORLD);\n\n bool globalXorContains = globalXorContainsInt != 0;\n\n if (myRank == 0) {\n return globalXorContains;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int sum;\n MPI_Allreduce(&x_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return sum == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int total_count;\n MPI_Allreduce(&x_count, &total_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &total_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return total_count == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_contains = std::find(x.begin(), x.end(), val) != x.end();\n int y_contains = std::find(y.begin(), y.end(), val) != y.end();\n\n int result = x_contains ^ y_contains;\n 
MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int sum;\n MPI_Allreduce(&x_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return sum == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n\n int global_x_count;\n int global_y_count;\n\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) x_count++;\n }\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) y_count++;\n }\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (global_x_count == 0 && global_y_count > 0) return true;\n if (global_x_count > 0 && global_y_count == 0) return true;\n return false;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) x_count++;\n if (y[i] == val) y_count++;\n }\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) == 1;\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n\n int x_count_sum;\n int y_count_sum;\n\n MPI_Allreduce(&x_count, &x_count_sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &y_count_sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (x_count_sum + y_count_sum) % 2 == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int global_x_count;\n int global_y_count;\n\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return (global_x_count == 1 || global_y_count == 1);\n}\n", + "\n\n int x_count = std::count(x.begin(), x.end(), val);\n int y_count = std::count(y.begin(), y.end(), val);\n int global_x_count, global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, 
&global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return (global_x_count + global_y_count) % 2 == 1;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "cuda", + "prompt": "/* Set `found` to true if `val` is only in one of vectors x or y.\n Set it to false if it is in both or neither.\n Use CUDA to search in parallel. The kernel is launched with at least N threads.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\n__global__ void xorContains(const int *x, const int *y, size_t N, int val, bool *found) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n bool x_has = false;\n bool y_has = false;\n for (int i = 0; i < N; i++) {\n if (x[i] == val) x_has = true;\n if (y[i] == val) y_has = true;\n }\n found[tid] = x_has != y_has;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *found = (*found) ^ (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = (x[tid] == val);\n bool inY = (y[tid] == val);\n found[tid] = (inX ^ inY);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n *found = *found || (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *found = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = x_has_val ^ y_has_val;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = in_x ^ in_y;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = x[tid] == val;\n bool in_y = y[tid] == val;\n found[tid] = in_x ^ in_y;\n }\n}\n", + 
"=cernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecerne,cernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecernecerne)cerne)cerne)cerne)", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = x_has_val ^ y_has_val;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool in_x = (x[tid] == val);\n bool in_y = (y[tid] == val);\n found[tid] = (in_x ^ in_y);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool x_has_val = (x[tid] == val);\n bool y_has_val = (y[tid] == val);\n found[tid] = (x_has_val != y_has_val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n found[tid] = (x[tid] == val) ^ (y[tid] == val);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool inX = x[tid] == val;\n bool inY = y[tid] == val;\n found[tid] = inX ^ inY;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n bool found_x = false;\n bool found_y = false;\n for (int i = 0; i < N; i++) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n found[tid] = (found_x != found_y);\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Return the result on rank 0.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int xor_count = x_count + y_count;\n\n int result;\n MPI_Allreduce(&xor_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count;\n int global_y_count;\n MPI_Allreduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (global_x_count == 1 || global_y_count == 1) {\n return true;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count;\n int global_y_count;\n MPI_Reduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_x_count != global_y_count;\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int result;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < n; i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int j = 0; j < m; j++) {\n if (y[j] == val) {\n y_count++;\n }\n }\n }\n\n int result = x_count + y_count;\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n return result == 1;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel for reduction(+:x_count)\n for (int i = 0; i < n; i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = 0; i < m; i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result = x_count + y_count;\n bool ans = (result == 1);\n\n 
MPI_Reduce(&ans, &ans, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n return ans;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int local_x_count = 0;\n int local_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n local_x_count++;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n local_y_count++;\n }\n }\n #pragma omp critical\n {\n x_count += local_x_count;\n y_count += local_y_count;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &x_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(MPI_IN_PLACE, &y_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n if (my_rank == 0) {\n return (x_count == 1 || y_count == 1);\n }\n return false;\n}\n", + "\n\n int n = x.size();\n int m = y.size();\n int x_count = 0;\n int y_count = 0;\n\n#pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int nt = omp_get_num_threads();\n for (int i = tid; i < n; i += nt) {\n if (x[i] == val) {\n x_count++;\n }\n }\n for (int i = tid; i < m; i += nt) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int x_count_global;\n int y_count_global;\n MPI_Allreduce(&x_count, &x_count_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &y_count_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (x_count_global == 1 || y_count_global == 1) {\n return true;\n } else {\n return false;\n }\n}\n", + "\n\n int n = x.size();\n int n_threads = omp_get_num_threads();\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel for\n for (int i = rank; i < n; i += n_threads) {\n if (x[i] == val) {\n found_x = true;\n }\n if (y[i] == val) {\n found_y = true;\n }\n }\n\n bool result;\n MPI_Allreduce(&found_x, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n MPI_Allreduce(&found_y, &result, 1, MPI_C_BOOL, MPI_XOR, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank searches for `val` in its portion of x and y.\n int my_start = my_rank * n / nprocs;\n int my_end = (my_rank + 1) * n / nprocs;\n bool found_in_x = false;\n bool found_in_y = false;\n #pragma omp parallel for\n for (int i = my_start; i < my_end; i++) {\n if (x[i] == val) {\n found_in_x = true;\n }\n if (y[i] == val) {\n found_in_y = true;\n }\n }\n\n // Reduce the results from all ranks to rank 0.\n bool result;\n MPI_Reduce(&found_in_x, &result, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n MPI_Reduce(&found_in_y, &result, 1, MPI_C_BOOL, MPI_LOR, 0, MPI_COMM_WORLD);\n\n // Rank 0 returns the final result.\n if (my_rank == 0) {\n return result != (found_in_x && found_in_y);\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); ++i) {\n if (x[i] == val) {\n ++x_count;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); ++i) {\n if (y[i] == val) {\n ++y_count;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n 
x_count++;\n }\n }\n\n #pragma omp for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n MPI_Allreduce(&y_count, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int count_x = 0;\n int count_y = 0;\n\n #pragma omp parallel for reduction(+:count_x)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n count_x++;\n }\n }\n\n #pragma omp parallel for reduction(+:count_y)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n count_y++;\n }\n }\n\n int result = count_x + count_y;\n bool ans = (result == 1);\n\n MPI_Allreduce(&ans, &ans, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD);\n\n return ans;\n}\n", + "\n\n int x_count = 0, y_count = 0;\n #pragma omp parallel\n {\n int my_x_count = 0, my_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) my_x_count++;\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) my_y_count++;\n }\n #pragma omp critical\n {\n x_count += my_x_count;\n y_count += my_y_count;\n }\n }\n int result;\n MPI_Reduce(&x_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n if (result == 1) return true;\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel for reduction(+:x_count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result = x_count + y_count;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result == 1;\n }\n return false;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n #pragma omp parallel reduction(+:x_count, y_count)\n {\n int tid = omp_get_thread_num();\n int nthreads = omp_get_num_threads();\n for (int i = tid; i < x.size(); i += nthreads) {\n if (x[i] == val) {\n x_count++;\n }\n if (y[i] == val) {\n y_count++;\n }\n }\n }\n\n int global_x_count, global_y_count;\n MPI_Reduce(&x_count, &global_x_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &global_y_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return (global_x_count == 1 || global_y_count == 1);\n }\n return false;\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int my_size;\n MPI_Comm_size(MPI_COMM_WORLD, &my_size);\n\n bool found_x = false;\n bool found_y = false;\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n found_x = true;\n }\n }\n\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n found_y = true;\n }\n }\n }\n\n bool result = false;\n MPI_Allreduce(&found_x, &result, 1, MPI_C_BOOL, MPI_BXOR, MPI_COMM_WORLD);\n MPI_Allreduce(&found_y, &result, 1, MPI_C_BOOL, MPI_BXOR, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return result;\n } else {\n return false;\n }\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n #pragma omp parallel for 
reduction(+:x_count)\n for (int i = rank; i < x.size(); i += nprocs) {\n if (x[i] == val) {\n x_count++;\n }\n }\n\n #pragma omp parallel for reduction(+:y_count)\n for (int i = rank; i < y.size(); i += nprocs) {\n if (y[i] == val) {\n y_count++;\n }\n }\n\n int result;\n MPI_Reduce(&x_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n MPI_Reduce(&y_count, &result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return result == 1;\n }\n return false;\n}\n", + "\n\n int x_contains = 0;\n int y_contains = 0;\n\n #pragma omp parallel\n {\n #pragma omp for reduction(+:x_contains)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n x_contains = 1;\n }\n }\n\n #pragma omp for reduction(+:y_contains)\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n y_contains = 1;\n }\n }\n }\n\n int result = 0;\n MPI_Allreduce(&x_contains, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n MPI_Allreduce(&y_contains, &result, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);\n\n return result == 1;\n}\n", + "\n\n int x_count = 0;\n int y_count = 0;\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n #pragma omp parallel\n {\n int local_x_count = 0;\n int local_y_count = 0;\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == val) {\n local_x_count++;\n }\n }\n #pragma omp for\n for (int i = 0; i < y.size(); i++) {\n if (y[i] == val) {\n local_y_count++;\n }\n }\n #pragma omp critical\n {\n x_count += local_x_count;\n y_count += local_y_count;\n }\n }\n if (my_rank == 0) {\n return x_count != y_count;\n }\n return false;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Book {\n const char *title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use Kokkos to search in parallel. 
Assume Kokkos is already initialized.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(Kokkos::View const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using policy_t = Kokkos::RangePolicy;\n using range_t = Kokkos::Range;\n\n size_t result = books.size();\n Kokkos::parallel_reduce(range_t(0, books.size()), KOKKOS_LAMBDA(size_t i, size_t& result) {\n if (books(i)->pages < 100) {\n result = i;\n }\n });\n return result;\n}\n", + "\n\n size_t last_short_book_index = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_short_book_index = books.size() - 1;\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& last_short_book_index) {\n if (books[i]->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_for(books.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t result = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& result) {\n if (books(i)->pages < 100) {\n result = i;\n }\n });\n return result;\n}\n", + "\n\n size_t last_short_book = books.size() - 1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book) {\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n });\n return last_short_book;\n}\n", + "\n\n Kokkos::View lastShortBook(\"lastShortBook\", books.extent(0));\n Kokkos::parallel_for(\"findLastShortBook\", books.extent(0), KOKKOS_LAMBDA(const size_t i) {\n if (books(i).pages < 100) {\n lastShortBook(i) = i;\n }\n });\n size_t result = books.extent(0);\n for (size_t i = books.extent(0) - 1; i >= 0; i--) {\n if (lastShortBook(i) != 0) {\n result = lastShortBook(i);\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t last_short_book = books.size();\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& last) {\n if (books[i].pages < 100) {\n last = i;\n }\n }, last_short_book);\n return last_short_book;\n}\n", + "\n\n size_t last_idx = books.size();\n Kokkos::parallel_for(books.size(), KOKKOS_LAMBDA(const size_t& i) {\n if (books[i]->pages < 100) {\n last_idx = i;\n }\n });\n return last_idx;\n}\n", + "\n\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const int i, const size_t& last) {\n if (books(i).pages < 100) {\n return i;\n }\n return last;\n }, 0);\n}\n", + "\n\n size_t last_index = books.size() - 1;\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books(i).pages >= 100) {\n last_index = i - 1;\n }\n }\n return last_index;\n}\n", + "\n\n size_t last_short_book = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last) {\n if (books(i)->pages < 100) {\n last = i;\n }\n }, last_short_book);\n return last_short_book;\n}\n", + "\n\n size_t last_short_book = books.size();\n Kokkos::parallel_reduce(books.size(), KOKKOS_LAMBDA(size_t i, size_t& last_short_book) 
{\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n });\n return last_short_book;\n}\n", + "\n\n size_t last_short_book = books.size();\n\n Kokkos::parallel_reduce(books.extent(0), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book) {\n if (books(i)->pages < 100) {\n last_short_book = i;\n }\n }, last_short_book);\n\n return last_short_book;\n}\n", + "\n\n size_t lastShortBook = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& lastShortBook) {\n if (books[i]->pages < 100) {\n lastShortBook = i;\n }\n });\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBook = books.size() - 1;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.size()), KOKKOS_LAMBDA(const size_t& i, size_t& lastShortBook) {\n if (books(i)->pages < 100) {\n lastShortBook = i;\n }\n });\n return lastShortBook;\n}\n", + "\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n });\n return last_short_book_index;\n}\n", + "\n\n size_t last_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_index) {\n if (books(i)->pages < 100) {\n last_index = i;\n }\n });\n return last_index;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n using l_type = long;\n\n auto last_short_book = Kokkos::parallel_scan(\n policy_type(0, books.size()),\n KOKKOS_LAMBDA(const size_t& i, l_type& last) {\n if (books[i]->pages < 100) {\n last = i;\n }\n },\n last_short_book);\n\n return last_short_book;\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n size_t last_short_book_index = 0;\n Kokkos::parallel_scan(policy_type(0, books.extent(0)), KOKKOS_LAMBDA(const size_t& i, size_t& last_short_book_index) {\n if (books(i)->pages < 100) {\n last_short_book_index = i;\n }\n }, tag_type());\n\n return last_short_book_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use OpenMP to search in parallel.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for 
(size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t last_short_book_index = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n last_short_book_index = i;\n }\n }\n return last_short_book_index;\n}\n", + "\n\n size_t last_short = books.size();\n #pragma omp parallel for reduction(max:last_short)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n last_short = i;\n }\n }\n return last_short;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n#pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "_", + "\n\n size_t last_idx = books.size() - 1;\n#pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n last_idx = i - 1;\n break;\n }\n }\n return last_idx;\n}\n", + "\n\n size_t last_short_book_index = books.size() - 1;\n#pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages >= 100) {\n#pragma omp critical\n last_short_book_index = i - 1;\n }\n }\n return last_short_book_index;\n}\n", + "\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t lastShortBookIndex = 0;\n#pragma omp parallel for reduction(max : lastShortBookIndex)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBookIndex = i;\n }\n }\n return lastShortBookIndex;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n" + ] + }, + { + "problem_type": "search", + 
"language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "serial", + "prompt": "struct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0031193062, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038556264 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0031236206, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038543736 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0031194184, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038504424 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0030988602, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038399868 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0031147878, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038562752 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.003108987, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038567766 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) 
{\n return i;\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0031492066, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038856108 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0031249532, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038785934 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0031283332, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038452124 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0031088406, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038481382 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0030922378, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038312984 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.003127835, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038841436 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.003099419, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038336046 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0031477098, + 
"runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.003876784 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.003140993, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038650476 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0031333948, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038819438 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.003122791, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038490718 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0031192102, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.003866248 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return -1;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0031448546, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038613608 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = books.size() - 1; i >= 0; --i) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.003096796, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0038347868 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "hip", + "prompt": "struct Book {\n const char* title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Store the result in lastShortBookIndex.\n Use AMD HIP to search in parallel. 
The kernel is launched with one thread for every book element.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\n__global__ void findLastShortBook(const Book *books, size_t N, size_t *lastShortBookIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use MPI to search in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of books. 
Return the result on rank 0.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return lastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "_", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = -1;\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return lastShortBook;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n lastShortBook = i - 1;\n break;\n }\n }\n\n size_t globalLastShortBook;\n MPI_Allreduce(&lastShortBook, &globalLastShortBook, 1, MPI_UNSIGNED_LONG, MPI_MAX, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return globalLastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n for (size_t i = result + 1; i < books.size(); i++) {\n if (books[i].pages >= 100) {\n result = i - 1;\n break;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = 0;\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n int globalResult;\n MPI_Allreduce(&result, &globalResult, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);\n return globalResult;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = 
books.size();\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n if (result == books.size()) {\n result = -1;\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n MPI_Allreduce(&result, &result, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n MPI_Allreduce(&result, &result, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "cuda", + "prompt": "struct Book {\n const char* title;\n int pages;\n};\n\n/* Find the index of the last Book item in the vector books where Book.pages is less than 100.\n Store the result in lastShortBookIndex.\n Use CUDA to search in parallel. The kernel is launched with one thread for every book element.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\n__global__ void findLastShortBook(const Book *books, size_t N, size_t *lastShortBookIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n if (books[idx].pages < 100) {\n *lastShortBookIndex = idx;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && books[tid].pages < 
100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid >= N) return;\n\tif (books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (tid < N && books[tid].pages < 100) {\n\t\t*lastShortBookIndex = tid;\n\t}\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N && books[tid].pages < 100) {\n *lastShortBookIndex = tid;\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx >= N) {\n return;\n }\n if (books[idx].pages < 100) {\n *lastShortBookIndex = idx;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n Use MPI and OpenMP to search in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of books. Return the result on rank 0.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t result = 0;\n #pragma omp parallel for reduction(max:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = 0;\n #pragma omp parallel for reduction(max:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t lastShortBook = books.size();\n #pragma omp parallel for reduction(min: lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n\n if (rank == 0) {\n return lastShortBook;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for reduction(max:lastShortBook)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n return lastShortBook;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size() - 1;\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages >= 
100) {\n result = i - 1;\n }\n }\n return result;\n}\n", + "\n\n size_t lastShortBook = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n lastShortBook = i;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &lastShortBook, 1, MPI_UNSIGNED_LONG, MPI_MAX, MPI_COMM_WORLD);\n return lastShortBook;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n size_t local_result = books.size();\n\n #pragma omp parallel reduction(min: local_result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n local_result = i;\n }\n }\n\n MPI_Allreduce(&local_result, &result, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);\n\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min: result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "\n\n int myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n size_t result = books.size();\n #pragma omp parallel for reduction(min: result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n if (myRank == 0) {\n return result;\n }\n return books.size();\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n MPI_Allreduce(MPI_IN_PLACE, &result, 1, MPI_UNSIGNED_LONG, MPI_MIN, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n size_t result = books.size();\n size_t rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); i++) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n\n if (rank == 0) {\n return result;\n } else {\n return books.size();\n }\n}\n", + "\n\n size_t result = books.size();\n #pragma omp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n return result;\n}\n", + "_", + "\n\n size_t result = books.size() - 1;\n size_t myRank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n #pragma openmp parallel for reduction(min:result)\n for (size_t i = 0; i < books.size(); ++i) {\n if (books[i].pages < 100) {\n result = i;\n }\n }\n MPI_Allgather(&result, 1, MPI_UNSIGNED_LONG, &result, 1, MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n if (myRank == 0) {\n result = books.size() - 1;\n }\n return result;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use Kokkos to search in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(Kokkos::View const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n auto min_distance = Kokkos::Max();\n size_t min_index = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n },\n min_distance, min_index);\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n size_t N = x.extent(0);\n size_t closest_idx = 0;\n double closest_diff = std::abs(x(0) - M_PI);\n Kokkos::parallel_for(\n policy_type(0, N), KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n });\n return closest_idx;\n}\n", + "\n\n const double pi = M_PI;\n Kokkos::View closest_distance = Kokkos::View(1);\n Kokkos::View closest_index = Kokkos::View(1);\n closest_distance[0] = std::numeric_limits::max();\n closest_index[0] = 0;\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double distance = std::abs(x[i] - pi);\n if (distance < closest_distance[0]) {\n closest_distance[0] = distance;\n closest_index[0] = i;\n }\n });\n\n return closest_index[0];\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n size_t closest_index = 0;\n double closest_distance = std::abs(x(0) - M_PI);\n Kokkos::parallel_reduce(policy_t(0, x.size()), KOKKOS_LAMBDA(size_t i, double& closest_distance, size_t& closest_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }, closest_distance, closest_index);\n return closest_index;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n const double pi = M_PI;\n size_t N = x.extent(0);\n size_t closest_idx = 0;\n double min_dist = std::abs(x(0) - pi);\n\n Kokkos::parallel_for(\n policy_t(0, N), KOKKOS_LAMBDA(const size_t& i) {\n double dist = std::abs(x(i) - pi);\n if (dist < min_dist) {\n closest_idx = i;\n min_dist = dist;\n }\n });\n\n return closest_idx;\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::Functor;\n using Kokkos::RangePolicy;\n using Kokkos::TaskPolicy;\n using Kokkos::Work;\n\n struct findToPiFunctor {\n Kokkos::View const x;\n double min_diff;\n int min_index;\n\n findToPiFunctor(Kokkos::View const x) : x(x), min_diff(std::numeric_limits::max()), min_index(-1) {}\n\n KOKKOS_INLINE_FUNCTION\n void operator()(const Kokkos::Work& work) const {\n const int i = work.league_rank();\n const double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n };\n\n findToPiFunctor f(x);\n\n Kokkos::parallel_for(RangePolicy<>(0, x.size()), f);\n\n return f.min_index;\n}\n", + "\n\n size_t n = x.extent(0);\n Kokkos::View min_diff = Kokkos::View(1);\n Kokkos::View min_index = Kokkos::View(1);\n\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff(0) || (diff == min_diff(0) && x(i) > x(min_index(0)))) {\n min_diff(0) = diff;\n min_index(0) = i;\n }\n });\n\n return min_index(0);\n}\n", + "\n\n using namespace Kokkos;\n\n auto closest_idx = 0;\n auto min_diff 
= std::numeric_limits::max();\n auto pi = M_PI;\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& closest_idx) {\n auto diff = std::abs(x(i) - pi);\n if (diff < min_diff) {\n min_diff = diff;\n closest_idx = i;\n }\n }, min_diff, closest_idx);\n\n return closest_idx;\n}\n", + "\n\n using ExecutionSpace = Kokkos::DefaultExecutionSpace;\n using Policy = Kokkos::RangePolicy;\n\n auto min_distance = Kokkos::Max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(Policy(0, x.size()), KOKKOS_LAMBDA(const size_t& i, double& min_distance_local) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < min_distance_local) {\n min_distance_local = distance;\n min_index = i;\n }\n }, min_distance);\n\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n using namespace std;\n\n const double pi = M_PI;\n const size_t n = x.extent(0);\n double min_diff = numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(n, KOKKOS_LAMBDA(const size_t& i, double& min_diff_local, size_t& min_idx_local) {\n double diff = abs(x(i) - pi);\n if (diff < min_diff_local) {\n min_diff_local = diff;\n min_idx_local = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n\n size_t N = x.extent(0);\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& min_idx) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n\n size_t N = x.extent(0);\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(N, KOKKOS_LAMBDA(size_t i, double& min_diff, size_t& min_idx) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }, min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x(0) - M_PI);\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < closest_diff) {\n closest_idx = i;\n closest_diff = diff;\n }\n });\n\n return closest_idx;\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n auto min_distance = std::numeric_limits::max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(policy_t(0, x.size()), KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n }, min_distance, min_index);\n\n return min_index;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_distance = std::numeric_limits::max();\n size_t min_index = 0;\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.size()),\n KOKKOS_LAMBDA(const size_t& i, double& min_distance, size_t& min_index) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < min_distance) {\n min_distance = distance;\n min_index = i;\n }\n },\n min_distance, min_index);\n\n return min_index;\n}\n", + "\n\n using Kokkos::parallel_reduce;\n using Kokkos::value_type;\n\n size_t N = x.size();\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const size_t& i, value_type& min_diff_local, value_type& min_idx_local) {\n double diff = std::abs(x(i) - M_PI);\n if (diff < min_diff_local) 
{\n min_diff_local = diff;\n min_idx_local = i;\n }\n },\n min_diff, min_idx);\n\n return min_idx;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n size_t n = x.extent(0);\n size_t closest_index = 0;\n double closest_distance = std::abs(x(0) - M_PI);\n\n Kokkos::parallel_for(\n policy_type(0, n), KOKKOS_LAMBDA(const size_t& i) {\n double distance = std::abs(x(i) - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n });\n\n return closest_index;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Reducer = Kokkos::Min;\n\n auto min_dist = Reducer(x.data(), x.size());\n double min_val = min_dist.template reduce();\n\n auto dist_from_pi = Reducer(x.data(), x.size(),\n [min_val](double& a, const double& b) {\n a = std::min(a, std::abs(b - M_PI));\n });\n double closest_dist = dist_from_pi.template reduce();\n\n auto closest_idx = Reducer(x.data(), x.size(),\n [closest_dist, min_val](size_t& a, const double& b) {\n if (std::abs(b - M_PI) == closest_dist &&\n std::abs(b - M_PI) < std::abs(min_val - M_PI)) {\n a = a + 1;\n }\n });\n size_t idx = closest_idx.template reduce();\n\n return idx - 1;\n}\n", + "\n\n using namespace Kokkos;\n\n auto min_dist = std::numeric_limits::max();\n size_t min_idx = 0;\n\n Kokkos::parallel_reduce(x.extent(0), KOKKOS_LAMBDA(size_t i, size_t& min_idx, double& min_dist) {\n double dist = std::abs(x(i) - M_PI);\n if (dist < min_dist) {\n min_dist = dist;\n min_idx = i;\n }\n }, min_idx, min_dist);\n\n return min_idx;\n}\n", + "\n\n auto closest = Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(size_t i, size_t closest) {\n auto diff = std::abs(x(i) - M_PI);\n if (closest == -1 || diff < std::abs(x(closest) - M_PI)) {\n closest = i;\n }\n return closest;\n },\n -1);\n return closest;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use OpenMP to search in parallel.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n#pragma omp critical\n {\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n\n #pragma omp parallel for reduction(min:min_diff) reduction(min:min_idx)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n\n return min_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::numeric_limits::max();\n\n#pragma omp parallel for\n 
for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n#pragma omp critical\n {\n if (diff < closest_diff) {\n closest_idx = i;\n closest_diff = diff;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_dist = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double dist = std::abs(x[i] - M_PI);\n if (dist < closest_dist) {\n #pragma omp critical\n {\n if (dist < closest_dist) {\n closest_dist = dist;\n closest_idx = i;\n }\n }\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n#pragma for reduction(min:closest_diff)\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n // Find the index of the value in the vector x that is closest to the math constant PI\n // Use M_PI for the value of PI\n // Use OpenMP to search in parallel\n\n // Get the number of threads to use\n int num_threads = omp_get_num_threads();\n\n // Create a reduction variable to store the index of the closest value\n int closest_index = 0;\n\n // Create a reduction variable to store the smallest difference between a value in x and PI\n double smallest_difference = std::numeric_limits::max();\n\n // Parallel for loop to search for the closest value\n #pragma omp parallel for reduction(min : smallest_difference) reduction(min : closest_index)\n for (int i = 0; i < static_cast(x.size()); i++) {\n // Calculate the difference between the current value and PI\n double difference = std::abs(x[i] - M_PI);\n\n // If the difference is smaller than the smallest difference found so far, update the closest index and smallest difference\n if (difference < smallest_difference) {\n closest_index = i;\n smallest_difference = difference;\n }\n }\n\n // Return the index of the closest value\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n\n#pragma omp parallel for reduction(min : min_diff) reduction(min : min_index)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n#pragma omp parallel for reduction(min : min_diff) reduction(min : min_idx)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp 
parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n #pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double min_diff = std::abs(x[0] - M_PI);\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n min_diff = diff;\n closest_index = i;\n }\n }\n }\n }\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n }\n }\n\n return min_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n #pragma omp critical\n {\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n }\n }\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n#pragma omp critical\n {\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double min_diff = std::abs(x[0] - M_PI);\n\n#pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n#pragma omp critical\n {\n if (diff < min_diff) {\n closest_index = i;\n min_diff = diff;\n }\n }\n }\n }\n\n return closest_index;\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "serial", + "prompt": "/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t 
findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001542415, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001533634 + } + ] + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015668988, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015822116 + } + ] + }, + { + "generated_output": "\n\n double minDiff = std::numeric_limits::max();\n size_t minDiffIndex = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n return minDiffIndex;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015411782, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015631778 + } + ] + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015677484, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015652996 + } + ] + }, + { + "generated_output": ": 2", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015525718, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001557012 + } + ] + }, + { + "generated_output": "\n\n double min_dist = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double dist 
= std::abs(x[i] - M_PI);\n if (dist < min_dist) {\n min_dist = dist;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015721966, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015514338 + } + ] + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015719888, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015594888 + } + ] + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00156626, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015693272 + } + ] + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015551166, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015991054 + } + ] + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015608574, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015676904 + } + ] + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015530664, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015731206 + } + ] + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < 
x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015281938, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015496706 + } + ] + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015874998, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015806848 + } + ] + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n return min_diff_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.001557437, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015687364 + } + ] + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015748798, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015851194 + } + ] + }, + { + "generated_output": "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015497784, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.001577599 + } + ] + }, + { + "generated_output": "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015673476, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015468714 + } + ] + }, + { + "generated_output": "\n\n double min_diff 
= std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015411886, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0015576294 + } + ] + }, + { + "generated_output": "\n\n double closest = std::abs(x[0] - M_PI);\n size_t closest_index = 0;\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0015593386, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0016208202 + } + ] + } + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "hip", + "prompt": "/* Find the index of the value in the vector x that is closest to the math constant PI. Store the index in closestToPiIndex.\n Use M_PI for the value of PI.\n Use AMD HIP to search in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\n__global__ void findClosestToPi(const double *x, size_t N, size_t *closestToPiIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = get_global_id(0);\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n double minDiff = abs(x[tid] - M_PI);\n int minDiffIndex = tid;\n\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n double diff = abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n\n if (tid == 0) {\n *closestToPiIndex = minDiffIndex;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n double diff = fabs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < minDiff) {\n minDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "losest", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int 
gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double diff = abs(x[gid] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = gid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (diff < abs(x[closestToPiIndex[0]] - M_PI)) {\n closestToPiIndex[0] = tid;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n double diff = abs(x[gid] - M_PI);\n if (gid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = gid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double diff = fabs(x[gid] - M_PI);\n if (gid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = gid;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use MPI to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n double closest = std::numeric_limits::max();\n size_t closest_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n\n // Reduce the closest values to rank 0\n MPI_Reduce(&closest_index, &closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return closest_index;\n } else {\n return 0;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n size_t closest_index = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_index = i;\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(size);\n MPI_Gather(&closest_index, 1, MPI_UNSIGNED_LONG, closest_indices.data(), 1, MPI_UNSIGNED_LONG, 0);\n\n // Find the overall closest index\n size_t overall_closest_index = 0;\n double overall_closest_diff = std::abs(x[closest_indices[0]] - M_PI);\n for (size_t i = 1; i < size; ++i) {\n double diff = std::abs(x[closest_indices[i]] - M_PI);\n if (diff < overall_closest_diff) {\n overall_closest_diff = diff;\n overall_closest_index = closest_indices[i];\n }\n }\n\n // Return the overall closest index\n return overall_closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n return min_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_diff_index = 0;\n\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n\n int global_min_diff_index;\n MPI_Allreduce(&min_diff_index, &global_min_diff_index, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n\n return global_min_diff_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - 
M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n return closest_index;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n return min_idx;\n}\n", + "\n\n size_t n = x.size();\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n for (size_t i = 1; i < n; ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t n = x.size();\n size_t closest = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < n; i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest = i;\n }\n }\n return closest;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_index = i;\n }\n }\n int result;\n MPI_Reduce(&min_index, &result, 1, MPI_INT, MPI_MINLOC, 0, MPI_COMM_WORLD);\n return result;\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Find the index of the value in the vector that is closest to PI on each rank\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n // Reduce the closest indices to find the global closest index\n size_t globalClosestIndex;\n MPI_Reduce(&closestIndex, &globalClosestIndex, 1, MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the global closest index on rank 0\n if (rank == 0) {\n return globalClosestIndex;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n return result;\n}\n", + "\n\n double closest = std::numeric_limits::max();\n size_t closest_idx = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_idx = i;\n }\n }\n return closest_idx;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::numeric_limits::max();\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n return result;\n}\n", + "\n\n size_t closest = 0;\n double min_diff = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest = i;\n }\n }\n return closest;\n}\n", + "\n\n double min_diff = std::numeric_limits::max();\n size_t min_idx = 0;\n for (size_t i = 0; i < 
x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_idx = i;\n }\n }\n int global_min_idx;\n MPI_Allreduce(&min_idx, &global_min_idx, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return global_min_idx;\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the closest value to PI on each rank\n double closest = std::numeric_limits::max();\n size_t closest_index = 0;\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest) {\n closest = diff;\n closest_index = i;\n }\n }\n\n // Reduce the closest values to rank 0\n MPI_Reduce(&closest_index, &closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return closest_index;\n } else {\n return 0;\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "cuda", + "prompt": "/* Find the index of the value in the vector x that is closest to the math constant PI. Store the index in closestToPiIndex.\n Use M_PI for the value of PI.\n Use CUDA to search in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\n__global__ void findClosestToPi(const double *x, size_t N, size_t *closestToPiIndex) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + 
"\n\n int tid = threadIdx.x;\n double minDiff = abs(x[tid] - M_PI);\n int minDiffIndex = tid;\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n double diff = abs(x[i] - M_PI);\n if (diff < minDiff) {\n minDiff = diff;\n minDiffIndex = i;\n }\n }\n if (tid == 0) {\n *closestToPiIndex = minDiffIndex;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = fabs(x[i] - M_PI);\n if (i == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0 || diff < fabs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = 0;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0 || diff < minDiff) {\n minDiff = diff;\n *closestToPiIndex = tid;\n }\n }\n}\n", + "\n\n int idx = threadIdx.x;\n if (idx < N) {\n double diff = abs(x[idx] - M_PI);\n if (idx == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = idx;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = abs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = abs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (i == 0 || diff < abs(x[*closestToPiIndex] - M_PI)) {\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n double diff = abs(x[i] - M_PI);\n if (tid == 0 || diff < closestDiff) {\n closestDiff = diff;\n *closestToPiIndex = i;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double diff = fabs(x[tid] - M_PI);\n if (tid == 0) {\n *closestToPiIndex = tid;\n } else {\n double prevDiff = fabs(x[*closestToPiIndex] - M_PI);\n if (diff < prevDiff) {\n *closestToPiIndex = tid;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Use MPI and OpenMP to search in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
Return the result on rank 0.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestDistance = distance;\n closestIndex = i;\n }\n }\n\n int globalClosestIndex;\n MPI_Allreduce(&closestIndex, &globalClosestIndex, 1, MPI_INT, MPI_MINLOC, MPI_COMM_WORLD);\n return globalClosestIndex;\n}\n", + "\n\n // Find the closest value to PI in parallel\n size_t closest_index = 0;\n double min_diff = std::numeric_limits::max();\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest_index = i;\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Allgather(&closest_index, 1, MPI::UNSIGNED_LONG, &closest_indices[0], 1, MPI::UNSIGNED_LONG);\n\n // Find the global closest index\n size_t global_closest_index = 0;\n min_diff = std::numeric_limits::max();\n for (size_t i = 0; i < closest_indices.size(); ++i) {\n double diff = std::abs(x[closest_indices[i]] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n global_closest_index = closest_indices[i];\n }\n }\n\n return global_closest_index;\n}\n", + "\n\n // Find the closest value to PI on each rank\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_index = i;\n closest_distance = distance;\n }\n }\n\n // Reduce the closest index to rank 0\n size_t global_closest_index;\n MPI_Reduce(&closest_index, &global_closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n\n // Return the global closest index on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n size_t closest_idx = 0;\n double closest_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < closest_diff) {\n closest_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n double min_diff = std::numeric_limits::max();\n int min_diff_index = -1;\n\n #pragma omp parallel for reduction(min : min_diff) reduction(min : min_diff_index)\n for (int i = 0; i < x.size(); i++) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n min_diff_index = i;\n }\n }\n\n return min_diff_index;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_distance)\n for (size_t i = 0; i < x.size(); ++i) {\n double distance 
= std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t result = 0;\n double min_diff = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min: min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n result = i;\n }\n }\n\n return result;\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the index of the value in the vector x that is closest to the math constant PI.\n size_t index = 0;\n double min_diff = std::numeric_limits::max();\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n index = i;\n }\n }\n\n // Return the result on rank 0.\n if (rank == 0) {\n return index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n // Get the number of ranks\n int num_ranks;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Find the closest value to PI on this rank\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_index = i;\n closest_distance = distance;\n }\n }\n\n // Reduce the closest indices and distances to find the overall closest value\n size_t global_closest_index;\n double global_closest_distance;\n MPI_Reduce(&closest_index, &global_closest_index, 1, MPI_UNSIGNED_LONG, MPI_MINLOC, 0, MPI_COMM_WORLD);\n MPI_Reduce(&closest_distance, &global_closest_distance, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);\n\n // Return the result on rank 0\n if (rank == 0) {\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for reduction(min:closest_distance)\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double minDiff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < minDiff) {\n 
#pragma omp critical\n if (diff < minDiff) {\n closestIndex = i;\n minDiff = diff;\n }\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t closest_idx = 0;\n double min_diff = std::numeric_limits::max();\n\n #pragma omp parallel for reduction(min:min_diff)\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n min_diff = diff;\n closest_idx = i;\n }\n }\n\n return closest_idx;\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Initialize the index of the closest value to -1\n size_t closest_index = -1;\n\n // Initialize the minimum distance to a large value\n double min_distance = std::numeric_limits::max();\n\n // Loop over the elements of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n // Calculate the distance between the current element and PI\n double distance = std::abs(x[i] - M_PI);\n\n // Update the closest index and minimum distance if the current distance is smaller\n #pragma omp critical\n {\n if (distance < min_distance) {\n closest_index = i;\n min_distance = distance;\n }\n }\n }\n\n // Gather the closest indices from all ranks\n std::vector closest_indices(num_ranks);\n MPI_Gather(&closest_index, 1, MPI_UNSIGNED_LONG, closest_indices.data(), 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Find the index with the smallest distance\n if (rank == 0) {\n size_t global_closest_index = -1;\n double global_min_distance = std::numeric_limits::max();\n for (size_t i = 0; i < num_ranks; ++i) {\n if (closest_indices[i] != -1 && x[closest_indices[i]] < global_min_distance) {\n global_closest_index = closest_indices[i];\n global_min_distance = x[closest_indices[i]];\n }\n }\n return global_closest_index;\n } else {\n return -1;\n }\n}\n", + "\n\n size_t closest_index = 0;\n double closest_distance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closest_distance) {\n closest_distance = distance;\n closest_index = i;\n }\n }\n\n return closest_index;\n}\n", + "\n\n size_t closestIndex = 0;\n double closestDistance = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 1; i < x.size(); i++) {\n double distance = std::abs(x[i] - M_PI);\n if (distance < closestDistance) {\n closestIndex = i;\n closestDistance = distance;\n }\n }\n\n return closestIndex;\n}\n", + "\n\n size_t closest_idx = 0;\n double min_diff = std::abs(x[0] - M_PI);\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n if (diff < min_diff) {\n #pragma omp critical\n {\n if (diff < min_diff) {\n closest_idx = i;\n min_diff = diff;\n }\n }\n }\n }\n\n return closest_idx;\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(Kokkos::View*> const& x, Kokkos::View &r, Kokkos::View &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n // Compute the Fourier transform of x\n auto y = fft_transform(x);\n\n // Extract the real and imaginary parts of the result\n r = real(y);\n i = imag(y);\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& n) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double angle = 2 * M_PI * k * n / x.extent(0);\n real += x(k).real() * cos(angle) + x(k).imag() * sin(angle);\n imag += x(k).imag() * cos(angle) - x(k).real() * sin(angle);\n }\n r(n) = real;\n i(n) = imag;\n });\n}\n", + "\n\n // Compute the FFT of x\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2.0 * M_PI * j * i / x.extent(0);\n sum += x(j) * std::exp(-std::complex(0.0, angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n real += x(j).real() * cos(angle) - x(j).imag() * sin(angle);\n imag += x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA (const int& n) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double angle = 2 * M_PI * k * n / x.extent(0);\n real += x(k).real() * cos(angle) - x(k).imag() * sin(angle);\n imag += x(k).real() * sin(angle) + x(k).imag() * cos(angle);\n }\n r(n) = real;\n i(n) = imag;\n });\n}\n", + "\n\n // Compute the DFT of x\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * i * j / x.extent(0);\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& n) {\n double re = 0.0;\n double im = 0.0;\n for (int k = 0; k < x.extent(0); ++k) {\n double theta = 2 * M_PI * k * n / x.extent(0);\n re += x(k).real() * cos(theta) + x(k).imag() * sin(theta);\n im += x(k).real() * sin(theta) - x(k).imag() * cos(theta);\n }\n r(n) = re;\n i(n) = im;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double re = 0.0;\n double im = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n re += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n im += x(j).real() * 
sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = re;\n i(i) = im;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += -x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n\n const int n = x.extent(0);\n const int n_half = n / 2;\n\n // Create a plan for the FFT\n FFTPlan plan(n);\n\n // Create a view of the input and output arrays\n View x_real = create_view(x, 0);\n View x_imag = create_view(x, 1);\n View r_view = create_view(r);\n View i_view = create_view(i);\n\n // Execute the FFT\n plan.execute(x_real, x_imag, r_view, i_view);\n\n // Normalize the output\n r_view /= n;\n i_view /= n;\n}\n", + "\nKokkos::parallel_for(Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA (const int i) {\n r(i) = 0;\n i(i) = 0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n r(i) += x(j) * cos(angle);\n i(i) -= x(j) * sin(angle);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += -x(j).real() * sin(angle) + x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n\n // Create a plan for the FFT\n auto plan = FFTPlan(x.extent(0), true);\n\n // Create a workspace for the FFT\n auto workspace = plan.get_workspace();\n\n // Execute the FFT\n plan.execute(x, workspace);\n\n // Extract the real and imaginary parts of the results\n r = plan.get_output_real();\n i = plan.get_output_imag();\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n // Create a team policy for parallel execution\n TeamPolicy policy(x.extent(0));\n\n // Define the kernel lambda to compute the FFT\n KernelFunctor kernel;\n kernel.template run(policy, [=](const int& i) {\n 
std::complex sum = 0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sum += x(j) * std::exp(-std::complex(0, angle));\n }\n r(i) = sum.real();\n i(i) = sum.imag();\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int& i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < x.extent(0); ++j) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = real;\n i(i) = imag;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fft\", x.extent(0), KOKKOS_LAMBDA (const int& i) {\n double sumr = 0.0;\n double sumi = 0.0;\n for (int j = 0; j < x.extent(0); j++) {\n double angle = 2 * M_PI * j * i / x.extent(0);\n sumr += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n sumi += x(j).real() * sin(angle) - x(j).imag() * cos(angle);\n }\n r(i) = sumr;\n i(i) = sumi;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) - x[t].imag() * sin(angle);\n sumi += x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int N = x.size();\n r.resize(N);\n i.resize(N);\n\n #pragma omp parallel for\n for (int k = 0; k < N; ++k) {\n std::complex sum = 0;\n for (int n = 0; n < N; ++n) {\n double angle = 2 * M_PI * n * k / N;\n sum += x[n] * std::polar(1.0, -angle);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const size_t n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (size_t j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const size_t n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute the fourier transform of x.\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (size_t j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = 
sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::exp(-std::complex(0, 2 * M_PI * k * t / n));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n int num_threads = omp_get_max_threads();\n int chunk_size = n / num_threads;\n\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n int start = tid * chunk_size;\n int end = start + chunk_size;\n if (tid == num_threads - 1) {\n end = n;\n }\n\n for (int j = start; j < end; j++) {\n double real = 0.0;\n double imag = 0.0;\n for (int k = 0; k < n; k++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[k].real() * cos(angle) + x[k].imag() * sin(angle);\n imag += -x[k].real() * sin(angle) + x[k].imag() * cos(angle);\n }\n r[j] = real;\n i[j] = imag;\n }\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n std::complex omega = std::polar(1.0, -2 * M_PI * t * k / n);\n sum += x[t] * omega;\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, -angle);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumi += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute the DFT of x\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n const int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::exp(-2 * M_PI * std::complex(0, 1) * k * t / n);\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::exp(std::complex(0, -angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n 
}\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n", + "\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::exp(-std::complex(0, angle));\n }\n r[k] = sum.real();\n i[k] = sum.imag();\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), 
std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumr += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumi += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for 
(int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sumr += x[j].real() * cos(angle) + x[j].imag() * sin(angle);\n sumi += -x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumreal = 0, sumimag = 0;\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sumreal += x[t].real() * cos(angle) + x[t].imag() * sin(angle);\n sumimag += -x[t].real() * sin(angle) + x[t].imag() * cos(angle);\n }\n r[k] = sumreal;\n i[k] = sumimag;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int k = 0; k < n / 2; k++) {\n even[k] = x[2 * k];\n odd[k] = x[2 * k + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int k = 0; k < n / 2; k++) {\n std::complex t = w * odd[k];\n r[k] = r[k] + t.real();\n i[k] = i[k] + t.imag();\n r[k + n / 2] = r[k] - t.real();\n i[k + n / 2] = i[k] - t.imag();\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w 
*= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 0) {\n return;\n }\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2);\n std::vector> odd(n / 2);\n for (int k = 0; k < n / 2; k++) {\n even[k] = x[2 * k];\n odd[k] = x[2 * k + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double pi = std::acos(-1);\n for (int k = 0; k < n / 2; k++) {\n double t = r[k];\n r[k] = t + std::cos(2 * pi * k / n) * r[k + n / 2] - std::sin(2 * pi * k / n) * i[k + n / 2];\n i[k] = t - std::cos(2 * pi * k / n) * r[k + n / 2] + std::sin(2 * pi * k / n) * i[k + n / 2];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 0) {\n return;\n }\n r.resize(n);\n i.resize(n);\n for (int k = 0; k < n; k++) {\n double real = 0;\n double imag = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[j].real() * cos(angle) - x[j].imag() * sin(angle);\n imag += x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = real;\n i[k] = imag;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "fft, 4,", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double real = 0;\n double imag = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n real += x[j].real() * cos(angle) - x[j].imag() * sin(angle);\n imag += x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r.push_back(real);\n i.push_back(imag);\n }\n}\n", + "source_write_success": true, + "did_build": true, 
+ "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) {\n r[0] = x[0].real();\n i[0] = x[0].imag();\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int j = 0; j < n / 2; j++) {\n even[j] = x[2 * j];\n odd[j] = x[2 * j + 1];\n }\n fft(even, r, i);\n fft(odd, r, i);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int j = 0; j < n / 2; j++) {\n std::complex u = r[j] + w * i[j];\n std::complex v = r[j] - w * i[j];\n r[j] = u.real();\n i[j] = u.imag();\n r[j + n / 2] = v.real();\n i[j + n / 2] = v.imag();\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n double sumr = 0, sumi = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sumr += x[j].real() * cos(angle) + x[j].imag() * sin(angle);\n sumi += -x[j].real() * sin(angle) + x[j].imag() * cos(angle);\n }\n r[k] = sumr;\n i[k] = sumi;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::vector> X(n);\n for (int k = 0; k < n; ++k) {\n for (int j = 0; j < n; ++j) {\n X[k] += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n }\n for (int i = 0; i < n; ++i) {\n r[i] = X[i].real();\n i[i] = X[i].imag();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\n__global__ void fft(const hipDoubleComplex *x, double *r, double *i, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n hipDoubleComplex sum = {0, 0};\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n if (i < N) {\n hipDoubleComplex sum = hipCreateDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex rot = hipCreateDoubleComplex(cos(angle), sin(angle));\n sum = hipCadd(sum, hipCmul(x[j], rot));\n }\n r[i] = hipCreal(sum);\n i[i] = hipCimag(sum);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; k++) {\n double arg = -2 * M_PI * k * gid / N;\n hipDoubleComplex w = {cos(arg), sin(arg)};\n sum += x[k] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n}\n", + "fft:", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[i] = sum.x;\n i[i] = sum.y;\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n size_t N2 = N / 2;\n if (tid < N2) {\n r[tid] = x[tid].x + x[tid + N2].x;\n i[tid] = x[tid].y + x[tid + N2].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * gid / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[j] * w;\n }\n r[i] = sum.x;\n i[i] = sum.y;\n}\n", + "\n\n int id = blockIdx.x * blockDim.x + threadIdx.x;\n if (id < N) {\n r[id] = x[id].real();\n i[id] = x[id].imag();\n }\n __syncbarriers();\n for (size_t n = 2; n <= N; n *= 2) {\n double a = 2 * M_PI / n;\n for (size_t k = 0; k < N; k += n) {\n double c = cos(a * k);\n double s = sin(a * k);\n for (size_t j = 0; j < n / 2; j++) {\n size_t p = k + j;\n size_t q = p + n / 2;\n double t = r[q] * c - i[q] * s;\n i[q] = r[q] * s + i[q] * c;\n r[q] = r[p] - t;\n i[p] = i[p] - t;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * i / N;\n hipDoubleComplex w = hipMakeDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = 
hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t i = tid + bid * hipBlockDim_x;\n if (i < N) {\n r[i] = x[i].x;\n i[i] = x[i].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n r[gid] = x[gid].x;\n i[gid] = x[gid].y;\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * gid * j / N});\n sum += x[j] * w;\n }\n r[gid] = hipCreal(sum);\n i[gid] = hipCimag(sum);\n }\n}\n", + "\n\n size_t idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx >= N) return;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * idx * k / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[k] * w;\n }\n r[idx] = sum.x;\n i[idx] = sum.y;\n}\n", + "\n\n size_t k = blockIdx.x * blockDim.x + threadIdx.x;\n if (k < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0, -2 * M_PI * j * k / N));\n sum += x[j] * w;\n }\n r[k] = hipCreal(sum);\n i[k] = hipCimag(sum);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x;\n size_t bid = hipBlockIdx_x;\n size_t gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n hipDoubleComplex sum = 0;\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx >= N) return;\n hipDoubleComplex sum = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * idx * k / N;\n hipDoubleComplex w = hipMakeDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[idx] = creal(sum);\n i[idx] = cimag(sum);\n}\n", + "\n\n // Compute the fourier transform of x.\n hipDoubleComplex *X = hipMalloc(N);\n hipMemcpy(X, x, N * sizeof(hipDoubleComplex), hipMemcpyHostToDevice);\n hipfftPlan plan;\n hipfftPlanCreate(&plan, N, HIPFFT_C2C, HIPFFT_FORWARD);\n hipfftExecC2C(plan, X, X, HIPFFT_FORWARD);\n hipfftDestroyPlan(plan);\n hipMemcpy(r, X, N * sizeof(double), hipMemcpyDeviceToHost);\n hipMemcpy(i, X + 1, N * sizeof(double), hipMemcpyDeviceToHost);\n hipFree(X);\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n int stride = hipBlockDim_x * hipGridDim_x;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * k * gid / N;\n hipDoubleComplex w = {cos(angle), sin(angle)};\n sum += x[k] * w;\n }\n r[gid] = sum.x;\n i[gid] = sum.y;\n}\n", + "fft," + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y);\n\n // Gather the results on rank 0\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT of z on rank 0\n if (rank == 0) {\n fft_local(z, y);\n }\n\n // Scatter the results to all ranks\n MPI_Scatter(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the results\n for (int j = 0; j < n; j++) {\n r[j] = z[j].real();\n i[j] = z[j].imag();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n y[k] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Send the DFT of x to rank 0\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &z[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse DFT of z\n if (rank == 0) {\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n x[k] += z[j] * std::exp(2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n x[k] /= n;\n r[k] = x[k].real();\n i[k] = x[k].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the global result\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n 
i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector r_local(n), i_local(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y, r_local, i_local);\n\n // Gather the results on rank 0\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < n; i++) {\n r[i] = r_local[i];\n i[i] = i_local[i];\n }\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n), i_recv(n);\n MPI_Recv(r_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n r[j] += r_recv[j];\n i[j] += i_recv[j];\n }\n }\n } else {\n MPI_Send(r_local.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i_local.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_fft(n);\n fft_local(x, local_fft);\n\n // Gather local FFTs\n std::vector> global_fft(n * size);\n MPI_Gather(local_fft.data(), n, MPI_DOUBLE, global_fft.data(), n, MPI_DOUBLE, 0);\n\n // Compute global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter global FFT\n MPI_Scatter(global_fft.data(), n, MPI_DOUBLE, local_fft.data(), n, MPI_DOUBLE, 0);\n\n // Extract real and imaginary parts\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = local_fft[j].real();\n i[j] = local_fft[j].imag();\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n);\n std::vector> local_r(local_n), local_i(local_n);\n fft_recursive(local_x, local_r, local_i);\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].real();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n std::vector> Z(n);\n\n for (int j = 0; j < n; j++) {\n X[j] = x[j];\n }\n\n MPI_Alltoall(X.data(), 1, MPI_DOUBLE_INT, Y.data(), 1, MPI_DOUBLE_INT, MPI_COMM_WORLD);\n\n for (int j = 0; j < n; 
j++) {\n Z[j] = Y[j] * std::exp(-2.0 * M_PI * std::complex(0.0, 1.0) * j / n);\n }\n\n MPI_Alltoall(Z.data(), 1, MPI_DOUBLE_INT, Y.data(), 1, MPI_DOUBLE_INT, MPI_COMM_WORLD);\n\n for (int j = 0; j < n; j++) {\n std::complex sum = std::complex(0.0, 0.0);\n for (int k = 0; k < n; k++) {\n sum += Y[k] * std::exp(2.0 * M_PI * std::complex(0.0, 1.0) * j * k / n);\n }\n X[j] = sum;\n }\n\n for (int j = 0; j < n; j++) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n if (rank == 0) {\n for (int j = 1; j < size; j++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Status status;\n MPI_Request request;\n MPI_Irecv(r_recv.data(), n, MPI_DOUBLE, j, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n MPI_Irecv(i_recv.data(), n, MPI_DOUBLE, j, 0, MPI_COMM_WORLD, &request);\n MPI_Wait(&request, &status);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_impl(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> r_all(x.size());\n std::vector> i_all(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_all.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_all.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r = std::vector(r_all.size());\n i = std::vector(i_all.size());\n for (size_t j = 0; j < r_all.size(); ++j) {\n r[j] = r_all[j].real();\n i[j] = r_all[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n\n // Compute the FFT of x on each rank\n fft_local(x, y);\n\n // Gather the results on rank 0\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < size; i++) {\n std::vector> y_i(n);\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, &y_i[0], n, MPI_DOUBLE_COMPLEX, i, MPI_COMM_WORLD);\n for (int j = 0; j < n; j++) {\n r[j] += y_i[j].real();\n i[j] += y_i[j].imag();\n }\n }\n } else {\n MPI_Gather(&y[0], n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); j++) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n if (n != (1 << (int)log2(n))) {\n throw std::invalid_argument(\"x must have a power of 2 length\");\n }\n\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = x[i];\n }\n\n for (int s = 1; s <= (int)log2(n); s++) {\n int m = 1 << s;\n int m2 = m / 2;\n double theta = 2 * M_PI / m;\n std::complex w(cos(theta), sin(theta));\n std::complex wn(1, 0);\n for (int j = 0; j < m2; j++) {\n for (int k = j; k < n; k += m) {\n std::complex t = wn * y[k + m2];\n y[k + m2] = y[k] - t;\n y[k] = y[k] + t;\n }\n wn = wn * w;\n }\n }\n\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n r[i] = y[i].real();\n i[i] = y[i].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank, x.begin() + rank + x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_impl(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_r(n_local);\n std::vector> local_i(n_local);\n\n // Compute local FFT\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(n);\n std::vector> global_i(n);\n MPI_Gather(local_r.data(), n_local, MPI_DOUBLE, global_r.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n_local, MPI_DOUBLE, global_i.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n 
// Compute the local part of the transform\n std::vector> local_r(n), local_i(n);\n for (int j = 0; j < n; ++j) {\n std::complex sum_r(0, 0), sum_i(0, 0);\n for (int k = 0; k < n; ++k) {\n double angle = -2 * M_PI * j * k / n;\n sum_r += std::exp(std::complex(0, angle)) * x[k];\n sum_i += std::exp(std::complex(0, angle)) * x[k] * std::complex(0, 1);\n }\n local_r[j] = sum_r;\n local_i[j] = sum_i;\n }\n\n // Gather the local transforms into a global array\n std::vector> global_r(n), global_i(n);\n MPI_Allgather(local_r.data(), n, MPI_DOUBLE_COMPLEX, global_r.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n MPI_Allgather(local_i.data(), n, MPI_DOUBLE_COMPLEX, global_i.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Extract the real and imaginary parts of the global transform\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * n / size, x.begin() + (rank + 1) * n / size);\n std::vector> local_r(n / size), local_i(n / size);\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(n), i_gathered(n);\n MPI_Gather(local_r.data(), n / size, MPI_DOUBLE, r_gathered.data(), n / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n / size, MPI_DOUBLE, i_gathered.data(), n / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(n);\n i = std::vector(n);\n for (int j = 0; j < n; j++) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n\n // Compute the local Fourier transform\n std::vector> local_x(n);\n std::vector> local_r(n);\n std::vector> local_i(n);\n for (int j = 0; j < n; ++j) {\n local_x[j] = x[j];\n }\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(n * size);\n std::vector> i_gathered(n * size);\n MPI_Gather(local_r.data(), n, MPI_DOUBLE_COMPLEX, r_gathered.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n, MPI_DOUBLE_COMPLEX, i_gathered.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n for (int j = 0; j < n; ++j) {\n r[j] = r_gathered[j].real();\n i[j] = i_gathered[j].real();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Allgather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), local_r.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n MPI_Allgather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), local_i.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].real();\n }\n }\n}\n" + ] + }, + { + 
"problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\n__global__ void fft(const cuDoubleComplex *x, double *r, double *i, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) {\n return;\n }\n double sum_r = 0.0;\n double sum_i = 0.0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * idx / N;\n sum_r += x[k].x * cos(angle) + x[k].y * sin(angle);\n sum_i += -x[k].x * sin(angle) + x[k].y * cos(angle);\n }\n r[idx] = sum_r;\n i[idx] = sum_i;\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t k = blockIdx.x * blockDim.x + threadIdx.x;\n if (k < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * k * j / N;\n cuDoubleComplex rot = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * rot;\n }\n r[k] = creal(sum);\n i[k] = cimag(sum);\n }\n}\n", + "\n\n // Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n // Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n // Example:\n\n // input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n // output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * gid * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[gid] = creal(sum);\n i[gid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = block.thread.x + block.block.x * block.grid.x;\n if (tid >= N) {\n return;\n }\n double re = 0, im = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * tid * k / N;\n re += x[k] * cos(angle);\n im += x[k] * sin(angle);\n }\n r[tid] = re;\n i[tid] = im;\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i0 = tid + bid * blockDim.x;\n if (i0 < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i0 * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n r[i0] = creal(sum);\n i[i0] = cimag(sum);\n }\n}\n", + "\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n int stride = blockDim.x * gridDim.x;\n for (int k = tid; k < N; k += stride) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * k / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(x[j], term));\n }\n r[k] = sum.x;\n i[k] = sum.y;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * k * tid / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx >= N) return;\n\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * idx * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[idx] = creal(sum);\n i[idx] = cimag(sum);\n}\n", + "\n\n size_t tid = 
threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = sum.x;\n i[tid] = sum.y;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n size_t offset = tid % N;\n double sumr = 0, sumi = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * offset / N;\n sumr += real(x[k]) * cos(angle) + imag(x[k]) * sin(angle);\n sumi += -real(x[k]) * sin(angle) + imag(x[k]) * cos(angle);\n }\n r[tid] = sumr;\n i[tid] = sumi;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i = tid + bid * blockDim.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n r[i] = creal(sum);\n i[i] = cimag(sum);\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid >= N) return;\n\n double re = 0, im = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = -2 * M_PI * tid * k / N;\n re += x[k].x * cos(angle) - x[k].y * sin(angle);\n im += x[k].x * sin(angle) + x[k].y * cos(angle);\n }\n r[tid] = re;\n i[tid] = im;\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * tid * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[tid] = creal(sum);\n i[tid] = cimag(sum);\n }\n}\n", + "\n size_t tid = threadIdx.x;\n size_t bid = blockIdx.x;\n size_t i0 = tid + bid * blockDim.x;\n if (i0 < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * i0 / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n r[i0] = creal(sum);\n i[i0] = cimag(sum);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the fourier transform of x. Store real part of results in r and imaginary in i.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> x_local(x.begin() + start, x.begin() + end);\n std::vector> y_local(n_local);\n std::vector> y(n);\n\n // Compute the DFT of the local data\n fft_local(x_local, y_local);\n\n // Gather the results from all ranks\n MPI_Allgather(y_local.data(), n_local, MPI_DOUBLE_COMPLEX, y.data(), n_local, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Compute the global DFT\n fft_global(y, n);\n\n // Extract the real and imaginary parts of the result\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = y[j].real();\n i[j] = y[j].imag();\n }\n\n // Send the result to rank 0\n if (rank != 0) {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n // Receive the results from all ranks\n for (int i = 1; i < size; i++) {\n MPI_Recv(r.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_fft(n);\n fftw_plan plan = fftw_plan_dft_1d(n, reinterpret_cast(&x[0]), reinterpret_cast(&local_fft[0]), FFTW_FORWARD, FFTW_ESTIMATE);\n fftw_execute(plan);\n fftw_destroy_plan(plan);\n\n // Gather local FFT results\n std::vector> global_fft(n * size);\n MPI_Gather(local_fft.data(), n, MPI_DOUBLE, global_fft.data(), n, MPI_DOUBLE, 0);\n\n // Compute global FFT\n fftw_plan plan2 = fftw_plan_dft_1d(n * size, reinterpret_cast(&global_fft[0]), reinterpret_cast(&global_fft[0]), FFTW_BACKWARD, FFTW_ESTIMATE);\n fftw_execute(plan2);\n fftw_destroy_plan(plan2);\n\n // Extract real and imaginary parts\n r.resize(n);\n i.resize(n);\n for (int j = 0; j < n; j++) {\n r[j] = real(global_fft[j] / n);\n i[j] = imag(global_fft[j] / n);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(local_n);\n std::vector> local_r(local_n);\n std::vector> local_i(local_n);\n std::copy(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n, local_x.begin());\n fft_serial(local_x, local_r, local_i);\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int n_local_padded = n_local * 2;\n\n std::vector> x_local(n_local);\n std::vector> x_local_padded(n_local_padded);\n std::vector> y_local(n_local_padded);\n std::vector> y(n);\n\n MPI_Scatter(x.data(), n_local, MPI_DOUBLE_COMPLEX, x_local.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Pad with zeros\n for (int i = 0; i < n_local; i++) {\n x_local_padded[i] = x_local[i];\n x_local_padded[i + n_local] = 0;\n }\n\n // Compute FFT in parallel\n fft_recursive(x_local_padded, y_local);\n\n // Gather results\n MPI_Gather(y_local.data(), n_local_padded, MPI_DOUBLE_COMPLEX, y.data(), n_local_padded, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(n);\n i.resize(n);\n for (int i = 0; i < n; i++) {\n r[i] = y[i].real();\n i[i] = y[i].imag();\n }\n }\n}\n", + "\n\n // Compute the FFT of x.\n std::vector> X(x);\n fft_mpi(X);\n\n // Store the real and imaginary parts of the FFT in r and i.\n r.resize(X.size());\n i.resize(X.size());\n for (size_t j = 0; j < X.size(); ++j) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n // Gather the results on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector r_all(X.size() * MPI::COMM_WORLD.Get_size());\n std::vector i_all(X.size() * MPI::COMM_WORLD.Get_size());\n MPI::COMM_WORLD.Gather(&r[0], X.size(), MPI::DOUBLE, &r_all[0], X.size(), MPI::DOUBLE, 0);\n MPI::COMM_WORLD.Gather(&i[0], X.size(), MPI::DOUBLE, &i_all[0], X.size(), MPI::DOUBLE, 0);\n r = r_all;\n i = i_all;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_all(x.size());\n std::vector> 
i_all(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_all.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_all.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); j++) {\n r[j] = r_all[j].real();\n i[j] = r_all[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n std::complex sum = std::complex(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n X[k] = sum;\n }\n\n // Compute the inverse DFT of X\n for (int k = 0; k < n; k++) {\n std::complex sum = std::complex(0, 0);\n for (int j = 0; j < n; j++) {\n sum += X[j] * std::exp(2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n Y[k] = sum / n;\n }\n\n // Store the real and imaginary parts of Y in r and i\n for (int k = 0; k < n; k++) {\n r[k] = Y[k].real();\n i[k] = Y[k].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Recv(&r_recv[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(&i_recv[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(&r[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(&i[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n y[k] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j] += y[k] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n }\n\n // Store the real and imaginary parts of the DFT in r and i\n for (int j = 0; j < n; j++) {\n r[j] = z[j].real();\n i[j] = z[j].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_i(n);\n std::vector i_i(n);\n MPI_Recv(r_i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_i.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n r[j] += r_i[j];\n i[j] += i_i[j];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n r.resize(n);\n i.resize(n);\n\n // Compute local FFTs\n std::vector> local_r(n);\n std::vector> local_i(n);\n fft_local(x, local_r, local_i);\n\n // Gather local FFTs\n std::vector> global_r(n * size);\n std::vector> global_i(n * size);\n MPI_Gather(local_r.data(), n, MPI_DOUBLE_COMPLEX, global_r.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n, MPI_DOUBLE_COMPLEX, global_i.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute global FFT\n if (rank == 0) {\n 
fft_global(global_r, global_i);\n }\n\n // Scatter global FFT\n MPI_Scatter(global_r.data(), n, MPI_DOUBLE_COMPLEX, r.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n MPI_Scatter(global_i.data(), n, MPI_DOUBLE_COMPLEX, i.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather the local results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local partial sums\n std::vector> local_r(n, 0.0);\n std::vector> local_i(n, 0.0);\n for (int k = 0; k < n; k++) {\n for (int j = 0; j < n; j++) {\n double angle = -2 * M_PI * j * k / n;\n local_r[k] += x[j] * std::cos(angle);\n local_i[k] += x[j] * std::sin(angle);\n }\n }\n\n // Reduce partial sums to rank 0\n std::vector> global_r(n, 0.0);\n std::vector> global_i(n, 0.0);\n MPI_Reduce(local_r.data(), n, MPI_DOUBLE, global_r.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Reduce(local_i.data(), n, MPI_DOUBLE, global_i.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts on rank 0\n if (rank == 0) {\n for (int k = 0; k < n; k++) {\n r[k] = global_r[k].real();\n i[k] = global_i[k].imag();\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> X(n);\n std::vector> Y(n);\n\n // Compute the FFT of x\n #pragma opm parallel for\n for (int j = 0; j < n; j++) {\n X[j] = x[j];\n }\n fft(X);\n\n // Compute the real and imaginary parts of the FFT\n #pragma opm parallel for\n for (int j = 0; j < n; j++) {\n r[j] = X[j].real();\n i[j] = X[j].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int j = 1; j < size; j++) {\n std::vector r_j(n);\n std::vector i_j(n);\n MPI_Gather(&r[0], n, MPI_DOUBLE, &r_j[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(&i[0], n, MPI_DOUBLE, &i_j[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (int k = 0; k < n; k++) {\n r[k] += r_j[k];\n i[k] += i_j[k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> global_r(x.size());\n std::vector> global_i(x.size());\n MPI_Allgather(local_r.data(), local_r.size(), MPI_DOUBLE, global_r.data(), global_r.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n 
MPI_Allgather(local_i.data(), local_i.size(), MPI_DOUBLE, global_i.data(), global_i.size(), MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = global_r[j].real();\n i[j] = global_i[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_r(n_local);\n std::vector> local_i(n_local);\n\n // Compute the local FFT\n fft_local(local_x, local_r, local_i);\n\n // Gather the results\n std::vector> r_gathered(n);\n std::vector> i_gathered(n);\n MPI_Gather(local_r.data(), n_local, MPI_DOUBLE, r_gathered.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), n_local, MPI_DOUBLE, i_gathered.data(), n_local, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store the results on rank 0\n if (rank == 0) {\n r = std::vector(n);\n i = std::vector(n);\n for (int j = 0; j < n; j++) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_local(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Store results on rank 0\n if (rank == 0) {\n r = std::vector(r_gathered.size());\n i = std::vector(i_gathered.size());\n for (size_t j = 0; j < r_gathered.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(local_n);\n std::vector> local_r(local_n);\n std::vector> local_i(local_n);\n\n std::copy(x.begin() + local_start, x.begin() + local_end, local_x.begin());\n\n #pragma omp parallel for\n for (int j = 0; j < local_n; j++) {\n double sum_r = 0.0;\n double sum_i = 0.0;\n for (int k = 0; k < n; k++) {\n double theta = 2 * M_PI * k / n;\n sum_r += x[k] * cos(theta * j);\n sum_i += x[k] * sin(theta * j);\n }\n local_r[j] = sum_r;\n local_i[j] = sum_i;\n }\n\n MPI_Gather(local_r.data(), local_n, MPI_DOUBLE, r.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_n, MPI_DOUBLE, i.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local FFT\n std::vector> local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::vector> local_r(local_x.size());\n std::vector> local_i(local_x.size());\n fft_serial(local_x, local_r, local_i);\n\n // Gather results\n std::vector> r_gathered(x.size());\n std::vector> 
i_gathered(x.size());\n MPI_Gather(local_r.data(), local_r.size(), MPI_DOUBLE, r_gathered.data(), local_r.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Gather(local_i.data(), local_i.size(), MPI_DOUBLE, i_gathered.data(), local_i.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Extract real and imaginary parts\n if (rank == 0) {\n r.resize(x.size());\n i.resize(x.size());\n for (size_t j = 0; j < x.size(); ++j) {\n r[j] = r_gathered[j].real();\n i[j] = r_gathered[j].imag();\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the DFT of x\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n y[k] = sum;\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j * n + k] = y[k * n + j];\n }\n }\n\n // Compute the DFT of z\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += z[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n y[k] = sum;\n }\n\n // Transpose y\n for (int j = 0; j < n; j++) {\n for (int k = 0; k < n; k++) {\n z[j * n + k] = y[k * n + j];\n }\n }\n\n // Store the real and imaginary parts of the DFT in r and i\n for (int k = 0; k < n; k++) {\n r[k] = z[k].real();\n i[k] = z[k].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector r_recv(n);\n std::vector i_recv(n);\n MPI_Recv(r_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n MPI_Recv(i_recv.data(), n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int k = 0; k < n; k++) {\n r[k] += r_recv[k];\n i[k] += i_recv[k];\n }\n }\n } else {\n MPI_Send(r.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n MPI_Send(i.data(), n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const size_t n = x.extent(0);\n const size_t n_half = n / 2;\n\n // Compute the FFT of the first half of x.\n auto x_half = x.slice(0, n_half);\n fft(x_half);\n\n // Compute the FFT of the second half of x.\n auto x_half_plus = x.slice(n_half, n_half);\n fft(x_half_plus);\n\n // Compute the imaginary conjugate of the second half of x.\n for (size_t i = 0; i < n_half; ++i) {\n x_half_plus(i) = conj(x_half_plus(i));\n }\n\n // Swap the first and second halves of x.\n auto temp = x_half;\n x_half = x_half_plus;\n x_half_plus = temp;\n\n // Compute the inverse FFT of x.\n fft(x, Inverse);\n\n // Normalize the FFT by dividing by n.\n for (size_t i = 0; i < n; ++i) {\n x(i) /= n;\n }\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(*x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n 
Kokkos::parallel_for(\"fftConjugate\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const size_t n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n std::vector> w(n);\n std::vector> u(n);\n std::vector> t(n);\n\n#pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n y[i] = x[i];\n }\n\n for (size_t i = 0; i < n; i++) {\n for (size_t j = 0; j < n; j++) {\n w[j] = std::exp(std::complex(0, -2 * M_PI * i * j / n));\n }\n for (size_t j = 0; j < n; j++) {\n z[j] = y[j] * w[j];\n }\n for (size_t j = 0; j < n; j++) {\n u[j] = z[j];\n }\n for (size_t j = 0; j < n / 2; j++) {\n t[j] = u[j] + std::conj(u[n - j - 1]);\n t[n - j - 1] = std::complex(0, 1) * (u[j] - std::conj(u[n - j - 1]));\n }\n for (size_t j = 0; j < n; j++) {\n y[j] = t[j];\n }\n }\n\n#pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n x[i] = y[i];\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // Compute the fourier transform of x in-place.\n fft(x);\n\n // Compute the imaginary conjugate of each value.\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n int size = 1 << log_n;\n std::vector> y(size);\n for (int i = 0; i < n; i++) {\n y[i] = x[i];\n }\n for (int i = n; i < size; i++) {\n y[i] = 0;\n }\n std::vector> roots(size / 2);\n for (int i = 0; i < size / 2; i++) {\n double angle = 2 * M_PI * i / size;\n roots[i] = std::complex(cos(angle), sin(angle));\n }\n for (int i = 0; i < log_n; i++) {\n int step = 1 << i;\n int half_step = step / 2;\n for (int j = 0; j < size; j += step) {\n for (int k = 0; k < half_step; k++) {\n std::complex u = y[j + k];\n std::complex v = y[j + k + half_step] * roots[size / (2 * step) * k];\n y[j + k] = u + v;\n y[j + k + half_step] = u - v;\n }\n }\n }\n for (int i = 0; i < size; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int numThreads = omp_get_max_threads();\n const int chunkSize = n / numThreads;\n\n #pragma omp parallel\n {\n #pragma omp for schedule(static, chunkSize)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n }\n}\n", + "\n\n const int n = x.size();\n const int lg = __lg(n);\n int i0 = 0;\n for (int l = 2; l <= n; l <<= 1) {\n for (int i = 0; i < n; i += l) {\n for (int j = 0; j < l / 2; j++) {\n std::complex u = x[i0 + j];\n std::complex v = x[i0 + j + l / 2];\n x[i0 + j] = u + v;\n x[i0 + j + l / 2] = std::complex(u.real() - v.real(), -v.imag());\n }\n i0 += l;\n }\n i0 = 0;\n }\n for (int i = 0; i < n; i++) {\n int j = __builtin_reverse(i, lg);\n if (i < j) {\n std::swap(x[i], x[j]);\n }\n }\n}\n", + "\n\n int n = x.size();\n std::vector> y(n);\n for (int i 
= 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n fft(y);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n // Compute the Fourier transform of x in-place.\n fft(x);\n\n // Compute the imaginary conjugate of each value.\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x\n fft(x, y);\n\n // Compute the conjugate of each value in y\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n\n // Copy the conjugate values back into x\n x = z;\n}\n", + "\n\n const int n = x.size();\n if (n == 1) return;\n std::vector> even(n / 2), odd(n / 2);\n#pragma omp parallel for\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int n_threads = omp_get_max_threads();\n const int n_threads_per_group = 4;\n const int n_groups = n_threads / n_threads_per_group;\n\n std::vector> y(n);\n std::vector> z(n);\n\n #pragma omp parallel num_threads(n_threads)\n {\n const int tid = omp_get_thread_num();\n const int group_id = tid / n_threads_per_group;\n const int local_id = tid % n_threads_per_group;\n\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n y[rev] = x[i];\n }\n\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const double omega = 2 * M_PI / m;\n const std::complex w(cos(omega), sin(omega));\n\n #pragma omp for schedule(static)\n for (int k = 0; k < n; k += m) {\n std::complex wn(1, 0);\n for (int j = 0; j < m / 2; j++) {\n const std::complex t1 = y[k + j];\n const std::complex t2 = wn * y[k + j + m / 2];\n y[k + j] = t1 + t2;\n y[k + j + m / 2] = t1 - t2;\n wn *= w;\n }\n }\n }\n\n #pragma omp for schedule(static)\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n }\n\n x = z;\n}\n", + "\n\n const int n = x.size();\n const int n_half = n / 2;\n const double pi = 3.14159265358979323846;\n\n // Reverse-bit permutation\n for (int i = 0; i < n; i++) {\n int j = 0;\n for (int k = 0; k < n_half; k++) {\n if (i & (1 << k)) {\n j |= (1 << (n_half - k - 1));\n }\n }\n if (i < j) {\n std::swap(x[i], x[j]);\n }\n }\n\n // Compute the FFT in-place\n for (int s = 1; s <= n_half; s *= 2) {\n const double theta = pi / s;\n const std::complex wn(std::cos(theta), std::sin(theta));\n for (int i = 0; i < n; i += 2 * s) {\n std::complex w(1, 0);\n for (int j = 0; j < s; j++) {\n const std::complex u = x[i + j];\n const std::complex v = x[i + j + s] * w;\n x[i + j] = u + v;\n x[i + j + s] = u - v;\n w *= wn;\n }\n }\n }\n\n // Conjugate the imaginary part of each value\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n std::vector> y(n);\n std::vector> z(n);\n\n // Compute the FFT of x\n fft(x, y);\n\n // Compute the imaginary conjugate of each value in y\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = std::conj(y[i]);\n }\n\n // Copy the values of z back into x\n x = z;\n}\n", + "\n\n const int n = x.size();\n const int num_threads = omp_get_max_threads();\n const int 
chunk_size = n / num_threads;\n\n #pragma omp parallel for schedule(static, chunk_size)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int nthreads = omp_get_max_threads();\n const int chunksize = n / nthreads;\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n if (i < rev) {\n std::swap(x[i], x[rev]);\n }\n }\n\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const double wn = std::polar(1.0, -2.0 * M_PI / m);\n const double w = 1.0;\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int j = 0; j < n; j += m) {\n double wr = w;\n double wi = 0.0;\n for (int k = 0; k < m / 2; k++) {\n const std::complex t = x[j + k + m / 2] * std::complex(wr, wi);\n x[j + k + m / 2] = x[j + k] - t;\n x[j + k] += t;\n const double newwr = wr * w.real() - wi * w.imag();\n const double newwi = wr * w.imag() + wi * w.real();\n wr = newwr;\n wi = newwi;\n }\n }\n }\n\n #pragma omp parallel for schedule(static, chunksize)\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n int n = x.size();\n std::vector> y(n);\n std::vector> w(n);\n for (int i = 0; i < n; i++) {\n w[i] = std::polar(1.0, -2 * M_PI * i / n);\n }\n for (int k = 0; k < n; k++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * w[j * k % n];\n }\n y[k] = sum;\n }\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Take the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int log2n = __builtin_ctz(n);\n const int n_threads = omp_get_max_threads();\n\n // Reverse bits\n #pragma omp parallel for schedule(static)\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log2n; j++) {\n rev |= ((i >> j) & 1) << (log2n - j - 1);\n }\n if (i < rev) {\n std::swap(x[i], x[rev]);\n }\n }\n\n // Compute the FFT\n for (int s = 1; s <= log2n; s++) {\n const int m = 1 << s;\n const int m2 = m >> 1;\n const double theta = 2 * M_PI / m;\n std::complex w(1, 0);\n std::complex wn(std::cos(theta), std::sin(theta));\n\n #pragma omp parallel for schedule(static)\n for (int k = 0; k < n; k += m) {\n for (int j = 0; j < m2; j++) {\n std::complex t = w * x[k + j + m2];\n x[k + j + m2] = x[k + j] - t;\n x[k + j] += t;\n }\n w *= wn;\n }\n }\n\n // Conjugate the imaginary part\n #pragma omp parallel for schedule(static)\n for (int i = 0; i < n; i++) {\n x[i].imag(-x[i].imag());\n }\n}\n", + "\n\n // Compute the FFT in-place\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n const int num_threads = omp_get_max_threads();\n const int chunk_size = n / num_threads;\n\n #pragma omp parallel for schedule(static, chunk_size)\n for (int i = 0; i < n; ++i) {\n x[i] = std::conj(x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x in-place. 
Return the imaginary conjugate of each value.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) return;\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::polar(1.0, ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n x[i][1] = -x[i][1];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex tmp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(tmp);\n }\n}\n", + "source_write_success": 
true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) {\n return;\n }\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n fft(y);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) return;\n std::vector> even = x;\n std::vector> odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fftConjugate(even);\n fftConjugate(odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n x[i] = even[i] + w * odd[i];\n x[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n 
- i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n const size_t n = x.size();\n if (n <= 1) return;\n\n std::vector> even = x[std::slice(0, n / 2, 2)];\n std::vector> odd = x[std::slice(1, n / 2, 2)];\n\n fftConjugate(even);\n fftConjugate(odd);\n\n for (size_t i = 0; i < n / 2; ++i) {\n std::complex t = std::exp(std::complex(0, -2 * M_PI * i / n)) * odd[i];\n x[i] = even[i] + t;\n x[i + n / 2] = even[i] - t;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n for (int i = 0; i < n / 2; i++) {\n std::complex temp = x[i];\n x[i] = std::conj(x[n - i - 1]);\n x[n - i - 1] = std::conj(temp);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (auto &v : x) {\n v = std::conj(v);\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fftConjugate(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[N - 1 - i].x;\n x[i].y = -x[N - 1 - i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = hipThreadIdx.x;\n if (i < size) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i] = hipConjf(x[i]);\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n x[i].x = x[N - 1 - i].x;\n x[i].y = -x[N - 1 - i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); ++i) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector> y_recv(x.size());\n MPI_Recv(y_recv.data(), y_recv.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < y_recv.size(); ++j) {\n y[j] += y_recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT of x in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int log2n = 0;\n while ((1 << log2n) < n) log2n++;\n\n std::vector> y(n);\n for (int i = 0; i < n; i++) y[i] = x[i];\n\n for (int s = 1; s <= log2n; s++) {\n int m = 1 << s;\n int m2 = m / 2;\n double theta = 2 * M_PI / m;\n std::complex w(cos(theta), sin(theta));\n std::complex wn(1, 0);\n for (int j = 0; j < m2; j++) {\n for (int k = j; k < n; k += m) {\n std::complex t = wn * y[k + m2];\n y[k + m2] = y[k] - t;\n y[k] = y[k] + t;\n }\n wn = wn * w;\n }\n }\n\n if (rank == 0) {\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(y[i]);\n }\n }\n}\n", + ".", + "\n\n size_t n = x.size();\n std::transform(x.begin(), x.end(), x.begin(), [](std::compelex z) { return std::compelex(z.re, -z.im); });\n std::sort(x.begin(), x.end(), [](std::compelex a, std::compelex b) { return std::im(a) < std::im(b); });\n std::transform(x.begin(), x.end(), x.begin(), [](std::compelex z) { return std::compelex(z.re, -z.im); });\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs\n 
std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the global FFT\n if (rank == 0) {\n fft(global_fft, x);\n }\n\n // Scatter the global FFT\n MPI_Scatter(x.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the conjugate of the local FFT\n for (size_t i = 0; i < local_fft.size(); i++) {\n local_fft[i] = std::conj(local_fft[i]);\n }\n\n // Gather the conjugate FFTs\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n\n // Compute the inverse global FFT\n if (rank == 0) {\n ifft(global_fft, x);\n }\n\n // Scatter the inverse global FFT\n MPI_Scatter(x.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_fft = fft(x);\n\n // Gather the local Fourier transforms on rank 0\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (auto &value : global_fft) {\n value = std::conj(value);\n }\n }\n\n // Scatter the imaginary conjugate Fourier transforms to all ranks\n MPI_Scatter(global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse Fourier transform\n x = ifft(local_fft);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv(x.size());\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n y[j] += recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of the result\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results from all ranks\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place on each rank\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if 
(rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x, y);\n\n // Conjugate the local FFT\n for (int i = 0; i < n_local; i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the local FFTs\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the result to the input vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in-place on each rank\n std::fft(x.begin(), x.end());\n\n // Conjugate the imaginary part of each value\n for (auto &v : x) {\n v.imag() = -v.imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, y.data(), x.size(), MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n x = std::move(y);\n } else {\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, nullptr, 0, MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft(x);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (auto &value : global_fft) {\n value = std::conj(value);\n }\n }\n\n // Scatter the global FFT back to each rank\n MPI_Scatter(global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the local FFT on rank 0\n if (rank == 0) {\n x = local_fft;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> y_recv(x.size());\n MPI_Recv(y_recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < y.size(); j++) {\n y[j] += y_recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n MPI_FFT(x.data(), 1, x.size(), MPI_COMPLEX, MPI_FORWARD, MPI_COMM_WORLD, y.data(), &y.size());\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n x = y;\n for (int i = 1; i 
< size; i++) {\n std::vector> tmp(x.size());\n MPI_Recv(tmp.data(), tmp.size(), MPI_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += tmp[j];\n }\n }\n } else {\n MPI_Send(y.data(), y.size(), MPI_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n std::vector> y(x.size());\n fft(x, y);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < y.size(); i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv(x.size());\n MPI_Recv(recv.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n y[j] += recv[j];\n }\n }\n } else {\n MPI_Send(y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT of x in-place\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of the local data\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the local data to rank 0\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the gathered data to the output vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft(x);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (int i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the global FFT back to each rank\n MPI_Scatter(global_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = local_fft;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fftConjugate(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[N - i - 1].x;\n x[i].y = -x[N - i - 1].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = -x[i].x;\n }\n}\n", + "\n\n // Compute the FFT\n fft(x, N);\n\n // Compute the imaginary conjugate of each value\n for (size_t i = 0; i < N; i++) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "fft{4,0}", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n x[tid].x = x[tid].x;\n x[tid].y = -x[tid].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = cuConjf(x[i]);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[N - i - 1].x;\n x[i].y = -x[N - i - 1].y;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = make_cuDoubleComplex(creal(x[i]), -cimag(x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i].x = x[i].x;\n x[i].y = -x[i].y;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the FFT of the local data in parallel\n fft(x, start, end);\n\n // Conjugate the imaginary part of the local data\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n x[i].imag() = -x[i].imag();\n }\n\n // Gather the results on rank 0\n MPI_Gather(&x[start], n_local, MPI_DOUBLE_COMPLEX, &x[0], n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + ".4.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x.begin() + rank * n_local, x.begin() + rank * n_local + n_local, y.begin());\n\n // Conjugate the local FFT\n for (int i = 0; i < n_local; i++) {\n y[i] = std::conj(y[i]);\n }\n\n // Gather the local FFTs\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the result to rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel using MPI and OpenMP\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag() = -x[i].imag();\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform local FFT on each rank\n std::vector> localX = x;\n fft(localX);\n\n // Gather all the local FFT results on rank 0\n std::vector> globalX(x.size());\n MPI_Gather(localX.data(), localX.size(), MPI_DOUBLE_COMPLEX, globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value on rank 0\n if (rank == 0) {\n for (auto &value : globalX) {\n value = std::conj(value);\n }\n }\n\n // Scatter the imaginary conjugate results back to each rank\n MPI_Scatter(globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, x.data(), 
x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform local FFT on each rank\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather all the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of each value\n if (rank == 0) {\n for (int i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the imaginary conjugate FFTs back to each rank\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Perform inverse FFT on each rank\n ifft(local_fft, x);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs on rank 0\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the imaginary conjugate of the global FFT\n if (rank == 0) {\n for (size_t i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the imaginary conjugate of the global FFT to all ranks\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT\n ifft(local_fft, x);\n\n // Gather the local FFTs on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n // Compute the local FFT\n std::vector> y(n_local);\n fft(x.begin() + start, x.begin() + end, y.begin());\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel 
for\n for (int i = 0; i < n_local; i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results on rank 0\n std::vector> z(n);\n MPI_Gather(y.data(), n_local, MPI_DOUBLE_COMPLEX, z.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the results to the input vector on rank 0\n if (rank == 0) {\n x = z;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n std::vector> y(n);\n for (int i = 0; i < n; i++) {\n y[i] = std::conj(x[i]);\n }\n MPI_Gather(y.data(), n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> localX = x;\n fft(localX);\n\n // Conjugate the local FFT\n for (auto &v : localX) {\n v = std::conj(v);\n }\n\n // Gather the local FFTs on rank 0\n std::vector> globalX(x.size());\n MPI_Gather(localX.data(), localX.size(), MPI_DOUBLE_COMPLEX, globalX.data(), globalX.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, store the global FFT in x\n if (rank == 0) {\n x = globalX;\n }\n}\n", + "\n\n // Compute the FFT in parallel using MPI and OpenMP\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(&recv, x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI_Send(&x, x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Compute the imaginary conjugate of each value\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(recv.data(), 
x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI_Send(x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT on each rank\n std::vector> y(x.size());\n fft(x, y);\n\n // Conjugate the imaginary part of each value\n #pragma omp parallel for\n for (int i = 0; i < y.size(); i++) {\n y[i].imag(-y[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n x.resize(y.size());\n MPI_Gather(y.data(), y.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(x.size());\n fft(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size() * size);\n MPI_Gather(local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the conjugate of the global FFT\n if (rank == 0) {\n for (size_t i = 0; i < global_fft.size(); i++) {\n global_fft[i] = std::conj(global_fft[i]);\n }\n }\n\n // Scatter the conjugate global FFT\n MPI_Scatter(global_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, local_fft.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the inverse FFT\n ifft(local_fft, x);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the FFT in parallel\n fft(x);\n\n // Conjugate the imaginary parts\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i].imag(-x[i].imag());\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(Kokkos::View*> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x[i] = x[i] / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x[i] = std::conj(x[i]);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / static_cast(x.extent(0));\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA (const int i) {\n x(i) /= x.extent(0);\n });\n fft(x);\n}\n", + "\n\n // compute fourier transform\n fft(x);\n\n // compute inverse fourier transform\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", 
x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n Kokkos::parallel_for(\"ifft\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = std::conj(x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ifft\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = x(i) / x.extent(0);\n });\n fft(x);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use OpenMP to compute in parallel.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n fft(x);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugated input\n fft(x);\n\n // compute the conjugate of the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "-", + "\n\n // Compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated input\n fft(x);\n\n // Conjugate the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Divide the output by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n // conjugate the complex numbers\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute forward FFT\n fft(x);\n\n // conjugate the complex numbers again\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide by the size of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n fft(x);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n #pragma openmp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n fft(x[i]);\n }\n}\n", + "\n\n const int n = x.size();\n fft(x);\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // divide by n to normalize\n const double n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] /= n;\n }\n}\n", + "\n\n // compute the forward FFT\n fft(x);\n\n // compute the inverse FFT\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute the forward FFT again to get the inverse FFT\n fft(x);\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n #pragma omp 
parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n // compute the conjugate of each element\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugated vector\n fft(x);\n\n // divide each element by the size of the vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // compute the conjugate of each element\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n const size_t n = x.size();\n std::vector> y(n);\n std::copy(x.begin(), x.end(), y.begin());\n fft(y);\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n y[i] = std::conj(y[i]) / n;\n }\n std::reverse(y.begin(), y.end());\n fft(y);\n std::copy(y.begin(), y.end(), x.begin());\n}\n", + "\n\n fft(x);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // compute forward fft\n fft(x);\n\n // compute inverse fft\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute forward fft again to get inverse fft\n fft(x);\n}\n", + "\n\n // compute the conjugate of x\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // compute the fft of the conjugate of x\n fft(x);\n\n // compute the conjugate of x again\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // divide each element of x by the size of x\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n}\n", + "\n\n fft(x);\n const double n = x.size();\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n x[i] = std::conj(x[i]) / n;\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "\n\n // Compute the conjugate of the input\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the forward FFT\n fft(x);\n\n // Divide each element by the size of the input\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Compute the conjugate of the output\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "serial", + "prompt": "/* forward declare fft. 
computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n fft(x);\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n for (auto &xi : x) {\n xi /= x.size();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": 
"\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n fft(x);\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v) / x.size();\n }\n std::reverse(x.begin(), x.end());\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n for (auto &xi : x) {\n xi = std::conj(xi) / x.size();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n fft(x);\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n std::reverse(x.begin() + 1, x.end());\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0209267332, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0208424628 + } + ] + }, + { + "generated_output": "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n std::reverse(x.begin() + 1, x.end());\n fft(x);\n for (auto &v : x) {\n v /= x.size();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n fft(x);\n for (auto &v : x) {\n v = std::conj(v);\n }\n fft(x);\n for (auto &v : x) {\n v = std::conj(v) / x.size();\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "hip", + "prompt": "/* Compute the inverse fourier transform of x in-place.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\n__global__ void ifft(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i >= N) return;\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipDoubleComplex(cos(2 * M_PI * i * j / N),\n -sin(2 * M_PI * i * j / N));\n sum += w * x[j];\n }\n x[i] = sum / N;\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n hipDoubleComplex c = {cos(angle), sin(angle)};\n sum = hipCadd(sum, hipCmul(x[j], c));\n }\n x[i] = hipCdiv(sum, N);\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (int j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0.0, 0.0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex c = {cos(angle), sin(angle)};\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n sum += x[j] * hipDoubleComplex(cos(angle), sin(angle));\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i >= N) return;\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex t = {cos(angle), sin(angle)};\n sum += x[j] * t;\n }\n x[i] = sum / N;\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n hipDoubleComplex w = hipDoubleComplex(cos(2 * M_PI * i * j / N),\n sin(2 * M_PI * i * j / N));\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0.0, 0.0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0.0, -2.0 * M_PI * i * j / N));\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 
* M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n hipDoubleComplex t = {cos(angle), sin(angle)};\n sum = hipCadd(sum, hipCmul(x[j], t));\n }\n x[i] = hipCdiv(sum, N);\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex term = hipExp(hipCreateDoubleComplex(0, -2 * M_PI * i * j / N));\n sum = hipAdd(sum, hipMul(x[j], term));\n }\n x[i] = hipDiv(sum, hipCreateDoubleComplex(N, 0));\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex(0, -2 * M_PI * i * j / N));\n sum += x[j] * w;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * j * i / N;\n hipDoubleComplex c = hipDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n size_t j = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (i < N && j < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n double theta = 2 * M_PI * (i * k / N);\n sum.x += x[k].x * cos(theta) + x[k].y * sin(theta);\n sum.y += -x[k].x * sin(theta) + x[k].y * cos(theta);\n }\n sum.x /= N;\n sum.y /= N;\n x[i * N + j] = sum;\n }\n}\n", + "\n\n size_t i = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += w * x[j];\n }\n x[i] = sum / N;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(recv_buffer.data(), x.size(), MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&recv_buf[0], x.size(), 
MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, y.data(), x.size(),\n MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_COMPLEX_DOUBLE, nullptr, 0,\n MPI_COMPLEX_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the FFT\n fft(x);\n\n // compute the inverse FFT\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n\n // normalize the result\n int n = x.size();\n for (auto &xi : x) {\n xi /= n;\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(n);\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, y.data(), n, MPI::DOUBLE, 0);\n x = y;\n } else {\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, y.data(), x.size(),\n MPI::DOUBLE, 0);\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n 
std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= num_ranks;\n }\n\n // gather the results on rank 0\n if (my_rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // compute the inverse fft\n for (auto &xi : x) {\n xi = std::conj(xi);\n }\n fft(x);\n\n // normalize the result\n for (auto &xi : x) {\n xi /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the result on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> result(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, result.data(),\n x.size(), MPI_DOUBLE_COMPLEX, 0);\n x = result;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&recv_buffer[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&x[0], 
x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]) / x.size();\n }\n\n // compute the inverse fft in parallel\n fft(x);\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // compute the fourier transform\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fourier transform\n fft(x);\n\n // normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(n);\n MPI::COMM_WORLD.Gather(x.data(), n, MPI::DOUBLE, y.data(), n, MPI::DOUBLE, 0);\n x = y;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the forward fft\n fft(x);\n\n // conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // compute the inverse fft\n fft(x);\n\n // normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(),\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0,\n MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "cuda", + "prompt": "/* Compute the inverse fourier transform of x in-place.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\n__global__ void ifft(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n sum += x[j] * cuCexp(make_cuDoubleComplex(0, -2 * M_PI * i * j / N));\n }\n x[i] = sum / make_cuDoubleComplex(N, 0);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = cuCdiv(x[i], N);\n x[i] = cuCmulf(x[i], cuCexp(cuCmulf(cuCmul(cuCimagf(x[i]), -2 * M_PI), i / N)));\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= size) {\n return;\n }\n cuDoubleComplex result = cuCexp(cuCmul(cuDoubleComplex(0, -2 * M_PI * idx / size), x[idx]));\n x[idx] = result;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n cuDoubleComplex term = x[j] * cexp(-2 * M_PI * i * j / N);\n sum = cuCadd(sum, term);\n }\n x[i] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= N) return;\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double re = x[i].x;\n double im = x[i].y;\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n x[i].x += re * cos(angle) - im * sin(angle);\n x[i].y += re * 
sin(angle) + im * cos(angle);\n }\n x[i].x /= N;\n x[i].y /= N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n x[i] = x[i] / N;\n x[i] = cuCmul(x[i], make_cuDoubleComplex(1.0, 0.0));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * term;\n }\n x[i] = sum / N;\n }\n}\n", + "_t_ = 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n x[i] = sum / N;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i >= N) return;\n cuDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = {cos(angle), sin(angle)};\n sum += c * x[j];\n }\n x[i] = sum / N;\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex t = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * t;\n }\n x[i] = sum / N;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. 
The final result is stored on rank 0.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform FFT on each rank\n fft(x);\n\n // Gather results from all ranks to rank 0\n std::vector> gathered_x;\n if (rank == 0) {\n gathered_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the final result on rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n // Compute the conjugate of x\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated x\n fft(x);\n\n // Normalize the result\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double factor = 1.0 / size;\n for (size_t i = 0; i < x.size(); i++) {\n x[i] *= factor;\n }\n\n // If rank 0, gather the results from all ranks\n if (rank == 0) {\n std::vector> recv_buffer(x.size() * size);\n MPI_Allgather(x.data(), x.size(), MPI_COMPLEX, recv_buffer.data(), x.size(), MPI_COMPLEX, MPI_COMM_WORLD);\n x = recv_buffer;\n }\n\n // Compute the inverse FFT of the normalized result\n fft(x);\n\n // Conjugate the result again to get the original values\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n}\n", + "\n\n // compute the fourier transform\n fft(x);\n\n // compute the inverse fourier transform\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> y(x.size());\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, y.data(), x.size(), MPI::DOUBLE, 0);\n x = y;\n } else {\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI::DOUBLE, nullptr, 0, MPI::DOUBLE, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide the result among the ranks\n std::vector> recv(x.size() / size);\n MPI_Allgather(&x[rank * x.size() / size], x.size() / size, MPI_COMPLEX, &recv[0], x.size() / size, MPI_COMPLEX, MPI_COMM_WORLD);\n\n // only rank 0 has the complete result\n if (rank == 0) {\n x = recv;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide by the number of ranks to normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buffer(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buffer[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += recv_buffer[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the inverse fourier transform of x in-place.\n // Use MPI and OpenMP to compute in parallel.\n // Assume MPI has already been 
initialized.\n // Every rank has a complete copy of x.\n // The final result is stored on rank 0.\n\n // Get the number of ranks and the rank of the current process.\n int num_procs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local size of the vector.\n int local_size = x.size() / num_procs;\n\n // Compute the local inverse fourier transform.\n fft(x);\n\n // Reduce the local inverse fourier transforms to the root process.\n std::vector> reduced_x(x.size());\n MPI_Reduce(x.data(), reduced_x.data(), x.size(), MPI_COMPLEX, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // If the current process is the root process, store the reduced inverse fourier transform in the output vector.\n if (rank == 0) {\n x = reduced_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform\n fft(x);\n\n // divide by the number of ranks to normalize\n for (auto &xi : x) {\n xi /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), y.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft\n fft(x);\n\n // compute the inverse fft\n std::transform(x.begin(), x.end(), x.begin(), [](std::complex c) { return std::conj(c); });\n fft(x);\n\n // normalize the result\n std::transform(x.begin(), x.end(), x.begin(), [size](std::complex c) { return c / size; });\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the 
fft in-place\n fft(x);\n\n // compute the inverse fft in-place\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the inverse fourier transform in-place\n fft(x);\n\n // divide the result by the number of ranks\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results from all ranks to rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n for (int i = 1; i < size; i++) {\n MPI_Recv(&y[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < x.size(); j++) {\n x[j] += y[j];\n }\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n for (size_t i = 0; i < x.size(); i++) {\n x[i] /= x.size();\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv_buf(x.size());\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, i, 0);\n for (size_t j = 0; j < x.size(); j++) {\n x[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], x.size(), MPI_DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n // Compute the conjugate of x\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the FFT of the conjugated x\n fft(x);\n\n // Normalize the result\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n double factor = 1.0 / size;\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] *= factor;\n }\n\n // If rank 0, gather the results from all ranks\n if (rank == 0) {\n std::vector> recv_buffer(x.size() * size);\n MPI_Allgather(x.data(), x.size(), MPI_COMPLEX, recv_buffer.data(), x.size(), MPI_COMPLEX, MPI_COMM_WORLD);\n x = recv_buffer;\n }\n}\n", + "\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv(x.size() * size);\n MPI::COMM_WORLD.Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, recv.data(), x.size(), MPI_DOUBLE_COMPLEX, 0);\n x = recv;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fft of x\n fft(x);\n\n // compute the inverse fft of x\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) 
{\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the FFT of x\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Compute the inverse FFT of the conjugated result\n fft(x);\n\n // Normalize the result\n int n = x.size();\n for (auto &v : x) {\n v /= n;\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::vector> recv(n);\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n MPI::COMM_WORLD.Recv(&recv[0], n, MPI::DOUBLE_COMPLEX, i, 0);\n for (int j = 0; j < n; ++j) {\n x[j] += recv[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(&x[0], n, MPI::DOUBLE_COMPLEX, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the inverse Fourier transform in-place\n fft(x);\n\n // Conjugate the result\n for (auto &v : x) {\n v = std::conj(v);\n }\n\n // Normalize the result\n for (auto &v : x) {\n v /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, y.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // compute the fourier transform\n fft(x);\n\n // compute the inverse fourier transform\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n fft(x);\n\n // normalize the result\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // gather the results on rank 0\n if (rank == 0) {\n std::vector> y(x.size());\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &y[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = y;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, NULL, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the forward FFT\n fft(x);\n\n // Conjugate the result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] = std::conj(x[i]);\n }\n\n // Compute the inverse FFT\n fft(x);\n\n // Normalize the result\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n x[i] /= size;\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n std::vector> recv_buf(x.size() * size);\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, &recv_buf[0], x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = recv_buf;\n } else {\n MPI_Gather(&x[0], x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(Kokkos::View &x, Kokkos::View*> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * std::exp(Kokkos::complex(0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * Kokkos::Math::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(std::cos(angle), std::sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const double PI = 3.14159265358979323846;\n\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n sum += x(j) * std::exp(-2 * PI * std::complex(0, 1) * i * j / N);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = -2 * Kokkos::Math::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(std::cos(angle), std::sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const double PI = 3.14159265358979323846;\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * PI * j * i / N;\n sum += x[j] * Kokkos::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::Range(0, N), [=](const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * j * i / N;\n sum += x(j) * std::exp(std::complex(0.0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n const int n = x.size();\n Kokkos::parallel_for(\"DFT\", Kokkos::RangePolicy<>(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 
0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"dft\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < N; j++) {\n sum += x(j) * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / N);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n const int N = x.extent(0);\n Kokkos::parallel_for(\"DFT\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const auto n = x.size();\n const auto pi = 3.14159265358979323846;\n Kok\u30b3\u30af::Range range(0, n);\n Kok\u30b3\u30af::ParallelFor(\"dft\", range, KOKKOK_LAMBDA(const int i) {\n Kok\u30b3\u30af::\u8907 sum(0.0, 0.0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2.0 * pi * i * j / n);\n }\n output[i] = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n const double pi = 3.14159265358979323846;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x(j) * Kokkos::complex(cos(-2 * pi * i * j / n), sin(-2 * pi * i * j / n));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * j * i / N;\n sum += x(j) * std::polar(1.0, angle);\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0, 0);\n for (int j = 0; j < N; ++j) {\n double angle = 2 * M_PI * i * j / N;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "(121:1:1:1:1:1:1:", + "\n\n const int n = x.extent(0);\n const double two_pi = 2.0 * Kokkos::Pi;\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < n; ++j) {\n sum += x(j) * std::exp(-two_pi * Kokkos::complex(0.0, 1.0) * i * j / n);\n }\n output(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. 
Store the result in output.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n output.resize(n);\n #pragma opm parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (size_t j = 0; j < n; ++j) {\n const double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp 
parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (size_t j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + ")", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "serial", + "prompt": "/* Compute the discrete fourier transform of x. 
Store the result in output.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "<", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "<", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::polar(1.0, -2 * M_PI * k * t / n);\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0987622966, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0944706162 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": 
true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, -angle);\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0983249436, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1207064788 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n sum += x[t] * std::polar(1.0, -2 * M_PI * k * t / n);\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0987789118, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0944943404 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n for (int i = 0; i < n; ++i) {\n for (int j = 0; j < n; ++j) {\n output[i] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + 
{ + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.098830728, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0988722076 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::polar(1.0, angle);\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; 
k++) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; t++) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "hip", + "prompt": "/* Compute the discrete fourier transform of x. Store the result in output.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\n__global__ void dft(const double *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "(x,y)", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n double sum_real = 0;\n double sum_imag = 0;\n\n for (size_t i = 0; i < N; i++) {\n double angle = 2 * M_PI * gid * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n\n output[gid] = make_cuDoubleComplex(sum_real, sum_imag);\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCadd(0, 0);\n for (int k = 0; k < N; k++) {\n double angle = -2 * M_PI * i * k / N;\n output[i] = cuCadd(output[i], cuCmulf(x[k], cuCexp(cuCmulf(0, angle))));\n }\n }\n}\n", + "\n\n int id = blockIdx.x * blockDim.x + threadIdx.x;\n if (id < N) {\n double sum_real = 0, sum_imag = 0;\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * id * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n output[id] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double real = 0;\n double imag = 0;\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n real += x[i] * cos(angle);\n imag += x[i] * sin(angle);\n }\n output[tid] = make_cuDoubleComplex(real, imag);\n }\n}\n", + "\n\n int i 
= blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleReal sum_real = 0;\n cuDoubleReal sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplexf(sum_real, sum_imag);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum += x[j] * make_cuDoubleComplex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = bid * blockDim.x + tid;\n\n if (gid >= N) {\n return;\n }\n\n double real = 0;\n double imag = 0;\n for (size_t k = 0; k < N; ++k) {\n double angle = 2 * M_PI * k * gid / N;\n real += x[k] * cos(angle);\n imag += x[k] * sin(angle);\n }\n\n output[gid] = make_cuDoubleComplex(real, imag);\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum_real = 0;\n double sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double real = 0;\n double imag = 0;\n for (size_t k = 0; k < N; k++) {\n double angle = -2 * M_PI * k * gid / N;\n real += x[k] * cos(angle);\n imag += x[k] * sin(angle);\n }\n output[gid] = make_cuDoubleComplex(real, imag);\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n double angle = 2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[i] * term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { 
+ "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n local_output[i] += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / x.size());\n }\n }\n\n // Gather the local DFTs\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes a part of the output\n int start = rank * n / size;\n int end = (rank + 1) * n / size;\n\n for (int i = start; i < end; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * i * j / n;\n sum += std::complex(x[j], 0) * std::exp(std::complex(0, angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT for each rank\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n\n // Gather the results from all ranks\n std::vector> all_output(n * size);\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, all_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the gathered results to output\n if (rank == 0) {\n output = all_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; k++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * 
M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = -2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::polar(1.0, -2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int i = 0; i < n; i++) {\n std::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local contribution of each process\n for (int i = rank; i < n; i += size) {\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n output[j] += x[i] * std::complex(std::cos(angle), std::sin(angle));\n }\n }\n\n // Reduce the local contributions to the global result\n MPI_Allreduce(MPI_IN_PLACE, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n 
output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local DFT\n for (int i = rank; i < n; i += size) {\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n output[i] += x[j] * std::complex(cos(angle), sin(angle));\n }\n }\n\n // Reduce the local DFTs to the global DFT\n MPI_Allreduce(MPI_IN_PLACE, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "cuda", + "prompt": "/* Compute the discrete fourier transform of x. Store the result in output.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\n__global__ void dft(const double *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[i];\n sum += term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = 
make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n double angle = -2 * M_PI * tid * i / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[i];\n sum += term;\n }\n output[tid] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int k = 0; k < N; ++k) {\n double angle = -2 * M_PI * i * k / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[k] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * c;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t kBlockSize = 256;\n const size_t kGridSize = (N + kBlockSize - 1) / kBlockSize;\n const size_t kSharedMemorySize = 2 * kBlockSize;\n\n dim3 blockDim(kBlockSize, 1, 1);\n dim3 gridDim(kGridSize, 1, 1);\n shared double sdata[2 * kBlockSize];\n\n for (size_t i = 0; i < N; ++i) {\n output[i] = make_cuDoubleComplex(0, 0);\n }\n\n for (size_t i = 0; i < N; ++i) {\n sdata[2 * threadIdx.x + 0] = x[i];\n sdata[2 * threadIdx.x + 1] = 0;\n }\n\n __syncthreads();\n\n for (size_t k = 0; k < N; ++k) {\n double e = -2 * M_PI * k / N;\n double c = cos(e);\n double s = sin(e);\n double t0 = 1;\n double t1 = 0;\n for (size_t i = 0; i < N / 2; ++i) {\n double a = sdata[2 * i + 0];\n double b = sdata[2 * i + 1];\n double ta = t0 * a - t1 * b;\n double tb = t0 * b + t1 * a;\n sdata[2 * i + 0] = ta;\n sdata[2 * i + 1] = tb;\n t0 = c;\n t1 = s;\n c = t0 * c - t1 * s;\n s = t0 * s + t1 * c;\n }\n __syncthreads();\n output[k] = make_cuDoubleComplex(sdata[0], sdata[1]);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = threadIdx.x;\n double sum_real = 0;\n double sum_imag = 0;\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n sum_real += x[j] * cos(angle);\n sum_imag += x[j] * sin(angle);\n }\n output[i] = make_cuDoubleComplex(sum_real, sum_imag);\n}\n", + "\n const int tid = threadIdx.x + blockIdx.x * blockDim.x;\n const int stride = blockDim.x * gridDim.x;\n for (size_t i = tid; i < N; i += stride) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (size_t j = 0; j < N; j++) {\n const double angle = 2 * 
M_PI * j * i / N;\n const cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(x[j], term));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = tid + bid * blockDim.x;\n if (gid < N) {\n double sum_real = 0;\n double sum_imag = 0;\n for (size_t i = 0; i < N; i++) {\n double angle = -2 * M_PI * gid * i / N;\n sum_real += x[i] * cos(angle);\n sum_imag += x[i] * sin(angle);\n }\n output[gid] = make_cuDoubleComplex(sum_real, sum_imag);\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += w * x[j];\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex c = make_cuDoubleComplex(cos(angle), sin(angle));\n sum = cuCadd(sum, cuCmul(c, make_cuDoubleComplex(x[j], 0)));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = -2 * M_PI * i * j / N;\n cuDoubleComplex term = make_cuDoubleComplex(cos(angle), sin(angle)) * x[j];\n sum += term;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI and OpenMP to compute in parallel. 
Assume MPI is already initialized.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n output.resize(n);\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_n = n / size;\n std::vector> local_x(local_n);\n std::vector> local_output(local_n);\n std::copy(x.begin() + rank * local_n, x.begin() + (rank + 1) * local_n, local_x.begin());\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n sum += local_x[j] * std::exp(-2 * M_PI * i * j / local_n);\n }\n local_output[i] = sum;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_REAL, output.data(), local_n, MPI_DOUBLE_REAL, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT for each frequency\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = rank; i < n; i += nprocs) {\n for (int j = 0; j < n; j++) {\n std::complex sum(0, 0);\n for (int k = 0; k < n; k++) {\n sum += x[k] * std::exp(-2 * M_PI * std::complex(0, 1) * i * k / n);\n }\n local_output[i] = sum;\n }\n }\n\n // Gather the local results into the output vector\n MPI_Allgather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the DFT of each element of x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < 
n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial DFT for this rank\n std::vector> partial_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(std::cos(angle), std::sin(angle));\n }\n partial_output[i] = sum;\n }\n\n // Gather the partial results into the output vector\n MPI_Gather(partial_output.data(), n, MPI_DOUBLE_COMPLEX,\n output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Allgather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the DFT of each element in x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n MPI_Allgather(MPI_IN_PLACE, n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double 
angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT in parallel using MPI and OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element of x in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[0], n, MPI_DOUBLE_COMPLEX, &output[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of each element in parallel\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n\n // Gather the results from all processes\n std::vector> global_output(n * size);\n MPI_Gather(output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is the root process, copy the results to the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX,\n output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * j * k / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute the fourier transform of x. Store the result in output.\n Use Kokkos to compute in parallel. 
Assume Kokkos is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(Kokkos::View*> &x, Kokkos::View*> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n using namespace Kokkos::FFT;\n using namespace Kokkos::Impl;\n\n const int n = x.extent(0);\n const int n_half = n / 2;\n const int n_quarter = n / 4;\n const int n_eighth = n / 8;\n\n // Create a plan for the FFT\n auto plan = FFTPlan::create(n);\n\n // Execute the FFT\n plan->execute(x, output);\n\n // Reorder the output to be in the correct order\n parallel_for(n, KOKKOS_LAMBDA(int i) {\n if (i < n_half) {\n output(i) = output(i);\n } else {\n output(i) = output(n - i);\n }\n });\n\n // Compute the magnitude of the output\n parallel_for(n, KOKKOS_LAMBDA(int i) { output(i) = abs(output(i)); });\n\n // Normalize the output\n parallel_for(n, KOKKOS_LAMBDA(int i) { output(i) /= n; });\n}\n", + "\n\n int n = x.extent(0);\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n int m = 1 << log_n;\n Kokkos::View*> w(m);\n for (int i = 0; i < m; i++) {\n w(i) = Kokkos::complex(cos(2 * M_PI * i / m), sin(2 * M_PI * i / m));\n }\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int &i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < m; j++) {\n sum += x((i + j) % m) * w(j);\n }\n output(i) = sum;\n });\n}\n", + "\n using namespace Kokkos;\n using namespace std;\n const int N = x.extent(0);\n const int M = output.extent(0);\n const double pi = 3.14159265358979323846;\n parallel_for(\"Fourier Transform\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const int i) {\n output(i) = {0.0, 0.0};\n for (int j = 0; j < N; j++) {\n output(i) += x(j) * exp(complex(0, -2 * pi * i * j / N));\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n int log_n = 0;\n while ((1 << log_n) < n) {\n log_n++;\n }\n\n Kokkos::View*> temp(\"temp\", n);\n\n for (int i = 0; i < n; i++) {\n int rev = 0;\n for (int j = 0; j < log_n; j++) {\n rev |= ((i >> j) & 1) << (log_n - j - 1);\n }\n temp(rev) = x(i);\n }\n\n for (int s = 1; s <= log_n; s++) {\n int m = 1 << s;\n int m2 = m >> 1;\n Kokkos::complex w(cos(2 * Kokkos::Pi / m), sin(2 * Kokkos::Pi / m));\n for (int k = 0; k < n; k += m) {\n Kokkos::complex wm(1, 0);\n for (int j = 0; j < m2; j++) {\n Kokkos::complex t = temp(k + j);\n Kokkos::complex u = temp(k + j + m2) * wm;\n temp(k + j) = t + u;\n temp(k + j + m2) = t - u;\n wm *= w;\n }\n }\n }\n\n output = temp;\n}\n", + "\n\n const int N = x.extent(0);\n const int logN = std::log2(N);\n const int halfN = N / 2;\n\n // Reverse bits\n Kokkos::View reversed = Kokkos::View(\"reversed\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) {\n int rev = 0;\n int bit = 1;\n for (int j = 0; j < logN; j++) {\n if (i & bit) {\n rev |= 1 << (logN - j - 1);\n }\n bit <<= 1;\n }\n reversed(i) = rev;\n });\n\n // Compute the FFT\n Kokkos::View*> y = Kokkos::View*>(\"y\", N);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { y(i) = x(reversed(i)); });\n\n for (int s = 1; s <= logN; s++) {\n const int m = 1 << s;\n const int m2 = m / 2;\n const double theta = 2 * M_PI / m;\n Kokkos::complex w(std::cos(theta), std::sin(theta));\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) {\n if (i % m < m2) {\n const Kokkos::complex t = w * y(i + m2);\n y(i + m2) = y(i) - t;\n y(i) = y(i) + t;\n }\n });\n 
}\n\n // Copy the result to the output\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(int i) { output(i) = y(i); });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int N = x.extent(0);\n const int logN = std::log2(N);\n\n // Reverse bits\n Kokkos::View reversed(N);\n Kokkos::parallel_for(\"fft_reverse_bits\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int reversed_i = 0;\n int j = i;\n for (int k = 0; k < logN; ++k) {\n reversed_i |= ((j & 1) << (logN - k - 1));\n j >>= 1;\n }\n reversed(i) = reversed_i;\n });\n\n // Compute the FFT\n Kokkos::parallel_for(\"fft_compute\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int j = reversed(i);\n output(i) = x(j);\n });\n\n // Compute the twiddle factors\n Kokkos::View*> twiddle(N);\n Kokkos::parallel_for(\"fft_twiddle\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n double angle = -2 * M_PI * i / N;\n twiddle(i) = Kokkos::complex(std::cos(angle), std::sin(angle));\n });\n\n // Compute the FFT\n for (int s = 1; s <= logN; ++s) {\n const int m = 1 << s;\n const int m2 = m / 2;\n for (int k = 0; k < m2; ++k) {\n Kokkos::parallel_for(\"fft_compute_s\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n int j = i & (m - 1);\n if (j < m2) {\n const Kokkos::complex t = twiddle(k * N / m) * output(i + m2);\n output(i + m2) = output(i) - t;\n output(i) += t;\n }\n });\n }\n }\n}\n", + "\n\n using namespace Kokkos;\n using namespace Kokkos::Experimental;\n\n const int N = x.extent(0);\n const int logN = Kokkos::ilog2(N);\n\n auto bit_reverse = [&](int i) {\n int r = 0;\n for (int j = 0; j < logN; ++j) {\n r |= ((i >> j) & 1) << (logN - 1 - j);\n }\n return r;\n };\n\n auto twiddle = [&](int k) {\n return Kokkos::complex(cos(2 * M_PI * k / N), sin(2 * M_PI * k / N));\n };\n\n auto fft_rec = [&](View*> x, int n) {\n if (n == 1) {\n return;\n }\n auto x_even = View*>(x, Range(0, n / 2));\n auto x_odd = View*>(x, Range(n / 2, n));\n fft_rec(x_even, n / 2);\n fft_rec(x_odd, n / 2);\n for (int k = 0; k < n / 2; ++k) {\n auto t = x_odd(k) * twiddle(k);\n x_odd(k) = x_even(k) - t;\n x_even(k) += t;\n }\n };\n\n auto x_bit_reversed = View*>(x.extent(0));\n for (int i = 0; i < N; ++i) {\n x_bit_reversed(bit_reverse(i)) = x(i);\n }\n fft_rec(x_bit_reversed, N);\n output = x_bit_reversed;\n}\n", + "\n\n int n = x.extent(0);\n output.resize(n);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0.0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n if (n == 1) {\n output(0) = x(0);\n return;\n }\n Kokkos::View*> even = Kokkos::subview(output, Kokkos::ALL(), Kokkos::Slice(0, n / 2));\n Kokkos::View*> odd = Kokkos::subview(output, Kokkos::ALL(), Kokkos::Slice(n / 2, n));\n Kokkos::parallel_for(n / 2, KOKKOS_LAMBDA(int i) {\n even(i) = x(2 * i);\n odd(i) = x(2 * i + 1);\n });\n fft(even, even);\n fft(odd, odd);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex w = Kokkos::complex(cos(2 * M_PI * i / n), sin(2 * M_PI * i / n));\n output(i) = even(i % (n / 2)) + w * odd(i % (n / 2));\n });\n}\n", + "\n\n // Compute the size of the input\n const int n = x.extent(0);\n\n // Create a scratchpad for the 
intermediate results\n Kokkos::View*> scratchpad(\"scratchpad\", n);\n\n // Compute the FFT in parallel\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(const int i) {\n // Compute the FFT of the i-th element of x\n Kokkos::complex result = std::complex(0.0, 0.0);\n for (int j = 0; j < n; ++j) {\n result += x(j) * std::exp(-2.0 * M_PI * std::complex(0.0, 1.0) * i * j / n);\n }\n\n // Store the result in the scratchpad\n scratchpad(i) = result;\n });\n\n // Copy the results from the scratchpad to the output\n Kokkos::deep_copy(output, scratchpad);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n double real = 0.0;\n double imag = 0.0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n real += x(j).real() * cos(angle) + x(j).imag() * sin(angle);\n imag += x(j).imag() * cos(angle) - x(j).real() * sin(angle);\n }\n output(i) = Kokkos::complex(real, imag);\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n output[i] = Kokkos::complex(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n output[i] += x[j] * Kokkos::exp(Kokkos::complex(0.0, -2.0 * Kokkos::Pi * i * j / n));\n }\n });\n}\n", + "\n\n int n = x.extent(0);\n output(0) = x(0);\n for (int k = 1; k < n; ++k) {\n Kokkos::complex w = std::polar(1.0, -2 * Kokkos::Math::pi * k / n);\n Kokkos::complex t = x(0);\n for (int j = 1; j < n; ++j) {\n t += w * x(j);\n w *= w;\n output(j) = t;\n }\n }\n}\n", + "\n\n int n = x.extent(0);\n output.resize(n);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * Kokkos::Impl::pi() * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * std::exp(Kokkos::complex(0, -angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n // Get the size of the input\n int n = x.extent(0);\n\n // Create a Kokkos view for the output\n Kokkos::View*> output_view(\"output\", n);\n\n // Compute the FFT in parallel\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy(0, n), KOKKOS_LAMBDA(int i) {\n output_view(i) = fft_helper(x, i, n);\n });\n\n // Copy the output view to the output array\n Kokkos::deep_copy(output, output_view);\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {\n Kokkos::complex sum = 0;\n for (int j = 0; j < n; j++) {\n sum += x(j) * std::exp(-2 * M_PI * std::complex(0, 1) * i * j / n);\n }\n output(i) = sum;\n });\n}\n", + "\n\n int n = x.extent(0);\n Kokkos::parallel_for(\"fft\", Kokkos::RangePolicy<>(0, n), KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * j * i / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n", + "\n\n const int n = x.extent(0);\n output.resize(n);\n\n Kokkos::parallel_for(\"fft\", n, KOKKOS_LAMBDA(const int i) {\n Kokkos::complex sum(0.0, 0.0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x(j) * Kokkos::complex(cos(angle), sin(angle));\n }\n output(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute the fourier transform of x. 
Store the result in output.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n const int n = x.size();\n output.resize(n);\n #pragma openmp parallel for\n for (int i = 0; i < n; ++i) {\n double real = 0, imag = 0;\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n real += x[j].first * cos(angle) + x[j].second * sin(angle);\n imag += x[j].first * sin(angle) - x[j].second * cos(angle);\n }\n output[i] = {real, imag};\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex arg(2 * M_PI * i * j / n, 0);\n sum += x[j] * std::exp(-arg);\n }\n output[i] = sum;\n }\n}\n", + "<", + "\n\n const int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::polar(1.0, angle);\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t n = x.size();\n output.resize(n);\n const double pi = std::acos(-1);\n\n #pragma omp parallel for\n for (size_t k = 0; k < n; ++k) {\n std::complex sum = {0, 0};\n for (size_t t = 0; t < n; ++t) {\n double angle = 2 * pi * t * k / n;\n sum += x[t] * std::complex(std::cos(angle), std::sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum(0, 0);\n for (int t = 0; t < n; ++t) {\n double angle = 2 * M_PI * t * k / n;\n sum += x[t] * std::complex(cos(angle), sin(angle));\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex omega = std::polar(1.0, -2 * M_PI * i * j / n);\n sum += x[j] * omega;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n std::complex arg(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * arg;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n // Compute the DFT of x\n #pragma omp parallel for\n for (int k = 0; k < n; ++k) {\n std::complex sum = 0;\n for (int j = 0; j < n; ++j) {\n sum += x[j] * std::exp(-2 * M_PI * std::complex(0, 1) * j * k / n);\n }\n output[k] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 
0);\n for (int j = 0; j < n; j++) {\n sum += x[j] * std::exp(-2 * M_PI * i * j / n);\n }\n output[i] = sum;\n }\n}\n", + "\n\n const int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; ++j) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n double angle = 2 * M_PI * i * j / n;\n sum += x[j] * std::complex(cos(angle), sin(angle));\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex w(cos(2 * M_PI * i * j / n), sin(2 * M_PI * i * j / n));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n int n = x.size();\n output.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n; j++) {\n std::complex omega = std::polar(1.0, -2 * M_PI * i * j / n);\n sum += x[j] * omega;\n }\n output[i] = sum;\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x. 
Store the result in output.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double pi = std::acos(-1);\n std::complex w(1), wn(std::cos(2 * pi / n), std::sin(2 * pi / n));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(cos(angle), sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n 
std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2);\n std::vector> odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= 
wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double angle = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(angle), std::sin(angle));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n if 
(n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n for (int i = 0; i < n; ++i) {\n for (int j = 0; j < n; ++j) {\n output[i] += x[j] * std::exp(-2 * std::complex(0, 1) * M_PI * i * j / n);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n output.resize(n);\n if (n == 1) {\n output[0] = x[0];\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(std::cos(ang), std::sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] 
+ }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (n == 1) {\n output = x;\n return;\n }\n std::vector> even(n / 2), odd(n / 2);\n for (int i = 0; i < n / 2; i++) {\n even[i] = x[2 * i];\n odd[i] = x[2 * i + 1];\n }\n fft(even, even);\n fft(odd, odd);\n double ang = 2 * M_PI / n;\n std::complex w(1), wn(cos(ang), sin(ang));\n for (int i = 0; i < n / 2; i++) {\n output[i] = even[i] + w * odd[i];\n output[i + n / 2] = even[i] - w * odd[i];\n w *= wn;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "hip", + "prompt": "/* Compute the fourier transform of x. Store the result in output.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fft(const hipDoubleComplex *x, hipDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x;\n int gid = tid + bid * hipBlockDim_x;\n if (gid >= N) {\n return;\n }\n double arg = -2 * M_PI * gid / N;\n hipDoubleComplex w = hipDoubleComplex(cos(arg), sin(arg));\n hipDoubleComplex sum = hipDoubleComplex(0, 0);\n for (int i = 0; i < N; i++) {\n sum += x[i] * pow(w, i * gid);\n }\n output[gid] = sum;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n output[gid] = x[gid];\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(omega), sin(omega)};\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(theta), sin(theta));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex wp = hipDoubleComplex(1, 0);\n for (size_t k = j; k < j + m; k++) {\n hipDoubleComplex t = wp * output[k + m];\n output[k + m] = output[k] - t;\n output[k] += t;\n wp *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = make_hipDoubleComplex(cos(theta), sin(theta));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex t = make_hipDoubleComplex(1.0, 0.0);\n for 
(size_t k = 0; k < m; k++) {\n hipDoubleComplex u = output[j + k];\n hipDoubleComplex v = output[j + k + m] * t;\n output[j + k] = u + v;\n output[j + k + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / s), sin(M_PI / s));\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex(1.0, 0.0);\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / m), sin(M_PI / m));\n hipDoubleComplex wm = hipDoubleComplex(1.0, 0.0);\n for (size_t j = 0; j < m; j++) {\n for (size_t k = j; k < N; k += s) {\n hipDoubleComplex t = wm * output[k + m];\n output[k + m] = output[k] - t;\n output[k] += t;\n }\n wm *= w;\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n hipDoubleComplex wm = hipDoubleComplex(1, 0);\n for (size_t k = 0; k < m; k++) {\n for (size_t j = 0; j < N / s; j++) {\n size_t index = j * s + k;\n hipDoubleComplex t = wm * output[index + m];\n output[index + m] = output[index] - t;\n output[index] += t;\n }\n wm *= w;\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t half_s = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / half_s), sin(M_PI / half_s));\n for (size_t j = 0; j < N; j += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t k = 0; k < half_s; k++) {\n hipDoubleComplex u = output[j + k];\n hipDoubleComplex v = output[j + k + half_s] * t;\n output[j + k] = u + v;\n output[j + k + half_s] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n hipDoubleComplex sum = {0, 0};\n for (size_t j = 0; j < N; j++) {\n hipDoubleComplex w = hipExp(hipDoubleComplex{0, -2 * M_PI * i * j / N});\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n const size_t tid = hipGetGlobalThreadIdx();\n const size_t N_2 = N / 2;\n const size_t N_4 = N / 4;\n const double arg = -2 * M_PI / N;\n\n hipDoubleComplex sum = {0, 0};\n for (size_t k = 0; k < N; ++k) {\n hipDoubleComplex w = {cos(arg * k), sin(arg * k)};\n hipDoubleComplex term = hipCMul(x[k], w);\n sum = hipCAdd(sum, term);\n }\n\n const size_t index = tid;\n if (tid >= N_2) {\n index -= N_2;\n index = N_4 - index - 1;\n index += N_2;\n }\n\n output[index] = sum;\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n double omega = M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(omega), sin(omega)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n 
hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + s] * t;\n output[k + j] = u + v;\n output[k + j + s] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double theta = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex{cos(theta), sin(theta)};\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n double omega = 2 * M_PI / s;\n hipDoubleComplex w = hipDoubleComplex(cos(omega), sin(omega));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n }\n}\n", + "\n\n int i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n\n for (size_t s = 1; s <= N; s *= 2) {\n for (size_t k = 0; k < N; k += 2 * s) {\n for (size_t j = k; j < k + s; j++) {\n hipDoubleComplex twiddle = hipDoubleComplex{cos(M_PI * j / s),\n -sin(M_PI * j / s)};\n hipDoubleComplex temp = output[j + s] * twiddle;\n output[j + s] = output[j] - temp;\n output[j] += temp;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 2; s <= N; s *= 2) {\n size_t m = s / 2;\n hipDoubleComplex w = hipDoubleComplex(cos(M_PI / m), sin(M_PI / m));\n for (size_t k = 0; k < N; k += s) {\n hipDoubleComplex t = hipDoubleComplex(1, 0);\n for (size_t j = 0; j < m; j++) {\n hipDoubleComplex u = output[k + j];\n hipDoubleComplex v = output[k + j + m] * t;\n output[k + j] = u + v;\n output[k + j + m] = u - v;\n t = t * w;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n output[i] = x[i];\n }\n __syncthreads();\n for (size_t s = 1; s <= N; s *= 2) {\n size_t m = s * 2;\n hipDoubleComplex w = hipDoubleComplex{cos(M_PI / s), sin(M_PI / s)};\n for (size_t k = 0; k < N; k += m) {\n hipDoubleComplex wm = hipDoubleComplex{1, 0};\n for (size_t j = 0; j < s; j++) {\n hipDoubleComplex t = wm * output[k + j + s];\n output[k + j + s] = output[k + j] - t;\n output[k + j] += t;\n wm *= w;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute the 
fourier transform of x. Store the result in output.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFT results\n std::vector> global_output(n);\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, global_output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Store the result in output\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_output(n_local);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), n_local, MPI_DOUBLE_COMPLEX, output.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft(n / size);\n fft_local(x, local_fft);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(n);\n MPI_Gather(local_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, global_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter the global FFT to the local FFTs\n MPI_Scatter(global_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, local_fft.data(), n / size, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Convert the local FFTs to the output format\n output.resize(n);\n for (int i = 0; i < n; i++) {\n output[i] = local_fft[i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = (rank + 1) * local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> 
local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i / x); });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the output vector\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft_local(local_x, local_output);\n\n // Gather the local FFT results\n std::vector> global_output(n);\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, global_output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the global FFT result to the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(),\n [](std::complex const& x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) {\n return std::exp(std::complex(0, -2 * M_PI * x.real()));\n });\n\n // Gather the local FFTs into 
the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft_local(x, rank, size);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(x.size());\n MPI_Allgather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the global FFT in the output vector\n output = global_fft;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) {\n return std::exp(std::complex(0, -2 * M_PI * x.real()));\n });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform of x.\n std::vector> local_fft = fft(x, rank, size);\n\n // Gather the local Fourier transforms into output.\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX,\n output.data(), output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * 1i * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int n_padded = n_local + 2;\n std::vector> padded_x(n_padded, 0.0);\n std::copy(x.begin() + rank * n_local, x.begin() + (rank + 1) * n_local, padded_x.begin() + 1);\n\n std::vector> padded_output(n_padded, 0.0);\n fft(padded_x, padded_output);\n\n std::vector> output_local(n_local, 0.0);\n std::copy(padded_output.begin() + 1, padded_output.begin() + n_local + 1, output_local.begin());\n\n MPI_Allgather(output_local.data(), n_local, MPI_DOUBLE_PRECISION, output.data(), n_local, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n fft(local_x, local_output);\n\n // Gather the results\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, 
MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If we are not rank 0, we can return now\n if (rank != 0) {\n return;\n }\n\n // If we are rank 0, we need to combine the results\n for (int i = 1; i < size; i++) {\n int start = i * local_n;\n int end = start + local_n;\n for (int j = start; j < end; j++) {\n output[j] += output[j - start];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n std::transform(local_x.begin(), local_x.end(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local FFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n output.resize(n);\n\n // Compute the local FFT\n std::vector> local_fft(n / size);\n MPI_Scatter(x.data(), n / size, MPI_DOUBLE_COMPLEX, local_fft.data(), n / size, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n fft_local(local_fft);\n\n // Gather the local FFTs\n MPI_Gather(local_fft.data(), n / size, MPI_DOUBLE_COMPLEX, output.data(), n / size, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(output);\n }\n\n // Broadcast the global FFT\n MPI_Bcast(output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft = fft_local(x, rank, size);\n\n // Gather the local FFTs into a global FFT\n std::vector> global_fft(x.size());\n MPI_Allgather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the global FFT in the output vector\n output = global_fft;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local Fourier transform\n std::vector> local_output(x.size() / size);\n std::transform(x.begin() + rank * local_output.size(), x.begin() + (rank + 1) * local_output.size(), local_output.begin(), [](std::complex x) { return std::exp(-2 * M_PI * std::complex(0, 1) * x); });\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "cuda", + "prompt": "/* Compute the fourier transform of x. Store the result in output.\n Use CUDA to compute in parallel. 
The kernel is launched with at least N threads.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\n__global__ void fft(const cuDoubleComplex *x, cuDoubleComplex *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmul(2 * M_PI * i / N, cuCmulf(I, x[i])), -1));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(-2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmulf(2 * M_PI * i / N, cuCmulf(0, 1)), x[i]));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuDoubleComplex(0, -2 * M_PI * i / N), x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuDoubleComplex{0, -2 * M_PI * i / N}, x[i]));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex sum = make_cuDoubleComplex(0, 0);\n for (int j = 0; j < N; j++) {\n double angle = 2 * M_PI * j * i / N;\n cuDoubleComplex w = make_cuDoubleComplex(cos(angle), sin(angle));\n sum += x[j] * w;\n }\n output[i] = sum;\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(cuCmulf(2 * M_PI * i / N, cuDoubleComplex(0, 1)), x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(-2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n cuDoubleComplex y = cuCexp(cuCmul(cuCbrt(-1.0), 2.0 * M_PI * i / N));\n output[i] = cuCmulf(x[i], y);\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCmul(x[i], cuCexp(cuCmulf(-2 * M_PI * i / N, cuDoubleComplex{0, 1})));\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + "\n\n size_t i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n", + ":", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < N) {\n output[i] = cuCexp(cuCmulf(2 * M_PI * i / N, x[i]));\n }\n}\n" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "mpi+omp", + "prompt": 
"#include \n#include \n\n/* Compute the discrete fourier transform of x. Store the result in output.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local FFT results\n std::vector> global_output(x.size());\n MPI_Allgather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n global_output.data(), global_output.size(), MPI_DOUBLE_COMPLEX,\n MPI_COMM_WORLD);\n\n // Store the result in output\n output = global_output;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_C_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the global DFT into the output vector\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output.\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT of the gathered local DFTs.\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local results\n std::vector> global_output(x.size());\n MPI_Allgather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, global_output.data(), global_output.size(), 
MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n\n // Store the result in output\n output = global_output;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(n * size);\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is the root process, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n // Get the size of the input vector.\n int n = x.size();\n\n // Create a MPI communicator for the FFT.\n MPI_Comm fft_comm;\n MPI_Comm_split(MPI_COMM_WORLD, 0, MPI_THREAD_SINGLE, &fft_comm);\n\n // Create an OpenMP parallel region for the FFT.\n#pragma openmp parallel\n {\n // Get the rank of the current thread.\n int tid = openmp_get_thread_num();\n\n // Compute the local FFT of the input vector.\n std::vector> local_fft(n);\n fft(x, local_fft);\n\n // Gather the local FFTs into the output vector.\n#pragma openmp critical\n {\n for (int i = 0; i < n; ++i) {\n output[i] += local_fft[i];\n }\n }\n }\n\n // Free the MPI communicator.\n MPI_Comm_free(&fft_comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int n_local = n / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n std::vector> local_x(x.begin() + start, x.begin() + end);\n std::vector> local_output(n_local);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < n_local; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < n_local; j++) {\n double angle = 2 * M_PI * i * j / n;\n std::complex w(cos(angle), sin(angle));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), n_local, MPI_DOUBLE_COMPLEX, output.data(), n_local, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = (rank + 1) * local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local FFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n std::complex w(cos(2 * M_PI * i * j / local_n), sin(2 * M_PI * i * j / local_n));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local FFT results\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // 
Compute the local DFT of x\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(x.size() / size);\n fft_local(x, local_output, rank, size);\n\n // Gather the local DFTs into output.\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE_COMPLEX,\n output.data(), local_output.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into the global output\n MPI_Allgather(&local_output[0], n, MPI_DOUBLE_COMPLEX, &output[0], n, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n std::complex sum(0, 0);\n for (int j = 0; j < local_n; j++) {\n double angle = 2 * M_PI * i * j / local_n;\n std::complex w(cos(angle), sin(angle));\n sum += w * local_x[j];\n }\n local_output[i] = sum;\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE_COMPLEX,\n output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local FFT\n std::vector> local_fft;\n fft_local(x, local_fft);\n\n // Gather the local FFTs\n std::vector> global_fft(x.size());\n MPI_Gather(local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX,\n global_fft.data(), global_fft.size(), MPI_DOUBLE_COMPLEX, 0,\n MPI_COMM_WORLD);\n\n // Compute the global FFT\n if (rank == 0) {\n fft_global(global_fft);\n }\n\n // Scatter the global FFT\n MPI_ Scatter(global_fft.data(), global_fft.size() / size, MPI_DOUBLE_COMPLEX,\n local_fft.data(), local_fft.size(), MPI_DOUBLE_COMPLEX, 0,\n MPI_COMM_WORLD);\n\n // Convert the local FFT to the output format\n output.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n output[i] = {local_fft[i].real(), local_fft[i].imag()};\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_n = n / size;\n int local_start = rank * local_n;\n int local_end = local_start + local_n;\n\n std::vector> local_x(x.begin() + local_start, x.begin() + local_end);\n std::vector> local_output(local_n);\n\n // Compute the local DFT\n #pragma omp parallel for\n for (int i = 0; i < local_n; i++) {\n local_output[i] = std::complex(0, 0);\n for (int j = 0; j < local_n; j++) {\n double angle = -2 * M_PI * i * j / local_n;\n local_output[i] += local_x[j] * std::exp(std::complex(0, angle));\n }\n }\n\n // Gather the local DFTs into the global output\n MPI_Gather(local_output.data(), local_n, 
MPI_DOUBLE_COMPLEX, output.data(), local_n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, compute the global DFT\n if (rank == 0) {\n fft_global(output);\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x.\n std::vector> local_output(n);\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector.\n std::vector> global_output(n * size);\n MPI_Gather(local_output.data(), n, MPI_DOUBLE_COMPLEX, global_output.data(), n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If this is rank 0, copy the global DFT into the output vector.\n if (rank == 0) {\n output = global_output;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local DFT of x\n std::vector> local_output(x.size());\n fft_local(x, local_output);\n\n // Gather the local DFTs into a global vector\n std::vector> global_output(x.size() * size);\n MPI_Gather(local_output.data(), x.size(), MPI_DOUBLE_COMPLEX, global_output.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank is 0, store the global DFT in output\n if (rank == 0) {\n output = global_output;\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, Kokkos::View &x, Kokkos::View &y, Kokkos::View &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", Kokkos::RangePolicy<>(0, z.extent(0)), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.size(), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int &i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z[i] = alpha * x[i] + y[i];\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int &i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n const int N = x.extent(0);\n Kokkos::parallel_for(\"axpy\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"axpy\", z.extent(0), KOKKOS_LAMBDA(const int i) {\n z(i) = alpha * x(i) + y(i);\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. 
Store the result in z.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma simd\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "serial", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. 
Store the result in z.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0204707574, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0204438328 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0208522566, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0208651052 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0205587766, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0218323698 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.020720952, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.020669281 + } + ] + }, + { + "generated_output": "\n\n for (std::size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0207819748, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0207678842 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0209998648, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0208709682 + } + ] + }, + { + "generated_output": "\n\n for (std::size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0205924882, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.020607232 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": 
true, + "are_all_valid": true, + "best_sequential_runtime": 0.0206821508, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0206250696 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0210667086, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0210141762 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0206762918, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0206953556 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.020854956, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0207984678 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0205417108, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0205019656 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0206614878, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.020646077 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0204760532, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.020415435 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0206920114, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0206929954 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0208396172, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 
0.0207929898 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0208023612, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.020852023 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0209880088, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0209506082 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.020882368, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0208780596 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0206808084, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.02062323 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "hip", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\n__global__ void axpy(double alpha, const double *x, const double *y, double *z, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "(x,y,z,t)", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n z[i] = alpha * x[i] + y[i];\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of z\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (size_t i = 0; i < x.size(); ++i) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n z.resize(x.size());\n for (size_t i = 0; i < x.size(); i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Barrier(MPI_COMM_WORLD);\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], x.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n if (rank == 0) {\n for (int i = 0; i < x.size(); i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], z.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n\n // Compute the local portion of z\n std::vector local_z(n);\n for (int i = 0; i < n; ++i) {\n local_z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the local portions of z into the global vector z on 
rank 0\n if (rank == 0) {\n z.resize(n);\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Every rank computes its part of the result\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) { z[i] = alpha * x[i] + y[i]; }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of the vectors\n int local_size = x.size() / size;\n\n // Allocate memory for the local portion of the vectors\n std::vector local_x(local_size);\n std::vector local_y(local_size);\n std::vector local_z(local_size);\n\n // Scatter the vectors to the ranks\n MPI_Scatter(&x[0], local_size, MPI_DOUBLE, &local_x[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(&y[0], local_size, MPI_DOUBLE, &local_y[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local portion of the result\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * local_x[i] + local_y[i];\n }\n\n // Gather the results from the ranks\n MPI_Gather(&local_z[0], local_size, MPI_DOUBLE, &z[0], local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n int n = x.size();\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n \n if (rank != 0) {\n MPI_Send(&z[0], n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector tmp(n);\n MPI_Recv(&tmp[0], n, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (int j = 0; j < n; j++) {\n z[j] += tmp[j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of z\n for (int i = rank; i < x.size(); i += size) {\n z[i] = alpha * x[i] + y[i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(z.data(), x.size() / size, MPI_DOUBLE, z.data(), x.size() / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n 
MPI_Bcast(&z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "cuda", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use CUDA to compute in parallel. The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\n__global__ void axpy(double alpha, const double *x, const double *y, double *z, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "{,", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n z[i] = alpha * x[i] + y[i];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n z[tid] = alpha * x[tid] + y[tid];\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n z[idx] = alpha * x[idx] + y[idx];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute z = alpha*x+y where x and y are vectors. Store the result in z.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x and y. 
Store the result in z on rank 0.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, n_threads;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n #pragma omp parallel\n {\n n_threads = omp_get_num_threads();\n }\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "(", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n std::vector local_z(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n local_z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(local_z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n z.resize(n);\n\n#pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the vector\n int local_size = x.size() / size;\n\n // Compute the local start index\n int local_start = rank * local_size;\n\n // Compute the local end index\n int local_end = (rank + 1) * local_size;\n\n // Compute the local part of the result\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_z[i] = alpha * x[local_start + i] + y[local_start + i];\n }\n\n // Gather the local results into the global result\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete result\n if (rank == 0) {\n z.resize(x.size());\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the size of the local portion of x and y\n int local_size = x.size() / nprocs;\n\n // Compute the starting index of the local portion of x and y\n int start_index = rank * local_size;\n\n // Compute the local portion of z\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_z[i] = alpha * x[start_index + i] + y[start_index + i];\n }\n\n // Gather the local portions of z into the global vector z on rank 0\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, 
size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of the vector\n int local_size = x.size() / size;\n\n // Compute the starting index of the local portion of the vector\n int start_index = rank * local_size;\n\n // Compute the local portion of the vector\n std::vector local_x(x.begin() + start_index, x.begin() + start_index + local_size);\n std::vector local_y(y.begin() + start_index, y.begin() + start_index + local_size);\n\n // Compute the local portion of the result\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * local_x[i] + local_y[i];\n }\n\n // Gather the local portions of the result into the global result\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n if (rank == 0) {\n z.resize(n);\n }\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n if (rank == 0) {\n z[i] = alpha * x[i] + y[i];\n }\n }\n MPI_Gather(z.data(), n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of the local portion of x and y\n int local_size = x.size() / size;\n\n // Compute the starting index of the local portion of x and y\n int start_index = rank * local_size;\n\n // Compute the local portion of z\n std::vector local_z(local_size);\n #pragma omp parallel for\n for (int i = 0; i < local_size; ++i) {\n local_z[i] = alpha * x[start_index + i] + y[start_index + i];\n }\n\n // Gather the local portions of z into the global vector z on rank 0\n MPI_Gather(local_z.data(), local_size, MPI_DOUBLE, z.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of each block\n int block_size = x.size() / size;\n\n // Compute the starting index of each block\n int 
start_index = rank * block_size;\n\n // Compute the ending index of each block\n int end_index = start_index + block_size;\n\n // If the rank is the last rank, then the ending index is the size of the vector\n if (rank == size - 1) {\n end_index = x.size();\n }\n\n // Compute the local result\n std::vector local_z(block_size);\n for (int i = start_index; i < end_index; i++) {\n local_z[i - start_index] = alpha * x[i] + y[i];\n }\n\n // Reduce the local results to the global result\n MPI_Allreduce(local_z.data(), z.data(), block_size, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n z.resize(n);\n\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the size of each chunk of data that each rank will process\n int chunk_size = x.size() / size;\n\n // Compute the starting index of the data that each rank will process\n int start_index = rank * chunk_size;\n\n // Compute the ending index of the data that each rank will process\n int end_index = (rank + 1) * chunk_size;\n\n // If the rank is the last rank, then the ending index is the size of the data\n if (rank == size - 1) {\n end_index = x.size();\n }\n\n // Compute the local result for this rank\n std::vector local_result(chunk_size);\n #pragma omp parallel for\n for (int i = start_index; i < end_index; i++) {\n local_result[i - start_index] = alpha * x[i] + y[i];\n }\n\n // Gather the local results from all ranks into the global result on rank 0\n MPI_Gather(local_result.data(), chunk_size, MPI_DOUBLE, z.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = x.size();\n z.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n MPI_Gather(&z[0], n, MPI_DOUBLE, &z[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n z[i] = alpha * x[i] + y[i];\n }\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, n, MPI_DOUBLE, z.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(z.data(), n, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix. x and b have N elements.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(Kokkos::View &A, Kokkos::View &b, Kokkos::View &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::Range(0, N), [=](const int i) {\n double sum = 0;\n for (int j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n KokConfigService config;\n config.set_num_threads(KokConfigService::num_hardware_threads());\n KokConfigService::set_configuration(config);\n KokConfigService::print_configuration();\n\n KokConfigService::set_configuration(config);\n KokConfigService::print_configuration();\n\n KokKok::parallel_for(N, KOK_LAMBDA(size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A[i][j] * x[j]; }\n x[i] = (b[i] - sum) / A[i][i];\n });\n}\n", + "\n\n // Create a Kokkos view for the matrix A.\n Kokkos::View A_view = Kokkos::View(\"A\", N, N);\n Kokkos::deep_copy(A_view, A);\n\n // Create a Kokkos view for the vector b.\n Kokkos::View b_view = Kokkos::View(\"b\", N);\n Kokkos::deep_copy(b_view, b);\n\n // Create a Kokkos view for the vector x.\n Kokkos::View x_view = Kokkos::View(\"x\", N);\n\n // Solve the linear system using the Kokkos parallel backend.\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A_view(i, j) * x_view(j);\n }\n x_view(i) = (b_view(i) - sum) / A_view(i, i);\n });\n\n // Copy the solution vector x back to the output view.\n Kokkos::deep_copy(x, x_view);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution.\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define a Kokkos kernel functor to perform the computation.\n Kokkos::KernelFunctor kernelFunctor([&](const int i) {\n // Initialize the solution vector x.\n x(i) = 0.0;\n\n // Perform forward substitution to solve the linear system.\n for (int j = 0; j < i; ++j) {\n x(i) -= A(i, j) * x(j);\n }\n\n // Perform backward substitution to solve the linear system.\n for (int j = i + 1; j < N; ++j) {\n x(i) -= A(i, j) * x(j);\n }\n\n // Divide by the diagonal element to get the final solution.\n x(i) /= A(i, i);\n });\n\n // Execute the kernel functor using the Kokkos team policy.\n Kokkos::parallel_for(policy, kernelFunctor);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution.\n Kokkos::TeamPolicy policy(N);\n\n // Create a Kokkos parallel loop to solve the linear system.\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Solve the ith equation of the linear system.\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy 
policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int& i) {\n // Initialize the solution vector x\n x[i] = 0.0;\n\n // Compute the dot product of the ith row of A and x\n double dot_product = 0.0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A(i, j) * x[j];\n }\n\n // Update the solution vector x\n x[i] = (b[i] - dot_product) / A(i, i);\n });\n}\n", + "\n\n Kokkos::View A_copy = Kokkos::View(\"A_copy\", N, N);\n Kokkos::deep_copy(A_copy, A);\n Kokkos::View b_copy = Kokkos::View(\"b_copy\", N);\n Kokkos::deep_copy(b_copy, b);\n Kokkos::View x_copy = Kokkos::View(\"x_copy\", N);\n Kokkos::deep_copy(x_copy, x);\n\n for (size_t i = 0; i < N; i++) {\n double max_val = std::abs(A_copy(i, i));\n size_t max_idx = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A_copy(j, i)) > max_val) {\n max_val = std::abs(A_copy(j, i));\n max_idx = j;\n }\n }\n if (max_val == 0) {\n std::cout << \"The matrix is singular and cannot be inverted.\" << std::endl;\n return;\n }\n for (size_t j = 0; j < N; j++) {\n std::swap(A_copy(i, j), A_copy(max_idx, j));\n }\n std::swap(b_copy(i), b_copy(max_idx));\n for (size_t j = 0; j < N; j++) {\n if (i != j) {\n double factor = A_copy(j, i) / A_copy(i, i);\n for (size_t k = 0; k < N; k++) {\n A_copy(j, k) -= factor * A_copy(i, k);\n }\n b_copy(j) -= factor * b_copy(i);\n }\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A_copy(i, j) * x_copy(j);\n }\n x_copy(i) = (b_copy(i) - sum) / A_copy(i, i);\n }\n Kokkos::deep_copy(x, x_copy);\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int& i) {\n // Initialize the solution vector x\n x[i] = 0.0;\n\n // Iterate over the rows of the matrix A\n for (size_t j = 0; j < N; ++j) {\n // Compute the dot product of the ith row of A and the solution vector x\n double dot_product = 0.0;\n for (size_t k = 0; k < N; ++k) {\n dot_product += A(i, k) * x[k];\n }\n\n // Update the ith element of the solution vector x\n x[i] = (b[i] - dot_product) / A(i, i);\n }\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to be executed in parallel\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Initialize the solution vector x\n x(i) = 0.0;\n\n // Compute the dot product of the ith row of A and x\n for (int j = 0; j < N; ++j) {\n x(i) += A(i, j) * x(j);\n }\n\n // Subtract the dot product from b(i) to get the solution\n x(i) = b(i) - x(i);\n });\n}\n", + "\n\n // Create a copy of A for in-place LU decomposition\n auto A_copy = Kokkos::subview(A, Kokkos::ALL(), Kokkos::ALL(), Kokkos::NO_OFFSET, Kokkos::NO_OFFSET);\n\n // Perform LU decomposition\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n for (size_t j = 0; j < N; ++j) {\n A_copy(i, j) /= A_copy(i, i);\n }\n for (size_t k = 0; k < N; ++k) {\n for (size_t j = 0; j < N; ++j) {\n if (k > i) {\n A_copy(k, j) -= A_copy(k, i) * A_copy(i, j);\n } else if (k < i) {\n A_copy(k, j) -= A_copy(k, i) * A_copy(i, j);\n }\n }\n }\n });\n\n // Solve the system\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n x(i) = b(i) / A_copy(i, i);\n for (size_t j = 0; 
j < N; ++j) {\n if (j > i) {\n x(j) -= x(i) * A_copy(j, i);\n }\n }\n });\n\n // Backward substitution\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n for (size_t j = 0; j < N; ++j) {\n if (j < i) {\n x(j) -= x(i) * A_copy(i, j);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n // Create a Kokkos team policy for parallel execution\n Kokkos::TeamPolicy policy(N, Kokkos::AUTO);\n\n // Define the kernel lambda function to solve the linear system\n Kokkos::parallel_for(\"solveLinearSystem\", policy, KOKKOS_LAMBDA(const int &i) {\n // Initialize the solution vector x\n x(i) = 0.0;\n\n // Compute the dot product of the ith row of A and x\n double dot_product = 0.0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A(i, j) * x(j);\n }\n\n // Update the solution vector x\n x(i) = (b(i) - dot_product) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A(i, j) * x(j);\n }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n", + "_t", + "\n\n Kokkos::parallel_for(\"solveLinearSystem\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n x(i) = (b(i) - sum) / A(i, i);\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. 
x and b have N elements.\n Use OpenMP to compute in parallel.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n #pragma omp parallel for\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Forward elimination\n for (size_t j = i + 1; j < N; j++) {\n double factor = A_copy[j][i] / A_copy[i][i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j][k] -= factor * A_copy[i][k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Back substitution\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A_copy[i][j] * x[j];\n }\n x[i] /= A_copy[i][i];\n }\n}\n", + "\n\n // Solve the system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n x[N - 1] = b[N - 1] / A[(N - 1) * N + (N - 1)];\n for (int i = N - 2; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Solve the system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double inv_aii = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] *= inv_aii;\n }\n b[i] *= inv_aii;\n\n // Subtract the ith row from the other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n x[N - 1] = b[N - 1];\n for (int i = N - 2; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = b[i] - sum;\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Perform Gaussian elimination in parallel\n 
#pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A_local[i * N + i];\n for (size_t j = i; j < N; ++j) {\n A_local[i * N + j] *= factor;\n }\n b_local[i] *= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A_local[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A_local[j * N + k] -= multiplier * A_local[i * N + k];\n }\n b_local[j] -= multiplier * b_local[i];\n }\n }\n\n // Perform back substitution in parallel\n #pragma omp parallel for\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_local[i * N + j] * x[j];\n }\n x[i] = (b_local[i] - sum) / A_local[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = i; j < N; j++) {\n A_copy[i * N + j] *= factor;\n }\n b_copy[i] *= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A_copy[j * N + i];\n for (size_t k = i; k < N; k++) {\n A_copy[j * N + k] -= multiplier * A_copy[i * N + k];\n }\n b_copy[j] -= multiplier * b_copy[i];\n }\n }\n\n // Perform back substitution to solve for x\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (int j = i + 1; j < N; j++) {\n x[i] -= A_copy[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an NxN matrix in row-major. x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a copy of A and b for each thread.\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Solve the linear system for each thread.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Forward elimination.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_local[j][i] / A_local[i][i];\n for (size_t k = 0; k < N; ++k) {\n A_local[j][k] -= factor * A_local[i][k];\n }\n b_local[j] -= factor * b_local[i];\n }\n\n // Back substitution.\n x[i] = b_local[i] / A_local[i][i];\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0.0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n #pragma omp parallel for\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n #pragma omp parallel for reduction(+:sum)\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A_local[i * N + j] * x[j];\n }\n x[i] = (b_local[i] - sum) / A_local[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b 
for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A_copy[i][i];\n for (size_t j = i; j < N; ++j) {\n A_copy[i][j] /= factor;\n }\n b_copy[i] /= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A_copy[j][i];\n for (size_t k = i; k < N; ++k) {\n A_copy[j][k] -= multiplier * A_copy[i][k];\n }\n b_copy[j] -= multiplier * b_copy[i];\n }\n }\n\n // Perform back substitution in parallel\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_copy[i][j] * x[j];\n }\n x[i] = (b_copy[i] - sum) / A_copy[i][i];\n }\n}\n", + "\n\n // Create a copy of A and b to avoid modifying the input\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A_copy[i * N + j] *= factor;\n }\n b_copy[i] *= factor;\n\n // Subtract the ith row from all other rows\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_copy[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A_copy[j * N + k] -= factor * A_copy[i * N + k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Perform back substitution\n x.resize(N);\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A_copy[i * N + j] * x[j];\n }\n x[i] = (b_copy[i] - sum) / A_copy[i * N + i];\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an N*N matrix in row-major. 
x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a vector to store the solution.\n x.resize(N);\n\n // Create a vector to store the residual.\n std::vector r(N);\n\n // Initialize the residual.\n for (size_t i = 0; i < N; i++) {\n r[i] = b[i];\n }\n\n // Solve the linear system using the conjugate gradient method.\n for (size_t k = 0; k < N; k++) {\n // Compute the dot product of the residual and the residual.\n double r_dot_r = 0.0;\n #pragma omp parallel for reduction(+:r_dot_r)\n for (size_t i = 0; i < N; i++) {\n r_dot_r += r[i] * r[i];\n }\n\n // Check if the residual is small enough.\n if (r_dot_r < 1e-10) {\n break;\n }\n\n // Compute the dot product of the residual and the matrix-vector product.\n double r_dot_Ax = 0.0;\n #pragma omp parallel for reduction(+:r_dot_Ax)\n for (size_t i = 0; i < N; i++) {\n double tmp = 0.0;\n for (size_t j = 0; j < N; j++) {\n tmp += A[i * N + j] * x[j];\n }\n r_dot_Ax += r[i] * tmp;\n }\n\n // Compute the step size.\n double alpha = r_dot_r / r_dot_Ax;\n\n // Update the solution.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n x[i] += alpha * r[i];\n }\n\n // Update the residual.\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n double tmp = 0.0;\n for (size_t j = 0; j < N; j++) {\n tmp += A[i * N + j] * x[j];\n }\n r[i] -= alpha * tmp;\n }\n }\n}\n", + "\n\n // Create a copy of A and b for parallel computation\n std::vector A_copy(A);\n std::vector b_copy(b);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Forward elimination\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_copy[j][i] / A_copy[i][i];\n for (size_t k = 0; k < N; ++k) {\n A_copy[j][k] -= factor * A_copy[i][k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Back substitution\n x.resize(N);\n for (int i = N - 1; i >= 0; --i) {\n x[i] = b_copy[i];\n for (int j = i + 1; j < N; ++j) {\n x[i] -= A_copy[i][j] * x[j];\n }\n x[i] /= A_copy[i][i];\n }\n}\n", + "\n\n // Initialize x to 0\n x.assign(N, 0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = A[i * N + i];\n #pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n #pragma omp parallel for\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Create a copy of A and b for local computation\n std::vector A_local(A);\n std::vector b_local(b);\n\n // Perform Gaussian elimination in parallel\n #pragma omp parallel\n {\n // Each thread works on a different row of A\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A_local[i][i];\n for (size_t j = i; j < N; ++j) {\n A_local[i][j] /= factor;\n }\n b_local[i] /= factor;\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A_local[j][i];\n for (size_t k = i; k < N; ++k) {\n A_local[j][k] -= factor * A_local[i][k];\n }\n b_local[j] -= factor * b_local[i];\n }\n }\n }\n\n // Back substitution to solve for x\n for (int 
i = N - 1; i >= 0; --i) {\n x[i] = b_local[i];\n for (int j = i + 1; j < N; ++j) {\n x[i] -= A_local[i][j] * x[j];\n }\n }\n}\n", + "\n\n // Solve the linear system Ax=b for x.\n // A is an NxN matrix in row-major. x and b have N elements.\n // Use OpenMP to compute in parallel.\n\n // Create a vector to store the solution.\n x.resize(N);\n\n // Create a temporary vector to store the intermediate results.\n std::vector tmp(N);\n\n // Solve the linear system in parallel.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n // Compute the ith row of the solution.\n tmp[i] = b[i];\n for (size_t j = 0; j < i; ++j) {\n tmp[i] -= A[i * N + j] * x[j];\n }\n tmp[i] /= A[i * N + i];\n }\n\n // Copy the solution to the output vector.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n x[i] = tmp[i];\n }\n}\n", + "\n\n // Create OpenMP parallel region\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Initialize x to 0.\n x.assign(N, 0);\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows.\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n // Forward substitution\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < i; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Backward substitution\n #pragma omp parallel for\n for (int i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] -= sum;\n }\n}\n", + "\n\n // Create a copy of A and b for each thread\n std::vector> A_local(omp_get_num_threads(), std::vector(N * N));\n std::vector> b_local(omp_get_num_threads(), std::vector(N));\n\n // Copy A and b to the local arrays\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n A_local[tid][i * N + j] = A[i * N + j];\n }\n b_local[tid][i] = b[i];\n }\n }\n\n // Solve the linear system for each thread\n #pragma omp parallel\n {\n int tid = omp_get_thread_num();\n std::vector x_local(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < i; j++) {\n sum += A_local[tid][i * N + j] * x_local[j];\n }\n x_local[i] = (b_local[tid][i] - sum) / A_local[tid][i * N + i];\n }\n\n // Copy the local solution to the global solution\n #pragma omp critical\n {\n for (size_t i = 0; i < N; i++) {\n x[i] += x_local[i];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "serial", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. 
x and b have N elements.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n x[i] /= A[i * N + i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n x.resize(N);\n std::vector y(N);\n for (size_t i = 0; i < N; i++) {\n y[i] = b[i];\n }\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] -= sum;\n x[i] = y[i] / A[i * N + i];\n }\n for (size_t i = N - 1; i > 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] -= sum;\n x[i] /= A[i * N + i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n Ab[i * (N + 1) + j] = A[i * N + 
j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(Ab[j * (N + 1) + i]) > std::abs(Ab[pivot_row * (N + 1) + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N + 1; ++j) {\n std::swap(Ab[i * (N + 1) + j], Ab[pivot_row * (N + 1) + j]);\n }\n }\n\n // Eliminate the current column below the pivot row\n for (size_t j = i + 1; j < N; ++j) {\n double factor = Ab[j * (N + 1) + i] / Ab[i * (N + 1) + i];\n for (size_t k = 0; k < N + 1; ++k) {\n Ab[j * (N + 1) + k] -= factor * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Check if the system is consistent\n for (size_t i = 0; i < N; ++i) {\n if (std::abs(Ab[i * (N + 1) + N]) < 1e-9) {\n throw std::runtime_error(\"The system is inconsistent.\");\n }\n }\n\n // Solve the system by back substitution\n x.resize(N);\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = Ab[i * (N + 1) + N];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= Ab[i * (N + 1) + j] * x[j];\n }\n x[i] /= Ab[i * (N + 1) + i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Forward elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector a = A;\n std::vector bb = b;\n for (size_t i = 0; i < N; i++) {\n size_t pivot = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(a[j * N + i]) > std::abs(a[pivot * N + i])) {\n pivot = j;\n }\n }\n if (pivot != i) {\n std::swap(a[i * N + i], a[pivot * N + i]);\n std::swap(bb[i], bb[pivot]);\n }\n for (size_t j = i + 1; j < N; j++) {\n double factor = a[j * N + i] / a[i * N + i];\n for (size_t k = i; k < N; k++) {\n a[j * N + k] -= factor * a[i * N + k];\n }\n bb[j] -= factor * bb[i];\n }\n }\n for (size_t i = N - 1; i < N; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += a[i * N + j] * x[j];\n }\n x[i] = (bb[i] - sum) / a[i * N + i];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector c(N);\n for (size_t i = 0; i < N; i++) {\n c[i] = b[i];\n }\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j 
< N; j++) {\n double t = A[j][i] / A[i][i];\n for (size_t k = 0; k < N; k++) {\n A[j][k] -= t * A[i][k];\n }\n c[j] -= t * c[i];\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n double t = c[i];\n for (size_t j = i + 1; j < N; j++) {\n t -= A[i][j] * x[j];\n }\n x[i] = t / A[i][i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create a copy of A\n std::vector A_copy(A);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A_copy[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A_copy[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A_copy[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j * N + k] -= factor * A_copy[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Perform back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A_copy[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A_copy[i * N + i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find pivot\n size_t pivot = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot * N + i])) {\n pivot = j;\n }\n }\n // Swap rows\n if (pivot != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot * N + j]);\n }\n std::swap(b[i], b[pivot]);\n }\n // Eliminate\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n x[i] /= A[i * N + i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create an augmented matrix [A | b]\n std::vector> aug(N, std::vector(N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n aug[i][j] = A[i * N + j];\n }\n aug[i][N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double div = aug[i][i];\n for (size_t j = 0; j <= N; ++j) {\n aug[i][j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows\n for (size_t j = i + 1; j < N; ++j) {\n double mult = aug[j][i];\n for (size_t k = 0; k <= N; ++k) {\n aug[j][k] -= mult * aug[i][k];\n }\n }\n }\n\n // Extract the solution from the augmented matrix\n for (size_t i = 0; i < N; ++i) {\n x[i] = aug[i][N];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": 
[ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gauss-Jordan elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows to zero out the ith column\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; i++) {\n x[i] = Ab[i * (N + 1) + N];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.127884544, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2979807782 + } + ] + }, + { + "generated_output": "\n\n std::vector A_copy = A;\n std::vector b_copy = b;\n for (size_t i = 0; i < N; i++) {\n double factor = A_copy[i][i];\n for (size_t j = 0; j < N; j++) {\n A_copy[i][j] /= factor;\n }\n b_copy[i] /= factor;\n for (size_t j = i + 1; j < N; j++) {\n double factor2 = A_copy[j][i];\n for (size_t k = 0; k < N; k++) {\n A_copy[j][k] -= factor2 * A_copy[i][k];\n }\n b_copy[j] -= factor2 * b_copy[i];\n }\n }\n for (size_t i = N - 1; i >= 0; i--) {\n x[i] = b_copy[i];\n for (size_t j = i + 1; j < N; j++) {\n x[i] -= A_copy[i][j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Forward elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = i; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = b[i] - sum;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from other rows\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + 
i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; i++) {\n x[i] = Ab[i * (N + 1) + N];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1283011136, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2971803136 + } + ] + }, + { + "generated_output": "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap pivot row with current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create an augmented matrix [A | b]\n std::vector augmentedMatrix(N * (N + 1));\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n augmentedMatrix[i * (N + 1) + j] = A[i * N + j];\n }\n augmentedMatrix[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = augmentedMatrix[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; ++j) {\n augmentedMatrix[i * (N + 1) + j] /= factor;\n }\n\n // Subtract multiples of the ith row from the other rows\n for (size_t j = 0; j < N; ++j) {\n if (i == j) continue;\n double factor = augmentedMatrix[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; ++k) {\n augmentedMatrix[j * (N + 1) + k] -= factor * augmentedMatrix[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution vector x\n for (size_t i = 0; i < N; ++i) {\n x[i] = augmentedMatrix[i * (N + 1) + N];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.12827796, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2976558266 + } + ] + }, + { + "generated_output": "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= 
factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Forward elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from the other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Create an augmented matrix [A | b]\n std::vector Ab(N * (N + 1));\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n Ab[i * (N + 1) + j] = A[i * N + j];\n }\n Ab[i * (N + 1) + N] = b[i];\n }\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double div = Ab[i * (N + 1) + i];\n for (size_t j = 0; j < N + 1; j++) {\n Ab[i * (N + 1) + j] /= div;\n }\n\n // Subtract multiples of the ith row from all other rows\n for (size_t j = 0; j < N; j++) {\n if (i == j) continue;\n double mult = Ab[j * (N + 1) + i];\n for (size_t k = 0; k < N + 1; k++) {\n Ab[j * (N + 1) + k] -= mult * Ab[i * (N + 1) + k];\n }\n }\n }\n\n // Extract the solution from the augmented matrix\n for (size_t i = 0; i < N; i++) {\n x[i] = Ab[i * (N + 1) + N];\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + 
"are_all_valid": true, + "best_sequential_runtime": 0.1282061872, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2982738014 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "hip", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\n__global__ void solveLinearSystem(const double *A, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] -= sum * A[row * N + col] / A[col * N + col];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int bid = hipBlockIdx_x + hipGridDim_x * hipBlockIdx_y;\n if (tid < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[bid * N + i] * x[i];\n }\n x[tid] = (b[bid] - sum) / A[bid * N + tid];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n x[i] += A[i * N + j] * b[j];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n x[row] = b[row] / A[row * N + col];\n } else {\n x[row] -= A[row * N + col] * x[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = hipThreadIdx_x;\n int col = hipBlockIdx_x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n }\n __syncthreads();\n for (int k = row + 1; k < N; k++) {\n if (A[k * N + col] != 0) {\n double factor = A[k * N + col] / A[row * N + col];\n for (int j = 0; j < N; j++) {\n A[k * N + j] -= factor * A[row * N + j];\n }\n b[k] -= factor * b[row];\n }\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n }\n __syncthreads();\n for (int i = row + 1; i < N; i++) {\n double factor = A[i * N + col] / A[row * N + col];\n for (int k = 0; k < N; k++) {\n A[i * N + k] -= factor * A[row * N + k];\n }\n b[i] -= factor * b[row];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if 
(row == col) {\n double sum = 0;\n for (int k = 0; k < row; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; ++k) {\n sum += A[row * N + k] * x[k];\n }\n A[row * N + col] -= A[col * N + col] * sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[col] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n x[i] += A[i * N + j] * b[j];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < N && col < N) {\n x[row] += A[row * N + col] * b[col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the linear system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solutions from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n // Create a matrix object\n Eigen::MatrixXd A_eigen(N, N);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n A_eigen(i, j) = A[i * N + j];\n }\n }\n\n // Create a vector object for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; i++) {\n b_eigen(i) = b[i];\n }\n\n // Solve the system\n Eigen::VectorXd x_eigen = A_eigen.colPivHouseholderQr().solve(b_eigen);\n\n // Copy the solution to x\n for (size_t i = 0; i < N; i++) {\n x[i] = x_eigen(i);\n }\n }\n\n // Broadcast x to all ranks\n MPI_Bcast(x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "_t*", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector local_x(N, 0);\n std::vector local_b(N, 0);\n for (size_t i = 0; i < N; i++) {\n local_b[i] = b[i];\n }\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_x[i] += A[i * N + j] * local_b[j];\n }\n }\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct solution\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves for a portion of x\n size_t chunk_size = (N + size - 1) / size;\n size_t start_idx = rank * chunk_size;\n size_t end_idx = std::min(start_idx + chunk_size, N);\n\n std::vector local_x(chunk_size);\n for (size_t i = start_idx; i < end_idx; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n local_x[i - start_idx] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), chunk_size, MPI_DOUBLE, x.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < 
N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solutions from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Rank 0 gathers the solution\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n x.resize(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n }\n\n // Gather the solution from rank 0 to all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Solve the linear system on rank 0\n if (rank == 0) {\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row\n double factor = A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Solve the triangular system\n for (size_t i = N - 1; i > 0; --i) {\n for (size_t j = i - 1; j >= 0; --j) {\n b[j] -= A[j * N + i] * b[i];\n }\n }\n\n // Store the solution in x\n x = b;\n }\n\n // Broadcast the solution to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system locally on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the solution\n std::vector local_x(N);\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on rank 0\n if (rank == 0) {\n // Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t 
j = 0; j < N; j++) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back substitution\n for (int i = N - 1; i >= 0; i--) {\n x[i] = b[i];\n for (int j = i + 1; j < N; j++) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n }\n\n // Broadcast the solution to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Each rank solves a part of the system\n for (size_t i = rank; i < N; i += size) {\n double factor = A[i * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Broadcast the updated row\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, rank, comm);\n MPI_Bcast(&b[i], 1, MPI_DOUBLE, rank, comm);\n\n // Update the remaining rows\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Gather the solution from all ranks\n MPI_Gather(&b[rank], 1, MPI_DOUBLE, &x[0], 1, MPI_DOUBLE, 0, comm);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system locally\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions to rank 0\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system on rank 0\n if (rank == 0) {\n // Create a matrix object\n Eigen::MatrixXd A_eigen(N, N);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n A_eigen(i, j) = A[i * N + j];\n }\n }\n\n // Create a vector for b\n Eigen::VectorXd b_eigen(N);\n for (size_t i = 0; i < N; i++) {\n b_eigen(i) = b[i];\n }\n\n // Solve the linear system\n Eigen::VectorXd x_eigen = A_eigen.colPivHouseholderQr().solve(b_eigen);\n\n // Copy the result to x\n for (size_t i = 0; i < N; i++) {\n x[i] = x_eigen(i);\n }\n }\n\n // Broadcast the result to all other ranks\n MPI_Bcast(&x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n 
local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Check if the input is valid\n if (A.size() != N * N || b.size() != N || x.size() != N) {\n throw std::invalid_argument(\"Invalid input\");\n }\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (int j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(&b[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(&local_x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "cuda", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\n__global__ void solveLinearSystem(const double *A, const double *b, double *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + ":", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) {\n return;\n }\n double sum = 0;\n for (int i = 0; i < N; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * 
N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n for (int k = 0; k < N; k++) {\n x[row] -= A[row * N + k] * b[k];\n }\n x[row] /= A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int i = 0; i < col; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n } else if (row > col) {\n double sum = 0;\n for (int i = 0; i < col; i++) {\n sum += A[row * N + i] * x[i];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n for (int i = col + 1; i < row; i++) {\n A[row * N + i] -= A[row * N + col] * A[i * N + col] / A[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; ++k) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + row];\n } else {\n x[row] -= A[row * N + col] * x[col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n x[row] = b[row] / A[row * N + col];\n }\n for (int i = col + 1; i < N; i++) {\n x[row] -= A[row * N + i] * x[i] / A[row * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n double sum = 0;\n for (int k = 0; k < N; k++) {\n sum += A[row * N + k] * x[k];\n }\n x[row] = (b[row] - sum) / A[row * N + col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. x and b have N elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and b. 
Store the result in x on rank 0.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Initialize x to 0 on all ranks\n x.assign(N, 0.0);\n\n // Solve the system using Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row\n if (pivot_row != i) {\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n }\n\n // Eliminate the current column\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(&x[0], N, MPI_DOUBLE, &x[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Each rank solves for a portion of x\n #pragma omp parallel for\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data() + rank * N / size, N / size, MPI_DOUBLE, x.data(), N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Compute the inverse of A on rank 0.\n std::vector invA;\n if (my_rank == 0) {\n invA.resize(N * N);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n invA[i * N + j] = (i == j) ? 
1.0 : 0.0;\n }\n }\n for (size_t i = 0; i < N; ++i) {\n double inv = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n invA[i * N + j] *= inv;\n A[i * N + j] *= inv;\n }\n b[i] *= inv;\n }\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n invA[j * N + k] -= factor * invA[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n for (size_t i = N - 1; i > 0; --i) {\n for (size_t j = i - 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n invA[j * N + k] -= factor * invA[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n }\n\n // Broadcast the inverse of A to all ranks.\n MPI_Bcast(invA.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute x on all ranks.\n x.resize(N);\n for (size_t i = 0; i < N; ++i) {\n x[i] = 0.0;\n for (size_t j = 0; j < N; ++j) {\n x[i] += invA[i * N + j] * b[j];\n }\n }\n\n // Gather the results to rank 0.\n if (my_rank == 0) {\n for (size_t i = 1; i < num_ranks; ++i) {\n std::vector recv_x(N);\n MPI_Recv(recv_x.data(), N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N; ++j) {\n x[j] += recv_x[j];\n }\n }\n } else {\n MPI_Send(x.data(), N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Rank 0 gathers the solution from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&x[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&x[rank * N], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Normalize the ith row.\n double factor = A[i * N + i];\n for (size_t j = 0; j < N; ++j) {\n A[i * N + j] /= factor;\n }\n b[i] /= factor;\n\n // Subtract the ith row from all other rows.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n x[i] = b[i];\n for (size_t j = i + 1; j < N; ++j) {\n x[i] -= A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the solution\n std::vector local_x(N);\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_x[i] += A[i * N + j] * b[j];\n }\n }\n\n // Gather the local solutions into the global solution\n MPI_Allgather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, MPI_COMM_WORLD);\n\n // Only rank 0 has the complete solution\n if (rank == 0) {\n for (size_t i = 1; i < size; i++) {\n for (size_t j = 0; j < N; j++) {\n x[j] += local_x[i * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves the system for a subset of the rows\n size_t rowsPerRank = (N + size - 1) / size;\n size_t startRow = rank * 
rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n // Solve the system for the subset of rows\n std::vector localX(endRow - startRow);\n for (size_t i = startRow; i < endRow; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n localX[i - startRow] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(localX.data(), endRow - startRow, MPI_DOUBLE, x.data(), endRow - startRow, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n \n // Each rank computes its part of the solution\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n \n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system on each rank\n std::vector local_x(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row.\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row.\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n\n // Eliminate the current column.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the solution\n std::vector local_x(N);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the local solutions into x on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Check if the input is valid\n if (A.size() != N * N || b.size() != N || x.size() != N) {\n throw std::invalid_argument(\"Invalid input\");\n }\n\n // Initialize x to 0\n std::fill(x.begin(), x.end(), 0.0);\n\n // Solve the linear system using Gaussian elimination\n for (size_t i = 0; i < N; i++) {\n // Normalize the ith row\n double factor = 1.0 / A[i * N + i];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] *= factor;\n }\n b[i] *= factor;\n\n // Subtract the ith row from all other rows\n for 
(size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = 0; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the linear system in parallel\n #pragma omp parallel for\n for (int i = 0; i < N; i++) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(x.data(), N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * local_x[j];\n }\n local_x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the correct result\n if (rank == 0) {\n x = local_x;\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the local portion of the solution\n std::vector local_x(N, 0.0);\n for (size_t i = my_rank; i < N; i += num_procs) {\n for (size_t j = 0; j < N; ++j) {\n local_x[i] += A[i * N + j] * b[j];\n }\n }\n\n // Reduce the local solutions to get the global solution\n MPI_Reduce(local_x.data(), x.data(), N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank solves for a subset of the rows of x.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Solve for the subset of rows of x on this rank.\n for (size_t i = start_row; i < end_row; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n\n // Gather the results from all ranks.\n MPI_Gather(x.data() + start_row, end_row - start_row, MPI_DOUBLE,\n x.data(), end_row - start_row, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the matrix\n size_t local_N = (N + size - 1) / size;\n\n // Allocate memory for the local matrix and vector\n std::vector local_A(local_N * N);\n std::vector local_b(local_N);\n\n // Scatter the matrix and vector to the local arrays\n MPI_Scatter(A.data(), N * N, MPI_DOUBLE, local_A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Scatter(b.data(), N, MPI_DOUBLE, local_b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the local linear system\n std::vector 
local_x(local_N);\n for (size_t i = 0; i < local_N; i++) {\n double sum = 0.0;\n for (size_t j = 0; j < N; j++) {\n sum += local_A[i * N + j] * local_x[j];\n }\n local_x[i] = (local_b[i] - sum) / local_A[i * N + i];\n }\n\n // Gather the local solutions to the global vector\n MPI_Gather(local_x.data(), local_N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Normalize the global solution\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n x[i] /= size;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Solve the system on each rank\n std::vector local_x(N);\n for (size_t i = 0; i < N; ++i) {\n local_x[i] = b[i];\n for (size_t j = 0; j < i; ++j) {\n local_x[i] -= A[i * N + j] * x[j];\n }\n }\n for (size_t i = N - 1; i >= 0; --i) {\n for (size_t j = i + 1; j < N; ++j) {\n local_x[i] -= A[i * N + j] * local_x[j];\n }\n local_x[i] /= A[i * N + i];\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast b to all ranks\n MPI_Bcast(b.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Solve the linear system in parallel\n #pragma omp parallel\n {\n // Each thread solves a part of the system\n #pragma omp for\n for (size_t i = 0; i < N; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n }\n\n // Gather the solution from all ranks to rank 0\n MPI_Gather(x.data(), N, MPI_DOUBLE, x.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Solve the linear system using Gaussian elimination.\n for (size_t i = 0; i < N; ++i) {\n // Find the pivot row.\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; ++j) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap the pivot row with the current row.\n for (size_t j = 0; j < N; ++j) {\n std::swap(A[i * N + j], A[pivot_row * N + j]);\n }\n std::swap(b[i], b[pivot_row]);\n\n // Eliminate the current column.\n for (size_t j = i + 1; j < N; ++j) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = 0; k < N; ++k) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n b[j] -= factor * b[i];\n }\n }\n\n // Back-substitute to solve for x.\n for (size_t i = N - 1; i >= 0; --i) {\n double sum = 0.0;\n for (size_t j = i + 1; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n x[i] = (b[i] - sum) / A[i * N + i];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(Kokkos::View &A, Kokkos::View &B, Kokkos::View &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (int j = 0; j < N; ++j) {\n double sum = 0.0;\n for (int k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0.0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A(i, k) * B(k, j);\n }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n 
});\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemm\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A(i, k) * B(k, j); }\n C(i, j) = sum;\n }\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. 
The matrices are stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma opm parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (int i = 0; i < M; ++i) {\n for (int j = 0; j < N; ++j) {\n C[i * N + j] = 0;\n for (int k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n 
}\n}\n", + "\n\n C.resize(M * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n C.resize(M * N);\n omp_set_num_threads(omp_get_max_threads());\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "serial", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. 
The matrices are stored in row-major.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n C[i * N + j] = 0;\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0043886972, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0424053076 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045073926, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.03429623 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0044403548, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0344855604 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045316784, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0372008322 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0044862188, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0344702896 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004495148, + "runs": [ + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.0343761584 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0044833854, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0357374986 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004433664, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0381165378 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0.0;\n for (size_t k = 0; k < K; ++k) { sum += A[i * K + k] * B[k * N + j]; }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004504433, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0347508716 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045082884, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0358458182 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0044793378, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0343091406 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0043701342, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.034102878 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", 
+ "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0044539078, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0341703034 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004505467, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0364090086 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0044995922, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0342932324 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0044407814, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0364760526 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0044240596, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0342097114 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00445924, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0343940502 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0044744026, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 
0.0344459478 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0044572144, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0340540484 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "hip", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an MxN grid of threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\n__global__ void gemm(const double *A, const double *B, double *C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) { sum += A[row * K + k] * B[k * N + col]; }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n size_t row = blockIdx.x;\n size_t col = blockIdx.y;\n C[row * N + col] = 0;\n for (size_t k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n C[row * N + col] = 0;\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = 
blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0.0;\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n\n size_t row = blockIdx.x * blockDim.x + threadIdx.x;\n size_t col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n C[row * N + col] = 0;\n for (int k = 0; k < K; k++) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and B. 
Store the result in C on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Compute the local portion of C.\n std::vector local_C(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to the global C on rank 0.\n MPI_Reduce(local_C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of processes and the rank of the current process\n int num_procs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the number of rows each process will compute\n int rows_per_proc = M / num_procs;\n\n // Compute the starting row for each process\n int start_row = rank * rows_per_proc;\n\n // Compute the ending row for each process\n int end_row = start_row + rows_per_proc;\n\n // Initialize the result matrix on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n\n // Compute the partial result on each process\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n double dot_product = 0;\n for (int k = 0; k < K; ++k) {\n dot_product += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = dot_product;\n }\n }\n\n // Gather the partial results on rank 0\n if (rank != 0) {\n MPI_Gather(&C[start_row * N], rows_per_proc * N, MPI_DOUBLE, &C[0], rows_per_proc * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast M, K, and N to all ranks.\n MPI_Bcast(&M, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n MPI_Bcast(&K, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n MPI_Bcast(&N, 1, MPI_UNSIGNED_LONG, 0, 
MPI_COMM_WORLD);\n\n // Compute the local portion of C.\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Reduce the local portions of C to rank 0.\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast A and B to all ranks\n MPI_Bcast(&A[0], M * K, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n MPI_Bcast(&B[0], K * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute C on each rank\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Gather C from all ranks to rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local part of C\n std::vector local_C(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n MPI_Gather(local_C.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_C.data(), M * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and the number of processes\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the local size of the matrix C\n size_t local_M = M / num_procs;\n if (rank == num_procs - 1) {\n local_M += M % num_procs;\n }\n\n // Compute the local part of the matrix C\n std::vector local_C(local_M * N);\n for (size_t i = 0; i < local_M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N + j] = sum;\n }\n }\n\n // Gather the local parts of the matrix C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 0; i < num_procs; ++i) {\n size_t local_M = M / num_procs;\n if (i == num_procs - 1) {\n local_M += M % num_procs;\n }\n MPI_Recv(&C[i * local_M * N], local_M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_C[0], local_M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the local portion of C.\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to the root rank.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N, 0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * 
B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a part of the result\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(M * N);\n MPI_Recv(recv_buf.data(), M * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N; j++) {\n C[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(C.data(), M * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C\n std::vector localC(M * N, 0);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Each rank has a complete copy of A and B.\n // Store the result in C on rank 0.\n if (rank == 0) {\n C.clear();\n C.reserve(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C.push_back(sum);\n }\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result matrix\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local results into the global result matrix\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local part of C.\n std::vector localC(M * N, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0.\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the matrix C\n size_t local_M = M / size;\n size_t local_N = N;\n\n // Compute the local part of the matrix C\n std::vector local_C(local_M * local_N);\n for (size_t i = 0; i < local_M; ++i) {\n for (size_t j = 0; j < local_N; ++j) 
{\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * local_N + j] = sum;\n }\n }\n\n // Gather the local parts of the matrix C into the global matrix C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(local_M * local_N);\n MPI_Recv(recv_buf.data(), local_M * local_N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < local_M * local_N; ++j) {\n C[j] += recv_buf[j];\n }\n }\n } else {\n MPI_Send(local_C.data(), local_M * local_N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and the number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N / size);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N / size + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C into the global C on rank 0\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "cuda", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an MxN grid of threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\n__global__ void gemm(const double *A, const double *B, double *C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t row = blockIdx.x;\n size_t col = blockIdx.y;\n size_t i = threadIdx.x;\n size_t j = threadIdx.y;\n C[row * N + col] = 0.0;\n for (size_t k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * 
N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < M && col < N) {\n for (int k = 0; k < K; ++k) {\n C[row * N + col] += A[row * K + k] * B[k * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n\n C[row * N + col] = sum;\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n double sum = 0;\n for (k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n}\n", + "\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "(", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; k++) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.y * blockDim.y + threadIdx.y;\n int j = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M && j < N) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n C[i * N + j] = sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M && col < N) {\n double sum = 0;\n for (int k = 0; k < K; ++k) {\n sum += A[row * K + k] * B[k * N + col];\n }\n C[row * N + col] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x;\n int j = blockIdx.y;\n int k;\n\n C[i * N + j] = 0;\n for (k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and B. 
Store the result in C on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector<double> const& A, std::vector<double> const& B, std::vector<double> &C, size_t M, size_t K, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector<double> localC(M * N);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local parts of C to rank 0\n MPI_Gather(localC.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to zero\n C.assign(M * N, 0.0);\n\n // Compute the product of A and B in parallel\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(C.data(), M * N, MPI_DOUBLE, C.data(), M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the product of A and B\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector<double> localC(M * N);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n // Initialize C to zero\n C.assign(M * N, 0.0);\n\n // Compute C using MPI and OpenMP\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce C to rank 0\n MPI_Reduce(C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the local part of C\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector<double> localC(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.assign(M * N, 0);\n\n // Compute the partial sum for each element of C\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the partial sums from all ranks to rank 0\n MPI_Gather(&C[0], M * N, MPI_DOUBLE, &C[0], M * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize C to 0\n C.resize(M * N);\n std::fill(C.begin(), C.end(), 0);\n\n // Compute the local part of C\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(MPI_IN_PLACE, C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local part of C\n std::vector localC(M * N);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local parts of C to rank 0\n MPI_Reduce(localC.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n localC[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(localC.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the 
local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; i++) {\n std::vector recvC(M * N / size);\n MPI_Recv(recvC.data(), M * N / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M * N / size; j++) {\n C[j] += recvC[j];\n }\n }\n } else {\n MPI_Send(localC.data(), M * N / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector localC(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n for (size_t k = 0; k < K; ++k) {\n sum += A[i * K + k] * B[k * N + j];\n }\n localC[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n for (int i = 1; i < size; ++i) {\n std::vector recvC(M * N / size);\n MPI_Recv(recvC.data(), M * N / size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n std::copy(recvC.begin(), recvC.end(), C.begin() + i * M * N / size);\n }\n } else {\n MPI_Send(localC.data(), M * N / size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n // Initialize C to 0\n C.assign(M * N, 0.0);\n\n // Perform the matrix multiplication in parallel\n #pragma opm parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result matrix\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t k = 0; k < K; k++) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Gather the local results into the global result matrix\n MPI_Gather(&local_C[0], M * N / size, MPI_DOUBLE, &C[0], M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double dot_product = 0.0;\n for (size_t k = 0; k < K; ++k) {\n dot_product += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = dot_product;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n if (rank == 0) {\n C.resize(M * N);\n }\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the local portion of C\n std::vector local_C(M * N, 0.0);\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t k = 0; k < K; ++k) {\n local_C[i * N + j] += A[i * K + k] * B[k * N + j];\n }\n }\n }\n\n // Reduce the local portions of C to rank 0\n MPI_Reduce(local_C.data(), C.data(), M * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of C\n std::vector local_C(M * N / size);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n 
double sum = 0;\n for (size_t k = 0; k < K; k++) {\n sum += A[i * K + k] * B[k * N + j];\n }\n local_C[i * N / size + j] = sum;\n }\n }\n\n // Gather the local portions of C into the global C on rank 0\n MPI_Gather(local_C.data(), M * N / size, MPI_DOUBLE, C.data(), M * N / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix, x has N elements, and y has M elements.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(Kokkos::View &A, Kokkos::View &x, Kokkos::View &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) { y[i] += A(i, j) * x[j]; }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M),\n KOKKOS_LAMBDA(const size_t i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "mv", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n double dot = 0.0;\n for (size_t j = 0; j < N; ++j) { dot += A(i, j) * x(j); }\n y(i) = dot;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA(const int i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", 
+ "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy<>(0, M), KOKKOS_LAMBDA (const int i) {\n y(i) = 0;\n for (int j = 0; j < N; ++j) {\n y(i) += A(i, j) * x(j);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n y(i) = 0;\n for (size_t j = 0; j < N; ++j) { y(i) += A(i, j) * x(j); }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gemv\", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const size_t i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) { sum += A(i, j) * x(j); }\n y(i) = sum;\n });\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use OpenMP to compute in parallel.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma openmp parallel for\n for (size_t i = 
0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n // Initialize y to 0\n y.assign(M, 0);\n\n // Compute the matrix-vector product in parallel\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double dot_product = 0;\n for (size_t j = 0; j < N; ++j) {\n dot_product += A[i * N + j] * x[j];\n }\n y[i] = dot_product;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0.0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "serial", + "prompt": "/* Multiply the matrix A by the vector x. 
Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0327138536, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0326629222 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0328638118, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0327700494 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) { dot += A[i * N + j] * x[j]; }\n y[i] = dot;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0328235142, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0322337782 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) {\n dot += A[i * N + j] * x[j];\n }\n y[i] = dot;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.032793678, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0321468388 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.032929105, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0328296416 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0329034886, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0329605462 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": 
true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.032905268, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0320273598 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0328178416, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0327779064 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0330067588, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0327323126 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0328715988, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0329001924 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0328108382, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0327738126 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0329496858, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.032739973 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0330101676, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0322008402 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.032774574, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0329121608 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 
0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0327475594, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0321491232 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0329515048, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.032901663 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0329016732, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0327594938 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n double dot = 0;\n for (size_t j = 0; j < N; ++j) {\n dot += A[i * N + j] * x[j];\n }\n y[i] = dot;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0327543042, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0321453218 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0326714302, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0328819362 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < M; ++i) {\n y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0328199112, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0326586874 + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "hip", + "prompt": "/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least M threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\n__global__ void gemv(const double *A, const double *x, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int block_size = blockDim.x;\n int grid_size = gridDim.x;\n int i = bid * block_size + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n int i = tid / N;\n int j = tid % N;\n if (i < M && j < N) {\n y[i] += A[i * N + j] * x[j];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = bid * blockDim.x + tid;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; ++col) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) {\n y[tid] += A[tid * N + j] * x[j];\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; ++col) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0.0;\n for (int j = 0; j < N; j++) { y[tid] += A[tid * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n int i = tid + bid * blockSize;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n size_t i = blockIdx.x * 
blockDim.x + threadIdx.x;\n if (i < M) {\n double sum = 0;\n for (size_t j = 0; j < N; j++) { sum += A[i * N + j] * x[j]; }\n y[i] = sum;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n\n double sum = 0;\n for (int i = tid; i < M; i += blockSize) {\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n }\n y[tid] = sum;\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gridSize = hipGridDim_x;\n int blockSize = hipBlockDim_x;\n\n y[tid] = 0;\n for (int j = 0; j < N; j++) {\n y[tid] += A[tid + bid * blockSize + j * M] * x[j];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "mpi", + "prompt": "#include <mpi.h>\n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and x. Store the result in y on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector<double> const& A, std::vector<double> const& x, std::vector<double> &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector multiplication\n std::vector<double> local_y(M / size);\n for (size_t i = rank * (M / size); i < (rank + 1) * (M / size); ++i) {\n local_y[i - rank * (M / size)] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i - rank * (M / size)] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the local results into the global vector y on rank 0\n MPI_Gather(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the partial dot product on each rank\n std::vector<double> partial_y(M, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n partial_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the partial dot products to get the final result\n MPI_Reduce(partial_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Only rank 0 has the final result\n if (rank != 0) {\n y.clear();\n }\n}\n", + "(", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector<double> local_y(M);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product.\n std::vector<double> local_y(M);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the local results into the global vector y on rank 0.\n MPI_Gather(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, 
size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M / size);\n for (size_t i = 0; i < M / size; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), M / size, MPI_DOUBLE, y.data(), M / size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Each rank computes a partial sum of the dot product\n std::vector partial_sums(M, 0.0);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n partial_sums[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the partial sums to rank 0\n std::vector global_sums(M, 0.0);\n MPI_Reduce(partial_sums.data(), global_sums.data(), M, MPI_DOUBLE, MPI_SUM, 0, comm);\n\n // Store the result in y on rank 0\n if (rank == 0) {\n y = global_sums;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n y = local_y;\n for 
(int i = 1; i < size; ++i) {\n std::vector recv_y(M);\n MPI_Recv(recv_y.data(), M, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < M; ++j) {\n y[j] += recv_y[j];\n }\n }\n } else {\n MPI_Send(local_y.data(), M, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = rank; i < M; i += size) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, comm);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0.0;\n for (size_t j = 0; j < N; ++j) { local_y[i] += A[i * N + j] * x[j]; }\n }\n\n MPI_Reduce(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the dot product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local dot products into the global vector y on rank 0\n MPI_Reduce(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "cuda", + "prompt": "/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use CUDA to compute in parallel. 
The kernel is launched with at least M threads.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\n__global__ void gemv(const double *A, const double *x, double *y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n double sum = 0;\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n}\n", + "\n\n int tid = block.thread.x;\n int row = block.block.x;\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) {\n y[row] += A[row * N + col] * x[col];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < M) {\n y[tid] = 0;\n for (int i = 0; i < N; i++) {\n y[tid] += A[tid * N + i] * x[i];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n for (int i = bid * bid_size + tid; i < M; i += bid_size) {\n double sum = 0;\n for (int j = 0; j < N; j++) { sum += A[i * N + j] * x[j]; }\n y[i] = sum;\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n y[row] = 0;\n for (int k = 0; k < N; k++) {\n y[row] += A[row * N + k] * x[k];\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = threadIdx.x;\n y[row] = 0;\n for (int i = 0; i < N; i++) {\n y[row] += A[row * N + i] * x[i];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n\n double sum = 0;\n\n for (int i = bid * bid_size + tid; i < M; i += grid_size * bid_size) {\n for (int j = 0; j < N; j++) {\n sum += A[i * N + j] * x[j];\n }\n }\n\n y[tid] = sum;\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < M) {\n double sum = 0;\n for (int col = 0; col < N; col++) {\n sum += A[row * N + col] * 
x[col];\n }\n y[row] = sum;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < M) {\n y[i] = 0;\n for (int j = 0; j < N; j++) { y[i] += A[i * N + j] * x[j]; }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int row = bid * blockDim.x + tid;\n if (row < M) {\n y[row] = 0;\n for (int col = 0; col < N; col++) { y[row] += A[row * N + col] * x[col]; }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Multiply the matrix A by the vector x. Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A and x. Store the result in y on rank 0.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of y\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0.0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n\n // Gather the results on rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; i++) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M, 0.0);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to the global y on rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion 
of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root process\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to the root rank\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute local dot products\n std::vector local_y(M, 0.0);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce local dot products to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the matrix-vector product\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; i++) {\n for (size_t j = 0; j < N; j++) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n 
#pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(local_y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Each rank computes its part of the result\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n double sum = 0;\n for (size_t j = 0; j < N; ++j) {\n sum += A[i * N + j] * x[j];\n }\n y[i] = sum;\n }\n\n // Gather the results on rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local portion of the result\n std::vector local_y(M);\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of y\n std::vector local_y(M);\n for (size_t i = rank * M / size; i < (rank + 1) * M / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the local portions of y to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize y to 0 on all ranks\n y.assign(M, 0.0);\n\n // Compute the local portion of y on each rank\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n for (size_t j = 0; j < N; ++j) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Gather the results from all ranks to rank 0\n MPI_Gather(y.data(), M, MPI_DOUBLE, y.data(), M, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its part of the result\n std::vector local_y(M);\n #pragma omp parallel for\n for (size_t i = 0; i < M; ++i) {\n local_y[i] = 0;\n for (size_t j = 0; j < N; ++j) {\n local_y[i] += A[i * N + j] * x[j];\n }\n }\n\n // Reduce the results to rank 0\n MPI_Reduce(local_y.data(), y.data(), M, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(Kokkos::View &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A(k, j) -= A(k, i) * A(i, j);\n }\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) -= A(j, i) * A(i, i);\n }\n });\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n // Iterate over the columns of the matrix\n for (size_t j = 0; j < N; ++j) {\n // Divide the elements of the current column by the diagonal element\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n A(i, j) /= A(j, j);\n });\n\n // Subtract the current column from the remaining columns\n for (size_t k = j + 1; k < N; ++k) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n A(i, k) -= A(i, j) * A(j, k);\n });\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A(i, j) /= A(i, i);\n } else if (i > j) {\n A(i, j) -= A(i, j) * A(j, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = 0; k < i; k++) {\n A(j, k) -= multiplier * A(i, k);\n }\n A(j, i) = multiplier;\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = 
i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; ++k) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j][i] /= A[i][i];\n }\n Kok\u30b3\u30af::parallel_for(Kok\u30b3\u30af::RangePolicy(i + 1, N),\n KOK_LAMBDA(const Kok\u30b3\u30af::IndexType &j) {\n for (size_t k = 0; k < i; k++) {\n A[j][i] -= A[j][k] * A[i][k];\n }\n });\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; ++j) {\n double multiplier = A(j, i) / A(i, i);\n for (size_t k = i; k < N; ++k) {\n A(j, k) -= multiplier * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) = A(j, i) / A(i, i);\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A(k, j) = A(k, j) - A(k, i) * A(i, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n }\n for (size_t j = i + 1; j < N; j++) {\n double m = A(j, i) / A(i, i);\n for (size_t k = i; k < N; k++) {\n A(j, k) -= m * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < i; ++j) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; ++k) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; ++j) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; ++k) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += A(i, k) * A(k, j);\n }\n A(i, j) -= sum;\n } else if (i > j) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A(i, k) * A(k, j);\n }\n A(i, j) = (A(i, j) - sum) / A(j, j);\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A(i, j) /= A(j, j);\n for (size_t k = j + 1; k < N; k++) {\n A(i, k) -= A(i, j) * A(j, k);\n }\n }\n for (size_t j = i + 1; j < N; j++) {\n A(j, i) /= A(i, i);\n for (size_t k = i + 1; k < N; k++) {\n A(j, k) -= A(j, i) * A(i, k);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. 
\n A is an NxN matrix stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; k++) {\n // Normalize the kth row\n double norm = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k; j < N; j++) {\n A[k * N + j] /= norm;\n }\n\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k];\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; ++k) {\n // Normalize the k-th row\n double norm = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k; j < N; ++j) {\n A[k * N + j] /= norm;\n }\n\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n double factor = A[i * N + k];\n for (size_t j = k; j < N; ++j) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n double factor = A[i * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= A[i * N + k] * factor;\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n // Divide the current row by the diagonal element\n double factor = A[i * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= factor;\n }\n\n // Subtract multiples of the current row from all the rows below it\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n#pragma omp parallel for\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + ".", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < 
N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n#pragma omp parallel for\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n for (size_t j = i; j < N; j++) {\n#pragma omp parallel for\n for (size_t k = 0; k < i; k++) {\n A[j * N + i] -= A[j * N + k] * A[k * N + i];\n }\n A[j * N + i] /= A[i * N + i];\n }\n }\n}\n", + "\n\n for (size_t k = 0; k < N; ++k) {\n // Compute the multiplier for row k\n double multiplier = A[k * N + k];\n#pragma omp parallel for\n for (size_t j = k + 1; j < N; ++j) {\n A[j * N + k] /= multiplier;\n }\n // Update the remaining rows\n#pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n #pragma omp parallel for\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "serial", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. 
\n A is an NxN matrix stored in row-major.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1075916676, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1274379444 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1061161848, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1263754024 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1057003722, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1265544752 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + i] * A[j * N + j];\n } else {\n A[i * N + j] /= A[j * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; ++k) {\n for (size_t j = i + 1; j < N; ++j) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = i + 1; j < N; ++j) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; ++k) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.107588167, + 
"runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.127289751 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.10669599, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.127545849 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1065689932, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1270394218 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] = A[j * N + i] / A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] = A[j * N + k] - A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1066094752, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1273950572 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.106995225, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1275457688 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + j];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + j] * A[j * N + j];\n } else {\n A[i * N + j] /= A[j * N + j];\n A[i * N + j] *= -1;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.106741235, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.127421363 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for 
(size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1072817784, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1282346132 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1073099974, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1279226962 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.107300201, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.127243534 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i * N + k] * A[k * N + j];\n }\n A[i * N + j] = (A[i * N + j] - sum) / A[j * N + j];\n }\n for (size_t j = i; j < N; j++) {\n double sum = 0;\n for (size_t k = 0; k < i; k++) {\n sum += A[i * N + k] * A[k * N + j];\n }\n A[i * N + j] = A[i * N + j] - sum;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1074058278, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1274312534 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.107215107, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.1274179222 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double multiplier 
= A[j][i] / A[i][i];\n for (size_t k = i; k < N; k++) {\n A[j][k] -= multiplier * A[i][k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1;\n } else if (i > j) {\n A[i * N + j] = 0;\n }\n }\n }\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + } + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "hip", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\n__global__ void luFactorize(double *A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x + blockIdx.y * gridDim.x;\n int col = threadIdx.x + threadIdx.y * blockDim.x;\n if (row < N && col < N) {\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double temp = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= temp * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = A[row * N + col] - sum;\n 
}\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x + blockIdx.y * gridDim.x;\n int col = threadIdx.x + threadIdx.y * blockDim.x;\n if (row < col) {\n A[row * N + col] = 0.0;\n } else if (row == col) {\n A[row * N + col] = 1.0;\n } else {\n A[row * N + col] = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= A[row * N + col] * A[col * N + k];\n }\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * gridDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.x;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double temp = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= temp * A[j * N + k];\n }\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double temp = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= temp * A[j * N + k];\n }\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[col * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[col * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int tid = blockIdx.x + blockIdx.y * gridDim.x;\n int i = tid / N;\n int j = tid % N;\n if (i < j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; ++k) { A[k * N + j] -= A[k * N + i] * A[i * N + j]; }\n } else if (i == j) {\n A[i * N + j] = 1;\n 
for (int k = i + 1; k < N; ++k) { A[k * N + j] /= A[i * N + i]; }\n }\n}\n", + "\n\n int row = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int col = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n\n if (row == col) {\n double pivot = A[row * N + col];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] /= pivot;\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col + 1; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int k = row + 1; k < N; k++) {\n A[k * N + col] -= A[k * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + row] * A[col * N + col];\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n if (i == j) {\n A[i * N + j] /= A[i * N + i];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n double factor = A[i * N + j];\n for (int k = j; k < N; k++) {\n A[i * N + k] -= factor * A[j * N + k];\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
Store the result in A on rank 0.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Forward elimination\n for (size_t k = 0; k < N; k++) {\n // Broadcast the k-th row to all other ranks\n MPI_Bcast(&A[k * N], N, MPI_DOUBLE, k, MPI_COMM_WORLD);\n\n // For each row below the k-th row\n for (size_t i = k + 1; i < N; i++) {\n // Compute the multiplier\n double multiplier = A[i * N + k] / A[k * N + k];\n\n // Subtract the k-th row from the i-th row\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= multiplier * A[k * N + j];\n }\n }\n }\n\n // Gather the result to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // For each column j in row i\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in the lower triangular matrix L\n if (i > j) {\n // Subtract the product of the ith row of L and the jth column of U from the ith row of A\n for (size_t k = 0; k < j; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n // If i <= j, then A[i][j] is in the upper triangular matrix U\n else {\n // Divide the ith row of A by the jth column of L\n for (size_t k = 0; k < j; k++) {\n A[i * N + j] /= A[j * N + k];\n }\n\n // Subtract the product of the ith row of L and the jth column of U from the ith row of A\n for (size_t k = j + 1; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations to eliminate elements below the diagonal\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // For each column j\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in the lower triangular matrix L\n if (i > j) {\n // Compute A[i][j] as A[i][j] - A[i][0] * A[0][j]\n A[i * N + j] -= A[i * 
N] * A[j];\n }\n // Otherwise, A[i][j] is in the upper triangular matrix U\n else {\n // Compute A[i][j] as A[i][j] - A[i][0] * A[0][j] / A[0][0]\n A[i * N + j] -= A[i * N] * A[j] / A[0];\n }\n }\n\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N + i], 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // For each column j\n for (size_t j = 0; j < N; j++) {\n // If i > j, then A[i][j] is in L\n if (i > j) {\n // Compute A[i][j]\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i][k] * A[k][j];\n }\n A[i][j] = (A[i][j] - sum) / A[j][j];\n }\n // If i <= j, then A[i][j] is in U\n else {\n // Compute A[i][j]\n double sum = 0;\n for (size_t k = 0; k < j; k++) {\n sum += A[i][k] * A[k][j];\n }\n A[i][j] = A[i][j] - sum;\n }\n }\n\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization on rank 0\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n\n // Gather the result from rank 0 to all other ranks\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n 
}\n }\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Normalize the k-th row of A\n double pivot = A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[k * N + j] /= pivot;\n }\n\n // Update the remaining rows of A\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n // Broadcast the ith row of A to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all ranks\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix A from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Forward elimination\n for (size_t i = 0; i < N; i++) {\n // Broadcast the pivot row to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all other ranks\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i\n for (size_t i = 1; i < N; i++) {\n // Rank 0 broadcasts the ith row of L to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each column j\n for (size_t j = i; j < N; j++) {\n // Compute A[i][j] -= A[i][k] * A[k][j] for k < i\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n\n // Divide A[i][j] by A[i][i]\n A[i * N + j] /= A[i * N + i];\n }\n }\n\n // Gather the result from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] /= A[i * N + j];\n } else if (i > j) {\n A[i * N + j] -= A[i * N + j] * A[j * N + j];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(&A[0], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each row i in A\n for (size_t i = 1; i < N; i++) {\n // If rank 0, compute the multiplier for row i\n double multiplier = 0;\n if (rank == 0) {\n multiplier = A[i * N] / A[0];\n }\n\n // Broadcast the multiplier to all ranks\n MPI_Bcast(&multiplier, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // For each element in row i, subtract the corresponding element in row 0 multiplied by the multiplier\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] -= multiplier * A[j];\n }\n }\n\n // If rank 0, divide each element in row 0 by A[0][0]\n if (rank == 0) {\n for (size_t j = 0; j < N; j++) {\n A[j] /= A[0];\n }\n }\n\n // For each row i in A\n for (size_t i = 1; i < N; i++) {\n // For each element in row i, divide by A[i][i]\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n\n // For each row k after row i, subtract the corresponding element in row i multiplied by A[k][i]\n for (size_t k = i + 1; k < N; k++) {\n double multiplier = A[k * N + i];\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= multiplier * A[i * N + j];\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n for (size_t i = 0; i < N; i++) {\n if (rank == 0) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n }\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (size_t j = i + 1; j < N; j++) {\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n}\n", + "\n\n // Broadcast the first row of A to all ranks\n MPI_Bcast(A.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Each rank computes its own row of L and U\n for (size_t i = 1; i < N; i++) {\n for (size_t j = 0; j < i; j++) {\n A[i * N + j] /= A[j * N + j];\n }\n for (size_t j = i; j < N; j++) {\n for (size_t k = 0; k < i; k++) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks into rank 0\n MPI_Gather(A.data(), N * N, MPI_DOUBLE, A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "cuda", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\n__global__ void luFactorize(double *A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double temp = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= temp * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row == col) {\n A[row * N + col] /= A[row * N + col];\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row < col) {\n A[row * N + col] = A[row * N + col] / A[row * N + row];\n } else if (row == col) {\n for (int k = 0; k < row; k++) {\n A[row * N + col] -= A[row * N + k] * A[k * N + col];\n }\n } else {\n for (int k = 0; k < col; k++) {\n A[row * N + col] -= A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] /= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n int tid = threadIdx.x;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i == j) {\n A[i * N + i] /= A[i * N + j];\n for (int k = i + 1; k < N; k++) {\n A[k * N + i] /= A[i * N + i];\n }\n } else if (i > j) {\n 
for (int k = j; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "(", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row >= N || col >= N) return;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else if (row > col) {\n double factor = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= factor * A[col * N + k];\n }\n A[row * N + col] = factor;\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n } else {\n double factor = A[row * N + col] / A[col * N + col];\n for (int k = col + 1; k < N; k++) {\n A[row * N + k] -= factor * A[col * N + k];\n }\n }\n}\n", + "\n\n int row = blockIdx.x;\n int col = blockIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; i++) {\n A[i * N + col] -= A[i * N + row] * A[row * N + col];\n }\n } else if (row > col) {\n A[row * N + col] -= A[row * N + col] * A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] = (A[row * N + col] - sum) / A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.x * blockDim.x + threadIdx.x;\n int col = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (row == col) {\n A[row * N + col] /= A[row * N + row];\n for (int i = row + 1; i < N; 
i++) {\n A[i * N + row] /= A[row * N + row];\n }\n } else if (row > col) {\n double factor = A[row * N + col];\n for (int i = col + 1; i < N; i++) {\n A[row * N + i] -= factor * A[col * N + i];\n }\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n\n if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] /= A[row * N + row];\n } else if (row > col) {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < col) {\n A[row * N + col] = 0;\n } else if (row == col) {\n double sum = 0;\n for (int k = 0; k < row; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] = 1 / A[row * N + col];\n } else {\n double sum = 0;\n for (int k = 0; k < col; k++) {\n sum += A[row * N + k] * A[k * N + col];\n }\n A[row * N + col] -= sum;\n A[row * N + col] *= A[col * N + col];\n }\n}\n" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of A. 
Store the result in A on rank 0.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "(5.", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on rank 0\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n // Divide the ith row by A[i][i]\n double div = A[i * N + i];\n for (size_t j = i; j < N; j++) {\n A[i * N + j] /= div;\n }\n\n // Subtract the ith row from all rows below it\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n for (size_t j = i; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n }\n\n // Broadcast the result to all ranks\n MPI_Bcast(A.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization using Gaussian elimination\n for (size_t k = 0; k < N; k++) {\n // Pivot if necessary\n size_t max_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > std::abs(A[max_row * N + k])) {\n max_row = i;\n }\n }\n if (max_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[max_row * N + j]);\n }\n }\n\n // Eliminate the k-th column below the k-th row\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n for (size_t j = k; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n MPI::COMM_WORLD.Recv(&A[i * N * N], N * N, MPI::DOUBLE, i, 0);\n }\n } else {\n MPI::COMM_WORLD.Send(&A[0], N * N, MPI::DOUBLE, 0, 0);\n }\n}\n", + "\n\n // MPI rank\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // MPI size\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Number of rows per rank\n int rows_per_rank = N / size;\n\n // Local matrix\n std::vector local_A(rows_per_rank * N);\n\n // Scatter the matrix to all ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Local lu factorization\n for (size_t i = 0; i < rows_per_rank; i++) {\n for (size_t j = i + 1; j < N; j++) {\n local_A[i * N + j] /= local_A[i * N + i];\n }\n for (size_t k = i + 1; k < rows_per_rank; k++) {\n for (size_t j = i + 1; j < N; j++) {\n local_A[k * N + j] -= local_A[k * N + i] * local_A[i * N + j];\n }\n }\n }\n\n // Gather the results back to rank 0\n MPI_Gather(&local_A[0], rows_per_rank * N, MPI_DOUBLE, &A[0], rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n 
if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row to all other ranks\n if (rank == 0) {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n\n // Each rank performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double multiplier = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= multiplier * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the rank and number of threads\n int rank = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n // Compute the block size for each thread\n size_t block_size = N / num_threads;\n\n // Each thread factorizes its own block of rows\n for (size_t i = rank * block_size; i < (rank + 1) * block_size; i++) {\n // Pivot row\n size_t pivot_row = i;\n for (size_t j = i + 1; j < N; j++) {\n if (std::abs(A[j * N + i]) > std::abs(A[pivot_row * N + i])) {\n pivot_row = j;\n }\n }\n\n // Swap pivot row with current row\n if (pivot_row != i) {\n for (size_t k = 0; k < N; k++) {\n std::swap(A[i * N + k], A[pivot_row * N + k]);\n }\n }\n\n // Eliminate lower triangular elements\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n }\n\n // Gather the factorized matrix from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 performs row operations on the ith row\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the updated matrix A from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n // Divide the current row by the pivot element\n double pivot = A[i * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= pivot;\n }\n\n // Subtract multiples of the current row from the remaining rows\n for (size_t k = i + 1; k < N; k++) {\n double factor = A[k * N + i];\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= factor * A[i * N + j];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (MPI_Comm_rank(MPI_COMM_WORLD) == 0) {\n for (int i = 1; i < 
MPI_Comm_size(MPI_COMM_WORLD); i++) {\n MPI_Recv(&A[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the rank and number of threads\n int rank = omp_get_thread_num();\n int num_threads = omp_get_num_threads();\n\n // Compute the block size for each thread\n size_t block_size = N / num_threads;\n\n // Each thread factorizes its own block of the matrix\n for (size_t i = rank * block_size; i < (rank + 1) * block_size; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n }\n\n // Gather the results from all ranks and store them in A on rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize L and U to the identity matrix\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform Gaussian elimination\n for (size_t k = 0; k < N; k++) {\n // Pivot row\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > std::abs(A[pivot_row * N + k])) {\n pivot_row = i;\n }\n }\n\n // Swap pivot row with row k\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n }\n }\n\n // Eliminate non-zero elements below pivot\n for (size_t i = k + 1; i < N; i++) {\n double factor = A[i * N + k] / A[k * N + k];\n for (size_t j = k + 1; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Get the number of processes\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each process computes a part of the matrix\n size_t chunk_size = N / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = start_row + chunk_size;\n\n // Perform LU factorization on the assigned part of the matrix\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < i; j++) {\n double multiplier = A[i][j] / A[j][j];\n for (size_t k = j; k < N; k++) {\n A[i][k] -= multiplier * A[j][k];\n }\n }\n for (size_t j = i; j < N; j++) {\n A[i][j] /= A[i][i];\n }\n }\n\n // Gather the results from all processes\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * chunk_size * N], chunk_size * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[start_row * N], chunk_size * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows per process\n int 
rowsPerProcess = N / size;\n\n // Compute the starting row index for this process\n int startRow = rank * rowsPerProcess;\n\n // Compute the ending row index for this process\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Factorize the submatrix of A owned by this process\n for (int i = startRow; i < endRow; ++i) {\n // Compute the pivot element\n double pivot = A[i][i];\n\n // Broadcast the pivot element to all other processes\n MPI_Bcast(&pivot, 1, MPI_DOUBLE, rank, MPI_COMM_WORLD);\n\n // Eliminate the elements below the pivot element\n for (int j = i + 1; j < N; ++j) {\n A[j][i] -= pivot * A[j][i] / A[i][i];\n }\n }\n\n // Reduce the matrix A to upper triangular form\n for (int k = 0; k < N - 1; ++k) {\n // Compute the pivot element\n double pivot = A[k][k];\n\n // Broadcast the pivot element to all other processes\n MPI_Bcast(&pivot, 1, MPI_DOUBLE, rank, MPI_COMM_WORLD);\n\n // Eliminate the elements above the pivot element\n for (int i = 0; i < k; ++i) {\n A[i][k] -= pivot * A[i][k] / A[k][k];\n }\n }\n\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization on each row\n for (size_t i = 0; i < N; i++) {\n // Broadcast the pivot row to all ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, i, MPI_COMM_WORLD);\n\n // Perform row operations on all ranks\n #pragma omp parallel for\n for (size_t j = i + 1; j < N; j++) {\n double factor = A[j * N + i] / A[i * N + i];\n for (size_t k = i; k < N; k++) {\n A[j * N + k] -= factor * A[i * N + k];\n }\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&A[i * N], N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&A[0], N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Perform LU factorization\n for (size_t i = 0; i < N; i++) {\n // Rank 0 broadcasts the ith row of A to all other ranks\n MPI_Bcast(&A[i * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Rank 0 performs the row reduction on the ith row\n for (size_t j = i + 1; j < N; j++) {\n A[j * N + i] /= A[i * N + i];\n for (size_t k = i + 1; k < N; k++) {\n A[j * N + k] -= A[j * N + i] * A[i * N + k];\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n if (rank != 0) {\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Broadcast the matrix A from rank 0 to all other ranks\n MPI_Bcast(&A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Perform LU factorization in parallel using OpenMP\n #pragma omp parallel\n {\n #pragma omp for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] /= A[i * N + i];\n }\n for (size_t k = i + 1; k < N; k++) {\n for (size_t j = i + 1; j < N; j++) {\n A[k * N + j] -= A[k * N + i] * A[i * N + j];\n }\n }\n }\n }\n\n // Gather the result from all ranks to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int myRank, numProcs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numProcs);\n\n // Each rank computes its own part of the matrix\n #pragma omp parallel for\n for (size_t i = myRank; i < N; i += numProcs) {\n for (size_t j = 0; j < i; j++) {\n A[i * N + j] /= A[j * N + j];\n for (size_t k = j + 1; k < N; k++) {\n A[i * N + k] -= A[i * N + j] * A[j * N + k];\n }\n }\n }\n\n // Gather the 
results to rank 0\n MPI_Gather(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Initialize L and U\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (i == j) {\n A[i * N + j] = 1.0;\n } else {\n A[i * N + j] = 0.0;\n }\n }\n }\n\n // Perform LU factorization\n for (size_t k = 0; k < N; k++) {\n // Find the pivot element\n double max = std::abs(A[k * N + k]);\n size_t pivot_row = k;\n for (size_t i = k + 1; i < N; i++) {\n if (std::abs(A[i * N + k]) > max) {\n max = std::abs(A[i * N + k]);\n pivot_row = i;\n }\n }\n\n // Swap the pivot row with the k-th row\n if (pivot_row != k) {\n for (size_t j = 0; j < N; j++) {\n std::swap(A[k * N + j], A[pivot_row * N + j]);\n }\n }\n\n // Divide the pivot row by the pivot element\n double pivot = A[k * N + k];\n for (size_t j = 0; j < N; j++) {\n A[k * N + j] /= pivot;\n }\n\n // Subtract multiples of the pivot row from the other rows\n for (size_t i = 0; i < N; i++) {\n if (i != k) {\n double factor = A[i * N + k];\n for (size_t j = 0; j < N; j++) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n }\n\n // Gather the result to rank 0\n if (rank != 0) {\n MPI_Send(&A[0], N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n std::vector recv_buf(N * N);\n MPI_Recv(&recv_buf[0], N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n A[j] += recv_buf[j];\n }\n }\n }\n}\n", + "\n\n // Copy the matrix A to rank 0\n MPI_Allgather(MPI_IN_PLACE, 0, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_WORLD);\n\n // Perform LU factorization on rank 0\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n A[i * N + j] -= A[i * N + i] * A[j * N + i];\n }\n\n for (size_t k = i + 1; k < N; k++) {\n for (size_t l = i + 1; l < N; l++) {\n A[k * N + l] -= A[k * N + i] * A[l * N + i];\n }\n }\n }\n\n // Scatter the factorized matrix A from rank 0 to all other processes\n MPI_Scatter(&A[0], N * N, MPI_DOUBLE, &A[0], N * N, MPI_DOUBLE, 0, MPI_WORLD);\n}\n", + "\n for (size_t k = 0; k < N; ++k) {\n // MPI_Bcast the k-th row of A to all ranks.\n MPI_Bcast(&A[k * N], N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Update the k-th row of A on all ranks.\n for (size_t j = k + 1; j < N; ++j) {\n A[k * N + j] /= A[k * N + k];\n }\n\n // Update the remaining rows of A on all ranks.\n #pragma omp parallel for\n for (size_t i = k + 1; i < N; ++i) {\n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= A[i * N + k] * A[k * N + j];\n }\n }\n }\n\n // MPI_Reduce the updated A from all ranks to rank 0.\n MPI_Reduce(&A[0], &A[0], N * N, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* For each value in the array x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(Kokkos::View const& x, Kokkos::View &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", Kokkos::RangePolicy<>(0, x.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", Kokkos::RangePolicy<>(0, x.size()), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) {\n ++ranks(i);\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n auto rank = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA (const size_t i) {\n auto sorted_x = x;\n Kokkos::sort(sorted_x.data(), sorted_x.extent(0));\n ranks(i) = Kokkos::find(sorted_x, x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t& i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::find(sorted_x, x(i));\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x, Kokkos::Ascending());\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n using policy_t = Kokkos::RangePolicy;\n Kokkos::parallel_for(policy_t(x.extent(0)), KOKKOS_LAMBDA(const size_t i) {\n auto rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) < x(j)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n using policy_type = Kokkos::RangePolicy;\n using rank_functor = Kokkos::Functor>;\n\n Kokkos::parallel_for(\n \"ranks\", Kokkos::RangePolicy(0, x.extent(0)),\n KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = Kokkos::rank(x, x(i), Kokkos::greater());\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n size_t rank = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(j) < x(i)) {\n ++rank;\n }\n }\n ranks(i) = rank;\n });\n}\n", + "\n\n 
Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.extent(0); ++j) {\n if (x(i) > x(j)) { ranks(i) += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA(const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) { ranks(i) += 1; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.size(), KOKKOS_LAMBDA (const size_t i) {\n ranks(i) = 0;\n for (size_t j = 0; j < x.size(); ++j) {\n if (x(i) < x(j)) {\n ranks(i) += 1;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = Kokkos::subview(x, Kokkos::ALL, i);\n Kokkos::sort(sorted_x);\n ranks(i) = Kokkos::lower_bound(sorted_x, x(i)) - sorted_x.data();\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n std::vector sorted_x(x.data(), x.data() + x.extent(0));\n std::sort(sorted_x.begin(), sorted_x.end());\n ranks(i) = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x(i)));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"ranks\", x.extent(0), KOKKOS_LAMBDA(const size_t i) {\n auto sorted_x = x;\n std::sort(sorted_x.data(), sorted_x.data() + sorted_x.extent(0));\n ranks(i) = std::distance(sorted_x.data(),\n std::lower_bound(sorted_x.data(), sorted_x.data() + sorted_x.extent(0), x(i)));\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "omp", + "prompt": "#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use OpenMP to compute in parallel.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs;\n pairs.reserve(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs.push_back(std::make_pair(x[i], i));\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[j] < x[i]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n for (size_t j = 0; j < n; ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> y(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n y[i] = std::make_pair(x[i], i);\n }\n std::sort(y.begin(), y.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[y[i].second] = i;\n }\n}\n", + "\n\n // Create a copy of the input vector\n std::vector sorted_x = x;\n\n // Sort the copy\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks in parallel\n #pragma omp 
parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "(", + "\n\n size_t n = x.size();\n std::vector> pairs(n);\n for (size_t i = 0; i < n; ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n ranks.resize(n);\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "ranks", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "(", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n size_t n = x.size();\n ranks.resize(n);\n std::vector idx(n);\n std::iota(idx.begin(), idx.end(), 0);\n std::sort(idx.begin(), idx.end(), [&x](size_t i1, size_t i2) { return x[i1] < x[i2]; });\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n ranks[i] = idx[i];\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] == x[j]) {\n ranks[i] = j;\n break;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector indices(x.size());\n std::iota(indices.begin(), indices.end(), 0);\n std::sort(indices.begin(), indices.end(), [&x](size_t i1, size_t i2) { return x[i1] < x[i2]; });\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = indices[i];\n }\n}\n", + "\n\n ranks.resize(x.size());\n std::vector> pairs(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n pairs[i] = std::make_pair(x[i], i);\n }\n std::sort(pairs.begin(), pairs.end());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[pairs[i].second] = i;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "serial", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + 
"are_all_valid": true, + "best_sequential_runtime": 0.0126803372, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0282919324 + } + ] + }, + { + "generated_output": "\n\n // Sort the vector x\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Create a map from the sorted values to their original indices\n std::map sorted_index_map;\n for (size_t i = 0; i < sorted_x.size(); ++i) {\n sorted_index_map[sorted_x[i]] = i;\n }\n\n // Iterate over the original vector and store the rank of each value\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = sorted_index_map[x[i]];\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::map value_to_rank;\n for (size_t i = 0; i < x.size(); ++i) {\n auto it = value_to_rank.find(x[i]);\n if (it == value_to_rank.end()) {\n value_to_rank[x[i]] = i;\n }\n }\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = value_to_rank[x[i]] + 1;\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.01251728, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0280041306 + } + ] + }, + { + "generated_output": "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0125357668, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0289420578 + } + ] + }, + { + "generated_output": "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0125324446, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0279789812 + } + ] + }, + { + "generated_output": "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks.push_back(std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i])));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + 
"are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0124385016, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0281400064 + } + ] + }, + { + "generated_output": "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(), std::find(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0125507732, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 5.2927916612 + } + ] + }, + { + "generated_output": "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0124805454, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 5.2957076888 + } + ] + }, + { + "generated_output": "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0125063836, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0280809832 + } + ] + }, + { + "generated_output": "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(), std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0125870122, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0281620592 + } + ] + }, + { + "generated_output": "\n\n std::vector sorted = x;\n std::sort(sorted.begin(), sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted.begin(),\n std::lower_bound(sorted.begin(), sorted.end(), x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.012491001, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0279834314 
+ } + ] + }, + { + "generated_output": "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0125691902, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0280689282 + } + ] + }, + { + "generated_output": "\n\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0125361682, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0282835966 + } + ] + }, + { + "generated_output": "\n\n std::vector x_sorted(x);\n std::sort(x_sorted.begin(), x_sorted.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]) - x_sorted.begin();\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0124338114, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0279818422 + } + ] + }, + { + "generated_output": "\n\n size_t n = x.size();\n ranks.resize(n);\n std::iota(ranks.begin(), ranks.end(), 0);\n std::sort(ranks.begin(), ranks.end(), [&](size_t i, size_t j) { return x[i] < x[j]; });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.012481201, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0280517982 + } + ] + }, + { + "generated_output": "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0125853268, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 5.2864658724 + } + ] + }, + { + "generated_output": "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = 
std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0125079132, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0281768492 + } + ] + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "hip", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use AMD HIP to compute in parallel. The kernel will be launched with at least as many threads as elements in x.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\n__global__ void ranks(const float *x, size_t *ranks, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n size_t tid = hipThreadIdx_x;\n if (tid < N) {\n float v = x[tid];\n size_t r = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < v) r++;\n }\n ranks[tid] = r;\n }\n}\n", + "\n\n // Get the index of the current thread\n const size_t tid = hipThreadIdx_x;\n\n // Compute the rank of the current element\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n float *ptr = x;\n size_t rank = 0;\n while (ptr < x + N && *ptr <= val) {\n if (*ptr == val) {\n rank++;\n }\n ptr++;\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float v = x[tid];\n int r = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < v) {\n ++r;\n }\n }\n ranks[tid] = r;\n }\n}\n", + "\n\n int idx = hipBlockIdx.x * hipBlockDim.x + hipThreadIdx.x;\n if (idx >= N) {\n return;\n }\n\n float my_value = x[idx];\n int my_rank = 1;\n for (int i = 0; i < N; i++) {\n if (x[i] < my_value) {\n my_rank++;\n }\n }\n ranks[idx] = my_rank;\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n 
size_t idx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] < v) r++;\n }\n ranks[i] = r;\n }\n}\n", + "\n size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] < v) r++;\n }\n ranks[i] = r;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n float *sorted_x = (float *)hipMalloc(N * sizeof(float));\n hipMemcpy(sorted_x, x, N * sizeof(float), hipMemcpyHostToDevice);\n hipSort(sorted_x, sorted_x + N, hipSortAscending);\n for (int i = 0; i < N; i++) {\n if (sorted_x[i] == val) {\n ranks[gid] = i;\n break;\n }\n }\n hipFree(sorted_x);\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = hipThreadIdx_x;\n\n // Compute the rank of the current element\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n size_t idx = hipThreadIdx_x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x;\n int tid = hipBlockIdx_x * hipBlockDim_x + idx;\n if (tid < N) {\n ranks[tid] = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < x[tid]) {\n ranks[tid]++;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n float val = x[gid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[gid] = rank;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n float val = x[idx];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every process has a complete copy of x. 
Store the result in ranks on process 0.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), ranks.size(), MPI_SIZE_T, MPI_MAX, 0,\n MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, ranks.size(), MPI_SIZE_T, MPI_MAX, 0,\n MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; i++) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Create a vector of ranks\n ranks.resize(n);\n\n // Get the rank of each value in x\n for (size_t i = 0; i < n; i++) {\n ranks[i] = std::find(indices.begin(), indices.end(), i) - indices.begin();\n }\n\n // Gather the ranks on process 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector ranks_i;\n MPI::COMM_WORLD.Recv(ranks_i.data(), ranks_i.size(), MPI::UNSIGNED_LONG, i, 0);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI::COMM_WORLD.Send(ranks.data(), ranks.size(), MPI::UNSIGNED_LONG, 0, 0);\n }\n}\n", + "\n\n // sort a copy of x on each process\n auto x_sorted = std::vector(x);\n std::sort(x_sorted.begin(), x_sorted.end());\n\n // find the rank of each value in x on each process\n std::vector> local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n auto it = std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]);\n local_ranks[i].push_back(std:: distance(x_sorted.begin(), it));\n }\n\n // gather the local_ranks on process 0\n std::gather(local_ranks, std::back_inserter(ranks), 0);\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; ++i) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Get the rank of each value in x\n ranks.resize(n);\n for (size_t i = 0; i < n; ++i) {\n ranks[i] = std::find(indices.begin(), indices.end(), i) - indices.begin();\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, ranks.size(), MPI_UNSIGNED_LONG, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = 0; j < x.size(); ++j) {\n if (x[i] < x[j]) {\n local_ranks[i]++;\n }\n }\n }\n\n std::vector global_ranks(x.size());\n MPI_Allgather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG,\n global_ranks.data(), x.size(), MPI_UNSIGNED_LONG, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n ranks = global_ranks;\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector of indices\n std::vector indices(n);\n for (size_t i = 0; i < n; ++i) {\n indices[i] = i;\n }\n\n // Sort the vector of indices based on the values in x\n std::sort(indices.begin(), indices.end(), [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n // Get the rank of each value in x\n ranks.resize(n);\n for (size_t i = 0; i < n; ++i) {\n ranks[indices[i]] = i;\n }\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n ranks = local_ranks;\n }\n\n MPI_Gather(&local_ranks[0], local_ranks.size(), MPI_UNSIGNED_LONG, &ranks[0],\n ranks.size(), MPI_UNSIGNED_LONG, 0, comm);\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the rank of each element in the sorted vector\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; ++i) {\n std::vector ranks_i;\n MPI_Recv(&ranks_i[0], ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Recv(recv_ranks.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n ranks[j] += recv_ranks[j];\n }\n }\n } else {\n MPI_Send(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG,\n nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = i;\n }\n\n std::sort(local_ranks.begin(), local_ranks.end(),\n [&x](size_t i, size_t j) { return x[i] < x[j]; });\n\n if (rank == 0) {\n ranks.resize(x.size());\n }\n\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(),\n MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector ranks_i;\n MPI_Recv(ranks_i.data(), ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // compute the ranks on each process\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(),\n std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // gather the ranks from all processes on process 0\n if (my_rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(),\n MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n 
MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG,\n 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the rank of each element in the sorted vector\n ranks.resize(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; ++i) {\n std::vector ranks_i;\n MPI_Recv(&ranks_i[0], ranks_i.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_i.begin(), ranks_i.end());\n }\n } else {\n MPI_Send(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector local_x = x;\n std::sort(local_x.begin(), local_x.end());\n\n std::vector local_ranks(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(local_x.begin(),\n std::lower_bound(local_x.begin(), local_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Recv(recv_ranks.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD,\n MPI_STATUS_IGNORE);\n for (size_t j = 0; j < x.size(); ++j) {\n ranks[j] += recv_ranks[j];\n }\n }\n } else {\n MPI_Send(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n size_t n = x.size();\n std::vector local_ranks(n);\n std::sort(x.begin(), x.end());\n for (size_t i = 0; i < n; ++i) {\n local_ranks[i] = std::lower_bound(x.begin(), x.end(), x[i]) - x.begin();\n }\n MPI_Gather(local_ranks.data(), n, MPI_UNSIGNED_LONG, ranks.data(), n, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "cuda", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use CUDA to compute in parallel. 
The kernel will be launched with at least as many threads as elements in x.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\n__global__ void ranks(const float *x, size_t *ranks, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = bid * blockDim.x + tid;\n if (i < N) {\n float v = x[i];\n int r = 0;\n for (size_t j = 0; j < N; ++j) {\n if (x[j] < v) {\n ++r;\n }\n }\n ranks[i] = r;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n ranks[i] = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[i] < x[j]) {\n ranks[i]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n ranks[tid] = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ranks[tid]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float value = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n float *d_x = (float *)x;\n for (size_t i = 0; i < N; i++) {\n if (d_x[i] < val) {\n ranks[idx]++;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) rank++;\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n size_t i = block.thread_id.x + block.block_id.x * block.block_dim.x;\n if (i < N) {\n float val = x[i];\n std::vector y(x, x + N);\n std::sort(y.begin(), y.end());\n ranks[i] = std::lower_bound(y.begin(), y.end(), val) - y.begin();\n }\n}\n", + "\n\n size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[idx] = rank;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n if (i < N) {\n float v = x[i];\n size_t r = 0;\n for (size_t j = 0; j < N; j++) {\n if (x[j] <= v) {\n r++;\n }\n }\n ranks[i] = r - 1;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n float val = x[idx];\n float *sorted_x = new float[N];\n thrust::copy(x, x + N, sorted_x);\n thrust::sort(sorted_x, sorted_x + N);\n ranks[idx] = thrust::lower_bound(sorted_x, sorted_x + N, val) - sorted_x;\n delete[] sorted_x;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n float *d_x = (float *)x;\n size_t rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (d_x[i] < val) {\n 
++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = threadIdx.x;\n\n // Compute the rank of the current element\n float value = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < value) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < val) {\n rank++;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n if (idx < N) {\n ranks[idx] = 0;\n for (int i = 0; i < N; ++i) {\n if (x[i] < x[idx]) {\n ranks[idx]++;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n", + "\n\n // Get the index of the current thread\n int tid = threadIdx.x;\n\n // Compute the rank of the current element\n float current_element = x[tid];\n size_t rank = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] < current_element) {\n rank++;\n }\n }\n\n // Store the rank in the output vector\n ranks[tid] = rank;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n float val = x[tid];\n int rank = 0;\n for (size_t i = 0; i < N; ++i) {\n if (x[i] < val) {\n ++rank;\n }\n }\n ranks[tid] = rank;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every process has a complete copy of x. 
Store the result in ranks on process 0.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the number of processes\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Create a vector of ranks on each process\n std::vector local_ranks(x.size());\n\n // Compute the ranks on each process\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n // Sort the local copy of x\n std::vector local_x = x;\n std::sort(local_x.begin(), local_x.end());\n\n // Find the rank of x[i] in the sorted vector\n auto it = std::lower_bound(local_x.begin(), local_x.end(), x[i]);\n local_ranks[i] = std::distance(local_x.begin(), it);\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG,\n ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]) - sorted_x.begin();\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n size_t n = x.size();\n\n // Create a vector to store the ranks\n ranks.resize(n);\n\n // Sort the vector\n std::sort(x.begin(), x.end());\n\n // Find the rank of each element in the sorted vector\n #pragma omp parallel for\n for (size_t i = 0; i < n; i++) {\n ranks[i] = std::distance(x.begin(), std::find(x.begin(), x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (ranks.size() > 0) {\n MPI_Gather(&ranks[0], ranks.size(), MPI_UNSIGNED_LONG,\n &ranks[0], ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int world_size;\n MPI_Comm_size(MPI_COMM_WORLD, &world_size);\n\n std::vector local_x(x.begin() + my_rank * x.size() / world_size,\n x.begin() + (my_rank + 1) * x.size() / world_size);\n std::vector local_ranks(local_x.size());\n\n #pragma omp parallel for\n for (size_t i = 0; i < local_x.size(); 
++i) {\n for (size_t j = 0; j < local_x.size(); ++j) {\n if (local_x[i] < local_x[j]) {\n ++local_ranks[i];\n }\n }\n }\n\n MPI_Reduce(local_ranks.data(), ranks.data(), local_ranks.size(), MPI_SIZE_T, MPI_SUM, 0,\n MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector x_sorted = x;\n std::sort(x_sorted.begin(), x_sorted.end());\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n ranks[i] = std::distance(x_sorted.begin(), std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]));\n }\n if (my_rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), n, MPI_SIZE_T, MPI_MAX, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, n, MPI_SIZE_T, MPI_MAX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Find the ranks of each element in the sorted vector\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n ranks[i] = std::distance(sorted_x.begin(), std::find(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv_ranks;\n MPI_Recv(&recv_ranks[0], x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), recv_ranks.begin(), recv_ranks.end());\n }\n } else {\n MPI_Send(&ranks[0], x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_x(x.begin() + rank * x.size() / size, x.begin() + (rank + 1) * x.size() / size);\n std::sort(local_x.begin(), local_x.end());\n\n std::vector local_ranks(local_x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < local_x.size(); ++i) {\n local_ranks[i] = std::distance(x.begin(), std::lower_bound(x.begin(), x.end(), local_x[i]));\n }\n\n MPI_Gather(local_ranks.data(), local_ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), local_ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a copy of x on each process\n std::vector local_x = x;\n\n // Sort the local copy of x\n std::sort(local_x.begin(), local_x.end());\n\n // Create a vector to store the ranks\n std::vector local_ranks(x.size());\n\n // Compute the ranks of the elements in the local copy of x\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(local_x.begin(), std::find(local_x.begin(), local_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n\n // Only process 0 has the complete list of ranks\n if (rank == 0) {\n ranks = std::move(ranks);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]) - sorted_x.begin();\n }\n\n // Gather the ranks 
from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (rank == 0) {\n MPI_Gather(MPI_IN_PLACE, ranks.size(), MPI_UNSIGNED_LONG, ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), ranks.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n ranks.resize(n);\n #pragma omp parallel for\n for (int i = 0; i < n; ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n if (my_rank == 0) {\n MPI_Reduce(MPI_IN_PLACE, ranks.data(), n, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n } else {\n MPI_Reduce(ranks.data(), nullptr, n, MPI_SIZE_T, MPI_MIN, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each process\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Compute the ranks on each process\n std::vector local_ranks(x.size());\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n local_ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n // Gather the ranks from all processes on process 0\n if (rank == 0) {\n ranks.resize(x.size());\n MPI_Gather(local_ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n std::vector local_ranks(x.size());\n #pragma opm parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n local_ranks[i] = std::distance(x.begin(), std::min_element(x.begin(), x.end()));\n }\n\n if (my_rank == 0) {\n ranks = local_ranks;\n for (int i = 1; i < num_procs; ++i) {\n std::vector recv_ranks(x.size());\n MPI_Status status;\n MPI_Probe(i, 0, MPI_COMM_WORLD, &status);\n MPI_Get_count(&status, MPI_BYTE, &ranks.size());\n MPI_Send(&local_ranks, ranks.size(), MPI_BYTE, i, 0, MPI_COMM_WORLD);\n }\n } else {\n MPI_Send(&local_ranks, local_ranks.size(), MPI_BYTE, 0, 0, 
MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t n = x.size();\n std::vector local_ranks(n);\n\n #pragma omp parallel for\n for (size_t i = 0; i < n; ++i) {\n local_ranks[i] = std::distance(x.begin(), std::min_element(x.begin(), x.end()));\n }\n\n MPI_Gather(local_ranks.data(), n, MPI_UNSIGNED_LONG, ranks.data(), n, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int world_size;\n MPI_Comm_size(MPI_COMM_WORLD, &world_size);\n int local_size = x.size() / world_size;\n int start = my_rank * local_size;\n int end = start + local_size;\n if (my_rank == world_size - 1) {\n end = x.size();\n }\n std::vector local_ranks(local_size);\n std::vector local_x(local_size);\n std::copy(x.begin() + start, x.begin() + end, local_x.begin());\n std::sort(local_x.begin(), local_x.end());\n #pragma omp parallel for\n for (int i = 0; i < local_size; i++) {\n local_ranks[i] = std::distance(local_x.begin(), std::lower_bound(local_x.begin(), local_x.end(), x[start + i]));\n }\n if (my_rank == 0) {\n ranks.resize(x.size());\n }\n MPI_Gather(local_ranks.data(), local_size, MPI_UNSIGNED_LONG, ranks.data(), local_size, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n ranks[i] = std::distance(sorted_x.begin(), std::lower_bound(sorted_x.begin(), sorted_x.end(), x[i]));\n }\n\n if (my_rank == 0) {\n MPI_Gather(ranks.data(), x.size(), MPI_UNSIGNED_LONG, ranks.data(), x.size(), MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(ranks.data(), x.size(), MPI_UNSIGNED_LONG, nullptr, 0, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);\n }\n}\n", + "(", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n\n // Each process sorts its own copy of x\n std::vector x_sorted = x;\n std::sort(x_sorted.begin(), x_sorted.end());\n\n // Each process computes the ranks of its own elements\n std::vector ranks_local(x.size());\n for (size_t i = 0; i < x.size(); ++i) {\n ranks_local[i] = std::distance(x_sorted.begin(), std::lower_bound(x_sorted.begin(), x_sorted.end(), x[i]));\n }\n\n // Process 0 gathers the ranks from all processes\n if (my_rank == 0) {\n ranks = ranks_local;\n for (int i = 1; i < MPI_COMM_WORLD->size; ++i) {\n std::vector ranks_recv(x.size());\n MPI_Recv(ranks_recv.data(), x.size(), MPI_UNSIGNED_LONG, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n ranks.insert(ranks.end(), ranks_recv.begin(), ranks_recv.end());\n }\n } else {\n MPI_Send(ranks_local.data(), x.size(), MPI_UNSIGNED_LONG, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use Kokkos to sort in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(Kokkos::View &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n using namespace Kokkos;\n\n // Create a view of the input array that ignores zero valued elements\n auto x_nz = view(\"x_nz\", x, LayoutRight(), {x.extent(0), 0});\n // Sort the non-zero elements in ascending order\n parallel_sort(x_nz.extent(0), KOKKOS_LAMBDA(const int i) {\n x_nz(i) = x(i);\n });\n // Copy the sorted non-zero elements back to the input array\n parallel_for(x.extent(0), KOKKOS_LAMBDA(const int i) {\n x(i) = x_nz(i);\n });\n}\n", + "\n\n auto exec_space = x.execution_space();\n auto view_size = x.extent(0);\n auto lambda = KOKKOS_LAMBDA(const int& i) {\n return x(i) != 0;\n };\n auto mask = Kokkos::create_mask(exec_space, view_size, lambda);\n auto sorted = Kokkos::sort(exec_space, mask, x);\n Kokkos::deep_copy(x, sorted);\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = std::numeric_limits::max();\n }\n });\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i == std::numeric_limits::max()) {\n x_i = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"sortIgnoreZero\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n // Create a mask to identify non-zero elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"init_mask\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the non-zero elements\n Kokkos::sort(mask, x);\n\n // Shift the non-zero elements to the beginning of the array\n int non_zero_count = 0;\n Kokkos::parallel_for(\"shift_non_zero\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (mask(i)) {\n x(non_zero_count++) = x(i);\n }\n });\n\n // Fill the remaining elements with zeros\n Kokkos::parallel_for(\"fill_zeros\", Kokkos::RangePolicy(non_zero_count, x.extent(0)), KOKKOS_LAMBDA(const int i) {\n x(i) = 0;\n });\n}\n", + "\n\n // Create a mask to identify non-zero elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"mask_init\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the non-zero elements\n Kokkos::sort(mask, x);\n\n // Shift the non-zero elements to the beginning of the array\n Kokkos::parallel_for(\"shift_non_zero\", x.extent(0), KOKKOS_LAMBDA(const int i) {\n if (mask(i)) {\n Kokkos::atomic_fetch_add(&x(0), x(i));\n x(i) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i != 0) {\n x_i = std::numeric_limits::max();\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i == 
std::numeric_limits::max()) {\n x_i = 0;\n }\n });\n}\n", + "\n\n using namespace Kokkos;\n\n auto policy = Kokkos::RangePolicy<>(0, x.extent(0));\n auto lambda = KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n x(i) = x(i) < 0 ? -x(i) : x(i);\n }\n };\n parallel_for(\"sortIgnoreZero\", policy, lambda);\n parallel_sort(policy, x.data(), x.data() + x.extent(0));\n lambda = KOKKOS_LAMBDA(const int &i) {\n if (x(i) < 0) {\n x(i) = -x(i);\n }\n };\n parallel_for(\"sortIgnoreZero\", policy, lambda);\n}\n", + "\n\n Kokkos::parallel_for(\"sortIgnoreZero\", x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kok\u30b3\u30af::View y = x.filter([](int x) { return x != 0; });\n Kok\u30b3\u30af::sort(y);\n Kok\u30b3\u30af::View z = x.map([](int x) { return x == 0 ? 0 : y[0]; });\n y.copy_to(z);\n}\n", + "\n\n // Create a mask to ignore zero valued elements\n Kokkos::View mask(\"mask\", x.extent(0));\n Kokkos::parallel_for(\"mask\", Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(const int &i) {\n mask(i) = (x(i) != 0);\n });\n\n // Sort the array using the mask\n Kokkos::sort(mask, x);\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(x.extent(0), KOKKOS_LAMBDA(const int &i) {\n if (x(i) != 0) {\n int j = i + 1;\n while (j < x.extent(0) && x(j) == 0) {\n j++;\n }\n if (j < x.extent(0)) {\n std::swap(x(i), x(j));\n }\n }\n });\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n Kokkos::parallel_reduce(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i, int &count) {\n if (x(i) != 0) { count++; }\n });\n\n // Create a temporary array to store the non-zero elements\n Kokkos::View y(\"y\", count);\n\n // Copy the non-zero elements to the temporary array\n int j = 0;\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) { y(j++) = x(i); }\n });\n\n // Sort the temporary array\n Kokkos::sort(y);\n\n // Copy the sorted non-zero elements back to the original array\n j = 0;\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) { x(i) = y(j++); }\n });\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n using exec_space = Kokkos::DefaultExecutionSpace;\n using view_t = Kokkos::View;\n\n auto x_nonzero = view_t(\"x_nonzero\", x.extent(0));\n auto x_nonzero_count = 0;\n\n Kokkos::parallel_for(\n \"sortIgnoreZero\", policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) {\n x_nonzero(x_nonzero_count++) = x(i);\n }\n });\n\n Kokkos::sort(x_nonzero, x_nonzero_count);\n\n x_nonzero_count = 0;\n Kokkos::parallel_for(\n \"sortIgnoreZero\", policy_t(0, x.extent(0)), KOKKOS_LAMBDA(int i) {\n if (x(i) != 0) {\n x(i) = x_nonzero(x_nonzero_count++);\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int& i, int& x_i) {\n if (x_i == 0) {\n x_i = x(i);\n }\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), 
KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i != 0) {\n x_i = x(i);\n }\n });\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, int &x_i) {\n if (x_i == 0) {\n x_i = x(i);\n }\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use OpenMP to sort x in parallel. \n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n ++count;\n }\n }\n\n // Create a vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j];\n j++;\n }\n }\n}\n", + "\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(non_zero.begin(), non_zero.end());\n }\n }\n\n // Insert the non-zero elements back into x\n int j = 0;\n for (int i = 
0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + ".", + "\n\n std::vector temp;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n temp.push_back(x[i]);\n }\n }\n std::sort(temp.begin(), temp.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = temp[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int n = 0;\n #pragma omp parallel for reduction(+:n)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n n++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(n);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n std::vector nonzero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonzero.push_back(x[i]);\n }\n }\n std::sort(nonzero.begin(), nonzero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = nonzero[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[index++] = x[i];\n }\n }\n\n // Sort the new vector\n 
std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[index++];\n }\n }\n}\n", + "\n\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n std::sort(non_zero.begin(), non_zero.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count non-zero elements\n int n = 0;\n #pragma omp parallel for reduction(+:n)\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n n++;\n }\n }\n\n // Create a vector to store non-zero elements\n std::vector y(n);\n\n // Copy non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the non-zero elements\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n for (size_t i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n ++count;\n }\n }\n\n // Create a new vector to store the non-zero 
elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n y[j++] = x[i];\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n if (x[i] != 0) {\n x[i] = y[j++];\n }\n }\n}\n", + "\n\n // Create a vector to store the non-zero elements\n std::vector non_zero;\n\n // Iterate over the input vector and copy non-zero elements to the new vector\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Copy the sorted non-zero elements back to the input vector\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[j++];\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "serial", + "prompt": "/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place.\n Example:\n\n\t input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a == 0 && b != 0) || a < b;\n });\n}\n", + "source_write_success": true, + "did_build": true, 
+ "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return a != 0 && (b == 0 || a < b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a != 0 && b != 0) ? a < b : a == 0;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": false, + "is_valid": null, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::stable_partition(x.begin(), x.end(),\n [](int a) { return a != 0; });\n std::sort(x.begin(), x.end());\n\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) 
return true;\n return a < b;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n return (a == 0 && b != 0) || (a != 0 && b == 0) ? a > b : a < b;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(),\n [](int a, int b) { return a != 0 && (a < b || b == 0); });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) {\n return false;\n } else if (b == 0) {\n return true;\n } else {\n return a < b;\n }\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](int a, int b) {\n if (a == 0) return false;\n if (b == 0) return true;\n return a < b;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n std::vector y;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) { y.push_back(x[i]); }\n }\n std::sort(y.begin(), y.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.002046486, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020339318 + } + ] + }, + { + "generated_output": "\n\n std::vector sorted;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted.push_back(x[i]);\n }\n }\n std::sort(sorted.begin(), sorted.end());\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sorted[j];\n j++;\n }\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0020212144, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0020248974 + } + ] + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "hip", + "prompt": "/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use AMD HIP to compute in parallel. The kernel will be launched with 1 thread per element.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\n__global__ void sortIgnoreZero(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int temp = x[tid];\n int i = tid - 1;\n while (i >= 0 && x[i] > temp) {\n x[i + 1] = x[i];\n i--;\n }\n x[i + 1] = temp;\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n // sort non-zero elements\n __shared__ int sdata[256];\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int gid = blockIdx.x * blockDim.x + threadIdx.x;\n int stride = blockDim.x * gridDim.x;\n int offset = bid * stride;\n int count = 0;\n int val = x[gid];\n if (val != 0) {\n sdata[tid] = val;\n count++;\n }\n __syncthreads();\n int n = count;\n int i = tid;\n while (i < n) {\n sdata[i] = sdata[tid];\n i += stride;\n }\n __syncthreads();\n i = tid;\n while (i < n) {\n int j = i + 1;\n while (j < n) {\n if (sdata[i] > sdata[j]) {\n int tmp = sdata[i];\n sdata[i] = sdata[j];\n sdata[j] = tmp;\n }\n j += stride;\n }\n i += stride;\n }\n __syncthreads();\n i = tid;\n while (i < n) {\n x[offset + i] = sdata[i];\n i += stride;\n }\n}\n", + "\n\n // Find the number of non-zero elements\n int num_non_zero = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n num_non_zero++;\n }\n }\n\n // Create a temporary array to store the non-zero elements\n int *temp = new int[num_non_zero];\n\n // Copy the non-zero elements to the temporary array\n int j = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n temp[j] = x[i];\n j++;\n }\n }\n\n // Sort the temporary array\n std::sort(temp, temp + num_non_zero);\n\n // Copy the sorted non-zero elements back to the original array\n j = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n x[i] = temp[j];\n j++;\n }\n }\n\n // Delete the temporary array\n delete[] temp;\n}\n", + "\n\n\t// Find the number of non-zero elements\n\tint count = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tcount++;\n\t\t}\n\t}\n\n\t// Create an array to store the non-zero elements\n\tint *y = new int[count];\n\n\t// Copy the non-zero elements to the new array\n\tint j = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\ty[j] = x[i];\n\t\t\tj++;\n\t\t}\n\t}\n\n\t// Sort the non-zero 
elements\n\tstd::sort(y, y + count);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tj = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = y[j];\n\t\t\tj++;\n\t\t}\n\t}\n\n\t// Delete the temporary array\n\tdelete[] y;\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i - 1] > x[i]) {\n int temp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = temp;\n i--;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\tint j = tid + 1;\n\t\twhile (j < N && x[j] == 0) {\n\t\t\tj++;\n\t\t}\n\t\tif (j < N) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[j];\n\t\t\tx[j] = tmp;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n if (i < N && x[i] != 0) {\n int j = i + 1;\n while (j < N && x[j] == 0) {\n j++;\n }\n if (j < N) {\n int temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i - 1] > x[i]) {\n int tmp = x[i - 1];\n x[i - 1] = x[i];\n x[i] = tmp;\n i--;\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = tmp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint temp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N && x[tid] != 0) {\n for (int i = tid + 1; i < N; i++) {\n if (x[i] != 0 && x[i] < x[tid]) {\n int tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 1];\n x[i - 1] = tmp;\n i--;\n }\n }\n }\n}\n", + "\n\n // Sort the non-zero elements\n thrust::sort(thrust::device_ptr(x), thrust::device_ptr(x + N), thrust::greater());\n\n // Count the number of non-zero elements\n int num_non_zero = thrust::count(thrust::device_ptr(x), thrust::device_ptr(x + N), thrust::not_equal_to(0));\n\n // Move the non-zero elements to the beginning of the array\n thrust::copy(thrust::device_ptr(x), thrust::device_ptr(x + num_non_zero), thrust::device_ptr(x));\n\n // Fill the remaining elements with zeros\n thrust::fill(thrust::device_ptr(x + num_non_zero), thrust::device_ptr(x + N), 0);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (x[tid] != 0) {\n int i = tid;\n while (i > 0 && x[i] < x[i - 1]) {\n int tmp = x[i];\n x[i] = x[i - 
1];\n x[i - 1] = tmp;\n i--;\n }\n }\n }\n}\n", + "\n\n\tint tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (tid < N) {\n\t\tif (x[tid] != 0) {\n\t\t\tint i = tid;\n\t\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\t\tint temp = x[i];\n\t\t\t\tx[i] = x[i - 1];\n\t\t\t\tx[i - 1] = temp;\n\t\t\t\ti--;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Sort the array in ascending order ignoring elements with value 0.\n\t// Leave zero valued elements in-place.\n\tint my_rank = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n\tif (my_rank < N && x[my_rank] != 0) {\n\t\tfor (int i = my_rank + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[my_rank]) {\n\t\t\t\tint temp = x[my_rank];\n\t\t\t\tx[my_rank] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use MPI to sort x in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n for (int i = count; i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n // Gather all non-zero elements to rank 0\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(nonZero.data(), nonZero.size(), MPI_INT, x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Get the size of the communicator\n int size;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the sorted elements\n std::vector sorted_x;\n\n // Gather all the non-zero elements from all the ranks\n std::vector non_zero_elements;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_elements.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n\n // Scatter the sorted non-zero elements to all the ranks\n MPI_Scatter(non_zero_elements.data(), non_zero_elements.size() / size, MPI_INT,\n sorted_x.data(), non_zero_elements.size() / size, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into the original vector\n int index = 0;\n for (int i = 0; i 
< x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sorted_x[index++];\n }\n }\n}\n", + "\n\n // Get the size and rank of the current process\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size() * size);\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n if (rank != 0) {\n non_zero.resize(sorted_non_zero.size() / size);\n }\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into the original vector\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = non_zero[index++];\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size());\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, x.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with zeros\n for (int i = non_zero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int n = x.size();\n std::vector y(n);\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n y[count] = x[i];\n count++;\n }\n }\n std::sort(y.begin(), y.begin() + count);\n int index = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = y[index];\n index++;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < 
x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_elements[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(non_zero_elements.data(), count / size, MPI_INT, x.data(), count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n for (int i = count / size; i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n std::vector x_non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x_non_zero.push_back(x[i]);\n }\n }\n std::sort(x_non_zero.begin(), x_non_zero.end());\n int count = x_non_zero.size();\n MPI_Gather(&count, 1, MPI_INT, NULL, 0, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n int total_count = 0;\n for (int i = 0; i < size; i++) {\n int count_i;\n MPI_Gather(&count_i, 1, MPI_INT, &count_i, 1, MPI_INT, i, MPI_COMM_WORLD);\n total_count += count_i;\n }\n x.resize(total_count);\n }\n MPI_Gatherv(x_non_zero.data(), x_non_zero.size(), MPI_INT, x.data(), NULL, NULL, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Copy the non-zero elements to the new vector\n int j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[j] = x[i];\n j++;\n }\n }\n\n // Sort the new vector\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to the original vector\n j = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[j];\n j++;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sortedNonZero;\n if (rank == 0) {\n sortedNonZero.resize(nonZero.size() * size);\n }\n MPI_Gather(nonZero.data(), nonZero.size(), MPI_INT, sortedNonZero.data(), nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements to all ranks\n if (rank != 0) {\n sortedNonZero.resize(nonZero.size());\n }\n MPI_Scatter(sortedNonZero.data(), nonZero.size(), MPI_INT, sortedNonZero.data(), nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the sorted non-zero elements into x\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = sortedNonZero[index++];\n }\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Insert the non-zero elements back into the original vector\n int j = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = 
non_zero[j];\n j++;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Create a new vector to store the non-zero elements\n std::vector y(count);\n\n // Gather the non-zero elements from all ranks\n MPI_Gather(&x[0], x.size(), MPI_INT, &y[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements\n if (rank == 0) {\n std::sort(y.begin(), y.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&y[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n if (rank == 0) {\n sorted_non_zero.resize(non_zero.size());\n }\n MPI_Gather(non_zero.data(), non_zero.size(), MPI_INT, sorted_non_zero.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(sorted_non_zero.data(), non_zero.size(), MPI_INT, x.data(), non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with zeros\n for (int i = non_zero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector nonZero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n nonZero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(nonZero.begin(), nonZero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sortedNonZero;\n MPI_Gather(&nonZero[0], nonZero.size(), MPI_INT, &sortedNonZero[0], nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements to all ranks\n MPI_Scatter(&sortedNonZero[0], nonZero.size(), MPI_INT, &x[0], nonZero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n for (int i = 
nonZero.size(); i < x.size(); i++) {\n x[i] = 0;\n }\n}\n", + "\n\n int myrank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (myrank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0,\n MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero_x.begin(), non_zero_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(non_zero_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Create a new vector to store the non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Gather the sorted non-zero elements on rank 0\n std::vector sorted_non_zero;\n MPI_Gather(&non_zero[0], non_zero.size(), MPI_INT, &sorted_non_zero[0], non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&sorted_non_zero[0], non_zero.size(), MPI_INT, &x[0], 
non_zero.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Insert the zero elements back into the vector\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = sorted_non_zero[index++];\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "cuda", + "prompt": "/* Sort the array x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use CUDA to compute in parallel. The kernel will be launched with 1 thread per element.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\n__global__ void sortIgnoreZero(int *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n\t// Find the number of non-zero elements\n\tint num_non_zero = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnum_non_zero++;\n\t\t}\n\t}\n\n\t// Allocate memory for the non-zero elements\n\tint *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n\t// Copy the non-zero elements to the new array\n\tint non_zero_index = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnon_zero_elements[non_zero_index] = x[i];\n\t\t\tnon_zero_index++;\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tthrust::sort(non_zero_elements, non_zero_elements + num_non_zero);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tnon_zero_index = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = non_zero_elements[non_zero_index];\n\t\t\tnon_zero_index++;\n\t\t}\n\t}\n\n\t// Free the memory allocated for the non-zero elements\n\tfree(non_zero_elements);\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid >= N) return;\n\tif (x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[tid] > x[i]) {\n\t\t\t\tint tmp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = tmp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = tmp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + ".", + "\n\n\tint tid = threadIdx.x;\n\tif (x[tid] != 0) {\n\t\tx[tid] = x[tid] > 0 ? 
x[tid] : -x[tid];\n\t}\n}\n", + "\n\n\t// Each thread sorts its own element\n\tif (x[threadIdx.x] != 0) {\n\t\tfor (int i = threadIdx.x + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[threadIdx.x] > x[i]) {\n\t\t\t\tint temp = x[threadIdx.x];\n\t\t\t\tx[threadIdx.x] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\t// Find the number of non-zero elements\n\tint num_non_zero = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnum_non_zero++;\n\t\t}\n\t}\n\n\t// Allocate memory for the non-zero elements\n\tint *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n\t// Copy the non-zero elements to the new array\n\tint idx = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tnon_zero_elements[idx++] = x[i];\n\t\t}\n\t}\n\n\t// Sort the non-zero elements\n\tthrust::sort(non_zero_elements, non_zero_elements + num_non_zero);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tidx = 0;\n\tfor (size_t i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = non_zero_elements[idx++];\n\t\t}\n\t}\n\n\t// Free the memory allocated for the non-zero elements\n\tfree(non_zero_elements);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint i = bid * blockDim.x + tid;\n\n\tif (i < N && x[i] != 0) {\n\t\tint j = i + 1;\n\t\twhile (j < N && x[j] == 0) {\n\t\t\tj++;\n\t\t}\n\t\tif (j < N) {\n\t\t\tint tmp = x[i];\n\t\t\tx[i] = x[j];\n\t\t\tx[j] = tmp;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x + blockIdx.x * blockDim.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i - 1] > x[i]) {\n\t\t\tint temp = x[i - 1];\n\t\t\tx[i - 1] = x[i];\n\t\t\tx[i] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "Ignore the zero value in the list.", + "\n\n\t// Get the thread index\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\n\t// If the element is not zero, sort it\n\tif (x[tid] != 0) {\n\t\tfor (int i = tid + 1; i < N; i++) {\n\t\t\tif (x[i] != 0 && x[i] < x[tid]) {\n\t\t\t\tint temp = x[tid];\n\t\t\t\tx[tid] = x[i];\n\t\t\t\tx[i] = temp;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint bid = blockIdx.x;\n\tint bid_size = blockDim.x;\n\tint start = bid * bid_size + tid;\n\tint end = start + bid_size;\n\tif (start >= N) return;\n\tif (end > N) end = N;\n\tint count = 0;\n\tfor (int i = start; i < end; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[count++] = x[i];\n\t\t}\n\t}\n\tfor (int i = count; i < end; i++) {\n\t\tx[i] = 0;\n\t}\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tif (x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tswap(x[i], x[i - 1]);\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N && x[tid] != 0) {\n\t\tint i = tid;\n\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\tint temp = x[i];\n\t\t\tx[i] = x[i - 1];\n\t\t\tx[i - 1] = temp;\n\t\t\ti--;\n\t\t}\n\t}\n}\n", + "\n\n\t// Sort the non-zero elements\n\tint *d_sorted = (int *)malloc(N * sizeof(int));\n\tint count = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\td_sorted[count] = x[i];\n\t\t\tcount++;\n\t\t}\n\t}\n\tthrust::sort(d_sorted, d_sorted + count);\n\n\t// Copy the sorted non-zero elements back to the original array\n\tcount = 0;\n\tfor (int i = 0; i < N; i++) {\n\t\tif (x[i] != 0) {\n\t\t\tx[i] = d_sorted[count];\n\t\t\tcount++;\n\t\t}\n\t}\n\tfree(d_sorted);\n}\n", + "\n\n\tint tid = threadIdx.x;\n\tint val = x[tid];\n\tif (val == 0) {\n\t\treturn;\n\t}\n\tint i = tid;\n\twhile (i > 0 && x[i - 1] > val) 
{\n\t\tx[i] = x[i - 1];\n\t\ti--;\n\t}\n\tx[i] = val;\n}\n", + "\n\n\tint tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (tid < N) {\n\t\tif (x[tid] != 0) {\n\t\t\tint i = tid;\n\t\t\twhile (i > 0 && x[i] < x[i - 1]) {\n\t\t\t\tswap(x[i], x[i - 1]);\n\t\t\t\ti--;\n\t\t\t}\n\t\t}\n\t}\n}\n", + "\n\n // Find the number of non-zero elements\n int num_non_zero = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n num_non_zero++;\n }\n }\n\n // Create an array to store the non-zero elements\n int *non_zero_elements = (int *)malloc(num_non_zero * sizeof(int));\n\n // Copy the non-zero elements to the new array\n int index = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n non_zero_elements[index] = x[i];\n index++;\n }\n }\n\n // Sort the non-zero elements\n qsort(non_zero_elements, num_non_zero, sizeof(int), compare);\n\n // Copy the sorted non-zero elements back to the original array\n index = 0;\n for (size_t i = 0; i < N; i++) {\n if (x[i] != 0) {\n x[i] = non_zero_elements[index];\n index++;\n }\n }\n\n // Free the memory allocated for the non-zero elements array\n free(non_zero_elements);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "44_sort_sort_non-zero_elements", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place. \n\t Use MPI and OpenMP to sort x in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n\n input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the number of processes\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Get the rank of the current process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Create a vector to store the sorted values\n std::vector sorted(x.size());\n\n // Gather the sorted values from each rank\n MPI_Gather(x.data(), x.size(), MPI_INT, sorted.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // If the current rank is 0, store the sorted values in x\n if (rank == 0) {\n x = sorted;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort x on each rank\n std::sort(x.begin(), x.end());\n\n // Gather the sorted x from all ranks to rank 0\n std::vector gathered_x;\n if (rank == 0) {\n gathered_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_INT, gathered_x.data(), x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the gathered x on rank 0\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end());\n x = gathered_x;\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Create a vector of non-zero elements\n std::vector non_zero;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n\n // Sort the non-zero elements\n std::sort(non_zero.begin(), non_zero.end());\n\n // Scatter the sorted non-zero elements to all ranks\n MPI_Scatter(non_zero.data(), non_zero.size() / nprocs, MPI_INT,\n x.data(), non_zero.size() / nprocs, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Gather the sorted non-zero elements from all ranks to rank 0\n MPI_Gather(x.data(), non_zero.size() / nprocs, MPI_INT,\n non_zero.data(), non_zero.size() / nprocs, MPI_INT, 0, MPI_COMM_WORLD);\n\n // If rank is 0, insert the sorted non-zero elements into x\n if (rank == 0) {\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n continue;\n }\n x[i] = non_zero[index++];\n }\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector x_nonzero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x_nonzero.push_back(x[i]);\n }\n }\n std::sort(x_nonzero.begin(), x_nonzero.end());\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = x_nonzero[count];\n count++;\n }\n }\n if (my_rank == 0) {\n x.resize(n);\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n int n = x.size();\n std::vector x_local(n);\n std::vector x_sorted(n);\n std::copy(x.begin(), x.end(), x_local.begin());\n std::sort(x_local.begin(), x_local.end(), [](int a, int b) { return a != 0 && (a < b); });\n MPI_Gather(x_local.data(), n, MPI_INT, x_sorted.data(), n, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n x = x_sorted;\n }\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int n = x.size();\n std::vector non_zero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n non_zero.push_back(x[i]);\n }\n }\n std::sort(non_zero.begin(), non_zero.end());\n int non_zero_size = non_zero.size();\n int recv_count[MPI_COMM_SIZE];\n int recv_displs[MPI_COMM_SIZE];\n MPI_Gather(&non_zero_size, 1, MPI_INT, recv_count, 1, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n recv_displs[0] = 0;\n for (int i = 1; i < MPI_COMM_SIZE; i++) {\n recv_displs[i] = recv_displs[i - 1] + recv_count[i - 1];\n }\n }\n MPI_Gatherv(non_zero.data(), non_zero_size, MPI_INT, x.data(), recv_count, recv_displs, MPI_INT, 0, MPI_COMM_WORLD);\n if (my_rank == 0) {\n int j = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] == 0) {\n continue;\n }\n x[j] = x[i];\n j++;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Allocate memory for the non-zero elements\n std::vector 
y(count);\n\n // Copy the non-zero elements to y\n int index = 0;\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n y[index++] = x[i];\n }\n }\n\n // Sort y\n std::sort(y.begin(), y.end());\n\n // Copy the sorted non-zero elements back to x on rank 0\n if (rank == 0) {\n index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n x[i] = y[index++];\n }\n }\n }\n}\n", + "\n\n int myrank;\n MPI_Comm_rank(MPI_COMM_WORLD, &myrank);\n int n = x.size();\n std::vector x_nonzero;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x_nonzero.push_back(x[i]);\n }\n }\n std::sort(x_nonzero.begin(), x_nonzero.end());\n int count = 0;\n for (int i = 0; i < n; i++) {\n if (x[i] != 0) {\n x[i] = x_nonzero[count];\n count++;\n }\n }\n if (myrank == 0) {\n x.resize(n);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the gathered non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector nonZero;\n if (rank == 0) {\n nonZero.resize(count);\n }\n MPI_Gather(&x[0], x.size(), MPI_INT, &nonZero[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(nonZero.begin(), nonZero.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&nonZero[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted.begin(), sorted.end());\n\n // Scatter the 
sorted non-zero elements to rank 0\n MPI_Scatter(sorted.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements to rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Allocate memory for the sorted array\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n MPI_Gather(&x[0], x.size(), MPI_INT, &sorted_x[0], global_count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the gathered array on rank 0\n if (my_rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end());\n }\n\n // Scatter the sorted array back to all ranks\n MPI_Scatter(&sorted_x[0], global_count / nprocs, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count = 0;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted_x(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted_x[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted_x.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 fills in the remaining elements with 0\n if (rank == 0) {\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Create a vector of size x.size() to store the sorted values\n std::vector sorted_x(x.size());\n\n // Create a vector of size x.size() to store the indices of the sorted values\n 
std::vector sorted_indices(x.size());\n\n // Sort the indices of x in ascending order\n std::iota(sorted_indices.begin(), sorted_indices.end(), 0);\n std::sort(sorted_indices.begin(), sorted_indices.end(),\n [&x](int i, int j) { return x[i] < x[j]; });\n\n // Create a vector of size x.size() to store the sorted values\n std::vector sorted_x(x.size());\n\n // Copy the sorted values from x into sorted_x\n for (int i = 0; i < x.size(); i++) {\n sorted_x[i] = x[sorted_indices[i]];\n }\n\n // Broadcast the sorted values to all ranks\n MPI_Bcast(&sorted_x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Copy the sorted values into x on rank 0\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n count++;\n }\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero_elements(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero_elements[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) {\n std::sort(non_zero_elements.begin(), non_zero_elements.end());\n }\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero_elements[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the zero elements to their original positions\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) {\n x[i] = i;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int local_count = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n local_count++;\n }\n }\n\n // Gather the counts from all ranks\n int global_count;\n MPI_Allreduce(&local_count, &global_count, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);\n\n // Create a new vector to store the sorted non-zero elements\n std::vector sorted(global_count);\n\n // Gather the non-zero elements from all ranks\n int index = 0;\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) {\n sorted[index++] = x[i];\n }\n }\n\n // Sort the non-zero elements\n std::sort(sorted.begin(), sorted.end());\n\n // Scatter the sorted non-zero elements to rank 0\n MPI_Scatter(sorted.data(), global_count / size, MPI_INT, x.data(), global_count / size, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Fill the remaining elements with 0\n if (rank == 0) 
{\n for (int i = global_count; i < x.size(); i++) {\n x[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Count the number of non-zero elements\n int count = 0;\n #pragma omp parallel for reduction(+:count)\n for (int i = 0; i < x.size(); i++) {\n if (x[i] != 0) count++;\n }\n\n // Gather the non-zero elements on rank 0\n std::vector non_zero(count);\n MPI_Gather(&x[0], x.size(), MPI_INT, &non_zero[0], count, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Sort the non-zero elements on rank 0\n if (rank == 0) std::sort(non_zero.begin(), non_zero.end());\n\n // Scatter the sorted non-zero elements back to all ranks\n MPI_Scatter(&non_zero[0], count, MPI_INT, &x[0], x.size(), MPI_INT, 0, MPI_COMM_WORLD);\n\n // Set the remaining elements to 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == 0) x[i] = -1;\n }\n\n // Sort the elements in-place on each rank\n std::sort(x.begin(), x.end());\n\n // Replace -1 with 0\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] == -1) x[i] = 0;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(Kokkos::View const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::View sorted_x(\"sorted_x\", x.size());\n Kokkos::parallel_for(Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(const int& i) {\n sorted_x(i) = x(i);\n });\n Kokkos::sort(sorted_x.data(), sorted_x.data() + sorted_x.size());\n return sorted_x(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a Kokkos view of the input vector\n auto x_view = Kokkos::create_mirror_view(x);\n\n // Create a Kokkos view of the output vector\n auto y_view = Kokkos::create_mirror_view(1);\n\n // Create a Kokkos kernel to find the k-th smallest element\n Kokkos::parallel_reduce(\n Kokkos::RangePolicy(0, x_view.extent(0)),\n KOKKOS_LAMBDA(const int i, int& y) {\n if (i < k) {\n y = std::min(y, x_view(i));\n }\n },\n y_view);\n\n // Copy the output vector back to the host\n int y;\n Kokkos::deep_copy(y, y_view);\n\n return y;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n\n // Find the k-th smallest element using a parallel reduction.\n int kth_smallest = parallel_reduce(\n Policy(0, x.extent(0)),\n KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i < k) {\n kth_smallest = std::min(kth_smallest, x(i));\n }\n },\n k);\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n auto x_copy = Kokkos::View(n);\n Kokkos::deep_copy(x_copy, x);\n\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& result) {\n if (i == k - 1) {\n result = x_copy[i];\n }\n }, tag_type());\n\n return x_copy[k - 1];\n}\n", + "\n\n Kokkos::View y(\"y\", x.size());\n Kokkos::parallel_for(\"copy\", Kokkos::RangePolicy(0, x.size()), KOKKOS_LAMBDA(int i) {\n y(i) = x(i);\n });\n Kokkos::sort(y);\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n using l_d_type = 
Kokkos::LayoutLeft;\n using l_d_view_type = Kokkos::View;\n\n const int n = x.extent(0);\n l_d_view_type y(\"y\", n);\n\n parallel_for(policy_type(0, n), KOKKOS_LAMBDA(const int& i) { y(i) = x(i); }, tag_type());\n\n Kokkos::sort(y);\n\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n auto x_copy = Kokkos::View(n);\n Kokkos::deep_copy(x_copy, x);\n\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& result) {\n if (i == k - 1) {\n result = x_copy(i);\n }\n }, tag_type());\n\n return x_copy(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n\n int n = x.extent(0);\n if (k > n) {\n throw std::invalid_argument(\"k cannot be greater than the size of the vector\");\n }\n\n auto kth_smallest = parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(const int& i, int& kth) {\n if (i < k) {\n kth = std::min(kth, x(i));\n }\n },\n std::numeric_limits::max());\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using Policy = Kokkos::RangePolicy;\n using Tag = Kokkos::Tag;\n\n int n = x.extent(0);\n if (k > n) {\n throw std::invalid_argument(\"k must be less than or equal to the size of the vector\");\n }\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(Policy(0, n), x, Tag());\n\n // Return the k-th smallest element\n return x(k - 1);\n}\n", + "\n\n Kok\u30b3\u30af::View y = Kok\u30b3\u30af::sort(x);\n return y[k - 1];\n}\n", + "\n\n // Create a Kokkos view for the input vector x\n Kokkos::View x_host = Kokkos::create_mirror_view(x);\n\n // Create a Kokkos view for the output vector y\n Kokkos::View y_host = Kokkos::create_mirror_view(Kokkos::View(1));\n\n // Create a Kokkos parallel for loop to find the k-th smallest element\n Kokkos::parallel_for(\"findKthSmallest\", Kokkos::RangePolicy(0, x_host.size()), KOKKOS_LAMBDA(int i) {\n // Sort the input vector x\n std::sort(x_host.begin(), x_host.end());\n\n // Get the k-th smallest element\n y_host(0) = x_host(k - 1);\n });\n\n // Copy the output vector y back to the device\n Kokkos::deep_copy(x, y_host);\n\n // Return the k-th smallest element\n return y_host(0);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n int kth_smallest = 0;\n Kokkos::parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i == k - 1) {\n kth_smallest = x(i);\n }\n },\n kth_smallest, tag_type());\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a Kokkos view of the input vector\n View y = Kokkos::create_mirror_view(x);\n\n // Sort the vector in parallel\n parallel_sort(y.extent(0), KOKKOS_LAMBDA(int i, int j) { return y(i) < y(j); });\n\n // Return the k-th smallest element\n return y(k - 1);\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::Tag;\n\n int n = x.extent(0);\n int kth_smallest = 0;\n Kokkos::parallel_reduce(\n policy_type(0, n),\n KOKKOS_LAMBDA(const int& i, int& kth_smallest) {\n if (i < k) {\n kth_smallest = std::max(kth_smallest, x(i));\n }\n },\n kth_smallest);\n return kth_smallest;\n}\n", + "\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(int i, int j) { return x(i) < x(j); });\n\n // Return the k-th smallest element\n return x(k - 1);\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::Work;\n\n const int n = 
static_cast(x.extent(0));\n Kokkos::View y(\"y\", n);\n\n parallel_for(Work(n), KOKKOS_LAMBDA(int i) {\n y[i] = x[i];\n });\n\n std::sort(y.data(), y.data() + n);\n\n return y[k - 1];\n}\n", + "\n\n using namespace Kokkos;\n using policy_t = Kokkos::RangePolicy;\n\n int n = x.extent(0);\n int kth = 0;\n\n // Sort the vector in parallel\n Kokkos::parallel_sort(policy_t(0, n), x);\n\n // Get the k-th smallest element\n kth = x(k - 1);\n\n return kth;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using tag_type = Kokkos::ScanTag;\n\n int n = x.extent(0);\n int kth_smallest = -1;\n Kokkos::parallel_reduce(policy_type(0, n), KOKKOS_LAMBDA(int i, int& kth_smallest) {\n if (i == k - 1) {\n kth_smallest = x(i);\n }\n }, tag_type());\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n\n // Create a parallel reduction to find the k-th smallest element\n auto kth_smallest = reduce(x.extent(0), KOKKOS_LAMBDA(int i, int& min) {\n if (i < k) {\n min = std::min(min, x(i));\n }\n }, min);\n\n return kth_smallest;\n}\n", + "\n\n using namespace Kokkos;\n using policy_type = Kokkos::RangePolicy;\n using view_type = Kokkos::View;\n\n // Create a copy of the input vector\n view_type y = Kokkos::create_mirror_view(x);\n\n // Sort the copy in parallel\n Kokkos::parallel_sort(y.extent(0), KOKKOS_LAMBDA(int i, int j) { return y(i) < y(j); });\n\n // Return the k-th smallest element\n return y(k - 1);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use OpenMP to compute in parallel.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n int kth = 0;\n #pragma omp parallel for reduction(min:kth)\n for (int i = 0; i < n; i++) {\n if (x[i] < kth || kth == 0) {\n kth = x[i];\n }\n }\n return kth;\n}\n", + "\n\n // Sort the vector in parallel\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); ++i) {\n for (int j = i + 1; j < x.size(); ++j) {\n if (x[i] > x[j]) {\n std::swap(x[i], x[j]);\n }\n }\n }\n }\n\n // Return the k-th smallest element\n return x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; i++) {\n for (int j = i + 1; j < n; j++) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n // Create a shared variable to store the k-th smallest element\n int kth_smallest;\n\n // Create a parallel region\n #pragma omp parallel\n {\n // Each thread computes the k-th smallest element of its local subarray\n int local_kth_smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < 
local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n\n // Update the shared variable with the minimum of the local k-th smallest elements\n #pragma omp critical\n {\n if (local_kth_smallest < kth_smallest) {\n kth_smallest = local_kth_smallest;\n }\n }\n }\n\n // Return the k-th smallest element\n return kth_smallest;\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < n; ++i) {\n for (int j = i + 1; j < n; ++j) {\n if (sorted_x[i] > sorted_x[j]) {\n std::swap(sorted_x[i], sorted_x[j]);\n }\n }\n }\n }\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "\n\n int n = x.size();\n std::vector sorted_x(x);\n #pragma omp parallel\n {\n #pragma omp single\n {\n std::sort(sorted_x.begin(), sorted_x.end());\n }\n }\n return sorted_x[k - 1];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "serial", + "prompt": "/* Find the k-th smallest element of the vector x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th element of the sorted vector.\n return x[k - 1];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end());\n return x[k - 
1];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th smallest element.\n return x[k - 1];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0075575038, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0075413016 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n if (k < 1 || k > n) {\n throw std::invalid_argument(\"k must be between 1 and n\");\n }\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0075377786, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0074782548 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n if (k < 1 || k > n) {\n throw std::invalid_argument(\"k must be between 1 and the size of the vector.\");\n }\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), 
sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.00751407, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0074996232 + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n int n = x.size();\n std::sort(x.begin(), x.end());\n return x[k - 1];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "=4", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n // Sort the vector in ascending order.\n std::sort(x.begin(), x.end());\n\n // Return the k-th smallest element.\n return x[k - 1];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::vector sorted_x = x;\n std::sort(sorted_x.begin(), sorted_x.end());\n return sorted_x[k - 1];\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0073718026, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0074739588 + } + ] + }, + { + "generated_output": "\n\n int n = x.size();\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "hip", + "prompt": "/* Find the k-th smallest element of the vector x.\n Use AMD HIP to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\n__global__ void findKthSmallest(const int *x, size_t N, int k, int *kthSmallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int local_min = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n local_min = min(local_min, x[i]);\n }\n atomicMin(kthSmallest, local_min);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int stride = blockDim.x;\n while (stride < N) {\n if (tid < N / stride) {\n if (kthSmallest[tid] > kthSmallest[tid + stride]) {\n int tmp = kthSmallest[tid];\n kthSmallest[tid] = kthSmallest[tid + stride];\n kthSmallest[tid + stride] = tmp;\n }\n }\n __syncthreads();\n stride *= 2;\n }\n if (tid == 0) {\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n hipSort(kthSmallest, kthSmallest + N, hipSortAscending);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int local_min = x[tid];\n for (int i = tid + blockDim.x; i < N; i += blockDim.x) {\n local_min = min(local_min, x[i]);\n }\n atomicMin(kthSmallest, local_min);\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n if (tid == 0) {\n *kthSmallest = x[tid];\n } else {\n if (x[tid] < *kthSmallest) {\n *kthSmallest = x[tid];\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int numBlocks = hipGridDim_x;\n int numThreads = hipBlockDim_x;\n int numElements = numBlocks * numThreads;\n int numLevels = 0;\n while (numElements > 1) {\n numLevels++;\n numElements = (numElements + numThreads - 1) / numThreads;\n }\n for (int level = 0; level < numLevels; level++) {\n int stride = 1 << level;\n int offset = tid & (stride - 1);\n int other = tid ^ offset;\n if (offset < other && other < numElements) {\n if (kthSmallest[tid] > kthSmallest[other]) {\n 
int tmp = kthSmallest[tid];\n kthSmallest[tid] = kthSmallest[other];\n kthSmallest[other] = tmp;\n }\n }\n __syncthreads();\n }\n if (tid == 0) {\n *kthSmallest = kthSmallest[0];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = blockId * blockDim + tid;\n int end = min(start + blockDim, N);\n int local_kthSmallest = x[start];\n for (int i = start + 1; i < end; i++) {\n if (x[i] < local_kthSmallest) {\n local_kthSmallest = x[i];\n }\n }\n atomicMin(kthSmallest, local_kthSmallest);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n __shared__ int s[hipBlockDim_x];\n s[tid] = x[tid];\n __syncthreads();\n if (tid == 0) {\n hipSort(s, hipBlockDim_x);\n *kthSmallest = s[k - 1];\n }\n }\n}\n", + "K", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n int *d_temp = new int[N];\n for (int i = 0; i < N; i += 2) {\n if (i + 1 < N) {\n if (kthSmallest[i] > kthSmallest[i + 1]) {\n d_temp[i / 2] = kthSmallest[i + 1];\n d_temp[i / 2 + N / 2] = kthSmallest[i];\n } else {\n d_temp[i / 2] = kthSmallest[i];\n d_temp[i / 2 + N / 2] = kthSmallest[i + 1];\n }\n } else {\n d_temp[i / 2] = kthSmallest[i];\n }\n }\n kthSmallest = d_temp;\n __syncthreads();\n if (N % 2 == 1) {\n N = N / 2 + 1;\n } else {\n N = N / 2;\n }\n while (N > 1) {\n for (int i = 0; i < N; i += 2) {\n if (i + 1 < N) {\n if (kthSmallest[i] > kthSmallest[i + 1]) {\n d_temp[i / 2] = kthSmallest[i + 1];\n d_temp[i / 2 + N / 2] = kthSmallest[i];\n } else {\n d_temp[i / 2] = kthSmallest[i];\n d_temp[i / 2 + N / 2] = kthSmallest[i + 1];\n }\n } else {\n d_temp[i / 2] = kthSmallest[i];\n }\n }\n kthSmallest = d_temp;\n __syncthreads();\n if (N % 2 == 1) {\n N = N / 2 + 1;\n } else {\n N = N / 2;\n }\n }\n *kthSmallest = kthSmallest[0];\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n int local_kthSmallest = x[tid];\n for (int i = tid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (x[i] < local_kthSmallest) {\n local_kthSmallest = x[i];\n }\n }\n __shared__ int shared_kthSmallest;\n shared_kthSmallest = local_kthSmallest;\n for (int i = hipBlockDim_x / 2; i > 0; i /= 2) {\n __syncthreads();\n if (hipThreadIdx_x < i) {\n if (shared_kthSmallest > x[tid + i]) {\n shared_kthSmallest = x[tid + i];\n }\n }\n }\n __syncthreads();\n if (hipThreadIdx_x == 0) {\n if (k == 0) {\n *kthSmallest = shared_kthSmallest;\n } else {\n k--;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n" + ] + }, + { + 
"problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Find the k-th smallest element of the vector x.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_ranks);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_ranks);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int local_kth = x[k - 1];\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n MPI_Allreduce(&local_kth, &kth, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return kth;\n }\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth_smallest = x[0];\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n\n int global_kth_smallest;\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n if (rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + 
"\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements to rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among all the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the k-th smallest elements from all ranks\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int n = x.size();\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kthSmallest = x[k - 1];\n MPI_Gather(&kthSmallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (myRank == 0) {\n std::sort(kthSmallest, kthSmallest + numRanks);\n return kthSmallest[k - 1];\n }\n return -1;\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Sort the vector on each rank\n 
std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + ".", + "\n\n int n = x.size();\n int my_rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(nprocs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1; // Return -1 on all other ranks\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n int n = x.size();\n int local_k = k / num_procs;\n int local_start = my_rank * local_k;\n int local_end = (my_rank + 1) * local_k;\n if (my_rank == num_procs - 1) {\n local_end = n;\n }\n\n std::nth_element(x.begin() + local_start, x.begin() + local_start + local_k - 1,\n x.begin() + local_end);\n int local_kth_smallest = x[local_start + local_k - 1];\n\n int global_kth_smallest;\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN,\n MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(comm_size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (comm_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the k-th smallest element on each rank\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n int 
local_kth = x[k - 1];\n\n // Gather the k-th smallest elements from all ranks\n std::vector gathered_kth(size);\n MPI_Gather(&local_kth, 1, MPI_INT, gathered_kth.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element\n if (rank == 0) {\n std::nth_element(gathered_kth.begin(), gathered_kth.begin() + k - 1, gathered_kth.end());\n return gathered_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather all the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end());\n\n // Find the k-th smallest element on each rank\n int local_kth_smallest = x[k - 1];\n\n // Gather the local k-th smallest elements on rank 0\n std::vector global_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the k-th smallest element among the gathered elements on rank 0\n if (rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int n = x.size();\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes its local k-th smallest element\n std::sort(x.begin(), x.end());\n int local_kth_smallest = x[k - 1];\n\n // Gather all local k-th smallest elements on rank 0\n std::vector global_kth_smallest(num_procs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, global_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (my_rank == 0) {\n std::sort(global_kth_smallest.begin(), global_kth_smallest.end());\n return global_kth_smallest[k - 1];\n } else {\n return -1;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "cuda", + "prompt": "/* Find the k-th smallest element of the vector x.\n Use CUDA to compute in parallel. 
The kernel is launched with at least as many threads as values in x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\n__global__ void findKthSmallest(const int *x, size_t N, int k, int *kthSmallest) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int gid = bid * bid_size + tid;\n int val = x[gid];\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < val) {\n count++;\n }\n }\n if (count == k - 1) {\n *kthSmallest = val;\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[0] = min(kthSmallest[0], x[tid]);\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n for (int i = 1; i < blockDim.x; i++) {\n kthSmallest[0] = min(kthSmallest[0], kthSmallest[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockId = blockIdx.x;\n int blockDim = blockDim.x;\n int gridDim = gridDim.x;\n int start = tid + blockId * blockDim;\n int end = min(start + blockDim, N);\n int localKthSmallest = x[start];\n for (int i = start + 1; i < end; i++) {\n if (x[i] < localKthSmallest) {\n localKthSmallest = x[i];\n }\n }\n __syncthreads();\n if (tid == 0) {\n int globalKthSmallest = localKthSmallest;\n for (int i = 1; i < gridDim; i++) {\n int otherKthSmallest;\n cudaMemcpy(&otherKthSmallest, &globalKthSmallest, sizeof(int), cudaMemcpyDeviceToDevice);\n if (otherKthSmallest < globalKthSmallest) {\n globalKthSmallest = otherKthSmallest;\n }\n }\n *kthSmallest = globalKthSmallest;\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockDim = blockDim.x;\n int blockIdx = blockIdx.x;\n int i = blockIdx * blockDim + tid;\n if (i < N) {\n kthSmallest[tid] = x[i];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + blockDim);\n if (blockIdx == 0) {\n *kthSmallest = kthSmallest[k - 1];\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n __shared__ int shared[blockDim.x];\n shared[threadIdx.x] = x[tid];\n __syncthreads();\n for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {\n if (threadIdx.x < stride) {\n shared[threadIdx.x] = min(shared[threadIdx.x], shared[threadIdx.x + stride]);\n }\n __syncthreads();\n }\n if (threadIdx.x == 0) {\n if (k == blockIdx.x) {\n *kthSmallest = shared[0];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n int temp;\n for (int i = 1; i < blockDim.x; i++) {\n temp = *kthSmallest;\n *kthSmallest = min(*kthSmallest, x[i]);\n x[i] = min(temp, x[i]);\n }\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n temp = *kthSmallest;\n *kthSmallest = min(*kthSmallest, x[i]);\n x[i] = min(temp, x[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int idx = bid * bid_size + tid;\n \n // Find the k-th smallest element in the current block\n int block_kth_smallest = x[idx];\n for (int i = idx + bid_size; i < N; i += grid_size * bid_size) {\n if (x[i] < block_kth_smallest) {\n block_kth_smallest = x[i];\n }\n }\n \n // Find the k-th smallest element among all 
blocks\n if (tid == 0) {\n for (int i = 1; i < grid_size; i++) {\n int other_block_kth_smallest;\n cudaMemcpy(&other_block_kth_smallest, &x[i * bid_size], sizeof(int), cudaMemcpyDeviceToDevice);\n if (other_block_kth_smallest < block_kth_smallest) {\n block_kth_smallest = other_block_kth_smallest;\n }\n }\n *kthSmallest = block_kth_smallest;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < N; i++) {\n if (x[i] < *kthSmallest) {\n *kthSmallest = x[i];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n kthSmallest[0] = min(kthSmallest[0], x[tid]);\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < blockDim.x; i++) {\n kthSmallest[0] = min(kthSmallest[0], kthSmallest[i]);\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int blockDim = blockDim.x;\n int blockIdx = blockIdx.x;\n int idx = blockIdx * blockDim + tid;\n if (idx < N) {\n if (tid == 0) {\n *kthSmallest = x[idx];\n } else {\n if (x[idx] < *kthSmallest) {\n *kthSmallest = x[idx];\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n kthSmallest[0] = x[tid];\n }\n __syncthreads();\n int numThreads = blockDim.x * gridDim.x;\n while (numThreads > 1) {\n if (tid < numThreads / 2) {\n kthSmallest[tid] = min(kthSmallest[tid], kthSmallest[tid + numThreads / 2]);\n }\n __syncthreads();\n numThreads /= 2;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n int val = x[tid];\n int count = 0;\n for (int i = 0; i < N; i++) {\n if (x[i] < val) {\n count++;\n }\n }\n if (count == k - 1) {\n *kthSmallest = val;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n __shared__ int shared[blockDim.x];\n shared[threadIdx.x] = x[tid];\n __syncthreads();\n for (int i = 0; i < blockDim.x; i++) {\n if (shared[threadIdx.x] > shared[i]) {\n shared[threadIdx.x] = shared[i];\n }\n }\n __syncthreads();\n if (threadIdx.x == 0) {\n if (k == blockIdx.x + 1) {\n *kthSmallest = shared[threadIdx.x];\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n int my_kth_smallest = x[tid];\n for (int i = tid + 1; i < N; i++) {\n if (x[i] < my_kth_smallest) {\n my_kth_smallest = x[i];\n }\n }\n atomicMin(kthSmallest, my_kth_smallest);\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n int tid_warp = tid % 32;\n int warp_id = tid / 32;\n int warp_kthSmallest = *kthSmallest;\n for (int i = 1; i < 32; i *= 2) {\n __syncthreads();\n if (tid_warp < i) {\n if (warp_kthSmallest > x[tid + i * blockDim.x]) {\n warp_kthSmallest = x[tid + i * blockDim.x];\n }\n }\n }\n __syncthreads();\n if (warp_id == 0) {\n *kthSmallest = warp_kthSmallest;\n }\n __syncthreads();\n if (tid == 0) {\n for (int i = 1; i < gridDim.x; i++) {\n int other_kthSmallest;\n cudaMemcpy(&other_kthSmallest, &x[i * blockDim.x], sizeof(int), cudaMemcpyDeviceToDevice);\n if (*kthSmallest > other_kthSmallest) {\n *kthSmallest = other_kthSmallest;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n kthSmallest[tid] = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n std::sort(kthSmallest, kthSmallest + N);\n *kthSmallest = kthSmallest[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n if (tid == k - 1) {\n *kthSmallest = x[tid];\n }\n }\n}\n", + "\n\n 
int tid = threadIdx.x + blockIdx.x * blockDim.x;\n if (tid < N) {\n *kthSmallest = x[tid];\n }\n __syncthreads();\n if (tid == 0) {\n int temp;\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (x[i] > x[j]) {\n temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n *kthSmallest = x[k - 1];\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int bid_size = blockDim.x;\n int grid_size = gridDim.x;\n int start_idx = bid * bid_size + tid;\n int end_idx = min(start_idx + bid_size, N);\n int local_kth_smallest = x[start_idx];\n for (int i = start_idx + 1; i < end_idx; i++) {\n if (x[i] < local_kth_smallest) {\n local_kth_smallest = x[i];\n }\n }\n atomicMin(kthSmallest, local_kth_smallest);\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Find the k-th smallest element of the vector x.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Return the result on rank 0.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth = x[k - 1];\n int global_kth;\n\n // Find the k-th smallest element on each rank\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n if (x[i] < local_kth) {\n local_kth = x[i];\n }\n }\n }\n\n // Reduce the local k-th smallest elements to find the global k-th smallest element\n MPI_Allreduce(&local_kth, &global_kth, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);\n\n // Return the global k-th smallest element on rank 0\n if (rank == 0) {\n return global_kth;\n } else {\n return -1;\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector to store the sorted elements\n std::vector sorted_x(n);\n\n // Sort the vector in parallel using OpenMP\n #pragma omp parallel for\n for (int i = 0; i < n; i++) {\n sorted_x[i] = x[i];\n }\n std::sort(sorted_x.begin(), sorted_x.end());\n\n // Get the k-th smallest element\n int kth_smallest = sorted_x[k - 1];\n\n // Return the k-th smallest element on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n return kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n return all_kth_smallest[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, 
nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the global k-th smallest element on rank 0\n return global_kth_smallest;\n}\n", + "\n\n int n = x.size();\n int kth = -1;\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_k = k / size;\n int local_start = rank * local_k;\n int local_end = (rank + 1) * local_k;\n if (local_end > n) {\n local_end = n;\n }\n std::nth_element(x.begin() + local_start, x.begin() + local_start + local_k - 1, x.begin() + local_end);\n int local_kth = x[local_start + local_k - 1];\n MPI_Gather(&local_kth, 1, MPI_INT, &kth, 1, MPI_INT, 0, MPI_COMM_WORLD);\n if (rank == 0) {\n std::vector kth_vec(size);\n for (int i = 0; i < size; i++) {\n kth_vec[i] = kth[i];\n }\n std::nth_element(kth_vec.begin(), kth_vec.begin() + k - 1, kth_vec.end());\n kth = kth_vec[k - 1];\n }\n return kth;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int n = x.size();\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kth_smallest;\n MPI_Gather(&x[k - 1], 1, MPI_INT, &kth_smallest, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Find the local kth smallest element\n std::nth_element(x.begin() + rank * x.size() / size,\n x.begin() + rank * x.size() / size + x.size() / size - 1,\n x.begin() + rank * x.size() / size + k - 1);\n int local_kth = *(x.begin() + rank * x.size() / size + k - 1);\n\n // Gather all local kth smallest elements\n std::vector all_local_kth(size);\n MPI_Gather(&local_kth, 1, MPI_INT, all_local_kth.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global kth smallest element\n if (rank == 0) {\n std::nth_element(all_local_kth.begin(), all_local_kth.begin() + k - 1, all_local_kth.end());\n return all_local_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = x.size();\n int local_k = k / size;\n int local_remainder = k % size;\n int local_start = rank * local_k;\n int local_end = local_start + local_k;\n if (rank < local_remainder) {\n local_start += rank;\n local_end += rank + 1;\n } else {\n local_start += local_remainder;\n local_end += local_remainder;\n }\n\n std::vector local_x(x.begin() + local_start, x.begin() + local_end);\n std::sort(local_x.begin(), local_x.end());\n\n int local_kth_smallest = local_x[local_k - 1];\n if (rank == 0) {\n std::vector kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest.begin(), kth_smallest.end());\n return kth_smallest[k - 1];\n } else {\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int kth_smallest;\n MPI_Gather(&x[k - 1], 1, MPI_INT, &kth_smallest, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among the received elements\n if (rank == 0) {\n std::vector received_elements(size);\n MPI_Gather(&x[k - 1], 1, MPI_INT, received_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(received_elements.begin(), received_elements.end());\n return received_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the global k-th smallest element on rank 0\n return global_kth_smallest;\n}\n", + "\n\n // Find the local k-th smallest element on each rank.\n int local_kth_smallest = *std::max_element(x.begin(), x.end());\n int global_kth_smallest;\n\n // Find the global k-th smallest element.\n MPI_Allreduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN);\n\n // Find the k-th smallest element on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n return x[k - 1];\n } else {\n return global_kth_smallest;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), 
all_kth_smallest.end());\n global_kth_smallest = all_kth_smallest[k - 1];\n }\n\n // Return the result on rank 0\n return global_kth_smallest;\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int n = x.size();\n int local_k = k / nprocs;\n int local_remainder = k % nprocs;\n\n // Find the local k-th smallest element on each rank\n std::nth_element(x.begin(), x.begin() + local_k + (rank == nprocs - 1 ? local_remainder : 0), x.end());\n int local_kth_smallest = x[local_k + (rank == nprocs - 1 ? local_remainder : 0)];\n\n // Gather all local k-th smallest elements on rank 0\n std::vector kth_smallest_elements(nprocs);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element on rank 0\n if (rank == 0) {\n std::nth_element(kth_smallest_elements.begin(), kth_smallest_elements.begin() + k - 1, kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_kth = x[0];\n int global_kth;\n\n // Find the k-th smallest element on each rank\n #pragma omp parallel reduction(min : local_kth)\n {\n #pragma omp for\n for (int i = 1; i < x.size(); i++) {\n if (x[i] < local_kth) {\n local_kth = x[i];\n }\n }\n }\n\n // Gather the local k-th smallest elements from all ranks\n MPI_Gather(&local_kth, 1, MPI_INT, &global_kth, 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Find the global k-th smallest element\n if (rank == 0) {\n std::vector gathered_kth(size);\n for (int i = 0; i < size; i++) {\n gathered_kth[i] = global_kth[i];\n }\n std::sort(gathered_kth.begin(), gathered_kth.end());\n return gathered_kth[k - 1];\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n if (rank == 0) {\n std::vector kth_smallest_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, kth_smallest_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(kth_smallest_elements.begin(), kth_smallest_elements.end());\n return kth_smallest_elements[k - 1];\n }\n\n return -1;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x\n std::sort(x.begin(), x.end());\n\n // Each rank sends its k-th smallest element to rank 0\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element\n int global_kth_smallest;\n if (rank == 0) {\n std::vector gathered_elements(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, gathered_elements.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(gathered_elements.begin(), gathered_elements.end());\n global_kth_smallest = gathered_elements[k - 1];\n }\n\n return global_kth_smallest;\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank sorts its own copy of x.\n std::sort(x.begin(), x.end());\n\n // Each rank 
sends its k-th smallest element to rank 0.\n int local_kth_smallest = x[k - 1];\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the global k-th smallest element.\n if (rank == 0) {\n std::vector all_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n std::sort(all_kth_smallest.begin(), all_kth_smallest.end());\n return all_kth_smallest[k - 1];\n }\n\n return -1; // Dummy return value for non-rank-0 processes.\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n int local_k = k / num_ranks;\n int local_start = my_rank * local_k;\n int local_end = (my_rank + 1) * local_k;\n\n std::sort(x.begin() + local_start, x.begin() + local_end);\n\n int local_kth_smallest = x[local_start + local_k - 1];\n\n int global_kth_smallest;\n MPI_Reduce(&local_kth_smallest, &global_kth_smallest, 1, MPI_INT, MPI_MIN, 0, MPI_COMM_WORLD);\n\n if (my_rank == 0) {\n return global_kth_smallest;\n } else {\n return -1;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the k-th smallest element of its local data\n std::nth_element(x.begin(), x.begin() + k - 1, x.end());\n int local_kth_smallest = x[k - 1];\n\n // Gather all local k-th smallest elements to rank 0\n std::vector all_local_kth_smallest(size);\n MPI_Gather(&local_kth_smallest, 1, MPI_INT, all_local_kth_smallest.data(), 1, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 finds the k-th smallest element among all local k-th smallest elements\n if (rank == 0) {\n std::nth_element(all_local_kth_smallest.begin(), all_local_kth_smallest.begin() + k - 1, all_local_kth_smallest.end());\n return all_local_kth_smallest[k - 1];\n } else {\n return -1; // Dummy return value for non-rank-0 processes\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "kokkos", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use Kokkos to sort in parallel. 
Assume Kokkos has already been initialized.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(Kokkos::View &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA (const int& i, const int& j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n using exec_space = KokKOS::EXECSPACE;\n using device_type = KokKOS::DEVICE_TYPE;\n\n auto policy = KokKOS::parallel_sort_ascending(results,\n [](const Result &a, const Result &b) { return a.duration < b.duration; },\n exec_space(),\n device_type());\n auto result = KokKOS::sort_view(results, policy);\n results.copy(result);\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) -> bool { return results(i)->startTime < results(j)->startTime; });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int i, const int j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int& i, const int& j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::sort(results, Kokkos::Experimental::compare_fun(\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; 
}));\n}\n", + "\n\n Kokkos::parallel_sort(results.extent(0),\n KOKKOS_LAMBDA(const int &i, const int &j) {\n return results(i)->startTime < results(j)->startTime;\n });\n}\n", + "\n\n Kokkos::parallel_sort(Kokkos::RangePolicy(0, results.extent(0)),\n [results](const int &i, const int &j) { return results(i)->startTime < results(j)->startTime; });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "omp", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use OpenMP to sort in parallel.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + ".", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = 0; j < results.size() - 1; j++) {\n if (results[j].startTime > results[j + 1].startTime) {\n std::swap(results[j], results[j + 1]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < static_cast(results.size()); i++) {\n for (int j = i + 1; j < static_cast(results.size()); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size() - 1; i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size(); i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n#pragma opm parallel\n#pragma opm reduction(merge:results:opm_sort(results, [](const Result& a, const Result& b) { return a.startTime < b.startTime; }))\n{\n#pragma opm sections\n{\n#pragma opm section\n{\n std::sort(results.begin(), 
results.end(), [](const Result& a, const Result& b) { return a.startTime < b.startTime; });\n}\n}\n}\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < results.size() - 1; i++) {\n for (size_t j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma opm parallel\n {\n #pragma opm for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size() - 1; i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < results.size(); i++) {\n for (int j = i + 1; j < results.size(); j++) {\n if (results[i].startTime > results[j].startTime) {\n std::swap(results[i], results[j]);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "serial", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004605472, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0044822076 + } + ] + }, + { + "generated_output": "\n\n std::sort(results.begin(), 
results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045867768, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.004503957 + } + ] + }, + { + "generated_output": "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045838732, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.004543511 + } + ] + }, + { + "generated_output": "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045345648, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045220328 + } + ] + }, + { + "generated_output": "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004560447, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.004491932 + } + ] + }, + { + "generated_output": "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004572576, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045227602 + } + ] + }, + { + "generated_output": "<*>", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.start < b.start;\n });\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004539614, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045369808 + } + ] + }, + { + "generated_output": "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n 
return a.startTime < b.startTime;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045474752, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.004552951 + } + ] + }, + { + "generated_output": "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0046101426, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045096078 + } + ] + }, + { + "generated_output": "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045278414, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045250466 + } + ] + }, + { + "generated_output": "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0046032916, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0044908322 + } + ] + }, + { + "generated_output": "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045537264, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045302986 + } + ] + }, + { + "generated_output": "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004491928, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045201174 + } + ] + }, + { + "generated_output": "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045246298, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045335244 + } + ] + }, + { + "generated_output": "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + 
"best_sequential_runtime": 0.0045722572, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045009892 + } + ] + }, + { + "generated_output": "\n\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004587089, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045384516 + } + ] + }, + { + "generated_output": "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.004589127, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045425956 + } + ] + }, + { + "generated_output": "\n\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0045760746, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0045106834 + } + ] + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "hip", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use AMD HIP to sort in parallel. 
The kernel is launched with at least as many threads as there are elements.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\n__global__ void sortByStartTime(Result *results, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int idx = threadIdx.x + blockIdx.x * blockDim.x;\n if (idx >= N) {\n return;\n }\n\n Result temp = results[idx];\n int minIdx = idx;\n for (int i = idx + 1; i < N; i++) {\n if (results[i].startTime < results[minIdx].startTime) {\n minIdx = i;\n }\n }\n\n results[idx] = results[minIdx];\n results[minIdx] = temp;\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n for (size_t i = threadIdx.x; i < N; i += blockDim.x) {\n for (size_t j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid >= N) return;\n for (int i = gid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int 
bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid < N) {\n __shared__ Result sharedResults[256];\n sharedResults[tid] = results[gid];\n __syncthreads();\n int stride = 256;\n while (stride > 0) {\n if (tid < stride && tid + stride < N && sharedResults[tid].startTime > sharedResults[tid + stride].startTime) {\n Result temp = sharedResults[tid];\n sharedResults[tid] = sharedResults[tid + stride];\n sharedResults[tid + stride] = temp;\n }\n __syncthreads();\n stride /= 2;\n }\n results[gid] = sharedResults[tid];\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n for (int i = gid + 1; i < N; i++) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = bid * hipBlockDim_x + tid;\n if (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (results[i].startTime > results[j].startTime) {\n Result temp = results[i];\n results[i] = results[j];\n results[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n if (gid >= N) {\n return;\n }\n for (int i = gid + hipBlockDim_x; i < N; i += hipBlockDim_x) {\n if (results[gid].startTime > results[i].startTime) {\n Result temp = results[gid];\n results[gid] = results[i];\n results[i] = temp;\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "mpi", + "prompt": "#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use MPI to sort in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of results. Store the output in results on rank 0.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + ".", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n results = gatheredResults;\n } else {\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n 
}\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.start < b.start; });\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n std::vector gatheredResults;\n for (int i = 1; i < size; i++) {\n int numResults;\n MPI_Recv(&numResults, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n std::vector recvResults(numResults);\n MPI_Recv(recvResults.data(), numResults * sizeof(Result), MPI_BYTE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n gatheredResults.insert(gatheredResults.end(), recvResults.begin(), recvResults.end());\n }\n results = gatheredResults;\n } else {\n // Send the number of results and the results to rank 0\n int numResults = results.size();\n MPI_Send(&numResults, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);\n MPI_Send(results.data(), numResults * sizeof(Result), MPI_BYTE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n 
std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(size * results.size());\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, nullptr, 0, MPI_FLOAT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; 
});\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the sorted results on rank 0\n if (rank == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort locally\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather all results to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_DOUBLE, gatheredResults.data(), results.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Sort gathered results\n if (rank == 0) 
{\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "cuda", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use CUDA to sort in parallel. The kernel is launched with at least as many threads as there are elements.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\n__global__ void sortByStartTime(Result *results, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[i].startTime < results[tid].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < 
N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + ".", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result tmp = results[tid];\n results[tid] = results[i];\n results[i] = tmp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = threadIdx.x;\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (results[tid].startTime > results[i].startTime) {\n Result temp = results[tid];\n results[tid] = results[i];\n results[i] = temp;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nstruct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Use MPI and OpenMP to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of results. 
Store the outputs in results on rank 0.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks on rank 0\n std::vector allResults(results.size() * numRanks);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, allResults.data(), results.size(), MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n\n // Sort all results on rank 0\n if (myRank == 0) {\n std::sort(allResults.begin(), allResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = allResults;\n }\n}\n", + "\n\n int numRanks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather all results to rank 0\n std::vector allResults;\n if (rank == 0) {\n allResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, allResults.data(), results.size(),\n MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort all results on rank 0\n if (rank == 0) {\n std::sort(allResults.begin(), allResults.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = allResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, 
&rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(size * results.size());\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, gatheredResults.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the gathered results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, gatheredResults.data(), results.size(), MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n results = std::move(gatheredResults);\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_NULL, nullptr, 0, MPI_DATATYPE_NULL, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), 
results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather sorted results on rank 0\n if (rank == 0) {\n std::vector gatheredResults(results.size() * size);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, gatheredResults.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results = gatheredResults;\n } else {\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the outputs in results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (myRank == 0) {\n gatheredResults.resize(results.size() * numRanks);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (myRank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result 
&b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Sort results by start time using MPI_Allgather and OpenMP\n int numRanks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort results on each rank\n #pragma openmp parallel\n {\n #pragma openmp for\n for (int i = 0; i < results.size(); i++) {\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) {\n return a.startTime < b.startTime;\n });\n }\n }\n\n // Gather sorted results on rank 0\n std::vector sortedResults;\n if (rank == 0) {\n int totalSize = results.size() * numRanks;\n MPI_Gather(&results[0], results.size(), MPI_DATATYPE_RESULT, &results[0], results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n results.resize(totalSize);\n } else {\n MPI_Gather(&results[0], results.size(), MPI_DATATYPE_RESULT, nullptr, 0, MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n if (rank == 0) {\n gatheredResults.resize(results.size() * size);\n }\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Sort gathered results on rank 0\n if (rank == 0) {\n std::sort(gatheredResults.begin(), gatheredResults.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n results = gatheredResults;\n }\n}\n", + "\n\n // Get the number of processes\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Get the rank of the process\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the results on each process\n std::sort(results.begin(), results.end(),\n [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather the sorted results from all processes\n std::vector sorted_results(results.size() * num_procs);\n MPI_Gather(results.data(), results.size(), MPI_DATATYPE_RESULT,\n sorted_results.data(), results.size(), MPI_DATATYPE_RESULT, 0, MPI_COMM_WORLD);\n\n // Copy the sorted results back to the input vector on rank 0\n if (rank == 0) {\n results = std::move(sorted_results);\n }\n}\n", + "\n\n // Sort results on each rank\n std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n // Gather results from all ranks to rank 0\n std::vector gatheredResults;\n MPI_Gather(results.data(), results.size(), MPI_FLOAT, gatheredResults.data(), results.size(), MPI_FLOAT, 0, MPI_COMM_WORLD);\n\n // Store the gathered results in results on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n results = gatheredResults;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n 
std::sort(results.begin(), results.end(), [](const Result &a, const Result &b) { return a.startTime < b.startTime; });\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recvResults;\n MPI_Recv(&recvResults, results.size(), MPI_FLOAT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n results.insert(results.end(), recvResults.begin(), recvResults.end());\n }\n } else {\n MPI_Send(&results, results.size(), MPI_FLOAT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Sort the array x of complex numbers by their magnitude in ascending order.\n Use Kokkos to sort in parallel. Assume Kokkos has already been initialized.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(Kokkos::View*> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), [&](int i, int j) -> bool {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return Kokkos::abs(x(i)) < Kokkos::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n 
Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int &i, const int &j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n", + "\n\n Kokkos::parallel_sort(x.extent(0), KOKKOS_LAMBDA(const int i, const int j) {\n return std::abs(x(i)) < std::abs(x(j));\n });\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use OpenMP to sort in parallel.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Sort the vector in parallel using OpenMP\n #pragma omp parallel\n {\n // Get the number of threads\n int num_threads = omp_get_num_threads();\n\n // Get the thread ID\n int thread_id = omp_get_thread_num();\n\n // Sort the portion of the vector assigned to this thread\n std::sort(x.begin() + thread_id, x.begin() + thread_id + num_threads);\n }\n\n // Merge the sorted portions of the vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "Magnitude:1. Moderato Moderato Moderato: 0. Moderato: Moderato: Moderato:forte forte forte piano piano piano piano piano piano piano: piano: piano: Moderato: piano: piano: forte forte forte: forte forte forte: forte: forte: piano piano piano piano piano piano: Piano: Piano: Moderato: Moderato: Moderato: piano: piano piano: forte forte forte: forte: forte: forte: Moderato:", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n x[i] = std::polar(std::abs(x[i]), std::arg(x[i]));\n }\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel\n {\n #pragma omp for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); ++i) {\n for (size_t j = i + 1; j < x.size(); ++j) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < x.size(); i++) {\n for (size_t j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::complex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "(x)", + "\n\n // Sort the vector x by the magnitude of its elements in ascending order.\n 
std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::complex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n // Sort the vector x by the magnitude of its elements in ascending order.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n", + "\n\n#pragma omp parallel for\n for (int i = 0; i < x.size(); i++) {\n for (int j = i + 1; j < x.size(); j++) {\n if (std::abs(x[i]) > std::abs(x[j])) {\n std::swap(x[i], x[j]);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "serial", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0791514982, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0794029612 + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0795447402, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0796052798 + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const 
std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.079481491, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0799286426 + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0796826138, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0795575528 + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0798698244, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0797476522 + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0797245226, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0796607124 + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0793483788, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0790219044 + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0795242578, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0792881832 + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0799757776, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793646474 + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0798781898, + "runs": [ + { + 
"did_run": true, + "is_valid": true, + "runtime": 0.079408057 + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0798671732, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.079857739 + } + ] + }, + { + "generated_output": ":", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.079468721, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0792738084 + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return abs(a) < abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0797225288, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0797027596 + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0790590178, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.079505324 + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0798643554, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0793960542 + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.079497449, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0796431192 + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0796809528, + "runs": [ + { + "did_run": true, + "is_valid": 
true, + "runtime": 0.0794151424 + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0798959932, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0799274124 + } + ] + }, + { + "generated_output": "\n\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0797476202, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0794739994 + } + ] + } + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "hip", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use AMD HIP to sort in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\n__global__ void sortComplexByMagnitude(hipDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double mag = hipCrealf(x[tid]) * hipCrealf(x[tid]) + hipCrealf(x[tid]) * hipCrealf(x[tid]);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n double temp = x[tid].x;\n x[tid].x = x[i].x;\n x[i].x = temp;\n hipDoubleComplex temp_c = x[tid];\n x[tid] = x[i];\n x[i] = temp_c;\n }\n }\n }\n}\n", + "\n\n // Compute the magnitude of each complex number\n float *mag = new float[N];\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes\n std::sort(mag, mag + N);\n\n // Sort the complex numbers by their magnitudes\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (mag[i] == sqrt(x[j].x * x[j].x + x[j].y * x[j].y)) {\n x[i] = x[j];\n break;\n }\n }\n }\n\n delete[] mag;\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Get the block index\n int bid = hipBlockIdx_x;\n\n // Calculate the index of the element to be sorted\n int i = bid * hipBlockDim_x + tid;\n\n // Check if the index is within the bounds of the array\n if (i < N) {\n // Get the magnitude of the complex number\n double magnitude = hip_abs(x[i]);\n\n // Sort the complex number by its magnitude\n for (int j = i + 1; j < N; j++) {\n double magnitude2 = hip_abs(x[j]);\n if (magnitude > magnitude2) {\n // Swap the complex numbers\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n\n // Update the magnitude of the complex number\n magnitude = magnitude2;\n }\n }\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int gid = bid * hipBlockDim_x + tid;\n\n if (gid < N) {\n double mag = hip_abs(x[gid]);\n x[gid] = make_hipDoubleComplex(mag, 0.0);\n }\n}\n", + "\n\n size_t tid = hipThreadIdx_x + hipBlockIdx_x * 
hipBlockDim_x;\n if (tid < N) {\n double magnitude = hip_abs(x[tid]);\n x[tid] = make_hipDoubleComplex(magnitude, 0.0);\n }\n __syncthreads();\n hipSort(x, x + N, hipDoubleComplexGreater);\n __syncthreads();\n if (tid < N) {\n double real = hip_real(x[tid]);\n double imag = hip_imag(x[tid]);\n x[tid] = make_hipDoubleComplex(real, imag);\n }\n}\n", + "\n\n // Get thread and block indices\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int bid_size = hipBlockDim_x;\n\n // Compute the index of the element to be processed by this thread\n int index = bid * bid_size + tid;\n\n // If the index is out of range, exit\n if (index >= N) {\n return;\n }\n\n // Get the current element\n hipDoubleComplex current_element = x[index];\n\n // Compute the magnitude of the current element\n double magnitude = hip_abs(current_element);\n\n // Find the index of the element with the smallest magnitude so far\n int min_index = index;\n for (int i = index + 1; i < N; i++) {\n double current_magnitude = hip_abs(x[i]);\n if (current_magnitude < magnitude) {\n min_index = i;\n magnitude = current_magnitude;\n }\n }\n\n // Swap the current element with the element with the smallest magnitude so far\n hipDoubleComplex temp = x[index];\n x[index] = x[min_index];\n x[min_index] = temp;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = hipMag(x[tid]);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n hipSort(x, x + N, hipDoubleComplexCompare);\n }\n}\n", + "\n\n size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n hipDoubleComplex c = x[tid];\n x[tid] = hipDoubleComplex{sqrt(c.x * c.x + c.y * c.y), 0.0};\n }\n}\n", + "\n\n // Sort the array using the radix sort algorithm\n for (int i = 0; i < N; i++) {\n int key = abs(x[i]);\n int j = i - 1;\n\n while (j >= 0 && abs(x[j]) > key) {\n x[j + 1] = x[j];\n j--;\n }\n x[j + 1] = hipDoubleComplex{key, 0};\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Sort the local part of the array\n std::sort(x + tid, x + tid + hipBlockDim_x);\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted parts of the array\n for (int stride = hipBlockDim_x; stride < N; stride *= 2) {\n for (int i = 0; i < N; i += 2 * stride) {\n if (hipAbs(x[i]) > hipAbs(x[i + stride])) {\n std::swap(x[i], x[i + stride]);\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += hipBlockDim_x * hipGridDim_x;\n }\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Get the block index\n int bid = hipBlockIdx_x;\n\n // Calculate the index of the element to be sorted\n int index = bid * hipBlockDim_x + tid;\n\n // Check if the index is within the bounds of the array\n if (index < N) {\n // Get the magnitude of the complex number\n double magnitude = hipMag(x[index]);\n\n // Sort the complex number by its magnitude\n for (int i = index + 1; i < N; i++) {\n double magnitude_i = hipMag(x[i]);\n if (magnitude > magnitude_i) {\n // Swap the complex numbers\n hipDoubleComplex temp = x[index];\n x[index] = x[i];\n x[i] = temp;\n\n // Update the magnitude of the complex number\n magnitude = magnitude_i;\n }\n }\n }\n}\n", + "\n\n // Get the thread index\n int tid = hipThreadIdx_x;\n\n // Sort the local elements\n for (int i = tid; i < 
N; i += hipBlockDim_x) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + hipBlockDim_x; j < N; j += hipBlockDim_x) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n // Compute the magnitude of each complex number\n float *mag = new float[N];\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes in ascending order\n std::sort(mag, mag + N);\n\n // Sort the complex numbers by their magnitude\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (mag[i] == sqrt(x[j].x * x[j].x + x[j].y * x[j].y)) {\n x[i] = x[j];\n break;\n }\n }\n }\n\n delete[] mag;\n}\n", + "\n\n // Calculate the magnitude of each complex number\n float *mag = (float *)malloc(N * sizeof(float));\n for (size_t i = 0; i < N; i++) {\n mag[i] = sqrtf(x[i].x * x[i].x + x[i].y * x[i].y);\n }\n\n // Sort the magnitudes using the hipSort function\n hipSort(mag, N, hipFloat);\n\n // Sort the complex numbers based on the sorted magnitudes\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (mag[i] > mag[j]) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n free(mag);\n}\n", + "\n\n int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (tid < N) {\n double magnitude = hipMagnitude(x[tid]);\n x[tid].x = magnitude;\n }\n __syncthreads();\n if (tid < N) {\n hipSort(x, N, sizeof(hipDoubleComplex), hipDoubleComplexMagnitudeCompare);\n }\n}\n", + "\n\n // Sort the vector x by magnitude in ascending order.\n // Use AMD HIP to sort in parallel. 
The kernel is launched with at least as many threads as elements in x.\n // Example:\n //\n // input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n // output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n\n // Get the thread index.\n int tid = hipThreadIdx_x;\n\n // Get the block index.\n int bid = hipBlockIdx_x;\n\n // Get the block size.\n int bsize = hipBlockDim_x;\n\n // Get the grid size.\n int gsize = hipGridDim_x;\n\n // Calculate the starting index for this thread.\n int start = tid + bid * bsize;\n\n // Calculate the ending index for this thread.\n int end = min(start + bsize, N);\n\n // Sort the elements in this thread's range.\n sort(x + start, x + end, [](hipDoubleComplex a, hipDoubleComplex b) { return abs(a) < abs(b); });\n\n // Synchronize all threads.\n hipDeviceSynchronize();\n\n // If this is the first thread in the grid, merge the sorted ranges.\n if (tid == 0) {\n for (int i = 1; i < gsize; i++) {\n // Get the starting index for this thread.\n int start = i * bsize;\n\n // Get the ending index for this thread.\n int end = min(start + bsize, N);\n\n // Merge the sorted ranges.\n merge(x, x + start, x + start, x + end, x);\n }\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Get the block ID\n int bid = hipBlockIdx_x;\n\n // Get the block size\n int bsize = hipBlockDim_x;\n\n // Get the number of blocks\n int nblocks = hipGridDim_x;\n\n // Calculate the starting index for this thread\n int start = tid + bid * bsize;\n\n // Calculate the ending index for this thread\n int end = min(start + bsize, N);\n\n // Sort the local portion of the array\n sort(x + start, x + end, [](hipDoubleComplex a, hipDoubleComplex b) {\n return abs(a) < abs(b);\n });\n\n // Synchronize all threads\n hipDeviceSynchronize();\n\n // Merge the sorted local portions of the array\n for (int i = 1; i < nblocks; i *= 2) {\n if (bid % (2 * i) == 0) {\n int partner = bid + i;\n if (partner < nblocks) {\n int start = i * bsize;\n int end = min(start + 2 * bsize, N);\n merge(x + start, x + start + bsize, x + start + bsize, x + end,\n x + start);\n }\n }\n hipDeviceSynchronize();\n }\n}\n", + "\n\n // Get the thread ID\n int tid = hipThreadIdx_x;\n\n // Sort the local elements\n for (int i = tid; i < N; i += hipBlockDim_x) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n\n // Synchronize the threads\n __syncthreads();\n\n // Merge the sorted local elements\n for (int s = 1; s < N; s *= 2) {\n for (int i = tid; i < N; i += 2 * s) {\n int j = i + s;\n if (j < N && hipAbs(x[i]) > hipAbs(x[j])) {\n hipDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ hipDoubleComplex block[1024];\n block[threadIdx.x] = x[threadIdx.x];\n __syncthreads();\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (hipAbs(block[i]) > hipAbs(block[j])) {\n hipDoubleComplex temp = block[i];\n block[i] = block[j];\n block[j] = temp;\n }\n }\n }\n __syncthreads();\n\n // Copy the sorted elements back to the global memory\n x[threadIdx.x] = block[threadIdx.x];\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use MPI to sort in parallel. 
Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered;\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, store the gathered vector in x\n if (rank == 0) {\n x = gathered;\n }\n}\n", + "\n\n // Get the size and rank of the current process\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the local vector of complex numbers\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, 
MPI_COMM_WORLD);\n\n // Sort the gathered vector\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector to rank 0\n if (rank == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vectors on rank 0\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vectors from rank 0 to all ranks\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n 
MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::sort(x.begin(), x.end(), [&magnitudes](std::complex a, std::complex b) {\n return magnitudes[a] < magnitudes[b];\n });\n\n // Gather the sorted vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector> recv;\n MPI::COMM_WORLD.Recv(recv, i, 0);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI::COMM_WORLD.Send(x, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Collect the sorted vectors on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector> recv;\n MPI_Recv(&recv, x.size(), MPI_COMPLEX, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n x.insert(x.end(), recv.begin(), recv.end());\n }\n } else {\n MPI_Send(&x[0], x.size(), MPI_COMPLEX, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = std::move(sorted_x);\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks\n std::vector> gathered_x(x.size() * size);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector\n if (rank == 0) {\n std::sort(gathered_x.begin(), gathered_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector to rank 0\n MPI_Scatter(gathered_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks 
to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Replace the original vector on rank 0 with the sorted vector\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the size and rank of the communicator\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n MPI_Comm comm = MPI_COMM_WORLD;\n int rank, size;\n MPI_Comm_rank(comm, &rank);\n MPI_Comm_size(comm, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted;\n if (rank == 0) {\n sorted.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, comm);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(sorted.begin(), sorted.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(sorted.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, comm);\n}\n", + "\n\n // Get the size and rank of the communicator\n int size, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n if (rank == 0) {\n std::vector> sorted_x(size * x.size());\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::vector> sorted_x(n);\n for (int i = 0; i < n; i++) {\n for (int j = 0; j < n; j++) {\n if (std::abs(x[j]) == magnitudes[i]) {\n sorted_x[i] = x[j];\n break;\n }\n }\n }\n\n // Gather the sorted vector on rank 0\n MPI_Gather(&sorted_x[0], n, MPI_DOUBLE_COMPLEX, &x[0], n, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n // Get the magnitude of the complex number\n double magnitude = std::abs(x[i]);\n // Store the magnitude in the vector\n mag[i] = magnitude;\n }\n\n // Sort the vector of magnitudes\n 
std::sort(mag.begin(), mag.end());\n\n // Sort the vector of complex numbers by magnitude\n std::sort(x.begin(), x.end(), [&](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n}\n", + "\n\n // Get the size of the vector\n int n = x.size();\n\n // Create a vector of magnitudes\n std::vector magnitudes(n);\n for (int i = 0; i < n; i++) {\n magnitudes[i] = std::abs(x[i]);\n }\n\n // Sort the magnitudes\n std::sort(magnitudes.begin(), magnitudes.end());\n\n // Sort the vector of complex numbers by magnitude\n std::vector> sorted_x(n);\n for (int i = 0; i < n; i++) {\n for (int j = 0; j < n; j++) {\n if (std::abs(x[j]) == magnitudes[i]) {\n sorted_x[i] = x[j];\n break;\n }\n }\n }\n\n // Gather the sorted vector on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); i++) {\n std::vector> recv_x(n);\n MPI::COMM_WORLD.Recv(recv_x.data(), n, MPI::DOUBLE, i, 0);\n sorted_x.insert(sorted_x.end(), recv_x.begin(), recv_x.end());\n }\n } else {\n MPI::COMM_WORLD.Send(sorted_x.data(), n, MPI::DOUBLE, 0, 0);\n }\n\n // Update the vector x on rank 0\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n x = sorted_x;\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "cuda", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use CUDA to sort in parallel. The kernel is launched with at least as many threads as elements in x.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\n__global__ void sortComplexByMagnitude(cuDoubleComplex *x, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(mag, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n cuDoubleComplex tmp = x[tid];\n int i = tid;\n while (i > 0 && cuCabs(x[i - 1]) > cuCabs(tmp)) {\n x[i] = x[i - 1];\n i--;\n }\n x[i] = tmp;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = mag;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid].x = x[tid].y;\n x[tid].y = 0.0;\n }\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ cuDoubleComplex block_data[256];\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int block_size = blockDim.x;\n int start = bid * block_size;\n int end = min(start + block_size, N);\n for (int i = start; i < end; i++) {\n block_data[tid] = x[i];\n __syncthreads();\n for (int j = tid + 1; j < block_size; j++) {\n if (abs(block_data[tid]) > abs(block_data[j])) {\n cuDoubleComplex temp = block_data[tid];\n block_data[tid] = block_data[j];\n block_data[j] = temp;\n }\n }\n __syncthreads();\n }\n\n // Copy the sorted elements back to the global memory\n for (int i = start; i < end; i++) {\n x[i] = block_data[tid];\n __syncthreads();\n }\n}\n", + "\n\n // Get the thread ID\n int tid = threadIdx.x;\n\n // Sort the local part of the array\n thrust::sort(x + tid, x + tid + min(N - tid, blockDim.x),\n [](cuDoubleComplex a, cuDoubleComplex b) { return cuCabs(a) < 
cuCabs(b); });\n\n // Synchronize threads\n __syncthreads();\n\n // Merge the sorted parts of the array\n for (int s = blockDim.x; s <= N; s *= 2) {\n if (tid < N - s) {\n cuDoubleComplex a = x[tid];\n cuDoubleComplex b = x[tid + s];\n if (cuCabs(a) > cuCabs(b)) {\n x[tid] = b;\n x[tid + s] = a;\n }\n }\n __syncthreads();\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = cuCabs(x[tid]);\n x[tid] = cuCmul(x[tid], cuCdiv(1.0, magnitude));\n }\n}\n", + "\n\n // Get the thread index\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n\n // If the thread index is within the bounds of the array, sort the element\n if (tid < N) {\n // Calculate the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n\n // Sort the element by magnitude\n for (int i = tid + 1; i < N; i++) {\n double otherMagnitude = sqrt(x[i].x * x[i].x + x[i].y * x[i].y);\n if (magnitude > otherMagnitude) {\n // Swap the elements\n cuDoubleComplex temp = x[tid];\n x[tid] = x[i];\n x[i] = temp;\n\n // Update the magnitude of the current element\n magnitude = otherMagnitude;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = mag;\n x[tid].y = 0.0;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n }\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid].x = creal(x[tid]);\n x[tid].y = cimag(x[tid]);\n }\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n while (i < N) {\n cuDoubleComplex temp;\n for (int j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n x[tid].x = magnitude;\n }\n __syncthreads();\n if (tid < N) {\n for (int i = tid + 1; i < N; i++) {\n if (x[tid].x > x[i].x) {\n double temp = x[tid].x;\n x[tid].x = x[i].x;\n x[i].x = temp;\n cuDoubleComplex temp2 = x[tid];\n x[tid] = x[i];\n x[i] = temp2;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n for (int i = tid + 1; i < N; i++) {\n double mag_i = cuCabs(x[i]);\n if (mag > mag_i) {\n cuDoubleComplex tmp = x[tid];\n x[tid] = x[i];\n x[i] = tmp;\n mag = mag_i;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= N) return;\n \n // Compute the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n \n // Store the magnitude in shared memory\n __shared__ double magnitudes[N];\n magnitudes[tid] = magnitude;\n \n // Synchronize threads\n __syncthreads();\n \n // Sort the magnitudes in shared memory\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (magnitudes[i] > magnitudes[j]) {\n double temp = magnitudes[i];\n magnitudes[i] = magnitudes[j];\n magnitudes[j] = temp;\n }\n }\n }\n \n // Synchronize threads\n __syncthreads();\n \n // Find the index of the complex number with the given magnitude\n int index = -1;\n for (int i = 0; i < N; i++) {\n if (magnitudes[i] == magnitude) {\n index = i;\n break;\n }\n }\n \n // Swap the complex numbers\n cuDoubleComplex temp = 
x[tid];\n x[tid] = x[index];\n x[index] = temp;\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid >= N) return;\n \n double magnitude = cuCabs(x[tid]);\n x[tid] = cuCmul(x[tid], cuCdiv(1.0, magnitude));\n \n __shared__ double shared_magnitude[blockDim.x];\n shared_magnitude[threadIdx.x] = magnitude;\n __syncthreads();\n \n for (int i = 0; i < blockDim.x; i++) {\n if (tid + i + 1 < N && shared_magnitude[tid] > shared_magnitude[tid + i + 1]) {\n double temp = shared_magnitude[tid];\n shared_magnitude[tid] = shared_magnitude[tid + i + 1];\n shared_magnitude[tid + i + 1] = temp;\n \n cuDoubleComplex temp_complex = x[tid];\n x[tid] = x[tid + i + 1];\n x[tid + i + 1] = temp_complex;\n }\n }\n \n}\n", + "\n\n // Each thread sorts a single element of x.\n int tid = threadIdx.x;\n if (tid < N) {\n // Compute the magnitude of the complex number.\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n // Store the magnitude in the complex number.\n x[tid].x = magnitude;\n }\n // Sort the complex numbers by their magnitude.\n thrust::sort(x, x + N, thrust::less());\n}\n", + "\n\n // Sort the elements in the current block\n __shared__ cuDoubleComplex block[32];\n block[threadIdx.x] = x[blockIdx.x * blockDim.x + threadIdx.x];\n __syncthreads();\n for (int i = 0; i < blockDim.x; i++) {\n for (int j = i + 1; j < blockDim.x; j++) {\n if (cuCabsf(block[i]) > cuCabsf(block[j])) {\n cuDoubleComplex temp = block[i];\n block[i] = block[j];\n block[j] = temp;\n }\n }\n }\n __syncthreads();\n\n // Copy the sorted elements back to the global memory\n x[blockIdx.x * blockDim.x + threadIdx.x] = block[threadIdx.x];\n}\n", + "\n\n int tid = threadIdx.x;\n int bid = blockIdx.x;\n int i = tid + bid * blockDim.x;\n\n while (i < N) {\n for (int j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n cuDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n i += blockDim.x * gridDim.x;\n }\n}\n", + "\n\n // Each thread sorts a single element\n int tid = threadIdx.x;\n if (tid < N) {\n // Compute the magnitude of the complex number\n double magnitude = sqrt(x[tid].x * x[tid].x + x[tid].y * x[tid].y);\n // Store the magnitude in shared memory\n __shared__ double magnitudes[N];\n magnitudes[tid] = magnitude;\n // Synchronize threads\n __syncthreads();\n // Sort the magnitudes in shared memory\n for (int i = 0; i < N; i++) {\n for (int j = i + 1; j < N; j++) {\n if (magnitudes[i] > magnitudes[j]) {\n double temp = magnitudes[i];\n magnitudes[i] = magnitudes[j];\n magnitudes[j] = temp;\n }\n }\n }\n // Synchronize threads\n __syncthreads();\n // Find the index of the sorted magnitude in the original array\n int index = -1;\n for (int i = 0; i < N; i++) {\n if (magnitudes[tid] == sqrt(x[i].x * x[i].x + x[i].y * x[i].y)) {\n index = i;\n break;\n }\n }\n // Swap the complex number at the current index with the complex number at the sorted index\n if (index != -1) {\n cuDoubleComplex temp = x[tid];\n x[tid] = x[index];\n x[index] = temp;\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double magnitude = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(magnitude, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n x[tid] = make_cuDoubleComplex(creal(x[tid]), cimag(x[tid]));\n }\n}\n", + "\n\n // Get the index of the current element\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n\n // If the index is out of bounds, return\n if (idx >= N) {\n return;\n }\n\n // Get the magnitude of the current element\n double magnitude = 
abs(x[idx]);\n\n // Find the index of the element with the smallest magnitude\n int minIdx = idx;\n for (int i = idx + 1; i < N; i++) {\n double otherMagnitude = abs(x[i]);\n if (otherMagnitude < magnitude) {\n minIdx = i;\n magnitude = otherMagnitude;\n }\n }\n\n // Swap the current element with the element with the smallest magnitude\n if (minIdx != idx) {\n double temp = x[idx];\n x[idx] = x[minIdx];\n x[minIdx] = temp;\n }\n}\n", + "\n\n // Sort the complex numbers by their magnitude in ascending order.\n for (size_t i = 0; i < N; i++) {\n for (size_t j = i + 1; j < N; j++) {\n if (cuCabs(x[i]) > cuCabs(x[j])) {\n cuDoubleComplex temp = x[i];\n x[i] = x[j];\n x[j] = temp;\n }\n }\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n double mag = cuCabs(x[tid]);\n x[tid] = make_cuDoubleComplex(mag, 0.0);\n }\n __syncthreads();\n if (tid < N) {\n int i = tid;\n while (i > 0 && cuCreal(x[i]) < cuCreal(x[i - 1])) {\n swap(x[i], x[i - 1]);\n i--;\n }\n }\n __syncthreads();\n if (tid < N) {\n x[tid] = make_cuDoubleComplex(cuCreal(x[tid]), cuCimag(x[tid]));\n }\n}\n" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Use MPI and OpenMP to sort in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of x. Store the result in x on rank 0.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors on rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector x on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted 
vectors from all ranks on rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector of complex numbers.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the local vector\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted local vectors to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(size * x.size());\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> gathered;\n if (rank == 0) {\n gathered.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector on rank 0\n if (rank == 0) {\n std::sort(gathered.begin(), gathered.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n }\n\n // Scatter the sorted vector from rank 0 to all ranks\n MPI_Scatter(gathered.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector x on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < 
std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector x.\n if (rank != 0) {\n x.clear();\n }\n\n // Copy the sorted vector to x on rank 0.\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n int rank, nranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nranks);\n\n // Sort the vector x on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vector x from all ranks on rank 0\n std::vector> sorted_x(x.size() * nranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_PRECISION, sorted_x.data(), x.size(), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD);\n\n // Sort the gathered vector x on rank 0\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Scatter the sorted vector x from rank 0 to all ranks\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_PRECISION, x.data(), x.size(), MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector on each rank\n std::sort(x.begin(), x.end(), [](std::complex a, std::complex b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n 
}\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector of complex numbers on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n if (rank == 0) {\n std::vector> sorted_x(x.size() * num_ranks);\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n x = sorted_x;\n } else {\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, nullptr, 0, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the local vector.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all other ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * size);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // Copy the sorted vector to rank 0's x\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current process\n int nprocs, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 
0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * nprocs);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vectors and store the result in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks to rank 0\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, sort the gathered vector and store it in x\n if (rank == 0) {\n std::sort(sorted_x.begin(), sorted_x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n x = sorted_x;\n }\n}\n", + "\n\n // Sort the vector x on each rank by magnitude in ascending order.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> gathered_x(x.size() * MPI::world().size());\n MPI::world().all_gather(x.data(), x.size(), MPI::DOUBLE_REAL, gathered_x.data(), x.size(), MPI::DOUBLE_REAL);\n\n // Replace the vector x on rank 0 with the gathered sorted vectors.\n if (MPI::world().rank() == 0) {\n x = gathered_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks onto rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If rank 0, copy the sorted vector back to x.\n if (rank == 0) {\n x = sorted_x;\n }\n}\n", + "\n\n // Get the number of ranks and the rank of the current rank.\n int num_ranks, rank;\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Sort the vector locally on each rank.\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n\n // Gather the sorted vectors from all ranks on rank 0.\n std::vector> sorted_x;\n if (rank == 0) {\n sorted_x.resize(x.size() * num_ranks);\n }\n MPI_Gather(x.data(), x.size(), MPI_DOUBLE_COMPLEX, sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n\n // If the current rank is not 0, clear the vector x.\n if (rank != 0) {\n x.clear();\n }\n\n // Scatter the sorted vector from rank 0 to all ranks.\n MPI_Scatter(sorted_x.data(), x.size(), MPI_DOUBLE_COMPLEX, x.data(), x.size(), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": 
"cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ii++) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); jj++) {\n if (input(ii, jj) == 1) { num_neighbors++; }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++num_ones;\n }\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ii++) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); jj++) {\n if (ii == i && jj == j) continue;\n num_ones += (input(ii, jj) == 1);\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_ones = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++num_ones;\n }\n }\n }\n output(i, j) = (num_ones == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int numNeighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && input(ii, jj) == 1) {\n ++numNeighbors;\n }\n }\n }\n output(i, j) = (numNeighbors == 1) ? 
1 : 0;\n });\n}\n", + "Kokkoss_t_t", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy<>(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++count;\n }\n }\n }\n output(i, j) = (count == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const size_t i, const size_t j) {\n int sum = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input(ii, jj);\n }\n }\n }\n output(i, j) = (sum == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input(i - 1, j);\n if (i < N - 1) num_neighbors += input(i + 1, j);\n if (j > 0) num_neighbors += input(i, j - 1);\n if (j < N - 1) num_neighbors += input(i, j + 1);\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t& j) {\n int count = 0;\n if (i > 0) count += input(i - 1, j);\n if (i < N - 1) count += input(i + 1, j);\n if (j > 0) count += input(i, j - 1);\n if (j < N - 1) count += input(i, j + 1);\n output(i, j) = (count == 1);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { sum += input(ii, jj); }\n }\n }\n output(i, j) = (sum == 1);\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (ii == i && jj == j) continue;\n num_neighbors += input(ii, jj);\n }\n }\n output(i, j) = (num_neighbors == 1);\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cellsXOR\", Kokkos::RangePolicy(0, N * N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && input(ii, jj) == 1) {\n num_neighbors++;\n }\n }\n }\n output(i, j) = (num_neighbors == 1) ? 1 : 0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"cells\", Kokkos::RangePolicy>(N, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = std::max(0, i - 1); ii <= std::min(N - 1, i + 1); ++ii) {\n for (int jj = std::max(0, j - 1); jj <= std::min(N - 1, j + 1); ++jj) {\n if (input(ii, jj) == 1) {\n ++count;\n }\n }\n }\n output(i, j) = (count == 1);\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1][j];\n if (i < N - 1) num_ones += input[i + 1][j];\n if (j > 0) num_ones += input[i][j - 1];\n if (j < N - 1) num_ones += input[i][j + 1];\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) {\n count += input[i - 1][j];\n }\n if (i < N - 1) {\n count += input[i + 1][j];\n }\n if (j > 0) {\n count += 
input[i][j - 1];\n }\n if (j < N - 1) {\n count += input[i][j + 1];\n }\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma opm simd\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0 && input[i - 1][j] == 1) count++;\n if (i < N - 1 && input[i + 1][j] == 1) count++;\n if (j > 0 && input[i][j - 1] == 1) count++;\n if (j < N - 1 && input[i][j + 1] == 1) count++;\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n 
#pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1 + j * N];\n if (i < N - 1) num_neighbors += input[i + 1 + j * N];\n if (j > 0) num_neighbors += input[i + (j - 1) * N];\n if (j < N - 1) num_neighbors += input[i + (j + 1) * N];\n output[i + j * N] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j] == 1) ++num_neighbors;\n if (j > 0 && input[i][j - 1] == 1) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1] == 1) ++num_neighbors;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1][j];\n if (i < N - 1) num_ones += input[i + 1][j];\n if (j > 0) num_ones += input[i][j - 1];\n if (j < N - 1) num_ones += input[i][j + 1];\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n size_t num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "serial", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. 
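Editorial note: for the OpenMP variant of the same kernel, the prompt specifies flat row-major grids, so two-dimensional indexing such as input[i - 1][j] does not match the declared vector parameters. Below is an editorial sketch, not one of the recorded outputs, that indexes the flat arrays directly and uses the eight-cell neighborhood implied by the prompt's example; the element type int (std::vector<int>) is assumed.

#include <omp.h>
#include <vector>

// Editorial sketch: cellsXOR over flat row-major N x N int grids.
void cellsXOR(std::vector<int> const &input, std::vector<int> &output, size_t N) {
    const long n = static_cast<long>(N);
    #pragma omp parallel for collapse(2)
    for (long i = 0; i < n; ++i) {
        for (long j = 0; j < n; ++j) {
            int ones = 0;
            for (long di = -1; di <= 1; ++di) {
                for (long dj = -1; dj <= 1; ++dj) {
                    if (di == 0 && dj == 0) continue;
                    const long ii = i + di, jj = j + dj;
                    if (ii >= 0 && ii < n && jj >= 0 && jj < n)
                        ones += input[ii * n + jj];   // row-major indexing
                }
            }
            output[i * n + j] = (ones == 1) ? 1 : 0;
        }
    }
}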
Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0 && input[i - 1][j] == 1) ++count;\n if (i < N - 1 && input[i + 1][j] == 1) ++count;\n if (j > 0 && input[i][j - 1] == 1) ++count;\n if (j < N - 1 && input[i][j + 1] == 1) ++count;\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; 
i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (i + di >= 0 && i + di < N && j + dj >= 0 && j + dj < N) {\n num_ones += input[(i + di) * N + j + dj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (size_t k = i - 1; k <= i + 1; k++) {\n for (size_t l = j - 1; l <= j + 1; l++) {\n if (k >= 0 && k < N && l >= 0 && l < N && (k != i || l != j)) {\n num_neighbors += input[k * N + l];\n }\n }\n }\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": 
true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j] == 1) ++num_neighbors;\n if (j > 0 && input[i][j - 1] == 1) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1] == 1) ++num_neighbors;\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + 
"source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + }, + { + "generated_output": "\n\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n output[i][j] = (num_neighbors == 1);\n }\n }\n}\n", + "source_write_success": true, + "did_build": false, + "is_source_valid": true, + "did_any_run": false, + "did_all_run": false, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": null + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "hip", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\n__global__ void cellsXOR(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) { sum += input[ny * N + nx]; }\n }\n }\n output[y * N + x] = (sum == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = count == 1 ? 
1 : 0;\n }\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n if (x > 0) num_neighbors += input[y * N + x - 1];\n if (x < N - 1) num_neighbors += input[y * N + x + 1];\n if (y > 0) num_neighbors += input[(y - 1) * N + x];\n if (y < N - 1) num_neighbors += input[(y + 1) * N + x];\n output[y * N + x] = (num_neighbors == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int count = 0;\n if (i > 0) count += input[i - 1 + j * N];\n if (i < N - 1) count += input[i + 1 + j * N];\n if (j > 0) count += input[i + (j - 1) * N];\n if (j < N - 1) count += input[i + (j + 1) * N];\n output[i + j * N] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int num_ones = 0;\n if (i > 0) num_ones += input[i - 1 + j * N];\n if (i < N - 1) num_ones += input[i + 1 + j * N];\n if (j > 0) num_ones += input[i + (j - 1) * N];\n if (j < N - 1) num_ones += input[i + (j + 1) * N];\n output[i + j * N] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int num_ones = 0;\n if (x - 1 >= 0) num_ones += input[(x - 1) * N + y];\n if (x + 1 < N) num_ones += input[(x + 1) * N + y];\n if (y - 1 >= 0) num_ones += input[x * N + y - 1];\n if (y + 1 < N) num_ones += input[x 
* N + y + 1];\n output[x * N + y] = (num_ones == 1);\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int sum = 0;\n int idx = blockIdx.x * blockDim.x + threadIdx.x;\n int idy = blockIdx.y * blockDim.y + threadIdx.y;\n if (idx < N && idy < N) {\n int left = idx > 0 ? input[idx - 1 + idy * N] : 0;\n int right = idx < N - 1 ? input[idx + 1 + idy * N] : 0;\n int up = idy > 0 ? input[idx + (idy - 1) * N] : 0;\n int down = idy < N - 1 ? input[idx + (idy + 1) * N] : 0;\n sum = left + right + up + down;\n }\n output[idx + idy * N] = input[idx + idy * N] ? (sum == 1 ? 1 : 0) : 0;\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n int y = hipThreadIdx_y + hipBlockIdx_y * hipBlockDim_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0)\n continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n if (x > 0) {\n count += input[y * N + x - 1];\n }\n if (x < N - 1) {\n count += input[y * N + x + 1];\n }\n if (y > 0) {\n count += input[(y - 1) * N + x];\n }\n if (y < N - 1) {\n count += input[(y + 1) * N + x];\n }\n output[y * N + x] = (count == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
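Editorial note: the HIP prompt above launches one thread per cell of an N x N grid. The following sketch is editorial, not one of the recorded outputs; it maps the y dimension to rows and the x dimension to columns of the row-major array, counts all eight surrounding cells, and assumes int elements.

#include <hip/hip_runtime.h>

// Editorial sketch: cellsXOR HIP kernel, one thread per cell.
__global__ void cellsXOR(const int *input, int *output, size_t N) {
    const long n = static_cast<long>(N);
    const long i = blockIdx.y * blockDim.y + threadIdx.y;  // row
    const long j = blockIdx.x * blockDim.x + threadIdx.x;  // column
    if (i >= n || j >= n) return;

    int ones = 0;
    for (long di = -1; di <= 1; ++di)
        for (long dj = -1; dj <= 1; ++dj) {
            if (di == 0 && dj == 0) continue;
            const long ii = i + di, jj = j + dj;
            if (ii >= 0 && ii < n && jj >= 0 && jj < n) ones += input[ii * n + jj];
        }
    output[i * n + j] = (ones == 1) ? 1 : 0;
}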
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t num_rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * num_rows_per_rank;\n size_t end_row = std::min((rank + 1) * num_rows_per_rank, N);\n\n // Each rank computes its subset of the output.\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks.\n MPI_Gather(output.data() + start_row * N, (end_row - start_row) * N, MPI_INT,\n output.data(), (end_row - start_row) * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n std::vector rowXOR(N, 0);\n for (size_t j = 0; j < N; ++j) {\n if (input[rank * N + j] == 1) {\n rowXOR[j] = 1;\n }\n }\n\n // Reduce the XORs of all rows to rank 0\n std::vector allRowXORs(N, 0);\n MPI_Reduce(rowXOR.data(), allRowXORs.data(), N, MPI_INT, MPI_BXOR, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the final output\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int numNeighbors = 0;\n if (i > 0 && allRowXORs[i - 1] == 1) {\n ++numNeighbors;\n }\n if (i < N - 1 && allRowXORs[i + 1] == 1) {\n ++numNeighbors;\n }\n if (j > 0 && input[i * N + j - 1] == 1) {\n ++numNeighbors;\n }\n if (j < N - 1 && input[i * N + j + 1] == 1) {\n ++numNeighbors;\n }\n output[i * N + j] = (numNeighbors == 1);\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv(N * N);\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] ^= recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, (rank + 1) * nRowsPerRank);\n std::vector localInput(nRowsPerRank * N);\n std::vector localOutput(nRowsPerRank * N);\n MPI_Scatter(input.data(), N * N, MPI_INT, localInput.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int numNeighbors = 0;\n if (i > 0) numNeighbors += localInput[(i - 1) * N + j];\n if (i < N - 1) numNeighbors += localInput[(i + 1) * N + j];\n if (j > 0) numNeighbors += localInput[i * N + j - 1];\n if (j < N - 1) numNeighbors += localInput[i * N + j + 1];\n localOutput[i * N + j] = (numNeighbors == 1);\n }\n }\n MPI_Gather(localOutput.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Every rank computes the XOR of its own cells\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (my_rank == 0) {\n for (int i = 1; i < num_ranks; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] |= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], 
(end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n // Check the neighbors\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] ^= recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n std::vector local_output(N, 0);\n for (size_t i = 0; i < N; ++i) {\n int count = 0;\n for (size_t j = 0; j < N; ++j) {\n if (i == j) {\n continue;\n }\n count += input[i * N + j];\n }\n local_output[i] = count == 1 ? 
1 : 0;\n }\n\n // Gather the results from all ranks\n MPI_Gather(local_output.data(), N, MPI_INT, output.data(), N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of the output\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Each rank computes its subset of output.\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int rows_per_rank = (N + size - 1) / size;\n int start_row = i * rows_per_rank;\n int end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min((rank + 1) * chunk_size, N);\n\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min((i + 1) * chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n 
int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Gather all the results to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n std::vector recv;\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own row\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] ^= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the XOR of its own rows\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] ^= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, 
MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of neighbors that are 1.\n std::vector num_neighbors(N * N, 0);\n MPI_Allreduce(MPI_IN_PLACE, num_neighbors.data(), num_neighbors.size(), MPI_INT, MPI_SUM, comm);\n\n // Set the output to 1 if there is exactly one neighbor that is 1.\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n output[i * N + j] = (num_neighbors[i * N + j] == 1);\n }\n }\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes its local part of the output.\n for (size_t i = my_rank; i < N; i += num_procs) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; i++) {\n std::vector recv_buf(N * N);\n MPI_Recv(recv_buf.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] |= recv_buf[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Each rank computes its subset of the output.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks onto rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "cuda", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use CUDA to compute in parallel. 
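Editorial note: the MPI stencil prompt above states that every rank holds a full copy of input and that the result must end up in output on rank 0. One way to meet that contract is a block-row decomposition followed by MPI_Gatherv, since the last rank may own fewer rows. The sketch below is editorial, not one of the recorded outputs; it assumes flat row-major std::vector<int> arguments, output preallocated to N*N on rank 0, and the eight-cell neighborhood from the prompt's example.

#include <mpi.h>
#include <algorithm>
#include <vector>

// Editorial sketch: block-row decomposition, gather the pieces on rank 0.
void cellsXOR(std::vector<int> const &input, std::vector<int> &output, size_t N) {
    int rank = 0, size = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const long n = static_cast<long>(N);
    const long rowsPerRank = (n + size - 1) / size;
    const long firstRow = std::min<long>(rank * rowsPerRank, n);
    const long lastRow  = std::min<long>(firstRow + rowsPerRank, n);

    // Each rank computes only its own rows, reading from its full copy of input.
    std::vector<int> local((lastRow - firstRow) * n, 0);
    for (long i = firstRow; i < lastRow; ++i)
        for (long j = 0; j < n; ++j) {
            int ones = 0;
            for (long di = -1; di <= 1; ++di)
                for (long dj = -1; dj <= 1; ++dj) {
                    if (di == 0 && dj == 0) continue;
                    const long ii = i + di, jj = j + dj;
                    if (ii >= 0 && ii < n && jj >= 0 && jj < n) ones += input[ii * n + jj];
                }
            local[(i - firstRow) * n + j] = (ones == 1) ? 1 : 0;
        }

    // Ranks may own different numbers of rows, so gather with explicit counts.
    std::vector<int> counts(size), displs(size);
    for (int r = 0; r < size; ++r) {
        const long lo = std::min<long>(r * rowsPerRank, n);
        const long hi = std::min<long>(lo + rowsPerRank, n);
        counts[r] = static_cast<int>((hi - lo) * n);
        displs[r] = static_cast<int>(lo * n);
    }
    MPI_Gatherv(local.data(), static_cast<int>(local.size()), MPI_INT,
                rank == 0 ? output.data() : nullptr, counts.data(), displs.data(),
                MPI_INT, 0, MPI_COMM_WORLD);
}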
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\n__global__ void cellsXOR(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0 && input[y * N + x - 1]) num_ones++;\n if (x < N - 1 && input[y * N + x + 1]) num_ones++;\n if (y > 0 && input[(y - 1) * N + x]) num_ones++;\n if (y < N - 1 && input[(y + 1) * N + x]) num_ones++;\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int sum = 0;\n if (x > 0) {\n sum += input[y * N + x - 1];\n }\n if (x < N - 1) {\n sum += input[y * N + x + 1];\n }\n if (y > 0) {\n sum += input[(y - 1) * N + x];\n }\n if (y < N - 1) {\n sum += input[(y + 1) * N + x];\n }\n output[y * N + x] = (sum == 1);\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[(y * N) + (x - 1)];\n if (x < N - 1) sum += input[(y * N) + (x + 1)];\n if (y > 0) sum += input[((y - 1) * N) + x];\n if (y < N - 1) sum += input[((y + 1) * N) + x];\n output[(y * N) + x] = (sum == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int sum = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[(y - 1) * N + x - 1];\n if (x < N - 1) sum += input[(y - 1) * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n if (y > 0 && x > 0) sum += input[(y - 1) * N + x - 1];\n if (y > 0 && x < N - 1) sum += input[(y - 1) * 
N + x + 1];\n if (y < N - 1 && x > 0) sum += input[(y + 1) * N + x - 1];\n if (y < N - 1 && x < N - 1) sum += input[(y + 1) * N + x + 1];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n sum += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x * blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n int num_ones = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n num_ones += input[r * N + c];\n }\n }\n }\n output[row * N + col] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1] == 1) count++;\n if (x < N - 1 && input[y * N + x + 1] == 1) count++;\n if (y > 0 && input[(y - 1) * N + x] == 1) count++;\n if (y < N - 1 && input[(y + 1) * N + x] == 1) count++;\n output[y * N + x] = (count == 1) ? 
1 : 0;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1]) count++;\n if (x < N - 1 && input[y * N + x + 1]) count++;\n if (y > 0 && input[(y - 1) * N + x]) count++;\n if (y < N - 1 && input[(y + 1) * N + x]) count++;\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += input[ny * N + nx];\n }\n }\n }\n output[y * N + x] = (sum == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_ones = 0;\n if (x > 0) num_ones += input[y * N + x - 1];\n if (y > 0) num_ones += input[(y - 1) * N + x];\n if (x < N - 1) num_ones += input[y * N + x + 1];\n if (y < N - 1) num_ones += input[(y + 1) * N + x];\n output[y * N + x] = (num_ones == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0 && input[y * N + x - 1]) count++;\n if (x < N - 1 && input[y * N + x + 1]) count++;\n if (y > 0 && input[(y - 1) * N + x]) count++;\n if (y < N - 1 && input[(y + 1) * N + x]) count++;\n output[y * N + x] = (count == 1);\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int count = 0;\n if (i > 0) count += input[i - 1 + j * N];\n if (i < N - 1) count += input[i + 1 + j * N];\n if (j > 0) count += input[i + (j - 1) * N];\n if (j < N - 1) count += input[i + (j + 1) * N];\n output[i + j * N] = (count == 1);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n if (x > 0) count += input[y * N + x - 1];\n if (x < N - 1) count += input[y * N + x + 1];\n if (y > 0) count += input[(y - 1) * N + x];\n if (y < N - 1) count += input[(y + 1) * N + x];\n output[y * N + x] = (count == 1);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_ones = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output.\n size_t start = rank * N / size;\n size_t end = (rank + 1) * N / size;\n\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n // Check the neighbors of input_{i,j}.\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (count == 1);\n }\n }\n\n // Gather the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start = i * N / size;\n size_t end = (i + 1) * N / size;\n for (size_t j = start; j < end; ++j) {\n for (size_t k = 0; k < N; ++k) {\n output[j * N + k] += output[i * N * N + j * N + k];\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n int startRow = rank * nRows;\n int endRow = startRow + nRows;\n if (rank == size - 1) {\n endRow = N;\n }\n int nCols = N;\n int nNeighbors = 0;\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < nCols; j++) {\n nNeighbors = 0;\n if (i - 1 >= 0) {\n nNeighbors += input[(i - 1) * N + j];\n }\n if (i + 1 < N) {\n nNeighbors += input[(i + 1) * N + j];\n }\n if (j - 1 >= 0) {\n nNeighbors += input[i * N + j - 1];\n }\n if (j + 1 < N) {\n nNeighbors += input[i * N + j + 1];\n }\n if (nNeighbors == 1) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int nRowsRecv = N / size;\n int startRowRecv = i * nRowsRecv;\n int endRowRecv = startRowRecv + nRowsRecv;\n if (i == 
size - 1) {\n endRowRecv = N;\n }\n for (int j = startRowRecv; j < endRowRecv; j++) {\n MPI_Recv(&output[j * N], N * sizeof(int), MPI_INT, i, 0, MPI_STATUS_IGNORE);\n }\n }\n } else {\n MPI_Send(&output[startRow * N], (endRow - startRow) * N * sizeof(int), MPI_INT, 0, 0, MPI_STATUS_IGNORE);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_ones = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_ones += input[ii * N + jj];\n }\n }\n }\n output[i * N + j] = (num_ones == 1);\n }\n }\n\n // Rank 0 gathers all the results\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nThreads = omp_get_max_threads();\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, startRow + nRowsPerRank);\n std::vector localOutput(N * nRowsPerRank);\n #pragma omp parallel num_threads(nThreads)\n {\n #pragma omp for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int numNeighbors = 0;\n if (i > 0) numNeighbors += input[i - 1][j];\n if (i < N - 1) numNeighbors += input[i + 1][j];\n if (j > 0) numNeighbors += input[i][j - 1];\n if (j < N - 1) numNeighbors += input[i][j + 1];\n localOutput[i * N + j] = (numNeighbors == 1);\n }\n }\n }\n MPI_Gather(&localOutput[0], N * nRowsPerRank, MPI_INT, &output[0], N * nRowsPerRank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + num_procs - 1) / num_procs;\n size_t start_row = my_rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of rows in parallel\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check neighbors\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (my_rank == 0) {\n for (int i = 1; i < num_procs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row 
- start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes its own part of the output\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += nprocs) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i * N + j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n std::vector recv(N * N);\n MPI_Recv(recv.data(), N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n for (size_t j = 0; j < N * N; j++) {\n output[j] += recv[j];\n }\n }\n } else {\n MPI_Send(output.data(), N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Each rank computes its subset of the output in parallel\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes the XOR of its own rows\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += nprocs) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n MPI_Recv(&output[i * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different part of the output\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[(i - 1) * N + j];\n if (i < N - 1) num_neighbors += input[(i + 1) * N + j];\n if (j > 0) num_neighbors += input[i * N + j - 1];\n if (j < N - 1) num_neighbors += input[i * N + j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all other ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N], N * N, 
MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = (N + num_ranks - 1) / num_ranks;\n int start_row = my_rank * rows_per_rank;\n int end_row = std::min(N, (my_rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) {\n num_neighbors += input[i - 1][j];\n }\n if (i < N - 1) {\n num_neighbors += input[i + 1][j];\n }\n if (j > 0) {\n num_neighbors += input[i][j - 1];\n }\n if (j < N - 1) {\n num_neighbors += input[i][j + 1];\n }\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (my_rank == 0) {\n for (int i = 1; i < num_ranks; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows of output\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int my_rank, num_ranks;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);\n\n // Each rank computes a subset of the rows of output.\n int rows_per_rank = N / num_ranks;\n int start_row = my_rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes its subset of output.\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0 && input[i - 1][j]) ++num_neighbors;\n if (i < N - 1 && input[i + 1][j]) ++num_neighbors;\n if (j > 0 && input[i][j - 1]) ++num_neighbors;\n if (j < N - 1 && input[i][j + 1]) ++num_neighbors;\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all other ranks.\n if (my_rank == 0) {\n for (int rank = 1; rank < num_ranks; ++rank) {\n int 
rows_per_rank = N / num_ranks;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row][0], rows_per_rank * N, MPI_INT, rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n // Other ranks send their results to rank 0.\n MPI_Send(&output[start_row][0], rows_per_rank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its local part of the output.\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += input[i - 1][j];\n if (i < N - 1) count += input[i + 1][j];\n if (j > 0) count += input[i][j - 1];\n if (j < N - 1) count += input[i][j + 1];\n output[i][j] = (count == 1);\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Each rank computes its subset of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows.\n size_t rows_per_rank = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of the rows in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all the other ranks.\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(N, (i + 1) * rows_per_rank);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * 
rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n std::vector local_input(input.begin() + start_row * N, input.begin() + end_row * N);\n std::vector local_output(rows_per_rank * N, 0);\n\n #pragma omp parallel for\n for (size_t i = 0; i < rows_per_rank; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n if (i > 0) count += local_input[(i - 1) * N + j];\n if (i < rows_per_rank - 1) count += local_input[(i + 1) * N + j];\n if (j > 0) count += local_input[i * N + j - 1];\n if (j < N - 1) count += local_input[i * N + j + 1];\n local_output[i * N + j] = (count == 1);\n }\n }\n\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows in output.\n size_t rows_per_rank = (N + num_procs - 1) / num_procs;\n size_t start_row = my_rank * rows_per_rank;\n size_t end_row = std::min(N, (my_rank + 1) * rows_per_rank);\n\n // Each rank computes its subset of rows in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}.\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks into output on rank 0.\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // TODO: Implement cellsXOR\n int my_rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\n int num_procs;\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Each rank computes a subset of the rows of output\n int rows_per_rank = N / num_procs;\n int start_row = my_rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes its subset of the rows of output in parallel\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Check the neighbors of input_{i,j}\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n\n // Set output_{i,j} to 1 if it has exactly one neighbor that's a 1. 
Otherwise set it to 0.\n output[i * N + j] = (num_neighbors == 1);\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of output.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Each rank computes its subset of output in parallel.\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n if (i > 0) num_neighbors += input[i - 1][j];\n if (i < N - 1) num_neighbors += input[i + 1][j];\n if (j > 0) num_neighbors += input[i][j - 1];\n if (j < N - 1) num_neighbors += input[i][j + 1];\n output[i][j] = (num_neighbors == 1);\n }\n }\n\n // Rank 0 gathers the results from all ranks.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids.\n Use Kokkos to compute in parallel. Assume Kokkos is already initialized.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) continue;\n num_neighbors += input(ii, jj);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output(i, j) = 0;\n else\n output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { count += input(ii, jj); }\n }\n }\n count -= input(i, j);\n if (input(i, j) == 1) {\n output(i, j) = (count == 2 || count == 3) ? 
1 : 0;\n } else {\n output(i, j) = (count == 3) ? 1 : 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int numNeighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n numNeighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (numNeighbors < 2 || numNeighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (numNeighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n count += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (count == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n 
}\n } else {\n if (count == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && !(ii == i && jj == j)) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (num_neighbors == 3) { output(i, j) = 1; } else { output(i, j) = 0; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) { num_neighbors += input(x, y); }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) output(i, j) = 0;\n else output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n else output(i, j) = 0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input(ii, jj);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) { num_neighbors += input(ii, jj); }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (num_neighbors == 3) { output(i, j) 
= 1; } else { output(i, j) = 0; }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n num_neighbors += input(x, y);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output(i, j) = 0;\n else\n output(i, j) = 1;\n } else {\n if (num_neighbors == 3) output(i, j) = 1;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x < 0 || x >= N || y < 0 || y >= N) continue;\n num_neighbors += input(x, y);\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const size_t &j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input(x, y);\n }\n }\n }\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t i) {\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA (const size_t j) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n num_neighbors += input(i + ii, j + jj);\n }\n }\n }\n num_neighbors -= input(i, j);\n if (input(i, j) == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output(i, j) = 0;\n } else {\n output(i, j) = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output(i, j) = 1;\n } else {\n output(i, j) = 0;\n }\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"gameOfLife\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) { count += input(ii, jj); }\n }\n }\n if (input(i, j) == 1) {\n if (count < 2 || count > 
3) { output(i, j) = 0; } else { output(i, j) = 1; }\n } else {\n if (count == 3) { output(i, j) = 1; } else { output(i, j) = 0; }\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n count += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n count += input[(i + ii) * N + j + jj];\n }\n }\n }\n count -= input[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (int)i + di;\n int jj = (int)j + dj;\n if (ii < 0 || ii >= (int)N || jj < 0 || jj >= (int)N) continue;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = num_neighbors == 2 || num_neighbors == 3;\n } else {\n output[i * N + j] = num_neighbors == 3;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } 
else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) {\n continue;\n }\n int x = (i + ii + N) % N;\n int y = (j + jj + N) % N;\n neighbors += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = (i + dx + N) % N;\n int y = (j + dy + N) % N;\n num_neighbors += input[x * N + y];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n num_neighbors += input[ii * N + jj];\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "serial", + "prompt": "/* Simulate one generation of Game of Life on `input`. 
Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1117396398, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2652955898 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (size_t di = -1; di <= 1; di++) {\n for (size_t dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ni = i + di;\n int nj = j + dj;\n if (ni < 0 || ni >= N || nj < 0 || nj >= N) {\n continue;\n }\n num_neighbors += input[ni * N + nj];\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": 
true, + "best_sequential_runtime": 0.1189290728, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2664834996 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int x = -1; x <= 1; ++x) {\n for (int y = -1; y <= 1; ++y) {\n if (x == 0 && y == 0) continue;\n int nx = i + x;\n int ny = j + y;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[nx * N + ny];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1106602324, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2584191796 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.110228672, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2593527164 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = num_neighbors == 2 || num_neighbors == 3 ? 1 : 0;\n } else {\n output[i * N + j] = num_neighbors == 3 ? 
1 : 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1100217246, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2425157418 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1098059144, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2577138244 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1109583716, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2594657614 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int dx = -1; dx <= 1; ++dx) {\n for (int dy = -1; dy <= 1; ++dy) {\n if (dx == 0 && dy == 0) continue;\n int x = i + dx;\n int y = j + dy;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1109376262, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2631701736 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * 
N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1102767562, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2586462734 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t x = i - 1; x <= i + 1; ++x) {\n for (size_t y = j - 1; y <= j + 1; ++y) {\n if (x >= 0 && x < N && y >= 0 && y < N && (x != i || y != j)) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1122761976, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2834853546 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1098230166, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2574110404 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n neighbors += input[(i + ii) * N + j + jj];\n }\n }\n }\n neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1178534506, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0912345254 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj 
= -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1123884772, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2608322996 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n size_t num_neighbors = 0;\n for (size_t x = i - 1; x <= i + 1; ++x) {\n for (size_t y = j - 1; y <= j + 1; ++y) {\n if (x >= 0 && x < N && y >= 0 && y < N && (x != i || y != j)) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1081588084, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.354931741 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.11022316, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2438502098 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1172081182, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2617074404 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1106045856, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2625025892 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int x = i + di;\n int y = j + dj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n num_neighbors += input[x * N + y];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1122766588, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2671626678 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N 
+ jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.1093826178, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2592904676 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "hip", + "prompt": "/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\n__global__ void gameOfLife(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) 
{\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n num_neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y 
* N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) return;\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n count -= input[y * N + x];\n if (input[y * N + x] == 1) {\n output[y * N + x] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[y * N + x] = (count == 3) ? 1 : 0;\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0)\n continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3)\n output[y * N + x] = 0;\n else\n output[y * N + x] = 1;\n } else {\n if (num_neighbors == 3)\n output[y * N + x] = 1;\n else\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = hipThreadIdx().x;\n int y = hipThreadIdx().y;\n int neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = (x + i + N) % N;\n int neighbor_y = (y + j + N) % N;\n neighbors += input[neighbor_y * N + neighbor_x];\n 
}\n }\n if (input[y * N + x] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * 
N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n if (x + i >= 0 && x + i < N && y + j >= 0 && y + j < N) {\n count += input[(x + i) * N + y + j];\n }\n }\n }\n if (input[x * N + y] == 1) {\n if (count < 2 || count > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (count == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n_rows = N / size;\n int n_cols = N;\n int n_elements = n_rows * n_cols;\n std::vector local_input(n_elements);\n std::vector local_output(n_elements);\n MPI_Scatter(input.data(), n_elements, MPI_INT, local_input.data(), n_elements, MPI_INT, 0, MPI_COMM_WORLD);\n for (int i = 0; i < n_rows; i++) {\n for (int j = 0; j < n_cols; j++) {\n int n_neighbors = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int ii_global = i + ii;\n int jj_global = j + jj;\n if (ii_global >= 0 && ii_global < n_rows && jj_global >= 0 && jj_global < n_cols) {\n n_neighbors += local_input[ii_global * n_cols + jj_global];\n }\n }\n }\n if (local_input[i * n_cols + j] == 1) {\n if (n_neighbors < 2 || n_neighbors > 3) {\n local_output[i * n_cols + j] = 0;\n } else {\n local_output[i * n_cols + j] = 1;\n }\n } else {\n if (n_neighbors == 3) {\n local_output[i * n_cols + j] = 1;\n } else {\n local_output[i * n_cols + j] = 0;\n }\n }\n }\n }\n MPI_Gather(local_output.data(), n_elements, MPI_INT, output.data(), n_elements, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Count the number of live neighbors for each cell.\n std::vector counts(N * N, 0);\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n counts[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n }\n\n // Update the output based on the counts.\n for (size_t i = 0; i < N; ++i) {\n for 
(size_t j = 0; j < N; ++j) {\n int count = counts[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (count == 2 || count == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (count == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results on rank 0.\n if (MPI::COMM_WORLD.Get_rank() == 0) {\n for (int i = 1; i < MPI::COMM_WORLD.Get_size(); ++i) {\n std::vector recv_buf(N * N);\n MPI::COMM_WORLD.Recv(recv_buf.data(), N * N, MPI::INT, i, 0);\n for (size_t j = 0; j < N * N; ++j) {\n output[j] += recv_buf[j];\n }\n }\n } else {\n MPI::COMM_WORLD.Send(output.data(), N * N, MPI::INT, 0, 0);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks.\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 
1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Compute the next generation for the subset of rows assigned to this rank\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = static_cast(i) + di;\n int jj = static_cast(j) + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = N / size;\n\n // Compute the start and end rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = (rank + 1) * rows_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(N * rows_per_rank);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n\n // Check the neighbors of the current cell\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) {\n continue;\n }\n\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n\n // Check if the neighbor is within the grid\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), N * rows_per_rank, MPI_INT,\n output.data(), N * rows_per_rank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of rows\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes the next generation for its subset of rows\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number 
of live neighbors\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N && input[ni * N + nj]) {\n num_neighbors++;\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j]) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start_row * N, rows_per_rank * N, MPI_INT,\n output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n std::vector local_input(input.begin() + rank * N * N, input.begin() + (rank + 1) * N * N);\n std::vector local_output(N * N, 0);\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n count += local_input[ii * N + jj];\n }\n }\n }\n if (local_input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n local_output[i * N + j] = 1;\n }\n }\n }\n }\n\n MPI_Gather(local_output.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output.\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Check the 8 neighbors of the cell.\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life.\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks to rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of the rows.\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows.\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = static_cast(i) + di;\n int neighbor_j = static_cast(j) + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int 
start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n if (rank != 0) {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n MPI_Recv(&output[start_row * N], rows_per_rank * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * N);\n\n // Compute the local results\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n\n // Check the neighbors of the current cell\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) {\n continue;\n }\n\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n\n // Check if the neighbor is within the grid\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT, output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (size_t ii = std::max(0, (int)i - 1); ii <= std::min((int)N - 1, (int)i + 1); ii++) {\n for (size_t jj = std::max(0, (int)j - 1); jj <= std::min((int)N - 1, (int)j + 1); jj++) {\n if (input[ii * N + jj] == 1) {\n num_neighbors++;\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n 
MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of live neighbors for each cell.\n std::vector neighbor_counts(N * N, 0);\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n for (size_t di = -1; di <= 1; di++) {\n for (size_t dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n neighbor_counts[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Update the output based on the neighbor counts.\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbor_counts[i * N + j] == 2 || neighbor_counts[i * N + j] == 3);\n } else {\n output[i * N + j] = (neighbor_counts[i * N + j] == 3);\n }\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbors == 2 || neighbors == 3);\n } else {\n output[i * N + j] = (neighbors == 3);\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows\n std::vector next_gen(N * N);\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j]) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + 
rows_per_rank, N);\n\n // Each rank computes its subset of the output\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n int rows_per_rank = N / size;\n\n // Compute the starting and ending rows for this rank\n int start_row = rank * rows_per_rank;\n int end_row = (rank + 1) * rows_per_rank;\n\n // Compute the number of columns each rank is responsible for\n int cols_per_rank = N / size;\n\n // Compute the starting and ending columns for this rank\n int start_col = rank * cols_per_rank;\n int end_col = (rank + 1) * cols_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * N, 0);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = start_col; j < end_col; j++) {\n int num_neighbors = 0;\n // Check the neighbors of the current cell\n if (i > 0 && input[i - 1][j] == 1) num_neighbors++;\n if (i < N - 1 && input[i + 1][j] == 1) num_neighbors++;\n if (j > 0 && input[i][j - 1] == 1) num_neighbors++;\n if (j < N - 1 && input[i][j + 1] == 1) num_neighbors++;\n\n // Apply the rules of the game of life\n if (input[i][j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n local_output[i * N + j] = 0;\n } else {\n local_output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n local_output[i * N + j] = 1;\n } else {\n local_output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the local results into the output vector on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_INT, output.data(), rows_per_rank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "cuda", + "prompt": "/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use CUDA to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\n__global__ void gameOfLife(const int *input, int *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int count = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (count < 2 || count > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (count == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if 
(input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n 
}\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) {\n return;\n }\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = threadIdx.x;\n int y = threadIdx.y;\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = (x + i + N) % N;\n int neighbor_y = (y + j + N) % N;\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "_", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n num_neighbors -= input[x * N + y];\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = 
blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) {\n return;\n }\n\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) {\n continue;\n }\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 &&\n neighbor_y < N) {\n num_neighbors += input[neighbor_y * N + neighbor_x];\n }\n }\n }\n\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int neighbor_x = x + i;\n int neighbor_y = y + j;\n if (neighbor_x >= 0 && neighbor_x < N && neighbor_y >= 0 && neighbor_y < N) {\n num_neighbors += input[neighbor_x * N + neighbor_y];\n }\n }\n }\n\n if (input[x * N + y] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[x * N + y] = 0;\n } else {\n output[x * N + y] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[x * N + y] = 1;\n } else {\n output[x * N + y] = 0;\n }\n }\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int num_neighbors = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (i == 0 && j == 0) continue;\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n num_neighbors += input[ny * N + nx];\n }\n }\n }\n if (input[y * N + x] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[y * N + x] = 0;\n } else {\n output[y * N + x] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[y * N + x] = 1;\n } else {\n output[y * N + x] = 0;\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Simulate one generation of Game of Life on `input`. Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI is already initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n // Get the rank of the current process.\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n\n // Compute the number of rows that each process will handle.\n int rows_per_process = N / MPI_Comm_size(MPI_COMM_WORLD);\n\n // Compute the starting and ending rows for the current process.\n int start_row = rank * rows_per_process;\n int end_row = start_row + rows_per_process;\n\n // Compute the number of columns in the input grid.\n int num_cols = input.size() / N;\n\n // Create a buffer to store the results for the current process.\n std::vector buffer(N * num_cols);\n\n // Iterate over the rows assigned to the current process.\n for (int i = start_row; i < end_row; ++i) {\n // Iterate over the columns in the input grid.\n for (int j = 0; j < num_cols; ++j) {\n // Compute the number of live neighbors for the current cell.\n int num_live_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n // Skip the current cell.\n if (ii == i && jj == j) {\n continue;\n }\n\n // Check if the neighbor cell is alive.\n if (input[ii * num_cols + jj] == 1) {\n ++num_live_neighbors;\n }\n }\n }\n\n // Apply the rules of the Game of Life to the current cell.\n if (input[i * num_cols + j] == 1) {\n if (num_live_neighbors < 2 || num_live_neighbors > 3) {\n buffer[i * num_cols + j] = 0;\n } else {\n buffer[i * num_cols + j] = 1;\n }\n } else {\n if (num_live_neighbors == 3) {\n buffer[i * num_cols + j] = 1;\n } else {\n buffer[i * num_cols + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all processes.\n MPI_Gather(buffer.data(), N * num_cols, MPI_INT, output.data(), N * num_cols, MPI_INT, 0);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = (N + size - 1) / size;\n int start = rank * local_size;\n int end = std::min(start + local_size, N);\n\n std::vector local_input(input.begin() + start * N, input.begin() + end * N);\n std::vector local_output(local_input.size());\n\n #pragma omp parallel for\n for (int i = 0; i < local_input.size(); i++) {\n int x = i / N;\n int y = i % N;\n int count = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (dx == 0 && dy == 0) continue;\n int nx = x + dx;\n int ny = y + dy;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n count += local_input[nx * N + ny];\n }\n }\n }\n if (local_input[i] == 1) {\n if (count < 2 || count > 3) {\n local_output[i] = 0;\n } else {\n local_output[i] = 1;\n }\n } else {\n if (count == 3) {\n local_output[i] = 1;\n } else {\n local_output[i] = 0;\n }\n }\n }\n\n MPI_Gather(local_output.data(), local_output.size(), MPI_INT, output.data(), local_output.size(), MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of the rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute 
the next generation for the subset of rows\n std::vector next_gen(N * N, 0);\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Compute the next generation for the subset of rows\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (num_neighbors == 3) ? 1 : 0;\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own part of the output\n #pragma omp parallel for\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int count = 0;\n // Count the number of live neighbors\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N && input[x * N + y]) count++;\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j]) {\n if (count < 2 || count > 3) output[i * N + j] = 0;\n else output[i * N + j] = 1;\n } else {\n if (count == 3) output[i * N + j] = 1;\n else output[i * N + j] = 0;\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a chunk of the output\n size_t chunk_size = (N + nprocs - 1) / nprocs;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min((rank + 1) * chunk_size, N);\n\n // Compute the chunk of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (size_t ii = std::max(0, (int)i - 1); 
ii <= std::min(N - 1, (int)i + 1); ++ii) {\n for (size_t jj = std::max(0, (int)j - 1); jj <= std::min(N - 1, (int)j + 1); ++jj) {\n num_neighbors += input[ii * N + jj];\n }\n }\n num_neighbors -= input[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min((i + 1) * chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Compute the number of live neighbors for each cell.\n std::vector neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Update the state of each cell.\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (input[i * N + j] == 1) {\n if (neighbors[i * N + j] < 2 || neighbors[i * N + j] > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors[i * N + j] == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0.\n MPI_Gather(output.data(), N * N, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n int nprocs;\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows\n int rows_per_rank = N / nprocs;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Compute the next generation for the subset of rows\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int ii = i - 1; ii <= i + 1; ++ii) {\n for (int jj = j - 1; jj <= j + 1; ++jj) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int num_neighbors = 0;\n // Count the number of live neighbors\n for (int ii = i - 
1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n num_neighbors += input[ii * N + jj];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Initialize output to 0\n output.assign(N * N, 0);\n\n // Get rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Divide the grid among the processes\n int rows_per_proc = N / nprocs;\n int start_row = rank * rows_per_proc;\n int end_row = start_row + rows_per_proc;\n\n // Compute the next generation for each row in parallel\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n // Count the number of live neighbors\n int num_neighbors = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n if (k == 0 && l == 0) continue;\n int neighbor_row = i + k;\n int neighbor_col = j + l;\n if (neighbor_row >= 0 && neighbor_row < N && neighbor_col >= 0 && neighbor_col < N) {\n num_neighbors += input[neighbor_row * N + neighbor_col];\n }\n }\n }\n\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n output[i * N + j] = 1;\n }\n }\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[start_row * N], rows_per_proc * N, MPI_INT,\n &output[0], rows_per_proc * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its own output\n #pragma omp parallel for\n for (int i = 0; i < N; ++i) {\n for (int j = 0; j < N; ++j) {\n int count = 0;\n // Count the number of live neighbors\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (ii == 0 && jj == 0) continue;\n int x = i + ii;\n int y = j + jj;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n count += input[x * N + y];\n }\n }\n }\n // Apply the rules of the game of life\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks to rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its subgrid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n for (int di = -1; di <= 1; 
di++) {\n for (int dj = -1; dj <= 1; dj++) {\n if (di == 0 && dj == 0) continue;\n int ii = (i + di + N) % N;\n int jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its subgrid's neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n output[i * N + j] = (neighbors == 2 || neighbors == 3) ? 1 : 0;\n } else {\n output[i * N + j] = (neighbors == 3) ? 1 : 0;\n }\n }\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n\n std::vector localInput(nRows * N);\n std::vector localOutput(nRows * N);\n\n MPI_Scatter(input.data(), nRows * N, MPI_INT, localInput.data(), nRows * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < nRows; i++) {\n for (int j = 0; j < N; j++) {\n int count = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n if (ii == 0 && jj == 0) continue;\n int ni = i + ii;\n int nj = j + jj;\n if (ni >= 0 && ni < nRows && nj >= 0 && nj < N) {\n count += localInput[ni * N + nj];\n }\n }\n }\n if (localInput[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n localOutput[i * N + j] = 0;\n } else {\n localOutput[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n localOutput[i * N + j] = 1;\n } else {\n localOutput[i * N + j] = 0;\n }\n }\n }\n }\n\n MPI_Gather(localOutput.data(), nRows * N, MPI_INT, output.data(), nRows * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n // Each rank computes the number of live neighbors for each cell in its subset of rows\n std::vector neighbor_counts(N * N, 0);\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n // Count the number of live neighbors for cell (i, j)\n int count = 0;\n for (int ii = i - 1; ii <= i + 1; ii++) {\n for (int jj = j - 1; jj <= j + 1; jj++) {\n if (ii >= 0 && ii < N && jj >= 0 && jj < N && (ii != i || jj != j)) {\n count += input[ii * N + jj];\n }\n }\n }\n neighbor_counts[i * N + j] = count;\n }\n }\n\n // Each rank updates its subset of rows of the output based on the neighbor counts\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n int count = neighbor_counts[i * N + j];\n if (input[i * N + j] == 1) {\n if (count < 2 || count > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (count == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks into the output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, 
MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the next generation for the subset of rows assigned to this rank\n std::vector next_gen(N * N);\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = static_cast(i) + di;\n int neighbor_j = static_cast(j) + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n next_gen[i * N + j] = (num_neighbors == 2 || num_neighbors == 3);\n } else {\n next_gen[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results from all ranks into the output on rank 0\n MPI_Gather(next_gen.data(), N * rows_per_rank, MPI_INT,\n output.data(), N * rows_per_rank, MPI_INT,\n 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its portion of the grid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Count the number of live neighbors for cell (i, j).\n int neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ii = i + di;\n int jj = j + dj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n neighbors += input[ii * N + jj];\n }\n }\n }\n local_neighbors[i * N + j] = neighbors;\n }\n }\n\n // Each rank sends its portion of the neighbor counts to rank 0.\n std::vector global_neighbors(N * N);\n MPI_Gather(local_neighbors.data(), N * N, MPI_INT, global_neighbors.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell based on the number of live neighbors.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n", + "_t", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a chunk of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n // Compute the next generation for the assigned rows\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int ni = i + di;\n int nj = j + dj;\n if (ni >= 0 && ni < N && nj >= 0 && nj < N) {\n num_neighbors += input[ni * N + nj];\n }\n }\n }\n if (input[i * N + j] == 1) {\n output[i * N + j] = (num_neighbors == 2 || 
num_neighbors == 3);\n } else {\n output[i * N + j] = (num_neighbors == 3);\n }\n }\n }\n\n // Gather the results on rank 0\n if (rank != 0) {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the next generation for a subset of rows\n int rows_per_rank = (N + size - 1) / size;\n int start_row = rank * rows_per_rank;\n int end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the next generation for the subset of rows\n std::vector next_gen(N * N, 0);\n #pragma omp parallel for\n for (int i = start_row; i < end_row; ++i) {\n for (int j = 0; j < N; ++j) {\n int num_neighbors = 0;\n for (int di = -1; di <= 1; ++di) {\n for (int dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n int neighbor_i = i + di;\n int neighbor_j = j + dj;\n if (neighbor_i >= 0 && neighbor_i < N && neighbor_j >= 0 && neighbor_j < N) {\n num_neighbors += input[neighbor_i * N + neighbor_j];\n }\n }\n }\n if (input[i * N + j] == 1) {\n if (num_neighbors < 2 || num_neighbors > 3) {\n next_gen[i * N + j] = 0;\n } else {\n next_gen[i * N + j] = 1;\n }\n } else {\n if (num_neighbors == 3) {\n next_gen[i * N + j] = 1;\n } else {\n next_gen[i * N + j] = 0;\n }\n }\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(next_gen.data(), N * N, MPI_INT, output.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes the number of live neighbors for each cell in its local grid.\n std::vector local_neighbors(N * N, 0);\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n for (size_t di = -1; di <= 1; ++di) {\n for (size_t dj = -1; dj <= 1; ++dj) {\n if (di == 0 && dj == 0) continue;\n size_t ii = (i + di + N) % N;\n size_t jj = (j + dj + N) % N;\n local_neighbors[i * N + j] += input[ii * N + jj];\n }\n }\n }\n }\n\n // Each rank sends its local neighbor counts to rank 0.\n std::vector global_neighbors(N * N, 0);\n MPI_Reduce(local_neighbors.data(), global_neighbors.data(), N * N, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);\n\n // Rank 0 computes the new state of each cell.\n if (rank == 0) {\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int neighbors = global_neighbors[i * N + j];\n if (input[i * N + j] == 1) {\n if (neighbors < 2 || neighbors > 3) {\n output[i * N + j] = 0;\n } else {\n output[i * N + j] = 1;\n }\n } else {\n if (neighbors == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(Kokkos::View &input, Kokkos::View &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n KokConfigService::instance().set_num_teams(1);\n KokConfigService::instance().set_num_nodes(1);\n KokConfigService::instance().set_num_ranks_per_node(1);\n KokConfigService::instance().set_num_devices_per_rank(1);\n KokConfigService::instance().set_device_type(\"CPU\");\n\n Kok\u30b3\u30af::TeamPolicy policy(Kok\u30b3\u30af::TeamThreadLayout::RowMajor);\n Kok\u30b3\u30af::Team team(policy);\n\n team.execute_task([&](Kok\u30b3\u30af::TeamMember rank) {\n const int num_elements = input.extent(0);\n for (int i = 1; i < num_elements - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n });\n}\n", + "\n\n using Kokkos::parallel_for;\n using Kokkos::RangePolicy;\n using Kokkos::TeamPolicy;\n using Kokkos::WorkSpace;\n\n const int width = input.extent(0);\n const int height = input.extent(1);\n\n Kokkos::TeamPolicy policy(Kokkos::RangePolicy(0, width * height));\n Kokkos::WorkSpace ws;\n\n parallel_for(policy, KOKKOS_LAMBDA(const int i) {\n const int x = i % width;\n const int y = i / width;\n\n if (x == 0 || x == width - 1 || y == 0 || y == height - 1) {\n output(x, y) = input(x, y);\n } else {\n output(x, y) = (input(x - 1, y) + input(x + 1, y) + input(x, y - 1) + input(x, y + 1)) / 4.0;\n }\n }, ws);\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kok c\u00f3c::parallel_for(\"jacobi1D\", 0, output.extent(0), KOK c\u00f3c::RangePolicy>(KOK c\u00f3c::get_num_threads()),\n KOK c\u00f3c::KOK c\u00f3c_LAMBDA(int64_t i) {\n if (i == 0) {\n output[i] = input[i] / 3;\n } else if (i == output.extent(0) - 1) {\n output[i] = input[i] / 3;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n const auto N = input.extent(0);\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == N - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy<>(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 
2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n const auto N = input.extent(0);\n Kokkos::parallel_for(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = 0;\n } else if (i == N - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", input.size(), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n });\n}\n", + "\n\n const auto exec_space = Kokkos::DefaultExecutionSpace();\n const auto exec_policy = Kokkos::RangePolicy(0, input.extent(0));\n Kokkos::parallel_for(exec_policy, KOKKOS_LAMBDA(const int &i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(input.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n double left = i > 0 ? input(i - 1) : 0;\n double center = input(i);\n double right = i < input.extent(0) - 1 ? input(i + 1) : 0;\n output(i) = (left + center + right) / 3;\n });\n}\n", + "Kokkoss vector space.", + "\n\n Kokkos::parallel_for(input.extent(0), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n const int num_elements = input.extent(0);\n Kokkos::parallel_for(num_elements, KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == num_elements - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n });\n}\n", + "\n\n const int num_elements = input.extent(0);\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::Range(0, num_elements), [=](int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2;\n } else if (i == num_elements - 1) {\n output(i) = (input(i - 1) + input(i)) / 2;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi1D\", 
Kokkos::RangePolicy(0, input.extent(0)), KOKKOS_LAMBDA(const int i) {\n if (i == 0) {\n output(i) = (input(i) + input(i + 1)) / 2.0;\n } else if (i == input.extent(0) - 1) {\n output(i) = (input(i - 1) + input(i)) / 2.0;\n } else {\n output(i) = (input(i - 1) + input(i) + input(i + 1)) / 3.0;\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < static_cast(input.size()) - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n output[0] = (input[0] + input[1]) / 2;\n output[input.size() - 1] = (input[input.size() - 2] + input[input.size() - 1]) / 2;\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp 
parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < static_cast(input.size()) - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n #pragma omp parallel for\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "serial", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (int i = 0; i < input.size(); i++) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i + 1 < input.size() ? input[i + 1] : 0);\n output[i] = output[i] / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0277388652, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0295456638 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.027662886, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0291811874 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? 
input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0277429608, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.029202137 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.027771857, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0185472976 + } + ] + }, + { + "generated_output": "\n\n output[0] = (input[0] + input[1]) / 2;\n for (int i = 1; i < input.size() - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n output[input.size() - 1] = (input[input.size() - 2] + input[input.size() - 1]) / 2;\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0278630688, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0187170514 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < input.size(); ++i) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < input.size() - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0276472142, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0293229466 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < input.size(); ++i) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0277304114, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0291699444 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < input.size(); ++i) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0277539434, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0283480334 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? 
input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0276487954, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0291791638 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0275739904, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0283950642 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.027781471, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0282949016 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0278739452, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0285012236 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0278768228, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.028369985 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0276162904, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0283624608 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < input.size(); 
i++) {\n double left = (i > 0) ? input[i - 1] : 0;\n double right = (i < input.size() - 1) ? input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0276908786, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0291832312 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.027586955, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0282679064 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0276236824, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0283219786 + } + ] + }, + { + "generated_output": "\n\n for (int i = 0; i < input.size(); i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < input.size() - 1 ? 
input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0277417966, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0290872348 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < input.size(); i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i] + input[i - 1]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0275909726, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0282657928 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 1; i < input.size() - 1; ++i) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0279498264, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0184250156 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "hip", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use AMD HIP to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\n__global__ void jacobi1D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int idx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n if (idx < N) {\n output[idx] = (idx > 0 ? input[idx - 1] : 0) + input[idx] + (idx < N - 1 ? input[idx + 1] : 0);\n output[idx] /= 3;\n }\n}\n", + "\n int i = hipThreadIdx_x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? 
input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int tid = hipGetGlobalThreadIdx().x;\n if (tid < N) {\n output[tid] = (input[tid - 1] + input[tid] + input[tid + 1]) / 3;\n }\n}\n", + "\n\n int tid = hipThreadIdx_x;\n int bid = hipBlockIdx_x;\n int i = tid + bid * hipBlockDim_x;\n\n if (i < N) {\n double left = i > 0 ? input[i - 1] : 0;\n double center = input[i];\n double right = i < N - 1 ? input[i + 1] : 0;\n output[i] = (left + center + right) / 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] +\n (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3.0;\n }\n}\n", + "\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = (tid > 0 ? input[tid - 1] : 0) + input[tid] +\n (tid < N - 1 ? input[tid + 1] : 0);\n output[tid] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\nfor (size_t i = 1; i < N - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n}\n", + "\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = hipThreadIdx_x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. 
output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. The result is stored in output on rank 0.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_output(local_n);\n\n for (int i = start; i < end; i++) {\n if (i == 0) {\n local_output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == n - 1) {\n local_output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n local_output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * local_size], local_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n 
MPI_Send(&output[start], local_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n output.resize(n);\n\n MPI_Datatype vec_type;\n MPI_Type_vector(n, 1, 1, MPI_DOUBLE, &vec_type);\n MPI_Type_commit(&vec_type);\n\n MPI_Allgather(&input[0], 1, vec_type, &output[0], 1, vec_type, MPI_COMM_WORLD);\n\n for (int i = 1; i < n - 1; i++) {\n output[i] = (output[i - 1] + output[i] + output[i + 1]) / 3;\n }\n\n MPI_Type_free(&vec_type);\n}\n", + "\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int remainder = input.size() % size;\n\n std::vector local_input(local_size + (rank < remainder ? 1 : 0));\n std::vector local_output(local_size);\n\n MPI_Scatter(input.data(), local_size + (rank < remainder ? 1 : 0), MPI_DOUBLE, local_input.data(), local_size + (rank < remainder ? 
1 : 0), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n for (int i = 0; i < local_size; i++) {\n local_output[i] = (local_input[std::max(0, i - 1)] + local_input[std::min(i + 1, local_size - 1)] + local_input[i]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * chunk], chunk, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n for (int i = start; i < end; i++) {\n double left = (i > 0) ? input[i - 1] : 0;\n double right = (i < n - 1) ? input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * local_n;\n int end = start + local_n;\n MPI_Recv(&output[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], local_n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = (n + size - 1) / size;\n int start = rank * chunk_size;\n int end = std::min(start + chunk_size, n);\n\n for (int i = start; i < end; i++) {\n double left = i > 0 ? input[i - 1] : 0;\n double right = i < n - 1 ? 
input[i + 1] : 0;\n output[i] = (left + input[i] + right) / 3;\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start = i * chunk_size;\n int end = std::min(start + chunk_size, n);\n MPI_Recv(&output[start], end - start, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start], end - start, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_size = n / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n output.resize(n);\n\n // Compute the local part of the output\n for (int i = 1; i < n - 1; i++) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n\n // Send the local part of the output to rank 0\n MPI_Gather(&output[1], n - 2, MPI_DOUBLE, &output[1], n - 2, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Replace the local part of the output with 0 on all ranks except rank 0\n if (rank != 0) {\n for (int i = 1; i < n - 1; i++) {\n output[i] = 0;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; ++i) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n 
std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n local_output[0] = (local_input[0] + local_input[1]) / 2;\n local_output[local_n - 1] = (local_input[local_n - 2] + local_input[local_n - 1]) / 2;\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "cuda", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use CUDA to compute in parallel. The kernel is launched with at least N threads.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\n__global__ void jacobi1D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = threadIdx.x + blockIdx.x * blockDim.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n } else {\n output[i] = 0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n int tid = blockIdx.x * blockDim.x + threadIdx.x;\n if (tid < N) {\n output[tid] = (tid > 0 ? input[tid - 1] : 0) + input[tid] + (tid < N - 1 ? 
input[tid + 1] : 0);\n output[tid] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == N - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n if (i < N) {\n output[i] = (i > 0 ? input[i - 1] : 0) + input[i] + (i < N - 1 ? input[i + 1] : 0);\n output[i] /= 3;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size);\n\n #pragma omp parallel for\n for (int i = 1; i < local_size - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n std::vector local_output(local_n);\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (input[start + i - 1] + input[start + i] + input[start + i + 1]) / 3.0;\n }\n MPI_Gather(&local_output[0], local_n, MPI_DOUBLE, &output[0], local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n if (rank == size - 1) {\n end = n;\n }\n\n std::vector local_output(chunk_size);\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n local_output[i] = (input[i] + input[i + 1]) / 3;\n } else if (i == n - 1) {\n local_output[i] = (input[i - 1] + input[i]) / 3;\n } else {\n local_output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(local_output.data(), chunk_size, MPI_DOUBLE, output.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = (rank + 1) * local_size;\n\n #pragma parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == input.size() - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int chunk_size = n / size;\n int start = rank * chunk_size;\n int end = start + chunk_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), chunk_size, MPI_DOUBLE, output.data(), chunk_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of elements per rank\n int elements_per_rank = input.size() / size;\n\n // Compute the start and end indices for this rank\n int start_index = rank * elements_per_rank;\n int end_index = 
(rank + 1) * elements_per_rank;\n\n // Compute the local output\n std::vector local_output(elements_per_rank);\n #pragma omp parallel for\n for (int i = 0; i < elements_per_rank; i++) {\n int global_index = start_index + i;\n if (global_index == 0 || global_index == input.size() - 1) {\n local_output[i] = 0;\n } else {\n local_output[i] = (input[global_index - 1] + input[global_index] + input[global_index + 1]) / 3;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(input.size());\n for (int i = 0; i < size; i++) {\n MPI_Recv(&output[i * elements_per_rank], elements_per_rank, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], elements_per_rank, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n std::vector local_output(local_n);\n\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (input[start + i - 1] + input[start + i] + input[start + i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int local_size = input.size() / size;\n int start = rank * local_size;\n int end = start + local_size;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == input.size() - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_size, MPI_DOUBLE, output.data(), local_size, MPI_DOUBLE, 0, 
MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n // Compute the local part of the output\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = (rank + 1) * local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2.0;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2.0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3.0;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n // Compute the local part of the output\n #pragma omp parallel for\n for (int i = start; i < end; ++i) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n = input.size();\n int chunk = n / size;\n int start = rank * chunk;\n int end = (rank + 1) * chunk;\n if (rank == size - 1) {\n end = n;\n }\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0) {\n output[i] = (input[i] + input[i + 1]) / 2;\n } else if (i == n - 1) {\n output[i] = (input[i - 1] + input[i]) / 2;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n 
MPI_Gather(output.data() + start, chunk, MPI_DOUBLE, output.data(), chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = (n + size - 1) / size;\n int start = rank * local_n;\n int end = std::min(start + local_n, n);\n\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_n);\n\n #pragma omp parallel for\n for (int i = 1; i < local_n - 1; i++) {\n local_output[i] = (local_input[i - 1] + local_input[i] + local_input[i + 1]) / 3;\n }\n\n MPI_Gather(local_output.data(), local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int n = input.size();\n int local_n = n / size;\n int start = rank * local_n;\n int end = start + local_n;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n if (i == 0 || i == n - 1) {\n output[i] = 0;\n } else {\n output[i] = (input[i - 1] + input[i] + input[i + 1]) / 3;\n }\n }\n\n MPI_Gather(output.data() + start, local_n, MPI_DOUBLE, output.data(), local_n, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "kokkos", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images.\n Store the output of the computation in imageOut.\n Use Kokkos to compute in parallel. 
Assume Kokkos has already been initialized.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(Kokkos::View &imageIn, Kokkos::View &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(N, N), [=](int i, int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn(ii, jj) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::min(255, std::max(0, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy<>(0, N), KOKKOS_LAMBDA(const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_clamped = std::clamp(i + ii, 0, N - 1);\n int jj_clamped = std::clamp(j + jj, 0, N - 1);\n sum += imageIn(ii_clamped, jj_clamped) * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut(i, j) = std::clamp(sum, -128, 127);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_in = i + ii;\n int jj_in = j + jj;\n if (ii_in >= 0 && ii_in < N && jj_in >= 0 && jj_in < N) {\n sum += imageIn(ii_in, jj_in) * edgeKernel[ii + 1][jj + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n int val = 0;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n val = imageIn(ii, jj);\n }\n sum += edgeKernel[ki + 1][kj + 1] * val;\n }\n }\n imageOut(i, j) = std::max(0, std::min(sum, 255));\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N),\n KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::min(255, std::max(0, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int j) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n int ii_clamped = std::clamp(i + ii, 0, N - 1);\n int jj_clamped = std::clamp(j + jj, 0, N - 1);\n sum += imageIn(ii_clamped, jj_clamped) * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut(i, j) = std::clamp(sum, -128, 127);\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + 
"\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t i) {\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::Range(0, N), [=](size_t j) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n });\n}\n", + "\n\n Kok\uf09e::parallel_for(Kok\uf09e::Range(0, N), [&](const Kok\uf09e::Range &r) {\n for (size_t i = r.begin(); i < r.end(); ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n size_t ii = i + k - 1;\n size_t jj = j + l - 1;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn(ii, jj) * edge[k][l];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n }\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"convolveKernel\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn(x, y) * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n}\n", + "\n\n Kok\u30b3\u30af::parallel_for(0, N, [&](size_t i) {\n Kok\u30b3\u30af::parallel_for(0, N, [&](size_t j) {\n int sum = 0;\n for (int k = 0; k < 3; k++) {\n for (int l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn(x, y) * edge[k][l];\n }\n }\n }\n imageOut(i, j) = std::max(0, std::min(255, sum));\n });\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "omp", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. 
Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "_", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n if (i + k < 0 || i + k >= N || j + l < 0 || j + l >= N) {\n sum += 0;\n } else {\n sum += imageIn[(i + k) * N + (j + l)] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + 
"\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = (int)i + k;\n int y = (int)j + l;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x >= 0 && x < static_cast(N) && y >= 0 && y < static_cast(N)) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x >= 0 && x < static_cast(N) && y >= 0 && y < static_cast(N)) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += imageIn[(i + k) * N + (j + l)] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma opm parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n sum += imageIn[(i + k) * N + (j + l)] * 
edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for collapse(2)\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "serial", + "prompt": "const int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.258803886, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2596294934 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": 
true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2828613502, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2384233962 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2740478222, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.260495754 + } + ] + }, + { + "generated_output": "\n\n for (int i = 1; i < N - 1; i++) {\n for (int j = 1; j < N - 1; j++) {\n int sum = 0;\n for (int ii = -1; ii <= 1; ii++) {\n for (int jj = -1; jj <= 1; jj++) {\n sum += imageIn[(i + ii) * N + (j + jj)] * edgeKernel[ii + 1][jj + 1];\n }\n }\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": false, + "are_all_valid": false, + "best_sequential_runtime": null, + "runs": [ + { + "did_run": true, + "is_valid": false, + "runtime": null + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2856939492, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2266884414 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2639936654, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2411483084 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + 
"is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2612945296, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2408771962 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = (int)i + k - 1;\n int y = (int)j + l - 1;\n if (x >= 0 && x < (int)N && y >= 0 && y < (int)N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.272614254, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2913473636 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x < 0 || x >= N || y < 0 || y >= N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2737851118, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2601044256 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t x = 0; x < 3; x++) {\n for (size_t y = 0; y < 3; y++) {\n int ii = i + x - 1;\n int jj = j + y - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[x][y];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2891352886, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.240289592 + } + ] + }, + { + "generated_output": "\n\n for (size_t y = 0; y < N; ++y) {\n for (size_t x = 0; x < N; ++x) {\n int sum = 0;\n for (int ky = -1; ky <= 1; ++ky) {\n for (int kx = -1; kx <= 1; ++kx) {\n int pixelX = x + kx;\n int pixelY = y + ky;\n if (pixelX >= 0 && pixelX < N && pixelY >= 0 && pixelY < N) {\n sum += imageIn[pixelY * N + pixelX] * edgeKernel[ky + 1][kx + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.271676401, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2620580624 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = (int)i + k - 1;\n int y = (int)j + l - 1;\n if (x < 0 || x >= (int)N || y < 0 || y >= (int)N) {\n sum += 0;\n } else {\n sum += imageIn[x * N + 
y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2742731764, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2903679722 + } + ] + }, + { + "generated_output": "\n\n for (size_t y = 0; y < N; ++y) {\n for (size_t x = 0; x < N; ++x) {\n int sum = 0;\n for (int ky = -1; ky <= 1; ++ky) {\n for (int kx = -1; kx <= 1; ++kx) {\n int inX = x + kx;\n int inY = y + ky;\n if (inX >= 0 && inX < N && inY >= 0 && inY < N) {\n sum += imageIn[inY * N + inX] * edgeKernel[ky + 1][kx + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2729013042, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2638083146 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii < 0 || ii >= N || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.272050321, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.259869147 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2859272866, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2373233694 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (size_t k = 0; k < 3; ++k) {\n for (size_t l = 0; l < 3; ++l) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.258675919, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2383802946 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = 
static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii < 0 || ii >= static_cast(N) || jj < 0 || jj >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.289216607, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2669134868 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.264683534, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2415963538 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ki++) {\n for (int kj = -1; kj <= 1; kj++) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.283801167, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2389198726 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int x = i + k - 1;\n int y = j + l - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k][l];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.2579272842, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.2385438486 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "hip", + "prompt": "__constant__ int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use AMD HIP to compute in parallel. 
The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\n__global__ void convolveKernel(const int *imageIn, int *imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\nint sum = 0;\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x >= N || y >= N) {\n return;\n }\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = blockIdx.x + i;\n int y = blockIdx.y + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n sum = max(0, min(255, sum));\n imageOut[blockIdx.y * N + blockIdx.x] = sum;\n}\n", + "\nint sum = 0;\nint index = hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x;\nint x = index % N;\nint y = index / N;\nfor (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n if (x + i >= 0 && x + i < N && y + j >= 0 && y + j < N) {\n sum += imageIn[(y + j) * N + x + i] * edgeKernel[i + 1][j + 1];\n }\n }\n}\nimageOut[index] = __clamp(sum, 0, 255);\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x2 = x + i;\n int y2 = y + j;\n if (x2 >= 0 && x2 < N && y2 >= 0 && y2 < N) {\n sum += imageIn[y2 * N + x2] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y 
< N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = min(255, max(0, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX + neighborY * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = std::max(0, std::min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ < 0 || x_ >= N || y_ < 0 || y_ >= N)\n sum += 0;\n else\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n}\n", + "\nint row = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n int col = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n int dotProduct = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n dotProduct += imageIn[neighborY * N + neighborX] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = dotProduct > 
255 ? 255 : (dotProduct < 0 ? 0 : dotProduct);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = 0; i < 3; i++) {\n for (int j = 0; j < 3; j++) {\n int x_offset = x + i - 1;\n int y_offset = y + j - 1;\n if (x_offset >= 0 && x_offset < N && y_offset >= 0 && y_offset < N) {\n sum += imageIn[y_offset * N + x_offset] * edge[i][j];\n }\n }\n }\n imageOut[y * N + x] = __clamp(sum, 0, 255);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_index = x + i;\n int y_index = y + j;\n if (x_index >= 0 && x_index < N && y_index >= 0 && y_index < N) {\n sum += imageIn[y_index * N + x_index] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[x_idx + y_idx * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = max(0, min(255, sum));\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int neighborX = x + i;\n int neighborY = y + j;\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX + neighborY * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[x + y * N] = clamp(sum, 0, 255);\n }\n}\n", + "\nint x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = hipMin(hipMax(sum, 0), 255);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "mpi", + "prompt": "#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of imageIn. 
The result is stored in imageOut on rank 0.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image.\n size_t chunkSize = N / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = (rank + 1) * chunkSize;\n\n // Compute the convolution for the assigned rows.\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = (i + 1) * chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n size_t chunkSize = N / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = (rank + 1) * chunkSize;\n\n // Compute the convolution for the assigned rows\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = (i + 1) * chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows and columns in the image\n size_t numRows = imageIn.size() / N;\n size_t numCols = N;\n\n // Compute the number of rows and columns in the output image\n size_t outNumRows = numRows - 2;\n size_t outNumCols = numCols - 2;\n\n // Compute the number of rows and columns in the output image for each rank\n size_t outNumRowsPerRank = outNumRows / size;\n size_t outNumColsPerRank = outNumCols;\n\n // Compute the starting row and column for each rank\n size_t startRow = rank * outNumRowsPerRank;\n size_t startCol = 0;\n\n // Compute the ending row and column for each rank\n size_t endRow = startRow + outNumRowsPerRank;\n size_t endCol = startCol + outNumColsPerRank;\n\n // Allocate memory for the 
output image\n std::vector outImage(outNumRowsPerRank * outNumColsPerRank);\n\n // Compute the convolution for each pixel in the output image\n for (size_t i = startRow; i < endRow; i++) {\n for (size_t j = startCol; j < endCol; j++) {\n int sum = 0;\n for (size_t k = 0; k < 3; k++) {\n for (size_t l = 0; l < 3; l++) {\n int row = i + k - 1;\n int col = j + l - 1;\n if (row >= 0 && row < numRows && col >= 0 && col < numCols) {\n sum += imageIn[row * numCols + col] * edgeKernel[k][l];\n }\n }\n }\n outImage[i * outNumColsPerRank + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the output images from all ranks\n MPI_Gather(outImage.data(), outNumRowsPerRank * outNumColsPerRank, MPI_INT,\n imageOut.data(), outNumRowsPerRank * outNumColsPerRank, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local portion of the output image.\n std::vector localImageOut(N * N / size);\n for (size_t i = rank * N / size; i < (rank + 1) * N / size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the local results into the output image on rank 0.\n if (rank == 0) {\n imageOut.resize(N * N);\n MPI_Gather(localImageOut.data(), N * N / size, MPI_INT, imageOut.data(), N * N / size, MPI_INT, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(localImageOut.data(), N * N / size, MPI_INT, nullptr, 0, MPI_INT, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localN = N / size;\n int localStart = rank * localN;\n int localEnd = localStart + localN;\n\n std::vector localImageOut(localN * N);\n\n for (int i = localStart; i < localEnd; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), localN * N, MPI_INT, imageOut.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows in imageOut\n size_t rowsPerRank = N / size;\n size_t startRow = rank * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks and store them in imageOut on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n for (size_t j = startRow; j < endRow; ++j) {\n MPI_Recv(&imageOut[j * N], N, MPI_INT, i, 0, MPI_COMM_WORLD, 
MPI_STATUS_IGNORE);\n }\n }\n } else {\n MPI_Send(&imageOut[startRow * N], N * rowsPerRank, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image\n int rowsPerRank = (N + size - 1) / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min(startRow + rowsPerRank, N);\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = std::min(startRow + rowsPerRank, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the assigned rows\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "{", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n int chunkSize = N / size;\n int startRow = rank * chunkSize;\n int endRow = startRow + chunkSize;\n\n // Pad the input image with zeros to handle edge cases\n std::vector paddedImage(N * N);\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n paddedImage[i * N + j] = imageIn[i * N + j];\n }\n }\n\n // Compute the convolution for the assigned rows\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int neighborRow = i + k;\n int neighborCol = j + l;\n if (neighborRow >= 0 && neighborRow < N && neighborCol >= 0 && neighborCol < N) {\n sum += paddedImage[neighborRow * N + neighborCol] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * chunkSize;\n int endRow = 
startRow + chunkSize;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel's neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = i + k;\n int neighborY = j + l;\n // Handle pixels outside the image boundaries\n if (neighborX < 0 || neighborX >= N || neighborY < 0 || neighborY >= N) {\n sum += 0;\n } else {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::min(std::max(sum, 0), 255);\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of imageOut\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks into imageOut on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localN = N / size;\n int localStart = rank * localN;\n int localEnd = localStart + localN;\n\n std::vector localImageIn(localN * N);\n std::vector localImageOut(localN * N);\n\n MPI_Scatter(imageIn.data(), N * N, MPI_INT, localImageIn.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n for (int i = localStart; i < localEnd; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += localImageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n 
MPI_Gather(localImageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int localSize = N / size;\n int remainder = N % size;\n int startIndex = rank * localSize;\n int endIndex = startIndex + localSize;\n if (rank == size - 1) {\n endIndex += remainder;\n }\n\n std::vector localImageIn(localSize * N);\n std::vector localImageOut(localSize * N);\n\n MPI_Scatter(imageIn.data(), N * N, MPI_INT, localImageIn.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n for (int i = startIndex; i < endIndex; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += localImageIn[(x + y * N)];\n }\n }\n }\n localImageOut[i * N + j] = std::min(255, std::max(0, sum * edgeKernel[k + 1][l + 1]));\n }\n }\n\n MPI_Gather(localImageOut.data(), localSize * N, MPI_INT, imageOut.data(), localSize * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of pixels per process\n int pixelsPerProcess = N * N / size;\n\n // Compute the starting and ending indices for this process\n int startIndex = rank * pixelsPerProcess;\n int stopIndex = (rank + 1) * pixelsPerProcess;\n\n // Allocate memory for the local image\n std::vector localImage(pixelsPerProcess);\n\n // Scatter the image data to each process\n MPI_Scatter(&imageIn[0], pixelsPerProcess, MPI_INT, &localImage[0], pixelsPerProcess, MPI_INT, 0, MPI_COMM_WORLD);\n\n // Convolve the local image with the edge kernel\n for (int i = 1; i < N - 1; i++) {\n for (int j = 1; j < N - 1; j++) {\n int dotProduct = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n dotProduct += localImage[i + k][j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n imageOut[i * N + j] = std::clip(dotProduct, -128, 127);\n }\n }\n\n // Gather the results from each process\n MPI_Gather(&imageOut[startIndex], pixelsPerProcess, MPI_INT, &imageOut[0], pixelsPerProcess, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n size_t chunkSize = (N + size - 1) / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n\n // Compute the convolution for the subset of the image\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n 
MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n int localN = N / size;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local portion of the image\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n size_t localN = (N + size - 1) / size;\n size_t startRow = rank * localN;\n size_t endRow = std::min(startRow + localN, N);\n\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = i + ki;\n int jj = j + kj;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * localN;\n size_t endRow = std::min(startRow + localN, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n for (size_t i = rank; i < N; i += size) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n // Compute the dot product of the pixel and its neighbors with the edge kernel\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Calculate the number of rows per process\n int rowsPerProcess = N / size;\n\n // Calculate the starting row for this process\n int startRow = rank * rowsPerProcess;\n\n // Calculate the ending row for this process\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Loop over the rows assigned to this process\n for (int i = startRow; i < endRow; i++) {\n 
// Loop over the columns in the row\n for (int j = 0; j < N; j++) {\n // Calculate the dot product of the pixel and its neighbors with the edge kernel\n int dotProduct = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n // Check if the pixel is within the image boundaries\n if (i + k >= 0 && i + k < N && j + l >= 0 && j + l < N) {\n // Calculate the dot product\n dotProduct += imageIn[(i + k) * N + j + l] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::max(0, std::min(255, dotProduct));\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(imageOut.data() + startRow * N, rowsPerProcess * N, MPI_INT,\n imageOut.data(), rowsPerProcess * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "cuda", + "prompt": "__constant__ int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\n__global__ void convolveKernel(const int *imageIn, int *imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x;\n int idy = threadIdx.y + blockIdx.y * blockDim.y;\n if (idx < N && idy < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = idx + i;\n int y = idy + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x + y * N] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[idx + idy * N] = clamp(sum, 0, 255);\n }\n}\n", + "\nint sum = 0;\n for (int i = 0; i < 3; i++) {\n for (int j = 0; j < 3; j++) {\n int x = blockIdx.x * blockDim.x + i - 1;\n int y = blockIdx.y * blockDim.y + j - 1;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i][j];\n }\n }\n }\n imageOut[blockIdx.y * N + blockIdx.x] = max(0, min(255, sum));\n}\n", + "\nint x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n\n int row = blockIdx.y * blockDim.y + threadIdx.y;\n int col = blockIdx.x 
* blockDim.x + threadIdx.x;\n if (row < N && col < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int r = row + i;\n int c = col + j;\n if (r >= 0 && r < N && c >= 0 && c < N) {\n sum += imageIn[r * N + c] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[row * N + col] = sum > 255 ? 255 : (sum < 0 ? 0 : sum);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x >= N || y >= N) {\n return;\n }\n\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_offset = x + i;\n int y_offset = y + j;\n if (x_offset < 0 || x_offset >= N || y_offset < 0 || y_offset >= N) {\n sum += 0;\n } else {\n sum += imageIn[y_offset * N + x_offset] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = std::max(0, std::min(255, sum));\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x2 = x + i;\n int y2 = y + j;\n if (x2 < 0 || x2 >= N || y2 < 0 || y2 >= N) {\n sum += 0;\n } else {\n sum += edgeKernel[i + 1][j + 1] * imageIn[y2 * N + x2];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y 
= blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int nx = x + i;\n int ny = y + j;\n if (nx >= 0 && nx < N && ny >= 0 && ny < N) {\n sum += imageIn[ny * N + nx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_offset = x + i;\n int y_offset = y + j;\n if (x_offset >= 0 && x_offset < N && y_offset >= 0 && y_offset < N) {\n sum += imageIn[y_offset * N + x_offset] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_in = x + i;\n int y_in = y + j;\n if (x_in >= 0 && x_in < N && y_in >= 0 && y_in < N) {\n sum += imageIn[y_in * N + x_in] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = clamp(sum, 0, 255);\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_coord = x + i;\n int y_coord = y + j;\n if (x_coord >= 0 && x_coord < N && y_coord >= 0 && y_coord < N) {\n sum += imageIn[y_coord * N + x_coord] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = min(255, max(0, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_ = x + i;\n int y_ = y + j;\n if (x_ >= 0 && x_ < N && y_ >= 0 && y_ < N) {\n sum += imageIn[y_ * N + x_] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n", + "\nint idx = threadIdx.x + blockIdx.x * blockDim.x + (threadIdx.y + blockIdx.y * blockDim.y) * N;\n if (idx < N * N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x = threadIdx.x + i;\n int y = threadIdx.y + j;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[y * N + x] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[idx] = max(0, min(255, sum));\n }\n}\n", + "\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n int sum = 0;\n for (int i = -1; i <= 1; i++) {\n for (int j = -1; j <= 1; j++) {\n int x_idx = x + i;\n int y_idx = y + j;\n if (x_idx >= 0 && x_idx < N && y_idx >= 0 && y_idx < N) {\n sum += imageIn[y_idx * N + x_idx] * edgeKernel[i + 1][j + 1];\n }\n }\n }\n imageOut[y * N + x] = max(0, min(255, sum));\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + 
"parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\nconst int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of imageIn. The result is stored in imageOut on rank 0.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the output image\n int localN = N / size;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local subset of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n int nRowsPerRank = (N + size - 1) / size;\n int startRow = rank * nRowsPerRank;\n int endRow = std::min(N, (rank + 1) * nRowsPerRank);\n\n std::vector localImageOut(nRowsPerRank * N);\n\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = 0; k < 3; k++) {\n for (int l = 0; l < 3; l++) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), nRowsPerRank * N, MPI_INT,\n imageOut.data(), nRowsPerRank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Initialize imageOut to 0\n imageOut.assign(N * N, 0);\n\n // Each rank computes its own part of the output\n #pragma opm parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel and its neighbors\n for (int ki = 0; ki < 3; ki++) {\n for (int kj = 0; kj < 3; kj++) {\n int ii = i + ki - 1;\n int jj = j + kj - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki][kj];\n }\n 
}\n }\n // Clip the output between 0 and 255\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n\n // Gather the results on rank 0\n MPI_Gather(imageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a portion of the output image\n size_t chunkSize = (N + size - 1) / size;\n size_t startRow = rank * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n\n // Compute the convolution for the assigned rows\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < static_cast(N) && jj >= 0 && jj < static_cast(N)) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * chunkSize;\n size_t endRow = std::min(startRow + chunkSize, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image\n size_t rowsPerRank = (N + size - 1) / size;\n size_t startRow = rank * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n\n // Each rank computes its subset of the output image in parallel using OpenMP\n #pragma omp parallel for\n for (size_t i = startRow; i < endRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int ki = -1; ki <= 1; ++ki) {\n for (int kj = -1; kj <= 1; ++kj) {\n int ii = static_cast(i) + ki;\n int jj = static_cast(j) + kj;\n if (ii >= 0 && ii < static_cast(N) && jj >= 0 && jj < static_cast(N)) {\n sum += imageIn[ii * N + jj] * edgeKernel[ki + 1][kj + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t startRow = i * rowsPerRank;\n size_t endRow = std::min(startRow + rowsPerRank, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a portion of the output image\n int localN = (N + nprocs - 1) / nprocs;\n int startRow = rank * localN;\n int endRow = std::min(startRow + localN, N);\n\n // Compute the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from 
all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; ++i) {\n int startRow = i * localN;\n int endRow = std::min(startRow + localN, N);\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank will compute a subset of the rows of the output image\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the assigned rows\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n // Compute the dot product of the kernel and the pixel's neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = i + k;\n int neighborY = j + l;\n // Check if the neighbor is within the image boundaries\n if (neighborX >= 0 && neighborX < N && neighborY >= 0 && neighborY < N) {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::min(255, std::max(0, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], rowsPerRank * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a portion of the output image\n int localN = N / nprocs;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n // Compute the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n int startRow = i * localN;\n int endRow = startRow + localN;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], localN * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * 
N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the image\n int rowsPerRank = (N + size - 1) / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min((rank + 1) * rowsPerRank, N);\n\n // Allocate memory for the rank's portion of the output image\n std::vector rankImageOut(rowsPerRank * N);\n\n // Compute the convolution for the rank's subset of rows\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = 0; k < 3; ++k) {\n for (int l = 0; l < 3; ++l) {\n int ii = i + k - 1;\n int jj = j + l - 1;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k][l];\n }\n }\n }\n rankImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(rankImageOut.data(), rowsPerRank * N, MPI_INT,\n imageOut.data(), rowsPerRank * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Compute the dot product of the kernel and the image for each pixel\n #pragma opm parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n int dotProduct = 0;\n for (int ki = 0; ki < 3; ki++) {\n for (int kj = 0; kj < 3; kj++) {\n dotProduct += edgeKernel[ki][kj] * imageIn[(i + ki - 1) * N + (j + kj - 1)];\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, dotProduct));\n }\n }\n\n // Set the boundaries to 0\n for (size_t i = 0; i < N; i++) {\n imageOut[i] = 0;\n imageOut[i * N] = 0;\n imageOut[i * N + N - 1] = 0;\n }\n for (size_t j = 0; j < N; j++) {\n imageOut[j * N + N - 1] = 0;\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each rank computes a subset of the rows\n int rowsPerRank = N / nprocs;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Pad the image with zeros\n std::vector paddedImage(N + 2 * rowsPerRank, 0);\n for (int i = 0; i < N; i++) {\n for (int j = 0; j < N; j++) {\n paddedImage[i + rowsPerRank + 1][j + rowsPerRank + 1] = imageIn[i * N + j];\n }\n }\n\n // Convolve the kernel with the padded image\n #pragma omp parallel for\n for (int i = startRow + 1; i < endRow + 1; i++) {\n for (int j = 1; j < N + 1; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n sum += edgeKernel[k + 1][l + 1] * paddedImage[i + k][j + l];\n }\n }\n imageOut[(i - rowsPerRank - 1) * N + j - 1] = 
std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < nprocs; i++) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes its portion of the output image\n #pragma omp parallel for\n for (int i = rank; i < N; i += size) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int x = i + k;\n int y = j + l;\n if (x >= 0 && x < N && y >= 0 && y < N) {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Rank 0 gathers the results from all other ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&imageOut[i * N * N], N * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[0], N * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = std::min((rank + 1) * rowsPerRank, N);\n\n // Each rank computes its subset of the output image\n std::vector localImageOut(N * N);\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(localImageOut.data(), N * N, MPI_INT, imageOut.data(), N * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n int localN = N / nprocs;\n int startRow = rank * localN;\n int endRow = startRow + localN;\n\n std::vector localImageIn(localN * N);\n std::vector localImageOut(localN * N);\n\n MPI_Scatter(imageIn.data(), localN * N, MPI_INT, localImageIn.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n\n #pragma omp parallel for\n for (int i = 0; i < localN; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii < 0 || ii >= localN || jj < 0 || jj >= N) {\n sum += 0;\n } else {\n sum += localImageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n MPI_Gather(localImageOut.data(), localN * N, MPI_INT, imageOut.data(), localN * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int x = static_cast(i) + k;\n int y = static_cast(j) + l;\n if (x < 0 || x >= static_cast(N) || y < 0 || y >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edge[k + 
1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output image.\n int rowsPerRank = N / size;\n int startRow = rank * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n\n // Compute the convolution for the subset of rows assigned to this rank.\n #pragma omp parallel for\n for (int i = startRow; i < endRow; ++i) {\n for (int j = 0; j < N; ++j) {\n int sum = 0;\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the results from all ranks on rank 0.\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n int startRow = i * rowsPerRank;\n int endRow = startRow + rowsPerRank;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int myRank, numRanks;\n MPI_Comm_rank(MPI_COMM_WORLD, &myRank);\n MPI_Comm_size(MPI_COMM_WORLD, &numRanks);\n\n // Each rank computes a portion of the output image\n size_t myStartRow = myRank * N / numRanks;\n size_t myEndRow = (myRank + 1) * N / numRanks;\n\n // Compute the convolution for each pixel in the assigned rows\n #pragma omp parallel for\n for (size_t i = myStartRow; i < myEndRow; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int sum = 0;\n\n // Apply the kernel to the pixel and its neighbors\n for (int k = -1; k <= 1; ++k) {\n for (int l = -1; l <= 1; ++l) {\n int neighborX = static_cast(i) + k;\n int neighborY = static_cast(j) + l;\n\n // Handle pixels outside the image boundaries\n if (neighborX < 0 || neighborX >= static_cast(N) || neighborY < 0 || neighborY >= static_cast(N)) {\n sum += 0;\n } else {\n sum += imageIn[neighborX * N + neighborY] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n\n // Clip the output to the range [0, 255]\n imageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the partial results from all ranks on rank 0\n if (myRank == 0) {\n for (int i = 1; i < numRanks; ++i) {\n size_t startRow = i * N / numRanks;\n size_t endRow = (i + 1) * N / numRanks;\n MPI_Recv(&imageOut[startRow * N], (endRow - startRow) * N, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&imageOut[myStartRow * N], (myEndRow - myStartRow) * N, MPI_INT, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Each process computes a subset of the rows\n int rowsPerProcess = N / nprocs;\n int startRow = rank * rowsPerProcess;\n int endRow = (rank + 1) * rowsPerProcess;\n\n // Allocate memory for the local portion of the output image\n std::vector localImageOut(rowsPerProcess * N);\n\n // Perform the convolution for the local portion of the image\n #pragma omp parallel for\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < N; j++) {\n int sum = 0;\n for (int k = -1; k <= 1; k++) {\n for (int l = -1; l <= 1; l++) {\n int ii = i + k;\n int jj = j + l;\n if (ii >= 0 && ii < N && jj >= 0 && jj < N) {\n sum += imageIn[ii * N + jj] * edgeKernel[k + 1][l + 
1];\n }\n }\n }\n localImageOut[i * N + j] = std::max(0, std::min(255, sum));\n }\n }\n\n // Gather the local results into the global output image\n MPI_Gather(localImageOut.data(), rowsPerProcess * N, MPI_INT,\n imageOut.data(), rowsPerProcess * N, MPI_INT, 0, MPI_COMM_WORLD);\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "kokkos", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids.\n Use Kokkos to compute in parallel. Assume Kokkos has already been initialized.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(Kokkos::View &input, Kokkos::View &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t i) {\n Kokkos::parallel_for(N, KOKKOS_LAMBDA(size_t j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int &i, const int &j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n for (int dx = -1; dx <= 1; dx++) {\n for (int dy = -1; dy <= 1; dy++) {\n if (i + dx >= 0 && i + dx < N && j + dy >= 0 && j + dy < N) {\n sum += input(i + dx, j + dy);\n }\n }\n }\n output(i, j) = sum / 5;\n });\n}\n", + "Kokkosspecific", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const size_t& i) {\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), 
KOKKOS_LAMBDA(const size_t& j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n for (int ii = -1; ii <= 1; ++ii) {\n for (int jj = -1; jj <= 1; ++jj) {\n if (i + ii >= 0 && i + ii < N && j + jj >= 0 && j + jj < N) {\n sum += input(i + ii, j + jj);\n }\n }\n }\n output(i, j) = sum / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy>(N, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n using team_member = KokKOS::TeamMember;\n using team_view = KokKOS::TeamView;\n using team_view_const = KokKOS::TeamView;\n using view_type = KokKOS::View;\n using view_type_const = KokKOS::View;\n\n auto policy = KokKOS::TeamStriped<>(team_member(0, N), team_member(0, N));\n\n team_view output_team(\"output_team\", output, policy);\n team_view_const input_team(\"input_team\", input, policy);\n\n auto lambda = KOKKOS_LAMBDA(team_member member) {\n const size_t i = member.team_member_rank(0);\n const size_t j = member.team_member_rank(1);\n if (i == 0 || j == 0 || i == N - 1 || j == N - 1) {\n output_team(i, j) = input_team(i, j);\n } else {\n output_team(i, j) = (input_team(i, j - 1) + input_team(i, j + 1) + input_team(i - 1, j) + input_team(i + 1, j) + input_team(i, j)) / 5.0;\n }\n };\n\n auto result = KokKOS::parallel_reduce(\n output_team, lambda, KokKOS::PerTeamReduce(std::plus(), 0.0));\n output_team.wait_until_done();\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5.0;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), [=](const int i) {\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), [=](const int j) {\n double left = (i == 0) ? 0 : input(i, j - 1);\n double right = (i == N - 1) ? 0 : input(i, j + 1);\n double up = (j == 0) ? 0 : input(i - 1, j);\n double down = (j == N - 1) ? 0 : input(i + 1, j);\n output(i, j) = (left + right + up + down + input(i, j)) / 5.0;\n });\n });\n}\n", + "\nfor (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n }\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy>({0, N}, {0, N}), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n double sum = 0;\n sum += (i == 0 || j == 0) ? 0 : input(i - 1, j - 1);\n sum += (i == 0) ? 
0 : input(i - 1, j);\n sum += (i == 0 || j == N - 1) ? 0 : input(i - 1, j + 1);\n sum += (j == 0) ? 0 : input(i, j - 1);\n sum += input(i, j);\n sum += (j == N - 1) ? 0 : input(i, j + 1);\n sum += (i == N - 1 || j == 0) ? 0 : input(i + 1, j - 1);\n sum += (i == N - 1) ? 0 : input(i + 1, j);\n sum += (i == N - 1 || j == N - 1) ? 0 : input(i + 1, j + 1);\n output(i, j) = sum / 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n output(i, j) = 0;\n if (i > 0) output(i, j) += input(i - 1, j);\n if (i < N - 1) output(i, j) += input(i + 1, j);\n if (j > 0) output(i, j) += input(i, j - 1);\n if (j < N - 1) output(i, j) += input(i, j + 1);\n output(i, j) += input(i, j);\n output(i, j) /= 5;\n });\n}\n", + "\n\n Kokkos::parallel_for(\"jacobi2D\", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, const int j) {\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n output(i, j) = 0;\n } else {\n output(i, j) = (input(i, j - 1) + input(i, j + 1) + input(i - 1, j) + input(i + 1, j) + input(i, j)) / 5;\n }\n });\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "omp", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use OpenMP to compute in parallel.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[(i - 1) * N + 
j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n output.resize(input.size());\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i - 1][j] + input[i + 1][j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] +\n input[(i + 1) * N + j] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5.0;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + 
input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 1; i < N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "serial", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0306976014, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.6728002446 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0307042622, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0321894514 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0306352364, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.682640737 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += 
input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0309653156, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0321279134 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0302834662, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.6709337094 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < N - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n output[i * N + j] = sum / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0307491626, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0307666516 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0310593342, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0323606734 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.030632455, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.6670937744 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + 
"source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.030408766, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.6714963824 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0304589622, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.6852144114 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0311442984, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0322620682 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 1; i < N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0313032814, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0192370456 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0305634672, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.6876750302 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t idx = i * N + j;\n if (i > 0) sum += input[idx - N];\n if (i < N - 1) sum += input[idx + N];\n if (j > 0) sum += input[idx - 1];\n if (j < N - 1) sum += input[idx + 1];\n sum += input[idx];\n output[idx] = sum / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0309805284, + "runs": [ + { + "did_run": true, + 
"is_valid": true, + "runtime": 0.0321549104 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0306379252, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.6802844104 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n size_t idx = i * N + j;\n if (i > 0) sum += input[idx - N];\n if (i < N - 1) sum += input[idx + N];\n if (j > 0) sum += input[idx - 1];\n if (j < N - 1) sum += input[idx + 1];\n sum += input[idx];\n output[idx] = sum / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0309723386, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0321262184 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0309660868, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.0321251286 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0305535314, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.6882300308 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0305051624, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 2.6729669826 + } + ] + }, + { + "generated_output": "\n\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < 
N; j++) {\n double sum = 0;\n size_t index = i * N + j;\n if (i > 0) sum += input[index - N];\n if (i < N - 1) sum += input[index + N];\n if (j > 0) sum += input[index - 1];\n if (j < N - 1) sum += input[index + 1];\n sum += input[index];\n output[index] = sum / 5;\n }\n }\n}\n", + "source_write_success": true, + "did_build": true, + "is_source_valid": true, + "did_any_run": true, + "did_all_run": true, + "are_any_valid": true, + "are_all_valid": true, + "best_sequential_runtime": 0.0306947432, + "runs": [ + { + "did_run": true, + "is_valid": true, + "runtime": 0.031820153 + } + ] + } + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "hip", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use AMD HIP to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\n__global__ void jacobi2D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\nint i = blockIdx.x * blockDim.x + threadIdx.x;\nint j = blockIdx.y * blockDim.y + threadIdx.y;\nif (i < N && j < N) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) /\n 5;\n}\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (x < N && y < N) {\n output[y * N + x] =\n (input[y * N + (x - 1 < 0 ? 0 : x - 1)] +\n input[y * N + (x + 1 >= N ? N - 1 : x + 1)] +\n input[(y - 1 < 0 ? 0 : y - 1) * N + x] +\n input[(y + 1 >= N ? N - 1 : y + 1) * N + x] + input[y * N + x]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = i > 0 ? input[i - 1 + j * N] : 0;\n double right = i < N - 1 ? input[i + 1 + j * N] : 0;\n double up = j > 0 ? input[i + (j - 1) * N] : 0;\n double down = j < N - 1 ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? 
input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * blockDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.y;\n\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (i < N && j < N) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n}\n", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x < N && y < N) {\n double left = (x > 0) ? input[y * N + x - 1] : 0;\n double right = (x < N - 1) ? input[y * N + x + 1] : 0;\n double up = (y > 0) ? input[(y - 1) * N + x] : 0;\n double down = (y < N - 1) ? input[(y + 1) * N + x] : 0;\n double center = input[y * N + x];\n output[y * N + x] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i >= N || j >= N) {\n return;\n }\n double sum = 0;\n if (i > 0) {\n sum += input[i - 1 + j * N];\n }\n if (i < N - 1) {\n sum += input[i + 1 + j * N];\n }\n if (j > 0) {\n sum += input[i + (j - 1) * N];\n }\n if (j < N - 1) {\n sum += input[i + (j + 1) * N];\n }\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? 
input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i < N && j < N) {\n double sum = 0;\n if (i > 0) sum += input[i - 1 + j * N];\n if (i < N - 1) sum += input[i + 1 + j * N];\n if (j > 0) sum += input[i + (j - 1) * N];\n if (j < N - 1) sum += input[i + (j + 1) * N];\n sum += input[i + j * N];\n output[i + j * N] = sum / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int j = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n if (i >= N || j >= N) return;\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) /\n 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + ",", + "\n\n int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;\n int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;\n\n if (x >= 0 && x < N && y >= 0 && y < N) {\n double left = (x > 0) ? input[y * N + x - 1] : 0;\n double right = (x < N - 1) ? input[y * N + x + 1] : 0;\n double up = (y > 0) ? input[(y - 1) * N + x] : 0;\n double down = (y < N - 1) ? input[(y + 1) * N + x] : 0;\n double center = input[y * N + x];\n output[y * N + x] = (left + right + up + down + center) / 5;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "mpi", + "prompt": "#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector<double> const& input, std::vector<double> &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n size_t local_N = N / size;\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n if (rank != 0) {\n MPI_Send(&output[start * N], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N * N], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N / size;\n int nCols = N;\n std::vector<double> localInput(nRows * nCols);\n std::vector<double> localOutput(nRows * nCols);\n MPI_Scatter(input.data(), nRows * nCols, MPI_DOUBLE, localInput.data(), nRows * nCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n for (int i = 0; i < nRows; i++) {\n for (int j = 0; j < nCols; j++) {\n double left = (j == 0) ? 0 : localInput[i * nCols + j - 1];\n double right = (j == nCols - 1) ? 0 : localInput[i * nCols + j + 1];\n double up = (i == 0) ? 0 : localInput[(i - 1) * nCols + j];\n double down = (i == nRows - 1) ? 
0 : localInput[(i + 1) * nCols + j];\n localOutput[i * nCols + j] = (left + right + up + down + localInput[i * nCols + j]) / 5;\n }\n }\n MPI_Gather(localOutput.data(), nRows * nCols, MPI_DOUBLE, output.data(), nRows * nCols, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = (rank + 1) * local_N;\n\n // Compute the local output grid\n std::vector local_output(local_N * N, 0);\n\n // Compute the local jacobi stencil\n for (size_t i = 1; i < local_N - 1; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n local_output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local output grids into the global output grid\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the average in the local output\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t 
local_N = N / size;\n\n // Compute the offset of the local grid\n size_t offset = rank * local_N;\n\n // Compute the local grid\n std::vector local_input(local_N * N);\n std::vector local_output(local_N * N);\n\n // Copy the relevant part of the input grid to the local grid\n MPI_Scatter(input.data(), local_N * N, MPI_DOUBLE, local_input.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Compute the local jacobi iteration\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the output grid on rank 0\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min((rank + 1) * local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N * N], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += 
input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output grid\n std::vector local_output(local_N * N, 0);\n\n // Iterate over the local grid\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output grid\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local output grids into the global output grid\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Iterate over the local grid\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the average in the output grid\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks\n MPI_Gather(&output[start][0], local_N * N, MPI_DOUBLE, &output[0][0], local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * 
local_N;\n size_t end = (rank + 1) * local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N, 0);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local output\n std::vector local_output(local_N * N);\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min(start_row + local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N);\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min((rank + 1) * local_N, N);\n\n // Compute the local part of the output\n std::vector local_output(local_N * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < N - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n\n // Store the average in the output\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n 
// Get the rank and size of the MPI communicator\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows and columns per rank\n int rows_per_rank = N / size;\n int cols_per_rank = N;\n\n // Compute the starting row and column for this rank\n int start_row = rank * rows_per_rank;\n int start_col = 0;\n\n // Compute the ending row and column for this rank\n int end_row = start_row + rows_per_rank;\n int end_col = start_col + cols_per_rank;\n\n // Create a buffer to store the local results\n std::vector local_output(rows_per_rank * cols_per_rank);\n\n // Compute the local results\n for (int i = start_row; i < end_row; i++) {\n for (int j = start_col; j < end_col; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local results\n local_output[(i - start_row) * cols_per_rank + (j - start_col)] = avg;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), rows_per_rank * cols_per_rank, MPI_DOUBLE, output.data(), rows_per_rank * cols_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_size = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_size;\n size_t end = (rank + 1) * local_size;\n\n // Compute the local output\n std::vector local_output(local_size * N);\n for (size_t i = start; i < end; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the local output\n local_output[i - start][j] = avg;\n }\n }\n\n // Gather the local outputs into the global output\n MPI_Gather(local_output.data(), local_size * N, MPI_DOUBLE, output.data(), local_size * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n // Get the rank and number of processes\n int rank, num_procs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &num_procs);\n\n // Compute the number of rows per process\n int rows_per_proc = N / num_procs;\n\n // Compute the starting and ending rows for this process\n int start_row = rank * rows_per_proc;\n int end_row = (rank + 1) * rows_per_proc;\n\n // Iterate over the rows assigned to this process\n for (int i = start_row; i < end_row; ++i) {\n // Iterate over the columns\n for (int j = 0; j < N; ++j) {\n // Compute the average of the neighbors\n double avg = 0;\n if (i > 0) avg += input[i - 1][j];\n if (i < N - 1) avg += input[i + 1][j];\n if (j > 0) avg += input[i][j - 1];\n if (j < N - 1) avg += input[i][j + 1];\n avg += input[i][j];\n avg /= 5;\n\n // Store the average in the output\n output[i * N + j] = avg;\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(output.data(), N * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible 
for\n size_t rows_per_rank = (N + size - 1) / size;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local output for this rank\n std::vector local_output(rows_per_rank * N, 0);\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local outputs into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n } else {\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, nullptr, 0, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "cuda", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use CUDA to compute in parallel. The kernel is launched on an NxN grid of threads.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\n__global__ void jacobi2D(const double *input, double *output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] +\n input[i * N + j + N] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i == 0) ? 0 : input[i - 1 + j * N];\n double right = (i == N - 1) ? 0 : input[i + 1 + j * N];\n double up = (j == 0) ? 0 : input[i + (j - 1) * N];\n double down = (j == N - 1) ? 0 : input[i + (j + 1) * N];\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i == 0) ? 0 : input[i - 1 + j * N];\n double right = (i == N - 1) ? 0 : input[i + 1 + j * N];\n double up = (j == 0) ? 
0 : input[i + (j - 1) * N];\n double down = (j == N - 1) ? 0 : input[i + (j + 1) * N];\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n output[i * N + j] =\n (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] +\n input[i * N + j + 1] + input[i * N + j]) /\n 5.0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i >= N || j >= N) return;\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n}\n", + "\nint i = blockIdx.x + blockIdx.y * blockDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] +\n input[i * N + j - 1] + input[i * N + j + 1] +\n input[i * N + j]) /\n 5;\n }\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i >= N || j >= N) return;\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + (j - 1 + N) % N] + input[i * N + (j + 1) % N] +\n input[(i - 1 + N) % N * N + j] + input[(i + 1) % N * N + j] +\n input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockIdx.y * gridDim.x;\n int j = threadIdx.x + threadIdx.y * blockDim.x;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? 
input[i + (j + 1) * N] : 0;\n double center = input[i + j * N];\n output[i + j * N] = (left + right + up + down + center) / 5;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n if (x >= N || y >= N) return;\n output[y * N + x] = (input[y * N + x - 1] + input[y * N + x + 1] + input[(y - 1) * N + x] + input[(y + 1) * N + x] + input[y * N + x]) / 5;\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] =\n (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] +\n input[i * N + N + j] + input[i * N + j]) /\n 5;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n double left = (i > 0) ? input[i - 1 + j * N] : 0;\n double right = (i < N - 1) ? input[i + 1 + j * N] : 0;\n double up = (j > 0) ? input[i + (j - 1) * N] : 0;\n double down = (j < N - 1) ? input[i + (j + 1) * N] : 0;\n output[i + j * N] = (left + right + up + down + input[i + j * N]) / 5;\n }\n}\n", + "\n\n int x = blockIdx.x * blockDim.x + threadIdx.x;\n int y = blockIdx.y * blockDim.y + threadIdx.y;\n\n if (x < N && y < N) {\n double sum = 0;\n if (x > 0) sum += input[y * N + x - 1];\n if (x < N - 1) sum += input[y * N + x + 1];\n if (y > 0) sum += input[(y - 1) * N + x];\n if (y < N - 1) sum += input[(y + 1) * N + x];\n sum += input[y * N + x];\n output[y * N + x] = sum / 5;\n }\n}\n", + "\n\n int i = blockIdx.x + blockDim.x * gridDim.x;\n int j = blockIdx.y + blockDim.y * gridDim.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5.0;\n }\n}\n", + "\n\n int i = blockIdx.x * blockDim.x + threadIdx.x;\n int j = blockIdx.y * blockDim.y + threadIdx.y;\n if (i < N && j < N) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N - N + j] + input[i * N + N + j] + input[i * N + j]) / 5;\n }\n}\n" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "mpi+omp", + "prompt": "#include \n#include \n\n/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Use MPI and OpenMP to compute in parallel. Assume MPI has already been initialized.\n Every rank has a complete copy of input. 
The result is stored in output on rank 0.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector<double> const& input, std::vector<double> &output, size_t N) {", + "temperature": 0.2, + "top_p": 0.95, + "do_sample": true, + "max_new_tokens": 1024, + "outputs": [ + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different portion of the output\n size_t chunk_size = (N + size - 1) / size;\n size_t start_row = rank * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; i++) {\n for (size_t j = 0; j < N; j++) {\n // Compute the average of the neighbors\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n\n // Store the result in the output\n output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n size_t start_row = i * chunk_size;\n size_t end_row = std::min(start_row + chunk_size, N);\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local grid\n std::vector<double> local_input(local_N * N);\n std::vector<double> local_output(local_N * N);\n for (size_t i = 0; i < local_N; i++) {\n for (size_t j = 0; j < N; j++) {\n local_input[i * N + j] = input[(start + i) * N + j];\n }\n }\n\n // Compute the local jacobi stencil\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n 
MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int n_local = N / size;\n int start = rank * n_local;\n int end = start + n_local;\n\n #pragma omp parallel for\n for (int i = start; i < end; i++) {\n for (int j = 0; j < N; j++) {\n double left = (j == 0) ? 0 : input[i * N + j - 1];\n double right = (j == N - 1) ? 0 : input[i * N + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * N + j];\n double down = (i == N - 1) ? 0 : input[(i + 1) * N + j];\n output[i * N + j] = (left + right + up + down + input[i * N + j]) / 5;\n }\n }\n\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n MPI_Recv(&output[i * n_local * N], n_local * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start * N], n_local * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local grid\n std::vector local_input(input.begin() + start * N, input.begin() + end * N);\n std::vector local_output(local_N * N);\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] +\n local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] +\n local_input[i * N + j]) /\n 5.0;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = (N + size - 1) / size;\n size_t start_row = rank * local_N;\n size_t end_row = std::min(start_row + local_N, N);\n\n // Create a local copy of the input\n std::vector local_input(local_N * N);\n MPI_Scatter(input.data(), N * N, MPI_DOUBLE, local_input.data(), N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n\n // Create a local copy of the output\n std::vector local_output(local_N * N);\n\n // Compute the local part of the jacobi stencil\n #pragma omp parallel for collapse(2)\n for (size_t i = 1; i < end_row - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (local_input[i * N + j - 1] + local_input[i * N + j + 1] + local_input[(i - 1) * N + j] + local_input[(i + 1) * N + j] + local_input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_N = N / size;\n size_t start_row = rank * local_N;\n size_t end_row = start_row + local_N;\n\n // Compute the local part of the output\n std::vector local_output(local_N * N, 0);\n #pragma omp parallel for\n for (size_t i = 1; i < local_N - 1; i++) {\n for (size_t j = 1; j < N - 1; j++) {\n local_output[i * N + j] = (input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global 
output\n MPI_Gather(local_output.data(), local_N * N, MPI_DOUBLE, output.data(), local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the local portion of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n // Check if we are on the boundary\n if (i == 0 || i == N - 1 || j == 0 || j == N - 1) {\n // If we are on the boundary, set the value to 0\n local_output[i * N + j] = 0;\n } else {\n // Otherwise, compute the average of the neighbors\n sum += input[(i - 1) * N + j];\n sum += input[(i + 1) * N + j];\n sum += input[i * N + (j - 1)];\n sum += input[i * N + (j + 1)];\n sum += input[i * N + j];\n local_output[i * N + j] = sum / 5;\n }\n }\n }\n\n // Gather the local results into the output on rank 0\n MPI_Gather(local_output.data(), rows_per_rank * N, MPI_DOUBLE, output.data(), rows_per_rank * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min((rank + 1) * rows_per_rank, N);\n\n // Compute the jacobi stencil for the subset of rows assigned to this rank\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n size_t start_row = i * rows_per_rank;\n size_t end_row = std::min((i + 1) * rows_per_rank, N);\n MPI_Recv(&output[start_row][0], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row][0], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different subset of the rows of the output\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the local subset of the output\n #pragma omp parallel for\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[i * N + j - N] + input[i * N + j + N] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n if (rank == 0) {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n // Get the rank and number of processes\n int 
rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local size of the grid\n size_t local_N = N / nprocs;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n // Check if we are at the boundary\n if (i == 0) {\n sum += 0;\n } else {\n sum += input[i - 1][j];\n }\n if (i == N - 1) {\n sum += 0;\n } else {\n sum += input[i + 1][j];\n }\n if (j == 0) {\n sum += 0;\n } else {\n sum += input[i][j - 1];\n }\n if (j == N - 1) {\n sum += 0;\n } else {\n sum += input[i][j + 1];\n }\n sum += input[i][j];\n output[i][j] = sum / 5;\n }\n }\n\n // Gather the results from all processes\n MPI_Gather(&output[start][0], local_N * N, MPI_DOUBLE, &output[0][0], local_N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Each rank computes a different subset of the rows\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(N, (rank + 1) * rows_per_rank);\n\n // Compute the local subset of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 1; j < N - 1; ++j) {\n local_output[i * N + j] = (input[i * N + j - 1] + input[i * N + j + 1] + input[(i - 1) * N + j] + input[(i + 1) * N + j] + input[i * N + j]) / 5;\n }\n }\n\n // Gather the local results into the global output on rank 0\n MPI_Gather(local_output.data(), local_output.size(), MPI_DOUBLE, output.data(), output.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the local size of the grid\n size_t local_size = N / nprocs;\n\n // Compute the start and end indices of the local grid\n size_t start = rank * local_size;\n size_t end = (rank + 1) * local_size;\n\n // Compute the local grid\n std::vector local_input(input.begin() + start, input.begin() + end);\n std::vector local_output(local_size * local_size);\n\n // Compute the local jacobi iteration\n #pragma omp parallel for\n for (size_t i = 1; i < local_size - 1; i++) {\n for (size_t j = 1; j < local_size - 1; j++) {\n local_output[i * local_size + j] = (local_input[i * local_size + j - 1] + local_input[i * local_size + j + 1] +\n local_input[(i - 1) * local_size + j] + local_input[(i + 1) * local_size + j] +\n local_input[i * local_size + j]) /\n 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_size * local_size, MPI_DOUBLE, output.data(), local_size * local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute local bounds\n size_t local_N = N / size;\n size_t start = rank * local_N;\n size_t end = start + local_N;\n\n // Compute local jacobi2D\n #pragma omp parallel for\n for (size_t i = start; i < end; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i][j] = 
sum / 5;\n }\n }\n\n // Gather results to rank 0\n if (rank != 0) {\n MPI_Send(&output[start][0], local_N * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n } else {\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * local_N][0], local_N * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the local size of the grid\n size_t local_size = (N + size - 1) / size;\n size_t start = rank * local_size;\n size_t end = std::min((rank + 1) * local_size, N);\n\n // Compute the local part of the output\n std::vector local_output(local_size * N);\n #pragma omp parallel for\n for (size_t i = 0; i < local_size; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[(i - 1) * N + j];\n if (i < local_size - 1) sum += input[(i + 1) * N + j];\n if (j > 0) sum += input[i * N + j - 1];\n if (j < N - 1) sum += input[i * N + j + 1];\n sum += input[i * N + j];\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output\n MPI_Gather(local_output.data(), local_size * N, MPI_DOUBLE, output.data(), local_size * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int nRows = N;\n int nCols = N;\n int nRowsPerRank = nRows / size;\n int startRow = rank * nRowsPerRank;\n int endRow = (rank + 1) * nRowsPerRank;\n if (rank == size - 1) {\n endRow = nRows;\n }\n #pragma omp parallel for collapse(2)\n for (int i = startRow; i < endRow; i++) {\n for (int j = 0; j < nCols; j++) {\n double left = (j == 0) ? 0 : input[i * nCols + j - 1];\n double right = (j == nCols - 1) ? 0 : input[i * nCols + j + 1];\n double up = (i == 0) ? 0 : input[(i - 1) * nCols + j];\n double down = (i == nRows - 1) ? 
0 : input[(i + 1) * nCols + j];\n output[i * nCols + j] = (left + right + up + down + input[i * nCols + j]) / 5;\n }\n }\n if (rank != 0) {\n MPI_Send(&output[startRow * nCols], (endRow - startRow) * nCols, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int startRow = i * nRowsPerRank;\n int endRow = (i + 1) * nRowsPerRank;\n if (i == size - 1) {\n endRow = nRows;\n }\n MPI_Recv(&output[startRow * nCols], (endRow - startRow) * nCols, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n }\n}\n", + "\n\n #pragma omp parallel for\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) { sum += input[i - 1][j]; }\n if (i < N - 1) { sum += input[i + 1][j]; }\n if (j > 0) { sum += input[i][j - 1]; }\n if (j < N - 1) { sum += input[i][j + 1]; }\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n int rows_per_rank = N / size;\n int start_row = rank * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n\n #pragma omp parallel for\n for (int i = start_row; i < end_row; i++) {\n for (int j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n output[i * N + j] = sum / 5;\n }\n }\n if (rank == 0) {\n for (int i = 1; i < size; i++) {\n int start_row = i * rows_per_rank;\n int end_row = start_row + rows_per_rank;\n MPI_Recv(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&output[start_row * N], (end_row - start_row) * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, size;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &size);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = (N + size - 1) / size;\n size_t start_row = rank * rows_per_rank;\n size_t end_row = std::min(start_row + rows_per_rank, N);\n\n // Compute the local portion of the output\n std::vector local_output(rows_per_rank * N, 0);\n #pragma omp parallel for collapse(2)\n for (size_t i = start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i - start_row][j] = sum / 5;\n }\n }\n\n // Gather the local results into the global output on rank 0\n if (rank == 0) {\n output.resize(N * N);\n for (int i = 1; i < size; ++i) {\n MPI_Recv(&output[i * rows_per_rank * N], rows_per_rank * N, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\n }\n } else {\n MPI_Send(&local_output[0], rows_per_rank * N, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);\n }\n}\n", + "\n\n int rank, nprocs;\n MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n MPI_Comm_size(MPI_COMM_WORLD, &nprocs);\n\n // Compute the number of rows each rank is responsible for\n size_t rows_per_rank = N / nprocs;\n\n // Compute the starting and ending rows for this rank\n size_t start_row = rank * rows_per_rank;\n size_t end_row = start_row + rows_per_rank;\n\n // Create a buffer to store the results for this rank\n std::vector local_output(N * rows_per_rank);\n\n // Compute the Jacobi stencil for this rank\n #pragma omp parallel for\n for (size_t i 
= start_row; i < end_row; ++i) {\n for (size_t j = 0; j < N; ++j) {\n double sum = 0;\n if (i > 0) sum += input[i - 1][j];\n if (i < N - 1) sum += input[i + 1][j];\n if (j > 0) sum += input[i][j - 1];\n if (j < N - 1) sum += input[i][j + 1];\n sum += input[i][j];\n local_output[i * N + j] = sum / 5;\n }\n }\n\n // Gather the results from all ranks into output on rank 0\n MPI_Gather(local_output.data(), N * rows_per_rank, MPI_DOUBLE, output.data(), N * rows_per_rank, MPI_DOUBLE, 0, MPI_COMM_WORLD);\n}\n" + ] + } +] \ No newline at end of file
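
Reference note (not part of the patch above): every prompt in this results file describes the same 5-point Jacobi update, output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j}) / 5, reading 0 past the grid boundary on NxN row-major grids of doubles. The snippet below is a minimal serial sketch of that update, included only to make the formula and boundary handling concrete; the names jacobi2D_serial and the small main() harness are illustrative and are not part of the benchmark or its drivers. Run as-is, it reproduces the 3x3 example given in the prompts ([[1.4, 1.8, 2.4], [1.8, 3, 2.2], [1.6, 2.2, 2.4]]).

    #include <cstdio>
    #include <vector>

    // Serial reference for the 5-point Jacobi stencil described in the prompts:
    // neighbors outside the grid contribute 0; grids are NxN, row-major doubles.
    void jacobi2D_serial(std::vector<double> const &input, std::vector<double> &output, size_t N) {
        for (size_t i = 0; i < N; ++i) {
            for (size_t j = 0; j < N; ++j) {
                double up    = (i > 0)     ? input[(i - 1) * N + j] : 0.0;
                double down  = (i < N - 1) ? input[(i + 1) * N + j] : 0.0;
                double left  = (j > 0)     ? input[i * N + j - 1]   : 0.0;
                double right = (j < N - 1) ? input[i * N + j + 1]   : 0.0;
                output[i * N + j] = (up + down + left + right + input[i * N + j]) / 5.0;
            }
        }
    }

    int main() {
        // The 3x3 example from the prompts; expected output:
        // [[1.4, 1.8, 2.4], [1.8, 3, 2.2], [1.6, 2.2, 2.4]]
        std::vector<double> in  = {3, 4, 1, 0, 1, 7, 5, 3, 2};
        std::vector<double> out(9, 0.0);
        jacobi2D_serial(in, out, 3);
        for (size_t i = 0; i < 3; ++i) {
            std::printf("%.1f %.1f %.1f\n", out[i * 3], out[i * 3 + 1], out[i * 3 + 2]);
        }
        return 0;
    }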